xref: /freebsd/sys/crypto/openssl/aarch64/aes-gcm-armv8_64.S (revision 4fbb9c43aa44d9145151bb5f77d302ba01fb7551)
1/* Do not modify. This file is auto-generated from aes-gcm-armv8_64.pl. */
2#include "arm_arch.h"
3
4#if __ARM_MAX_ARCH__>=8
5.arch	armv8-a+crypto
6.text
7.globl	aes_gcm_enc_128_kernel
8.type	aes_gcm_enc_128_kernel,%function
9.align	4
10aes_gcm_enc_128_kernel:
11	cbz	x1, .L128_enc_ret
12	stp	x19, x20, [sp, #-112]!
13	mov	x16, x4
14	mov	x8, x5
15	stp	x21, x22, [sp, #16]
16	stp	x23, x24, [sp, #32]
17	stp	d8, d9, [sp, #48]
18	stp	d10, d11, [sp, #64]
19	stp	d12, d13, [sp, #80]
20	stp	d14, d15, [sp, #96]
21
22	ldp	x10, x11, [x16]              //ctr96_b64, ctr96_t32
23#ifdef __AARCH64EB__
24	rev	x10, x10
25	rev	x11, x11
26#endif
27	ldp	x13, x14, [x8, #160]                     //load rk10
28#ifdef __AARCH64EB__
29	ror	x13, x13, #32
30	ror	x14, x14, #32
31#endif
32	ld1	{v11.16b}, [x3]
33	ext	v11.16b, v11.16b, v11.16b, #8
34	rev64	v11.16b, v11.16b
35	lsr	x5, x1, #3              //byte_len
36	mov	x15, x5
37
38	ld1	{v18.4s}, [x8], #16								  //load rk0
39	add	x4, x0, x1, lsr #3   //end_input_ptr
40	sub	x5, x5, #1      //byte_len - 1
41
42	lsr	x12, x11, #32
43	ldr	q15, [x3, #112]                        //load h4l | h4h
44#ifndef __AARCH64EB__
45	ext	v15.16b, v15.16b, v15.16b, #8
46#endif
47	fmov	d1, x10                               //CTR block 1
48	rev	w12, w12                                //rev_ctr32
49
50	add	w12, w12, #1                            //increment rev_ctr32
51	orr	w11, w11, w11
52	ld1	{v19.4s}, [x8], #16								  //load rk1
53
54	rev	w9, w12                                 //CTR block 1
55	add	w12, w12, #1                            //CTR block 1
56	fmov	d3, x10                               //CTR block 3
57
58	orr	x9, x11, x9, lsl #32            //CTR block 1
59	ld1	{ v0.16b}, [x16]                             //special case vector load initial counter so we can start first AES block as quickly as possible
60
61	fmov	v1.d[1], x9                               //CTR block 1
62	rev	w9, w12                                 //CTR block 2
63
64	fmov	d2, x10                               //CTR block 2
65	orr	x9, x11, x9, lsl #32            //CTR block 2
66	add	w12, w12, #1                            //CTR block 2
67
68	fmov	v2.d[1], x9                               //CTR block 2
69	rev	w9, w12                                 //CTR block 3
70
71	orr	x9, x11, x9, lsl #32            //CTR block 3
72	ld1	{v20.4s}, [x8], #16								  //load rk2
73
74	add	w12, w12, #1                            //CTR block 3
75	fmov	v3.d[1], x9                               //CTR block 3
76
77	ldr	q14, [x3, #80]                         //load h3l | h3h
78#ifndef __AARCH64EB__
79	ext	v14.16b, v14.16b, v14.16b, #8
80#endif
81	aese	v1.16b, v18.16b
82	aesmc	v1.16b, v1.16b          //AES block 1 - round 0
83	ld1	{v21.4s}, [x8], #16								  //load rk3
84
85	aese	v2.16b, v18.16b
86	aesmc	v2.16b, v2.16b          //AES block 2 - round 0
87	ldr	q12, [x3, #32]                         //load h1l | h1h
88#ifndef __AARCH64EB__
89	ext	v12.16b, v12.16b, v12.16b, #8
90#endif
91
92	aese	v0.16b, v18.16b
93	aesmc	v0.16b, v0.16b          //AES block 0 - round 0
94	ld1	{v22.4s}, [x8], #16								  //load rk4
95
96	aese	v3.16b, v18.16b
97	aesmc	v3.16b, v3.16b          //AES block 3 - round 0
98	ld1	{v23.4s}, [x8], #16								  //load rk5
99
100	aese	v2.16b, v19.16b
101	aesmc	v2.16b, v2.16b          //AES block 2 - round 1
102	trn2	v17.2d,  v14.2d,    v15.2d                      //h4l | h3l
103
104	aese	v0.16b, v19.16b
105	aesmc	v0.16b, v0.16b          //AES block 0 - round 1
106	ld1	{v24.4s}, [x8], #16								  //load rk6
107
108	aese	v1.16b, v19.16b
109	aesmc	v1.16b, v1.16b          //AES block 1 - round 1
110	ld1	{v25.4s}, [x8], #16								  //load rk7
111
112	aese	v3.16b, v19.16b
113	aesmc	v3.16b, v3.16b          //AES block 3 - round 1
114	trn1	v9.2d, v14.2d,    v15.2d                      //h4h | h3h
115
116	aese	v0.16b, v20.16b
117	aesmc	v0.16b, v0.16b          //AES block 0 - round 2
118	ld1	{v26.4s}, [x8], #16								  //load rk8
119
120	aese	v1.16b, v20.16b
121	aesmc	v1.16b, v1.16b          //AES block 1 - round 2
122	ldr	q13, [x3, #64]                         //load h2l | h2h
123#ifndef __AARCH64EB__
124	ext	v13.16b, v13.16b, v13.16b, #8
125#endif
126
127	aese	v3.16b, v20.16b
128	aesmc	v3.16b, v3.16b          //AES block 3 - round 2
129
130	aese	v2.16b, v20.16b
131	aesmc	v2.16b, v2.16b          //AES block 2 - round 2
132	eor	v17.16b, v17.16b, v9.16b                  //h4k | h3k
133
134	aese	v0.16b, v21.16b
135	aesmc	v0.16b, v0.16b          //AES block 0 - round 3
136
137	aese	v1.16b, v21.16b
138	aesmc	v1.16b, v1.16b          //AES block 1 - round 3
139
140	aese	v2.16b, v21.16b
141	aesmc	v2.16b, v2.16b          //AES block 2 - round 3
142	ld1	{v27.4s}, [x8], #16								  //load rk9
143
144	aese	v3.16b, v21.16b
145	aesmc	v3.16b, v3.16b          //AES block 3 - round 3
146
147	and	x5, x5, #0xffffffffffffffc0    //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
148	trn2	v16.2d,  v12.2d,    v13.2d                      //h2l | h1l
149
150	aese	v3.16b, v22.16b
151	aesmc	v3.16b, v3.16b          //AES block 3 - round 4
152	add	x5, x5, x0
153
154	aese	v2.16b, v22.16b
155	aesmc	v2.16b, v2.16b          //AES block 2 - round 4
156	cmp	x0, x5                   //check if we have <= 4 blocks
157
158	aese	v0.16b, v22.16b
159	aesmc	v0.16b, v0.16b          //AES block 0 - round 4
160
161	aese	v3.16b, v23.16b
162	aesmc	v3.16b, v3.16b          //AES block 3 - round 5
163
164	aese	v2.16b, v23.16b
165	aesmc	v2.16b, v2.16b          //AES block 2 - round 5
166
167	aese	v0.16b, v23.16b
168	aesmc	v0.16b, v0.16b          //AES block 0 - round 5
169
170	aese	v3.16b, v24.16b
171	aesmc	v3.16b, v3.16b          //AES block 3 - round 6
172
173	aese	v1.16b, v22.16b
174	aesmc	v1.16b, v1.16b          //AES block 1 - round 4
175
176	aese	v2.16b, v24.16b
177	aesmc	v2.16b, v2.16b          //AES block 2 - round 6
178	trn1	v8.2d,    v12.2d,    v13.2d                      //h2h | h1h
179
180	aese	v0.16b, v24.16b
181	aesmc	v0.16b, v0.16b          //AES block 0 - round 6
182
183	aese	v1.16b, v23.16b
184	aesmc	v1.16b, v1.16b          //AES block 1 - round 5
185
186	aese	v3.16b, v25.16b
187	aesmc	v3.16b, v3.16b          //AES block 3 - round 7
188
189	aese	v0.16b, v25.16b
190	aesmc	v0.16b, v0.16b          //AES block 0 - round 7
191
192	aese	v1.16b, v24.16b
193	aesmc	v1.16b, v1.16b          //AES block 1 - round 6
194
195	aese	v2.16b, v25.16b
196	aesmc	v2.16b, v2.16b          //AES block 2 - round 7
197
198	aese	v0.16b, v26.16b
199	aesmc	v0.16b, v0.16b          //AES block 0 - round 8
200
201	aese	v1.16b, v25.16b
202	aesmc	v1.16b, v1.16b          //AES block 1 - round 7
203
204	aese	v2.16b, v26.16b
205	aesmc	v2.16b, v2.16b          //AES block 2 - round 8
206
207	aese	v3.16b, v26.16b
208	aesmc	v3.16b, v3.16b          //AES block 3 - round 8
209
210	aese	v1.16b, v26.16b
211	aesmc	v1.16b, v1.16b          //AES block 1 - round 8
212
213	aese	v2.16b, v27.16b                                      //AES block 2 - round 9
214
215	aese	v0.16b, v27.16b                                      //AES block 0 - round 9
216
217	eor	v16.16b, v16.16b, v8.16b                     //h2k | h1k
218
219	aese	v1.16b, v27.16b                                      //AES block 1 - round 9
220
221	aese	v3.16b, v27.16b                                      //AES block 3 - round 9
222	b.ge	.L128_enc_tail                                    //handle tail
223
224	ldp	x6, x7, [x0, #0]            //AES block 0 - load plaintext
225#ifdef __AARCH64EB__
226	rev	x6, x6
227	rev	x7, x7
228#endif
229	ldp	x21, x22, [x0, #32]           //AES block 2 - load plaintext
230#ifdef __AARCH64EB__
231	rev	x21, x21
232	rev	x22, x22
233#endif
234	ldp	x19, x20, [x0, #16]           //AES block 1 - load plaintext
235#ifdef __AARCH64EB__
236	rev	x19, x19
237	rev	x20, x20
238#endif
239	ldp	x23, x24, [x0, #48]           //AES block 3 - load plaintext
240#ifdef __AARCH64EB__
241	rev	x23, x23
242	rev	x24, x24
243#endif
244	eor	x6, x6, x13                     //AES block 0 - round 10 low
245	eor	x7, x7, x14                     //AES block 0 - round 10 high
246
247	eor	x21, x21, x13                     //AES block 2 - round 10 low
248	fmov	d4, x6                               //AES block 0 - mov low
249
250	eor	x19, x19, x13                     //AES block 1 - round 10 low
251	eor	x22, x22, x14                     //AES block 2 - round 10 high
252	fmov	v4.d[1], x7                           //AES block 0 - mov high
253
254	fmov	d5, x19                               //AES block 1 - mov low
255	eor	x20, x20, x14                     //AES block 1 - round 10 high
256
257	eor	x23, x23, x13                     //AES block 3 - round 10 low
258	fmov	v5.d[1], x20                           //AES block 1 - mov high
259
260	fmov	d6, x21                               //AES block 2 - mov low
261	eor	x24, x24, x14                     //AES block 3 - round 10 high
262	rev	w9, w12                                 //CTR block 4
263
264	fmov	v6.d[1], x22                           //AES block 2 - mov high
265	orr	x9, x11, x9, lsl #32            //CTR block 4
266
267	eor	v4.16b, v4.16b, v0.16b                          //AES block 0 - result
268	fmov	d0, x10                               //CTR block 4
269	add	w12, w12, #1                            //CTR block 4
270
271	fmov	v0.d[1], x9                               //CTR block 4
272	rev	w9, w12                                 //CTR block 5
273
274	eor	v5.16b, v5.16b, v1.16b                          //AES block 1 - result
275	fmov	d1, x10                               //CTR block 5
276	orr	x9, x11, x9, lsl #32            //CTR block 5
277
278	add	w12, w12, #1                            //CTR block 5
279	add	x0, x0, #64                       //AES input_ptr update
280	fmov	v1.d[1], x9                               //CTR block 5
281
282	fmov	d7, x23                               //AES block 3 - mov low
283	rev	w9, w12                                 //CTR block 6
284	st1	{ v4.16b}, [x2], #16                     //AES block 0 - store result
285
286	fmov	v7.d[1], x24                           //AES block 3 - mov high
287	orr	x9, x11, x9, lsl #32            //CTR block 6
288
289	add	w12, w12, #1                            //CTR block 6
290	eor	v6.16b, v6.16b, v2.16b                          //AES block 2 - result
291	st1	{ v5.16b}, [x2], #16                     //AES block 1 - store result
292
293	fmov	d2, x10                               //CTR block 6
294	cmp	x0, x5                   //check if we have <= 8 blocks
295
296	fmov	v2.d[1], x9                               //CTR block 6
297	rev	w9, w12                                 //CTR block 7
298	st1	{ v6.16b}, [x2], #16                     //AES block 2 - store result
299
300	orr	x9, x11, x9, lsl #32            //CTR block 7
301
302	eor	v7.16b, v7.16b, v3.16b                          //AES block 3 - result
303	st1	{ v7.16b}, [x2], #16                     //AES block 3 - store result
304	b.ge	.L128_enc_prepretail                              //do prepretail
305
306.L128_enc_main_loop:	//main	loop start
307	ldp	x23, x24, [x0, #48]           //AES block 4k+3 - load plaintext
308#ifdef __AARCH64EB__
309	rev	x23, x23
310	rev	x24, x24
311#endif
312	rev64	v4.16b, v4.16b                                    //GHASH block 4k (only t0 is free)
313	rev64	v6.16b, v6.16b                                    //GHASH block 4k+2 (t0, t1, and t2 free)
314
315	aese	v2.16b, v18.16b
316	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 0
317	fmov	d3, x10                               //CTR block 4k+3
318
319	ext	v11.16b, v11.16b, v11.16b, #8                     //PRE 0
320	rev64	v5.16b, v5.16b                                    //GHASH block 4k+1 (t0 and t1 free)
321
322	aese	v1.16b, v18.16b
323	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 0
324	add	w12, w12, #1                            //CTR block 4k+3
325	fmov	v3.d[1], x9                               //CTR block 4k+3
326
327	aese	v0.16b, v18.16b
328	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 0
329	mov	d31, v6.d[1]                                  //GHASH block 4k+2 - mid
330
331	aese	v2.16b, v19.16b
332	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 1
333	mov	d30, v5.d[1]                                  //GHASH block 4k+1 - mid
334
335	aese	v1.16b, v19.16b
336	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 1
337	eor	v4.16b, v4.16b, v11.16b                           //PRE 1
338
339	aese	v3.16b, v18.16b
340	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 0
341	eor	x24, x24, x14                     //AES block 4k+3 - round 10 high
342
343	pmull2	v28.1q, v5.2d, v14.2d                          //GHASH block 4k+1 - high
344	eor	v31.8b, v31.8b, v6.8b                          //GHASH block 4k+2 - mid
345	ldp	x6, x7, [x0, #0]            //AES block 4k+4 - load plaintext
346#ifdef __AARCH64EB__
347	rev	x6, x6
348	rev	x7, x7
349#endif
350	aese	v0.16b, v19.16b
351	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 1
352	rev	w9, w12                                 //CTR block 4k+8
353
354	eor	v30.8b, v30.8b, v5.8b                          //GHASH block 4k+1 - mid
355	mov	d8, v4.d[1]                                  //GHASH block 4k - mid
356	orr	x9, x11, x9, lsl #32            //CTR block 4k+8
357
358	pmull2	v9.1q, v4.2d, v15.2d                       //GHASH block 4k - high
359	add	w12, w12, #1                            //CTR block 4k+8
360	mov	d10, v17.d[1]                               //GHASH block 4k - mid
361
362	aese	v0.16b, v20.16b
363	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 2
364
365	pmull	v11.1q, v4.1d, v15.1d                       //GHASH block 4k - low
366	eor	v8.8b, v8.8b, v4.8b                          //GHASH block 4k - mid
367
368	aese	v1.16b, v20.16b
369	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 2
370
371	aese	v0.16b, v21.16b
372	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 3
373	eor	v9.16b, v9.16b, v28.16b                         //GHASH block 4k+1 - high
374
375	pmull	v28.1q, v6.1d, v13.1d                          //GHASH block 4k+2 - low
376
377	pmull	v10.1q, v8.1d, v10.1d                      //GHASH block 4k - mid
378	rev64	v7.16b, v7.16b                                    //GHASH block 4k+3 (t0, t1, t2 and t3 free)
379
380	pmull	v30.1q, v30.1d, v17.1d                          //GHASH block 4k+1 - mid
381
382	pmull	v29.1q, v5.1d, v14.1d                          //GHASH block 4k+1 - low
383	ins	v31.d[1], v31.d[0]                                //GHASH block 4k+2 - mid
384
385	pmull2	v8.1q, v6.2d, v13.2d                          //GHASH block 4k+2 - high
386	eor	x7, x7, x14                     //AES block 4k+4 - round 10 high
387
388	eor	v10.16b, v10.16b, v30.16b                         //GHASH block 4k+1 - mid
389	mov	d30, v7.d[1]                                  //GHASH block 4k+3 - mid
390
391	aese	v3.16b, v19.16b
392	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 1
393	eor	v11.16b, v11.16b, v29.16b                         //GHASH block 4k+1 - low
394
395	aese	v2.16b, v20.16b
396	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 2
397	eor	x6, x6, x13                     //AES block 4k+4 - round 10 low
398
399	aese	v1.16b, v21.16b
400	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 3
401	eor	v30.8b, v30.8b, v7.8b                          //GHASH block 4k+3 - mid
402
403	pmull2	v4.1q, v7.2d, v12.2d                          //GHASH block 4k+3 - high
404
405	aese	v2.16b, v21.16b
406	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 3
407	eor	v9.16b, v9.16b, v8.16b                         //GHASH block 4k+2 - high
408
409	pmull2	v31.1q, v31.2d, v16.2d                          //GHASH block 4k+2 - mid
410
411	pmull	v29.1q, v7.1d, v12.1d                          //GHASH block 4k+3 - low
412	movi	v8.8b, #0xc2
413
414	pmull	v30.1q, v30.1d, v16.1d                          //GHASH block 4k+3 - mid
415	eor	v11.16b, v11.16b, v28.16b                         //GHASH block 4k+2 - low
416
417	aese	v1.16b, v22.16b
418	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 4
419
420	aese	v3.16b, v20.16b
421	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 2
422	shl	d8, d8, #56               //mod_constant
423
424	aese	v0.16b, v22.16b
425	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 4
426	eor	v9.16b, v9.16b, v4.16b                         //GHASH block 4k+3 - high
427
428	aese	v1.16b, v23.16b
429	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 5
430	ldp	x19, x20, [x0, #16]           //AES block 4k+5 - load plaintext
431#ifdef __AARCH64EB__
432	rev	x19, x19
433	rev	x20, x20
434#endif
435	aese	v3.16b, v21.16b
436	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 3
437	eor	v10.16b, v10.16b, v31.16b                         //GHASH block 4k+2 - mid
438
439	aese	v0.16b, v23.16b
440	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 5
441	ldp	x21, x22, [x0, #32]           //AES block 4k+6 - load plaintext
442#ifdef __AARCH64EB__
443	rev	x21, x21
444	rev	x22, x22
445#endif
446	pmull	v31.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid
447	eor	v11.16b, v11.16b, v29.16b                         //GHASH block 4k+3 - low
448
449	aese	v2.16b, v22.16b
450	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 4
451	eor	x19, x19, x13                     //AES block 4k+5 - round 10 low
452
453	aese	v3.16b, v22.16b
454	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 4
455	eor	v10.16b, v10.16b, v30.16b                         //GHASH block 4k+3 - mid
456
457	aese	v1.16b, v24.16b
458	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 6
459	eor	x23, x23, x13                     //AES block 4k+3 - round 10 low
460
461	aese	v2.16b, v23.16b
462	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 5
463	eor	v30.16b, v11.16b, v9.16b                         //MODULO - karatsuba tidy up
464
465	fmov	d4, x6                               //AES block 4k+4 - mov low
466	aese	v0.16b, v24.16b
467	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 6
468	fmov	v4.d[1], x7                           //AES block 4k+4 - mov high
469
470	add	x0, x0, #64                       //AES input_ptr update
471	fmov	d7, x23                               //AES block 4k+3 - mov low
472	ext	v9.16b, v9.16b, v9.16b, #8                     //MODULO - other top alignment
473
474	aese	v3.16b, v23.16b
475	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 5
476	fmov	d5, x19                               //AES block 4k+5 - mov low
477
478	aese	v0.16b, v25.16b
479	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 7
480	eor	v10.16b, v10.16b, v30.16b                         //MODULO - karatsuba tidy up
481
482	aese	v2.16b, v24.16b
483	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 6
484	eor	x20, x20, x14                     //AES block 4k+5 - round 10 high
485
486	aese	v1.16b, v25.16b
487	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 7
488	fmov	v5.d[1], x20                           //AES block 4k+5 - mov high
489
490	aese	v0.16b, v26.16b
491	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 8
492	fmov	v7.d[1], x24                           //AES block 4k+3 - mov high
493
494	aese	v3.16b, v24.16b
495	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 6
496	cmp	x0, x5                   //.LOOP CONTROL
497
498	aese	v1.16b, v26.16b
499	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 8
500	eor	v10.16b, v10.16b, v31.16b                      //MODULO - fold into mid
501
502	aese	v0.16b, v27.16b                                      //AES block 4k+4 - round 9
503	eor	x21, x21, x13                     //AES block 4k+6 - round 10 low
504	eor	x22, x22, x14                     //AES block 4k+6 - round 10 high
505
506	aese	v3.16b, v25.16b
507	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 7
508	fmov	d6, x21                               //AES block 4k+6 - mov low
509
510	aese	v1.16b, v27.16b                                      //AES block 4k+5 - round 9
511	fmov	v6.d[1], x22                           //AES block 4k+6 - mov high
512
513	aese	v2.16b, v25.16b
514	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 7
515	eor	v4.16b, v4.16b, v0.16b                          //AES block 4k+4 - result
516
517	fmov	d0, x10                               //CTR block 4k+8
518	aese	v3.16b, v26.16b
519	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 8
520
521	fmov	v0.d[1], x9                               //CTR block 4k+8
522	rev	w9, w12                                 //CTR block 4k+9
523	eor	v10.16b, v10.16b, v9.16b                         //MODULO - fold into mid
524
525	aese	v2.16b, v26.16b
526	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 8
527	eor	v5.16b, v5.16b, v1.16b                          //AES block 4k+5 - result
528
529	add	w12, w12, #1                            //CTR block 4k+9
530	orr	x9, x11, x9, lsl #32            //CTR block 4k+9
531	fmov	d1, x10                               //CTR block 4k+9
532
533	pmull	v9.1q, v10.1d, v8.1d            //MODULO - mid 64b align with low
534	fmov	v1.d[1], x9                               //CTR block 4k+9
535	rev	w9, w12                                 //CTR block 4k+10
536
537	aese	v2.16b, v27.16b                                      //AES block 4k+6 - round 9
538	st1	{ v4.16b}, [x2], #16                     //AES block 4k+4 - store result
539	eor	v6.16b, v6.16b, v2.16b                          //AES block 4k+6 - result
540	orr	x9, x11, x9, lsl #32            //CTR block 4k+10
541
542	aese	v3.16b, v27.16b                                      //AES block 4k+7 - round 9
543	add	w12, w12, #1                            //CTR block 4k+10
544	ext	v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment
545	fmov	d2, x10                               //CTR block 4k+10
546
547	eor	v11.16b, v11.16b, v9.16b                         //MODULO - fold into low
548	st1	{ v5.16b}, [x2], #16                     //AES block 4k+5 - store result
549
550	fmov	v2.d[1], x9                               //CTR block 4k+10
551	st1	{ v6.16b}, [x2], #16                     //AES block 4k+6 - store result
552	rev	w9, w12                                 //CTR block 4k+11
553
554	orr	x9, x11, x9, lsl #32            //CTR block 4k+11
555	eor	v7.16b, v7.16b, v3.16b                          //AES block 4k+3 - result
556
557	eor	v11.16b, v11.16b, v10.16b                         //MODULO - fold into low
558	st1	{ v7.16b}, [x2], #16                     //AES block 4k+3 - store result
559	b.lt	.L128_enc_main_loop
560
561.L128_enc_prepretail:	//PREPRETAIL
562	rev64	v4.16b, v4.16b                                    //GHASH block 4k (only t0 is free)
563	fmov	d3, x10                               //CTR block 4k+3
564	rev64	v5.16b, v5.16b                                    //GHASH block 4k+1 (t0 and t1 free)
565
566	ext	v11.16b, v11.16b, v11.16b, #8                     //PRE 0
567	add	w12, w12, #1                            //CTR block 4k+3
568	fmov	v3.d[1], x9                               //CTR block 4k+3
569
570	aese	v1.16b, v18.16b
571	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 0
572	rev64	v6.16b, v6.16b                                    //GHASH block 4k+2 (t0, t1, and t2 free)
573
574	pmull	v29.1q, v5.1d, v14.1d                          //GHASH block 4k+1 - low
575
576	rev64	v7.16b, v7.16b                                    //GHASH block 4k+3 (t0, t1, t2 and t3 free)
577	eor	v4.16b, v4.16b, v11.16b                           //PRE 1
578
579	pmull2	v28.1q, v5.2d, v14.2d                          //GHASH block 4k+1 - high
580
581	aese	v3.16b, v18.16b
582	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 0
583	mov	d30, v5.d[1]                                  //GHASH block 4k+1 - mid
584
585	pmull	v11.1q, v4.1d, v15.1d                       //GHASH block 4k - low
586	mov	d8, v4.d[1]                                  //GHASH block 4k - mid
587
588	mov	d31, v6.d[1]                                  //GHASH block 4k+2 - mid
589	mov	d10, v17.d[1]                               //GHASH block 4k - mid
590
591	aese	v1.16b, v19.16b
592	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 1
593	eor	v30.8b, v30.8b, v5.8b                          //GHASH block 4k+1 - mid
594
595	eor	v8.8b, v8.8b, v4.8b                          //GHASH block 4k - mid
596
597	pmull2	v9.1q, v4.2d, v15.2d                       //GHASH block 4k - high
598	eor	v31.8b, v31.8b, v6.8b                          //GHASH block 4k+2 - mid
599
600	aese	v3.16b, v19.16b
601	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 1
602
603	pmull	v30.1q, v30.1d, v17.1d                          //GHASH block 4k+1 - mid
604	eor	v11.16b, v11.16b, v29.16b                         //GHASH block 4k+1 - low
605
606	pmull	v10.1q, v8.1d, v10.1d                      //GHASH block 4k - mid
607
608	aese	v0.16b, v18.16b
609	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 0
610	ins	v31.d[1], v31.d[0]                                //GHASH block 4k+2 - mid
611
612	aese	v2.16b, v18.16b
613	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 0
614
615	eor	v10.16b, v10.16b, v30.16b                         //GHASH block 4k+1 - mid
616	mov	d30, v7.d[1]                                  //GHASH block 4k+3 - mid
617
618	aese	v0.16b, v19.16b
619	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 1
620	eor	v9.16b, v9.16b, v28.16b                         //GHASH block 4k+1 - high
621
622	pmull2	v31.1q, v31.2d, v16.2d                          //GHASH block 4k+2 - mid
623
624	pmull2	v8.1q, v6.2d, v13.2d                          //GHASH block 4k+2 - high
625	eor	v30.8b, v30.8b, v7.8b                          //GHASH block 4k+3 - mid
626
627	pmull2	v4.1q, v7.2d, v12.2d                          //GHASH block 4k+3 - high
628
629	pmull	v28.1q, v6.1d, v13.1d                          //GHASH block 4k+2 - low
630
631	aese	v2.16b, v19.16b
632	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 1
633	eor	v9.16b, v9.16b, v8.16b                         //GHASH block 4k+2 - high
634
635	aese	v0.16b, v20.16b
636	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 2
637
638	pmull	v29.1q, v7.1d, v12.1d                          //GHASH block 4k+3 - low
639	movi	v8.8b, #0xc2
640
641	aese	v2.16b, v20.16b
642	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 2
643	eor	v11.16b, v11.16b, v28.16b                         //GHASH block 4k+2 - low
644
645	aese	v3.16b, v20.16b
646	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 2
647
648	pmull	v30.1q, v30.1d, v16.1d                          //GHASH block 4k+3 - mid
649	eor	v10.16b, v10.16b, v31.16b                         //GHASH block 4k+2 - mid
650
651	aese	v2.16b, v21.16b
652	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 3
653
654	aese	v1.16b, v20.16b
655	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 2
656	eor	v9.16b, v9.16b, v4.16b                         //GHASH block 4k+3 - high
657
658	aese	v0.16b, v21.16b
659	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 3
660
661	eor	v10.16b, v10.16b, v30.16b                         //GHASH block 4k+3 - mid
662	shl	d8, d8, #56               //mod_constant
663
664	aese	v1.16b, v21.16b
665	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 3
666	eor	v11.16b, v11.16b, v29.16b                         //GHASH block 4k+3 - low
667
668	aese	v0.16b, v22.16b
669	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 4
670
671	pmull	v28.1q, v9.1d, v8.1d
672	eor	v10.16b, v10.16b, v9.16b                         //karatsuba tidy up
673
674	aese	v1.16b, v22.16b
675	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 4
676
677	aese	v0.16b, v23.16b
678	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 5
679	ext	v9.16b, v9.16b, v9.16b, #8
680
681	aese	v3.16b, v21.16b
682	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 3
683
684	aese	v2.16b, v22.16b
685	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 4
686	eor	v10.16b, v10.16b, v11.16b
687
688	aese	v0.16b, v24.16b
689	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 6
690
691	aese	v3.16b, v22.16b
692	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 4
693
694	aese	v1.16b, v23.16b
695	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 5
696
697	aese	v2.16b, v23.16b
698	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 5
699	eor	v10.16b, v10.16b, v28.16b
700
701	aese	v3.16b, v23.16b
702	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 5
703
704	aese	v1.16b, v24.16b
705	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 6
706
707	aese	v2.16b, v24.16b
708	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 6
709
710	aese	v3.16b, v24.16b
711	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 6
712	eor	v10.16b, v10.16b, v9.16b
713
714	aese	v0.16b, v25.16b
715	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 7
716
717	aese	v2.16b, v25.16b
718	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 7
719
720	aese	v3.16b, v25.16b
721	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 7
722
723	pmull	v28.1q, v10.1d, v8.1d
724
725	aese	v1.16b, v25.16b
726	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 7
727	ext	v10.16b, v10.16b, v10.16b, #8
728
729	aese	v3.16b, v26.16b
730	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 8
731
732	aese	v0.16b, v26.16b
733	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 8
734	eor	v11.16b, v11.16b, v28.16b
735
736	aese	v1.16b, v26.16b
737	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 8
738
739	aese	v3.16b, v27.16b                                      //AES block 4k+7 - round 9
740
741	aese	v2.16b, v26.16b
742	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 8
743
744	aese	v0.16b, v27.16b                                      //AES block 4k+4 - round 9
745
746	aese	v1.16b, v27.16b                                      //AES block 4k+5 - round 9
747	eor	v11.16b, v11.16b, v10.16b
748
749	aese	v2.16b, v27.16b                                      //AES block 4k+6 - round 9
750.L128_enc_tail:	//TAIL
751
752	sub	x5, x4, x0   //main_end_input_ptr is number of bytes left to process
753	ldp	x6, x7, [x0], #16           //AES block 4k+4 - load plaintext
754#ifdef __AARCH64EB__
755	rev	x6, x6
756	rev	x7, x7
757#endif
758	cmp	x5, #48
759
760	ext	v8.16b, v11.16b, v11.16b, #8                     //prepare final partial tag
761	eor	x6, x6, x13                     //AES block 4k+4 - round 10 low
762	eor	x7, x7, x14                     //AES block 4k+4 - round 10 high
763
764	fmov	d4, x6                               //AES block 4k+4 - mov low
765
766	fmov	v4.d[1], x7                           //AES block 4k+4 - mov high
767
768	eor	v5.16b, v4.16b, v0.16b                          //AES block 4k+4 - result
769
770	b.gt	.L128_enc_blocks_more_than_3
771
772	sub	w12, w12, #1
773	movi	v11.8b, #0
774	mov	v3.16b, v2.16b
775
776	cmp	x5, #32
777	mov	v2.16b, v1.16b
778	movi	v9.8b, #0
779
780	movi	v10.8b, #0
781	b.gt	.L128_enc_blocks_more_than_2
782
783	mov	v3.16b, v1.16b
784	cmp	x5, #16
785
786	sub	w12, w12, #1
787	b.gt	.L128_enc_blocks_more_than_1
788
789	sub	w12, w12, #1
790	b	.L128_enc_blocks_less_than_1
791.L128_enc_blocks_more_than_3:	//blocks	left >  3
// Reached only by the b.gt in the tail entry (more than 48 bytes left).
// Stores the block result held in v5, loads the next input block and XORs it
// with rk10 (x13/x14) and v1, and starts the GHASH accumulation: the stored
// block (byte-reversed, with the saved tag in v8 XORed in) is multiplied by
// v15 into v11 (low) and v9 (high), and its folded halves by the upper half of
// v17 into v10 (mid). v8 is then zeroed so the tag is only fed in once.
// Falls through into .L128_enc_blocks_more_than_2.
792	st1	{ v5.16b}, [x2], #16                     //AES final-3 block  - store result
793
794	ldp	x6, x7, [x0], #16           //AES final-2 block - load input low & high
795#ifdef __AARCH64EB__
796	rev	x6, x6
797	rev	x7, x7
798#endif
799	rev64	v4.16b, v5.16b                                    //GHASH final-3 block
800
801	eor	v4.16b, v4.16b, v8.16b                           //feed in partial tag
802	eor	x7, x7, x14                     //AES final-2 block - round 10 high
803	eor	x6, x6, x13                     //AES final-2 block - round 10 low
804
805	fmov	d5, x6                                 //AES final-2 block - mov low
806
807	movi	v8.8b, #0                                        //suppress further partial tag feed in
808	fmov	v5.d[1], x7                             //AES final-2 block - mov high
809
810	pmull	v11.1q, v4.1d, v15.1d                       //GHASH final-3 block - low
811	mov	d22, v4.d[1]                                 //GHASH final-3 block - mid
812
813	pmull2	v9.1q, v4.2d, v15.2d                       //GHASH final-3 block - high
814
815	mov	d10, v17.d[1]                               //GHASH final-3 block - mid
816
817	eor	v5.16b, v5.16b, v1.16b                            //AES final-2 block - result
818	eor	v22.8b, v22.8b, v4.8b                      //GHASH final-3 block - mid
819
820	pmull	v10.1q, v22.1d, v10.1d                    //GHASH final-3 block - mid
821.L128_enc_blocks_more_than_2:	//blocks	left >  2
// Entered by fall-through from the section above or by the b.gt in the tail
// entry (more than 32 bytes left). Stores the block result held in v5, loads
// the next input block and XORs it with rk10 (x13/x14) and v2, and folds the
// stored block into the GHASH accumulators: products with v14 are XORed into
// v11 (low) and v9 (high), and the product of the folded halves with the low
// half of v17 into v10 (mid). v8 is non-zero only if this is the first block
// hashed in the tail; it is zeroed afterwards.
// Falls through into .L128_enc_blocks_more_than_1.
822
823	st1	{ v5.16b}, [x2], #16                     //AES final-2 block - store result
824
825	rev64	v4.16b, v5.16b                                    //GHASH final-2 block
826	ldp	x6, x7, [x0], #16           //AES final-1 block - load input low & high
827#ifdef __AARCH64EB__
828	rev	x6, x6
829	rev	x7, x7
830#endif
831	eor	v4.16b, v4.16b, v8.16b                           //feed in partial tag
832
833	eor	x6, x6, x13                     //AES final-1 block - round 10 low
834
835	fmov	d5, x6                                 //AES final-1 block - mov low
836	eor	x7, x7, x14                     //AES final-1 block - round 10 high
837
838	pmull2	v20.1q, v4.2d, v14.2d                          //GHASH final-2 block - high
839	fmov	v5.d[1], x7                             //AES final-1 block - mov high
840
841	mov	d22, v4.d[1]                                 //GHASH final-2 block - mid
842
843	pmull	v21.1q, v4.1d, v14.1d                          //GHASH final-2 block - low
844
845	eor	v9.16b, v9.16b, v20.16b                            //GHASH final-2 block - high
846
847	eor	v22.8b, v22.8b, v4.8b                      //GHASH final-2 block - mid
848
849	eor	v5.16b, v5.16b, v2.16b                            //AES final-1 block - result
850
851	eor	v11.16b, v11.16b, v21.16b                            //GHASH final-2 block - low
852
853	pmull	v22.1q, v22.1d, v17.1d                      //GHASH final-2 block - mid
854
855	movi	v8.8b, #0                                        //suppress further partial tag feed in
856
857	eor	v10.16b, v10.16b, v22.16b                       //GHASH final-2 block - mid
858.L128_enc_blocks_more_than_1:	//blocks	left >  1
// Entered by fall-through from the section above or by the b.gt in the tail
// entry (more than 16 bytes left). Stores the block result held in v5, loads
// the last input block and XORs it with rk10 (x13/x14) and v3, and folds the
// stored block into the GHASH accumulators: products with v13 are XORed into
// v11 (low) and v9 (high). For the mid term the folded 64-bit value is copied
// into the upper lane of v22 so that pmull2 can multiply it by the upper half
// of v16; the product is XORed into v10.
// Falls through into .L128_enc_blocks_less_than_1.
859
860	st1	{ v5.16b}, [x2], #16                     //AES final-1 block - store result
861
862	rev64	v4.16b, v5.16b                                    //GHASH final-1 block
863	ldp	x6, x7, [x0], #16           //AES final block - load input low & high
864#ifdef __AARCH64EB__
865	rev	x6, x6
866	rev	x7, x7
867#endif
868	eor	v4.16b, v4.16b, v8.16b                           //feed in partial tag
869
870	eor	x7, x7, x14                     //AES final block - round 10 high
871	eor	x6, x6, x13                     //AES final block - round 10 low
872
873	fmov	d5, x6                                 //AES final block - mov low
874
875	pmull2	v20.1q, v4.2d, v13.2d                          //GHASH final-1 block - high
876	fmov	v5.d[1], x7                             //AES final block - mov high
877
878	mov	d22, v4.d[1]                                 //GHASH final-1 block - mid
879
880	pmull	v21.1q, v4.1d, v13.1d                          //GHASH final-1 block - low
881
882	eor	v22.8b, v22.8b, v4.8b                      //GHASH final-1 block - mid
883
884	eor	v5.16b, v5.16b, v3.16b                            //AES final block - result
885
886	ins	v22.d[1], v22.d[0]                            //GHASH final-1 block - mid
887
888	pmull2	v22.1q, v22.2d, v16.2d                      //GHASH final-1 block - mid
889
890	eor	v11.16b, v11.16b, v21.16b                            //GHASH final-1 block - low
891
892	eor	v9.16b, v9.16b, v20.16b                            //GHASH final-1 block - high
893
894	eor	v10.16b, v10.16b, v22.16b                       //GHASH final-1 block - mid
895	movi	v8.8b, #0                                        //suppress further partial tag feed in
896.L128_enc_blocks_less_than_1:	//blocks	left <= 1
// Final (possibly partial) block, GHASH reduction, write-back and return.
//  1. From the bit length in x1, compute the number of unused bits in the last
//     block: (128 - (x1 mod 128)) mod 128, i.e. 0..127. x13/x14 no longer hold
//     rk10 from here on; they are set to all-ones and used to build a 128-bit
//     mask in v0 with ones over the valid bits of the block.
//  2. Mask the block result in v5, fold it into the GHASH accumulators
//     (products with v12, mid product with the low half of v16).
//  3. Reduce v9:v10:v11 using the constant built by movi #0xc2 / shl #56.
//  4. Merge the masked result with the bytes already at [x2] and store all 16
//     bytes; store the counter word at [x16, #12]; store the tag at [x3].
//  5. Return the byte length saved in x15 at entry and restore the registers
//     pushed in the prologue.
897
898	and	x1, x1, #127                    //bit_length %= 128
899	mvn	x13, xzr                                      //rk10_l = 0xffffffffffffffff
900
901	mvn	x14, xzr                                      //rk10_h = 0xffffffffffffffff
902	sub	x1, x1, #128                    //bit_length -= 128
903
904	neg	x1, x1                          //bit_length = 128 - #bits in input (in range [1,128])
905
906	and	x1, x1, #127                    //bit_length %= 128
907
908	lsr	x14, x14, x1                     //rk10_h is mask for top 64b of last block
909	cmp	x1, #64                         //a register-specified shift only uses x1 mod 64, so the two cases are told apart here
910
911	csel	x6, x13, x14, lt                //low 64b of mask: all ones if fewer than 64 bits are unused, else the shifted value
912	csel	x7, x14, xzr, lt                //high 64b of mask: the shifted value if fewer than 64 bits are unused, else zero
913
914	fmov	d0, x6                                 //ctr0b is mask for last block
915
916	fmov	v0.d[1], x7
917
918	and	v5.16b, v5.16b, v0.16b                            //possibly partial last block has zeroes in highest bits
919
920	rev64	v4.16b, v5.16b                                    //GHASH final block
921
922	eor	v4.16b, v4.16b, v8.16b                           //feed in partial tag
923
924	mov	d8, v4.d[1]                                  //GHASH final block - mid
925
926	pmull	v21.1q, v4.1d, v12.1d                          //GHASH final block - low
927	ld1	{ v18.16b}, [x2]                            //load existing bytes where the possibly partial last block is to be stored
928
929	eor	v8.8b, v8.8b, v4.8b                          //GHASH final block - mid
930#ifndef __AARCH64EB__
931	rev	w9, w12
932#else
933	mov	w9, w12
934#endif
935	pmull2	v20.1q, v4.2d, v12.2d                          //GHASH final block - high
936
937	pmull	v8.1q, v8.1d, v16.1d                          //GHASH final block - mid
938
939	eor	v11.16b, v11.16b, v21.16b                            //GHASH final block - low
940
941	eor	v9.16b, v9.16b, v20.16b                            //GHASH final block - high
942
943	eor	v10.16b, v10.16b, v8.16b                         //GHASH final block - mid
944	movi	v8.8b, #0xc2
945
946	eor	v30.16b, v11.16b, v9.16b                         //MODULO - karatsuba tidy up
947
948	shl	d8, d8, #56               //mod_constant
949
950	eor	v10.16b, v10.16b, v30.16b                         //MODULO - karatsuba tidy up
951
952	pmull	v31.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid
953
954	ext	v9.16b, v9.16b, v9.16b, #8                     //MODULO - other top alignment
955
956	eor	v10.16b, v10.16b, v31.16b                      //MODULO - fold into mid
957
958	eor	v10.16b, v10.16b, v9.16b                         //MODULO - fold into mid
959
960	pmull	v9.1q, v10.1d, v8.1d            //MODULO - mid 64b align with low
961
962	ext	v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment
963
964	bif	v5.16b, v18.16b, v0.16b                              //insert existing bytes in top end of result before storing
965
966	eor	v11.16b, v11.16b, v9.16b                         //MODULO - fold into low
967	st1	{ v5.16b}, [x2]                          //store all 16B
968
969	str	w9, [x16, #12]                          //store the updated counter
970
971	eor	v11.16b, v11.16b, v10.16b                         //MODULO - fold into low
972	ext	v11.16b, v11.16b, v11.16b, #8                     //same ext/rev64 pair that was applied when the tag was loaded at entry
973	rev64	v11.16b, v11.16b
974	mov	x0, x15                                 //return value: byte_len saved in x15 at entry
975	st1	{ v11.16b }, [x3]                       //write the updated tag back to [x3]
976	ldp	x21, x22, [sp, #16]                     //restore the registers saved by the prologue
977	ldp	x23, x24, [sp, #32]
978	ldp	d8, d9, [sp, #48]
979	ldp	d10, d11, [sp, #64]
980	ldp	d12, d13, [sp, #80]
981	ldp	d14, d15, [sp, #96]
982	ldp	x19, x20, [sp], #112                    //restore x19/x20 and release the 112-byte frame
983	ret
984
985.L128_enc_ret:
// Early exit taken by the cbz at function entry when x1 (bit length) is zero.
// The branch happens before anything is pushed, so there is nothing to restore.
986	mov	w0, #0x0                                //return 0
987	ret
988.size	aes_gcm_enc_128_kernel,.-aes_gcm_enc_128_kernel
989.globl	aes_gcm_dec_128_kernel
990.type	aes_gcm_dec_128_kernel,%function
991.align	4
992aes_gcm_dec_128_kernel:
// Entry and setup of the AES-128-GCM decrypt kernel (the tail sections follow
// after the main loop and are not described here).
// Register use as established by the code below:
//   x0  input pointer (ciphertext is loaded from [x0])
//   x1  length in bits (byte_len = x1 >> 3 is kept in x15)
//   x2  output pointer (results are stored through [x2])
//   x3  pointer to the running tag at offset 0 and to the hash-key values
//       loaded from offsets 32 (h1), 64 (h2), 80 (h3) and 112 (h4)
//   x4  pointer to the 16-byte counter block (copied to x16)
//   x5  pointer to the round keys (copied to x8); rk0-rk9 are loaded into
//       v18-v27 and rk10 into x13/x14
// After setup: x10 and x11 hold the two loaded counter doublewords (x11 with
// its upper half cleared), w12 is the byte-reversed 32-bit counter, v0-v3 are
// counter blocks 0-3 run through AES rounds 0-9, v11 is the tag, v12-v15 are
// h1-h4, v16 is "h2k | h1k", v17 is "h4k | h3k", x4 is end_input_ptr and x5
// is the input address at which the main loop stops.
993	cbz	x1, .L128_dec_ret                       //zero bit length: leave before touching the stack
994	stp	x19, x20, [sp, #-112]!                  //allocate a 112-byte frame and save x19-x24, d8-d15
995	mov	x16, x4                                 //x16 = counter block pointer
996	mov	x8, x5                                  //x8 = round key pointer
997	stp	x21, x22, [sp, #16]
998	stp	x23, x24, [sp, #32]
999	stp	d8, d9, [sp, #48]
1000	stp	d10, d11, [sp, #64]
1001	stp	d12, d13, [sp, #80]
1002	stp	d14, d15, [sp, #96]
1003
1004	lsr	x5, x1, #3              //byte_len
1005	mov	x15, x5                                 //keep byte_len in x15
1006	ldp	x10, x11, [x16]              //ctr96_b64, ctr96_t32
1007#ifdef __AARCH64EB__
1008	rev	x10, x10
1009	rev	x11, x11
1010#endif
1011	ldp	x13, x14, [x8, #160]                     //load rk10
1012#ifdef __AARCH64EB__
1013	ror	x14, x14, 32
1014	ror	x13, x13, 32
1015#endif
1016	sub	x5, x5, #1      //byte_len - 1
1017	ld1	{v18.4s}, [x8], #16                                //load rk0
1018
1019	and	x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
1020	ld1	{ v0.16b}, [x16]                             //special case vector load initial counter so we can start first AES block as quickly as possible
1021
1022	ldr	q13, [x3, #64]                         //load h2l | h2h
1023#ifndef __AARCH64EB__
1024	ext	v13.16b, v13.16b, v13.16b, #8
1025#endif
1026	lsr	x12, x11, #32                           //x12 = upper 32 bits of x11 (the 32-bit counter word)
1027	fmov	d2, x10                               //CTR block 2
1028
1029	ld1	{v19.4s}, [x8], #16                                //load rk1
1030	orr	w11, w11, w11                           //writing w11 clears the upper 32 bits of x11 (already copied to x12)
1031	rev	w12, w12                                //rev_ctr32
1032
1033	fmov	d1, x10                               //CTR block 1
1034	add	w12, w12, #1                            //increment rev_ctr32
1035
1036	aese	v0.16b, v18.16b
1037	aesmc	v0.16b, v0.16b          //AES block 0 - round 0
1038	rev	w9, w12                                 //CTR block 1
1039
1040	orr	x9, x11, x9, lsl #32            //CTR block 1
1041	ld1	{v20.4s}, [x8], #16                                //load rk2
1042	add	w12, w12, #1                            //CTR block 1
1043
1044	fmov	v1.d[1], x9                               //CTR block 1
1045	rev	w9, w12                                 //CTR block 2
1046	add	w12, w12, #1                            //CTR block 2
1047
1048	aese	v0.16b, v19.16b
1049	aesmc	v0.16b, v0.16b          //AES block 0 - round 1
1050	orr	x9, x11, x9, lsl #32            //CTR block 2
1051
1052	fmov	v2.d[1], x9                               //CTR block 2
1053	rev	w9, w12                                 //CTR block 3
1054
1055	fmov	d3, x10                               //CTR block 3
1056	orr	x9, x11, x9, lsl #32            //CTR block 3
1057	add	w12, w12, #1                            //CTR block 3
1058
1059	fmov	v3.d[1], x9                               //CTR block 3
1060	add	x4, x0, x1, lsr #3   //end_input_ptr
1061
1062	aese	v1.16b, v18.16b
1063	aesmc	v1.16b, v1.16b          //AES block 1 - round 0
1064	ld1	{v21.4s}, [x8], #16                                //load rk3
1065
1066	aese	v0.16b, v20.16b
1067	aesmc	v0.16b, v0.16b          //AES block 0 - round 2
1068	ld1	{v22.4s}, [x8], #16                                //load rk4
1069
1070	aese	v2.16b, v18.16b
1071	aesmc	v2.16b, v2.16b          //AES block 2 - round 0
1072	ld1	{v23.4s}, [x8], #16                                //load rk5
1073
1074	aese	v1.16b, v19.16b
1075	aesmc	v1.16b, v1.16b          //AES block 1 - round 1
1076	ld1	{v24.4s}, [x8], #16                                //load rk6
1077
1078	aese	v3.16b, v18.16b
1079	aesmc	v3.16b, v3.16b          //AES block 3 - round 0
1080
1081	aese	v2.16b, v19.16b
1082	aesmc	v2.16b, v2.16b          //AES block 2 - round 1
1083
1084	aese	v1.16b, v20.16b
1085	aesmc	v1.16b, v1.16b          //AES block 1 - round 2
1086
1087	aese	v3.16b, v19.16b
1088	aesmc	v3.16b, v3.16b          //AES block 3 - round 1
1089	ld1	{ v11.16b}, [x3]                        //load the running tag from [x3]
1090	ext	v11.16b, v11.16b, v11.16b, #8
1091	rev64	v11.16b, v11.16b
1092
1093	aese	v0.16b, v21.16b
1094	aesmc	v0.16b, v0.16b          //AES block 0 - round 3
1095	ld1	{v25.4s}, [x8], #16                                //load rk7
1096
1097	aese	v1.16b, v21.16b
1098	aesmc	v1.16b, v1.16b          //AES block 1 - round 3
1099
1100	aese	v3.16b, v20.16b
1101	aesmc	v3.16b, v3.16b          //AES block 3 - round 2
1102
1103	aese	v2.16b, v20.16b
1104	aesmc	v2.16b, v2.16b          //AES block 2 - round 2
1105	ld1	{v26.4s}, [x8], #16                                //load rk8
1106
1107	aese	v1.16b, v22.16b
1108	aesmc	v1.16b, v1.16b          //AES block 1 - round 4
1109
1110	aese	v3.16b, v21.16b
1111	aesmc	v3.16b, v3.16b          //AES block 3 - round 3
1112
1113	aese	v2.16b, v21.16b
1114	aesmc	v2.16b, v2.16b          //AES block 2 - round 3
1115	ldr	q14, [x3, #80]                         //load h3l | h3h
1116#ifndef __AARCH64EB__
1117	ext	v14.16b, v14.16b, v14.16b, #8
1118#endif
1119	aese	v0.16b, v22.16b
1120	aesmc	v0.16b, v0.16b          //AES block 0 - round 4
1121	ld1	{v27.4s}, [x8], #16                                //load rk9
1122
1123	aese	v1.16b, v23.16b
1124	aesmc	v1.16b, v1.16b          //AES block 1 - round 5
1125
1126	aese	v2.16b, v22.16b
1127	aesmc	v2.16b, v2.16b          //AES block 2 - round 4
1128
1129	aese	v3.16b, v22.16b
1130	aesmc	v3.16b, v3.16b          //AES block 3 - round 4
1131
1132	aese	v0.16b, v23.16b
1133	aesmc	v0.16b, v0.16b          //AES block 0 - round 5
1134
1135	aese	v2.16b, v23.16b
1136	aesmc	v2.16b, v2.16b          //AES block 2 - round 5
1137	ldr	q12, [x3, #32]                         //load h1l | h1h
1138#ifndef __AARCH64EB__
1139	ext	v12.16b, v12.16b, v12.16b, #8
1140#endif
1141	aese	v3.16b, v23.16b
1142	aesmc	v3.16b, v3.16b          //AES block 3 - round 5
1143
1144	aese	v0.16b, v24.16b
1145	aesmc	v0.16b, v0.16b          //AES block 0 - round 6
1146
1147	aese	v1.16b, v24.16b
1148	aesmc	v1.16b, v1.16b          //AES block 1 - round 6
1149
1150	aese	v3.16b, v24.16b
1151	aesmc	v3.16b, v3.16b          //AES block 3 - round 6
1152
1153	aese	v2.16b, v24.16b
1154	aesmc	v2.16b, v2.16b          //AES block 2 - round 6
1155	trn1	v8.2d,    v12.2d,    v13.2d                      //h2h | h1h
1156
1157	ldr	q15, [x3, #112]                        //load h4l | h4h
1158#ifndef __AARCH64EB__
1159	ext	v15.16b, v15.16b, v15.16b, #8
1160#endif
1161	trn2	v16.2d,  v12.2d,    v13.2d                      //h2l | h1l
1162	add	x5, x5, x0                              //x5 = input address at which the main loop ends
1163
1164	aese	v1.16b, v25.16b
1165	aesmc	v1.16b, v1.16b          //AES block 1 - round 7
1166
1167	aese	v2.16b, v25.16b
1168	aesmc	v2.16b, v2.16b          //AES block 2 - round 7
1169
1170	aese	v0.16b, v25.16b
1171	aesmc	v0.16b, v0.16b          //AES block 0 - round 7
1172	eor	v16.16b, v16.16b, v8.16b                     //h2k | h1k
1173
1174	aese	v3.16b, v25.16b
1175	aesmc	v3.16b, v3.16b          //AES block 3 - round 7
1176
1177	aese	v1.16b, v26.16b
1178	aesmc	v1.16b, v1.16b          //AES block 1 - round 8
1179	trn2	v17.2d,  v14.2d,    v15.2d                      //h4l | h3l
1180
1181	aese	v2.16b, v26.16b
1182	aesmc	v2.16b, v2.16b          //AES block 2 - round 8
1183
1184	aese	v3.16b, v26.16b
1185	aesmc	v3.16b, v3.16b          //AES block 3 - round 8
1186
1187	aese	v0.16b, v26.16b
1188	aesmc	v0.16b, v0.16b          //AES block 0 - round 8
1189	trn1	v9.2d, v14.2d,    v15.2d                      //h4h | h3h
1190
1191	aese	v2.16b, v27.16b                                      //AES block 2 - round 9
1192
1193	aese	v3.16b, v27.16b                                      //AES block 3 - round 9
1194
1195	aese	v0.16b, v27.16b                                      //AES block 0 - round 9
1196	cmp	x0, x5                   //check if we have <= 4 blocks
1197
1198	aese	v1.16b, v27.16b                                      //AES block 1 - round 9
1199	eor	v17.16b, v17.16b, v9.16b                  //h4k | h3k
1200	b.ge	.L128_dec_tail                                    //handle tail
1201
1202	ld1	{v4.16b, v5.16b}, [x0], #32               //AES block 0 - load ciphertext; AES block 1 - load ciphertext
1203
1204	eor	v1.16b, v5.16b, v1.16b                            //AES block 1 - result
1205	ld1	{v6.16b}, [x0], #16                       //AES block 2 - load ciphertext
1206
1207	eor	v0.16b, v4.16b, v0.16b                            //AES block 0 - result
1208	rev64	v4.16b, v4.16b                                    //GHASH block 0
1209	rev	w9, w12                                 //CTR block 4
1210
1211	orr	x9, x11, x9, lsl #32            //CTR block 4
1212	add	w12, w12, #1                            //CTR block 4
1213	ld1	{v7.16b}, [x0], #16                       //AES block 3 - load ciphertext
1214
1215	rev64	v5.16b, v5.16b                                    //GHASH block 1
1216	mov	x19, v1.d[0]                            //AES block 1 - mov low
1217
1218	mov	x20, v1.d[1]                            //AES block 1 - mov high
1219
1220	mov	x6, v0.d[0]                            //AES block 0 - mov low
1221	cmp	x0, x5                   //check if we have <= 8 blocks
1222
1223	mov	x7, v0.d[1]                            //AES block 0 - mov high
1224
1225	fmov	d0, x10                               //CTR block 4
1226
1227	fmov	v0.d[1], x9                               //CTR block 4
1228	rev	w9, w12                                 //CTR block 5
1229	eor	x19, x19, x13                   //AES block 1 - round 10 low
1230#ifdef __AARCH64EB__
1231	rev	x19, x19
1232#endif
1233	fmov	d1, x10                               //CTR block 5
1234	add	w12, w12, #1                            //CTR block 5
1235	orr	x9, x11, x9, lsl #32            //CTR block 5
1236
1237	fmov	v1.d[1], x9                               //CTR block 5
1238	rev	w9, w12                                 //CTR block 6
1239	add	w12, w12, #1                            //CTR block 6
1240
1241	orr	x9, x11, x9, lsl #32            //CTR block 6
1242
1243	eor	x20, x20, x14                   //AES block 1 - round 10 high
1244#ifdef __AARCH64EB__
1245	rev	x20, x20
1246#endif
1247	eor	x6, x6, x13                   //AES block 0 - round 10 low
1248#ifdef __AARCH64EB__
1249	rev	x6, x6
1250#endif
1251	eor	v2.16b, v6.16b, v2.16b                            //AES block 2 - result
1252
1253	eor	x7, x7, x14                   //AES block 0 - round 10 high
1254#ifdef __AARCH64EB__
1255	rev	x7, x7
1256#endif
1257	stp	x6, x7, [x2], #16        //AES block 0 - store result
1258
1259	stp	x19, x20, [x2], #16        //AES block 1 - store result
1260	b.ge	.L128_dec_prepretail                              //do prepretail
1261
1262.L128_dec_main_loop:	//main	loop start
// One iteration of the interleaved decrypt/GHASH loop. On entry v4-v7 hold
// four ciphertext blocks (v4/v5 already byte-reversed for GHASH), v2/v3 hold
// AES state for blocks 4k+2/4k+3, and v0/v1 hold fresh counter blocks.
// Each iteration:
//   - finishes blocks 4k+2 and 4k+3 (XOR with ciphertext, move to x21-x24,
//     XOR with rk10 in x13/x14, store through x2);
//   - folds v4-v7 into the tag v11 using v15, v14, v13, v12 and the combined
//     values in v17/v16, then reduces with the 0xc2<<56 constant in v8;
//   - runs AES rounds 0-9 on counter blocks 4k+4..4k+7 in v0-v3;
//   - loads the next four ciphertext blocks into v4-v7, and finishes and
//     stores blocks 4k+4 and 4k+5 via x6/x7 and x19/x20.
// The loop repeats while x0 < x5 (signed compare, cmp/b.lt at the bottom).
1263	eor	v3.16b, v7.16b, v3.16b                            //AES block 4k+3 - result
1264	ext	v11.16b, v11.16b, v11.16b, #8                     //PRE 0
1265	mov	x21, v2.d[0]                            //AES block 4k+2 - mov low
1266
1267	pmull2	v28.1q, v5.2d, v14.2d                          //GHASH block 4k+1 - high
1268	mov	x22, v2.d[1]                            //AES block 4k+2 - mov high
1269
1270	aese	v1.16b, v18.16b
1271	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 0
1272	fmov	d2, x10                               //CTR block 4k+6
1273
1274	rev64	v6.16b, v6.16b                                    //GHASH block 4k+2
1275	fmov	v2.d[1], x9                               //CTR block 4k+6
1276	rev	w9, w12                                 //CTR block 4k+7
1277
1278	mov	x23, v3.d[0]                            //AES block 4k+3 - mov low
1279	eor	v4.16b, v4.16b, v11.16b                           //PRE 1
1280	mov	d30, v5.d[1]                                  //GHASH block 4k+1 - mid
1281
1282	aese	v1.16b, v19.16b
1283	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 1
1284	rev64	v7.16b, v7.16b                                    //GHASH block 4k+3
1285
1286	pmull	v29.1q, v5.1d, v14.1d                          //GHASH block 4k+1 - low
1287	mov	x24, v3.d[1]                            //AES block 4k+3 - mov high
1288	orr	x9, x11, x9, lsl #32            //CTR block 4k+7
1289
1290	pmull	v11.1q, v4.1d, v15.1d                       //GHASH block 4k - low
1291	fmov	d3, x10                               //CTR block 4k+7
1292	eor	v30.8b, v30.8b, v5.8b                          //GHASH block 4k+1 - mid
1293
1294	aese	v1.16b, v20.16b
1295	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 2
1296	fmov	v3.d[1], x9                               //CTR block 4k+7
1297
1298	aese	v2.16b, v18.16b
1299	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 0
1300	mov	d10, v17.d[1]                               //GHASH block 4k - mid
1301
1302	pmull2	v9.1q, v4.2d, v15.2d                       //GHASH block 4k - high
1303	eor	v11.16b, v11.16b, v29.16b                         //GHASH block 4k+1 - low
1304
1305	pmull	v29.1q, v7.1d, v12.1d                          //GHASH block 4k+3 - low
1306
1307	aese	v1.16b, v21.16b
1308	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 3
1309	mov	d8, v4.d[1]                                  //GHASH block 4k - mid
1310
1311	aese	v3.16b, v18.16b
1312	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 0
1313	eor	v9.16b, v9.16b, v28.16b                         //GHASH block 4k+1 - high
1314
1315	aese	v0.16b, v18.16b
1316	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 0
1317
1318	pmull	v28.1q, v6.1d, v13.1d                          //GHASH block 4k+2 - low
1319	eor	v8.8b, v8.8b, v4.8b                          //GHASH block 4k - mid
1320
1321	aese	v3.16b, v19.16b
1322	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 1
1323	eor	x23, x23, x13                   //AES block 4k+3 - round 10 low
1324#ifdef __AARCH64EB__
1325	rev	x23, x23
1326#endif
1327	pmull	v30.1q, v30.1d, v17.1d                          //GHASH block 4k+1 - mid
1328	eor	x22, x22, x14                   //AES block 4k+2 - round 10 high
1329#ifdef __AARCH64EB__
1330	rev	x22, x22
1331#endif
1332	mov	d31, v6.d[1]                                  //GHASH block 4k+2 - mid
1333
1334	aese	v0.16b, v19.16b
1335	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 1
1336	eor	v11.16b, v11.16b, v28.16b                         //GHASH block 4k+2 - low
1337
1338	pmull	v10.1q, v8.1d, v10.1d                      //GHASH block 4k - mid
1339
1340	aese	v3.16b, v20.16b
1341	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 2
1342	eor	v31.8b, v31.8b, v6.8b                          //GHASH block 4k+2 - mid
1343
1344	aese	v0.16b, v20.16b
1345	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 2
1346
1347	aese	v1.16b, v22.16b
1348	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 4
1349	eor	v10.16b, v10.16b, v30.16b                         //GHASH block 4k+1 - mid
1350
1351	pmull2	v8.1q, v6.2d, v13.2d                          //GHASH block 4k+2 - high
1352
1353	aese	v0.16b, v21.16b
1354	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 3
1355	ins	v31.d[1], v31.d[0]                                //GHASH block 4k+2 - mid
1356
1357	pmull2	v4.1q, v7.2d, v12.2d                          //GHASH block 4k+3 - high
1358
1359	aese	v2.16b, v19.16b
1360	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 1
1361	mov	d30, v7.d[1]                                  //GHASH block 4k+3 - mid
1362
1363	aese	v0.16b, v22.16b
1364	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 4
1365	eor	v9.16b, v9.16b, v8.16b                         //GHASH block 4k+2 - high
1366
1367	pmull2	v31.1q, v31.2d, v16.2d                          //GHASH block 4k+2 - mid
1368	eor	x24, x24, x14                   //AES block 4k+3 - round 10 high
1369#ifdef __AARCH64EB__
1370	rev	x24, x24
1371#endif
1372	aese	v2.16b, v20.16b
1373	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 2
1374	eor	v30.8b, v30.8b, v7.8b                          //GHASH block 4k+3 - mid
1375
1376	aese	v1.16b, v23.16b
1377	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 5
1378	eor	x21, x21, x13                   //AES block 4k+2 - round 10 low
1379#ifdef __AARCH64EB__
1380	rev	x21, x21
1381#endif
1382	aese	v0.16b, v23.16b
1383	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 5
1384	movi	v8.8b, #0xc2
1385
1386	aese	v2.16b, v21.16b
1387	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 3
1388	eor	v11.16b, v11.16b, v29.16b                         //GHASH block 4k+3 - low
1389
1390	aese	v1.16b, v24.16b
1391	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 6
1392
1393	aese	v0.16b, v24.16b
1394	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 6
1395	eor	v10.16b, v10.16b, v31.16b                         //GHASH block 4k+2 - mid
1396
1397	aese	v2.16b, v22.16b
1398	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 4
1399	stp	x21, x22, [x2], #16        //AES block 4k+2 - store result
1400
1401	pmull	v30.1q, v30.1d, v16.1d                          //GHASH block 4k+3 - mid
1402	eor	v9.16b, v9.16b, v4.16b                         //GHASH block 4k+3 - high
1403	ld1	{v4.16b}, [x0], #16                       //AES block 4k+4 - load ciphertext
1404
1405	aese	v1.16b, v25.16b
1406	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 7
1407	add	w12, w12, #1                            //CTR block 4k+7
1408
1409	aese	v0.16b, v25.16b
1410	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 7
1411	shl	d8, d8, #56               //mod_constant
1412
1413	aese	v2.16b, v23.16b
1414	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 5
1415	eor	v10.16b, v10.16b, v30.16b                         //GHASH block 4k+3 - mid
1416
1417	aese	v1.16b, v26.16b
1418	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 8
1419	stp	x23, x24, [x2], #16        //AES block 4k+3 - store result
1420
1421	aese	v0.16b, v26.16b
1422	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 8
1423	eor	v30.16b, v11.16b, v9.16b                         //MODULO - karatsuba tidy up
1424
1425	aese	v3.16b, v21.16b
1426	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 3
1427	rev	w9, w12                                 //CTR block 4k+8
1428
1429	pmull	v31.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid
1430	ld1	{v5.16b}, [x0], #16                       //AES block 4k+5 - load ciphertext
1431	ext	v9.16b, v9.16b, v9.16b, #8                     //MODULO - other top alignment
1432
1433	aese	v0.16b, v27.16b                                      //AES block 4k+4 - round 9
1434	orr	x9, x11, x9, lsl #32            //CTR block 4k+8
1435
1436	aese	v3.16b, v22.16b
1437	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 4
1438	eor	v10.16b, v10.16b, v30.16b                         //MODULO - karatsuba tidy up
1439
1440	aese	v1.16b, v27.16b                                      //AES block 4k+5 - round 9
1441
1442	aese	v2.16b, v24.16b
1443	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 6
1444	eor	v0.16b, v4.16b, v0.16b                            //AES block 4k+4 - result
1445
1446	aese	v3.16b, v23.16b
1447	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 5
1448	ld1	{v6.16b}, [x0], #16                       //AES block 4k+6 - load ciphertext
1449
1450	add	w12, w12, #1                            //CTR block 4k+8
1451	eor	v10.16b, v10.16b, v31.16b                      //MODULO - fold into mid
1452	eor	v1.16b, v5.16b, v1.16b                            //AES block 4k+5 - result
1453
1454	aese	v2.16b, v25.16b
1455	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 7
1456	ld1	{v7.16b}, [x0], #16                       //AES block 4k+7 - load ciphertext
1457
1458	aese	v3.16b, v24.16b
1459	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 6
1460
1461	rev64	v5.16b, v5.16b                                    //GHASH block 4k+5
1462	eor	v10.16b, v10.16b, v9.16b                         //MODULO - fold into mid
1463	mov	x7, v0.d[1]                            //AES block 4k+4 - mov high
1464
1465	aese	v2.16b, v26.16b
1466	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 8
1467	mov	x6, v0.d[0]                            //AES block 4k+4 - mov low
1468
1469	aese	v3.16b, v25.16b
1470	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 7
1471	fmov	d0, x10                               //CTR block 4k+8
1472
1473	pmull	v8.1q, v10.1d, v8.1d     //MODULO - mid 64b align with low
1474	fmov	v0.d[1], x9                               //CTR block 4k+8
1475	rev	w9, w12                                 //CTR block 4k+9
1476
1477	aese	v2.16b, v27.16b                                      //AES block 4k+6 - round 9
1478	orr	x9, x11, x9, lsl #32            //CTR block 4k+9
1479	ext	v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment
1480
1481	aese	v3.16b, v26.16b
1482	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 8
1483	eor	x7, x7, x14                   //AES block 4k+4 - round 10 high
1484#ifdef __AARCH64EB__
1485	rev	x7, x7
1486#endif
1487	eor	v11.16b, v11.16b, v8.16b               //MODULO - fold into low
1488	mov	x20, v1.d[1]                            //AES block 4k+5 - mov high
1489	eor	x6, x6, x13                   //AES block 4k+4 - round 10 low
1490#ifdef __AARCH64EB__
1491	rev	x6, x6
1492#endif
1493	eor	v2.16b, v6.16b, v2.16b                            //AES block 4k+6 - result
1494	mov	x19, v1.d[0]                            //AES block 4k+5 - mov low
1495	add	w12, w12, #1                            //CTR block 4k+9
1496
1497	aese	v3.16b, v27.16b                                      //AES block 4k+7 - round 9
1498	fmov	d1, x10                               //CTR block 4k+9
1499	cmp	x0, x5                   //.LOOP CONTROL
1500
1501	rev64	v4.16b, v4.16b                                    //GHASH block 4k+4
1502	eor	v11.16b, v11.16b, v10.16b                         //MODULO - fold into low
1503	fmov	v1.d[1], x9                               //CTR block 4k+9
1504
1505	rev	w9, w12                                 //CTR block 4k+10
1506	add	w12, w12, #1                            //CTR block 4k+10
1507
1508	eor	x20, x20, x14                   //AES block 4k+5 - round 10 high
1509#ifdef __AARCH64EB__
1510	rev	x20, x20
1511#endif
1512	stp	x6, x7, [x2], #16        //AES block 4k+4 - store result
1513
1514	eor	x19, x19, x13                   //AES block 4k+5 - round 10 low
1515#ifdef __AARCH64EB__
1516	rev	x19, x19
1517#endif
1518	stp	x19, x20, [x2], #16        //AES block 4k+5 - store result
1519
1520	orr	x9, x11, x9, lsl #32            //CTR block 4k+10
1521	b.lt	.L128_dec_main_loop
1522
1523.L128_dec_prepretail:	//PREPRETAIL
1524	ext	v11.16b, v11.16b, v11.16b, #8                     //PRE 0
1525	mov	x21, v2.d[0]                            //AES block 4k+2 - mov low
1526	mov	d30, v5.d[1]                                  //GHASH block 4k+1 - mid
1527
1528	aese	v0.16b, v18.16b
1529	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 0
1530	eor	v3.16b, v7.16b, v3.16b                            //AES block 4k+3 - result
1531
1532	aese	v1.16b, v18.16b
1533	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 0
1534	mov	x22, v2.d[1]                            //AES block 4k+2 - mov high
1535
1536	eor	v4.16b, v4.16b, v11.16b                           //PRE 1
1537	fmov	d2, x10                               //CTR block 4k+6
1538	rev64	v6.16b, v6.16b                                    //GHASH block 4k+2
1539
1540	aese	v0.16b, v19.16b
1541	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 1
1542	fmov	v2.d[1], x9                               //CTR block 4k+6
1543
1544	rev	w9, w12                                 //CTR block 4k+7
1545	mov	x23, v3.d[0]                            //AES block 4k+3 - mov low
1546	eor	v30.8b, v30.8b, v5.8b                          //GHASH block 4k+1 - mid
1547
1548	pmull	v11.1q, v4.1d, v15.1d                       //GHASH block 4k - low
1549	mov	d10, v17.d[1]                               //GHASH block 4k - mid
1550	mov	x24, v3.d[1]                            //AES block 4k+3 - mov high
1551
1552	aese	v1.16b, v19.16b
1553	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 1
1554	mov	d31, v6.d[1]                                  //GHASH block 4k+2 - mid
1555
1556	aese	v0.16b, v20.16b
1557	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 2
1558	orr	x9, x11, x9, lsl #32            //CTR block 4k+7
1559
1560	pmull	v29.1q, v5.1d, v14.1d                          //GHASH block 4k+1 - low
1561	mov	d8, v4.d[1]                                  //GHASH block 4k - mid
1562	fmov	d3, x10                               //CTR block 4k+7
1563
1564	aese	v2.16b, v18.16b
1565	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 0
1566	fmov	v3.d[1], x9                               //CTR block 4k+7
1567
1568	pmull	v30.1q, v30.1d, v17.1d                          //GHASH block 4k+1 - mid
1569	eor	v31.8b, v31.8b, v6.8b                          //GHASH block 4k+2 - mid
1570
1571	rev64	v7.16b, v7.16b                                    //GHASH block 4k+3
1572
1573	aese	v2.16b, v19.16b
1574	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 1
1575	eor	v8.8b, v8.8b, v4.8b                          //GHASH block 4k - mid
1576
1577	pmull2	v9.1q, v4.2d, v15.2d                       //GHASH block 4k - high
1578
1579	aese	v3.16b, v18.16b
1580	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 0
1581	ins	v31.d[1], v31.d[0]                                //GHASH block 4k+2 - mid
1582
1583	pmull2	v28.1q, v5.2d, v14.2d                          //GHASH block 4k+1 - high
1584
1585	pmull	v10.1q, v8.1d, v10.1d                      //GHASH block 4k - mid
1586	eor	v11.16b, v11.16b, v29.16b                         //GHASH block 4k+1 - low
1587
1588	pmull	v29.1q, v7.1d, v12.1d                          //GHASH block 4k+3 - low
1589
1590	pmull2	v31.1q, v31.2d, v16.2d                          //GHASH block 4k+2 - mid
1591	eor	v9.16b, v9.16b, v28.16b                         //GHASH block 4k+1 - high
1592
1593	eor	v10.16b, v10.16b, v30.16b                         //GHASH block 4k+1 - mid
1594
1595	pmull2	v4.1q, v7.2d, v12.2d                          //GHASH block 4k+3 - high
1596
1597	pmull2	v8.1q, v6.2d, v13.2d                          //GHASH block 4k+2 - high
1598	mov	d30, v7.d[1]                                  //GHASH block 4k+3 - mid
1599
1600	aese	v1.16b, v20.16b
1601	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 2
1602	eor	v10.16b, v10.16b, v31.16b                         //GHASH block 4k+2 - mid
1603
1604	pmull	v28.1q, v6.1d, v13.1d                          //GHASH block 4k+2 - low
1605
1606	eor	v9.16b, v9.16b, v8.16b                         //GHASH block 4k+2 - high
1607	movi	v8.8b, #0xc2
1608
1609	aese	v3.16b, v19.16b
1610	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 1
1611	eor	v30.8b, v30.8b, v7.8b                          //GHASH block 4k+3 - mid
1612
1613	eor	v11.16b, v11.16b, v28.16b                         //GHASH block 4k+2 - low
1614
1615	aese	v2.16b, v20.16b
1616	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 2
1617	eor	v9.16b, v9.16b, v4.16b                         //GHASH block 4k+3 - high
1618
1619	aese	v3.16b, v20.16b
1620	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 2
1621	eor	x23, x23, x13                   //AES block 4k+3 - round 10 low
1622#ifdef __AARCH64EB__
1623	rev	x23, x23
1624#endif
1625	pmull	v30.1q, v30.1d, v16.1d                          //GHASH block 4k+3 - mid
1626	eor	x21, x21, x13                   //AES block 4k+2 - round 10 low
1627#ifdef __AARCH64EB__
1628	rev	x21, x21
1629#endif
1630	eor	v11.16b, v11.16b, v29.16b                         //GHASH block 4k+3 - low
1631
1632	aese	v2.16b, v21.16b
1633	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 3
1634
1635	aese	v1.16b, v21.16b
1636	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 3
1637	shl	d8, d8, #56               //mod_constant
1638
1639	aese	v0.16b, v21.16b
1640	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 3
1641
1642	aese	v2.16b, v22.16b
1643	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 4
1644	eor	v10.16b, v10.16b, v30.16b                         //GHASH block 4k+3 - mid
1645
1646	aese	v1.16b, v22.16b
1647	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 4
1648
1649	aese	v3.16b, v21.16b
1650	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 3
1651	eor	v30.16b, v11.16b, v9.16b                         //MODULO - karatsuba tidy up
1652
1653	aese	v2.16b, v23.16b
1654	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 5
1655
1656	aese	v1.16b, v23.16b
1657	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 5
1658
1659	aese	v3.16b, v22.16b
1660	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 4
1661
1662	aese	v0.16b, v22.16b
1663	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 4
1664	eor	v10.16b, v10.16b, v30.16b                         //MODULO - karatsuba tidy up
1665
1666	pmull	v31.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid
1667
1668	aese	v1.16b, v24.16b
1669	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 6
1670	ext	v9.16b, v9.16b, v9.16b, #8                     //MODULO - other top alignment
1671
1672	aese	v3.16b, v23.16b
1673	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 5
1674
1675	aese	v0.16b, v23.16b
1676	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 5
1677	eor	v10.16b, v10.16b, v31.16b                      //MODULO - fold into mid
1678
1679	aese	v1.16b, v25.16b
1680	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 7
1681
1682	aese	v2.16b, v24.16b
1683	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 6
1684
1685	aese	v0.16b, v24.16b
1686	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 6
1687
1688	aese	v1.16b, v26.16b
1689	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 8
1690	eor	v10.16b, v10.16b, v9.16b                         //MODULO - fold into mid
1691
1692	aese	v3.16b, v24.16b
1693	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 6
1694
1695	aese	v0.16b, v25.16b
1696	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 7
1697
1698	aese	v1.16b, v27.16b                                      //AES block 4k+5 - round 9
1699
1700	pmull	v8.1q, v10.1d, v8.1d     //MODULO - mid 64b align with low
1701	eor	x24, x24, x14                   //AES block 4k+3 - round 10 high
1702#ifdef __AARCH64EB__
1703	rev	x24, x24
1704#endif
1705	aese	v2.16b, v25.16b
1706	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 7
1707	ext	v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment
1708
1709	aese	v3.16b, v25.16b
1710	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 7
1711
1712	aese	v0.16b, v26.16b
1713	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 8
1714	eor	v11.16b, v11.16b, v8.16b               //MODULO - fold into low
1715
1716	aese	v2.16b, v26.16b
1717	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 8
1718
1719	aese	v3.16b, v26.16b
1720	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 8
1721	eor	x22, x22, x14                   //AES block 4k+2 - round 10 high
1722#ifdef __AARCH64EB__
1723	rev	x22, x22
1724#endif
1725	aese	v0.16b, v27.16b                                      //AES block 4k+4 - round 9
1726	stp	x21, x22, [x2], #16        //AES block 4k+2 - store result
1727
1728	aese	v2.16b, v27.16b                                      //AES block 4k+6 - round 9
1729	add	w12, w12, #1                            //CTR block 4k+7
1730	stp	x23, x24, [x2], #16        //AES block 4k+3 - store result
1731
1732	aese	v3.16b, v27.16b                                      //AES block 4k+7 - round 9
1733	eor	v11.16b, v11.16b, v10.16b                         //MODULO - fold into low
1734.L128_dec_tail:	//TAIL
1735
1736	sub	x5, x4, x0   //main_end_input_ptr is number of bytes left to process
1737	ld1	{ v5.16b}, [x0], #16                      //AES block 4k+4 - load ciphertext
1738
1739	eor	v0.16b, v5.16b, v0.16b                            //AES block 4k+4 - result
1740
1741	mov	x7, v0.d[1]                            //AES block 4k+4 - mov high
1742
1743	mov	x6, v0.d[0]                            //AES block 4k+4 - mov low
1744
1745	cmp	x5, #48
1746
1747	eor	x7, x7, x14                   //AES block 4k+4 - round 10 high
1748#ifdef __AARCH64EB__
1749	rev	x7, x7
1750#endif
1751	ext	v8.16b, v11.16b, v11.16b, #8                     //prepare final partial tag
1752	eor	x6, x6, x13                   //AES block 4k+4 - round 10 low
1753#ifdef __AARCH64EB__
1754	rev	x6, x6
1755#endif
1756	b.gt	.L128_dec_blocks_more_than_3
1757
1758	mov	v3.16b, v2.16b
1759	sub	w12, w12, #1
1760	movi	v11.8b, #0
1761
1762	movi	v9.8b, #0
1763	mov	v2.16b, v1.16b
1764
1765	movi	v10.8b, #0
1766	cmp	x5, #32
1767	b.gt	.L128_dec_blocks_more_than_2
1768
1769	cmp	x5, #16
1770
1771	mov	v3.16b, v1.16b
1772	sub	w12, w12, #1
1773	b.gt	.L128_dec_blocks_more_than_1
1774
1775	sub	w12, w12, #1
1776	b	.L128_dec_blocks_less_than_1
1777.L128_dec_blocks_more_than_3:	//blocks	left >  3
1778	rev64	v4.16b, v5.16b                                    //GHASH final-3 block
1779	ld1	{ v5.16b}, [x0], #16                      //AES final-2 block - load ciphertext
1780
1781	eor	v4.16b, v4.16b, v8.16b                           //feed in partial tag
1782
1783	mov	d10, v17.d[1]                               //GHASH final-3 block - mid
1784	stp	x6, x7, [x2], #16        //AES final-3 block  - store result
1785	eor	v0.16b, v5.16b, v1.16b                            //AES final-2 block - result
1786
1787	mov	d22, v4.d[1]                                 //GHASH final-3 block - mid
1788	mov	x7, v0.d[1]                            //AES final-2 block - mov high
1789
1790	pmull	v11.1q, v4.1d, v15.1d                       //GHASH final-3 block - low
1791	mov	x6, v0.d[0]                            //AES final-2 block - mov low
1792
1793	pmull2	v9.1q, v4.2d, v15.2d                       //GHASH final-3 block - high
1794
1795	eor	v22.8b, v22.8b, v4.8b                      //GHASH final-3 block - mid
1796
1797	movi	v8.8b, #0                                        //suppress further partial tag feed in
1798	eor	x7, x7, x14                   //AES final-2 block - round 10 high
1799#ifdef __AARCH64EB__
1800	rev	x7, x7
1801#endif
1802	pmull	v10.1q, v22.1d, v10.1d                    //GHASH final-3 block - mid
1803	eor	x6, x6, x13                   //AES final-2 block - round 10 low
1804#ifdef __AARCH64EB__
1805	rev	x6, x6
1806#endif
1807.L128_dec_blocks_more_than_2:	//blocks	left >  2
1808
1809	rev64	v4.16b, v5.16b                                    //GHASH final-2 block
1810	ld1	{ v5.16b}, [x0], #16                      //AES final-1 block - load ciphertext
1811
1812	eor	v4.16b, v4.16b, v8.16b                           //feed in partial tag
1813
1814	eor	v0.16b, v5.16b, v2.16b                            //AES final-1 block - result
1815	stp	x6, x7, [x2], #16        //AES final-2 block  - store result
1816
1817	mov	d22, v4.d[1]                                 //GHASH final-2 block - mid
1818
1819	pmull	v21.1q, v4.1d, v14.1d                          //GHASH final-2 block - low
1820
1821	pmull2	v20.1q, v4.2d, v14.2d                          //GHASH final-2 block - high
1822	mov	x6, v0.d[0]                            //AES final-1 block - mov low
1823
1824	mov	x7, v0.d[1]                            //AES final-1 block - mov high
1825	eor	v22.8b, v22.8b, v4.8b                      //GHASH final-2 block - mid
1826
1827	movi	v8.8b, #0                                        //suppress further partial tag feed in
1828
1829	pmull	v22.1q, v22.1d, v17.1d                      //GHASH final-2 block - mid
1830
1831	eor	x6, x6, x13                   //AES final-1 block - round 10 low
1832#ifdef __AARCH64EB__
1833	rev	x6, x6
1834#endif
1835	eor	v11.16b, v11.16b, v21.16b                            //GHASH final-2 block - low
1836
1837	eor	v9.16b, v9.16b, v20.16b                            //GHASH final-2 block - high
1838
1839	eor	v10.16b, v10.16b, v22.16b                       //GHASH final-2 block - mid
1840	eor	x7, x7, x14                   //AES final-1 block - round 10 high
1841#ifdef __AARCH64EB__
1842	rev	x7, x7
1843#endif
1844.L128_dec_blocks_more_than_1:	//blocks	left >  1
1845
1846	rev64	v4.16b, v5.16b                                    //GHASH final-1 block
1847
1848	ld1	{ v5.16b}, [x0], #16                      //AES final block - load ciphertext
1849	eor	v4.16b, v4.16b, v8.16b                           //feed in partial tag
1850
1851	mov	d22, v4.d[1]                                 //GHASH final-1 block - mid
1852
1853	eor	v0.16b, v5.16b, v3.16b                            //AES final block - result
1854
1855	eor	v22.8b, v22.8b, v4.8b                      //GHASH final-1 block - mid
1856
1857	stp	x6, x7, [x2], #16        //AES final-1 block  - store result
1858	mov	x6, v0.d[0]                            //AES final block - mov low
1859
1860	mov	x7, v0.d[1]                            //AES final block - mov high
1861	ins	v22.d[1], v22.d[0]                            //GHASH final-1 block - mid
1862
1863	pmull	v21.1q, v4.1d, v13.1d                          //GHASH final-1 block - low
1864
1865	pmull2	v20.1q, v4.2d, v13.2d                          //GHASH final-1 block - high
1866
1867	pmull2	v22.1q, v22.2d, v16.2d                      //GHASH final-1 block - mid
1868	movi	v8.8b, #0                                        //suppress further partial tag feed in
1869
1870	eor	v11.16b, v11.16b, v21.16b                            //GHASH final-1 block - low
1871
1872	eor	v9.16b, v9.16b, v20.16b                            //GHASH final-1 block - high
1873	eor	x7, x7, x14                   //AES final block - round 10 high
1874#ifdef __AARCH64EB__
1875	rev	x7, x7
1876#endif
1877	eor	x6, x6, x13                   //AES final block - round 10 low
1878#ifdef __AARCH64EB__
1879	rev	x6, x6
1880#endif
1881	eor	v10.16b, v10.16b, v22.16b                       //GHASH final-1 block - mid
1882.L128_dec_blocks_less_than_1:	//blocks	left <= 1
1883
1884	mvn	x14, xzr                                      //rk10_h = 0xffffffffffffffff
1885	and	x1, x1, #127                    //bit_length %= 128
1886
1887	mvn	x13, xzr                                      //rk10_l = 0xffffffffffffffff
1888	sub	x1, x1, #128                    //bit_length -= 128
1889
1890	neg	x1, x1                          //bit_length = 128 - #bits in input (in range [1,128])
1891
1892	and	x1, x1, #127                    //bit_length %= 128
1893
1894	lsr	x14, x14, x1                     //rk10_h is mask for top 64b of last block
1895	cmp	x1, #64
1896
1897	csel	x10, x14, xzr, lt
1898	csel	x9, x13, x14, lt
1899
1900	fmov	d0, x9                                   //ctr0b is mask for last block
1901
1902	mov	v0.d[1], x10
1903
1904	and	v5.16b, v5.16b, v0.16b                            //possibly partial last block has zeroes in highest bits
1905
1906	rev64	v4.16b, v5.16b                                    //GHASH final block
1907
1908	eor	v4.16b, v4.16b, v8.16b                           //feed in partial tag
1909
1910	ldp	x4, x5, [x2] //load existing bytes we need to not overwrite
1911
1912	and	x7, x7, x10
1913
1914	pmull2	v20.1q, v4.2d, v12.2d                          //GHASH final block - high
1915	mov	d8, v4.d[1]                                  //GHASH final block - mid
1916
1917	eor	v8.8b, v8.8b, v4.8b                          //GHASH final block - mid
1918	eor	v9.16b, v9.16b, v20.16b                            //GHASH final block - high
1919
1920	pmull	v8.1q, v8.1d, v16.1d                          //GHASH final block - mid
1921
1922	pmull	v21.1q, v4.1d, v12.1d                          //GHASH final block - low
1923	bic	x4, x4, x9           //mask out low existing bytes
1924	and	x6, x6, x9
1925
1926#ifndef __AARCH64EB__
1927	rev	w9, w12
1928#else
1929	mov	w9, w12
1930#endif
1931
1932	eor	v10.16b, v10.16b, v8.16b                         //GHASH final block - mid
1933	movi	v8.8b, #0xc2
1934
1935	eor	v11.16b, v11.16b, v21.16b                            //GHASH final block - low
1936
1937	bic	x5, x5, x10   //mask out high existing bytes
1938	shl	d8, d8, #56               //mod_constant
1939
1940	eor	v30.16b, v11.16b, v9.16b                         //MODULO - karatsuba tidy up
1941
1942	pmull	v31.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid
1943
1944	eor	v10.16b, v10.16b, v30.16b                         //MODULO - karatsuba tidy up
1945
1946	orr	x6, x6, x4
1947	str	w9, [x16, #12]                          //store the updated counter
1948
1949	orr	x7, x7, x5
1950	stp	x6, x7, [x2]
1951	ext	v9.16b, v9.16b, v9.16b, #8                     //MODULO - other top alignment
1952
1953	eor	v10.16b, v10.16b, v31.16b                      //MODULO - fold into mid
1954
1955	eor	v10.16b, v10.16b, v9.16b                         //MODULO - fold into mid
1956
1957	pmull	v8.1q, v10.1d, v8.1d     //MODULO - mid 64b align with low
1958	ext	v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment
1959
1960	eor	v11.16b, v11.16b, v8.16b               //MODULO - fold into low
1961
1962	eor	v11.16b, v11.16b, v10.16b                         //MODULO - fold into low
1963	ext	v11.16b, v11.16b, v11.16b, #8
1964	rev64	v11.16b, v11.16b
1965	mov	x0, x15
1966	st1	{ v11.16b }, [x3]
1967
1968	ldp	x21, x22, [sp, #16]
1969	ldp	x23, x24, [sp, #32]
1970	ldp	d8, d9, [sp, #48]
1971	ldp	d10, d11, [sp, #64]
1972	ldp	d12, d13, [sp, #80]
1973	ldp	d14, d15, [sp, #96]
1974	ldp	x19, x20, [sp], #112
1975	ret
1976
1977.L128_dec_ret:
1978	mov	w0, #0x0
1979	ret
1980.size	aes_gcm_dec_128_kernel,.-aes_gcm_dec_128_kernel
1981.globl	aes_gcm_enc_192_kernel
1982.type	aes_gcm_enc_192_kernel,%function
1983.align	4
1984aes_gcm_enc_192_kernel:
1985	cbz	x1, .L192_enc_ret
1986	stp	x19, x20, [sp, #-112]!
1987	mov	x16, x4
1988	mov	x8, x5
1989	stp	x21, x22, [sp, #16]
1990	stp	x23, x24, [sp, #32]
1991	stp	d8, d9, [sp, #48]
1992	stp	d10, d11, [sp, #64]
1993	stp	d12, d13, [sp, #80]
1994	stp	d14, d15, [sp, #96]
1995
1996	ldp	x10, x11, [x16]             //ctr96_b64, ctr96_t32
1997#ifdef __AARCH64EB__
1998	rev	x10, x10
1999	rev	x11, x11
2000#endif
2001	ldp	x13, x14, [x8, #192]                     //load rk12
2002#ifdef __AARCH64EB__
2003	ror	x13, x13, #32
2004	ror	x14, x14, #32
2005#endif
2006	ld1	{v18.4s}, [x8], #16	                             //load rk0
2007
2008	ld1	{v19.4s}, [x8], #16	                             //load rk1
2009
2010	ld1	{v20.4s}, [x8], #16	                             //load rk2
2011
2012	lsr	x12, x11, #32
2013	ld1	{v21.4s}, [x8], #16	                             //load rk3
2014	orr	w11, w11, w11
2015
2016	ld1	{v22.4s}, [x8], #16	                             //load rk4
2017	rev	w12, w12                               //rev_ctr32
2018
2019	add	w12, w12, #1                           //increment rev_ctr32
2020	fmov	d3, x10                              //CTR block 3
2021
2022	rev	w9, w12                                //CTR block 1
2023	add	w12, w12, #1                           //CTR block 1
2024	fmov	d1, x10                              //CTR block 1
2025
2026	orr	x9, x11, x9, lsl #32           //CTR block 1
2027	ld1	{ v0.16b}, [x16]                            //special case vector load initial counter so we can start first AES block as quickly as possible
2028
2029	fmov	v1.d[1], x9                              //CTR block 1
2030	rev	w9, w12                                //CTR block 2
2031	add	w12, w12, #1                           //CTR block 2
2032
2033	fmov	d2, x10                              //CTR block 2
2034	orr	x9, x11, x9, lsl #32           //CTR block 2
2035
2036	fmov	v2.d[1], x9                              //CTR block 2
2037	rev	w9, w12                                //CTR block 3
2038
2039	orr	x9, x11, x9, lsl #32           //CTR block 3
2040	ld1	{v23.4s}, [x8], #16	                             //load rk5
2041
2042	fmov	v3.d[1], x9                              //CTR block 3
2043
2044	ld1	{v24.4s}, [x8], #16	                             //load rk6
2045
2046	ld1	{v25.4s}, [x8], #16	                             //load rk7
2047
2048	aese	v0.16b, v18.16b
2049	aesmc	v0.16b, v0.16b         //AES block 0 - round 0
2050	ld1	{ v11.16b}, [x3]
2051	ext	v11.16b, v11.16b, v11.16b, #8
2052	rev64	v11.16b, v11.16b
2053
2054	aese	v3.16b, v18.16b
2055	aesmc	v3.16b, v3.16b         //AES block 3 - round 0
2056	ld1	{v26.4s}, [x8], #16	                             //load rk8
2057
2058	aese	v1.16b, v18.16b
2059	aesmc	v1.16b, v1.16b         //AES block 1 - round 0
2060	ldr	q15, [x3, #112]                       //load h4l | h4h
2061#ifndef __AARCH64EB__
2062	ext	v15.16b, v15.16b, v15.16b, #8
2063#endif
2064	aese	v2.16b, v18.16b
2065	aesmc	v2.16b, v2.16b         //AES block 2 - round 0
2066	ld1	{v27.4s}, [x8], #16	                             //load rk9
2067
2068	aese	v0.16b, v19.16b
2069	aesmc	v0.16b, v0.16b         //AES block 0 - round 1
2070	ld1	{v28.4s}, [x8], #16	                         //load rk10
2071
2072	aese	v1.16b, v19.16b
2073	aesmc	v1.16b, v1.16b         //AES block 1 - round 1
2074	ldr	q12, [x3, #32]                        //load h1l | h1h
2075#ifndef __AARCH64EB__
2076	ext	v12.16b, v12.16b, v12.16b, #8
2077#endif
2078	aese	v2.16b, v19.16b
2079	aesmc	v2.16b, v2.16b         //AES block 2 - round 1
2080	ld1	{v29.4s}, [x8], #16	                         //load rk11
2081
2082	aese	v3.16b, v19.16b
2083	aesmc	v3.16b, v3.16b         //AES block 3 - round 1
2084	ldr	q14, [x3, #80]                        //load h3l | h3h
2085#ifndef __AARCH64EB__
2086	ext	v14.16b, v14.16b, v14.16b, #8
2087#endif
2088	aese	v0.16b, v20.16b
2089	aesmc	v0.16b, v0.16b         //AES block 0 - round 2
2090
2091	aese	v2.16b, v20.16b
2092	aesmc	v2.16b, v2.16b         //AES block 2 - round 2
2093
2094	aese	v3.16b, v20.16b
2095	aesmc	v3.16b, v3.16b         //AES block 3 - round 2
2096
2097	aese	v0.16b, v21.16b
2098	aesmc	v0.16b, v0.16b         //AES block 0 - round 3
2099	trn1	v9.2d, v14.2d,    v15.2d                     //h4h | h3h
2100
2101	aese	v2.16b, v21.16b
2102	aesmc	v2.16b, v2.16b         //AES block 2 - round 3
2103
2104	aese	v1.16b, v20.16b
2105	aesmc	v1.16b, v1.16b         //AES block 1 - round 2
2106	trn2	v17.2d,  v14.2d,    v15.2d                     //h4l | h3l
2107
2108	aese	v0.16b, v22.16b
2109	aesmc	v0.16b, v0.16b         //AES block 0 - round 4
2110
2111	aese	v3.16b, v21.16b
2112	aesmc	v3.16b, v3.16b         //AES block 3 - round 3
2113
2114	aese	v1.16b, v21.16b
2115	aesmc	v1.16b, v1.16b         //AES block 1 - round 3
2116
2117	aese	v0.16b, v23.16b
2118	aesmc	v0.16b, v0.16b         //AES block 0 - round 5
2119
2120	aese	v2.16b, v22.16b
2121	aesmc	v2.16b, v2.16b         //AES block 2 - round 4
2122
2123	aese	v1.16b, v22.16b
2124	aesmc	v1.16b, v1.16b         //AES block 1 - round 4
2125
2126	aese	v0.16b, v24.16b
2127	aesmc	v0.16b, v0.16b         //AES block 0 - round 6
2128
2129	aese	v3.16b, v22.16b
2130	aesmc	v3.16b, v3.16b         //AES block 3 - round 4
2131
2132	aese	v2.16b, v23.16b
2133	aesmc	v2.16b, v2.16b         //AES block 2 - round 5
2134
2135	aese	v1.16b, v23.16b
2136	aesmc	v1.16b, v1.16b         //AES block 1 - round 5
2137
2138	aese	v3.16b, v23.16b
2139	aesmc	v3.16b, v3.16b         //AES block 3 - round 5
2140
2141	aese	v2.16b, v24.16b
2142	aesmc	v2.16b, v2.16b         //AES block 2 - round 6
2143	ldr	q13, [x3, #64]                        //load h2l | h2h
2144#ifndef __AARCH64EB__
2145	ext	v13.16b, v13.16b, v13.16b, #8
2146#endif
2147	aese	v1.16b, v24.16b
2148	aesmc	v1.16b, v1.16b         //AES block 1 - round 6
2149
2150	aese	v3.16b, v24.16b
2151	aesmc	v3.16b, v3.16b         //AES block 3 - round 6
2152
2153	aese	v0.16b, v25.16b
2154	aesmc	v0.16b, v0.16b         //AES block 0 - round 7
2155
2156	aese	v1.16b, v25.16b
2157	aesmc	v1.16b, v1.16b         //AES block 1 - round 7
2158	trn2	v16.2d,  v12.2d,    v13.2d                     //h2l | h1l
2159
2160	aese	v3.16b, v25.16b
2161	aesmc	v3.16b, v3.16b         //AES block 3 - round 7
2162
2163	aese	v0.16b, v26.16b
2164	aesmc	v0.16b, v0.16b         //AES block 0 - round 8
2165
2166	aese	v2.16b, v25.16b
2167	aesmc	v2.16b, v2.16b         //AES block 2 - round 7
2168	trn1	v8.2d,    v12.2d,    v13.2d                     //h2h | h1h
2169
2170	aese	v1.16b, v26.16b
2171	aesmc	v1.16b, v1.16b         //AES block 1 - round 8
2172
2173	aese	v3.16b, v26.16b
2174	aesmc	v3.16b, v3.16b         //AES block 3 - round 8
2175
2176	aese	v2.16b, v26.16b
2177	aesmc	v2.16b, v2.16b         //AES block 2 - round 8
2178
2179	aese	v0.16b, v27.16b
2180	aesmc	v0.16b, v0.16b         //AES block 0 - round 9
2181
2182	aese	v3.16b, v27.16b
2183	aesmc	v3.16b, v3.16b         //AES block 3 - round 9
2184
2185	aese	v2.16b, v27.16b
2186	aesmc	v2.16b, v2.16b         //AES block 2 - round 9
2187
2188	aese	v1.16b, v27.16b
2189	aesmc	v1.16b, v1.16b         //AES block 1 - round 9
2190
2191	aese	v0.16b, v28.16b
2192	aesmc	v0.16b, v0.16b         //AES block 0 - round 10
2193
2194	aese	v2.16b, v28.16b
2195	aesmc	v2.16b, v2.16b         //AES block 2 - round 10
2196
2197	aese	v1.16b, v28.16b
2198	aesmc	v1.16b, v1.16b         //AES block 1 - round 10
2199	lsr	x5, x1, #3             //byte_len
2200	mov	x15, x5
2201
2202	aese	v3.16b, v28.16b
2203	aesmc	v3.16b, v3.16b         //AES block 3 - round 10
2204	sub	x5, x5, #1     //byte_len - 1
2205
2206	eor	v16.16b, v16.16b, v8.16b                    //h2k | h1k
2207	and	x5, x5, #0xffffffffffffffc0   //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
2208
2209	eor	v17.16b, v17.16b, v9.16b                 //h4k | h3k
2210
2211	aese	v2.16b, v29.16b                                    //AES block 2 - round 11
2212	add	x4, x0, x1, lsr #3  //end_input_ptr
2213	add	x5, x5, x0
2214
2215	aese	v1.16b, v29.16b                                    //AES block 1 - round 11
2216	cmp	x0, x5                  //check if we have <= 4 blocks
2217
2218	aese	v0.16b, v29.16b                                    //AES block 0 - round 11
2219	add	w12, w12, #1                           //CTR block 3
2220
2221	aese	v3.16b, v29.16b                                    //AES block 3 - round 11
2222	b.ge	.L192_enc_tail                                   //handle tail
2223
2224	rev	w9, w12                                //CTR block 4
2225	ldp	x6, x7, [x0, #0]           //AES block 0 - load plaintext
2226#ifdef __AARCH64EB__
2227	rev	x6, x6
2228	rev	x7, x7
2229#endif
2230	orr	x9, x11, x9, lsl #32           //CTR block 4
2231	ldp	x21, x22, [x0, #32]          //AES block 2 - load plaintext
2232#ifdef __AARCH64EB__
2233	rev	x21, x21
2234	rev	x22, x22
2235#endif
2236	ldp	x23, x24, [x0, #48]          //AES block 3 - load plaintext
2237#ifdef __AARCH64EB__
2238	rev	x23, x23
2239	rev	x24, x24
2240#endif
2241	ldp	x19, x20, [x0, #16]          //AES block 1 - load plaintext
2242#ifdef __AARCH64EB__
2243	rev	x19, x19
2244	rev	x20, x20
2245#endif
2246	add	x0, x0, #64                      //AES input_ptr update
2247	cmp	x0, x5                  //check if we have <= 8 blocks
2248
2249	eor	x6, x6, x13                    //AES block 0 - round 12 low
2250
2251	eor	x7, x7, x14                    //AES block 0 - round 12 high
2252	eor	x22, x22, x14                    //AES block 2 - round 12 high
2253	fmov	d4, x6                              //AES block 0 - mov low
2254
2255	eor	x24, x24, x14                    //AES block 3 - round 12 high
2256	fmov	v4.d[1], x7                          //AES block 0 - mov high
2257
2258	eor	x21, x21, x13                    //AES block 2 - round 12 low
2259	eor	x19, x19, x13                    //AES block 1 - round 12 low
2260
2261	fmov	d5, x19                              //AES block 1 - mov low
2262	eor	x20, x20, x14                    //AES block 1 - round 12 high
2263
2264	fmov	v5.d[1], x20                          //AES block 1 - mov high
2265
2266	eor	x23, x23, x13                    //AES block 3 - round 12 low
2267	fmov	d6, x21                              //AES block 2 - mov low
2268
2269	add	w12, w12, #1                           //CTR block 4
2270	eor	v4.16b, v4.16b, v0.16b                         //AES block 0 - result
2271	fmov	d0, x10                              //CTR block 4
2272
2273	fmov	v0.d[1], x9                              //CTR block 4
2274	rev	w9, w12                                //CTR block 5
2275
2276	orr	x9, x11, x9, lsl #32           //CTR block 5
2277	add	w12, w12, #1                           //CTR block 5
2278
2279	fmov	d7, x23                              //AES block 3 - mov low
2280	st1	{ v4.16b}, [x2], #16                    //AES block 0 - store result
2281
2282	fmov	v6.d[1], x22                          //AES block 2 - mov high
2283
2284	eor	v5.16b, v5.16b, v1.16b                         //AES block 1 - result
2285	fmov	d1, x10                              //CTR block 5
2286	st1	{ v5.16b}, [x2], #16                    //AES block 1 - store result
2287
2288	fmov	v7.d[1], x24                          //AES block 3 - mov high
2289
2290	fmov	v1.d[1], x9                              //CTR block 5
2291	rev	w9, w12                                //CTR block 6
2292
2293	orr	x9, x11, x9, lsl #32           //CTR block 6
2294
2295	add	w12, w12, #1                           //CTR block 6
2296	eor	v6.16b, v6.16b, v2.16b                         //AES block 2 - result
2297	fmov	d2, x10                              //CTR block 6
2298
2299	fmov	v2.d[1], x9                              //CTR block 6
2300	rev	w9, w12                                //CTR block 7
2301
2302	orr	x9, x11, x9, lsl #32           //CTR block 7
2303	st1	{ v6.16b}, [x2], #16                    //AES block 2 - store result
2304
2305	eor	v7.16b, v7.16b, v3.16b                         //AES block 3 - result
2306	st1	{ v7.16b}, [x2], #16                    //AES block 3 - store result
2307	b.ge	.L192_enc_prepretail                             //do prepretail
2308
2309.L192_enc_main_loop:	//main	loop start
2310	aese	v2.16b, v18.16b
2311	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 0
2312	rev64	v5.16b, v5.16b                                   //GHASH block 4k+1 (t0 and t1 free)
2313
2314	aese	v1.16b, v18.16b
2315	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 0
2316	ldp	x19, x20, [x0, #16]          //AES block 4k+5 - load plaintext
2317#ifdef __AARCH64EB__
2318	rev	x19, x19
2319	rev	x20, x20
2320#endif
2321	ext	v11.16b, v11.16b, v11.16b, #8                    //PRE 0
2322	fmov	d3, x10                              //CTR block 4k+3
2323	rev64	v4.16b, v4.16b                                   //GHASH block 4k (only t0 is free)
2324
2325	aese	v2.16b, v19.16b
2326	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 1
2327	fmov	v3.d[1], x9                              //CTR block 4k+3
2328
2329	pmull2	v30.1q, v5.2d, v14.2d                         //GHASH block 4k+1 - high
2330	rev64	v7.16b, v7.16b                                   //GHASH block 4k+3 (t0, t1, t2 and t3 free)
2331	ldp	x21, x22, [x0, #32]          //AES block 4k+6 - load plaintext
2332#ifdef __AARCH64EB__
2333	rev	x21, x21
2334	rev	x22, x22
2335#endif
2336	aese	v0.16b, v18.16b
2337	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 0
2338	ldp	x23, x24, [x0, #48]          //AES block 4k+3 - load plaintext
2339#ifdef __AARCH64EB__
2340	rev	x23, x23
2341	rev	x24, x24
2342#endif
2343	pmull	v31.1q, v5.1d, v14.1d                         //GHASH block 4k+1 - low
2344	eor	v4.16b, v4.16b, v11.16b                          //PRE 1
2345
2346	aese	v1.16b, v19.16b
2347	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 1
2348
2349	aese	v0.16b, v19.16b
2350	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 1
2351	rev64	v6.16b, v6.16b                                   //GHASH block 4k+2 (t0, t1, and t2 free)
2352
2353	aese	v3.16b, v18.16b
2354	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 0
2355	eor	x24, x24, x14                    //AES block 4k+3 - round 12 high
2356
2357	pmull	v11.1q, v4.1d, v15.1d                      //GHASH block 4k - low
2358	mov	d8, v4.d[1]                                 //GHASH block 4k - mid
2359
2360	aese	v0.16b, v20.16b
2361	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 2
2362
2363	aese	v3.16b, v19.16b
2364	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 1
2365	eor	x21, x21, x13                    //AES block 4k+6 - round 12 low
2366
2367	eor	v8.8b, v8.8b, v4.8b                         //GHASH block 4k - mid
2368	eor	v11.16b, v11.16b, v31.16b                        //GHASH block 4k+1 - low
2369
2370	aese	v0.16b, v21.16b
2371	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 3
2372	eor	x19, x19, x13                    //AES block 4k+5 - round 12 low
2373
2374	aese	v1.16b, v20.16b
2375	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 2
2376	mov	d31, v6.d[1]                                 //GHASH block 4k+2 - mid
2377
2378	pmull2	v9.1q, v4.2d, v15.2d                      //GHASH block 4k - high
2379	mov	d4, v5.d[1]                                 //GHASH block 4k+1 - mid
2380
2381	aese	v2.16b, v20.16b
2382	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 2
2383
2384	aese	v1.16b, v21.16b
2385	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 3
2386
2387	mov	d10, v17.d[1]                              //GHASH block 4k - mid
2388	eor	v9.16b, v9.16b, v30.16b                        //GHASH block 4k+1 - high
2389
2390	aese	v3.16b, v20.16b
2391	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 2
2392	eor	v31.8b, v31.8b, v6.8b                         //GHASH block 4k+2 - mid
2393
2394	pmull2	v30.1q, v6.2d, v13.2d                         //GHASH block 4k+2 - high
2395
2396	aese	v0.16b, v22.16b
2397	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 4
2398	eor	v4.8b, v4.8b, v5.8b                         //GHASH block 4k+1 - mid
2399
2400	aese	v3.16b, v21.16b
2401	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 3
2402
2403	pmull2	v5.1q, v7.2d, v12.2d                         //GHASH block 4k+3 - high
2404	eor	x20, x20, x14                    //AES block 4k+5 - round 12 high
2405	ins	v31.d[1], v31.d[0]                               //GHASH block 4k+2 - mid
2406
2407	aese	v0.16b, v23.16b
2408	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 5
2409	add	w12, w12, #1                           //CTR block 4k+3
2410
2411	aese	v3.16b, v22.16b
2412	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 4
2413	eor	v9.16b, v9.16b, v30.16b                        //GHASH block 4k+2 - high
2414
2415	pmull	v4.1q, v4.1d, v17.1d                         //GHASH block 4k+1 - mid
2416	eor	x22, x22, x14                    //AES block 4k+6 - round 12 high
2417
2418	pmull2	v31.1q, v31.2d, v16.2d                         //GHASH block 4k+2 - mid
2419	eor	x23, x23, x13                    //AES block 4k+3 - round 12 low
2420	mov	d30, v7.d[1]                                 //GHASH block 4k+3 - mid
2421
2422	pmull	v10.1q, v8.1d, v10.1d                     //GHASH block 4k - mid
2423	rev	w9, w12                                //CTR block 4k+8
2424
2425	pmull	v8.1q, v6.1d, v13.1d                         //GHASH block 4k+2 - low
2426	orr	x9, x11, x9, lsl #32           //CTR block 4k+8
2427
2428	aese	v2.16b, v21.16b
2429	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 3
2430	eor	v30.8b, v30.8b, v7.8b                         //GHASH block 4k+3 - mid
2431
2432	aese	v1.16b, v22.16b
2433	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 4
2434	ldp	x6, x7, [x0, #0]           //AES block 4k+4 - load plaintext
2435#ifdef __AARCH64EB__
2436	rev	x6, x6
2437	rev	x7, x7
2438#endif
2439	aese	v0.16b, v24.16b
2440	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 6
2441	eor	v11.16b, v11.16b, v8.16b                        //GHASH block 4k+2 - low
2442
2443	aese	v2.16b, v22.16b
2444	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 4
2445	add	x0, x0, #64                      //AES input_ptr update
2446
2447	aese	v1.16b, v23.16b
2448	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 5
2449	movi	v8.8b, #0xc2
2450
2451	pmull	v6.1q, v7.1d, v12.1d                         //GHASH block 4k+3 - low
2452	eor	x7, x7, x14                    //AES block 4k+4 - round 12 high
2453	eor	v10.16b, v10.16b, v4.16b                        //GHASH block 4k+1 - mid
2454
2455	aese	v2.16b, v23.16b
2456	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 5
2457	eor	x6, x6, x13                    //AES block 4k+4 - round 12 low
2458
2459	aese	v1.16b, v24.16b
2460	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 6
2461	shl	d8, d8, #56              //mod_constant
2462
2463	aese	v3.16b, v23.16b
2464	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 5
2465	eor	v9.16b, v9.16b, v5.16b                        //GHASH block 4k+3 - high
2466
2467	aese	v0.16b, v25.16b
2468	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 7
2469	fmov	d5, x19                              //AES block 4k+5 - mov low
2470
2471	aese	v1.16b, v25.16b
2472	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 7
2473	eor	v10.16b, v10.16b, v31.16b                        //GHASH block 4k+2 - mid
2474
2475	aese	v3.16b, v24.16b
2476	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 6
2477	fmov	v5.d[1], x20                          //AES block 4k+5 - mov high
2478
2479	aese	v0.16b, v26.16b
2480	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 8
2481	eor	v11.16b, v11.16b, v6.16b                        //GHASH block 4k+3 - low
2482
2483	pmull	v30.1q, v30.1d, v16.1d                         //GHASH block 4k+3 - mid
2484	cmp	x0, x5                  //.LOOP CONTROL
2485	fmov	d4, x6                              //AES block 4k+4 - mov low
2486
2487	aese	v2.16b, v24.16b
2488	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 6
2489	fmov	v4.d[1], x7                          //AES block 4k+4 - mov high
2490
2491	aese	v1.16b, v26.16b
2492	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 8
2493	fmov	d7, x23                              //AES block 4k+3 - mov low
2494
2495	eor	v10.16b, v10.16b, v30.16b                        //GHASH block 4k+3 - mid
2496	eor	v30.16b, v11.16b, v9.16b                        //MODULO - karatsuba tidy up
2497	add	w12, w12, #1                           //CTR block 4k+8
2498
2499	aese	v2.16b, v25.16b
2500	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 7
2501	fmov	v7.d[1], x24                          //AES block 4k+3 - mov high
2502
2503	pmull	v31.1q, v9.1d, v8.1d           //MODULO - top 64b align with mid
2504	ext	v9.16b, v9.16b, v9.16b, #8                    //MODULO - other top alignment
2505	fmov	d6, x21                              //AES block 4k+6 - mov low
2506
2507	aese	v3.16b, v25.16b
2508	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 7
2509
2510	aese	v0.16b, v27.16b
2511	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 9
2512	eor	v10.16b, v10.16b, v30.16b                        //MODULO - karatsuba tidy up
2513
2514	aese	v2.16b, v26.16b
2515	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 8
2516
2517	aese	v3.16b, v26.16b
2518	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 8
2519
2520	aese	v1.16b, v27.16b
2521	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 9
2522
2523	aese	v0.16b, v28.16b
2524	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 10
2525	eor	v10.16b, v10.16b, v31.16b                     //MODULO - fold into mid
2526
2527	aese	v3.16b, v27.16b
2528	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 9
2529
2530	aese	v2.16b, v27.16b
2531	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 9
2532
2533	aese	v0.16b, v29.16b                                    //AES block 4k+4 - round 11
2534
2535	aese	v1.16b, v28.16b
2536	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 10
2537	eor	v10.16b, v10.16b, v9.16b                        //MODULO - fold into mid
2538
2539	aese	v2.16b, v28.16b
2540	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 10
2541
2542	eor	v4.16b, v4.16b, v0.16b                         //AES block 4k+4 - result
2543	fmov	d0, x10                              //CTR block 4k+8
2544
2545	aese	v1.16b, v29.16b                                    //AES block 4k+5 - round 11
2546	fmov	v0.d[1], x9                              //CTR block 4k+8
2547	rev	w9, w12                                //CTR block 4k+9
2548
2549	pmull	v9.1q, v10.1d, v8.1d           //MODULO - mid 64b align with low
2550	fmov	v6.d[1], x22                          //AES block 4k+6 - mov high
2551	st1	{ v4.16b}, [x2], #16                    //AES block 4k+4 - store result
2552
2553	aese	v3.16b, v28.16b
2554	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 10
2555	orr	x9, x11, x9, lsl #32           //CTR block 4k+9
2556
2557	eor	v5.16b, v5.16b, v1.16b                         //AES block 4k+5 - result
2558	add	w12, w12, #1                           //CTR block 4k+9
2559	fmov	d1, x10                              //CTR block 4k+9
2560
2561	aese	v2.16b, v29.16b                                    //AES block 4k+6 - round 11
2562	fmov	v1.d[1], x9                              //CTR block 4k+9
2563	rev	w9, w12                                //CTR block 4k+10
2564
2565	add	w12, w12, #1                           //CTR block 4k+10
2566	ext	v10.16b, v10.16b, v10.16b, #8                    //MODULO - other mid alignment
2567	orr	x9, x11, x9, lsl #32           //CTR block 4k+10
2568
2569	st1	{ v5.16b}, [x2], #16                    //AES block 4k+5 - store result
2570	eor	v11.16b, v11.16b, v9.16b                        //MODULO - fold into low
2571
2572	aese	v3.16b, v29.16b                                    //AES block 4k+7 - round 11
2573	eor	v6.16b, v6.16b, v2.16b                         //AES block 4k+6 - result
2574	fmov	d2, x10                              //CTR block 4k+10
2575
2576	st1	{ v6.16b}, [x2], #16                    //AES block 4k+6 - store result
2577	fmov	v2.d[1], x9                              //CTR block 4k+10
2578	rev	w9, w12                                //CTR block 4k+11
2579
2580	eor	v11.16b, v11.16b, v10.16b                        //MODULO - fold into low
2581	orr	x9, x11, x9, lsl #32           //CTR block 4k+11
2582
2583	eor	v7.16b, v7.16b, v3.16b                         //AES block 4k+3 - result
2584	st1	{ v7.16b}, [x2], #16                    //AES block 4k+3 - store result
2585	b.lt	.L192_enc_main_loop
2586
2587.L192_enc_prepretail:	//PREPRETAIL
2588	aese	v0.16b, v18.16b
2589	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 0
2590	rev64	v4.16b, v4.16b                                   //GHASH block 4k (only t0 is free)
2591
2592	fmov	d3, x10                              //CTR block 4k+3
2593	ext	v11.16b, v11.16b, v11.16b, #8                    //PRE 0
2594	add	w12, w12, #1                           //CTR block 4k+3
2595
2596	aese	v1.16b, v18.16b
2597	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 0
2598	rev64	v5.16b, v5.16b                                   //GHASH block 4k+1 (t0 and t1 free)
2599
2600	aese	v2.16b, v18.16b
2601	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 0
2602
2603	fmov	v3.d[1], x9                              //CTR block 4k+3
2604	eor	v4.16b, v4.16b, v11.16b                          //PRE 1
2605	mov	d10, v17.d[1]                              //GHASH block 4k - mid
2606
2607	aese	v1.16b, v19.16b
2608	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 1
2609	rev64	v6.16b, v6.16b                                   //GHASH block 4k+2 (t0, t1, and t2 free)
2610
2611	pmull2	v30.1q, v5.2d, v14.2d                         //GHASH block 4k+1 - high
2612
2613	pmull	v11.1q, v4.1d, v15.1d                      //GHASH block 4k - low
2614	mov	d8, v4.d[1]                                 //GHASH block 4k - mid
2615
2616	pmull	v31.1q, v5.1d, v14.1d                         //GHASH block 4k+1 - low
2617	rev64	v7.16b, v7.16b                                   //GHASH block 4k+3 (t0, t1, t2 and t3 free)
2618
2619	pmull2	v9.1q, v4.2d, v15.2d                      //GHASH block 4k - high
2620
2621	eor	v8.8b, v8.8b, v4.8b                         //GHASH block 4k - mid
2622	mov	d4, v5.d[1]                                 //GHASH block 4k+1 - mid
2623
2624	eor	v11.16b, v11.16b, v31.16b                        //GHASH block 4k+1 - low
2625	mov	d31, v6.d[1]                                 //GHASH block 4k+2 - mid
2626
2627	aese	v3.16b, v18.16b
2628	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 0
2629	eor	v9.16b, v9.16b, v30.16b                        //GHASH block 4k+1 - high
2630
2631	pmull2	v30.1q, v6.2d, v13.2d                         //GHASH block 4k+2 - high
2632
2633	eor	v4.8b, v4.8b, v5.8b                         //GHASH block 4k+1 - mid
2634	eor	v31.8b, v31.8b, v6.8b                         //GHASH block 4k+2 - mid
2635
2636	aese	v3.16b, v19.16b
2637	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 1
2638
2639	aese	v2.16b, v19.16b
2640	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 1
2641	eor	v9.16b, v9.16b, v30.16b                        //GHASH block 4k+2 - high
2642
2643	aese	v0.16b, v19.16b
2644	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 1
2645
2646	aese	v1.16b, v20.16b
2647	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 2
2648	mov	d30, v7.d[1]                                 //GHASH block 4k+3 - mid
2649
2650	pmull2	v5.1q, v7.2d, v12.2d                         //GHASH block 4k+3 - high
2651	ins	v31.d[1], v31.d[0]                               //GHASH block 4k+2 - mid
2652
2653	aese	v0.16b, v20.16b
2654	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 2
2655
2656	pmull	v10.1q, v8.1d, v10.1d                     //GHASH block 4k - mid
2657	eor	v30.8b, v30.8b, v7.8b                         //GHASH block 4k+3 - mid
2658
2659	aese	v1.16b, v21.16b
2660	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 3
2661
2662	pmull2	v31.1q, v31.2d, v16.2d                         //GHASH block 4k+2 - mid
2663
2664	pmull	v4.1q, v4.1d, v17.1d                         //GHASH block 4k+1 - mid
2665
2666	pmull	v30.1q, v30.1d, v16.1d                         //GHASH block 4k+3 - mid
2667	eor	v9.16b, v9.16b, v5.16b                        //GHASH block 4k+3 - high
2668
2669	pmull	v8.1q, v6.1d, v13.1d                         //GHASH block 4k+2 - low
2670
2671	aese	v0.16b, v21.16b
2672	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 3
2673	eor	v10.16b, v10.16b, v4.16b                        //GHASH block 4k+1 - mid
2674
2675	aese	v3.16b, v20.16b
2676	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 2
2677
2678	aese	v2.16b, v20.16b
2679	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 2
2680	eor	v11.16b, v11.16b, v8.16b                        //GHASH block 4k+2 - low
2681
2682	aese	v0.16b, v22.16b
2683	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 4
2684
2685	aese	v3.16b, v21.16b
2686	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 3
2687	eor	v10.16b, v10.16b, v31.16b                        //GHASH block 4k+2 - mid
2688
2689	aese	v2.16b, v21.16b
2690	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 3
2691
2692	pmull	v6.1q, v7.1d, v12.1d                         //GHASH block 4k+3 - low
2693	movi	v8.8b, #0xc2
2694
2695	aese	v3.16b, v22.16b
2696	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 4
2697
2698	aese	v2.16b, v22.16b
2699	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 4
2700
2701	aese	v1.16b, v22.16b
2702	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 4
2703	eor	v10.16b, v10.16b, v30.16b                        //GHASH block 4k+3 - mid
2704
2705	aese	v3.16b, v23.16b
2706	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 5
2707
2708	aese	v2.16b, v23.16b
2709	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 5
2710
2711	aese	v1.16b, v23.16b
2712	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 5
2713	eor	v11.16b, v11.16b, v6.16b                        //GHASH block 4k+3 - low
2714
2715	aese	v0.16b, v23.16b
2716	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 5
2717
2718	aese	v3.16b, v24.16b
2719	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 6
2720	eor	v10.16b, v10.16b, v9.16b                        //karatsuba tidy up
2721
2722	aese	v1.16b, v24.16b
2723	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 6
2724
2725	aese	v0.16b, v24.16b
2726	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 6
2727	shl	d8, d8, #56              //mod_constant
2728
2729	aese	v3.16b, v25.16b
2730	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 7
2731
2732	aese	v1.16b, v25.16b
2733	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 7
2734	eor	v10.16b, v10.16b, v11.16b
2735
2736	aese	v0.16b, v25.16b
2737	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 7
2738
2739	pmull	v30.1q, v9.1d, v8.1d
2740
2741	aese	v2.16b, v24.16b
2742	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 6
2743	ext	v9.16b, v9.16b, v9.16b, #8
2744
2745	aese	v0.16b, v26.16b
2746	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 8
2747
2748	aese	v1.16b, v26.16b
2749	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 8
2750	eor	v10.16b, v10.16b, v30.16b
2751
2752	aese	v2.16b, v25.16b
2753	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 7
2754
2755	aese	v3.16b, v26.16b
2756	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 8
2757
2758	aese	v0.16b, v27.16b
2759	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 9
2760
2761	aese	v2.16b, v26.16b
2762	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 8
2763	eor	v10.16b, v10.16b, v9.16b
2764
2765	aese	v3.16b, v27.16b
2766	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 9
2767
2768	aese	v1.16b, v27.16b
2769	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 9
2770
2771	aese	v2.16b, v27.16b
2772	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 9
2773
2774	pmull	v30.1q, v10.1d, v8.1d
2775
2776	ext	v10.16b, v10.16b, v10.16b, #8
2777
2778	aese	v3.16b, v28.16b
2779	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 10
2780
2781	aese	v0.16b, v28.16b
2782	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 10
2783
2784	aese	v2.16b, v28.16b
2785	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 10
2786
2787	aese	v1.16b, v28.16b
2788	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 10
2789	eor	v11.16b, v11.16b, v30.16b
2790
2791	aese	v0.16b, v29.16b                                    //AES block 4k+4 - round 11
2792
2793	aese	v3.16b, v29.16b                                    //AES block 4k+7 - round 11
2794
2795	aese	v2.16b, v29.16b                                    //AES block 4k+6 - round 11
2796
2797	aese	v1.16b, v29.16b                                    //AES block 4k+5 - round 11
2798	eor	v11.16b, v11.16b, v10.16b
2799.L192_enc_tail:	//TAIL
2800
2801	sub	x5, x4, x0  //main_end_input_ptr is number of bytes left to process
2802	ldp	x6, x7, [x0], #16          //AES block 4k+4 - load plaintext
2803#ifdef __AARCH64EB__
2804	rev	x6, x6
2805	rev	x7, x7
2806#endif
2807	eor	x6, x6, x13                    //AES block 4k+4 - round 12 low
2808	eor	x7, x7, x14                    //AES block 4k+4 - round 12 high
2809
2810	fmov	d4, x6                              //AES block 4k+4 - mov low
2811
2812	fmov	v4.d[1], x7                          //AES block 4k+4 - mov high
2813	cmp	x5, #48
2814
2815	eor	v5.16b, v4.16b, v0.16b                         //AES block 4k+4 - result
2816
2817	ext	v8.16b, v11.16b, v11.16b, #8                    //prepare final partial tag
2818	b.gt	.L192_enc_blocks_more_than_3
2819
2820	sub	w12, w12, #1
2821	movi	v10.8b, #0
2822
2823	mov	v3.16b, v2.16b
2824	movi	v9.8b, #0
2825	cmp	x5, #32
2826
2827	mov	v2.16b, v1.16b
2828	movi	v11.8b, #0
2829	b.gt	.L192_enc_blocks_more_than_2
2830
2831	sub	w12, w12, #1
2832
2833	mov	v3.16b, v1.16b
2834	cmp	x5, #16
2835	b.gt	.L192_enc_blocks_more_than_1
2836
2837	sub	w12, w12, #1
2838	b	.L192_enc_blocks_less_than_1
2839.L192_enc_blocks_more_than_3:	//blocks	left >  3
2840	st1	{ v5.16b}, [x2], #16                    //AES final-3 block  - store result
2841
2842	ldp	x6, x7, [x0], #16          //AES final-2 block - load input low & high
2843#ifdef __AARCH64EB__
2844	rev	x6, x6
2845	rev	x7, x7
2846#endif
2847	rev64	v4.16b, v5.16b                                   //GHASH final-3 block
2848
2849	eor	x6, x6, x13                    //AES final-2 block - round 12 low
2850	eor	v4.16b, v4.16b, v8.16b                          //feed in partial tag
2851
2852	eor	x7, x7, x14                    //AES final-2 block - round 12 high
2853	fmov	d5, x6                                //AES final-2 block - mov low
2854
2855	fmov	v5.d[1], x7                            //AES final-2 block - mov high
2856
2857	mov	d22, v4.d[1]                                //GHASH final-3 block - mid
2858
2859	pmull	v11.1q, v4.1d, v15.1d                      //GHASH final-3 block - low
2860
2861	mov	d10, v17.d[1]                              //GHASH final-3 block - mid
2862
2863	eor	v22.8b, v22.8b, v4.8b                     //GHASH final-3 block - mid
2864
2865	movi	v8.8b, #0                                       //suppress further partial tag feed in
2866
2867	pmull2	v9.1q, v4.2d, v15.2d                      //GHASH final-3 block - high
2868
2869	pmull	v10.1q, v22.1d, v10.1d                   //GHASH final-3 block - mid
2870	eor	v5.16b, v5.16b, v1.16b                           //AES final-2 block - result
2871.L192_enc_blocks_more_than_2:	//blocks	left >  2
2872
2873	st1	{ v5.16b}, [x2], #16                    //AES final-2 block - store result
2874
2875	rev64	v4.16b, v5.16b                                   //GHASH final-2 block
2876	ldp	x6, x7, [x0], #16          //AES final-1 block - load input low & high
2877#ifdef __AARCH64EB__
2878	rev	x6, x6
2879	rev	x7, x7
2880#endif
2881	eor	v4.16b, v4.16b, v8.16b                          //feed in partial tag
2882
2883	eor	x7, x7, x14                    //AES final-1 block - round 12 high
2884
2885	pmull2	v20.1q, v4.2d, v14.2d                         //GHASH final-2 block - high
2886	mov	d22, v4.d[1]                                //GHASH final-2 block - mid
2887
2888	pmull	v21.1q, v4.1d, v14.1d                         //GHASH final-2 block - low
2889	eor	x6, x6, x13                    //AES final-1 block - round 12 low
2890
2891	fmov	d5, x6                                //AES final-1 block - mov low
2892
2893	fmov	v5.d[1], x7                            //AES final-1 block - mov high
2894	eor	v9.16b, v9.16b, v20.16b                           //GHASH final-2 block - high
2895	eor	v22.8b, v22.8b, v4.8b                     //GHASH final-2 block - mid
2896
2897	eor	v11.16b, v11.16b, v21.16b                           //GHASH final-2 block - low
2898
2899	pmull	v22.1q, v22.1d, v17.1d                     //GHASH final-2 block - mid
2900
2901	movi	v8.8b, #0                                       //suppress further partial tag feed in
2902
2903	eor	v5.16b, v5.16b, v2.16b                           //AES final-1 block - result
2904
2905	eor	v10.16b, v10.16b, v22.16b                      //GHASH final-2 block - mid
2906.L192_enc_blocks_more_than_1:	//blocks	left >  1
2907
2908	st1	{ v5.16b}, [x2], #16                    //AES final-1 block - store result
2909
2910	ldp	x6, x7, [x0], #16          //AES final block - load input low & high
2911#ifdef __AARCH64EB__
2912	rev	x6, x6
2913	rev	x7, x7
2914#endif
2915	rev64	v4.16b, v5.16b                                   //GHASH final-1 block
2916
2917	eor	x6, x6, x13                    //AES final block - round 12 low
2918	eor	v4.16b, v4.16b, v8.16b                          //feed in partial tag
2919	movi	v8.8b, #0                                       //suppress further partial tag feed in
2920
2921	mov	d22, v4.d[1]                                //GHASH final-1 block - mid
2922
2923	eor	v22.8b, v22.8b, v4.8b                     //GHASH final-1 block - mid
2924	eor	x7, x7, x14                    //AES final block - round 12 high
2925	fmov	d5, x6                                //AES final block - mov low
2926
2927	pmull2	v20.1q, v4.2d, v13.2d                         //GHASH final-1 block - high
2928	fmov	v5.d[1], x7                            //AES final block - mov high
2929
2930	ins	v22.d[1], v22.d[0]                           //GHASH final-1 block - mid
2931
2932	eor	v9.16b, v9.16b, v20.16b                           //GHASH final-1 block - high
2933
2934	pmull	v21.1q, v4.1d, v13.1d                         //GHASH final-1 block - low
2935
2936	pmull2	v22.1q, v22.2d, v16.2d                     //GHASH final-1 block - mid
2937
2938	eor	v5.16b, v5.16b, v3.16b                           //AES final block - result
2939
2940	eor	v11.16b, v11.16b, v21.16b                           //GHASH final-1 block - low
2941
2942	eor	v10.16b, v10.16b, v22.16b                      //GHASH final-1 block - mid
2943.L192_enc_blocks_less_than_1:	//blocks	left <= 1
2944
2945	ld1	{ v18.16b}, [x2]                           //load existing bytes where the possibly partial last block is to be stored
2946#ifndef __AARCH64EB__
2947	rev	w9, w12
2948#else
2949	mov	w9, w12
2950#endif
2951	and	x1, x1, #127                   //bit_length %= 128
2952
2953	sub	x1, x1, #128                   //bit_length -= 128
2954	mvn	x14, xzr                                     //rk12_h = 0xffffffffffffffff
2955
2956	neg	x1, x1                         //bit_length = 128 - #bits in input (in range [1,128])
2957	mvn	x13, xzr                                     //rk12_l = 0xffffffffffffffff
2958
2959	and	x1, x1, #127                   //bit_length %= 128
2960
2961	lsr	x14, x14, x1                    //rk12_h is mask for top 64b of last block
2962	cmp	x1, #64
2963
2964	csel	x6, x13, x14, lt
2965	csel	x7, x14, xzr, lt
2966
2967	fmov	d0, x6                                //ctr0b is mask for last block
2968
2969	fmov	v0.d[1], x7
2970
2971	and	v5.16b, v5.16b, v0.16b                           //possibly partial last block has zeroes in highest bits
2972
2973	rev64	v4.16b, v5.16b                                   //GHASH final block
2974
2975	eor	v4.16b, v4.16b, v8.16b                          //feed in partial tag
2976
2977	mov	d8, v4.d[1]                                 //GHASH final block - mid
2978
2979	pmull	v21.1q, v4.1d, v12.1d                         //GHASH final block - low
2980
2981	pmull2	v20.1q, v4.2d, v12.2d                         //GHASH final block - high
2982
2983	eor	v8.8b, v8.8b, v4.8b                         //GHASH final block - mid
2984
2985	eor	v11.16b, v11.16b, v21.16b                           //GHASH final block - low
2986
2987	eor	v9.16b, v9.16b, v20.16b                           //GHASH final block - high
2988
2989	pmull	v8.1q, v8.1d, v16.1d                         //GHASH final block - mid
2990
2991	eor	v10.16b, v10.16b, v8.16b                        //GHASH final block - mid
2992	movi	v8.8b, #0xc2
2993
2994	eor	v30.16b, v11.16b, v9.16b                        //MODULO - karatsuba tidy up
2995
2996	shl	d8, d8, #56              //mod_constant
2997
2998	bif	v5.16b, v18.16b, v0.16b                             //insert existing bytes in top end of result before storing
2999
3000	eor	v10.16b, v10.16b, v30.16b                        //MODULO - karatsuba tidy up
3001
3002	pmull	v31.1q, v9.1d, v8.1d           //MODULO - top 64b align with mid
3003
3004	ext	v9.16b, v9.16b, v9.16b, #8                    //MODULO - other top alignment
3005
3006	eor	v10.16b, v10.16b, v31.16b                     //MODULO - fold into mid
3007
3008	eor	v10.16b, v10.16b, v9.16b                        //MODULO - fold into mid
3009
3010	pmull	v9.1q, v10.1d, v8.1d           //MODULO - mid 64b align with low
3011
3012	ext	v10.16b, v10.16b, v10.16b, #8                    //MODULO - other mid alignment
3013
3014	eor	v11.16b, v11.16b, v9.16b                        //MODULO - fold into low
3015	str	w9, [x16, #12]                         //store the updated counter
3016
3017	st1	{ v5.16b}, [x2]                         //store all 16B
3018
3019	eor	v11.16b, v11.16b, v10.16b                        //MODULO - fold into low
3020	ext	v11.16b, v11.16b, v11.16b, #8
3021	rev64	v11.16b, v11.16b
3022	mov	x0, x15
3023	st1	{ v11.16b }, [x3]
3024
3025	ldp	x21, x22, [sp, #16]
3026	ldp	x23, x24, [sp, #32]
3027	ldp	d8, d9, [sp, #48]
3028	ldp	d10, d11, [sp, #64]
3029	ldp	d12, d13, [sp, #80]
3030	ldp	d14, d15, [sp, #96]
3031	ldp	x19, x20, [sp], #112
3032	ret
3033
3034.L192_enc_ret:
3035	mov	w0, #0x0
3036	ret
3037.size	aes_gcm_enc_192_kernel,.-aes_gcm_enc_192_kernel
3038.globl	aes_gcm_dec_192_kernel
3039.type	aes_gcm_dec_192_kernel,%function
3040.align	4
3041aes_gcm_dec_192_kernel:
3042	cbz	x1, .L192_dec_ret
3043	stp	x19, x20, [sp, #-112]!
3044	mov	x16, x4
3045	mov	x8, x5
3046	stp	x21, x22, [sp, #16]
3047	stp	x23, x24, [sp, #32]
3048	stp	d8, d9, [sp, #48]
3049	stp	d10, d11, [sp, #64]
3050	stp	d12, d13, [sp, #80]
3051	stp	d14, d15, [sp, #96]
3052
3053	add	x4, x0, x1, lsr #3   //end_input_ptr
3054	ldp	x10, x11, [x16]              //ctr96_b64, ctr96_t32
3055#ifdef __AARCH64EB__
3056	rev	x10, x10
3057	rev	x11, x11
3058#endif
3059	ldp	x13, x14, [x8, #192]                     //load rk12
3060#ifdef __AARCH64EB__
3061	ror	x13, x13, #32
3062	ror	x14, x14, #32
3063#endif
3064	ld1	{ v0.16b}, [x16]                             //special case vector load initial counter so we can start first AES block as quickly as possible
3065
3066	ld1	{v18.4s}, [x8], #16                                  //load rk0
3067
3068	lsr	x5, x1, #3              //byte_len
3069	mov	x15, x5
3070	ld1	{v19.4s}, [x8], #16                               //load rk1
3071
3072	lsr	x12, x11, #32
3073	orr	w11, w11, w11
3074	fmov	d3, x10                               //CTR block 3
3075
3076	rev	w12, w12                                //rev_ctr32
3077	fmov	d1, x10                               //CTR block 1
3078
3079	add	w12, w12, #1                            //increment rev_ctr32
3080	ld1	{v20.4s}, [x8], #16                               //load rk2
3081
3082	aese	v0.16b, v18.16b
3083	aesmc	v0.16b, v0.16b          //AES block 0 - round 0
3084	rev	w9, w12                                 //CTR block 1
3085
3086	add	w12, w12, #1                            //CTR block 1
3087	orr	x9, x11, x9, lsl #32            //CTR block 1
3088	ld1	{v21.4s}, [x8], #16                               //load rk3
3089
3090	fmov	v1.d[1], x9                               //CTR block 1
3091	rev	w9, w12                                 //CTR block 2
3092	add	w12, w12, #1                            //CTR block 2
3093
3094	fmov	d2, x10                               //CTR block 2
3095	orr	x9, x11, x9, lsl #32            //CTR block 2
3096
3097	fmov	v2.d[1], x9                               //CTR block 2
3098	rev	w9, w12                                 //CTR block 3
3099
3100	aese	v0.16b, v19.16b
3101	aesmc	v0.16b, v0.16b          //AES block 0 - round 1
3102	orr	x9, x11, x9, lsl #32            //CTR block 3
3103
3104	fmov	v3.d[1], x9                               //CTR block 3
3105
3106	ld1	{v22.4s}, [x8], #16                               //load rk4
3107
3108	aese	v0.16b, v20.16b
3109	aesmc	v0.16b, v0.16b          //AES block 0 - round 2
3110
3111	aese	v2.16b, v18.16b
3112	aesmc	v2.16b, v2.16b          //AES block 2 - round 0
3113	ld1	{v23.4s}, [x8], #16                               //load rk5
3114
3115	aese	v1.16b, v18.16b
3116	aesmc	v1.16b, v1.16b          //AES block 1 - round 0
3117	ldr	q15, [x3, #112]                        //load h4l | h4h
3118#ifndef __AARCH64EB__
3119	ext	v15.16b, v15.16b, v15.16b, #8
3120#endif
3121	aese	v3.16b, v18.16b
3122	aesmc	v3.16b, v3.16b          //AES block 3 - round 0
3123	ldr	q13, [x3, #64]                         //load h2l | h2h
3124#ifndef __AARCH64EB__
3125	ext	v13.16b, v13.16b, v13.16b, #8
3126#endif
3127	aese	v2.16b, v19.16b
3128	aesmc	v2.16b, v2.16b          //AES block 2 - round 1
3129	ldr	q14, [x3, #80]                         //load h3l | h3h
3130#ifndef __AARCH64EB__
3131	ext	v14.16b, v14.16b, v14.16b, #8
3132#endif
3133	aese	v1.16b, v19.16b
3134	aesmc	v1.16b, v1.16b          //AES block 1 - round 1
3135
3136	aese	v3.16b, v19.16b
3137	aesmc	v3.16b, v3.16b          //AES block 3 - round 1
3138	ldr	q12, [x3, #32]                         //load h1l | h1h
3139#ifndef __AARCH64EB__
3140	ext	v12.16b, v12.16b, v12.16b, #8
3141#endif
3142	aese	v2.16b, v20.16b
3143	aesmc	v2.16b, v2.16b          //AES block 2 - round 2
3144	ld1	{v24.4s}, [x8], #16                               //load rk6
3145
3146	aese	v0.16b, v21.16b
3147	aesmc	v0.16b, v0.16b          //AES block 0 - round 3
3148	ld1	{v25.4s}, [x8], #16                               //load rk7
3149
3150	aese	v1.16b, v20.16b
3151	aesmc	v1.16b, v1.16b          //AES block 1 - round 2
3152	ld1	{v26.4s}, [x8], #16                               //load rk8
3153
3154	aese	v3.16b, v20.16b
3155	aesmc	v3.16b, v3.16b          //AES block 3 - round 2
3156	ld1	{v27.4s}, [x8], #16                               //load rk9
3157
3158	aese	v2.16b, v21.16b
3159	aesmc	v2.16b, v2.16b          //AES block 2 - round 3
3160	ld1	{ v11.16b}, [x3]
3161	ext	v11.16b, v11.16b, v11.16b, #8
3162	rev64	v11.16b, v11.16b
3163
3164	aese	v1.16b, v21.16b
3165	aesmc	v1.16b, v1.16b          //AES block 1 - round 3
3166	add	w12, w12, #1                            //CTR block 3
3167
3168	aese	v3.16b, v21.16b
3169	aesmc	v3.16b, v3.16b          //AES block 3 - round 3
3170	trn1	v9.2d, v14.2d,    v15.2d                      //h4h | h3h
3171
3172	aese	v0.16b, v22.16b
3173	aesmc	v0.16b, v0.16b          //AES block 0 - round 4
3174	ld1	{v28.4s}, [x8], #16                              //load rk10
3175
3176	aese	v1.16b, v22.16b
3177	aesmc	v1.16b, v1.16b          //AES block 1 - round 4
3178	trn2	v17.2d,  v14.2d,    v15.2d                      //h4l | h3l
3179
3180	aese	v2.16b, v22.16b
3181	aesmc	v2.16b, v2.16b          //AES block 2 - round 4
3182
3183	aese	v3.16b, v22.16b
3184	aesmc	v3.16b, v3.16b          //AES block 3 - round 4
3185	trn2	v16.2d,  v12.2d,    v13.2d                      //h2l | h1l
3186
3187	aese	v0.16b, v23.16b
3188	aesmc	v0.16b, v0.16b          //AES block 0 - round 5
3189	ld1	{v29.4s}, [x8], #16                              //load rk11
3190
3191	aese	v1.16b, v23.16b
3192	aesmc	v1.16b, v1.16b          //AES block 1 - round 5
3193
3194	aese	v2.16b, v23.16b
3195	aesmc	v2.16b, v2.16b          //AES block 2 - round 5
3196
3197	aese	v3.16b, v23.16b
3198	aesmc	v3.16b, v3.16b          //AES block 3 - round 5
3199
3200	aese	v0.16b, v24.16b
3201	aesmc	v0.16b, v0.16b          //AES block 0 - round 6
3202
3203	aese	v2.16b, v24.16b
3204	aesmc	v2.16b, v2.16b          //AES block 2 - round 6
3205
3206	aese	v3.16b, v24.16b
3207	aesmc	v3.16b, v3.16b          //AES block 3 - round 6
3208
3209	aese	v0.16b, v25.16b
3210	aesmc	v0.16b, v0.16b          //AES block 0 - round 7
3211
3212	aese	v2.16b, v25.16b
3213	aesmc	v2.16b, v2.16b          //AES block 2 - round 7
3214
3215	aese	v3.16b, v25.16b
3216	aesmc	v3.16b, v3.16b          //AES block 3 - round 7
3217
3218	aese	v1.16b, v24.16b
3219	aesmc	v1.16b, v1.16b          //AES block 1 - round 6
3220
3221	aese	v2.16b, v26.16b
3222	aesmc	v2.16b, v2.16b          //AES block 2 - round 8
3223
3224	aese	v3.16b, v26.16b
3225	aesmc	v3.16b, v3.16b          //AES block 3 - round 8
3226
3227	aese	v1.16b, v25.16b
3228	aesmc	v1.16b, v1.16b          //AES block 1 - round 7
3229
3230	aese	v2.16b, v27.16b
3231	aesmc	v2.16b, v2.16b          //AES block 2 - round 9
3232
3233	aese	v3.16b, v27.16b
3234	aesmc	v3.16b, v3.16b          //AES block 3 - round 9
3235
3236	aese	v1.16b, v26.16b
3237	aesmc	v1.16b, v1.16b          //AES block 1 - round 8
3238	sub	x5, x5, #1      //byte_len - 1
3239
3240	aese	v0.16b, v26.16b
3241	aesmc	v0.16b, v0.16b          //AES block 0 - round 8
3242	and	x5, x5, #0xffffffffffffffc0    //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
3243
3244	aese	v3.16b, v28.16b
3245	aesmc	v3.16b, v3.16b          //AES block 3 - round 10
3246	add	x5, x5, x0
3247
3248	aese	v1.16b, v27.16b
3249	aesmc	v1.16b, v1.16b          //AES block 1 - round 9
3250	cmp	x0, x5                   //check if we have <= 4 blocks
3251
3252	aese	v0.16b, v27.16b
3253	aesmc	v0.16b, v0.16b          //AES block 0 - round 9
3254	trn1	v8.2d,    v12.2d,    v13.2d                      //h2h | h1h
3255
3256	aese	v3.16b, v29.16b                                     //AES block 3 - round 11
3257
3258	aese	v2.16b, v28.16b
3259	aesmc	v2.16b, v2.16b          //AES block 2 - round 10
3260
3261	aese	v1.16b, v28.16b
3262	aesmc	v1.16b, v1.16b          //AES block 1 - round 10
3263
3264	aese	v0.16b, v28.16b
3265	aesmc	v0.16b, v0.16b          //AES block 0 - round 10
3266	eor	v16.16b, v16.16b, v8.16b                     //h2k | h1k
3267
3268	aese	v2.16b, v29.16b                                     //AES block 2 - round 11
3269
3270	aese	v1.16b, v29.16b                                     //AES block 1 - round 11
3271	eor	v17.16b, v17.16b, v9.16b                  //h4k | h3k
3272
3273	aese	v0.16b, v29.16b                                     //AES block 0 - round 11
3274	b.ge	.L192_dec_tail                                    //handle tail
3275
3276	ld1	{v4.16b, v5.16b}, [x0], #32               //AES block 0,1 - load ciphertext
3277
3278	eor	v1.16b, v5.16b, v1.16b                            //AES block 1 - result
3279
3280	eor	v0.16b, v4.16b, v0.16b                            //AES block 0 - result
3281	rev	w9, w12                                 //CTR block 4
3282	ld1	{v6.16b, v7.16b}, [x0], #32               //AES block 2,3 - load ciphertext
3283
3284	mov	x19, v1.d[0]                            //AES block 1 - mov low
3285
3286	mov	x20, v1.d[1]                            //AES block 1 - mov high
3287
3288	mov	x6, v0.d[0]                            //AES block 0 - mov low
3289	orr	x9, x11, x9, lsl #32            //CTR block 4
3290	add	w12, w12, #1                            //CTR block 4
3291
3292	mov	x7, v0.d[1]                            //AES block 0 - mov high
3293	rev64	v4.16b, v4.16b                                    //GHASH block 0
3294
3295	fmov	d0, x10                               //CTR block 4
3296	rev64	v5.16b, v5.16b                                    //GHASH block 1
3297	cmp	x0, x5                   //check if we have <= 8 blocks
3298
3299	eor	x19, x19, x13                   //AES block 1 - round 12 low
3300#ifdef __AARCH64EB__
3301	rev	x19, x19
3302#endif
3303	fmov	v0.d[1], x9                               //CTR block 4
3304	rev	w9, w12                                 //CTR block 5
3305
3306	orr	x9, x11, x9, lsl #32            //CTR block 5
3307	fmov	d1, x10                               //CTR block 5
3308	eor	x20, x20, x14                   //AES block 1 - round 12 high
3309#ifdef __AARCH64EB__
3310	rev	x20, x20
3311#endif
3312	add	w12, w12, #1                            //CTR block 5
3313	fmov	v1.d[1], x9                               //CTR block 5
3314	eor	x6, x6, x13                   //AES block 0 - round 12 low
3315#ifdef __AARCH64EB__
3316	rev	x6, x6
3317#endif
3318	rev	w9, w12                                 //CTR block 6
3319	eor	x7, x7, x14                   //AES block 0 - round 12 high
3320#ifdef __AARCH64EB__
3321	rev	x7, x7
3322#endif
3323	stp	x6, x7, [x2], #16        //AES block 0 - store result
3324	orr	x9, x11, x9, lsl #32            //CTR block 6
3325
3326	stp	x19, x20, [x2], #16        //AES block 1 - store result
3327
3328	add	w12, w12, #1                            //CTR block 6
3329	eor	v2.16b, v6.16b, v2.16b                            //AES block 2 - result
3330	b.ge	.L192_dec_prepretail                              //do prepretail
3331
// .L192_dec_main_loop - steady-state body of the decrypt loop.
//
// Every pass interleaves three independent streams of work so that the
// AES, polynomial-multiply and load/store pipelines stay busy:
//   * AES rounds 0-11 (round keys in v18-v29) on the four counter blocks
//     held in v0-v3 (blocks 4k+4 .. 4k+7);
//   * GHASH of the four byte-reversed ciphertext blocks held in v4-v7
//     (blocks 4k .. 4k+3), multiplied by v15/v14/v13/v12 with the
//     karatsuba middle terms taken from v17/v16; v11, v9 and v10 collect
//     the low, high and middle products, which are then reduced with the
//     constant 0xc2 << 56 built in v8;
//   * loading the next four ciphertext blocks from [x0] and storing four
//     result blocks to [x2] (blocks 4k+2 and 4k+3, then 4k+4 and 4k+5).
//
// The last key addition is not done with a vector instruction: the two
// halves of each block are moved to general registers and XORed with
// x13 (low) and x14 (high) - the lines commented as round 12.
// NOTE(review): x13/x14 are loaded outside the lines shown here; presumably
// they hold the final round key - confirm at the function entry.
//
// Counter blocks are rebuilt as x10 in the low 64 bits and
// x11 | (rev(w12) << 32) in the high 64 bits; w12 is incremented once per
// block.
//
// Loop control: cmp x0, x5 in the middle of the body, b.lt at the bottom.
// Nothing between the two modifies the condition flags.
3332.L192_dec_main_loop:	//main	loop start
3333	aese	v1.16b, v18.16b
3334	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 0
3335	ext	v11.16b, v11.16b, v11.16b, #8                     //PRE 0
3336
3337	pmull	v31.1q, v5.1d, v14.1d                          //GHASH block 4k+1 - low
3338	mov	x21, v2.d[0]                            //AES block 4k+2 - mov low
3339
3340	mov	x22, v2.d[1]                            //AES block 4k+2 - mov high
3341	eor	v3.16b, v7.16b, v3.16b                            //AES block 4k+3 - result
3342	rev64	v7.16b, v7.16b                                    //GHASH block 4k+3
3343
3344	aese	v1.16b, v19.16b
3345	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 1
3346	fmov	d2, x10                               //CTR block 4k+6
3347
3348	aese	v0.16b, v18.16b
3349	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 0
3350	eor	v4.16b, v4.16b, v11.16b                           //PRE 1
3351
3352	pmull2	v30.1q, v5.2d, v14.2d                          //GHASH block 4k+1 - high
3353	fmov	v2.d[1], x9                               //CTR block 4k+6
3354
3355	aese	v1.16b, v20.16b
3356	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 2
3357	mov	x24, v3.d[1]                            //AES block 4k+3 - mov high
3358
3359	aese	v0.16b, v19.16b
3360	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 1
3361	mov	x23, v3.d[0]                            //AES block 4k+3 - mov low
3362
3363	pmull2	v9.1q, v4.2d, v15.2d                       //GHASH block 4k - high
3364	fmov	d3, x10                               //CTR block 4k+7
3365	mov	d8, v4.d[1]                                  //GHASH block 4k - mid
3366
3367	pmull	v11.1q, v4.1d, v15.1d                       //GHASH block 4k - low
3368	mov	d10, v17.d[1]                               //GHASH block 4k - mid
3369	rev	w9, w12                                 //CTR block 4k+7
3370
3371	aese	v2.16b, v18.16b
3372	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 0
3373	orr	x9, x11, x9, lsl #32            //CTR block 4k+7
3374
3375	fmov	v3.d[1], x9                               //CTR block 4k+7
3376	eor	v8.8b, v8.8b, v4.8b                          //GHASH block 4k - mid
3377	mov	d4, v5.d[1]                                  //GHASH block 4k+1 - mid
3378
3379	aese	v1.16b, v21.16b
3380	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 3
3381
3382	aese	v0.16b, v20.16b
3383	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 2
3384	eor	x22, x22, x14                   //AES block 4k+2 - round 12 high
3385#ifdef __AARCH64EB__
3386	rev	x22, x22
3387#endif
3388	aese	v2.16b, v19.16b
3389	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 1
3390	eor	v4.8b, v4.8b, v5.8b                          //GHASH block 4k+1 - mid
3391
3392	pmull	v10.1q, v8.1d, v10.1d                      //GHASH block 4k - mid
3393
3394	aese	v3.16b, v18.16b
3395	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 0
3396	rev64	v6.16b, v6.16b                                    //GHASH block 4k+2
3397
3398	aese	v2.16b, v20.16b
3399	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 2
3400
3401	pmull	v4.1q, v4.1d, v17.1d                          //GHASH block 4k+1 - mid
3402	eor	v11.16b, v11.16b, v31.16b                         //GHASH block 4k+1 - low
3403	eor	x21, x21, x13                   //AES block 4k+2 - round 12 low
3404#ifdef __AARCH64EB__
3405	rev	x21, x21
3406#endif
3407	aese	v1.16b, v22.16b
3408	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 4
3409
3410	aese	v0.16b, v21.16b
3411	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 3
3412
3413	eor	v10.16b, v10.16b, v4.16b                         //GHASH block 4k+1 - mid
3414	mov	d31, v6.d[1]                                  //GHASH block 4k+2 - mid
3415
3416	aese	v3.16b, v19.16b
3417	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 1
3418	eor	v9.16b, v9.16b, v30.16b                         //GHASH block 4k+1 - high
3419
3420	aese	v0.16b, v22.16b
3421	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 4
3422
3423	pmull2	v30.1q, v6.2d, v13.2d                          //GHASH block 4k+2 - high
3424	eor	v31.8b, v31.8b, v6.8b                          //GHASH block 4k+2 - mid
3425
3426	pmull	v8.1q, v6.1d, v13.1d                          //GHASH block 4k+2 - low
3427
3428	aese	v0.16b, v23.16b
3429	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 5
3430
3431	eor	v9.16b, v9.16b, v30.16b                         //GHASH block 4k+2 - high
3432	mov	d30, v7.d[1]                                  //GHASH block 4k+3 - mid
3433
3434	aese	v1.16b, v23.16b
3435	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 5
3436
3437	pmull2	v5.1q, v7.2d, v12.2d                          //GHASH block 4k+3 - high
3438
3439	aese	v3.16b, v20.16b
3440	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 2
3441	eor	v30.8b, v30.8b, v7.8b                          //GHASH block 4k+3 - mid
3442
3443	aese	v1.16b, v24.16b
3444	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 6
3445
3446	aese	v0.16b, v24.16b
3447	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 6
3448	ins	v31.d[1], v31.d[0]                                //GHASH block 4k+2 - mid
3449
3450	aese	v3.16b, v21.16b
3451	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 3
3452
3453	pmull	v30.1q, v30.1d, v16.1d                          //GHASH block 4k+3 - mid
3454	eor	v11.16b, v11.16b, v8.16b                         //GHASH block 4k+2 - low
3455
3456	aese	v0.16b, v25.16b
3457	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 7
3458
3459	pmull2	v31.1q, v31.2d, v16.2d                          //GHASH block 4k+2 - mid
3460	eor	v9.16b, v9.16b, v5.16b                         //GHASH block 4k+3 - high
3461
3462	aese	v1.16b, v25.16b
3463	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 7
3464
3465	aese	v0.16b, v26.16b
3466	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 8
3467	movi	v8.8b, #0xc2                                     //first half of mod_constant; shifted into place by the shl d8 below
3468
3469	pmull	v6.1q, v7.1d, v12.1d                          //GHASH block 4k+3 - low
3470
3471	aese	v1.16b, v26.16b
3472	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 8
3473	eor	v10.16b, v10.16b, v31.16b                         //GHASH block 4k+2 - mid
3474
3475	aese	v2.16b, v21.16b
3476	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 3
3477
3478	aese	v0.16b, v27.16b
3479	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 9
3480	eor	v11.16b, v11.16b, v6.16b                         //GHASH block 4k+3 - low
3481
3482	aese	v3.16b, v22.16b
3483	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 4
3484
3485	aese	v2.16b, v22.16b
3486	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 4
3487	eor	v10.16b, v10.16b, v30.16b                         //GHASH block 4k+3 - mid
3488
3489	aese	v0.16b, v28.16b
3490	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 10
3491
3492	aese	v1.16b, v27.16b
3493	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 9
3494	eor	v30.16b, v11.16b, v9.16b                         //MODULO - karatsuba tidy up
3495
3496	aese	v2.16b, v23.16b
3497	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 5
3498
3499	aese	v3.16b, v23.16b
3500	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 5
3501	shl	d8, d8, #56               //mod_constant
3502
3503	aese	v1.16b, v28.16b
3504	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 10
3505
3506	aese	v2.16b, v24.16b
3507	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 6
// v4 is free again here (its GHASH products are already in v9/v10/v11),
// so the next group of ciphertext starts loading into it.
3508	ld1	{v4.16b}, [x0], #16                       //AES block 4k+4 - load ciphertext
3509
3510	aese	v3.16b, v24.16b
3511	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 6
3512	eor	v10.16b, v10.16b, v30.16b                         //MODULO - karatsuba tidy up
3513
3514	pmull	v31.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid
3515	ld1	{v5.16b}, [x0], #16                       //AES block 4k+5 - load ciphertext
3516	eor	x23, x23, x13                   //AES block 4k+3 - round 12 low
3517#ifdef __AARCH64EB__
3518	rev	x23, x23
3519#endif
3520	aese	v2.16b, v25.16b
3521	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 7
3522	ext	v9.16b, v9.16b, v9.16b, #8                     //MODULO - other top alignment
3523
3524	aese	v0.16b, v29.16b                                     //AES block 4k+4 - round 11
3525	add	w12, w12, #1                            //CTR block 4k+7
3526
3527	aese	v3.16b, v25.16b
3528	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 7
3529	eor	v10.16b, v10.16b, v31.16b                      //MODULO - fold into mid
3530
3531	aese	v2.16b, v26.16b
3532	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 8
3533	ld1	{v6.16b}, [x0], #16                       //AES block 4k+6 - load ciphertext
3534
3535	aese	v1.16b, v29.16b                                     //AES block 4k+5 - round 11
3536	ld1	{v7.16b}, [x0], #16                       //AES block 4k+7 - load ciphertext
3537	rev	w9, w12                                 //CTR block 4k+8
3538
3539	aese	v3.16b, v26.16b
3540	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 8
3541	stp	x21, x22, [x2], #16        //AES block 4k+2 - store result
3542
3543	aese	v2.16b, v27.16b
3544	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 9
3545	eor	v10.16b, v10.16b, v9.16b                         //MODULO - fold into mid
3546
// Flags set here are consumed by the b.lt at the bottom of the loop; only
// non-flag-setting instructions may be placed in between.
3547	cmp	x0, x5                   //.LOOP CONTROL
3548
3549	eor	v0.16b, v4.16b, v0.16b                            //AES block 4k+4 - result
3550	eor	x24, x24, x14                   //AES block 4k+3 - round 12 high
3551#ifdef __AARCH64EB__
3552	rev	x24, x24
3553#endif
3554	eor	v1.16b, v5.16b, v1.16b                            //AES block 4k+5 - result
3555
3556	aese	v2.16b, v28.16b
3557	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 10
3558	orr	x9, x11, x9, lsl #32            //CTR block 4k+8
3559
3560	aese	v3.16b, v27.16b
3561	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 9
3562
3563	pmull	v8.1q, v10.1d, v8.1d     //MODULO - mid 64b align with low
3564	mov	x19, v1.d[0]                            //AES block 4k+5 - mov low
3565
3566	mov	x6, v0.d[0]                            //AES block 4k+4 - mov low
3567	stp	x23, x24, [x2], #16        //AES block 4k+3 - store result
3568	rev64	v5.16b, v5.16b                                    //GHASH block 4k+5
3569
3570	aese	v2.16b, v29.16b                                     //AES block 4k+6 - round 11
3571	mov	x7, v0.d[1]                            //AES block 4k+4 - mov high
3572
3573	aese	v3.16b, v28.16b
3574	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 10
3575	mov	x20, v1.d[1]                            //AES block 4k+5 - mov high
3576
3577	fmov	d0, x10                               //CTR block 4k+8
3578	add	w12, w12, #1                            //CTR block 4k+8
3579	ext	v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment
3580
3581	eor	v2.16b, v6.16b, v2.16b                            //AES block 4k+6 - result
3582	fmov	v0.d[1], x9                               //CTR block 4k+8
3583	rev	w9, w12                                 //CTR block 4k+9
3584
3585	eor	x6, x6, x13                   //AES block 4k+4 - round 12 low
3586#ifdef __AARCH64EB__
3587	rev	x6, x6
3588#endif
3589	orr	x9, x11, x9, lsl #32            //CTR block 4k+9
3590	eor	v11.16b, v11.16b, v8.16b               //MODULO - fold into low
3591
3592	fmov	d1, x10                               //CTR block 4k+9
3593	add	w12, w12, #1                            //CTR block 4k+9
3594	eor	x19, x19, x13                   //AES block 4k+5 - round 12 low
3595#ifdef __AARCH64EB__
3596	rev	x19, x19
3597#endif
3598	fmov	v1.d[1], x9                               //CTR block 4k+9
3599	rev	w9, w12                                 //CTR block 4k+10
3600	eor	x20, x20, x14                   //AES block 4k+5 - round 12 high
3601#ifdef __AARCH64EB__
3602	rev	x20, x20
3603#endif
3604	eor	x7, x7, x14                   //AES block 4k+4 - round 12 high
3605#ifdef __AARCH64EB__
3606	rev	x7, x7
3607#endif
3608	stp	x6, x7, [x2], #16        //AES block 4k+4 - store result
3609	eor	v11.16b, v11.16b, v10.16b                         //MODULO - fold into low
3610
3611	add	w12, w12, #1                            //CTR block 4k+10
3612	rev64	v4.16b, v4.16b                                    //GHASH block 4k+4
3613	orr	x9, x11, x9, lsl #32            //CTR block 4k+10
3614
3615	aese	v3.16b, v29.16b                                     //AES block 4k+7 - round 11
3616	stp	x19, x20, [x2], #16        //AES block 4k+5 - store result
3617	b.lt	.L192_dec_main_loop                                //repeat while x0 < x5 (flags from the cmp above)
3618
// .L192_dec_prepretail - drain step between the main loop and the tail.
//
// Reached either by falling out of .L192_dec_main_loop or by the b.ge that
// skips the loop entirely. It does the same work as one loop pass except
// that it loads no new ciphertext:
//   * finishes blocks 4k+2 and 4k+3 (key addition with x13/x14 in general
//     registers) and stores them to [x2];
//   * folds the four byte-reversed ciphertext blocks in v4-v7 into the
//     GHASH accumulators (v11 low, v9 high, v10 middle) and reduces them
//     into v11 using the constant 0xc2 << 56 built in v8;
//   * runs AES rounds 0-11 (v18-v29) on the counter blocks in v0-v3 so
//     that the tail has keystream ready for the remaining data.
// Execution falls through into .L192_dec_tail.
3619.L192_dec_prepretail:	//PREPRETAIL
3620	mov	x22, v2.d[1]                            //AES block 4k+2 - mov high
3621	ext	v11.16b, v11.16b, v11.16b, #8                     //PRE 0
3622	eor	v3.16b, v7.16b, v3.16b                            //AES block 4k+3 - result
3623
3624	aese	v1.16b, v18.16b
3625	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 0
3626	mov	x21, v2.d[0]                            //AES block 4k+2 - mov low
3627
3628	aese	v0.16b, v18.16b
3629	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 0
3630	mov	d10, v17.d[1]                               //GHASH block 4k - mid
3631
3632	eor	v4.16b, v4.16b, v11.16b                           //PRE 1
3633	fmov	d2, x10                               //CTR block 4k+6
3634
3635	aese	v1.16b, v19.16b
3636	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 1
3637	mov	x23, v3.d[0]                            //AES block 4k+3 - mov low
3638
3639	aese	v0.16b, v19.16b
3640	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 1
3641	mov	x24, v3.d[1]                            //AES block 4k+3 - mov high
3642
3643	pmull	v11.1q, v4.1d, v15.1d                       //GHASH block 4k - low
3644	mov	d8, v4.d[1]                                  //GHASH block 4k - mid
3645	fmov	d3, x10                               //CTR block 4k+7
3646
3647	aese	v1.16b, v20.16b
3648	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 2
3649	rev64	v6.16b, v6.16b                                    //GHASH block 4k+2
3650
3651	pmull2	v9.1q, v4.2d, v15.2d                       //GHASH block 4k - high
3652	fmov	v2.d[1], x9                               //CTR block 4k+6
3653	rev	w9, w12                                 //CTR block 4k+7
3654
3655	orr	x9, x11, x9, lsl #32            //CTR block 4k+7
3656	eor	v8.8b, v8.8b, v4.8b                          //GHASH block 4k - mid
3657	mov	d4, v5.d[1]                                  //GHASH block 4k+1 - mid
3658
3659	pmull	v31.1q, v5.1d, v14.1d                          //GHASH block 4k+1 - low
3660	eor	x24, x24, x14                   //AES block 4k+3 - round 12 high
3661#ifdef __AARCH64EB__
3662	rev	x24, x24
3663#endif
3664	fmov	v3.d[1], x9                               //CTR block 4k+7
3665
3666	aese	v0.16b, v20.16b
3667	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 2
3668	eor	x21, x21, x13                   //AES block 4k+2 - round 12 low
3669#ifdef __AARCH64EB__
3670	rev	x21, x21
3671#endif
3672	pmull2	v30.1q, v5.2d, v14.2d                          //GHASH block 4k+1 - high
3673	eor	x22, x22, x14                   //AES block 4k+2 - round 12 high
3674#ifdef __AARCH64EB__
3675	rev	x22, x22
3676#endif
3677	eor	v4.8b, v4.8b, v5.8b                          //GHASH block 4k+1 - mid
3678
3679	pmull	v10.1q, v8.1d, v10.1d                      //GHASH block 4k - mid
3680	eor	x23, x23, x13                   //AES block 4k+3 - round 12 low
3681#ifdef __AARCH64EB__
3682	rev	x23, x23
3683#endif
3684	stp	x21, x22, [x2], #16        //AES block 4k+2 - store result
3685
3686	rev64	v7.16b, v7.16b                                    //GHASH block 4k+3
3687	stp	x23, x24, [x2], #16        //AES block 4k+3 - store result
3688
3689	aese	v3.16b, v18.16b
3690	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 0
3691	eor	v9.16b, v9.16b, v30.16b                         //GHASH block 4k+1 - high
3692
3693	pmull	v4.1q, v4.1d, v17.1d                          //GHASH block 4k+1 - mid
3694	add	w12, w12, #1                            //CTR block 4k+7
3695
3696	pmull2	v30.1q, v6.2d, v13.2d                          //GHASH block 4k+2 - high
3697	eor	v11.16b, v11.16b, v31.16b                         //GHASH block 4k+1 - low
3698
3699	aese	v2.16b, v18.16b
3700	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 0
3701
3702	eor	v10.16b, v10.16b, v4.16b                         //GHASH block 4k+1 - mid
3703	mov	d31, v6.d[1]                                  //GHASH block 4k+2 - mid
3704
3705	aese	v3.16b, v19.16b
3706	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 1
3707
3708	aese	v2.16b, v19.16b
3709	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 1
3710	eor	v9.16b, v9.16b, v30.16b                         //GHASH block 4k+2 - high
3711
3712	eor	v31.8b, v31.8b, v6.8b                          //GHASH block 4k+2 - mid
3713
3714	pmull	v8.1q, v6.1d, v13.1d                          //GHASH block 4k+2 - low
3715
3716	aese	v2.16b, v20.16b
3717	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 2
3718	mov	d30, v7.d[1]                                  //GHASH block 4k+3 - mid
3719
3720	aese	v3.16b, v20.16b
3721	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 2
3722	ins	v31.d[1], v31.d[0]                                //GHASH block 4k+2 - mid
3723
3724	pmull	v6.1q, v7.1d, v12.1d                          //GHASH block 4k+3 - low
3725
3726	aese	v0.16b, v21.16b
3727	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 3
3728	eor	v30.8b, v30.8b, v7.8b                          //GHASH block 4k+3 - mid
3729
3730	aese	v1.16b, v21.16b
3731	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 3
3732
3733	pmull2	v31.1q, v31.2d, v16.2d                          //GHASH block 4k+2 - mid
3734	eor	v11.16b, v11.16b, v8.16b                         //GHASH block 4k+2 - low
3735
3736	aese	v0.16b, v22.16b
3737	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 4
3738
3739	pmull2	v5.1q, v7.2d, v12.2d                          //GHASH block 4k+3 - high
3740	movi	v8.8b, #0xc2                                     //first half of mod_constant; shifted into place by the shl d8 below
3741
3742	pmull	v30.1q, v30.1d, v16.1d                          //GHASH block 4k+3 - mid
3743
3744	aese	v2.16b, v21.16b
3745	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 3
3746
3747	shl	d8, d8, #56               //mod_constant
3748	eor	v9.16b, v9.16b, v5.16b                         //GHASH block 4k+3 - high
3749
3750	aese	v0.16b, v23.16b
3751	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 5
3752	eor	v10.16b, v10.16b, v31.16b                         //GHASH block 4k+2 - mid
3753
3754	aese	v2.16b, v22.16b
3755	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 4
3756
3757	pmull	v31.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid
3758	eor	v11.16b, v11.16b, v6.16b                         //GHASH block 4k+3 - low
3759
3760	aese	v0.16b, v24.16b
3761	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 6
3762
3763	aese	v3.16b, v21.16b
3764	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 3
3765	eor	v10.16b, v10.16b, v30.16b                         //GHASH block 4k+3 - mid
3766
3767	aese	v2.16b, v23.16b
3768	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 5
3769
3770	aese	v0.16b, v25.16b
3771	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 7
3772	eor	v30.16b, v11.16b, v9.16b                         //MODULO - karatsuba tidy up
3773
3774	aese	v3.16b, v22.16b
3775	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 4
3776
3777	aese	v2.16b, v24.16b
3778	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 6
3779	ext	v9.16b, v9.16b, v9.16b, #8                     //MODULO - other top alignment
3780
3781	aese	v0.16b, v26.16b
3782	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 8
3783
3784	aese	v3.16b, v23.16b
3785	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 5
3786	eor	v10.16b, v10.16b, v30.16b                         //MODULO - karatsuba tidy up
3787
3788	aese	v1.16b, v22.16b
3789	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 4
3790
3791	aese	v2.16b, v25.16b
3792	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 7
3793
3794	aese	v0.16b, v27.16b
3795	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 9
3796
3797	aese	v1.16b, v23.16b
3798	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 5
3799
3800	aese	v3.16b, v24.16b
3801	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 6
3802	eor	v10.16b, v10.16b, v31.16b                      //MODULO - fold into mid
3803
3804	aese	v0.16b, v28.16b
3805	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 10
3806
3807	aese	v1.16b, v24.16b
3808	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 6
3809
3810	aese	v3.16b, v25.16b
3811	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 7
3812
3813	aese	v2.16b, v26.16b
3814	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 8
3815	eor	v10.16b, v10.16b, v9.16b                         //MODULO - fold into mid
3816
3817	aese	v1.16b, v25.16b
3818	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 7
3819
3820	aese	v3.16b, v26.16b
3821	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 8
3822
3823	aese	v2.16b, v27.16b
3824	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 9
3825
3826	aese	v1.16b, v26.16b
3827	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 8
3828
3829	aese	v3.16b, v27.16b
3830	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 9
3831
3832	pmull	v8.1q, v10.1d, v8.1d     //MODULO - mid 64b align with low
3833
3834	aese	v1.16b, v27.16b
3835	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 9
3836
3837	aese	v2.16b, v28.16b
3838	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 10
3839
3840	aese	v3.16b, v28.16b
3841	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 10
3842	ext	v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment
3843
3844	aese	v1.16b, v28.16b
3845	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 10
3846
// Round 11 for all four blocks: aese only, no aesmc. The remaining key
// addition is done later in general registers with x13/x14.
3847	aese	v0.16b, v29.16b                                     //AES block 4k+4 - round 11
3848	eor	v11.16b, v11.16b, v8.16b               //MODULO - fold into low
3849
3850	aese	v2.16b, v29.16b                                     //AES block 4k+6 - round 11
3851
3852	aese	v1.16b, v29.16b                                     //AES block 4k+5 - round 11
3853
3854	aese	v3.16b, v29.16b                                     //AES block 4k+7 - round 11
3855
3856	eor	v11.16b, v11.16b, v10.16b                         //MODULO - fold into low
// .L192_dec_tail - entry point for the last, possibly partial, group of
// blocks.
//
// x5 is recomputed as x4 - x0, the number of input bytes still to be
// processed. The first remaining ciphertext block is loaded into v5 and
// combined with the keystream in v0; the key addition with x13/x14 leaves
// the result in x6 (low) / x7 (high). v8 receives the running GHASH value
// from v11 with its two 64-bit halves swapped, ready to be fed into the
// first block that gets hashed.
//
// x5 is then compared against 48, 32 and 16 to pick the entry point:
//   x5 > 48 -> .L192_dec_blocks_more_than_3 (keystream v1, v2, v3 follow)
//   x5 > 32 -> .L192_dec_blocks_more_than_2 (v1, v2 moved down to v2, v3)
//   x5 > 16 -> .L192_dec_blocks_more_than_1 (v1 moved down to v3)
//   else    -> .L192_dec_blocks_less_than_1
// On every path that skips a block, w12 is decremented once for each
// counter block that was prepared but is not consumed, and the GHASH
// accumulators v11, v9 and v10 are cleared because no earlier tail block
// initialises them.
3857.L192_dec_tail:	//TAIL
3858
3859	sub	x5, x4, x0   //main_end_input_ptr is number of bytes left to process
3860	ld1	{ v5.16b}, [x0], #16                      //AES block 4k+4 - load ciphertext
3861
3862	eor	v0.16b, v5.16b, v0.16b                            //AES block 4k+4 - result
3863
3864	mov	x7, v0.d[1]                            //AES block 4k+4 - mov high
3865
3866	mov	x6, v0.d[0]                            //AES block 4k+4 - mov low
3867
3868	ext	v8.16b, v11.16b, v11.16b, #8                     //prepare final partial tag
3869
3870	cmp	x5, #48                                 //flags consumed by the b.gt below; the eor/rev in between leave them untouched
3871
3872	eor	x7, x7, x14                   //AES block 4k+4 - round 12 high
3873#ifdef __AARCH64EB__
3874	rev	x7, x7
3875#endif
3876	eor	x6, x6, x13                   //AES block 4k+4 - round 12 low
3877#ifdef __AARCH64EB__
3878	rev	x6, x6
3879#endif
3880	b.gt	.L192_dec_blocks_more_than_3
3881
3882	movi	v11.8b, #0                                       //clear GHASH low accumulator
3883	movi	v9.8b, #0                                        //clear GHASH high accumulator
3884
3885	mov	v3.16b, v2.16b                          //shift keystream down one slot: v2 -> v3
3886	mov	v2.16b, v1.16b                          //shift keystream down one slot: v1 -> v2
3887	sub	w12, w12, #1                            //one prepared counter block will not be used
3888
3889	movi	v10.8b, #0                                       //clear GHASH mid accumulator
3890	cmp	x5, #32
3891	b.gt	.L192_dec_blocks_more_than_2
3892
3893	mov	v3.16b, v1.16b                          //only one more keystream block needed: v1 -> v3
3894	cmp	x5, #16
3895	sub	w12, w12, #1                            //a second prepared counter block will not be used
3896
3897	b.gt	.L192_dec_blocks_more_than_1
3898
3899	sub	w12, w12, #1                            //a third prepared counter block will not be used
3900	b	.L192_dec_blocks_less_than_1
// .L192_dec_blocks_more_than_3 - tail step taken when more than 48 bytes
// remain.
//
// On entry v5 holds the ciphertext of the final-3 block and x6/x7 hold its
// finished result. This step:
//   * stores x6/x7 to [x2];
//   * byte-reverses the final-3 ciphertext, XORs in the pending GHASH value
//     from v8 and starts fresh accumulators from it: v11 (low) and v9
//     (high) are products with v15, v10 (mid) is the product of the folded
//     halves with the upper half of v17;
//   * clears v8 so the pending GHASH value is fed in only once;
//   * loads the final-2 ciphertext into v5, combines it with keystream v1
//     and leaves the keyed result in x6/x7 for the next step to store.
// v22 is used as a scratch register here, although the main loop uses it
// as a round-key register.
// Falls through into .L192_dec_blocks_more_than_2.
3901.L192_dec_blocks_more_than_3:	//blocks	left >  3
3902	rev64	v4.16b, v5.16b                                    //GHASH final-3 block
3903	ld1	{ v5.16b}, [x0], #16                      //AES final-2 block - load ciphertext
3904
3905	stp	x6, x7, [x2], #16        //AES final-3 block  - store result
3906
3907	eor	v4.16b, v4.16b, v8.16b                           //feed in partial tag
3908
3909	eor	v0.16b, v5.16b, v1.16b                            //AES final-2 block - result
3910
3911	pmull	v11.1q, v4.1d, v15.1d                       //GHASH final-3 block - low
3912	mov	x6, v0.d[0]                            //AES final-2 block - mov low
3913	mov	d22, v4.d[1]                                 //GHASH final-3 block - mid
3914
3915	mov	x7, v0.d[1]                            //AES final-2 block - mov high
3916
3917	mov	d10, v17.d[1]                               //GHASH final-3 block - mid
3918	eor	v22.8b, v22.8b, v4.8b                      //GHASH final-3 block - mid
3919
3920	pmull2	v9.1q, v4.2d, v15.2d                       //GHASH final-3 block - high
3921
3922	eor	x6, x6, x13                   //AES final-2 block - round 12 low
3923#ifdef __AARCH64EB__
3924	rev	x6, x6
3925#endif
3926	movi	v8.8b, #0                                        //suppress further partial tag feed in
3927
3928	pmull	v10.1q, v22.1d, v10.1d                    //GHASH final-3 block - mid
3929	eor	x7, x7, x14                   //AES final-2 block - round 12 high
3930#ifdef __AARCH64EB__
3931	rev	x7, x7
3932#endif
// .L192_dec_blocks_more_than_2 - tail step taken when more than 32 bytes
// remain (entered directly from .L192_dec_tail or by fall-through from
// .L192_dec_blocks_more_than_3).
//
// On entry v5 holds the ciphertext of the final-2 block and x6/x7 hold its
// finished result. This step:
//   * byte-reverses the final-2 ciphertext, XORs in v8 (the pending GHASH
//     value, or zero if an earlier step already consumed it) and adds its
//     products to the accumulators: v11 (low) and v9 (high) use v14, v10
//     (mid) uses the lower half of v17;
//   * clears v8;
//   * stores x6/x7 to [x2];
//   * loads the final-1 ciphertext into v5, combines it with keystream v2
//     and leaves the keyed result in x6/x7 for the next step to store.
// v20, v21 and v22 are used as scratch registers here, although the main
// loop uses them as round-key registers.
// Falls through into .L192_dec_blocks_more_than_1.
3933.L192_dec_blocks_more_than_2:	//blocks	left >  2
3934
3935	rev64	v4.16b, v5.16b                                    //GHASH final-2 block
3936	ld1	{ v5.16b}, [x0], #16                      //AES final-1 block - load ciphertext
3937
3938	eor	v4.16b, v4.16b, v8.16b                           //feed in partial tag
3939
3940	movi	v8.8b, #0                                        //suppress further partial tag feed in
3941
3942	eor	v0.16b, v5.16b, v2.16b                            //AES final-1 block - result
3943
3944	mov	d22, v4.d[1]                                 //GHASH final-2 block - mid
3945
3946	pmull	v21.1q, v4.1d, v14.1d                          //GHASH final-2 block - low
3947
3948	stp	x6, x7, [x2], #16        //AES final-2 block  - store result
3949
3950	eor	v22.8b, v22.8b, v4.8b                      //GHASH final-2 block - mid
3951	mov	x7, v0.d[1]                            //AES final-1 block - mov high
3952
3953	eor	v11.16b, v11.16b, v21.16b                            //GHASH final-2 block - low
3954	mov	x6, v0.d[0]                            //AES final-1 block - mov low
3955
3956	pmull2	v20.1q, v4.2d, v14.2d                          //GHASH final-2 block - high
3957
3958	pmull	v22.1q, v22.1d, v17.1d                      //GHASH final-2 block - mid
3959
3960	eor	v9.16b, v9.16b, v20.16b                            //GHASH final-2 block - high
3961	eor	x7, x7, x14                   //AES final-1 block - round 12 high
3962#ifdef __AARCH64EB__
3963	rev	x7, x7
3964#endif
3965	eor	x6, x6, x13                   //AES final-1 block - round 12 low
3966#ifdef __AARCH64EB__
3967	rev	x6, x6
3968#endif
3969	eor	v10.16b, v10.16b, v22.16b                       //GHASH final-2 block - mid
3970.L192_dec_blocks_more_than_1:	//blocks	left >  1
3971
3972	rev64	v4.16b, v5.16b                                    //GHASH final-1 block
3973
3974	eor	v4.16b, v4.16b, v8.16b                           //feed in partial tag
3975	ld1	{ v5.16b}, [x0], #16                      //AES final block - load ciphertext
3976
3977	mov	d22, v4.d[1]                                 //GHASH final-1 block - mid
3978
3979	pmull2	v20.1q, v4.2d, v13.2d                          //GHASH final-1 block - high
3980
3981	eor	v0.16b, v5.16b, v3.16b                            //AES final block - result
3982	stp	x6, x7, [x2], #16        //AES final-1 block  - store result
3983
3984	eor	v22.8b, v22.8b, v4.8b                      //GHASH final-1 block - mid
3985
3986	eor	v9.16b, v9.16b, v20.16b                            //GHASH final-1 block - high
3987
3988	pmull	v21.1q, v4.1d, v13.1d                          //GHASH final-1 block - low
3989	mov	x7, v0.d[1]                            //AES final block - mov high
3990
3991	ins	v22.d[1], v22.d[0]                            //GHASH final-1 block - mid
3992	mov	x6, v0.d[0]                            //AES final block - mov low
3993
3994	pmull2	v22.1q, v22.2d, v16.2d                      //GHASH final-1 block - mid
3995
3996	movi	v8.8b, #0                                        //suppress further partial tag feed in
3997	eor	v11.16b, v11.16b, v21.16b                            //GHASH final-1 block - low
3998	eor	x7, x7, x14                   //AES final block - round 12 high
3999#ifdef __AARCH64EB__
4000	rev	x7, x7
4001#endif
4002	eor	x6, x6, x13                   //AES final block - round 12 low
4003#ifdef __AARCH64EB__
4004	rev	x6, x6
4005#endif
4006	eor	v10.16b, v10.16b, v22.16b                       //GHASH final-1 block - mid
4007.L192_dec_blocks_less_than_1:	//blocks	left <= 1
4008
4009	mvn	x13, xzr                                      //rk12_l = 0xffffffffffffffff
4010	ldp	x4, x5, [x2]  //load existing bytes we need to not overwrite
4011	and	x1, x1, #127                    //bit_length %= 128
4012
4013	sub	x1, x1, #128                    //bit_length -= 128
4014
4015	neg	x1, x1                          //bit_length = 128 - #bits in input (in range [1,128])
4016
4017	and	x1, x1, #127                    //bit_length %= 128
4018	mvn	x14, xzr                                      //rk12_h = 0xffffffffffffffff
4019
4020	lsr	x14, x14, x1                     //rk12_h is mask for top 64b of last block
4021	cmp	x1, #64
4022
4023	csel	x9, x13, x14, lt
4024	csel	x10, x14, xzr, lt
4025
4026	fmov	d0, x9                                   //ctr0b is mask for last block
4027	and	x6, x6, x9
4028	bic	x4, x4, x9           //mask out low existing bytes
4029
4030	orr	x6, x6, x4
4031	mov	v0.d[1], x10
4032#ifndef __AARCH64EB__
4033	rev	w9, w12
4034#else
4035	mov	w9, w12
4036#endif
4037
4038	and	v5.16b, v5.16b, v0.16b                            //possibly partial last block has zeroes in highest bits
4039	str	w9, [x16, #12]                          //store the updated counter
4040
4041	rev64	v4.16b, v5.16b                                    //GHASH final block
4042
4043	eor	v4.16b, v4.16b, v8.16b                           //feed in partial tag
4044	bic	x5, x5, x10 //mask out high existing bytes
4045
4046	and	x7, x7, x10
4047
4048	pmull2	v20.1q, v4.2d, v12.2d                          //GHASH final block - high
4049	mov	d8, v4.d[1]                                  //GHASH final block - mid
4050
4051	pmull	v21.1q, v4.1d, v12.1d                          //GHASH final block - low
4052
4053	eor	v8.8b, v8.8b, v4.8b                          //GHASH final block - mid
4054
4055	eor	v9.16b, v9.16b, v20.16b                            //GHASH final block - high
4056
4057	pmull	v8.1q, v8.1d, v16.1d                          //GHASH final block - mid
4058
4059	eor	v11.16b, v11.16b, v21.16b                            //GHASH final block - low
4060
4061	eor	v10.16b, v10.16b, v8.16b                         //GHASH final block - mid
4062	movi	v8.8b, #0xc2
4063
4064	eor	v30.16b, v11.16b, v9.16b                         //MODULO - karatsuba tidy up
4065
4066	shl	d8, d8, #56               //mod_constant
4067
4068	eor	v10.16b, v10.16b, v30.16b                         //MODULO - karatsuba tidy up
4069
4070	pmull	v31.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid
4071	orr	x7, x7, x5
4072	stp	x6, x7, [x2]
4073
4074	ext	v9.16b, v9.16b, v9.16b, #8                     //MODULO - other top alignment
4075
4076	eor	v10.16b, v10.16b, v31.16b                      //MODULO - fold into mid
4077
4078	eor	v10.16b, v10.16b, v9.16b                         //MODULO - fold into mid
4079
4080	pmull	v8.1q, v10.1d, v8.1d     //MODULO - mid 64b align with low
4081
4082	eor	v11.16b, v11.16b, v8.16b               //MODULO - fold into low
4083
4084	ext	v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment
4085
4086	eor	v11.16b, v11.16b, v10.16b                         //MODULO - fold into low
4087	ext	v11.16b, v11.16b, v11.16b, #8
4088	rev64	v11.16b, v11.16b
4089	mov	x0, x15
4090	st1	{ v11.16b }, [x3]
4091
4092	ldp	x21, x22, [sp, #16]
4093	ldp	x23, x24, [sp, #32]
4094	ldp	d8, d9, [sp, #48]
4095	ldp	d10, d11, [sp, #64]
4096	ldp	d12, d13, [sp, #80]
4097	ldp	d14, d15, [sp, #96]
4098	ldp	x19, x20, [sp], #112
4099	ret
4100
4101.L192_dec_ret:
4102	mov	w0, #0x0
4103	ret
4104.size	aes_gcm_dec_192_kernel,.-aes_gcm_dec_192_kernel
4105.globl	aes_gcm_enc_256_kernel
4106.type	aes_gcm_enc_256_kernel,%function
4107.align	4
4108aes_gcm_enc_256_kernel:
4109	cbz	x1, .L256_enc_ret
4110	stp	x19, x20, [sp, #-112]!
4111	mov	x16, x4
4112	mov	x8, x5
4113	stp	x21, x22, [sp, #16]
4114	stp	x23, x24, [sp, #32]
4115	stp	d8, d9, [sp, #48]
4116	stp	d10, d11, [sp, #64]
4117	stp	d12, d13, [sp, #80]
4118	stp	d14, d15, [sp, #96]
4119
4120	add	x4, x0, x1, lsr #3   //end_input_ptr
4121	lsr	x5, x1, #3              //byte_len
4122	mov	x15, x5
4123	ldp	x10, x11, [x16]              //ctr96_b64, ctr96_t32
4124#ifdef __AARCH64EB__
4125	rev	x10, x10
4126	rev	x11, x11
4127#endif
4128	ldp	x13, x14, [x8, #224]                     //load rk14
4129#ifdef __AARCH64EB__
4130	ror	x13, x13, #32
4131	ror	x14, x14, #32
4132#endif
4133	ld1	{ v0.16b}, [x16]                             //special case vector load initial counter so we can start first AES block as quickly as possible
4134	sub	x5, x5, #1      //byte_len - 1
4135
4136	ld1	{v18.4s}, [x8], #16                               //load rk0
4137	and	x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
4138
4139	ld1	{v19.4s}, [x8], #16                               //load rk1
4140	add	x5, x5, x0
4141
4142	lsr	x12, x11, #32
4143	fmov	d2, x10                               //CTR block 2
4144	orr	w11, w11, w11
4145
4146	rev	w12, w12                                //rev_ctr32
4147	cmp	x0, x5                   //check if we have <= 4 blocks
4148	fmov	d1, x10                               //CTR block 1
4149
4150	aese	v0.16b, v18.16b
4151	aesmc	v0.16b, v0.16b          //AES block 0 - round 0
4152	add	w12, w12, #1                            //increment rev_ctr32
4153
4154	rev	w9, w12                                 //CTR block 1
4155	fmov	d3, x10                               //CTR block 3
4156
4157	orr	x9, x11, x9, lsl #32            //CTR block 1
4158	add	w12, w12, #1                            //CTR block 1
4159	ld1	{v20.4s}, [x8], #16                               //load rk2
4160
4161	fmov	v1.d[1], x9                               //CTR block 1
4162	rev	w9, w12                                 //CTR block 2
4163	add	w12, w12, #1                            //CTR block 2
4164
4165	orr	x9, x11, x9, lsl #32            //CTR block 2
4166	ld1	{v21.4s}, [x8], #16                               //load rk3
4167
4168	fmov	v2.d[1], x9                               //CTR block 2
4169	rev	w9, w12                                 //CTR block 3
4170
4171	aese	v0.16b, v19.16b
4172	aesmc	v0.16b, v0.16b          //AES block 0 - round 1
4173	orr	x9, x11, x9, lsl #32            //CTR block 3
4174
4175	fmov	v3.d[1], x9                               //CTR block 3
4176
4177	aese	v1.16b, v18.16b
4178	aesmc	v1.16b, v1.16b          //AES block 1 - round 0
4179	ld1	{v22.4s}, [x8], #16                               //load rk4
4180
4181	aese	v0.16b, v20.16b
4182	aesmc	v0.16b, v0.16b          //AES block 0 - round 2
4183	ld1	{v23.4s}, [x8], #16                               //load rk5
4184
4185	aese	v2.16b, v18.16b
4186	aesmc	v2.16b, v2.16b          //AES block 2 - round 0
4187	ld1	{v24.4s}, [x8], #16                               //load rk6
4188
4189	aese	v1.16b, v19.16b
4190	aesmc	v1.16b, v1.16b          //AES block 1 - round 1
4191	ldr	q14, [x3, #80]                         //load h3l | h3h
4192#ifndef __AARCH64EB__
4193	ext	v14.16b, v14.16b, v14.16b, #8
4194#endif
4195	aese	v3.16b, v18.16b
4196	aesmc	v3.16b, v3.16b          //AES block 3 - round 0
4197	ld1	{v25.4s}, [x8], #16                               //load rk7
4198
4199	aese	v2.16b, v19.16b
4200	aesmc	v2.16b, v2.16b          //AES block 2 - round 1
4201	ld1	{v26.4s}, [x8], #16                               //load rk8
4202
4203	aese	v1.16b, v20.16b
4204	aesmc	v1.16b, v1.16b          //AES block 1 - round 2
4205	ldr	q13, [x3, #64]                         //load h2l | h2h
4206#ifndef __AARCH64EB__
4207	ext	v13.16b, v13.16b, v13.16b, #8
4208#endif
4209	aese	v3.16b, v19.16b
4210	aesmc	v3.16b, v3.16b          //AES block 3 - round 1
4211	ld1	{v27.4s}, [x8], #16                               //load rk9
4212
4213	aese	v2.16b, v20.16b
4214	aesmc	v2.16b, v2.16b          //AES block 2 - round 2
4215	ldr	q15, [x3, #112]                        //load h4l | h4h
4216#ifndef __AARCH64EB__
4217	ext	v15.16b, v15.16b, v15.16b, #8
4218#endif
4219	aese	v1.16b, v21.16b
4220	aesmc	v1.16b, v1.16b          //AES block 1 - round 3
4221	ld1	{v28.4s}, [x8], #16                              //load rk10
4222
4223	aese	v3.16b, v20.16b
4224	aesmc	v3.16b, v3.16b          //AES block 3 - round 2
4225	ld1	{v29.4s}, [x8], #16                              //load rk11
4226
4227	aese	v2.16b, v21.16b
4228	aesmc	v2.16b, v2.16b          //AES block 2 - round 3
4229	add	w12, w12, #1                            //CTR block 3
4230
4231	aese	v0.16b, v21.16b
4232	aesmc	v0.16b, v0.16b          //AES block 0 - round 3
4233
4234	aese	v3.16b, v21.16b
4235	aesmc	v3.16b, v3.16b          //AES block 3 - round 3
4236	ld1	{ v11.16b}, [x3]
4237	ext	v11.16b, v11.16b, v11.16b, #8
4238	rev64	v11.16b, v11.16b
4239
4240	aese	v2.16b, v22.16b
4241	aesmc	v2.16b, v2.16b          //AES block 2 - round 4
4242
4243	aese	v0.16b, v22.16b
4244	aesmc	v0.16b, v0.16b          //AES block 0 - round 4
4245
4246	aese	v1.16b, v22.16b
4247	aesmc	v1.16b, v1.16b          //AES block 1 - round 4
4248
4249	aese	v3.16b, v22.16b
4250	aesmc	v3.16b, v3.16b          //AES block 3 - round 4
4251
4252	aese	v0.16b, v23.16b
4253	aesmc	v0.16b, v0.16b          //AES block 0 - round 5
4254
4255	aese	v1.16b, v23.16b
4256	aesmc	v1.16b, v1.16b          //AES block 1 - round 5
4257
4258	aese	v3.16b, v23.16b
4259	aesmc	v3.16b, v3.16b          //AES block 3 - round 5
4260
4261	aese	v2.16b, v23.16b
4262	aesmc	v2.16b, v2.16b          //AES block 2 - round 5
4263
4264	aese	v1.16b, v24.16b
4265	aesmc	v1.16b, v1.16b          //AES block 1 - round 6
4266	trn2	v17.2d,  v14.2d,    v15.2d                      //h4l | h3l
4267
4268	aese	v3.16b, v24.16b
4269	aesmc	v3.16b, v3.16b          //AES block 3 - round 6
4270	ld1	{v30.4s}, [x8], #16                              //load rk12
4271
4272	aese	v0.16b, v24.16b
4273	aesmc	v0.16b, v0.16b          //AES block 0 - round 6
4274	ldr	q12, [x3, #32]                         //load h1l | h1h
4275#ifndef __AARCH64EB__
4276	ext	v12.16b, v12.16b, v12.16b, #8
4277#endif
4278	aese	v2.16b, v24.16b
4279	aesmc	v2.16b, v2.16b          //AES block 2 - round 6
4280	ld1	{v31.4s}, [x8], #16                              //load rk13
4281
4282	aese	v1.16b, v25.16b
4283	aesmc	v1.16b, v1.16b          //AES block 1 - round 7
4284	trn1	v9.2d, v14.2d,    v15.2d                      //h4h | h3h
4285
4286	aese	v0.16b, v25.16b
4287	aesmc	v0.16b, v0.16b          //AES block 0 - round 7
4288
4289	aese	v2.16b, v25.16b
4290	aesmc	v2.16b, v2.16b          //AES block 2 - round 7
4291
4292	aese	v3.16b, v25.16b
4293	aesmc	v3.16b, v3.16b          //AES block 3 - round 7
4294	trn2	v16.2d,  v12.2d,    v13.2d                      //h2l | h1l
4295
4296	aese	v1.16b, v26.16b
4297	aesmc	v1.16b, v1.16b          //AES block 1 - round 8
4298
4299	aese	v2.16b, v26.16b
4300	aesmc	v2.16b, v2.16b          //AES block 2 - round 8
4301
4302	aese	v3.16b, v26.16b
4303	aesmc	v3.16b, v3.16b          //AES block 3 - round 8
4304
4305	aese	v1.16b, v27.16b
4306	aesmc	v1.16b, v1.16b          //AES block 1 - round 9
4307
4308	aese	v2.16b, v27.16b
4309	aesmc	v2.16b, v2.16b          //AES block 2 - round 9
4310
4311	aese	v0.16b, v26.16b
4312	aesmc	v0.16b, v0.16b          //AES block 0 - round 8
4313
4314	aese	v1.16b, v28.16b
4315	aesmc	v1.16b, v1.16b          //AES block 1 - round 10
4316
4317	aese	v3.16b, v27.16b
4318	aesmc	v3.16b, v3.16b          //AES block 3 - round 9
4319
4320	aese	v0.16b, v27.16b
4321	aesmc	v0.16b, v0.16b          //AES block 0 - round 9
4322
4323	aese	v2.16b, v28.16b
4324	aesmc	v2.16b, v2.16b          //AES block 2 - round 10
4325
4326	aese	v3.16b, v28.16b
4327	aesmc	v3.16b, v3.16b          //AES block 3 - round 10
4328
4329	aese	v1.16b, v29.16b
4330	aesmc	v1.16b, v1.16b          //AES block 1 - round 11
4331
4332	aese	v2.16b, v29.16b
4333	aesmc	v2.16b, v2.16b          //AES block 2 - round 11
4334
4335	aese	v0.16b, v28.16b
4336	aesmc	v0.16b, v0.16b          //AES block 0 - round 10
4337
4338	aese	v1.16b, v30.16b
4339	aesmc	v1.16b, v1.16b          //AES block 1 - round 12
4340
4341	aese	v2.16b, v30.16b
4342	aesmc	v2.16b, v2.16b          //AES block 2 - round 12
4343
4344	aese	v0.16b, v29.16b
4345	aesmc	v0.16b, v0.16b          //AES block 0 - round 11
4346	eor	v17.16b, v17.16b, v9.16b                  //h4k | h3k
4347
4348	aese	v3.16b, v29.16b
4349	aesmc	v3.16b, v3.16b          //AES block 3 - round 11
4350
4351	aese	v2.16b, v31.16b                                     //AES block 2 - round 13
4352	trn1	v8.2d,    v12.2d,    v13.2d                      //h2h | h1h
4353
4354	aese	v0.16b, v30.16b
4355	aesmc	v0.16b, v0.16b          //AES block 0 - round 12
4356
4357	aese	v3.16b, v30.16b
4358	aesmc	v3.16b, v3.16b          //AES block 3 - round 12
4359
4360	aese	v1.16b, v31.16b                                     //AES block 1 - round 13
4361
4362	aese	v0.16b, v31.16b                                     //AES block 0 - round 13
4363
4364	aese	v3.16b, v31.16b                                     //AES block 3 - round 13
4365	eor	v16.16b, v16.16b, v8.16b                     //h2k | h1k
4366	b.ge	.L256_enc_tail                                    //handle tail
4367
4368	ldp	x19, x20, [x0, #16]           //AES block 1 - load plaintext
4369#ifdef __AARCH64EB__
4370	rev	x19, x19
4371	rev	x20, x20
4372#endif
4373	rev	w9, w12                                 //CTR block 4
4374	ldp	x6, x7, [x0, #0]            //AES block 0 - load plaintext
4375#ifdef __AARCH64EB__
4376	rev	x6, x6
4377	rev	x7, x7
4378#endif
4379	ldp	x23, x24, [x0, #48]           //AES block 3 - load plaintext
4380#ifdef __AARCH64EB__
4381	rev	x23, x23
4382	rev	x24, x24
4383#endif
4384	ldp	x21, x22, [x0, #32]           //AES block 2 - load plaintext
4385#ifdef __AARCH64EB__
4386	rev	x21, x21
4387	rev	x22, x22
4388#endif
4389	add	x0, x0, #64                       //AES input_ptr update
4390
4391	eor	x19, x19, x13                     //AES block 1 - round 14 low
4392	eor	x20, x20, x14                     //AES block 1 - round 14 high
4393
4394	fmov	d5, x19                               //AES block 1 - mov low
4395	eor	x6, x6, x13                     //AES block 0 - round 14 low
4396
4397	eor	x7, x7, x14                     //AES block 0 - round 14 high
4398	eor	x24, x24, x14                     //AES block 3 - round 14 high
4399	fmov	d4, x6                               //AES block 0 - mov low
4400
4401	cmp	x0, x5                   //check if we have <= 8 blocks
4402	fmov	v4.d[1], x7                           //AES block 0 - mov high
4403	eor	x23, x23, x13                     //AES block 3 - round 14 low
4404
4405	eor	x21, x21, x13                     //AES block 2 - round 14 low
4406	fmov	v5.d[1], x20                           //AES block 1 - mov high
4407
4408	fmov	d6, x21                               //AES block 2 - mov low
4409	add	w12, w12, #1                            //CTR block 4
4410
4411	orr	x9, x11, x9, lsl #32            //CTR block 4
4412	fmov	d7, x23                               //AES block 3 - mov low
4413	eor	x22, x22, x14                     //AES block 2 - round 14 high
4414
4415	fmov	v6.d[1], x22                           //AES block 2 - mov high
4416
4417	eor	v4.16b, v4.16b, v0.16b                          //AES block 0 - result
4418	fmov	d0, x10                               //CTR block 4
4419
4420	fmov	v0.d[1], x9                               //CTR block 4
4421	rev	w9, w12                                 //CTR block 5
4422	add	w12, w12, #1                            //CTR block 5
4423
4424	eor	v5.16b, v5.16b, v1.16b                          //AES block 1 - result
4425	fmov	d1, x10                               //CTR block 5
4426	orr	x9, x11, x9, lsl #32            //CTR block 5
4427
4428	fmov	v1.d[1], x9                               //CTR block 5
4429	rev	w9, w12                                 //CTR block 6
4430	st1	{ v4.16b}, [x2], #16                     //AES block 0 - store result
4431
4432	fmov	v7.d[1], x24                           //AES block 3 - mov high
4433	orr	x9, x11, x9, lsl #32            //CTR block 6
4434	eor	v6.16b, v6.16b, v2.16b                          //AES block 2 - result
4435
4436	st1	{ v5.16b}, [x2], #16                     //AES block 1 - store result
4437
4438	add	w12, w12, #1                            //CTR block 6
4439	fmov	d2, x10                               //CTR block 6
4440
4441	fmov	v2.d[1], x9                               //CTR block 6
4442	st1	{ v6.16b}, [x2], #16                     //AES block 2 - store result
4443	rev	w9, w12                                 //CTR block 7
4444
4445	orr	x9, x11, x9, lsl #32            //CTR block 7
4446
4447	eor	v7.16b, v7.16b, v3.16b                          //AES block 3 - result
4448	st1	{ v7.16b}, [x2], #16                     //AES block 3 - store result
4449	b.ge	.L256_enc_prepretail                               //do prepretail
4450
4451.L256_enc_main_loop:	//main	loop start
4452	aese	v0.16b, v18.16b
4453	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 0
4454	rev64	v4.16b, v4.16b                                    //GHASH block 4k (only t0 is free)
4455
4456	aese	v1.16b, v18.16b
4457	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 0
4458	fmov	d3, x10                               //CTR block 4k+3
4459
4460	aese	v2.16b, v18.16b
4461	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 0
4462	ext	v11.16b, v11.16b, v11.16b, #8                     //PRE 0
4463
4464	aese	v0.16b, v19.16b
4465	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 1
4466	fmov	v3.d[1], x9                               //CTR block 4k+3
4467
4468	aese	v1.16b, v19.16b
4469	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 1
4470	ldp	x23, x24, [x0, #48]           //AES block 4k+7 - load plaintext
4471#ifdef __AARCH64EB__
4472	rev	x23, x23
4473	rev	x24, x24
4474#endif
4475	aese	v2.16b, v19.16b
4476	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 1
4477	ldp	x21, x22, [x0, #32]           //AES block 4k+6 - load plaintext
4478#ifdef __AARCH64EB__
4479	rev	x21, x21
4480	rev	x22, x22
4481#endif
4482	aese	v0.16b, v20.16b
4483	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 2
4484	eor	v4.16b, v4.16b, v11.16b                           //PRE 1
4485
4486	aese	v1.16b, v20.16b
4487	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 2
4488
4489	aese	v3.16b, v18.16b
4490	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 0
4491	eor	x23, x23, x13                     //AES block 4k+7 - round 14 low
4492
4493	aese	v0.16b, v21.16b
4494	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 3
4495	mov	d10, v17.d[1]                               //GHASH block 4k - mid
4496
4497	pmull2	v9.1q, v4.2d, v15.2d                       //GHASH block 4k - high
4498	eor	x22, x22, x14                     //AES block 4k+6 - round 14 high
4499	mov	d8, v4.d[1]                                  //GHASH block 4k - mid
4500
4501	aese	v3.16b, v19.16b
4502	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 1
4503	rev64	v5.16b, v5.16b                                    //GHASH block 4k+1 (t0 and t1 free)
4504
4505	aese	v0.16b, v22.16b
4506	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 4
4507
4508	pmull	v11.1q, v4.1d, v15.1d                       //GHASH block 4k - low
4509	eor	v8.8b, v8.8b, v4.8b                          //GHASH block 4k - mid
4510
4511	aese	v2.16b, v20.16b
4512	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 2
4513
4514	aese	v0.16b, v23.16b
4515	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 5
4516	rev64	v7.16b, v7.16b                                    //GHASH block 4k+3 (t0, t1, t2 and t3 free)
4517
4518	pmull2	v4.1q, v5.2d, v14.2d                          //GHASH block 4k+1 - high
4519
4520	pmull	v10.1q, v8.1d, v10.1d                      //GHASH block 4k - mid
4521	rev64	v6.16b, v6.16b                                    //GHASH block 4k+2 (t0, t1, and t2 free)
4522
4523	pmull	v8.1q, v5.1d, v14.1d                          //GHASH block 4k+1 - low
4524
4525	eor	v9.16b, v9.16b, v4.16b                         //GHASH block 4k+1 - high
4526	mov	d4, v5.d[1]                                  //GHASH block 4k+1 - mid
4527
4528	aese	v1.16b, v21.16b
4529	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 3
4530
4531	aese	v3.16b, v20.16b
4532	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 2
4533	eor	v11.16b, v11.16b, v8.16b                         //GHASH block 4k+1 - low
4534
4535	aese	v2.16b, v21.16b
4536	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 3
4537
4538	aese	v1.16b, v22.16b
4539	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 4
4540	mov	d8, v6.d[1]                                  //GHASH block 4k+2 - mid
4541
4542	aese	v3.16b, v21.16b
4543	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 3
4544	eor	v4.8b, v4.8b, v5.8b                          //GHASH block 4k+1 - mid
4545
4546	aese	v2.16b, v22.16b
4547	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 4
4548
4549	aese	v0.16b, v24.16b
4550	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 6
4551	eor	v8.8b, v8.8b, v6.8b                          //GHASH block 4k+2 - mid
4552
4553	aese	v3.16b, v22.16b
4554	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 4
4555
4556	pmull	v4.1q, v4.1d, v17.1d                          //GHASH block 4k+1 - mid
4557
4558	aese	v0.16b, v25.16b
4559	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 7
4560
4561	aese	v3.16b, v23.16b
4562	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 5
4563	ins	v8.d[1], v8.d[0]                                //GHASH block 4k+2 - mid
4564
4565	aese	v1.16b, v23.16b
4566	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 5
4567
4568	aese	v0.16b, v26.16b
4569	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 8
4570
4571	aese	v2.16b, v23.16b
4572	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 5
4573
4574	aese	v1.16b, v24.16b
4575	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 6
4576	eor	v10.16b, v10.16b, v4.16b                         //GHASH block 4k+1 - mid
4577
4578	pmull2	v4.1q, v6.2d, v13.2d                          //GHASH block 4k+2 - high
4579
4580	pmull	v5.1q, v6.1d, v13.1d                          //GHASH block 4k+2 - low
4581
4582	aese	v1.16b, v25.16b
4583	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 7
4584
4585	pmull	v6.1q, v7.1d, v12.1d                          //GHASH block 4k+3 - low
4586	eor	v9.16b, v9.16b, v4.16b                         //GHASH block 4k+2 - high
4587
4588	aese	v3.16b, v24.16b
4589	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 6
4590	ldp	x19, x20, [x0, #16]           //AES block 4k+5 - load plaintext
4591#ifdef __AARCH64EB__
4592	rev	x19, x19
4593	rev	x20, x20
4594#endif
4595	aese	v1.16b, v26.16b
4596	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 8
4597	mov	d4, v7.d[1]                                  //GHASH block 4k+3 - mid
4598
4599	aese	v2.16b, v24.16b
4600	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 6
4601	eor	v11.16b, v11.16b, v5.16b                         //GHASH block 4k+2 - low
4602
4603	pmull2	v8.1q, v8.2d, v16.2d                          //GHASH block 4k+2 - mid
4604
4605	pmull2	v5.1q, v7.2d, v12.2d                          //GHASH block 4k+3 - high
4606	eor	v4.8b, v4.8b, v7.8b                          //GHASH block 4k+3 - mid
4607
4608	aese	v2.16b, v25.16b
4609	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 7
4610	eor	x19, x19, x13                     //AES block 4k+5 - round 14 low
4611
4612	aese	v1.16b, v27.16b
4613	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 9
4614	eor	v10.16b, v10.16b, v8.16b                         //GHASH block 4k+2 - mid
4615
4616	aese	v3.16b, v25.16b
4617	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 7
4618	eor	x21, x21, x13                     //AES block 4k+6 - round 14 low
4619
4620	aese	v0.16b, v27.16b
4621	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 9
4622	movi	v8.8b, #0xc2
4623
4624	pmull	v4.1q, v4.1d, v16.1d                          //GHASH block 4k+3 - mid
4625	eor	v9.16b, v9.16b, v5.16b                         //GHASH block 4k+3 - high
4626	fmov	d5, x19                               //AES block 4k+5 - mov low
4627
4628	aese	v2.16b, v26.16b
4629	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 8
4630	ldp	x6, x7, [x0, #0]            //AES block 4k+4 - load plaintext
4631#ifdef __AARCH64EB__
4632	rev	x6, x6
4633	rev	x7, x7
4634#endif
4635	aese	v0.16b, v28.16b
4636	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 10
4637	shl	d8, d8, #56               //mod_constant
4638
4639	aese	v3.16b, v26.16b
4640	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 8
4641	eor	v11.16b, v11.16b, v6.16b                         //GHASH block 4k+3 - low
4642
4643	aese	v2.16b, v27.16b
4644	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 9
4645
4646	aese	v1.16b, v28.16b
4647	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 10
4648	eor	v10.16b, v10.16b, v4.16b                         //GHASH block 4k+3 - mid
4649
4650	aese	v3.16b, v27.16b
4651	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 9
4652	add	w12, w12, #1                            //CTR block 4k+3
4653
4654	aese	v0.16b, v29.16b
4655	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 11
4656	eor	v4.16b, v11.16b, v9.16b                         //MODULO - karatsuba tidy up
4657
4658	aese	v1.16b, v29.16b
4659	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 11
4660	add	x0, x0, #64                       //AES input_ptr update
4661
4662	pmull	v7.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid
4663	rev	w9, w12                                 //CTR block 4k+8
4664	ext	v9.16b, v9.16b, v9.16b, #8                     //MODULO - other top alignment
4665
4666	aese	v2.16b, v28.16b
4667	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 10
4668	eor	x6, x6, x13                     //AES block 4k+4 - round 14 low
4669
4670	aese	v1.16b, v30.16b
4671	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 12
4672	eor	v10.16b, v10.16b, v4.16b                         //MODULO - karatsuba tidy up
4673
4674	aese	v3.16b, v28.16b
4675	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 10
4676	eor	x7, x7, x14                     //AES block 4k+4 - round 14 high
4677
4678	fmov	d4, x6                               //AES block 4k+4 - mov low
4679	orr	x9, x11, x9, lsl #32            //CTR block 4k+8
4680	eor	v7.16b, v9.16b, v7.16b                   //MODULO - fold into mid
4681
4682	aese	v0.16b, v30.16b
4683	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 12
4684	eor	x20, x20, x14                     //AES block 4k+5 - round 14 high
4685
4686	aese	v2.16b, v29.16b
4687	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 11
4688	eor	x24, x24, x14                     //AES block 4k+7 - round 14 high
4689
4690	aese	v3.16b, v29.16b
4691	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 11
4692	add	w12, w12, #1                            //CTR block 4k+8
4693
4694	aese	v0.16b, v31.16b                                     //AES block 4k+4 - round 13
4695	fmov	v4.d[1], x7                           //AES block 4k+4 - mov high
4696	eor	v10.16b, v10.16b, v7.16b                      //MODULO - fold into mid
4697
4698	aese	v2.16b, v30.16b
4699	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 12
4700	fmov	d7, x23                               //AES block 4k+7 - mov low
4701
4702	aese	v1.16b, v31.16b                                     //AES block 4k+5 - round 13
4703	fmov	v5.d[1], x20                           //AES block 4k+5 - mov high
4704
4705	fmov	d6, x21                               //AES block 4k+6 - mov low
4706	cmp	x0, x5                   //.LOOP CONTROL
4707
4708	fmov	v6.d[1], x22                           //AES block 4k+6 - mov high
4709
4710	pmull	v9.1q, v10.1d, v8.1d            //MODULO - mid 64b align with low
4711	eor	v4.16b, v4.16b, v0.16b                          //AES block 4k+4 - result
4712	fmov	d0, x10                               //CTR block 4k+8
4713
4714	fmov	v0.d[1], x9                               //CTR block 4k+8
4715	rev	w9, w12                                 //CTR block 4k+9
4716	add	w12, w12, #1                            //CTR block 4k+9
4717
4718	eor	v5.16b, v5.16b, v1.16b                          //AES block 4k+5 - result
4719	fmov	d1, x10                               //CTR block 4k+9
4720	orr	x9, x11, x9, lsl #32            //CTR block 4k+9
4721
4722	aese	v3.16b, v30.16b
4723	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 12
4724	fmov	v1.d[1], x9                               //CTR block 4k+9
4725
4726	aese	v2.16b, v31.16b                                     //AES block 4k+6 - round 13
4727	rev	w9, w12                                 //CTR block 4k+10
4728	st1	{ v4.16b}, [x2], #16                     //AES block 4k+4 - store result
4729
4730	orr	x9, x11, x9, lsl #32            //CTR block 4k+10
4731	eor	v11.16b, v11.16b, v9.16b                         //MODULO - fold into low
4732	fmov	v7.d[1], x24                           //AES block 4k+7 - mov high
4733
4734	ext	v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment
4735	st1	{ v5.16b}, [x2], #16                     //AES block 4k+5 - store result
4736	add	w12, w12, #1                            //CTR block 4k+10
4737
4738	aese	v3.16b, v31.16b                                     //AES block 4k+7 - round 13
4739	eor	v6.16b, v6.16b, v2.16b                          //AES block 4k+6 - result
4740	fmov	d2, x10                               //CTR block 4k+10
4741
4742	st1	{ v6.16b}, [x2], #16                     //AES block 4k+6 - store result
4743	fmov	v2.d[1], x9                               //CTR block 4k+10
4744	rev	w9, w12                                 //CTR block 4k+11
4745
4746	eor	v11.16b, v11.16b, v10.16b                         //MODULO - fold into low
4747	orr	x9, x11, x9, lsl #32            //CTR block 4k+11
4748
4749	eor	v7.16b, v7.16b, v3.16b                          //AES block 4k+7 - result
4750	st1	{ v7.16b}, [x2], #16                     //AES block 4k+7 - store result
4751	b.lt	.L256_enc_main_loop
4752
4753.L256_enc_prepretail:	//PREPRETAIL - AES rounds 0-13 for CTR blocks 4k+4..4k+7 interleaved with GHASH of v4-v7 and the modular reduction; no memory access here, falls through into the tail
4754	aese	v1.16b, v18.16b
4755	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 0
4756	rev64	v6.16b, v6.16b                                    //GHASH block 4k+2 (t0, t1, and t2 free)
4757
4758	aese	v2.16b, v18.16b
4759	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 0
4760	fmov	d3, x10                               //CTR block 4k+3
4761
4762	aese	v0.16b, v18.16b
4763	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 0
4764	rev64	v4.16b, v4.16b                                    //GHASH block 4k (only t0 is free)
4765
4766	fmov	v3.d[1], x9                               //CTR block 4k+3
4767	ext	v11.16b, v11.16b, v11.16b, #8                     //PRE 0
4768
4769	aese	v2.16b, v19.16b
4770	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 1
4771
4772	aese	v0.16b, v19.16b
4773	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 1
4774
4775	eor	v4.16b, v4.16b, v11.16b                           //PRE 1
4776	rev64	v5.16b, v5.16b                                    //GHASH block 4k+1 (t0 and t1 free)
4777
4778	aese	v2.16b, v20.16b
4779	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 2
4780
4781	aese	v3.16b, v18.16b
4782	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 0
4783	mov	d10, v17.d[1]                               //GHASH block 4k - mid
4784
4785	aese	v1.16b, v19.16b
4786	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 1
4787
4788	pmull	v11.1q, v4.1d, v15.1d                       //GHASH block 4k - low
4789	mov	d8, v4.d[1]                                  //GHASH block 4k - mid
4790
4791	pmull2	v9.1q, v4.2d, v15.2d                       //GHASH block 4k - high
4792
4793	aese	v2.16b, v21.16b
4794	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 3
4795
4796	aese	v1.16b, v20.16b
4797	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 2
4798	eor	v8.8b, v8.8b, v4.8b                          //GHASH block 4k - mid
4799
4800	aese	v0.16b, v20.16b
4801	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 2
4802
4803	aese	v3.16b, v19.16b
4804	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 1
4805
4806	aese	v1.16b, v21.16b
4807	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 3
4808
4809	pmull	v10.1q, v8.1d, v10.1d                      //GHASH block 4k - mid
4810
4811	pmull2	v4.1q, v5.2d, v14.2d                          //GHASH block 4k+1 - high
4812
4813	pmull	v8.1q, v5.1d, v14.1d                          //GHASH block 4k+1 - low
4814
4815	aese	v3.16b, v20.16b
4816	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 2
4817
4818	eor	v9.16b, v9.16b, v4.16b                         //GHASH block 4k+1 - high
4819	mov	d4, v5.d[1]                                  //GHASH block 4k+1 - mid
4820
4821	aese	v0.16b, v21.16b
4822	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 3
4823	eor	v11.16b, v11.16b, v8.16b                         //GHASH block 4k+1 - low
4824
4825	aese	v3.16b, v21.16b
4826	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 3
4827
4828	eor	v4.8b, v4.8b, v5.8b                          //GHASH block 4k+1 - mid
4829	mov	d8, v6.d[1]                                  //GHASH block 4k+2 - mid
4830
4831	aese	v0.16b, v22.16b
4832	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 4
4833	rev64	v7.16b, v7.16b                                    //GHASH block 4k+3 (t0, t1, t2 and t3 free)
4834
4835	aese	v3.16b, v22.16b
4836	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 4
4837
4838	pmull	v4.1q, v4.1d, v17.1d                          //GHASH block 4k+1 - mid
4839	eor	v8.8b, v8.8b, v6.8b                          //GHASH block 4k+2 - mid
4840	add	w12, w12, #1                            //CTR block 4k+3
4841
4842	pmull	v5.1q, v6.1d, v13.1d                          //GHASH block 4k+2 - low
4843
4844	aese	v3.16b, v23.16b
4845	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 5
4846
4847	aese	v2.16b, v22.16b
4848	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 4
4849	eor	v10.16b, v10.16b, v4.16b                         //GHASH block 4k+1 - mid
4850
4851	pmull2	v4.1q, v6.2d, v13.2d                          //GHASH block 4k+2 - high
4852
4853	eor	v11.16b, v11.16b, v5.16b                         //GHASH block 4k+2 - low
4854	ins	v8.d[1], v8.d[0]                                //GHASH block 4k+2 - mid
4855
4856	aese	v2.16b, v23.16b
4857	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 5
4858
4859	eor	v9.16b, v9.16b, v4.16b                         //GHASH block 4k+2 - high
4860	mov	d4, v7.d[1]                                  //GHASH block 4k+3 - mid
4861
4862	aese	v1.16b, v22.16b
4863	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 4
4864
4865	pmull2	v8.1q, v8.2d, v16.2d                          //GHASH block 4k+2 - mid
4866
4867	eor	v4.8b, v4.8b, v7.8b                          //GHASH block 4k+3 - mid
4868
4869	pmull2	v5.1q, v7.2d, v12.2d                          //GHASH block 4k+3 - high
4870
4871	aese	v1.16b, v23.16b
4872	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 5
4873
4874	pmull	v4.1q, v4.1d, v16.1d                          //GHASH block 4k+3 - mid
4875	eor	v10.16b, v10.16b, v8.16b                         //GHASH block 4k+2 - mid
4876
4877	aese	v0.16b, v23.16b
4878	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 5
4879
4880	aese	v1.16b, v24.16b
4881	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 6
4882
4883	aese	v2.16b, v24.16b
4884	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 6
4885
4886	aese	v0.16b, v24.16b
4887	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 6
4888	movi	v8.8b, #0xc2                                      //mod_constant - 0xc2 in each of the low 8 bytes (shifted into place by the shl below)
4889
4890	aese	v3.16b, v24.16b
4891	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 6
4892
4893	aese	v1.16b, v25.16b
4894	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 7
4895	eor	v9.16b, v9.16b, v5.16b                         //GHASH block 4k+3 - high
4896
4897	aese	v0.16b, v25.16b
4898	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 7
4899
4900	aese	v3.16b, v25.16b
4901	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 7
4902	shl	d8, d8, #56               //mod_constant
4903
4904	aese	v1.16b, v26.16b
4905	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 8
4906	eor	v10.16b, v10.16b, v4.16b                         //GHASH block 4k+3 - mid
4907
4908	pmull	v6.1q, v7.1d, v12.1d                          //GHASH block 4k+3 - low
4909
4910	aese	v3.16b, v26.16b
4911	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 8
4912
4913	aese	v1.16b, v27.16b
4914	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 9
4915
4916	aese	v0.16b, v26.16b
4917	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 8
4918	eor	v11.16b, v11.16b, v6.16b                         //GHASH block 4k+3 - low
4919
4920	aese	v3.16b, v27.16b
4921	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 9
4922
4923	eor	v10.16b, v10.16b, v9.16b                         //karatsuba tidy up
4924
4925	pmull	v4.1q, v9.1d, v8.1d                              //MODULO - top 64b align with mid
4926	ext	v9.16b, v9.16b, v9.16b, #8                       //MODULO - other top alignment
4927
4928	aese	v3.16b, v28.16b
4929	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 10
4930
4931	aese	v2.16b, v25.16b
4932	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 7
4933	eor	v10.16b, v10.16b, v11.16b                        //MODULO - karatsuba tidy up
4934
4935	aese	v1.16b, v28.16b
4936	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 10
4937
4938	aese	v0.16b, v27.16b
4939	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 9
4940
4941	aese	v2.16b, v26.16b
4942	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 8
4943
4944	aese	v1.16b, v29.16b
4945	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 11
4946	eor	v10.16b, v10.16b, v4.16b                         //MODULO - fold into mid
4947
4948	aese	v0.16b, v28.16b
4949	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 10
4950
4951	aese	v2.16b, v27.16b
4952	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 9
4953
4954	aese	v1.16b, v30.16b
4955	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 12
4956
4957	aese	v0.16b, v29.16b
4958	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 11
4959	eor	v10.16b, v10.16b, v9.16b                         //MODULO - fold into mid
4960
4961	aese	v3.16b, v29.16b
4962	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 11
4963
4964	aese	v2.16b, v28.16b
4965	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 10
4966
4967	aese	v0.16b, v30.16b
4968	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 12
4969
4970	pmull	v4.1q, v10.1d, v8.1d                             //MODULO - mid 64b align with low
4971
4972	aese	v2.16b, v29.16b
4973	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 11
4974	ext	v10.16b, v10.16b, v10.16b, #8                    //MODULO - other mid alignment
4975
4976	aese	v3.16b, v30.16b
4977	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 12
4978
4979	aese	v1.16b, v31.16b                                     //AES block 4k+5 - round 13
4980	eor	v11.16b, v11.16b, v4.16b                         //MODULO - fold into low
4981
4982	aese	v2.16b, v30.16b
4983	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 12
4984
4985	aese	v3.16b, v31.16b                                     //AES block 4k+7 - round 13
4986
4987	aese	v0.16b, v31.16b                                     //AES block 4k+4 - round 13
4988
4989	aese	v2.16b, v31.16b                                     //AES block 4k+6 - round 13
4990	eor	v11.16b, v11.16b, v10.16b                        //MODULO - fold into low
4991.L256_enc_tail:	//TAIL - encrypt the first remaining block into v5, then dispatch on the number of bytes left (x5)
4992
4993	ext	v8.16b, v11.16b, v11.16b, #8                     //prepare final partial tag
4994	sub	x5, x4, x0   //main_end_input_ptr is number of bytes left to process
4995	ldp	x6, x7, [x0], #16           //AES block 4k+4 - load plaintext
4996#ifdef __AARCH64EB__
4997	rev	x6, x6
4998	rev	x7, x7
4999#endif
5000	eor	x6, x6, x13                     //AES block 4k+4 - round 14 low
5001	eor	x7, x7, x14                     //AES block 4k+4 - round 14 high
5002
5003	cmp	x5, #48                                //more than 48 bytes (3 blocks) left? - tested by the b.gt below
5004	fmov	d4, x6                               //AES block 4k+4 - mov low
5005
5006	fmov	v4.d[1], x7                           //AES block 4k+4 - mov high
5007
5008	eor	v5.16b, v4.16b, v0.16b                          //AES block 4k+4 - result
5009	b.gt	.L256_enc_blocks_more_than_3
5010
5011	cmp	x5, #32                                //more than 32 bytes (2 blocks) left? - tested by the b.gt below
5012	mov	v3.16b, v2.16b                         //at most 3 blocks left: v3 (used for the final block) takes the keystream from v2
5013	movi	v11.8b, #0                             //GHASH low accumulator starts at zero (the final-3 code that initialises it is skipped)
5014
5015	movi	v9.8b, #0                              //GHASH high accumulator starts at zero
5016	sub	w12, w12, #1                           //rev_ctr32 - 1: one prepared CTR block is not used
5017
5018	mov	v2.16b, v1.16b                         //v2 (used for the final-1 block) takes the keystream from v1
5019	movi	v10.8b, #0                             //GHASH mid accumulator starts at zero
5020	b.gt	.L256_enc_blocks_more_than_2
5021
5022	mov	v3.16b, v1.16b                         //at most 2 blocks left: v3 (used for the final block) takes the keystream from v1
5023	sub	w12, w12, #1                           //rev_ctr32 - 1: a second prepared CTR block is not used
5024	cmp	x5, #16                                //more than 16 bytes (1 block) left?
5025
5026	b.gt	.L256_enc_blocks_more_than_1
5027
5028	sub	w12, w12, #1                           //rev_ctr32 - 1: a third prepared CTR block is not used
5029	b	.L256_enc_blocks_less_than_1
5030.L256_enc_blocks_more_than_3:	//blocks	left >  3 - store the final-3 result, start GHASH with it (multiplied by v15) and encrypt the final-2 block with keystream v1
5031	st1	{ v5.16b}, [x2], #16                    //AES final-3 block  - store result
5032
5033	ldp	x6, x7, [x0], #16          //AES final-2 block - load input low & high
5034#ifdef __AARCH64EB__
5035	rev	x6, x6
5036	rev	x7, x7
5037#endif
5038	rev64	v4.16b, v5.16b                                   //GHASH final-3 block
5039
5040	eor	x6, x6, x13                    //AES final-2 block - round 14 low
5041	eor	v4.16b, v4.16b, v8.16b                          //feed in partial tag
5042
5043	eor	x7, x7, x14                    //AES final-2 block - round 14 high
5044
5045	mov	d22, v4.d[1]                                //GHASH final-3 block - mid
5046	fmov	d5, x6                                //AES final-2 block - mov low
5047
5048	fmov	v5.d[1], x7                            //AES final-2 block - mov high
5049
5050	eor	v22.8b, v22.8b, v4.8b                     //GHASH final-3 block - mid
5051	movi	v8.8b, #0                                       //suppress further partial tag feed in
5052
5053	mov	d10, v17.d[1]                              //GHASH final-3 block - mid
5054
5055	pmull	v11.1q, v4.1d, v15.1d                      //GHASH final-3 block - low (overwrites v11, so no zeroing is needed on this path)
5056
5057	pmull2	v9.1q, v4.2d, v15.2d                      //GHASH final-3 block - high (overwrites v9)
5058
5059	pmull	v10.1q, v22.1d, v10.1d                   //GHASH final-3 block - mid (overwrites v10)
5060	eor	v5.16b, v5.16b, v1.16b                           //AES final-2 block - result
5061.L256_enc_blocks_more_than_2:	//blocks	left >  2 - store the final-2 result, accumulate it into GHASH (multiplied by v14) and encrypt the final-1 block with keystream v2
5062
5063	st1	{ v5.16b}, [x2], #16                    //AES final-2 block - store result
5064
5065	ldp	x6, x7, [x0], #16          //AES final-1 block - load input low & high
5066#ifdef __AARCH64EB__
5067	rev	x6, x6
5068	rev	x7, x7
5069#endif
5070	rev64	v4.16b, v5.16b                                   //GHASH final-2 block
5071
5072	eor	x6, x6, x13                    //AES final-1 block - round 14 low
5073	eor	v4.16b, v4.16b, v8.16b                          //feed in partial tag (v8 is already zero when arriving from the final-3 code)
5074
5075	fmov	d5, x6                                //AES final-1 block - mov low
5076	eor	x7, x7, x14                    //AES final-1 block - round 14 high
5077
5078	fmov	v5.d[1], x7                            //AES final-1 block - mov high
5079
5080	movi	v8.8b, #0                                       //suppress further partial tag feed in
5081
5082	pmull2	v20.1q, v4.2d, v14.2d                         //GHASH final-2 block - high
5083	mov	d22, v4.d[1]                                //GHASH final-2 block - mid
5084
5085	pmull	v21.1q, v4.1d, v14.1d                         //GHASH final-2 block - low
5086
5087	eor	v22.8b, v22.8b, v4.8b                     //GHASH final-2 block - mid
5088
5089	eor	v5.16b, v5.16b, v2.16b                           //AES final-1 block - result
5090
5091	eor	v9.16b, v9.16b, v20.16b                           //GHASH final-2 block - high
5092
5093	pmull	v22.1q, v22.1d, v17.1d                     //GHASH final-2 block - mid
5094
5095	eor	v11.16b, v11.16b, v21.16b                           //GHASH final-2 block - low
5096
5097	eor	v10.16b, v10.16b, v22.16b                      //GHASH final-2 block - mid
5098.L256_enc_blocks_more_than_1:	//blocks	left >  1 - store the final-1 result, accumulate it into GHASH (multiplied by v13) and encrypt the final block with keystream v3
5099
5100	st1	{ v5.16b}, [x2], #16                    //AES final-1 block - store result
5101
5102	rev64	v4.16b, v5.16b                                   //GHASH final-1 block
5103
5104	ldp	x6, x7, [x0], #16          //AES final block - load input low & high
5105#ifdef __AARCH64EB__
5106	rev	x6, x6
5107	rev	x7, x7
5108#endif
5109	eor	v4.16b, v4.16b, v8.16b                          //feed in partial tag (v8 is already zero when an earlier block consumed it)
5110
5111	movi	v8.8b, #0                                       //suppress further partial tag feed in
5112
5113	eor	x6, x6, x13                    //AES final block - round 14 low
5114	mov	d22, v4.d[1]                                //GHASH final-1 block - mid
5115
5116	pmull2	v20.1q, v4.2d, v13.2d                         //GHASH final-1 block - high
5117	eor	x7, x7, x14                    //AES final block - round 14 high
5118
5119	eor	v22.8b, v22.8b, v4.8b                     //GHASH final-1 block - mid
5120
5121	eor	v9.16b, v9.16b, v20.16b                           //GHASH final-1 block - high
5122
5123	ins	v22.d[1], v22.d[0]                           //GHASH final-1 block - mid (copy to the upper lane, which pmull2 reads)
5124	fmov	d5, x6                                //AES final block - mov low
5125
5126	fmov	v5.d[1], x7                            //AES final block - mov high
5127
5128	pmull2	v22.1q, v22.2d, v16.2d                     //GHASH final-1 block - mid
5129
5130	pmull	v21.1q, v4.1d, v13.1d                         //GHASH final-1 block - low
5131
5132	eor	v5.16b, v5.16b, v3.16b                           //AES final block - result
5133	eor	v10.16b, v10.16b, v22.16b                      //GHASH final-1 block - mid
5134
5135	eor	v11.16b, v11.16b, v21.16b                           //GHASH final-1 block - low
5136.L256_enc_blocks_less_than_1:	//blocks	left <= 1 - mask the possibly partial last block, fold it into GHASH, reduce, then write back counter, output and tag and return
5137
5138	and	x1, x1, #127                   //bit_length %= 128
5139
5140	mvn	x13, xzr                                     //rk14_l = 0xffffffffffffffff
5141	sub	x1, x1, #128                   //bit_length -= 128
5142
5143	neg	x1, x1                         //bit_length = 128 - #bits in input (in range [1,128])
5144	ld1	{ v18.16b}, [x2]                           //load existing bytes where the possibly partial last block is to be stored
5145
5146	mvn	x14, xzr                                     //rk14_h = 0xffffffffffffffff
5147	and	x1, x1, #127                   //bit_length %= 128
5148
5149	lsr	x14, x14, x1                    //rk14_h is mask for top 64b of last block
5150	cmp	x1, #64                                //are fewer than 64 bits being masked off?
5151
5152	csel	x6, x13, x14, lt                      //low 64b mask: all ones if x1 < 64, otherwise the shifted mask
5153	csel	x7, x14, xzr, lt                      //high 64b mask: the shifted mask if x1 < 64, otherwise zero
5154
5155	fmov	d0, x6                                //ctr0b is mask for last block
5156
5157	fmov	v0.d[1], x7                           //mask for last block - high 64b
5158
5159	and	v5.16b, v5.16b, v0.16b                           //possibly partial last block has zeroes in highest bits
5160
5161	rev64	v4.16b, v5.16b                                   //GHASH final block
5162
5163	eor	v4.16b, v4.16b, v8.16b                          //feed in partial tag
5164
5165	bif	v5.16b, v18.16b, v0.16b                             //insert existing bytes in top end of result before storing
5166
5167	pmull2	v20.1q, v4.2d, v12.2d                         //GHASH final block - high
5168	mov	d8, v4.d[1]                                 //GHASH final block - mid
5169#ifndef __AARCH64EB__
5170	rev	w9, w12                                //little-endian build: byte-swap rev_ctr32 before it is stored below
5171#else
5172	mov	w9, w12                                //big-endian build: rev_ctr32 is stored as is
5173#endif
5174
5175	pmull	v21.1q, v4.1d, v12.1d                         //GHASH final block - low
5176
5177	eor	v9.16b, v9.16b, v20.16b                           //GHASH final block - high
5178	eor	v8.8b, v8.8b, v4.8b                         //GHASH final block - mid
5179
5180	pmull	v8.1q, v8.1d, v16.1d                         //GHASH final block - mid
5181
5182	eor	v11.16b, v11.16b, v21.16b                           //GHASH final block - low
5183
5184	eor	v10.16b, v10.16b, v8.16b                        //GHASH final block - mid
5185	movi	v8.8b, #0xc2                                    //mod_constant - 0xc2 in each of the low 8 bytes (shifted into place by the shl below)
5186
5187	eor	v4.16b, v11.16b, v9.16b                        //MODULO - karatsuba tidy up
5188
5189	shl	d8, d8, #56              //mod_constant
5190
5191	eor	v10.16b, v10.16b, v4.16b                        //MODULO - karatsuba tidy up
5192
5193	pmull	v7.1q, v9.1d, v8.1d           //MODULO - top 64b align with mid
5194
5195	ext	v9.16b, v9.16b, v9.16b, #8                    //MODULO - other top alignment
5196
5197	eor	v10.16b, v10.16b, v7.16b                     //MODULO - fold into mid
5198
5199	eor	v10.16b, v10.16b, v9.16b                        //MODULO - fold into mid
5200
5201	pmull	v9.1q, v10.1d, v8.1d           //MODULO - mid 64b align with low
5202
5203	ext	v10.16b, v10.16b, v10.16b, #8                    //MODULO - other mid alignment
5204
5205	str	w9, [x16, #12]                         //store the updated counter
5206
5207	st1	{ v5.16b}, [x2]                         //store all 16B
5208	eor	v11.16b, v11.16b, v9.16b                        //MODULO - fold into low
5209
5210	eor	v11.16b, v11.16b, v10.16b                        //MODULO - fold into low
5211	ext	v11.16b, v11.16b, v11.16b, #8                  //swap the 64-bit halves of the tag before storing
5212	rev64	v11.16b, v11.16b                               //byte-reverse each 64-bit half of the tag before storing
5213	mov	x0, x15                                //return value = x15 (presumably the byte length saved at entry, as in the sibling kernels - set outside this view)
5214	st1	{ v11.16b }, [x3]                       //store the updated tag at [x3]
5215
5216	ldp	x21, x22, [sp, #16]                     //reload x21/x22 from the stack frame
5217	ldp	x23, x24, [sp, #32]                     //reload x23/x24
5218	ldp	d8, d9, [sp, #48]                       //reload d8/d9
5219	ldp	d10, d11, [sp, #64]                     //reload d10/d11
5220	ldp	d12, d13, [sp, #80]                     //reload d12/d13
5221	ldp	d14, d15, [sp, #96]                     //reload d14/d15
5222	ldp	x19, x20, [sp], #112                    //reload x19/x20 and release the 112-byte frame
5223	ret
5224
5225.L256_enc_ret:	//early exit - returns 0 without touching the stack (presumably reached from the entry check on a zero bit length, as in the sibling kernels)
5226	mov	w0, #0x0                               //return value 0
5227	ret
5228.size	aes_gcm_enc_256_kernel,.-aes_gcm_enc_256_kernel
5229.globl	aes_gcm_dec_256_kernel
5230.type	aes_gcm_dec_256_kernel,%function
5231.align	4
5232aes_gcm_dec_256_kernel:	//entry: save registers, load round keys rk0-rk14 and the hash key data, encrypt CTR blocks 0-3 (rounds 0-13), then branch to tail / prepretail or fall into the main loop
5233	cbz	x1, .L256_dec_ret                      //x1 = length in bits; nothing to do when it is zero
5234	stp	x19, x20, [sp, #-112]!                 //allocate a 112-byte frame and save x19/x20
5235	mov	x16, x4                                //x16 = pointer to the counter block (x4 is reused below)
5236	mov	x8, x5                                 //x8 = pointer to the round keys (x5 is reused below)
5237	stp	x21, x22, [sp, #16]                    //save x21/x22
5238	stp	x23, x24, [sp, #32]                    //save x23/x24
5239	stp	d8, d9, [sp, #48]                      //save d8/d9
5240	stp	d10, d11, [sp, #64]                    //save d10/d11
5241	stp	d12, d13, [sp, #80]                    //save d12/d13
5242	stp	d14, d15, [sp, #96]                    //save d14/d15
5243
5244	lsr	x5, x1, #3              //byte_len
5245	mov	x15, x5                                //keep byte_len in x15
5246	ldp	x10, x11, [x16]              //ctr96_b64, ctr96_t32
5247#ifdef __AARCH64EB__
5248	rev	x10, x10
5249	rev	x11, x11
5250#endif
5251	ldp	x13, x14, [x8, #224]                     //load rk14
5252#ifdef __AARCH64EB__
5253	ror	x14, x14, #32
5254	ror	x13, x13, #32
5255#endif
5256	ld1	{v18.4s}, [x8], #16                               //load rk0
5257	sub	x5, x5, #1      //byte_len - 1
5258
5259	ld1	{v19.4s}, [x8], #16                               //load rk1
5260	and	x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
5261
5262	add	x4, x0, x1, lsr #3   //end_input_ptr
5263	ld1	{v20.4s}, [x8], #16                               //load rk2
5264
5265	lsr	x12, x11, #32                          //x12 = upper 32 bits of x11 (the 32-bit counter word)
5266	ld1	{v21.4s}, [x8], #16                               //load rk3
5267	orr	w11, w11, w11                          //32-bit write clears the upper half of x11, leaving only ctr96_t32
5268
5269	ld1	{v22.4s}, [x8], #16                               //load rk4
5270	add	x5, x5, x0                             //x5 = input pointer + main loop byte count (main_end_input_ptr)
5271	rev	w12, w12                                //rev_ctr32
5272
5273	add	w12, w12, #1                            //increment rev_ctr32
5274	fmov	d3, x10                               //CTR block 3
5275
5276	rev	w9, w12                                 //CTR block 1
5277	add	w12, w12, #1                            //CTR block 1
5278	fmov	d1, x10                               //CTR block 1
5279
5280	orr	x9, x11, x9, lsl #32            //CTR block 1
5281	ld1	{ v0.16b}, [x16]                             //special case vector load initial counter so we can start first AES block as quickly as possible
5282
5283	fmov	v1.d[1], x9                               //CTR block 1
5284	rev	w9, w12                                 //CTR block 2
5285	add	w12, w12, #1                            //CTR block 2
5286
5287	fmov	d2, x10                               //CTR block 2
5288	orr	x9, x11, x9, lsl #32            //CTR block 2
5289
5290	fmov	v2.d[1], x9                               //CTR block 2
5291	rev	w9, w12                                 //CTR block 3
5292
5293	orr	x9, x11, x9, lsl #32            //CTR block 3
5294	ld1	{v23.4s}, [x8], #16                               //load rk5
5295
5296	fmov	v3.d[1], x9                               //CTR block 3
5297	add	w12, w12, #1                            //CTR block 3
5298
5299	ld1	{v24.4s}, [x8], #16                               //load rk6
5300
5301	ld1	{v25.4s}, [x8], #16                               //load rk7
5302
5303	ld1	{v26.4s}, [x8], #16                               //load rk8
5304
5305	aese	v0.16b, v18.16b
5306	aesmc	v0.16b, v0.16b          //AES block 0 - round 0
5307	ldr	q14, [x3, #80]                         //load h3l | h3h
5308#ifndef __AARCH64EB__
5309	ext	v14.16b, v14.16b, v14.16b, #8
5310#endif
5311
5312	aese	v3.16b, v18.16b
5313	aesmc	v3.16b, v3.16b          //AES block 3 - round 0
5314	ldr	q15, [x3, #112]                        //load h4l | h4h
5315#ifndef __AARCH64EB__
5316	ext	v15.16b, v15.16b, v15.16b, #8
5317#endif
5318
5319	aese	v1.16b, v18.16b
5320	aesmc	v1.16b, v1.16b          //AES block 1 - round 0
5321	ldr	q13, [x3, #64]                         //load h2l | h2h
5322#ifndef __AARCH64EB__
5323	ext	v13.16b, v13.16b, v13.16b, #8
5324#endif
5325
5326	aese	v2.16b, v18.16b
5327	aesmc	v2.16b, v2.16b          //AES block 2 - round 0
5328	ld1	{v27.4s}, [x8], #16                                 //load rk9
5329
5330	aese	v0.16b, v19.16b
5331	aesmc	v0.16b, v0.16b          //AES block 0 - round 1
5332
5333	aese	v1.16b, v19.16b
5334	aesmc	v1.16b, v1.16b          //AES block 1 - round 1
5335	ld1	{ v11.16b}, [x3]                       //load the running GHASH tag from [x3]
5336	ext	v11.16b, v11.16b, v11.16b, #8          //swap the 64-bit halves of the tag
5337	rev64	v11.16b, v11.16b                       //byte-reverse each 64-bit half of the tag
5338
5339	aese	v2.16b, v19.16b
5340	aesmc	v2.16b, v2.16b          //AES block 2 - round 1
5341	ld1	{v28.4s}, [x8], #16                              //load rk10
5342
5343	aese	v3.16b, v19.16b
5344	aesmc	v3.16b, v3.16b          //AES block 3 - round 1
5345	ld1	{v29.4s}, [x8], #16                              //load rk11
5346
5347	aese	v0.16b, v20.16b
5348	aesmc	v0.16b, v0.16b          //AES block 0 - round 2
5349	ldr	q12, [x3, #32]                         //load h1l | h1h
5350#ifndef __AARCH64EB__
5351	ext	v12.16b, v12.16b, v12.16b, #8
5352#endif
5353	aese	v2.16b, v20.16b
5354	aesmc	v2.16b, v2.16b          //AES block 2 - round 2
5355	ld1	{v30.4s}, [x8], #16                              //load rk12
5356
5357	aese	v3.16b, v20.16b
5358	aesmc	v3.16b, v3.16b          //AES block 3 - round 2
5359
5360	aese	v0.16b, v21.16b
5361	aesmc	v0.16b, v0.16b          //AES block 0 - round 3
5362
5363	aese	v1.16b, v20.16b
5364	aesmc	v1.16b, v1.16b          //AES block 1 - round 2
5365
5366	aese	v3.16b, v21.16b
5367	aesmc	v3.16b, v3.16b          //AES block 3 - round 3
5368
5369	aese	v0.16b, v22.16b
5370	aesmc	v0.16b, v0.16b          //AES block 0 - round 4
5371	cmp	x0, x5                   //check if we have <= 4 blocks
5372
5373	aese	v2.16b, v21.16b
5374	aesmc	v2.16b, v2.16b          //AES block 2 - round 3
5375
5376	aese	v1.16b, v21.16b
5377	aesmc	v1.16b, v1.16b          //AES block 1 - round 3
5378
5379	aese	v3.16b, v22.16b
5380	aesmc	v3.16b, v3.16b          //AES block 3 - round 4
5381
5382	aese	v2.16b, v22.16b
5383	aesmc	v2.16b, v2.16b          //AES block 2 - round 4
5384
5385	aese	v1.16b, v22.16b
5386	aesmc	v1.16b, v1.16b          //AES block 1 - round 4
5387
5388	aese	v3.16b, v23.16b
5389	aesmc	v3.16b, v3.16b          //AES block 3 - round 5
5390
5391	aese	v0.16b, v23.16b
5392	aesmc	v0.16b, v0.16b          //AES block 0 - round 5
5393
5394	aese	v1.16b, v23.16b
5395	aesmc	v1.16b, v1.16b          //AES block 1 - round 5
5396
5397	aese	v2.16b, v23.16b
5398	aesmc	v2.16b, v2.16b          //AES block 2 - round 5
5399
5400	aese	v0.16b, v24.16b
5401	aesmc	v0.16b, v0.16b          //AES block 0 - round 6
5402
5403	aese	v3.16b, v24.16b
5404	aesmc	v3.16b, v3.16b          //AES block 3 - round 6
5405
5406	aese	v1.16b, v24.16b
5407	aesmc	v1.16b, v1.16b          //AES block 1 - round 6
5408
5409	aese	v2.16b, v24.16b
5410	aesmc	v2.16b, v2.16b          //AES block 2 - round 6
5411
5412	aese	v0.16b, v25.16b
5413	aesmc	v0.16b, v0.16b          //AES block 0 - round 7
5414
5415	aese	v1.16b, v25.16b
5416	aesmc	v1.16b, v1.16b          //AES block 1 - round 7
5417
5418	aese	v3.16b, v25.16b
5419	aesmc	v3.16b, v3.16b          //AES block 3 - round 7
5420
5421	aese	v0.16b, v26.16b
5422	aesmc	v0.16b, v0.16b          //AES block 0 - round 8
5423
5424	aese	v2.16b, v25.16b
5425	aesmc	v2.16b, v2.16b          //AES block 2 - round 7
5426
5427	aese	v3.16b, v26.16b
5428	aesmc	v3.16b, v3.16b          //AES block 3 - round 8
5429
5430	aese	v1.16b, v26.16b
5431	aesmc	v1.16b, v1.16b          //AES block 1 - round 8
5432
5433	aese	v0.16b, v27.16b
5434	aesmc	v0.16b, v0.16b          //AES block 0 - round 9
5435
5436	aese	v2.16b, v26.16b
5437	aesmc	v2.16b, v2.16b          //AES block 2 - round 8
5438	ld1	{v31.4s}, [x8], #16                             //load rk13
5439
5440	aese	v1.16b, v27.16b
5441	aesmc	v1.16b, v1.16b          //AES block 1 - round 9
5442
5443	aese	v0.16b, v28.16b
5444	aesmc	v0.16b, v0.16b          //AES block 0 - round 10
5445
5446	aese	v3.16b, v27.16b
5447	aesmc	v3.16b, v3.16b          //AES block 3 - round 9
5448
5449	aese	v1.16b, v28.16b
5450	aesmc	v1.16b, v1.16b          //AES block 1 - round 10
5451
5452	aese	v2.16b, v27.16b
5453	aesmc	v2.16b, v2.16b          //AES block 2 - round 9
5454
5455	aese	v3.16b, v28.16b
5456	aesmc	v3.16b, v3.16b          //AES block 3 - round 10
5457
5458	aese	v0.16b, v29.16b
5459	aesmc	v0.16b, v0.16b          //AES block 0 - round 11
5460
5461	aese	v2.16b, v28.16b
5462	aesmc	v2.16b, v2.16b          //AES block 2 - round 10
5463
5464	aese	v3.16b, v29.16b
5465	aesmc	v3.16b, v3.16b          //AES block 3 - round 11
5466
5467	aese	v1.16b, v29.16b
5468	aesmc	v1.16b, v1.16b          //AES block 1 - round 11
5469
5470	aese	v2.16b, v29.16b
5471	aesmc	v2.16b, v2.16b          //AES block 2 - round 11
5472
5473	trn1	v9.2d, v14.2d,    v15.2d                      //h4h | h3h
5474
5475	trn2	v17.2d,  v14.2d,    v15.2d                      //h4l | h3l
5476
5477	trn1	v8.2d,    v12.2d,    v13.2d                      //h2h | h1h
5478	trn2	v16.2d,  v12.2d,    v13.2d                      //h2l | h1l
5479
5480	aese	v1.16b, v30.16b
5481	aesmc	v1.16b, v1.16b          //AES block 1 - round 12
5482
5483	aese	v0.16b, v30.16b
5484	aesmc	v0.16b, v0.16b          //AES block 0 - round 12
5485
5486	aese	v2.16b, v30.16b
5487	aesmc	v2.16b, v2.16b          //AES block 2 - round 12
5488
5489	aese	v3.16b, v30.16b
5490	aesmc	v3.16b, v3.16b          //AES block 3 - round 12
5491	eor	v17.16b, v17.16b, v9.16b                  //h4k | h3k
5492
5493	aese	v1.16b, v31.16b                                     //AES block 1 - round 13
5494
5495	aese	v2.16b, v31.16b                                     //AES block 2 - round 13
5496	eor	v16.16b, v16.16b, v8.16b                     //h2k | h1k
5497
5498	aese	v3.16b, v31.16b                                     //AES block 3 - round 13
5499
5500	aese	v0.16b, v31.16b                                     //AES block 0 - round 13
5501	b.ge	.L256_dec_tail                                    //handle tail (flags come from the cmp x0, x5 above)
5502
5503	ld1	{v4.16b, v5.16b}, [x0], #32               //AES block 0,1 - load ciphertext
5504
5505	rev	w9, w12                                 //CTR block 4
5506
5507	eor	v0.16b, v4.16b, v0.16b                            //AES block 0 - result
5508
5509	eor	v1.16b, v5.16b, v1.16b                            //AES block 1 - result
5510	rev64	v5.16b, v5.16b                                    //GHASH block 1
5511	ld1	{v6.16b}, [x0], #16                       //AES block 2 - load ciphertext
5512
5513	mov	x7, v0.d[1]                            //AES block 0 - mov high
5514
5515	mov	x6, v0.d[0]                            //AES block 0 - mov low
5516	rev64	v4.16b, v4.16b                                    //GHASH block 0
5517	add	w12, w12, #1                            //CTR block 4
5518
5519	fmov	d0, x10                               //CTR block 4
5520	orr	x9, x11, x9, lsl #32            //CTR block 4
5521
5522	fmov	v0.d[1], x9                               //CTR block 4
5523	rev	w9, w12                                 //CTR block 5
5524	add	w12, w12, #1                            //CTR block 5
5525
5526	mov	x19, v1.d[0]                            //AES block 1 - mov low
5527
5528	orr	x9, x11, x9, lsl #32            //CTR block 5
5529	mov	x20, v1.d[1]                            //AES block 1 - mov high
5530	eor	x7, x7, x14                   //AES block 0 - round 14 high
5531#ifdef __AARCH64EB__
5532	rev	x7, x7
5533#endif
5534	eor	x6, x6, x13                   //AES block 0 - round 14 low
5535#ifdef __AARCH64EB__
5536	rev	x6, x6
5537#endif
5538	stp	x6, x7, [x2], #16        //AES block 0 - store result
5539	fmov	d1, x10                               //CTR block 5
5540
5541	ld1	{v7.16b}, [x0], #16                       //AES block 3 - load ciphertext
5542
5543	fmov	v1.d[1], x9                               //CTR block 5
5544	rev	w9, w12                                 //CTR block 6
5545	add	w12, w12, #1                            //CTR block 6
5546
5547	eor	x19, x19, x13                   //AES block 1 - round 14 low
5548#ifdef __AARCH64EB__
5549	rev	x19, x19
5550#endif
5551	orr	x9, x11, x9, lsl #32            //CTR block 6
5552
5553	eor	x20, x20, x14                   //AES block 1 - round 14 high
5554#ifdef __AARCH64EB__
5555	rev	x20, x20
5556#endif
5557	stp	x19, x20, [x2], #16        //AES block 1 - store result
5558
5559	eor	v2.16b, v6.16b, v2.16b                            //AES block 2 - result
5560	cmp	x0, x5                   //check if we have <= 8 blocks
5561	b.ge	.L256_dec_prepretail                              //do prepretail
5562
5563.L256_dec_main_loop:	//main	loop start
5564	mov	x21, v2.d[0]                            //AES block 4k+2 - mov low
5565	ext	v11.16b, v11.16b, v11.16b, #8                     //PRE 0
5566	eor	v3.16b, v7.16b, v3.16b                            //AES block 4k+3 - result
5567
5568	aese	v0.16b, v18.16b
5569	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 0
5570	mov	x22, v2.d[1]                            //AES block 4k+2 - mov high
5571
5572	aese	v1.16b, v18.16b
5573	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 0
5574	fmov	d2, x10                               //CTR block 4k+6
5575
5576	fmov	v2.d[1], x9                               //CTR block 4k+6
5577	eor	v4.16b, v4.16b, v11.16b                           //PRE 1
5578	rev	w9, w12                                 //CTR block 4k+7
5579
5580	aese	v0.16b, v19.16b
5581	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 1
5582	mov	x24, v3.d[1]                            //AES block 4k+3 - mov high
5583
5584	aese	v1.16b, v19.16b
5585	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 1
5586	mov	x23, v3.d[0]                            //AES block 4k+3 - mov low
5587
5588	pmull2	v9.1q, v4.2d, v15.2d                       //GHASH block 4k - high
5589	mov	d8, v4.d[1]                                  //GHASH block 4k - mid
5590	fmov	d3, x10                               //CTR block 4k+7
5591
5592	aese	v0.16b, v20.16b
5593	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 2
5594	orr	x9, x11, x9, lsl #32            //CTR block 4k+7
5595
5596	aese	v2.16b, v18.16b
5597	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 0
5598	fmov	v3.d[1], x9                               //CTR block 4k+7
5599
5600	aese	v1.16b, v20.16b
5601	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 2
5602	eor	v8.8b, v8.8b, v4.8b                          //GHASH block 4k - mid
5603
5604	aese	v0.16b, v21.16b
5605	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 3
5606	eor	x22, x22, x14                   //AES block 4k+2 - round 14 high
5607#ifdef __AARCH64EB__
5608	rev	x22, x22
5609#endif
5610	aese	v2.16b, v19.16b
5611	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 1
5612	mov	d10, v17.d[1]                               //GHASH block 4k - mid
5613
5614	aese	v1.16b, v21.16b
5615	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 3
5616	rev64	v6.16b, v6.16b                                    //GHASH block 4k+2
5617
5618	aese	v3.16b, v18.16b
5619	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 0
5620	eor	x21, x21, x13                   //AES block 4k+2 - round 14 low
5621#ifdef __AARCH64EB__
5622	rev	x21, x21
5623#endif
5624	aese	v2.16b, v20.16b
5625	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 2
5626	stp	x21, x22, [x2], #16        //AES block 4k+2 - store result
5627
5628	pmull	v11.1q, v4.1d, v15.1d                       //GHASH block 4k - low
5629
5630	pmull2	v4.1q, v5.2d, v14.2d                          //GHASH block 4k+1 - high
5631
5632	aese	v2.16b, v21.16b
5633	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 3
5634	rev64	v7.16b, v7.16b                                    //GHASH block 4k+3
5635
5636	pmull	v10.1q, v8.1d, v10.1d                      //GHASH block 4k - mid
5637	eor	x23, x23, x13                   //AES block 4k+3 - round 14 low
5638#ifdef __AARCH64EB__
5639	rev	x23, x23
5640#endif
5641	pmull	v8.1q, v5.1d, v14.1d                          //GHASH block 4k+1 - low
5642	eor	x24, x24, x14                   //AES block 4k+3 - round 14 high
5643#ifdef __AARCH64EB__
5644	rev	x24, x24
5645#endif
5646	eor	v9.16b, v9.16b, v4.16b                         //GHASH block 4k+1 - high
5647
5648	aese	v2.16b, v22.16b
5649	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 4
5650
5651	aese	v3.16b, v19.16b
5652	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 1
5653	mov	d4, v5.d[1]                                  //GHASH block 4k+1 - mid
5654
5655	aese	v0.16b, v22.16b
5656	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 4
5657	eor	v11.16b, v11.16b, v8.16b                         //GHASH block 4k+1 - low
5658
5659	aese	v2.16b, v23.16b
5660	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 5
5661	add	w12, w12, #1                            //CTR block 4k+7
5662
5663	aese	v3.16b, v20.16b
5664	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 2
5665	mov	d8, v6.d[1]                                  //GHASH block 4k+2 - mid
5666
5667	aese	v1.16b, v22.16b
5668	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 4
5669	eor	v4.8b, v4.8b, v5.8b                          //GHASH block 4k+1 - mid
5670
5671	pmull	v5.1q, v6.1d, v13.1d                          //GHASH block 4k+2 - low
5672
5673	aese	v3.16b, v21.16b
5674	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 3
5675	eor	v8.8b, v8.8b, v6.8b                          //GHASH block 4k+2 - mid
5676
5677	aese	v1.16b, v23.16b
5678	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 5
5679
5680	aese	v0.16b, v23.16b
5681	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 5
5682	eor	v11.16b, v11.16b, v5.16b                         //GHASH block 4k+2 - low
5683
5684	pmull	v4.1q, v4.1d, v17.1d                          //GHASH block 4k+1 - mid
5685	rev	w9, w12                                 //CTR block 4k+8
5686
5687	aese	v1.16b, v24.16b
5688	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 6
5689	ins	v8.d[1], v8.d[0]                                //GHASH block 4k+2 - mid
5690
5691	aese	v0.16b, v24.16b
5692	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 6
5693	add	w12, w12, #1                            //CTR block 4k+8
5694
5695	aese	v3.16b, v22.16b
5696	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 4
5697
5698	aese	v1.16b, v25.16b
5699	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 7
5700	eor	v10.16b, v10.16b, v4.16b                         //GHASH block 4k+1 - mid
5701
5702	aese	v0.16b, v25.16b
5703	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 7
5704
5705	pmull2	v4.1q, v6.2d, v13.2d                          //GHASH block 4k+2 - high
5706	mov	d6, v7.d[1]                                  //GHASH block 4k+3 - mid
5707
5708	aese	v3.16b, v23.16b
5709	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 5
5710
5711	pmull2	v8.1q, v8.2d, v16.2d                          //GHASH block 4k+2 - mid
5712
5713	aese	v0.16b, v26.16b
5714	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 8
5715	eor	v9.16b, v9.16b, v4.16b                         //GHASH block 4k+2 - high
5716
5717	aese	v3.16b, v24.16b
5718	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 6
5719
5720	pmull	v4.1q, v7.1d, v12.1d                          //GHASH block 4k+3 - low
5721	orr	x9, x11, x9, lsl #32            //CTR block 4k+8
5722	eor	v10.16b, v10.16b, v8.16b                         //GHASH block 4k+2 - mid
5723
5724	pmull2	v5.1q, v7.2d, v12.2d                          //GHASH block 4k+3 - high
5725
5726	aese	v0.16b, v27.16b
5727	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 9
5728	eor	v6.8b, v6.8b, v7.8b                          //GHASH block 4k+3 - mid
5729
5730	aese	v1.16b, v26.16b
5731	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 8
5732
5733	aese	v2.16b, v24.16b
5734	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 6
5735	eor	v9.16b, v9.16b, v5.16b                         //GHASH block 4k+3 - high
5736
5737	aese	v0.16b, v28.16b
5738	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 10
5739
5740	pmull	v6.1q, v6.1d, v16.1d                          //GHASH block 4k+3 - mid
5741	movi	v8.8b, #0xc2
5742
5743	aese	v2.16b, v25.16b
5744	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 7
5745	eor	v11.16b, v11.16b, v4.16b                         //GHASH block 4k+3 - low
5746
5747	aese	v0.16b, v29.16b
5748	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 11
5749
5750	aese	v3.16b, v25.16b
5751	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 7
5752	shl	d8, d8, #56               //mod_constant
5753
5754	aese	v2.16b, v26.16b
5755	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 8
5756	eor	v10.16b, v10.16b, v6.16b                         //GHASH block 4k+3 - mid
5757
5758	aese	v0.16b, v30.16b
5759	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 12
5760
5761	pmull	v7.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid
5762	eor	v6.16b, v11.16b, v9.16b                         //MODULO - karatsuba tidy up
5763
5764	aese	v1.16b, v27.16b
5765	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 9
5766	ld1	{v4.16b}, [x0], #16                       //AES block 4k+4 - load ciphertext
5767
5768	aese	v0.16b, v31.16b                                     //AES block 4k+4 - round 13
5769	ext	v9.16b, v9.16b, v9.16b, #8                     //MODULO - other top alignment
5770
5771	aese	v1.16b, v28.16b
5772	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 10
5773	eor	v10.16b, v10.16b, v6.16b                         //MODULO - karatsuba tidy up
5774
5775	aese	v2.16b, v27.16b
5776	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 9
5777	ld1	{v5.16b}, [x0], #16                       //AES block 4k+5 - load ciphertext
5778
5779	aese	v3.16b, v26.16b
5780	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 8
5781	eor	v0.16b, v4.16b, v0.16b                            //AES block 4k+4 - result
5782
5783	aese	v1.16b, v29.16b
5784	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 11
5785	stp	x23, x24, [x2], #16        //AES block 4k+3 - store result
5786
5787	aese	v2.16b, v28.16b
5788	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 10
5789	eor	v10.16b, v10.16b, v7.16b                      //MODULO - fold into mid
5790
5791	aese	v3.16b, v27.16b
5792	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 9
5793	ld1	{v6.16b}, [x0], #16                       //AES block 4k+6 - load ciphertext
5794
5795	aese	v1.16b, v30.16b
5796	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 12
5797	ld1	{v7.16b}, [x0], #16                       //AES block 4k+7 - load ciphertext
5798
5799	aese	v2.16b, v29.16b
5800	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 11
5801	mov	x7, v0.d[1]                            //AES block 4k+4 - mov high
5802
5803	aese	v3.16b, v28.16b
5804	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 10
5805	eor	v10.16b, v10.16b, v9.16b                         //MODULO - fold into mid
5806
5807	aese	v1.16b, v31.16b                                     //AES block 4k+5 - round 13
5808	mov	x6, v0.d[0]                            //AES block 4k+4 - mov low
5809
5810	aese	v2.16b, v30.16b
5811	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 12
5812	fmov	d0, x10                               //CTR block 4k+8
5813
5814	aese	v3.16b, v29.16b
5815	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 11
5816	fmov	v0.d[1], x9                               //CTR block 4k+8
5817
5818	pmull	v8.1q, v10.1d, v8.1d     //MODULO - mid 64b align with low
5819	eor	v1.16b, v5.16b, v1.16b                            //AES block 4k+5 - result
5820	rev	w9, w12                                 //CTR block 4k+9
5821
5822	aese	v2.16b, v31.16b                                     //AES block 4k+6 - round 13
5823	orr	x9, x11, x9, lsl #32            //CTR block 4k+9
5824	cmp	x0, x5                   //.LOOP CONTROL
5825
5826	add	w12, w12, #1                            //CTR block 4k+9
5827
5828	eor	x6, x6, x13                   //AES block 4k+4 - round 14 low
5829#ifdef __AARCH64EB__
5830	rev	x6, x6
5831#endif
5832	eor	x7, x7, x14                   //AES block 4k+4 - round 14 high
5833#ifdef __AARCH64EB__
5834	rev	x7, x7
5835#endif
5836	mov	x20, v1.d[1]                            //AES block 4k+5 - mov high
5837	eor	v2.16b, v6.16b, v2.16b                            //AES block 4k+6 - result
5838	eor	v11.16b, v11.16b, v8.16b               //MODULO - fold into low
5839
5840	aese	v3.16b, v30.16b
5841	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 12
5842	mov	x19, v1.d[0]                            //AES block 4k+5 - mov low
5843
5844	fmov	d1, x10                               //CTR block 4k+9
5845	ext	v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment
5846
5847	fmov	v1.d[1], x9                               //CTR block 4k+9
5848	rev	w9, w12                                 //CTR block 4k+10
5849	add	w12, w12, #1                            //CTR block 4k+10
5850
5851	aese	v3.16b, v31.16b                                     //AES block 4k+7 - round 13
5852	orr	x9, x11, x9, lsl #32            //CTR block 4k+10
5853
5854	rev64	v5.16b, v5.16b                                    //GHASH block 4k+5
5855	eor	x20, x20, x14                   //AES block 4k+5 - round 14 high
5856#ifdef __AARCH64EB__
5857	rev	x20, x20
5858#endif
5859	stp	x6, x7, [x2], #16        //AES block 4k+4 - store result
5860
5861	eor	x19, x19, x13                   //AES block 4k+5 - round 14 low
5862#ifdef __AARCH64EB__
5863	rev	x19, x19
5864#endif
5865	stp	x19, x20, [x2], #16        //AES block 4k+5 - store result
5866
5867	rev64	v4.16b, v4.16b                                    //GHASH block 4k+4
5868	eor	v11.16b, v11.16b, v10.16b                         //MODULO - fold into low
5869	b.lt	.L256_dec_main_loop
5870
5871
5872.L256_dec_prepretail:	//PREPRETAIL
5873	ext	v11.16b, v11.16b, v11.16b, #8                     //PRE 0
5874	mov	x21, v2.d[0]                            //AES block 4k+2 - mov low
5875	eor	v3.16b, v7.16b, v3.16b                            //AES block 4k+3 - result
5876
5877	aese	v0.16b, v18.16b
5878	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 0
5879	mov	x22, v2.d[1]                            //AES block 4k+2 - mov high
5880
5881	aese	v1.16b, v18.16b
5882	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 0
5883	fmov	d2, x10                               //CTR block 4k+6
5884
5885	fmov	v2.d[1], x9                               //CTR block 4k+6
5886	rev	w9, w12                                 //CTR block 4k+7
5887	eor	v4.16b, v4.16b, v11.16b                           //PRE 1
5888
5889	rev64	v6.16b, v6.16b                                    //GHASH block 4k+2
5890	orr	x9, x11, x9, lsl #32            //CTR block 4k+7
5891	mov	x23, v3.d[0]                            //AES block 4k+3 - mov low
5892
5893	aese	v1.16b, v19.16b
5894	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 1
5895	mov	x24, v3.d[1]                            //AES block 4k+3 - mov high
5896
5897	pmull	v11.1q, v4.1d, v15.1d                       //GHASH block 4k - low
5898	mov	d8, v4.d[1]                                  //GHASH block 4k - mid
5899	fmov	d3, x10                               //CTR block 4k+7
5900
5901	pmull2	v9.1q, v4.2d, v15.2d                       //GHASH block 4k - high
5902	fmov	v3.d[1], x9                               //CTR block 4k+7
5903
5904	aese	v2.16b, v18.16b
5905	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 0
5906	mov	d10, v17.d[1]                               //GHASH block 4k - mid
5907
5908	aese	v0.16b, v19.16b
5909	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 1
5910	eor	v8.8b, v8.8b, v4.8b                          //GHASH block 4k - mid
5911
5912	pmull2	v4.1q, v5.2d, v14.2d                          //GHASH block 4k+1 - high
5913
5914	aese	v2.16b, v19.16b
5915	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 1
5916	rev64	v7.16b, v7.16b                                    //GHASH block 4k+3
5917
5918	aese	v3.16b, v18.16b
5919	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 0
5920
5921	pmull	v10.1q, v8.1d, v10.1d                      //GHASH block 4k - mid
5922	eor	v9.16b, v9.16b, v4.16b                         //GHASH block 4k+1 - high
5923
5924	pmull	v8.1q, v5.1d, v14.1d                          //GHASH block 4k+1 - low
5925
5926	aese	v3.16b, v19.16b
5927	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 1
5928	mov	d4, v5.d[1]                                  //GHASH block 4k+1 - mid
5929
5930	aese	v0.16b, v20.16b
5931	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 2
5932
5933	aese	v1.16b, v20.16b
5934	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 2
5935	eor	v11.16b, v11.16b, v8.16b                         //GHASH block 4k+1 - low
5936
5937	aese	v2.16b, v20.16b
5938	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 2
5939
5940	aese	v0.16b, v21.16b
5941	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 3
5942	mov	d8, v6.d[1]                                  //GHASH block 4k+2 - mid
5943
5944	aese	v3.16b, v20.16b
5945	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 2
5946	eor	v4.8b, v4.8b, v5.8b                          //GHASH block 4k+1 - mid
5947
5948	pmull	v5.1q, v6.1d, v13.1d                          //GHASH block 4k+2 - low
5949
5950	aese	v0.16b, v22.16b
5951	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 4
5952
5953	aese	v3.16b, v21.16b
5954	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 3
5955	eor	v8.8b, v8.8b, v6.8b                          //GHASH block 4k+2 - mid
5956
5957	pmull	v4.1q, v4.1d, v17.1d                          //GHASH block 4k+1 - mid
5958
5959	aese	v0.16b, v23.16b
5960	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 5
5961	eor	v11.16b, v11.16b, v5.16b                         //GHASH block 4k+2 - low
5962
5963	aese	v3.16b, v22.16b
5964	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 4
5965
5966	pmull2	v5.1q, v7.2d, v12.2d                          //GHASH block 4k+3 - high
5967	eor	v10.16b, v10.16b, v4.16b                         //GHASH block 4k+1 - mid
5968
5969	pmull2	v4.1q, v6.2d, v13.2d                          //GHASH block 4k+2 - high
5970
5971	aese	v3.16b, v23.16b
5972	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 5
5973	ins	v8.d[1], v8.d[0]                                //GHASH block 4k+2 - mid
5974
5975	aese	v2.16b, v21.16b
5976	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 3
5977
5978	aese	v1.16b, v21.16b
5979	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 3
5980	eor	v9.16b, v9.16b, v4.16b                         //GHASH block 4k+2 - high
5981
5982	pmull	v4.1q, v7.1d, v12.1d                          //GHASH block 4k+3 - low
5983
5984	aese	v2.16b, v22.16b
5985	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 4
5986	mov	d6, v7.d[1]                                  //GHASH block 4k+3 - mid
5987
5988	aese	v1.16b, v22.16b
5989	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 4
5990
5991	pmull2	v8.1q, v8.2d, v16.2d                          //GHASH block 4k+2 - mid
5992
5993	aese	v2.16b, v23.16b
5994	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 5
5995	eor	v6.8b, v6.8b, v7.8b                          //GHASH block 4k+3 - mid
5996
5997	aese	v1.16b, v23.16b
5998	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 5
5999
6000	aese	v3.16b, v24.16b
6001	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 6
6002	eor	v10.16b, v10.16b, v8.16b                         //GHASH block 4k+2 - mid
6003
6004	aese	v2.16b, v24.16b
6005	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 6
6006
6007	aese	v0.16b, v24.16b
6008	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 6
6009	movi	v8.8b, #0xc2
6010
6011	aese	v1.16b, v24.16b
6012	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 6
6013	eor	v11.16b, v11.16b, v4.16b                         //GHASH block 4k+3 - low
6014
6015	pmull	v6.1q, v6.1d, v16.1d                          //GHASH block 4k+3 - mid
6016
6017	aese	v3.16b, v25.16b
6018	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 7
6019	eor	v9.16b, v9.16b, v5.16b                         //GHASH block 4k+3 - high
6020
6021	aese	v1.16b, v25.16b
6022	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 7
6023
6024	aese	v0.16b, v25.16b
6025	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 7
6026	eor	v10.16b, v10.16b, v6.16b                         //GHASH block 4k+3 - mid
6027
6028	aese	v3.16b, v26.16b
6029	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 8
6030
6031	aese	v2.16b, v25.16b
6032	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 7
6033	eor	v6.16b, v11.16b, v9.16b                         //MODULO - karatsuba tidy up
6034
6035	aese	v1.16b, v26.16b
6036	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 8
6037
6038	aese	v0.16b, v26.16b
6039	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 8
6040	shl	d8, d8, #56               //mod_constant
6041
6042	aese	v2.16b, v26.16b
6043	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 8
6044
6045	aese	v1.16b, v27.16b
6046	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 9
6047	eor	v10.16b, v10.16b, v6.16b                         //MODULO - karatsuba tidy up
6048
6049	pmull	v7.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid
6050
6051	aese	v2.16b, v27.16b
6052	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 9
6053	ext	v9.16b, v9.16b, v9.16b, #8                     //MODULO - other top alignment
6054
6055	aese	v3.16b, v27.16b
6056	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 9
6057
6058	aese	v0.16b, v27.16b
6059	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 9
6060	eor	v10.16b, v10.16b, v7.16b                      //MODULO - fold into mid
6061
6062	aese	v2.16b, v28.16b
6063	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 10
6064
6065	aese	v3.16b, v28.16b
6066	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 10
6067
6068	aese	v0.16b, v28.16b
6069	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 10
6070	eor	x22, x22, x14                   //AES block 4k+2 - round 14 high
6071#ifdef __AARCH64EB__
6072	rev	x22, x22
6073#endif
6074	aese	v1.16b, v28.16b
6075	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 10
6076	eor	x23, x23, x13                   //AES block 4k+3 - round 14 low
6077#ifdef __AARCH64EB__
6078	rev	x23, x23
6079#endif
6080	aese	v2.16b, v29.16b
6081	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 11
6082	eor	v10.16b, v10.16b, v9.16b                         //MODULO - fold into mid
6083
6084	aese	v0.16b, v29.16b
6085	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 11
6086	add	w12, w12, #1                            //CTR block 4k+7
6087
6088	aese	v1.16b, v29.16b
6089	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 11
6090	eor	x21, x21, x13                   //AES block 4k+2 - round 14 low
6091#ifdef __AARCH64EB__
6092	rev	x21, x21
6093#endif
6094
6095	aese	v2.16b, v30.16b
6096	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 12
6097
6098	pmull	v8.1q, v10.1d, v8.1d     //MODULO - mid 64b align with low
6099	eor	x24, x24, x14                   //AES block 4k+3 - round 14 high
6100#ifdef __AARCH64EB__
6101	rev	x24, x24
6102#endif
6103
6104	aese	v3.16b, v29.16b
6105	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 11
6106	stp	x21, x22, [x2], #16        //AES block 4k+2 - store result
6107
6108	aese	v1.16b, v30.16b
6109	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 12
6110	ext	v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment
6111
6112	aese	v0.16b, v30.16b
6113	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 12
6114	stp	x23, x24, [x2], #16        //AES block 4k+3 - store result
6115
6116	aese	v3.16b, v30.16b
6117	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 12
6118	eor	v11.16b, v11.16b, v8.16b               //MODULO - fold into low
6119
6120	aese	v1.16b, v31.16b                                     //AES block 4k+5 - round 13
6121
6122	aese	v0.16b, v31.16b                                     //AES block 4k+4 - round 13
6123
6124	aese	v3.16b, v31.16b                                     //AES block 4k+7 - round 13
6125
6126	aese	v2.16b, v31.16b                                     //AES block 4k+6 - round 13
6127	eor	v11.16b, v11.16b, v10.16b                         //MODULO - fold into low
6128.L256_dec_tail:	//TAIL
6129
6130	sub	x5, x4, x0   //main_end_input_ptr is number of bytes left to process
6131	ld1	{ v5.16b}, [x0], #16                      //AES block 4k+4 - load ciphertext
6132
6133	eor	v0.16b, v5.16b, v0.16b                            //AES block 4k+4 - result
6134
6135	mov	x6, v0.d[0]                            //AES block 4k+4 - mov low
6136
6137	mov	x7, v0.d[1]                            //AES block 4k+4 - mov high
6138	ext	v8.16b, v11.16b, v11.16b, #8                     //prepare final partial tag
6139
6140	cmp	x5, #48
6141
6142	eor	x6, x6, x13                   //AES block 4k+4 - round 14 low
6143#ifdef __AARCH64EB__
6144	rev	x6, x6
6145#endif
6146
6147	eor	x7, x7, x14                   //AES block 4k+4 - round 14 high
6148#ifdef __AARCH64EB__
6149	rev	x7, x7
6150#endif
6151	b.gt	.L256_dec_blocks_more_than_3
6152
6153	sub	w12, w12, #1
6154	mov	v3.16b, v2.16b
6155	movi	v10.8b, #0
6156
6157	movi	v11.8b, #0
6158	cmp	x5, #32
6159
6160	movi	v9.8b, #0
6161	mov	v2.16b, v1.16b
6162	b.gt	.L256_dec_blocks_more_than_2
6163
6164	sub	w12, w12, #1
6165
6166	mov	v3.16b, v1.16b
6167	cmp	x5, #16
6168	b.gt	.L256_dec_blocks_more_than_1
6169
6170	sub	w12, w12, #1
6171	b	.L256_dec_blocks_less_than_1
6172.L256_dec_blocks_more_than_3:	//blocks	left >  3
6173	rev64	v4.16b, v5.16b                                   //GHASH final-3 block
6174	ld1	{ v5.16b}, [x0], #16                     //AES final-2 block - load ciphertext
6175
6176	stp	x6, x7, [x2], #16       //AES final-3 block  - store result
6177
6178	mov	d10, v17.d[1]                              //GHASH final-3 block - mid
6179
6180	eor	v4.16b, v4.16b, v8.16b                          //feed in partial tag
6181
6182	eor	v0.16b, v5.16b, v1.16b                           //AES final-2 block - result
6183
6184	mov	d22, v4.d[1]                                //GHASH final-3 block - mid
6185
6186	mov	x6, v0.d[0]                           //AES final-2 block - mov low
6187
6188	mov	x7, v0.d[1]                           //AES final-2 block - mov high
6189
6190	eor	v22.8b, v22.8b, v4.8b                     //GHASH final-3 block - mid
6191
6192	movi	v8.8b, #0                                       //suppress further partial tag feed in
6193
6194	pmull2	v9.1q, v4.2d, v15.2d                      //GHASH final-3 block - high
6195
6196	pmull	v10.1q, v22.1d, v10.1d                   //GHASH final-3 block - mid
6197	eor	x6, x6, x13                  //AES final-2 block - round 14 low
6198#ifdef __AARCH64EB__
6199	rev	x6, x6
6200#endif
6201
6202	pmull	v11.1q, v4.1d, v15.1d                      //GHASH final-3 block - low
6203	eor	x7, x7, x14                  //AES final-2 block - round 14 high
6204#ifdef __AARCH64EB__
6205	rev	x7, x7
6206#endif
6207.L256_dec_blocks_more_than_2:	//blocks	left >  2
6208
6209	rev64	v4.16b, v5.16b                                   //GHASH final-2 block
6210	ld1	{ v5.16b}, [x0], #16                     //AES final-1 block - load ciphertext
6211
6212	eor	v4.16b, v4.16b, v8.16b                          //feed in partial tag
6213	stp	x6, x7, [x2], #16       //AES final-2 block  - store result
6214
6215	eor	v0.16b, v5.16b, v2.16b                           //AES final-1 block - result
6216
6217	mov	d22, v4.d[1]                                //GHASH final-2 block - mid
6218
6219	pmull	v21.1q, v4.1d, v14.1d                         //GHASH final-2 block - low
6220
6221	pmull2	v20.1q, v4.2d, v14.2d                         //GHASH final-2 block - high
6222
6223	eor	v22.8b, v22.8b, v4.8b                     //GHASH final-2 block - mid
6224	mov	x6, v0.d[0]                           //AES final-1 block - mov low
6225
6226	mov	x7, v0.d[1]                           //AES final-1 block - mov high
6227	eor	v11.16b, v11.16b, v21.16b                           //GHASH final-2 block - low
6228	movi	v8.8b, #0                                       //suppress further partial tag feed in
6229
6230	pmull	v22.1q, v22.1d, v17.1d                     //GHASH final-2 block - mid
6231
6232	eor	v9.16b, v9.16b, v20.16b                           //GHASH final-2 block - high
6233	eor	x6, x6, x13                  //AES final-1 block - round 14 low
6234#ifdef __AARCH64EB__
6235	rev	x6, x6
6236#endif
6237
6238	eor	v10.16b, v10.16b, v22.16b                      //GHASH final-2 block - mid
6239	eor	x7, x7, x14                  //AES final-1 block - round 14 high
6240#ifdef __AARCH64EB__
6241	rev	x7, x7
6242#endif
6243.L256_dec_blocks_more_than_1:	//blocks	left >  1
6244
6245	stp	x6, x7, [x2], #16       //AES final-1 block  - store result
6246	rev64	v4.16b, v5.16b                                   //GHASH final-1 block
6247
6248	ld1	{ v5.16b}, [x0], #16                     //AES final block - load ciphertext
6249
6250	eor	v4.16b, v4.16b, v8.16b                          //feed in partial tag
6251	movi	v8.8b, #0                                       //suppress further partial tag feed in
6252
6253	mov	d22, v4.d[1]                                //GHASH final-1 block - mid
6254
6255	eor	v0.16b, v5.16b, v3.16b                           //AES final block - result
6256
6257	pmull2	v20.1q, v4.2d, v13.2d                         //GHASH final-1 block - high
6258
6259	eor	v22.8b, v22.8b, v4.8b                     //GHASH final-1 block - mid
6260
6261	pmull	v21.1q, v4.1d, v13.1d                         //GHASH final-1 block - low
6262	mov	x6, v0.d[0]                           //AES final block - mov low
6263
6264	ins	v22.d[1], v22.d[0]                           //GHASH final-1 block - mid
6265
6266	mov	x7, v0.d[1]                           //AES final block - mov high
6267
6268	pmull2	v22.1q, v22.2d, v16.2d                     //GHASH final-1 block - mid
6269	eor	x6, x6, x13                  //AES final block - round 14 low
6270#ifdef __AARCH64EB__
6271	rev	x6, x6
6272#endif
6273	eor	v11.16b, v11.16b, v21.16b                           //GHASH final-1 block - low
6274
6275	eor	v9.16b, v9.16b, v20.16b                           //GHASH final-1 block - high
6276
6277	eor	v10.16b, v10.16b, v22.16b                      //GHASH final-1 block - mid
6278	eor	x7, x7, x14                  //AES final block - round 14 high
6279#ifdef __AARCH64EB__
6280	rev	x7, x7
6281#endif
6282.L256_dec_blocks_less_than_1:	//blocks	left <= 1
6283
6284	and	x1, x1, #127                   //bit_length %= 128
6285	mvn	x14, xzr                                     //rk14_h = 0xffffffffffffffff
6286
6287	sub	x1, x1, #128                   //bit_length -= 128
6288	mvn	x13, xzr                                     //rk14_l = 0xffffffffffffffff
6289
6290	ldp	x4, x5, [x2] //load existing bytes we need to not overwrite
6291	neg	x1, x1                         //bit_length = 128 - #bits in input (in range [1,128])
6292
6293	and	x1, x1, #127                   //bit_length %= 128
6294
6295	lsr	x14, x14, x1                    //rk14_h is mask for top 64b of last block
6296	cmp	x1, #64
6297
6298	csel	x9, x13, x14, lt
6299	csel	x10, x14, xzr, lt
6300
6301	fmov	d0, x9                                  //ctr0b is mask for last block
6302	and	x6, x6, x9
6303
6304	mov	v0.d[1], x10
6305	bic	x4, x4, x9          //mask out low existing bytes
6306
6307#ifndef __AARCH64EB__
6308	rev	w9, w12
6309#else
6310	mov	w9, w12
6311#endif
6312
6313	bic	x5, x5, x10      //mask out high existing bytes
6314
6315	orr	x6, x6, x4
6316
6317	and	x7, x7, x10
6318
6319	orr	x7, x7, x5
6320
6321	and	v5.16b, v5.16b, v0.16b                            //possibly partial last block has zeroes in highest bits
6322
6323	rev64	v4.16b, v5.16b                                    //GHASH final block
6324
6325	eor	v4.16b, v4.16b, v8.16b                           //feed in partial tag
6326
6327	pmull	v21.1q, v4.1d, v12.1d                          //GHASH final block - low
6328
6329	mov	d8, v4.d[1]                                  //GHASH final block - mid
6330
6331	eor	v8.8b, v8.8b, v4.8b                          //GHASH final block - mid
6332
6333	pmull2	v20.1q, v4.2d, v12.2d                          //GHASH final block - high
6334
6335	pmull	v8.1q, v8.1d, v16.1d                          //GHASH final block - mid
6336
6337	eor	v9.16b, v9.16b, v20.16b                            //GHASH final block - high
6338
6339	eor	v11.16b, v11.16b, v21.16b                            //GHASH final block - low
6340
6341	eor	v10.16b, v10.16b, v8.16b                         //GHASH final block - mid
6342	movi	v8.8b, #0xc2
6343
6344	eor	v6.16b, v11.16b, v9.16b                         //MODULO - karatsuba tidy up
6345
6346	shl	d8, d8, #56               //mod_constant
6347
6348	eor	v10.16b, v10.16b, v6.16b                         //MODULO - karatsuba tidy up
6349
6350	pmull	v7.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid
6351
6352	ext	v9.16b, v9.16b, v9.16b, #8                     //MODULO - other top alignment
6353
6354	eor	v10.16b, v10.16b, v7.16b                      //MODULO - fold into mid
6355
6356	eor	v10.16b, v10.16b, v9.16b                         //MODULO - fold into mid
6357
6358	pmull	v8.1q, v10.1d, v8.1d     //MODULO - mid 64b align with low
6359
6360	ext	v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment
6361
6362	eor	v11.16b, v11.16b, v8.16b               //MODULO - fold into low
6363
6364	stp	x6, x7, [x2]
6365
6366	str	w9, [x16, #12]                          //store the updated counter
6367
6368	eor	v11.16b, v11.16b, v10.16b                         //MODULO - fold into low
6369	ext	v11.16b, v11.16b, v11.16b, #8
6370	rev64	v11.16b, v11.16b
6371	mov	x0, x15
6372	st1	{ v11.16b }, [x3]
6373
6374	ldp	x21, x22, [sp, #16]
6375	ldp	x23, x24, [sp, #32]
6376	ldp	d8, d9, [sp, #48]
6377	ldp	d10, d11, [sp, #64]
6378	ldp	d12, d13, [sp, #80]
6379	ldp	d14, d15, [sp, #96]
6380	ldp	x19, x20, [sp], #112
6381	ret
6382
6383.L256_dec_ret:
6384	mov	w0, #0x0
6385	ret
6386.size	aes_gcm_dec_256_kernel,.-aes_gcm_dec_256_kernel
// Identification string stored as raw bytes: 50 ASCII characters followed by
// a terminating NUL (51 bytes total). Decoded, it reads:
//   "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
// No label is attached to it in this chunk, so nothing visible here refers to it.
6387.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
// Re-align the location counter after the odd-length string. The second
// directive repeats the first with the same argument, so it has no further effect.
6388.align	2
6389.align	2
6390#endif
6391