xref: /freebsd/sys/crypto/openssl/aarch64/aes-gcm-armv8_64.S (revision ac77b2621508c6a50ab01d07fe8d43795d908f05)
1/* Do not modify. This file is auto-generated from aes-gcm-armv8_64.pl. */
2#include "arm_arch.h"
3
4#if __ARM_MAX_ARCH__>=8
5.arch	armv8-a+crypto
6.text
7.globl	aes_gcm_enc_128_kernel
8.type	aes_gcm_enc_128_kernel,%function
9.align	4
10aes_gcm_enc_128_kernel:
11	AARCH64_VALID_CALL_TARGET
12	cbz	x1, .L128_enc_ret
13	stp	x19, x20, [sp, #-112]!
14	mov	x16, x4
15	mov	x8, x5
16	stp	x21, x22, [sp, #16]
17	stp	x23, x24, [sp, #32]
18	stp	d8, d9, [sp, #48]
19	stp	d10, d11, [sp, #64]
20	stp	d12, d13, [sp, #80]
21	stp	d14, d15, [sp, #96]
22
23	ldp	x10, x11, [x16]              //ctr96_b64, ctr96_t32
24#ifdef __AARCH64EB__
25	rev	x10, x10
26	rev	x11, x11
27#endif
28	ldp	x13, x14, [x8, #160]                     //load rk10
29#ifdef __AARCH64EB__
30	ror	x13, x13, #32
31	ror	x14, x14, #32
32#endif
33	ld1	{v11.16b}, [x3]
34	ext	v11.16b, v11.16b, v11.16b, #8
35	rev64	v11.16b, v11.16b
36	lsr	x5, x1, #3              //byte_len
37	mov	x15, x5
38
39	ld1	{v18.4s}, [x8], #16								  //load rk0
40	add	x4, x0, x1, lsr #3   //end_input_ptr
41	sub	x5, x5, #1      //byte_len - 1
42
43	lsr	x12, x11, #32
44	ldr	q15, [x3, #112]                        //load h4l | h4h
45#ifndef __AARCH64EB__
46	ext	v15.16b, v15.16b, v15.16b, #8
47#endif
48	fmov	d1, x10                               //CTR block 1
49	rev	w12, w12                                //rev_ctr32
50
51	add	w12, w12, #1                            //increment rev_ctr32
52	orr	w11, w11, w11
53	ld1	{v19.4s}, [x8], #16								  //load rk1
54
55	rev	w9, w12                                 //CTR block 1
56	add	w12, w12, #1                            //CTR block 1
57	fmov	d3, x10                               //CTR block 3
58
59	orr	x9, x11, x9, lsl #32            //CTR block 1
60	ld1	{ v0.16b}, [x16]                             //special case vector load initial counter so we can start first AES block as quickly as possible
61
62	fmov	v1.d[1], x9                               //CTR block 1
63	rev	w9, w12                                 //CTR block 2
64
65	fmov	d2, x10                               //CTR block 2
66	orr	x9, x11, x9, lsl #32            //CTR block 2
67	add	w12, w12, #1                            //CTR block 2
68
69	fmov	v2.d[1], x9                               //CTR block 2
70	rev	w9, w12                                 //CTR block 3
71
72	orr	x9, x11, x9, lsl #32            //CTR block 3
73	ld1	{v20.4s}, [x8], #16								  //load rk2
74
75	add	w12, w12, #1                            //CTR block 3
76	fmov	v3.d[1], x9                               //CTR block 3
77
78	ldr	q14, [x3, #80]                         //load h3l | h3h
79#ifndef __AARCH64EB__
80	ext	v14.16b, v14.16b, v14.16b, #8
81#endif
82	aese	v1.16b, v18.16b
83	aesmc	v1.16b, v1.16b          //AES block 1 - round 0
84	ld1	{v21.4s}, [x8], #16								  //load rk3
85
86	aese	v2.16b, v18.16b
87	aesmc	v2.16b, v2.16b          //AES block 2 - round 0
88	ldr	q12, [x3, #32]                         //load h1l | h1h
89#ifndef __AARCH64EB__
90	ext	v12.16b, v12.16b, v12.16b, #8
91#endif
92
93	aese	v0.16b, v18.16b
94	aesmc	v0.16b, v0.16b          //AES block 0 - round 0
95	ld1	{v22.4s}, [x8], #16								  //load rk4
96
97	aese	v3.16b, v18.16b
98	aesmc	v3.16b, v3.16b          //AES block 3 - round 0
99	ld1	{v23.4s}, [x8], #16								  //load rk5
100
101	aese	v2.16b, v19.16b
102	aesmc	v2.16b, v2.16b          //AES block 2 - round 1
103	trn2	v17.2d,  v14.2d,    v15.2d                      //h4l | h3l
104
105	aese	v0.16b, v19.16b
106	aesmc	v0.16b, v0.16b          //AES block 0 - round 1
107	ld1	{v24.4s}, [x8], #16								  //load rk6
108
109	aese	v1.16b, v19.16b
110	aesmc	v1.16b, v1.16b          //AES block 1 - round 1
111	ld1	{v25.4s}, [x8], #16								  //load rk7
112
113	aese	v3.16b, v19.16b
114	aesmc	v3.16b, v3.16b          //AES block 3 - round 1
115	trn1	v9.2d, v14.2d,    v15.2d                      //h4h | h3h
116
117	aese	v0.16b, v20.16b
118	aesmc	v0.16b, v0.16b          //AES block 0 - round 2
119	ld1	{v26.4s}, [x8], #16								  //load rk8
120
121	aese	v1.16b, v20.16b
122	aesmc	v1.16b, v1.16b          //AES block 1 - round 2
123	ldr	q13, [x3, #64]                         //load h2l | h2h
124#ifndef __AARCH64EB__
125	ext	v13.16b, v13.16b, v13.16b, #8
126#endif
127
128	aese	v3.16b, v20.16b
129	aesmc	v3.16b, v3.16b          //AES block 3 - round 2
130
131	aese	v2.16b, v20.16b
132	aesmc	v2.16b, v2.16b          //AES block 2 - round 2
133	eor	v17.16b, v17.16b, v9.16b                  //h4k | h3k
134
135	aese	v0.16b, v21.16b
136	aesmc	v0.16b, v0.16b          //AES block 0 - round 3
137
138	aese	v1.16b, v21.16b
139	aesmc	v1.16b, v1.16b          //AES block 1 - round 3
140
141	aese	v2.16b, v21.16b
142	aesmc	v2.16b, v2.16b          //AES block 2 - round 3
143	ld1	{v27.4s}, [x8], #16								  //load rk9
144
145	aese	v3.16b, v21.16b
146	aesmc	v3.16b, v3.16b          //AES block 3 - round 3
147
148	and	x5, x5, #0xffffffffffffffc0    //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
149	trn2	v16.2d,  v12.2d,    v13.2d                      //h2l | h1l
150
151	aese	v3.16b, v22.16b
152	aesmc	v3.16b, v3.16b          //AES block 3 - round 4
153	add	x5, x5, x0
154
155	aese	v2.16b, v22.16b
156	aesmc	v2.16b, v2.16b          //AES block 2 - round 4
157	cmp	x0, x5                   //check if we have <= 4 blocks
158
159	aese	v0.16b, v22.16b
160	aesmc	v0.16b, v0.16b          //AES block 0 - round 4
161
162	aese	v3.16b, v23.16b
163	aesmc	v3.16b, v3.16b          //AES block 3 - round 5
164
165	aese	v2.16b, v23.16b
166	aesmc	v2.16b, v2.16b          //AES block 2 - round 5
167
168	aese	v0.16b, v23.16b
169	aesmc	v0.16b, v0.16b          //AES block 0 - round 5
170
171	aese	v3.16b, v24.16b
172	aesmc	v3.16b, v3.16b          //AES block 3 - round 6
173
174	aese	v1.16b, v22.16b
175	aesmc	v1.16b, v1.16b          //AES block 1 - round 4
176
177	aese	v2.16b, v24.16b
178	aesmc	v2.16b, v2.16b          //AES block 2 - round 6
179	trn1	v8.2d,    v12.2d,    v13.2d                      //h2h | h1h
180
181	aese	v0.16b, v24.16b
182	aesmc	v0.16b, v0.16b          //AES block 0 - round 6
183
184	aese	v1.16b, v23.16b
185	aesmc	v1.16b, v1.16b          //AES block 1 - round 5
186
187	aese	v3.16b, v25.16b
188	aesmc	v3.16b, v3.16b          //AES block 3 - round 7
189
190	aese	v0.16b, v25.16b
191	aesmc	v0.16b, v0.16b          //AES block 0 - round 7
192
193	aese	v1.16b, v24.16b
194	aesmc	v1.16b, v1.16b          //AES block 1 - round 6
195
196	aese	v2.16b, v25.16b
197	aesmc	v2.16b, v2.16b          //AES block 2 - round 7
198
199	aese	v0.16b, v26.16b
200	aesmc	v0.16b, v0.16b          //AES block 0 - round 8
201
202	aese	v1.16b, v25.16b
203	aesmc	v1.16b, v1.16b          //AES block 1 - round 7
204
205	aese	v2.16b, v26.16b
206	aesmc	v2.16b, v2.16b          //AES block 2 - round 8
207
208	aese	v3.16b, v26.16b
209	aesmc	v3.16b, v3.16b          //AES block 3 - round 8
210
211	aese	v1.16b, v26.16b
212	aesmc	v1.16b, v1.16b          //AES block 1 - round 8
213
214	aese	v2.16b, v27.16b                                      //AES block 2 - round 9
215
216	aese	v0.16b, v27.16b                                      //AES block 0 - round 9
217
218	eor	v16.16b, v16.16b, v8.16b                     //h2k | h1k
219
220	aese	v1.16b, v27.16b                                      //AES block 1 - round 9
221
222	aese	v3.16b, v27.16b                                      //AES block 3 - round 9
223	b.ge	.L128_enc_tail                                    //handle tail
224
225	ldp	x6, x7, [x0, #0]            //AES block 0 - load plaintext
226#ifdef __AARCH64EB__
227	rev	x6, x6
228	rev	x7, x7
229#endif
230	ldp	x21, x22, [x0, #32]           //AES block 2 - load plaintext
231#ifdef __AARCH64EB__
232	rev	x21, x21
233	rev	x22, x22
234#endif
235	ldp	x19, x20, [x0, #16]           //AES block 1 - load plaintext
236#ifdef __AARCH64EB__
237	rev	x19, x19
238	rev	x20, x20
239#endif
240	ldp	x23, x24, [x0, #48]           //AES block 3 - load plaintext
241#ifdef __AARCH64EB__
242	rev	x23, x23
243	rev	x24, x24
244#endif
245	eor	x6, x6, x13                     //AES block 0 - round 10 low
246	eor	x7, x7, x14                     //AES block 0 - round 10 high
247
248	eor	x21, x21, x13                     //AES block 2 - round 10 low
249	fmov	d4, x6                               //AES block 0 - mov low
250
251	eor	x19, x19, x13                     //AES block 1 - round 10 low
252	eor	x22, x22, x14                     //AES block 2 - round 10 high
253	fmov	v4.d[1], x7                           //AES block 0 - mov high
254
255	fmov	d5, x19                               //AES block 1 - mov low
256	eor	x20, x20, x14                     //AES block 1 - round 10 high
257
258	eor	x23, x23, x13                     //AES block 3 - round 10 low
259	fmov	v5.d[1], x20                           //AES block 1 - mov high
260
261	fmov	d6, x21                               //AES block 2 - mov low
262	eor	x24, x24, x14                     //AES block 3 - round 10 high
263	rev	w9, w12                                 //CTR block 4
264
265	fmov	v6.d[1], x22                           //AES block 2 - mov high
266	orr	x9, x11, x9, lsl #32            //CTR block 4
267
268	eor	v4.16b, v4.16b, v0.16b                          //AES block 0 - result
269	fmov	d0, x10                               //CTR block 4
270	add	w12, w12, #1                            //CTR block 4
271
272	fmov	v0.d[1], x9                               //CTR block 4
273	rev	w9, w12                                 //CTR block 5
274
275	eor	v5.16b, v5.16b, v1.16b                          //AES block 1 - result
276	fmov	d1, x10                               //CTR block 5
277	orr	x9, x11, x9, lsl #32            //CTR block 5
278
279	add	w12, w12, #1                            //CTR block 5
280	add	x0, x0, #64                       //AES input_ptr update
281	fmov	v1.d[1], x9                               //CTR block 5
282
283	fmov	d7, x23                               //AES block 3 - mov low
284	rev	w9, w12                                 //CTR block 6
285	st1	{ v4.16b}, [x2], #16                     //AES block 0 - store result
286
287	fmov	v7.d[1], x24                           //AES block 3 - mov high
288	orr	x9, x11, x9, lsl #32            //CTR block 6
289
290	add	w12, w12, #1                            //CTR block 6
291	eor	v6.16b, v6.16b, v2.16b                          //AES block 2 - result
292	st1	{ v5.16b}, [x2], #16                     //AES block 1 - store result
293
294	fmov	d2, x10                               //CTR block 6
295	cmp	x0, x5                   //check if we have <= 8 blocks
296
297	fmov	v2.d[1], x9                               //CTR block 6
298	rev	w9, w12                                 //CTR block 7
299	st1	{ v6.16b}, [x2], #16                     //AES block 2 - store result
300
301	orr	x9, x11, x9, lsl #32            //CTR block 7
302
303	eor	v7.16b, v7.16b, v3.16b                          //AES block 3 - result
304	st1	{ v7.16b}, [x2], #16                     //AES block 3 - store result
305	b.ge	.L128_enc_prepretail                              //do prepretail
306
307.L128_enc_main_loop:	//main	loop start
308	ldp	x23, x24, [x0, #48]           //AES block 4k+3 - load plaintext
309#ifdef __AARCH64EB__
310	rev	x23, x23
311	rev	x24, x24
312#endif
313	rev64	v4.16b, v4.16b                                    //GHASH block 4k (only t0 is free)
314	rev64	v6.16b, v6.16b                                    //GHASH block 4k+2 (t0, t1, and t2 free)
315
316	aese	v2.16b, v18.16b
317	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 0
318	fmov	d3, x10                               //CTR block 4k+3
319
320	ext	v11.16b, v11.16b, v11.16b, #8                     //PRE 0
321	rev64	v5.16b, v5.16b                                    //GHASH block 4k+1 (t0 and t1 free)
322
323	aese	v1.16b, v18.16b
324	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 0
325	add	w12, w12, #1                            //CTR block 4k+3
326	fmov	v3.d[1], x9                               //CTR block 4k+3
327
328	aese	v0.16b, v18.16b
329	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 0
330	mov	d31, v6.d[1]                                  //GHASH block 4k+2 - mid
331
332	aese	v2.16b, v19.16b
333	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 1
334	mov	d30, v5.d[1]                                  //GHASH block 4k+1 - mid
335
336	aese	v1.16b, v19.16b
337	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 1
338	eor	v4.16b, v4.16b, v11.16b                           //PRE 1
339
340	aese	v3.16b, v18.16b
341	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 0
342	eor	x24, x24, x14                     //AES block 4k+3 - round 10 high
343
344	pmull2	v28.1q, v5.2d, v14.2d                          //GHASH block 4k+1 - high
345	eor	v31.8b, v31.8b, v6.8b                          //GHASH block 4k+2 - mid
346	ldp	x6, x7, [x0, #0]            //AES block 4k+4 - load plaintext
347#ifdef __AARCH64EB__
348	rev	x6, x6
349	rev	x7, x7
350#endif
351	aese	v0.16b, v19.16b
352	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 1
353	rev	w9, w12                                 //CTR block 4k+8
354
355	eor	v30.8b, v30.8b, v5.8b                          //GHASH block 4k+1 - mid
356	mov	d8, v4.d[1]                                  //GHASH block 4k - mid
357	orr	x9, x11, x9, lsl #32            //CTR block 4k+8
358
359	pmull2	v9.1q, v4.2d, v15.2d                       //GHASH block 4k - high
360	add	w12, w12, #1                            //CTR block 4k+8
361	mov	d10, v17.d[1]                               //GHASH block 4k - mid
362
363	aese	v0.16b, v20.16b
364	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 2
365
366	pmull	v11.1q, v4.1d, v15.1d                       //GHASH block 4k - low
367	eor	v8.8b, v8.8b, v4.8b                          //GHASH block 4k - mid
368
369	aese	v1.16b, v20.16b
370	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 2
371
372	aese	v0.16b, v21.16b
373	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 3
374	eor	v9.16b, v9.16b, v28.16b                         //GHASH block 4k+1 - high
375
376	pmull	v28.1q, v6.1d, v13.1d                          //GHASH block 4k+2 - low
377
378	pmull	v10.1q, v8.1d, v10.1d                      //GHASH block 4k - mid
379	rev64	v7.16b, v7.16b                                    //GHASH block 4k+3 (t0, t1, t2 and t3 free)
380
381	pmull	v30.1q, v30.1d, v17.1d                          //GHASH block 4k+1 - mid
382
383	pmull	v29.1q, v5.1d, v14.1d                          //GHASH block 4k+1 - low
384	ins	v31.d[1], v31.d[0]                                //GHASH block 4k+2 - mid
385
386	pmull2	v8.1q, v6.2d, v13.2d                          //GHASH block 4k+2 - high
387	eor	x7, x7, x14                     //AES block 4k+4 - round 10 high
388
389	eor	v10.16b, v10.16b, v30.16b                         //GHASH block 4k+1 - mid
390	mov	d30, v7.d[1]                                  //GHASH block 4k+3 - mid
391
392	aese	v3.16b, v19.16b
393	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 1
394	eor	v11.16b, v11.16b, v29.16b                         //GHASH block 4k+1 - low
395
396	aese	v2.16b, v20.16b
397	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 2
398	eor	x6, x6, x13                     //AES block 4k+4 - round 10 low
399
400	aese	v1.16b, v21.16b
401	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 3
402	eor	v30.8b, v30.8b, v7.8b                          //GHASH block 4k+3 - mid
403
404	pmull2	v4.1q, v7.2d, v12.2d                          //GHASH block 4k+3 - high
405
406	aese	v2.16b, v21.16b
407	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 3
408	eor	v9.16b, v9.16b, v8.16b                         //GHASH block 4k+2 - high
409
410	pmull2	v31.1q, v31.2d, v16.2d                          //GHASH block 4k+2 - mid
411
412	pmull	v29.1q, v7.1d, v12.1d                          //GHASH block 4k+3 - low
413	movi	v8.8b, #0xc2
414
415	pmull	v30.1q, v30.1d, v16.1d                          //GHASH block 4k+3 - mid
416	eor	v11.16b, v11.16b, v28.16b                         //GHASH block 4k+2 - low
417
418	aese	v1.16b, v22.16b
419	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 4
420
421	aese	v3.16b, v20.16b
422	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 2
423	shl	d8, d8, #56               //mod_constant
424
425	aese	v0.16b, v22.16b
426	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 4
427	eor	v9.16b, v9.16b, v4.16b                         //GHASH block 4k+3 - high
428
429	aese	v1.16b, v23.16b
430	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 5
431	ldp	x19, x20, [x0, #16]           //AES block 4k+5 - load plaintext
432#ifdef __AARCH64EB__
433	rev	x19, x19
434	rev	x20, x20
435#endif
436	aese	v3.16b, v21.16b
437	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 3
438	eor	v10.16b, v10.16b, v31.16b                         //GHASH block 4k+2 - mid
439
440	aese	v0.16b, v23.16b
441	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 5
442	ldp	x21, x22, [x0, #32]           //AES block 4k+6 - load plaintext
443#ifdef __AARCH64EB__
444	rev	x21, x21
445	rev	x22, x22
446#endif
447	pmull	v31.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid
448	eor	v11.16b, v11.16b, v29.16b                         //GHASH block 4k+3 - low
449
450	aese	v2.16b, v22.16b
451	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 4
452	eor	x19, x19, x13                     //AES block 4k+5 - round 10 low
453
454	aese	v3.16b, v22.16b
455	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 4
456	eor	v10.16b, v10.16b, v30.16b                         //GHASH block 4k+3 - mid
457
458	aese	v1.16b, v24.16b
459	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 6
460	eor	x23, x23, x13                     //AES block 4k+3 - round 10 low
461
462	aese	v2.16b, v23.16b
463	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 5
464	eor	v30.16b, v11.16b, v9.16b                         //MODULO - karatsuba tidy up
465
466	fmov	d4, x6                               //AES block 4k+4 - mov low
467	aese	v0.16b, v24.16b
468	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 6
469	fmov	v4.d[1], x7                           //AES block 4k+4 - mov high
470
471	add	x0, x0, #64                       //AES input_ptr update
472	fmov	d7, x23                               //AES block 4k+3 - mov low
473	ext	v9.16b, v9.16b, v9.16b, #8                     //MODULO - other top alignment
474
475	aese	v3.16b, v23.16b
476	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 5
477	fmov	d5, x19                               //AES block 4k+5 - mov low
478
479	aese	v0.16b, v25.16b
480	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 7
481	eor	v10.16b, v10.16b, v30.16b                         //MODULO - karatsuba tidy up
482
483	aese	v2.16b, v24.16b
484	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 6
485	eor	x20, x20, x14                     //AES block 4k+5 - round 10 high
486
487	aese	v1.16b, v25.16b
488	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 7
489	fmov	v5.d[1], x20                           //AES block 4k+5 - mov high
490
491	aese	v0.16b, v26.16b
492	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 8
493	fmov	v7.d[1], x24                           //AES block 4k+3 - mov high
494
495	aese	v3.16b, v24.16b
496	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 6
497	cmp	x0, x5                   //.LOOP CONTROL
498
499	aese	v1.16b, v26.16b
500	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 8
501	eor	v10.16b, v10.16b, v31.16b                      //MODULO - fold into mid
502
503	aese	v0.16b, v27.16b                                      //AES block 4k+4 - round 9
504	eor	x21, x21, x13                     //AES block 4k+6 - round 10 low
505	eor	x22, x22, x14                     //AES block 4k+6 - round 10 high
506
507	aese	v3.16b, v25.16b
508	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 7
509	fmov	d6, x21                               //AES block 4k+6 - mov low
510
511	aese	v1.16b, v27.16b                                      //AES block 4k+5 - round 9
512	fmov	v6.d[1], x22                           //AES block 4k+6 - mov high
513
514	aese	v2.16b, v25.16b
515	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 7
516	eor	v4.16b, v4.16b, v0.16b                          //AES block 4k+4 - result
517
518	fmov	d0, x10                               //CTR block 4k+8
519	aese	v3.16b, v26.16b
520	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 8
521
522	fmov	v0.d[1], x9                               //CTR block 4k+8
523	rev	w9, w12                                 //CTR block 4k+9
524	eor	v10.16b, v10.16b, v9.16b                         //MODULO - fold into mid
525
526	aese	v2.16b, v26.16b
527	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 8
528	eor	v5.16b, v5.16b, v1.16b                          //AES block 4k+5 - result
529
530	add	w12, w12, #1                            //CTR block 4k+9
531	orr	x9, x11, x9, lsl #32            //CTR block 4k+9
532	fmov	d1, x10                               //CTR block 4k+9
533
534	pmull	v9.1q, v10.1d, v8.1d            //MODULO - mid 64b align with low
535	fmov	v1.d[1], x9                               //CTR block 4k+9
536	rev	w9, w12                                 //CTR block 4k+10
537
538	aese	v2.16b, v27.16b                                      //AES block 4k+6 - round 9
539	st1	{ v4.16b}, [x2], #16                     //AES block 4k+4 - store result
540	eor	v6.16b, v6.16b, v2.16b                          //AES block 4k+6 - result
541	orr	x9, x11, x9, lsl #32            //CTR block 4k+10
542
543	aese	v3.16b, v27.16b                                      //AES block 4k+7 - round 9
544	add	w12, w12, #1                            //CTR block 4k+10
545	ext	v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment
546	fmov	d2, x10                               //CTR block 4k+10
547
548	eor	v11.16b, v11.16b, v9.16b                         //MODULO - fold into low
549	st1	{ v5.16b}, [x2], #16                     //AES block 4k+5 - store result
550
551	fmov	v2.d[1], x9                               //CTR block 4k+10
552	st1	{ v6.16b}, [x2], #16                     //AES block 4k+6 - store result
553	rev	w9, w12                                 //CTR block 4k+11
554
555	orr	x9, x11, x9, lsl #32            //CTR block 4k+11
556	eor	v7.16b, v7.16b, v3.16b                          //AES block 4k+3 - result
557
558	eor	v11.16b, v11.16b, v10.16b                         //MODULO - fold into low
559	st1	{ v7.16b}, [x2], #16                     //AES block 4k+3 - store result
560	b.lt	.L128_enc_main_loop
561
562.L128_enc_prepretail:	//PREPRETAIL
563	rev64	v4.16b, v4.16b                                    //GHASH block 4k (only t0 is free)
564	fmov	d3, x10                               //CTR block 4k+3
565	rev64	v5.16b, v5.16b                                    //GHASH block 4k+1 (t0 and t1 free)
566
567	ext	v11.16b, v11.16b, v11.16b, #8                     //PRE 0
568	add	w12, w12, #1                            //CTR block 4k+3
569	fmov	v3.d[1], x9                               //CTR block 4k+3
570
571	aese	v1.16b, v18.16b
572	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 0
573	rev64	v6.16b, v6.16b                                    //GHASH block 4k+2 (t0, t1, and t2 free)
574
575	pmull	v29.1q, v5.1d, v14.1d                          //GHASH block 4k+1 - low
576
577	rev64	v7.16b, v7.16b                                    //GHASH block 4k+3 (t0, t1, t2 and t3 free)
578	eor	v4.16b, v4.16b, v11.16b                           //PRE 1
579
580	pmull2	v28.1q, v5.2d, v14.2d                          //GHASH block 4k+1 - high
581
582	aese	v3.16b, v18.16b
583	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 0
584	mov	d30, v5.d[1]                                  //GHASH block 4k+1 - mid
585
586	pmull	v11.1q, v4.1d, v15.1d                       //GHASH block 4k - low
587	mov	d8, v4.d[1]                                  //GHASH block 4k - mid
588
589	mov	d31, v6.d[1]                                  //GHASH block 4k+2 - mid
590	mov	d10, v17.d[1]                               //GHASH block 4k - mid
591
592	aese	v1.16b, v19.16b
593	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 1
594	eor	v30.8b, v30.8b, v5.8b                          //GHASH block 4k+1 - mid
595
596	eor	v8.8b, v8.8b, v4.8b                          //GHASH block 4k - mid
597
598	pmull2	v9.1q, v4.2d, v15.2d                       //GHASH block 4k - high
599	eor	v31.8b, v31.8b, v6.8b                          //GHASH block 4k+2 - mid
600
601	aese	v3.16b, v19.16b
602	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 1
603
604	pmull	v30.1q, v30.1d, v17.1d                          //GHASH block 4k+1 - mid
605	eor	v11.16b, v11.16b, v29.16b                         //GHASH block 4k+1 - low
606
607	pmull	v10.1q, v8.1d, v10.1d                      //GHASH block 4k - mid
608
609	aese	v0.16b, v18.16b
610	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 0
611	ins	v31.d[1], v31.d[0]                                //GHASH block 4k+2 - mid
612
613	aese	v2.16b, v18.16b
614	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 0
615
616	eor	v10.16b, v10.16b, v30.16b                         //GHASH block 4k+1 - mid
617	mov	d30, v7.d[1]                                  //GHASH block 4k+3 - mid
618
619	aese	v0.16b, v19.16b
620	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 1
621	eor	v9.16b, v9.16b, v28.16b                         //GHASH block 4k+1 - high
622
623	pmull2	v31.1q, v31.2d, v16.2d                          //GHASH block 4k+2 - mid
624
625	pmull2	v8.1q, v6.2d, v13.2d                          //GHASH block 4k+2 - high
626	eor	v30.8b, v30.8b, v7.8b                          //GHASH block 4k+3 - mid
627
628	pmull2	v4.1q, v7.2d, v12.2d                          //GHASH block 4k+3 - high
629
630	pmull	v28.1q, v6.1d, v13.1d                          //GHASH block 4k+2 - low
631
632	aese	v2.16b, v19.16b
633	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 1
634	eor	v9.16b, v9.16b, v8.16b                         //GHASH block 4k+2 - high
635
636	aese	v0.16b, v20.16b
637	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 2
638
639	pmull	v29.1q, v7.1d, v12.1d                          //GHASH block 4k+3 - low
640	movi	v8.8b, #0xc2
641
642	aese	v2.16b, v20.16b
643	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 2
644	eor	v11.16b, v11.16b, v28.16b                         //GHASH block 4k+2 - low
645
646	aese	v3.16b, v20.16b
647	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 2
648
649	pmull	v30.1q, v30.1d, v16.1d                          //GHASH block 4k+3 - mid
650	eor	v10.16b, v10.16b, v31.16b                         //GHASH block 4k+2 - mid
651
652	aese	v2.16b, v21.16b
653	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 3
654
655	aese	v1.16b, v20.16b
656	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 2
657	eor	v9.16b, v9.16b, v4.16b                         //GHASH block 4k+3 - high
658
659	aese	v0.16b, v21.16b
660	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 3
661
662	eor	v10.16b, v10.16b, v30.16b                         //GHASH block 4k+3 - mid
663	shl	d8, d8, #56               //mod_constant
664
665	aese	v1.16b, v21.16b
666	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 3
667	eor	v11.16b, v11.16b, v29.16b                         //GHASH block 4k+3 - low
668
669	aese	v0.16b, v22.16b
670	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 4
671
672	pmull	v28.1q, v9.1d, v8.1d
673	eor	v10.16b, v10.16b, v9.16b                         //karatsuba tidy up
674
675	aese	v1.16b, v22.16b
676	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 4
677
678	aese	v0.16b, v23.16b
679	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 5
680	ext	v9.16b, v9.16b, v9.16b, #8
681
682	aese	v3.16b, v21.16b
683	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 3
684
685	aese	v2.16b, v22.16b
686	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 4
687	eor	v10.16b, v10.16b, v11.16b
688
689	aese	v0.16b, v24.16b
690	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 6
691
692	aese	v3.16b, v22.16b
693	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 4
694
695	aese	v1.16b, v23.16b
696	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 5
697
698	aese	v2.16b, v23.16b
699	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 5
700	eor	v10.16b, v10.16b, v28.16b
701
702	aese	v3.16b, v23.16b
703	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 5
704
705	aese	v1.16b, v24.16b
706	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 6
707
708	aese	v2.16b, v24.16b
709	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 6
710
711	aese	v3.16b, v24.16b
712	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 6
713	eor	v10.16b, v10.16b, v9.16b
714
715	aese	v0.16b, v25.16b
716	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 7
717
718	aese	v2.16b, v25.16b
719	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 7
720
721	aese	v3.16b, v25.16b
722	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 7
723
724	pmull	v28.1q, v10.1d, v8.1d
725
726	aese	v1.16b, v25.16b
727	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 7
728	ext	v10.16b, v10.16b, v10.16b, #8
729
730	aese	v3.16b, v26.16b
731	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 8
732
733	aese	v0.16b, v26.16b
734	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 8
735	eor	v11.16b, v11.16b, v28.16b
736
737	aese	v1.16b, v26.16b
738	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 8
739
740	aese	v3.16b, v27.16b                                      //AES block 4k+7 - round 9
741
742	aese	v2.16b, v26.16b
743	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 8
744
745	aese	v0.16b, v27.16b                                      //AES block 4k+4 - round 9
746
747	aese	v1.16b, v27.16b                                      //AES block 4k+5 - round 9
748	eor	v11.16b, v11.16b, v10.16b
749
750	aese	v2.16b, v27.16b                                      //AES block 4k+6 - round 9
751.L128_enc_tail:	//TAIL
752
753	sub	x5, x4, x0   //main_end_input_ptr is number of bytes left to process
754	ldp	x6, x7, [x0], #16           //AES block 4k+4 - load plaintext
755#ifdef __AARCH64EB__
756	rev	x6, x6
757	rev	x7, x7
758#endif
759	cmp	x5, #48
760
761	ext	v8.16b, v11.16b, v11.16b, #8                     //prepare final partial tag
762	eor	x6, x6, x13                     //AES block 4k+4 - round 10 low
763	eor	x7, x7, x14                     //AES block 4k+4 - round 10 high
764
765	fmov	d4, x6                               //AES block 4k+4 - mov low
766
767	fmov	v4.d[1], x7                           //AES block 4k+4 - mov high
768
769	eor	v5.16b, v4.16b, v0.16b                          //AES block 4k+4 - result
770
771	b.gt	.L128_enc_blocks_more_than_3
772
773	sub	w12, w12, #1
774	movi	v11.8b, #0
775	mov	v3.16b, v2.16b
776
777	cmp	x5, #32
778	mov	v2.16b, v1.16b
779	movi	v9.8b, #0
780
781	movi	v10.8b, #0
782	b.gt	.L128_enc_blocks_more_than_2
783
784	mov	v3.16b, v1.16b
785	cmp	x5, #16
786
787	sub	w12, w12, #1
788	b.gt	.L128_enc_blocks_more_than_1
789
790	sub	w12, w12, #1
791	b	.L128_enc_blocks_less_than_1
// More than 48 bytes remain.
// Stores the result block in v5 to [x2] (post-incremented), then hashes it:
// v4 = rev64(v5) ^ v8 is multiplied by v15, writing (not accumulating) the
// partial products into v11 (low), v9 (high) and v10 (mid, using the upper
// half of v17 as the multiplier for the XOR of v4's two halves).
// v8 is cleared afterwards so the running tag is only fed in once.
// In parallel the next 16 input bytes are loaded, XORed with x14:x13 and then
// with v1 to form the next result in v5.
// Falls through into .L128_enc_blocks_more_than_2.
792.L128_enc_blocks_more_than_3:	//blocks	left >  3
793	st1	{ v5.16b}, [x2], #16                     //AES final-3 block  - store result
794
795	ldp	x6, x7, [x0], #16           //AES final-2 block - load input low & high
796#ifdef __AARCH64EB__
797	rev	x6, x6
798	rev	x7, x7
799#endif
800	rev64	v4.16b, v5.16b                                    //GHASH final-3 block
801
802	eor	v4.16b, v4.16b, v8.16b                           //feed in partial tag
803	eor	x7, x7, x14                     //AES final-2 block - round 10 high
804	eor	x6, x6, x13                     //AES final-2 block - round 10 low
805
806	fmov	d5, x6                                 //AES final-2 block - mov low
807
808	movi	v8.8b, #0                                        //suppress further partial tag feed in
809	fmov	v5.d[1], x7                             //AES final-2 block - mov high
810
811	pmull	v11.1q, v4.1d, v15.1d                       //GHASH final-3 block - low
812	mov	d22, v4.d[1]                                 //GHASH final-3 block - mid
813
814	pmull2	v9.1q, v4.2d, v15.2d                       //GHASH final-3 block - high
815
816	mov	d10, v17.d[1]                               //GHASH final-3 block - mid
817
818	eor	v5.16b, v5.16b, v1.16b                            //AES final-2 block - result
819	eor	v22.8b, v22.8b, v4.8b                      //GHASH final-3 block - mid
820
821	pmull	v10.1q, v22.1d, v10.1d                    //GHASH final-3 block - mid
// More than 32 bytes remain (or fall-through from more_than_3).
// Stores the result block in v5 to [x2] (post-incremented) and hashes it:
// v4 = rev64(v5) ^ v8 is multiplied by v14, and the products are XORed into
// the accumulators v11 (low), v9 (high) and v10 (mid, multiplier is the lower
// half of v17). v20/v21/v22 are scratch.
// The next 16 input bytes are loaded, XORed with x14:x13 and then with v2 to
// form the next result in v5. v8 is cleared so the running tag is fed in only once.
// Falls through into .L128_enc_blocks_more_than_1.
822.L128_enc_blocks_more_than_2:	//blocks	left >  2
823
824	st1	{ v5.16b}, [x2], #16                     //AES final-2 block - store result
825
826	rev64	v4.16b, v5.16b                                    //GHASH final-2 block
827	ldp	x6, x7, [x0], #16           //AES final-1 block - load input low & high
828#ifdef __AARCH64EB__
829	rev	x6, x6
830	rev	x7, x7
831#endif
832	eor	v4.16b, v4.16b, v8.16b                           //feed in partial tag
833
834	eor	x6, x6, x13                     //AES final-1 block - round 10 low
835
836	fmov	d5, x6                                 //AES final-1 block - mov low
837	eor	x7, x7, x14                     //AES final-1 block - round 10 high
838
839	pmull2	v20.1q, v4.2d, v14.2d                          //GHASH final-2 block - high
840	fmov	v5.d[1], x7                             //AES final-1 block - mov high
841
842	mov	d22, v4.d[1]                                 //GHASH final-2 block - mid
843
844	pmull	v21.1q, v4.1d, v14.1d                          //GHASH final-2 block - low
845
846	eor	v9.16b, v9.16b, v20.16b                            //GHASH final-2 block - high
847
848	eor	v22.8b, v22.8b, v4.8b                      //GHASH final-2 block - mid
849
850	eor	v5.16b, v5.16b, v2.16b                            //AES final-1 block - result
851
852	eor	v11.16b, v11.16b, v21.16b                            //GHASH final-2 block - low
853
854	pmull	v22.1q, v22.1d, v17.1d                      //GHASH final-2 block - mid
855
856	movi	v8.8b, #0                                        //suppress further partial tag feed in
857
858	eor	v10.16b, v10.16b, v22.16b                       //GHASH final-2 block - mid
// More than 16 bytes remain (or fall-through from more_than_2).
// Stores the result block in v5 to [x2] (post-incremented) and hashes it:
// v4 = rev64(v5) ^ v8 is multiplied by v13, and the products are XORed into
// v11 (low), v9 (high) and v10 (mid). For the mid term the XOR of v4's halves
// is duplicated into the upper lane of v22 so that pmull2 can multiply it by
// the upper half of v16.
// The final 16 input bytes are loaded, XORed with x14:x13 and then with v3 to
// form the last result in v5 (masked and stored by the code that follows).
// Falls through into .L128_enc_blocks_less_than_1.
859.L128_enc_blocks_more_than_1:	//blocks	left >  1
860
861	st1	{ v5.16b}, [x2], #16                     //AES final-1 block - store result
862
863	rev64	v4.16b, v5.16b                                    //GHASH final-1 block
864	ldp	x6, x7, [x0], #16           //AES final block - load input low & high
865#ifdef __AARCH64EB__
866	rev	x6, x6
867	rev	x7, x7
868#endif
869	eor	v4.16b, v4.16b, v8.16b                           //feed in partial tag
870
871	eor	x7, x7, x14                     //AES final block - round 10 high
872	eor	x6, x6, x13                     //AES final block - round 10 low
873
874	fmov	d5, x6                                 //AES final block - mov low
875
876	pmull2	v20.1q, v4.2d, v13.2d                          //GHASH final-1 block - high
877	fmov	v5.d[1], x7                             //AES final block - mov high
878
879	mov	d22, v4.d[1]                                 //GHASH final-1 block - mid
880
881	pmull	v21.1q, v4.1d, v13.1d                          //GHASH final-1 block - low
882
883	eor	v22.8b, v22.8b, v4.8b                      //GHASH final-1 block - mid
884
885	eor	v5.16b, v5.16b, v3.16b                            //AES final block - result
886
887	ins	v22.d[1], v22.d[0]                            //GHASH final-1 block - mid
888
889	pmull2	v22.1q, v22.2d, v16.2d                      //GHASH final-1 block - mid
890
891	eor	v11.16b, v11.16b, v21.16b                            //GHASH final-1 block - low
892
893	eor	v9.16b, v9.16b, v20.16b                            //GHASH final-1 block - high
894
895	eor	v10.16b, v10.16b, v22.16b                       //GHASH final-1 block - mid
896	movi	v8.8b, #0                                        //suppress further partial tag feed in
// Final (possibly partial) block, GHASH reduction, state write-back and return.
//
// 1. Build a 128-bit byte mask in v0 for the valid part of the last block.
//    x1 becomes (128 - (bit_length & 127)) & 127, i.e. the number of unused
//    bits in the last block (0 when the block is full). x13/x14 no longer hold
//    the round-10 key; they are overwritten with all-ones to derive the mask.
//    A register-specified lsr only uses the low 6 bits of x1, so for x1 >= 64
//    x14 is all-ones shifted right by (x1 - 64).
//      x1 <  64: low half = all ones,    high half = x14
//      x1 >= 64: low half = x14,         high half = 0
// 2. Mask v5, hash it (multiplier v12, mid term uses the lower half of v16)
//    and XOR the products into v11/v9/v10.
// 3. Reduce the accumulated product using the constant 0xc2 << 56 held in d8.
// 4. Merge the masked result with the 16 bytes already present at [x2]
//    (loaded into v18, which is reused as scratch here) and store all 16
//    bytes back; the code therefore reads and writes a full 16 bytes at [x2]
//    even for a partial block.
// 5. Store the 32-bit counter to [x16, #12], write the tag (halves swapped
//    and bytes reversed within each half) to [x3], return x15 in x0 and
//    restore the registers saved on the 112-byte stack frame.
897.L128_enc_blocks_less_than_1:	//blocks	left <= 1
898
899	and	x1, x1, #127                    //bit_length %= 128
900	mvn	x13, xzr                                      //rk10_l = 0xffffffffffffffff
901
902	mvn	x14, xzr                                      //rk10_h = 0xffffffffffffffff
903	sub	x1, x1, #128                    //bit_length -= 128
904
905	neg	x1, x1                          //bit_length = 128 - #bits in input (in range [1,128])
906
907	and	x1, x1, #127                    //bit_length %= 128
908
909	lsr	x14, x14, x1                     //rk10_h is mask for top 64b of last block
910	cmp	x1, #64
911
// Select the low (x6) and high (x7) mask halves depending on whether fewer than 64 bits are unused.
912	csel	x6, x13, x14, lt
913	csel	x7, x14, xzr, lt
914
915	fmov	d0, x6                                 //ctr0b is mask for last block
916
917	fmov	v0.d[1], x7
918
919	and	v5.16b, v5.16b, v0.16b                            //possibly partial last block has zeroes in highest bits
920
921	rev64	v4.16b, v5.16b                                    //GHASH final block
922
923	eor	v4.16b, v4.16b, v8.16b                           //feed in partial tag
924
925	mov	d8, v4.d[1]                                  //GHASH final block - mid
926
927	pmull	v21.1q, v4.1d, v12.1d                          //GHASH final block - low
928	ld1	{ v18.16b}, [x2]                            //load existing bytes where the possibly partial last block is to be stored
929
930	eor	v8.8b, v8.8b, v4.8b                          //GHASH final block - mid
// w9 = counter value to write back: byte-reversed w12 on little-endian builds, w12 unchanged on big-endian builds.
931#ifndef __AARCH64EB__
932	rev	w9, w12
933#else
934	mov	w9, w12
935#endif
936	pmull2	v20.1q, v4.2d, v12.2d                          //GHASH final block - high
937
938	pmull	v8.1q, v8.1d, v16.1d                          //GHASH final block - mid
939
940	eor	v11.16b, v11.16b, v21.16b                            //GHASH final block - low
941
942	eor	v9.16b, v9.16b, v20.16b                            //GHASH final block - high
943
944	eor	v10.16b, v10.16b, v8.16b                         //GHASH final block - mid
// v8 is reused from here on for the reduction constant (0xc2 in every byte of d8, then shifted left by 56).
945	movi	v8.8b, #0xc2
946
947	eor	v30.16b, v11.16b, v9.16b                         //MODULO - karatsuba tidy up
948
949	shl	d8, d8, #56               //mod_constant
950
951	eor	v10.16b, v10.16b, v30.16b                         //MODULO - karatsuba tidy up
952
953	pmull	v31.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid
954
955	ext	v9.16b, v9.16b, v9.16b, #8                     //MODULO - other top alignment
956
957	eor	v10.16b, v10.16b, v31.16b                      //MODULO - fold into mid
958
959	eor	v10.16b, v10.16b, v9.16b                         //MODULO - fold into mid
960
961	pmull	v9.1q, v10.1d, v8.1d            //MODULO - mid 64b align with low
962
963	ext	v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment
964
965	bif	v5.16b, v18.16b, v0.16b                              //insert existing bytes in top end of result before storing
966
967	eor	v11.16b, v11.16b, v9.16b                         //MODULO - fold into low
968	st1	{ v5.16b}, [x2]                          //store all 16B
969
970	str	w9, [x16, #12]                          //store the updated counter
971
972	eor	v11.16b, v11.16b, v10.16b                         //MODULO - fold into low
// Undo the ext/rev64 applied when the tag was loaded, then write it back to [x3].
973	ext	v11.16b, v11.16b, v11.16b, #8
974	rev64	v11.16b, v11.16b
// Return value: x15, which holds the byte length computed at function entry.
975	mov	x0, x15
976	st1	{ v11.16b }, [x3]
// Restore x19-x24 and d8-d15 and release the 112-byte frame.
977	ldp	x21, x22, [sp, #16]
978	ldp	x23, x24, [sp, #32]
979	ldp	d8, d9, [sp, #48]
980	ldp	d10, d11, [sp, #64]
981	ldp	d12, d13, [sp, #80]
982	ldp	d14, d15, [sp, #96]
983	ldp	x19, x20, [sp], #112
984	ret
985
// Early exit taken by the "cbz x1" at function entry when the bit length in
// x1 is zero. That branch happens before the stack frame is created, so there
// is nothing to restore; the function simply returns 0.
986.L128_enc_ret:
987	mov	w0, #0x0
988	ret
989.size	aes_gcm_enc_128_kernel,.-aes_gcm_enc_128_kernel
990.globl	aes_gcm_dec_128_kernel
991.type	aes_gcm_dec_128_kernel,%function
992.align	4
// aes_gcm_dec_128_kernel - entry, setup and first four blocks.
//
// Register usage of the incoming arguments, as observed in this routine:
//   x0  input pointer (ciphertext is loaded from it, post-incremented)
//   x1  length in bits (x1 >> 3 is used as the byte length); when zero the
//       routine branches straight to .L128_dec_ret
//   x2  output pointer (results are stored to it, post-incremented)
//   x3  pointer to the hash state: the current tag at offset 0 and the values
//       commented as h1..h4 at offsets 32, 64, 80 and 112
//   x4  pointer to the 16-byte counter block (copied to x16)
//   x5  pointer to the round keys (copied to x8); rk0..rk9 are loaded into
//       v18..v27 and rk10 is loaded into x13/x14 from offset 160
//
// Registers set up here and used by the code that follows:
//   x15 = byte length
//   x5  = x0 + ((byte_len - 1) & ~63), the input pointer value at which the
//         4-block main loop stops
//   x4  = x0 + byte_len, end of input
//   x10 = first 8 bytes of the counter block; x11 = zero-extended low word of
//         the second 8 bytes; w12 = byte-reversed 32-bit counter, incremented
//         once per prepared block
//   v0..v3 = counter blocks 0..3 after AES rounds 0-9 (round 9 has no aesmc;
//            the x13/x14 key is XORed in later, in general-purpose registers)
//   v11 = tag with halves swapped and bytes reversed within each half
//   v16 = "h2k | h1k", v17 = "h4k | h3k" (XOR of the halves of each h value)
//
// x19-x24 and d8-d15 are saved on a 112-byte stack frame.
993aes_gcm_dec_128_kernel:
994	AARCH64_VALID_CALL_TARGET
995	cbz	x1, .L128_dec_ret
996	stp	x19, x20, [sp, #-112]!
997	mov	x16, x4
998	mov	x8, x5
999	stp	x21, x22, [sp, #16]
1000	stp	x23, x24, [sp, #32]
1001	stp	d8, d9, [sp, #48]
1002	stp	d10, d11, [sp, #64]
1003	stp	d12, d13, [sp, #80]
1004	stp	d14, d15, [sp, #96]
1005
1006	lsr	x5, x1, #3              //byte_len
1007	mov	x15, x5
1008	ldp	x10, x11, [x16]              //ctr96_b64, ctr96_t32
1009#ifdef __AARCH64EB__
1010	rev	x10, x10
1011	rev	x11, x11
1012#endif
1013	ldp	x13, x14, [x8, #160]                     //load rk10
1014#ifdef __AARCH64EB__
1015	ror	x14, x14, 32
1016	ror	x13, x13, 32
1017#endif
1018	sub	x5, x5, #1      //byte_len - 1
1019	ld1	{v18.4s}, [x8], #16                                //load rk0
1020
1021	and	x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
1022	ld1	{ v0.16b}, [x16]                             //special case vector load initial counter so we can start first AES block as quickly as possible
1023
1024	ldr	q13, [x3, #64]                         //load h2l | h2h
1025#ifndef __AARCH64EB__
1026	ext	v13.16b, v13.16b, v13.16b, #8
1027#endif
1028	lsr	x12, x11, #32
1029	fmov	d2, x10                               //CTR block 2
1030
1031	ld1	{v19.4s}, [x8], #16                                //load rk1
// Writing w11 clears the upper 32 bits of x11, leaving only the low word for the orr-based counter assembly below.
1032	orr	w11, w11, w11
1033	rev	w12, w12                                //rev_ctr32
1034
1035	fmov	d1, x10                               //CTR block 1
1036	add	w12, w12, #1                            //increment rev_ctr32
1037
1038	aese	v0.16b, v18.16b
1039	aesmc	v0.16b, v0.16b          //AES block 0 - round 0
1040	rev	w9, w12                                 //CTR block 1
1041
1042	orr	x9, x11, x9, lsl #32            //CTR block 1
1043	ld1	{v20.4s}, [x8], #16                                //load rk2
1044	add	w12, w12, #1                            //CTR block 1
1045
1046	fmov	v1.d[1], x9                               //CTR block 1
1047	rev	w9, w12                                 //CTR block 2
1048	add	w12, w12, #1                            //CTR block 2
1049
1050	aese	v0.16b, v19.16b
1051	aesmc	v0.16b, v0.16b          //AES block 0 - round 1
1052	orr	x9, x11, x9, lsl #32            //CTR block 2
1053
1054	fmov	v2.d[1], x9                               //CTR block 2
1055	rev	w9, w12                                 //CTR block 3
1056
1057	fmov	d3, x10                               //CTR block 3
1058	orr	x9, x11, x9, lsl #32            //CTR block 3
1059	add	w12, w12, #1                            //CTR block 3
1060
1061	fmov	v3.d[1], x9                               //CTR block 3
1062	add	x4, x0, x1, lsr #3   //end_input_ptr
1063
1064	aese	v1.16b, v18.16b
1065	aesmc	v1.16b, v1.16b          //AES block 1 - round 0
1066	ld1	{v21.4s}, [x8], #16                                //load rk3
1067
1068	aese	v0.16b, v20.16b
1069	aesmc	v0.16b, v0.16b          //AES block 0 - round 2
1070	ld1	{v22.4s}, [x8], #16                                //load rk4
1071
1072	aese	v2.16b, v18.16b
1073	aesmc	v2.16b, v2.16b          //AES block 2 - round 0
1074	ld1	{v23.4s}, [x8], #16                                //load rk5
1075
1076	aese	v1.16b, v19.16b
1077	aesmc	v1.16b, v1.16b          //AES block 1 - round 1
1078	ld1	{v24.4s}, [x8], #16                                //load rk6
1079
1080	aese	v3.16b, v18.16b
1081	aesmc	v3.16b, v3.16b          //AES block 3 - round 0
1082
1083	aese	v2.16b, v19.16b
1084	aesmc	v2.16b, v2.16b          //AES block 2 - round 1
1085
1086	aese	v1.16b, v20.16b
1087	aesmc	v1.16b, v1.16b          //AES block 1 - round 2
1088
1089	aese	v3.16b, v19.16b
1090	aesmc	v3.16b, v3.16b          //AES block 3 - round 1
// Load the current tag and bring it into the internal layout (halves swapped, bytes reversed within each half).
1091	ld1	{ v11.16b}, [x3]
1092	ext	v11.16b, v11.16b, v11.16b, #8
1093	rev64	v11.16b, v11.16b
1094
1095	aese	v0.16b, v21.16b
1096	aesmc	v0.16b, v0.16b          //AES block 0 - round 3
1097	ld1	{v25.4s}, [x8], #16                                //load rk7
1098
1099	aese	v1.16b, v21.16b
1100	aesmc	v1.16b, v1.16b          //AES block 1 - round 3
1101
1102	aese	v3.16b, v20.16b
1103	aesmc	v3.16b, v3.16b          //AES block 3 - round 2
1104
1105	aese	v2.16b, v20.16b
1106	aesmc	v2.16b, v2.16b          //AES block 2 - round 2
1107	ld1	{v26.4s}, [x8], #16                                //load rk8
1108
1109	aese	v1.16b, v22.16b
1110	aesmc	v1.16b, v1.16b          //AES block 1 - round 4
1111
1112	aese	v3.16b, v21.16b
1113	aesmc	v3.16b, v3.16b          //AES block 3 - round 3
1114
1115	aese	v2.16b, v21.16b
1116	aesmc	v2.16b, v2.16b          //AES block 2 - round 3
1117	ldr	q14, [x3, #80]                         //load h3l | h3h
1118#ifndef __AARCH64EB__
1119	ext	v14.16b, v14.16b, v14.16b, #8
1120#endif
1121	aese	v0.16b, v22.16b
1122	aesmc	v0.16b, v0.16b          //AES block 0 - round 4
1123	ld1	{v27.4s}, [x8], #16                                //load rk9
1124
1125	aese	v1.16b, v23.16b
1126	aesmc	v1.16b, v1.16b          //AES block 1 - round 5
1127
1128	aese	v2.16b, v22.16b
1129	aesmc	v2.16b, v2.16b          //AES block 2 - round 4
1130
1131	aese	v3.16b, v22.16b
1132	aesmc	v3.16b, v3.16b          //AES block 3 - round 4
1133
1134	aese	v0.16b, v23.16b
1135	aesmc	v0.16b, v0.16b          //AES block 0 - round 5
1136
1137	aese	v2.16b, v23.16b
1138	aesmc	v2.16b, v2.16b          //AES block 2 - round 5
1139	ldr	q12, [x3, #32]                         //load h1l | h1h
1140#ifndef __AARCH64EB__
1141	ext	v12.16b, v12.16b, v12.16b, #8
1142#endif
1143	aese	v3.16b, v23.16b
1144	aesmc	v3.16b, v3.16b          //AES block 3 - round 5
1145
1146	aese	v0.16b, v24.16b
1147	aesmc	v0.16b, v0.16b          //AES block 0 - round 6
1148
1149	aese	v1.16b, v24.16b
1150	aesmc	v1.16b, v1.16b          //AES block 1 - round 6
1151
1152	aese	v3.16b, v24.16b
1153	aesmc	v3.16b, v3.16b          //AES block 3 - round 6
1154
1155	aese	v2.16b, v24.16b
1156	aesmc	v2.16b, v2.16b          //AES block 2 - round 6
1157	trn1	v8.2d,    v12.2d,    v13.2d                      //h2h | h1h
1158
1159	ldr	q15, [x3, #112]                        //load h4l | h4h
1160#ifndef __AARCH64EB__
1161	ext	v15.16b, v15.16b, v15.16b, #8
1162#endif
1163	trn2	v16.2d,  v12.2d,    v13.2d                      //h2l | h1l
// x5 changes from a byte count into the input pointer value at which the main loop ends.
1164	add	x5, x5, x0
1165
1166	aese	v1.16b, v25.16b
1167	aesmc	v1.16b, v1.16b          //AES block 1 - round 7
1168
1169	aese	v2.16b, v25.16b
1170	aesmc	v2.16b, v2.16b          //AES block 2 - round 7
1171
1172	aese	v0.16b, v25.16b
1173	aesmc	v0.16b, v0.16b          //AES block 0 - round 7
1174	eor	v16.16b, v16.16b, v8.16b                     //h2k | h1k
1175
1176	aese	v3.16b, v25.16b
1177	aesmc	v3.16b, v3.16b          //AES block 3 - round 7
1178
1179	aese	v1.16b, v26.16b
1180	aesmc	v1.16b, v1.16b          //AES block 1 - round 8
1181	trn2	v17.2d,  v14.2d,    v15.2d                      //h4l | h3l
1182
1183	aese	v2.16b, v26.16b
1184	aesmc	v2.16b, v2.16b          //AES block 2 - round 8
1185
1186	aese	v3.16b, v26.16b
1187	aesmc	v3.16b, v3.16b          //AES block 3 - round 8
1188
1189	aese	v0.16b, v26.16b
1190	aesmc	v0.16b, v0.16b          //AES block 0 - round 8
1191	trn1	v9.2d, v14.2d,    v15.2d                      //h4h | h3h
1192
1193	aese	v2.16b, v27.16b                                      //AES block 2 - round 9
1194
1195	aese	v3.16b, v27.16b                                      //AES block 3 - round 9
1196
1197	aese	v0.16b, v27.16b                                      //AES block 0 - round 9
1198	cmp	x0, x5                   //check if we have <= 4 blocks
1199
1200	aese	v1.16b, v27.16b                                      //AES block 1 - round 9
1201	eor	v17.16b, v17.16b, v9.16b                  //h4k | h3k
// Uses the flags from the cmp above; the vector instructions in between do not modify them.
1202	b.ge	.L128_dec_tail                                    //handle tail
1203
// More than one group of four blocks: load ciphertext blocks 0-3 into v4-v7,
// finish and store blocks 0 and 1 (XOR with the AES state in vector registers,
// then with x13/x14 in general-purpose registers), leave block 2 pending in v2,
// and start building the next counter blocks.
1204	ld1	{v4.16b, v5.16b}, [x0], #32               //AES block 0 - load ciphertext; AES block 1 - load ciphertext
1205
1206	eor	v1.16b, v5.16b, v1.16b                            //AES block 1 - result
1207	ld1	{v6.16b}, [x0], #16                       //AES block 2 - load ciphertext
1208
1209	eor	v0.16b, v4.16b, v0.16b                            //AES block 0 - result
1210	rev64	v4.16b, v4.16b                                    //GHASH block 0
1211	rev	w9, w12                                 //CTR block 4
1212
1213	orr	x9, x11, x9, lsl #32            //CTR block 4
1214	add	w12, w12, #1                            //CTR block 4
1215	ld1	{v7.16b}, [x0], #16                       //AES block 3 - load ciphertext
1216
1217	rev64	v5.16b, v5.16b                                    //GHASH block 1
1218	mov	x19, v1.d[0]                            //AES block 1 - mov low
1219
1220	mov	x20, v1.d[1]                            //AES block 1 - mov high
1221
1222	mov	x6, v0.d[0]                            //AES block 0 - mov low
1223	cmp	x0, x5                   //check if we have <= 8 blocks
1224
1225	mov	x7, v0.d[1]                            //AES block 0 - mov high
1226
1227	fmov	d0, x10                               //CTR block 4
1228
1229	fmov	v0.d[1], x9                               //CTR block 4
1230	rev	w9, w12                                 //CTR block 5
1231	eor	x19, x19, x13                   //AES block 1 - round 10 low
1232#ifdef __AARCH64EB__
1233	rev	x19, x19
1234#endif
1235	fmov	d1, x10                               //CTR block 5
1236	add	w12, w12, #1                            //CTR block 5
1237	orr	x9, x11, x9, lsl #32            //CTR block 5
1238
1239	fmov	v1.d[1], x9                               //CTR block 5
1240	rev	w9, w12                                 //CTR block 6
1241	add	w12, w12, #1                            //CTR block 6
1242
1243	orr	x9, x11, x9, lsl #32            //CTR block 6
1244
1245	eor	x20, x20, x14                   //AES block 1 - round 10 high
1246#ifdef __AARCH64EB__
1247	rev	x20, x20
1248#endif
1249	eor	x6, x6, x13                   //AES block 0 - round 10 low
1250#ifdef __AARCH64EB__
1251	rev	x6, x6
1252#endif
1253	eor	v2.16b, v6.16b, v2.16b                            //AES block 2 - result
1254
1255	eor	x7, x7, x14                   //AES block 0 - round 10 high
1256#ifdef __AARCH64EB__
1257	rev	x7, x7
1258#endif
1259	stp	x6, x7, [x2], #16        //AES block 0 - store result
1260
1261	stp	x19, x20, [x2], #16        //AES block 1 - store result
// Uses the flags from the "cmp x0, x5" above; none of the intervening instructions set flags.
1262	b.ge	.L128_dec_prepretail                              //do prepretail
1263
// Decrypt main loop: one iteration per group of four 16-byte blocks, with the
// AES, GHASH and counter work interleaved instruction by instruction.
//
// On entry to each iteration:
//   v4..v7   ciphertext blocks of the previous group (v4/v5 already
//            byte-reversed for GHASH, v6/v7 not yet)
//   v2       result of block 4k+2 still in a vector register; v3 holds the
//            AES state for block 4k+3
//   v0, v1   counter blocks 4k+4 / 4k+5; x9 holds the upper half of counter
//            block 4k+6
//   v11      running tag
//
// Work done per iteration:
//   - blocks 4k+2 and 4k+3 are finished (moved to x21..x24, XORed with
//     x13/x14) and stored to [x2]
//   - v4, v5, v6, v7 are hashed with multipliers v15, v14, v13, v12
//     respectively; partial products go to v9 (high), v10 (mid), v11 (low) and
//     are then reduced with the constant 0xc2 << 56 built in d8 (v8 is used
//     as scratch before that)
//   - AES rounds 0-9 are run on the counter blocks in v0..v3
//   - four new ciphertext blocks are loaded into v4, v5, v6, v7
//   - results for blocks 4k+4 and 4k+5 are computed and stored to [x2]
//   - w12 is incremented four times and the next counter blocks are prepared
//
// The loop repeats while x0 < x5 (signed compare set by the cmp marked
// ".LOOP CONTROL"; no later instruction in the loop body modifies the flags).
1264.L128_dec_main_loop:	//main	loop start
1265	eor	v3.16b, v7.16b, v3.16b                            //AES block 4k+3 - result
1266	ext	v11.16b, v11.16b, v11.16b, #8                     //PRE 0
1267	mov	x21, v2.d[0]                            //AES block 4k+2 - mov low
1268
1269	pmull2	v28.1q, v5.2d, v14.2d                          //GHASH block 4k+1 - high
1270	mov	x22, v2.d[1]                            //AES block 4k+2 - mov high
1271
1272	aese	v1.16b, v18.16b
1273	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 0
1274	fmov	d2, x10                               //CTR block 4k+6
1275
1276	rev64	v6.16b, v6.16b                                    //GHASH block 4k+2
1277	fmov	v2.d[1], x9                               //CTR block 4k+6
1278	rev	w9, w12                                 //CTR block 4k+7
1279
1280	mov	x23, v3.d[0]                            //AES block 4k+3 - mov low
1281	eor	v4.16b, v4.16b, v11.16b                           //PRE 1
1282	mov	d30, v5.d[1]                                  //GHASH block 4k+1 - mid
1283
1284	aese	v1.16b, v19.16b
1285	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 1
1286	rev64	v7.16b, v7.16b                                    //GHASH block 4k+3
1287
1288	pmull	v29.1q, v5.1d, v14.1d                          //GHASH block 4k+1 - low
1289	mov	x24, v3.d[1]                            //AES block 4k+3 - mov high
1290	orr	x9, x11, x9, lsl #32            //CTR block 4k+7
1291
1292	pmull	v11.1q, v4.1d, v15.1d                       //GHASH block 4k - low
1293	fmov	d3, x10                               //CTR block 4k+7
1294	eor	v30.8b, v30.8b, v5.8b                          //GHASH block 4k+1 - mid
1295
1296	aese	v1.16b, v20.16b
1297	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 2
1298	fmov	v3.d[1], x9                               //CTR block 4k+7
1299
1300	aese	v2.16b, v18.16b
1301	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 0
1302	mov	d10, v17.d[1]                               //GHASH block 4k - mid
1303
1304	pmull2	v9.1q, v4.2d, v15.2d                       //GHASH block 4k - high
1305	eor	v11.16b, v11.16b, v29.16b                         //GHASH block 4k+1 - low
1306
1307	pmull	v29.1q, v7.1d, v12.1d                          //GHASH block 4k+3 - low
1308
1309	aese	v1.16b, v21.16b
1310	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 3
1311	mov	d8, v4.d[1]                                  //GHASH block 4k - mid
1312
1313	aese	v3.16b, v18.16b
1314	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 0
1315	eor	v9.16b, v9.16b, v28.16b                         //GHASH block 4k+1 - high
1316
1317	aese	v0.16b, v18.16b
1318	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 0
1319
1320	pmull	v28.1q, v6.1d, v13.1d                          //GHASH block 4k+2 - low
1321	eor	v8.8b, v8.8b, v4.8b                          //GHASH block 4k - mid
1322
1323	aese	v3.16b, v19.16b
1324	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 1
1325	eor	x23, x23, x13                   //AES block 4k+3 - round 10 low
1326#ifdef __AARCH64EB__
1327	rev	x23, x23
1328#endif
1329	pmull	v30.1q, v30.1d, v17.1d                          //GHASH block 4k+1 - mid
1330	eor	x22, x22, x14                   //AES block 4k+2 - round 10 high
1331#ifdef __AARCH64EB__
1332	rev	x22, x22
1333#endif
1334	mov	d31, v6.d[1]                                  //GHASH block 4k+2 - mid
1335
1336	aese	v0.16b, v19.16b
1337	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 1
1338	eor	v11.16b, v11.16b, v28.16b                         //GHASH block 4k+2 - low
1339
1340	pmull	v10.1q, v8.1d, v10.1d                      //GHASH block 4k - mid
1341
1342	aese	v3.16b, v20.16b
1343	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 2
1344	eor	v31.8b, v31.8b, v6.8b                          //GHASH block 4k+2 - mid
1345
1346	aese	v0.16b, v20.16b
1347	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 2
1348
1349	aese	v1.16b, v22.16b
1350	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 4
1351	eor	v10.16b, v10.16b, v30.16b                         //GHASH block 4k+1 - mid
1352
1353	pmull2	v8.1q, v6.2d, v13.2d                          //GHASH block 4k+2 - high
1354
1355	aese	v0.16b, v21.16b
1356	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 3
1357	ins	v31.d[1], v31.d[0]                                //GHASH block 4k+2 - mid
1358
1359	pmull2	v4.1q, v7.2d, v12.2d                          //GHASH block 4k+3 - high
1360
1361	aese	v2.16b, v19.16b
1362	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 1
1363	mov	d30, v7.d[1]                                  //GHASH block 4k+3 - mid
1364
1365	aese	v0.16b, v22.16b
1366	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 4
1367	eor	v9.16b, v9.16b, v8.16b                         //GHASH block 4k+2 - high
1368
1369	pmull2	v31.1q, v31.2d, v16.2d                          //GHASH block 4k+2 - mid
1370	eor	x24, x24, x14                   //AES block 4k+3 - round 10 high
1371#ifdef __AARCH64EB__
1372	rev	x24, x24
1373#endif
1374	aese	v2.16b, v20.16b
1375	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 2
1376	eor	v30.8b, v30.8b, v7.8b                          //GHASH block 4k+3 - mid
1377
1378	aese	v1.16b, v23.16b
1379	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 5
1380	eor	x21, x21, x13                   //AES block 4k+2 - round 10 low
1381#ifdef __AARCH64EB__
1382	rev	x21, x21
1383#endif
1384	aese	v0.16b, v23.16b
1385	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 5
// v8 is done as GHASH scratch; from here it holds the reduction constant (shifted into place by the shl below).
1386	movi	v8.8b, #0xc2
1387
1388	aese	v2.16b, v21.16b
1389	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 3
1390	eor	v11.16b, v11.16b, v29.16b                         //GHASH block 4k+3 - low
1391
1392	aese	v1.16b, v24.16b
1393	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 6
1394
1395	aese	v0.16b, v24.16b
1396	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 6
1397	eor	v10.16b, v10.16b, v31.16b                         //GHASH block 4k+2 - mid
1398
1399	aese	v2.16b, v22.16b
1400	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 4
1401	stp	x21, x22, [x2], #16        //AES block 4k+2 - store result
1402
1403	pmull	v30.1q, v30.1d, v16.1d                          //GHASH block 4k+3 - mid
1404	eor	v9.16b, v9.16b, v4.16b                         //GHASH block 4k+3 - high
// First of the four ciphertext loads for the next group (into v4, then v5, v6, v7 below).
1405	ld1	{v4.16b}, [x0], #16                       //AES block 4k+3 - load ciphertext
1406
1407	aese	v1.16b, v25.16b
1408	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 7
1409	add	w12, w12, #1                            //CTR block 4k+7
1410
1411	aese	v0.16b, v25.16b
1412	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 7
1413	shl	d8, d8, #56               //mod_constant
1414
1415	aese	v2.16b, v23.16b
1416	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 5
1417	eor	v10.16b, v10.16b, v30.16b                         //GHASH block 4k+3 - mid
1418
1419	aese	v1.16b, v26.16b
1420	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 8
1421	stp	x23, x24, [x2], #16        //AES block 4k+3 - store result
1422
1423	aese	v0.16b, v26.16b
1424	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 8
1425	eor	v30.16b, v11.16b, v9.16b                         //MODULO - karatsuba tidy up
1426
1427	aese	v3.16b, v21.16b
1428	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 3
1429	rev	w9, w12                                 //CTR block 4k+8
1430
1431	pmull	v31.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid
1432	ld1	{v5.16b}, [x0], #16                       //AES block 4k+4 - load ciphertext
1433	ext	v9.16b, v9.16b, v9.16b, #8                     //MODULO - other top alignment
1434
1435	aese	v0.16b, v27.16b                                      //AES block 4k+4 - round 9
1436	orr	x9, x11, x9, lsl #32            //CTR block 4k+8
1437
1438	aese	v3.16b, v22.16b
1439	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 4
1440	eor	v10.16b, v10.16b, v30.16b                         //MODULO - karatsuba tidy up
1441
1442	aese	v1.16b, v27.16b                                      //AES block 4k+5 - round 9
1443
1444	aese	v2.16b, v24.16b
1445	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 6
1446	eor	v0.16b, v4.16b, v0.16b                            //AES block 4k+4 - result
1447
1448	aese	v3.16b, v23.16b
1449	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 5
1450	ld1	{v6.16b}, [x0], #16                       //AES block 4k+5 - load ciphertext
1451
1452	add	w12, w12, #1                            //CTR block 4k+8
1453	eor	v10.16b, v10.16b, v31.16b                      //MODULO - fold into mid
1454	eor	v1.16b, v5.16b, v1.16b                            //AES block 4k+5 - result
1455
1456	aese	v2.16b, v25.16b
1457	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 7
1458	ld1	{v7.16b}, [x0], #16                       //AES block 4k+6 - load ciphertext
1459
1460	aese	v3.16b, v24.16b
1461	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 6
1462
1463	rev64	v5.16b, v5.16b                                    //GHASH block 4k+5
1464	eor	v10.16b, v10.16b, v9.16b                         //MODULO - fold into mid
1465	mov	x7, v0.d[1]                            //AES block 4k+4 - mov high
1466
1467	aese	v2.16b, v26.16b
1468	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 8
1469	mov	x6, v0.d[0]                            //AES block 4k+4 - mov low
1470
1471	aese	v3.16b, v25.16b
1472	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 7
1473	fmov	d0, x10                               //CTR block 4k+8
1474
1475	pmull	v8.1q, v10.1d, v8.1d     //MODULO - mid 64b align with low
1476	fmov	v0.d[1], x9                               //CTR block 4k+8
1477	rev	w9, w12                                 //CTR block 4k+9
1478
1479	aese	v2.16b, v27.16b                                      //AES block 4k+6 - round 9
1480	orr	x9, x11, x9, lsl #32            //CTR block 4k+9
1481	ext	v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment
1482
1483	aese	v3.16b, v26.16b
1484	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 8
1485	eor	x7, x7, x14                   //AES block 4k+4 - round 10 high
1486#ifdef __AARCH64EB__
1487	rev	x7, x7
1488#endif
1489	eor	v11.16b, v11.16b, v8.16b               //MODULO - fold into low
1490	mov	x20, v1.d[1]                            //AES block 4k+5 - mov high
1491	eor	x6, x6, x13                   //AES block 4k+4 - round 10 low
1492#ifdef __AARCH64EB__
1493	rev	x6, x6
1494#endif
1495	eor	v2.16b, v6.16b, v2.16b                            //AES block 4k+6 - result
1496	mov	x19, v1.d[0]                            //AES block 4k+5 - mov low
1497	add	w12, w12, #1                            //CTR block 4k+9
1498
1499	aese	v3.16b, v27.16b                                      //AES block 4k+7 - round 9
1500	fmov	d1, x10                               //CTR block 4k+9
// Loop condition: flags set here are consumed by the b.lt at the bottom of the loop.
1501	cmp	x0, x5                   //.LOOP CONTROL
1502
1503	rev64	v4.16b, v4.16b                                    //GHASH block 4k+4
1504	eor	v11.16b, v11.16b, v10.16b                         //MODULO - fold into low
1505	fmov	v1.d[1], x9                               //CTR block 4k+9
1506
1507	rev	w9, w12                                 //CTR block 4k+10
1508	add	w12, w12, #1                            //CTR block 4k+10
1509
1510	eor	x20, x20, x14                   //AES block 4k+5 - round 10 high
1511#ifdef __AARCH64EB__
1512	rev	x20, x20
1513#endif
1514	stp	x6, x7, [x2], #16        //AES block 4k+4 - store result
1515
1516	eor	x19, x19, x13                   //AES block 4k+5 - round 10 low
1517#ifdef __AARCH64EB__
1518	rev	x19, x19
1519#endif
1520	stp	x19, x20, [x2], #16        //AES block 4k+5 - store result
1521
1522	orr	x9, x11, x9, lsl #32            //CTR block 4k+10
1523	b.lt	.L128_dec_main_loop
1524
1525.L128_dec_prepretail:	//PREPRETAIL
1526	ext	v11.16b, v11.16b, v11.16b, #8                     //PRE 0
1527	mov	x21, v2.d[0]                            //AES block 4k+2 - mov low
1528	mov	d30, v5.d[1]                                  //GHASH block 4k+1 - mid
1529
1530	aese	v0.16b, v18.16b
1531	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 0
1532	eor	v3.16b, v7.16b, v3.16b                            //AES block 4k+3 - result
1533
1534	aese	v1.16b, v18.16b
1535	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 0
1536	mov	x22, v2.d[1]                            //AES block 4k+2 - mov high
1537
1538	eor	v4.16b, v4.16b, v11.16b                           //PRE 1
1539	fmov	d2, x10                               //CTR block 4k+6
1540	rev64	v6.16b, v6.16b                                    //GHASH block 4k+2
1541
1542	aese	v0.16b, v19.16b
1543	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 1
1544	fmov	v2.d[1], x9                               //CTR block 4k+6
1545
1546	rev	w9, w12                                 //CTR block 4k+7
1547	mov	x23, v3.d[0]                            //AES block 4k+3 - mov low
1548	eor	v30.8b, v30.8b, v5.8b                          //GHASH block 4k+1 - mid
1549
1550	pmull	v11.1q, v4.1d, v15.1d                       //GHASH block 4k - low
1551	mov	d10, v17.d[1]                               //GHASH block 4k - mid
1552	mov	x24, v3.d[1]                            //AES block 4k+3 - mov high
1553
1554	aese	v1.16b, v19.16b
1555	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 1
1556	mov	d31, v6.d[1]                                  //GHASH block 4k+2 - mid
1557
1558	aese	v0.16b, v20.16b
1559	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 2
1560	orr	x9, x11, x9, lsl #32            //CTR block 4k+7
1561
1562	pmull	v29.1q, v5.1d, v14.1d                          //GHASH block 4k+1 - low
1563	mov	d8, v4.d[1]                                  //GHASH block 4k - mid
1564	fmov	d3, x10                               //CTR block 4k+7
1565
1566	aese	v2.16b, v18.16b
1567	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 0
1568	fmov	v3.d[1], x9                               //CTR block 4k+7
1569
1570	pmull	v30.1q, v30.1d, v17.1d                          //GHASH block 4k+1 - mid
1571	eor	v31.8b, v31.8b, v6.8b                          //GHASH block 4k+2 - mid
1572
1573	rev64	v7.16b, v7.16b                                    //GHASH block 4k+3
1574
1575	aese	v2.16b, v19.16b
1576	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 1
1577	eor	v8.8b, v8.8b, v4.8b                          //GHASH block 4k - mid
1578
1579	pmull2	v9.1q, v4.2d, v15.2d                       //GHASH block 4k - high
1580
1581	aese	v3.16b, v18.16b
1582	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 0
1583	ins	v31.d[1], v31.d[0]                                //GHASH block 4k+2 - mid
1584
1585	pmull2	v28.1q, v5.2d, v14.2d                          //GHASH block 4k+1 - high
1586
1587	pmull	v10.1q, v8.1d, v10.1d                      //GHASH block 4k - mid
1588	eor	v11.16b, v11.16b, v29.16b                         //GHASH block 4k+1 - low
1589
1590	pmull	v29.1q, v7.1d, v12.1d                          //GHASH block 4k+3 - low
1591
1592	pmull2	v31.1q, v31.2d, v16.2d                          //GHASH block 4k+2 - mid
1593	eor	v9.16b, v9.16b, v28.16b                         //GHASH block 4k+1 - high
1594
1595	eor	v10.16b, v10.16b, v30.16b                         //GHASH block 4k+1 - mid
1596
1597	pmull2	v4.1q, v7.2d, v12.2d                          //GHASH block 4k+3 - high
1598
1599	pmull2	v8.1q, v6.2d, v13.2d                          //GHASH block 4k+2 - high
1600	mov	d30, v7.d[1]                                  //GHASH block 4k+3 - mid
1601
1602	aese	v1.16b, v20.16b
1603	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 2
1604	eor	v10.16b, v10.16b, v31.16b                         //GHASH block 4k+2 - mid
1605
1606	pmull	v28.1q, v6.1d, v13.1d                          //GHASH block 4k+2 - low
1607
1608	eor	v9.16b, v9.16b, v8.16b                         //GHASH block 4k+2 - high
1609	movi	v8.8b, #0xc2
1610
1611	aese	v3.16b, v19.16b
1612	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 1
1613	eor	v30.8b, v30.8b, v7.8b                          //GHASH block 4k+3 - mid
1614
1615	eor	v11.16b, v11.16b, v28.16b                         //GHASH block 4k+2 - low
1616
1617	aese	v2.16b, v20.16b
1618	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 2
1619	eor	v9.16b, v9.16b, v4.16b                         //GHASH block 4k+3 - high
1620
1621	aese	v3.16b, v20.16b
1622	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 2
1623	eor	x23, x23, x13                   //AES block 4k+3 - round 10 low
1624#ifdef __AARCH64EB__
1625	rev	x23, x23
1626#endif
1627	pmull	v30.1q, v30.1d, v16.1d                          //GHASH block 4k+3 - mid
1628	eor	x21, x21, x13                   //AES block 4k+2 - round 10 low
1629#ifdef __AARCH64EB__
1630	rev	x21, x21
1631#endif
1632	eor	v11.16b, v11.16b, v29.16b                         //GHASH block 4k+3 - low
1633
1634	aese	v2.16b, v21.16b
1635	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 3
1636
1637	aese	v1.16b, v21.16b
1638	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 3
1639	shl	d8, d8, #56               //mod_constant
1640
1641	aese	v0.16b, v21.16b
1642	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 3
1643
1644	aese	v2.16b, v22.16b
1645	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 4
1646	eor	v10.16b, v10.16b, v30.16b                         //GHASH block 4k+3 - mid
1647
1648	aese	v1.16b, v22.16b
1649	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 4
1650
1651	aese	v3.16b, v21.16b
1652	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 3
1653	eor	v30.16b, v11.16b, v9.16b                         //MODULO - karatsuba tidy up
1654
1655	aese	v2.16b, v23.16b
1656	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 5
1657
1658	aese	v1.16b, v23.16b
1659	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 5
1660
1661	aese	v3.16b, v22.16b
1662	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 4
1663
1664	aese	v0.16b, v22.16b
1665	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 4
1666	eor	v10.16b, v10.16b, v30.16b                         //MODULO - karatsuba tidy up
1667
1668	pmull	v31.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid
1669
1670	aese	v1.16b, v24.16b
1671	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 6
1672	ext	v9.16b, v9.16b, v9.16b, #8                     //MODULO - other top alignment
1673
1674	aese	v3.16b, v23.16b
1675	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 5
1676
1677	aese	v0.16b, v23.16b
1678	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 5
1679	eor	v10.16b, v10.16b, v31.16b                      //MODULO - fold into mid
1680
1681	aese	v1.16b, v25.16b
1682	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 7
1683
1684	aese	v2.16b, v24.16b
1685	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 6
1686
1687	aese	v0.16b, v24.16b
1688	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 6
1689
1690	aese	v1.16b, v26.16b
1691	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 8
1692	eor	v10.16b, v10.16b, v9.16b                         //MODULO - fold into mid
1693
1694	aese	v3.16b, v24.16b
1695	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 6
1696
1697	aese	v0.16b, v25.16b
1698	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 7
1699
1700	aese	v1.16b, v27.16b                                      //AES block 4k+5 - round 9
1701
1702	pmull	v8.1q, v10.1d, v8.1d     //MODULO - mid 64b align with low
1703	eor	x24, x24, x14                   //AES block 4k+3 - round 10 high
1704#ifdef __AARCH64EB__
1705	rev	x24, x24
1706#endif
1707	aese	v2.16b, v25.16b
1708	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 7
1709	ext	v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment
1710
1711	aese	v3.16b, v25.16b
1712	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 7
1713
1714	aese	v0.16b, v26.16b
1715	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 8
1716	eor	v11.16b, v11.16b, v8.16b               //MODULO - fold into low
1717
1718	aese	v2.16b, v26.16b
1719	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 8
1720
1721	aese	v3.16b, v26.16b
1722	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 8
1723	eor	x22, x22, x14                   //AES block 4k+2 - round 10 high
1724#ifdef __AARCH64EB__
1725	rev	x22, x22
1726#endif
1727	aese	v0.16b, v27.16b                                      //AES block 4k+4 - round 9
1728	stp	x21, x22, [x2], #16        //AES block 4k+2 - store result
1729
1730	aese	v2.16b, v27.16b                                      //AES block 4k+6 - round 9
1731	add	w12, w12, #1                            //CTR block 4k+7
1732	stp	x23, x24, [x2], #16        //AES block 4k+3 - store result
1733
1734	aese	v3.16b, v27.16b                                      //AES block 4k+7 - round 9
1735	eor	v11.16b, v11.16b, v10.16b                         //MODULO - fold into low
// Tail entry: decrypt the first remaining block and dispatch on how many bytes are left.
// v0..v3 hold the AES state after round 9 for consecutive counter blocks (see the
// "round 9" aese instructions directly above); the last round key is applied in
// general-purpose registers via x13 (low) / x14 (high).
// On the shorter paths the unused states are shifted down (v2 <- v1, v3 <- v2 or v1)
// and w12 is decremented once for every state that will not be consumed.
// NOTE(review): x4 is presumably end_input_ptr as in the enc kernels; its setup is out of view.
1736.L128_dec_tail:	//TAIL
1737
1738	sub	x5, x4, x0   //x5 (was main_end_input_ptr) = x4 - x0 = number of bytes left to process
1739	ld1	{ v5.16b}, [x0], #16                      //AES block 4k+4 - load ciphertext
1740
1741	eor	v0.16b, v5.16b, v0.16b                            //AES block 4k+4 - result
1742
1743	mov	x7, v0.d[1]                            //AES block 4k+4 - mov high
1744
1745	mov	x6, v0.d[0]                            //AES block 4k+4 - mov low
1746
1747	cmp	x5, #48                     //flags consumed by the b.gt below; nothing in between writes flags
1748
1749	eor	x7, x7, x14                   //AES block 4k+4 - round 10 high
1750#ifdef __AARCH64EB__
1751	rev	x7, x7
1752#endif
1753	ext	v8.16b, v11.16b, v11.16b, #8                     //prepare final partial tag (copy kept in v8, so v11 may be cleared below)
1754	eor	x6, x6, x13                   //AES block 4k+4 - round 10 low
1755#ifdef __AARCH64EB__
1756	rev	x6, x6
1757#endif
1758	b.gt	.L128_dec_blocks_more_than_3	//more than 48 bytes left
1759
1760	mov	v3.16b, v2.16b                  //at most 3 blocks: shift AES state down one slot
1761	sub	w12, w12, #1                    //one prepared counter block will not be used
1762	movi	v11.8b, #0                      //GHASH low accumulator = 0 (the more_than_3 section that assigns it is skipped)
1763
1764	movi	v9.8b, #0                       //GHASH high accumulator = 0
1765	mov	v2.16b, v1.16b
1766
1767	movi	v10.8b, #0                      //GHASH mid accumulator = 0
1768	cmp	x5, #32
1769	b.gt	.L128_dec_blocks_more_than_2	//more than 32 bytes left
1770
1771	cmp	x5, #16                     //flags consumed by the b.gt below (mov/sub do not write flags)
1772
1773	mov	v3.16b, v1.16b                  //at most 2 blocks: the next block uses the state originally in v1
1774	sub	w12, w12, #1                    //second unused counter block
1775	b.gt	.L128_dec_blocks_more_than_1	//more than 16 bytes left
1776
1777	sub	w12, w12, #1                    //at most 1 block: third unused counter block
1778	b	.L128_dec_blocks_less_than_1
// More than 3 blocks left.
// On entry v5 holds the ciphertext of the block whose plaintext is pending in x6:x7.
// This section stores that plaintext, GHASHes the ciphertext (multiplying by v15, with
// the mid term taken from the high half of v17), and decrypts the next block with the
// AES state in v1.
// The accumulators v11 (low), v9 (high) and v10 (mid) are assigned here, not xor-ed,
// which is why the shorter paths clear them instead.
// v22 is read as the round 4 aese operand in the loop above and is overwritten here as scratch.
// Falls through into .L128_dec_blocks_more_than_2.
1779.L128_dec_blocks_more_than_3:	//blocks left > 3
1780	rev64	v4.16b, v5.16b                                    //GHASH final-3 block
1781	ld1	{ v5.16b}, [x0], #16                      //AES final-2 block - load ciphertext
1782
1783	eor	v4.16b, v4.16b, v8.16b                           //feed in partial tag
1784
1785	mov	d10, v17.d[1]                               //GHASH final-3 block - mid
1786	stp	x6, x7, [x2], #16        //AES final-3 block  - store result
1787	eor	v0.16b, v5.16b, v1.16b                            //AES final-2 block - result
1788
1789	mov	d22, v4.d[1]                                 //GHASH final-3 block - mid
1790	mov	x7, v0.d[1]                            //AES final-2 block - mov high
1791
1792	pmull	v11.1q, v4.1d, v15.1d                       //GHASH final-3 block - low
1793	mov	x6, v0.d[0]                            //AES final-2 block - mov low
1794
1795	pmull2	v9.1q, v4.2d, v15.2d                       //GHASH final-3 block - high
1796
1797	eor	v22.8b, v22.8b, v4.8b                      //GHASH final-3 block - mid
1798
1799	movi	v8.8b, #0                                        //suppress further partial tag feed in
1800	eor	x7, x7, x14                   //AES final-2 block - round 10 high
1801#ifdef __AARCH64EB__
1802	rev	x7, x7
1803#endif
1804	pmull	v10.1q, v22.1d, v10.1d                    //GHASH final-3 block - mid
1805	eor	x6, x6, x13                   //AES final-2 block - round 10 low
1806#ifdef __AARCH64EB__
1807	rev	x6, x6
1808#endif
// More than 2 blocks left.
// On entry v5 holds the ciphertext of the block whose plaintext is pending in x6:x7.
// This section stores that plaintext, GHASHes the ciphertext (multiplying by v14, with
// the mid term using the low half of v17) and xors the products into v11/v9/v10.
// It then decrypts the next block with the AES state in v2.
// v8 is the saved partial tag on the first section executed and zero afterwards.
// v20/v21/v22 are read as the round 2/3/4 aese operands in the loop above and are
// overwritten here as scratch.
// Falls through into .L128_dec_blocks_more_than_1.
1809.L128_dec_blocks_more_than_2:	//blocks left > 2
1810
1811	rev64	v4.16b, v5.16b                                    //GHASH final-2 block
1812	ld1	{ v5.16b}, [x0], #16                      //AES final-1 block - load ciphertext
1813
1814	eor	v4.16b, v4.16b, v8.16b                           //feed in partial tag
1815
1816	eor	v0.16b, v5.16b, v2.16b                            //AES final-1 block - result
1817	stp	x6, x7, [x2], #16        //AES final-2 block  - store result
1818
1819	mov	d22, v4.d[1]                                 //GHASH final-2 block - mid
1820
1821	pmull	v21.1q, v4.1d, v14.1d                          //GHASH final-2 block - low
1822
1823	pmull2	v20.1q, v4.2d, v14.2d                          //GHASH final-2 block - high
1824	mov	x6, v0.d[0]                            //AES final-1 block - mov low
1825
1826	mov	x7, v0.d[1]                            //AES final-1 block - mov high
1827	eor	v22.8b, v22.8b, v4.8b                      //GHASH final-2 block - mid
1828
1829	movi	v8.8b, #0                                        //suppress further partial tag feed in
1830
1831	pmull	v22.1q, v22.1d, v17.1d                      //GHASH final-2 block - mid
1832
1833	eor	x6, x6, x13                   //AES final-1 block - round 10 low
1834#ifdef __AARCH64EB__
1835	rev	x6, x6
1836#endif
1837	eor	v11.16b, v11.16b, v21.16b                            //GHASH final-2 block - low
1838
1839	eor	v9.16b, v9.16b, v20.16b                            //GHASH final-2 block - high
1840
1841	eor	v10.16b, v10.16b, v22.16b                       //GHASH final-2 block - mid
1842	eor	x7, x7, x14                   //AES final-1 block - round 10 high
1843#ifdef __AARCH64EB__
1844	rev	x7, x7
1845#endif
// More than 1 block left.
// On entry v5 holds the ciphertext of the block whose plaintext is pending in x6:x7.
// This section stores that plaintext, GHASHes the ciphertext (multiplying by v13, with
// the mid term using the high half of v16 via pmull2) and xors the products into
// v11/v9/v10.
// It then decrypts the final block with the AES state in v3.
// The final block's plaintext is left in x6:x7 and is NOT stored here: the following
// section masks it before writing.
// Falls through into .L128_dec_blocks_less_than_1.
1846.L128_dec_blocks_more_than_1:	//blocks left > 1
1847
1848	rev64	v4.16b, v5.16b                                    //GHASH final-1 block
1849
1850	ld1	{ v5.16b}, [x0], #16                      //AES final block - load ciphertext
1851	eor	v4.16b, v4.16b, v8.16b                           //feed in partial tag
1852
1853	mov	d22, v4.d[1]                                 //GHASH final-1 block - mid
1854
1855	eor	v0.16b, v5.16b, v3.16b                            //AES final block - result
1856
1857	eor	v22.8b, v22.8b, v4.8b                      //GHASH final-1 block - mid
1858
1859	stp	x6, x7, [x2], #16        //AES final-1 block  - store result
1860	mov	x6, v0.d[0]                            //AES final block - mov low
1861
1862	mov	x7, v0.d[1]                            //AES final block - mov high
1863	ins	v22.d[1], v22.d[0]                            //GHASH final-1 block - mid (duplicate into the high half for the pmull2 below)
1864
1865	pmull	v21.1q, v4.1d, v13.1d                          //GHASH final-1 block - low
1866
1867	pmull2	v20.1q, v4.2d, v13.2d                          //GHASH final-1 block - high
1868
1869	pmull2	v22.1q, v22.2d, v16.2d                      //GHASH final-1 block - mid
1870	movi	v8.8b, #0                                        //suppress further partial tag feed in
1871
1872	eor	v11.16b, v11.16b, v21.16b                            //GHASH final-1 block - low
1873
1874	eor	v9.16b, v9.16b, v20.16b                            //GHASH final-1 block - high
1875	eor	x7, x7, x14                   //AES final block - round 10 high
1876#ifdef __AARCH64EB__
1877	rev	x7, x7
1878#endif
1879	eor	x6, x6, x13                   //AES final block - round 10 low
1880#ifdef __AARCH64EB__
1881	rev	x6, x6
1882#endif
1883	eor	v10.16b, v10.16b, v22.16b                       //GHASH final-1 block - mid
// Final (possibly partial) block, GHASH reduction, write-back and epilogue.
// On entry v5 holds the last ciphertext block and x6:x7 the corresponding plaintext.
// A 128-bit mask is built in x9 (low 64 bits) : x10 (high 64 bits) from the bit length
// in x1. It zeroes the unused part of the ciphertext before GHASH, and merges the
// plaintext with the bytes already present at [x2].
// NOTE(review): the ldp/stp on [x2] always access 16 bytes, even for a partial block;
// bytes outside the mask are written back with the values just read. Callers must
// therefore provide 16 accessible bytes there - confirm this is intended.
1884.L128_dec_blocks_less_than_1:	//blocks left <= 1
1885
1886	mvn	x14, xzr                                      //x14 (rk10_h) is no longer needed as a key; reuse it as all-ones mask seed
1887	and	x1, x1, #127                    //bit_length %= 128
1888
1889	mvn	x13, xzr                                      //x13 (rk10_l) likewise becomes all-ones
1890	sub	x1, x1, #128                    //bit_length -= 128
1891
1892	neg	x1, x1                          //bit_length = 128 - #bits in input (in range [1,128])
1893
1894	and	x1, x1, #127                    //128 -> 0: x1 = number of unused bits in the last block, in [0,127]
1895
1896	lsr	x14, x14, x1                     //all-ones >> (x1 mod 64); register shifts use the amount modulo 64
1897	cmp	x1, #64
1898
1899	csel	x10, x14, xzr, lt               //high mask: shifted ones if fewer than 64 unused bits, else 0
1900	csel	x9, x13, x14, lt                //low mask: all ones if fewer than 64 unused bits, else the shifted ones
1901
1902	fmov	d0, x9                                   //v0 (ctr0b) is reused as the mask for the last block
1903
1904	mov	v0.d[1], x10
1905
1906	and	v5.16b, v5.16b, v0.16b                            //possibly partial last block has zeroes in highest bits
1907
1908	rev64	v4.16b, v5.16b                                    //GHASH final block
1909
1910	eor	v4.16b, v4.16b, v8.16b                           //feed in partial tag
1911
1912	ldp	x4, x5, [x2] //load existing bytes we need to not overwrite (full 16 bytes)
1913
1914	and	x7, x7, x10                     //keep only the valid high plaintext bits
1915
1916	pmull2	v20.1q, v4.2d, v12.2d                          //GHASH final block - high
1917	mov	d8, v4.d[1]                                  //GHASH final block - mid
1918
1919	eor	v8.8b, v8.8b, v4.8b                          //GHASH final block - mid
1920	eor	v9.16b, v9.16b, v20.16b                            //GHASH final block - high
1921
1922	pmull	v8.1q, v8.1d, v16.1d                          //GHASH final block - mid
1923
1924	pmull	v21.1q, v4.1d, v12.1d                          //GHASH final block - low
1925	bic	x4, x4, x9           //mask out low existing bytes
1926	and	x6, x6, x9                      //keep only the valid low plaintext bits
1927
1928#ifndef __AARCH64EB__
1929	rev	w9, w12                         //x9 is dead as a mask from here on; w9 = byte-swapped counter word for the str below
1930#else
1931	mov	w9, w12                         //big-endian build: counter word is stored without a byte swap
1932#endif
1933
1934	eor	v10.16b, v10.16b, v8.16b                         //GHASH final block - mid
1935	movi	v8.8b, #0xc2                    //every byte of d8 = 0xc2; the shl below leaves 0xc200000000000000
1936
1937	eor	v11.16b, v11.16b, v21.16b                            //GHASH final block - low
1938
1939	bic	x5, x5, x10   //mask out high existing bytes
1940	shl	d8, d8, #56               //mod_constant
1941
1942	eor	v30.16b, v11.16b, v9.16b                         //MODULO - karatsuba tidy up
1943
1944	pmull	v31.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid
1945
1946	eor	v10.16b, v10.16b, v30.16b                         //MODULO - karatsuba tidy up
1947
1948	orr	x6, x6, x4                      //merge masked plaintext with the preserved existing low bytes
1949	str	w9, [x16, #12]                          //store the updated counter
1950
1951	orr	x7, x7, x5                      //merge masked plaintext with the preserved existing high bytes
1952	stp	x6, x7, [x2]                    //write the final 16 bytes (no post-increment)
1953	ext	v9.16b, v9.16b, v9.16b, #8                     //MODULO - other top alignment
1954
1955	eor	v10.16b, v10.16b, v31.16b                      //MODULO - fold into mid
1956
1957	eor	v10.16b, v10.16b, v9.16b                         //MODULO - fold into mid
1958
1959	pmull	v8.1q, v10.1d, v8.1d     //MODULO - mid 64b align with low
1960	ext	v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment
1961
1962	eor	v11.16b, v11.16b, v8.16b               //MODULO - fold into low
1963
1964	eor	v11.16b, v11.16b, v10.16b                         //MODULO - fold into low
1965	ext	v11.16b, v11.16b, v11.16b, #8   //same ext #8 + rev64 pair that aes_gcm_enc_192_kernel applies after loading v11 from [x3]
1966	rev64	v11.16b, v11.16b
1967	mov	x0, x15                         //return value = x15 (presumably byte_len saved at entry, as in the enc kernels - entry is out of view)
1968	st1	{ v11.16b }, [x3]               //store the updated GHASH state back to [x3]
1969
1970	ldp	x21, x22, [sp, #16]             //epilogue: reload the registers kept in the 112-byte stack frame
1971	ldp	x23, x24, [sp, #32]
1972	ldp	d8, d9, [sp, #48]
1973	ldp	d10, d11, [sp, #64]
1974	ldp	d12, d13, [sp, #80]
1975	ldp	d14, d15, [sp, #96]
1976	ldp	x19, x20, [sp], #112            //post-index: also releases the 112-byte frame
1977	ret
1978
// Early-return stub: returns 0 in w0 without restoring any registers or adjusting sp.
// NOTE(review): presumably reached only from a zero-length check placed before the
// prologue, like "cbz x1, .L192_enc_ret" in aes_gcm_enc_192_kernel - the dec_128 entry
// is out of view, confirm there.
1979.L128_dec_ret:
1980	mov	w0, #0x0
1981	ret
1982.size	aes_gcm_dec_128_kernel,.-aes_gcm_dec_128_kernel
1983.globl	aes_gcm_enc_192_kernel
1984.type	aes_gcm_enc_192_kernel,%function
1985.align	4
1986aes_gcm_enc_192_kernel:
1987	AARCH64_VALID_CALL_TARGET
1988	cbz	x1, .L192_enc_ret
1989	stp	x19, x20, [sp, #-112]!
1990	mov	x16, x4
1991	mov	x8, x5
1992	stp	x21, x22, [sp, #16]
1993	stp	x23, x24, [sp, #32]
1994	stp	d8, d9, [sp, #48]
1995	stp	d10, d11, [sp, #64]
1996	stp	d12, d13, [sp, #80]
1997	stp	d14, d15, [sp, #96]
1998
1999	ldp	x10, x11, [x16]             //ctr96_b64, ctr96_t32
2000#ifdef __AARCH64EB__
2001	rev	x10, x10
2002	rev	x11, x11
2003#endif
2004	ldp	x13, x14, [x8, #192]                     //load rk12
2005#ifdef __AARCH64EB__
2006	ror	x13, x13, #32
2007	ror	x14, x14, #32
2008#endif
2009	ld1	{v18.4s}, [x8], #16	                             //load rk0
2010
2011	ld1	{v19.4s}, [x8], #16	                             //load rk1
2012
2013	ld1	{v20.4s}, [x8], #16	                             //load rk2
2014
2015	lsr	x12, x11, #32
2016	ld1	{v21.4s}, [x8], #16	                             //load rk3
2017	orr	w11, w11, w11
2018
2019	ld1	{v22.4s}, [x8], #16	                             //load rk4
2020	rev	w12, w12                               //rev_ctr32
2021
2022	add	w12, w12, #1                           //increment rev_ctr32
2023	fmov	d3, x10                              //CTR block 3
2024
2025	rev	w9, w12                                //CTR block 1
2026	add	w12, w12, #1                           //CTR block 1
2027	fmov	d1, x10                              //CTR block 1
2028
2029	orr	x9, x11, x9, lsl #32           //CTR block 1
2030	ld1	{ v0.16b}, [x16]                            //special case vector load initial counter so we can start first AES block as quickly as possible
2031
2032	fmov	v1.d[1], x9                              //CTR block 1
2033	rev	w9, w12                                //CTR block 2
2034	add	w12, w12, #1                           //CTR block 2
2035
2036	fmov	d2, x10                              //CTR block 2
2037	orr	x9, x11, x9, lsl #32           //CTR block 2
2038
2039	fmov	v2.d[1], x9                              //CTR block 2
2040	rev	w9, w12                                //CTR block 3
2041
2042	orr	x9, x11, x9, lsl #32           //CTR block 3
2043	ld1	{v23.4s}, [x8], #16	                             //load rk5
2044
2045	fmov	v3.d[1], x9                              //CTR block 3
2046
2047	ld1	{v24.4s}, [x8], #16	                             //load rk6
2048
2049	ld1	{v25.4s}, [x8], #16	                             //load rk7
2050
2051	aese	v0.16b, v18.16b
2052	aesmc	v0.16b, v0.16b         //AES block 0 - round 0
2053	ld1	{ v11.16b}, [x3]
2054	ext	v11.16b, v11.16b, v11.16b, #8
2055	rev64	v11.16b, v11.16b
2056
2057	aese	v3.16b, v18.16b
2058	aesmc	v3.16b, v3.16b         //AES block 3 - round 0
2059	ld1	{v26.4s}, [x8], #16	                             //load rk8
2060
2061	aese	v1.16b, v18.16b
2062	aesmc	v1.16b, v1.16b         //AES block 1 - round 0
2063	ldr	q15, [x3, #112]                       //load h4l | h4h
2064#ifndef __AARCH64EB__
2065	ext	v15.16b, v15.16b, v15.16b, #8
2066#endif
2067	aese	v2.16b, v18.16b
2068	aesmc	v2.16b, v2.16b         //AES block 2 - round 0
2069	ld1	{v27.4s}, [x8], #16	                             //load rk9
2070
2071	aese	v0.16b, v19.16b
2072	aesmc	v0.16b, v0.16b         //AES block 0 - round 1
2073	ld1	{v28.4s}, [x8], #16	                         //load rk10
2074
2075	aese	v1.16b, v19.16b
2076	aesmc	v1.16b, v1.16b         //AES block 1 - round 1
2077	ldr	q12, [x3, #32]                        //load h1l | h1h
2078#ifndef __AARCH64EB__
2079	ext	v12.16b, v12.16b, v12.16b, #8
2080#endif
2081	aese	v2.16b, v19.16b
2082	aesmc	v2.16b, v2.16b         //AES block 2 - round 1
2083	ld1	{v29.4s}, [x8], #16	                         //load rk11
2084
2085	aese	v3.16b, v19.16b
2086	aesmc	v3.16b, v3.16b         //AES block 3 - round 1
2087	ldr	q14, [x3, #80]                        //load h3l | h3h
2088#ifndef __AARCH64EB__
2089	ext	v14.16b, v14.16b, v14.16b, #8
2090#endif
2091	aese	v0.16b, v20.16b
2092	aesmc	v0.16b, v0.16b         //AES block 0 - round 2
2093
2094	aese	v2.16b, v20.16b
2095	aesmc	v2.16b, v2.16b         //AES block 2 - round 2
2096
2097	aese	v3.16b, v20.16b
2098	aesmc	v3.16b, v3.16b         //AES block 3 - round 2
2099
2100	aese	v0.16b, v21.16b
2101	aesmc	v0.16b, v0.16b         //AES block 0 - round 3
2102	trn1	v9.2d, v14.2d,    v15.2d                     //h4h | h3h
2103
2104	aese	v2.16b, v21.16b
2105	aesmc	v2.16b, v2.16b         //AES block 2 - round 3
2106
2107	aese	v1.16b, v20.16b
2108	aesmc	v1.16b, v1.16b         //AES block 1 - round 2
2109	trn2	v17.2d,  v14.2d,    v15.2d                     //h4l | h3l
2110
2111	aese	v0.16b, v22.16b
2112	aesmc	v0.16b, v0.16b         //AES block 0 - round 4
2113
2114	aese	v3.16b, v21.16b
2115	aesmc	v3.16b, v3.16b         //AES block 3 - round 3
2116
2117	aese	v1.16b, v21.16b
2118	aesmc	v1.16b, v1.16b         //AES block 1 - round 3
2119
2120	aese	v0.16b, v23.16b
2121	aesmc	v0.16b, v0.16b         //AES block 0 - round 5
2122
2123	aese	v2.16b, v22.16b
2124	aesmc	v2.16b, v2.16b         //AES block 2 - round 4
2125
2126	aese	v1.16b, v22.16b
2127	aesmc	v1.16b, v1.16b         //AES block 1 - round 4
2128
2129	aese	v0.16b, v24.16b
2130	aesmc	v0.16b, v0.16b         //AES block 0 - round 6
2131
2132	aese	v3.16b, v22.16b
2133	aesmc	v3.16b, v3.16b         //AES block 3 - round 4
2134
2135	aese	v2.16b, v23.16b
2136	aesmc	v2.16b, v2.16b         //AES block 2 - round 5
2137
2138	aese	v1.16b, v23.16b
2139	aesmc	v1.16b, v1.16b         //AES block 1 - round 5
2140
2141	aese	v3.16b, v23.16b
2142	aesmc	v3.16b, v3.16b         //AES block 3 - round 5
2143
2144	aese	v2.16b, v24.16b
2145	aesmc	v2.16b, v2.16b         //AES block 2 - round 6
2146	ldr	q13, [x3, #64]                        //load h2l | h2h
2147#ifndef __AARCH64EB__
2148	ext	v13.16b, v13.16b, v13.16b, #8
2149#endif
2150	aese	v1.16b, v24.16b
2151	aesmc	v1.16b, v1.16b         //AES block 1 - round 6
2152
2153	aese	v3.16b, v24.16b
2154	aesmc	v3.16b, v3.16b         //AES block 3 - round 6
2155
2156	aese	v0.16b, v25.16b
2157	aesmc	v0.16b, v0.16b         //AES block 0 - round 7
2158
2159	aese	v1.16b, v25.16b
2160	aesmc	v1.16b, v1.16b         //AES block 1 - round 7
2161	trn2	v16.2d,  v12.2d,    v13.2d                     //h2l | h1l
2162
2163	aese	v3.16b, v25.16b
2164	aesmc	v3.16b, v3.16b         //AES block 3 - round 7
2165
2166	aese	v0.16b, v26.16b
2167	aesmc	v0.16b, v0.16b         //AES block 0 - round 8
2168
2169	aese	v2.16b, v25.16b
2170	aesmc	v2.16b, v2.16b         //AES block 2 - round 7
2171	trn1	v8.2d,    v12.2d,    v13.2d                     //h2h | h1h
2172
2173	aese	v1.16b, v26.16b
2174	aesmc	v1.16b, v1.16b         //AES block 1 - round 8
2175
2176	aese	v3.16b, v26.16b
2177	aesmc	v3.16b, v3.16b         //AES block 3 - round 8
2178
2179	aese	v2.16b, v26.16b
2180	aesmc	v2.16b, v2.16b         //AES block 2 - round 8
2181
2182	aese	v0.16b, v27.16b
2183	aesmc	v0.16b, v0.16b         //AES block 0 - round 9
2184
2185	aese	v3.16b, v27.16b
2186	aesmc	v3.16b, v3.16b         //AES block 3 - round 9
2187
2188	aese	v2.16b, v27.16b
2189	aesmc	v2.16b, v2.16b         //AES block 2 - round 9
2190
2191	aese	v1.16b, v27.16b
2192	aesmc	v1.16b, v1.16b         //AES block 1 - round 9
2193
2194	aese	v0.16b, v28.16b
2195	aesmc	v0.16b, v0.16b         //AES block 0 - round 10
2196
2197	aese	v2.16b, v28.16b
2198	aesmc	v2.16b, v2.16b         //AES block 2 - round 10
2199
2200	aese	v1.16b, v28.16b
2201	aesmc	v1.16b, v1.16b         //AES block 1 - round 10
2202	lsr	x5, x1, #3             //byte_len
2203	mov	x15, x5
2204
2205	aese	v3.16b, v28.16b
2206	aesmc	v3.16b, v3.16b         //AES block 3 - round 10
2207	sub	x5, x5, #1     //byte_len - 1
2208
2209	eor	v16.16b, v16.16b, v8.16b                    //h2k | h1k
2210	and	x5, x5, #0xffffffffffffffc0   //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
2211
2212	eor	v17.16b, v17.16b, v9.16b                 //h4k | h3k
2213
2214	aese	v2.16b, v29.16b                                    //AES block 2 - round 11
2215	add	x4, x0, x1, lsr #3  //end_input_ptr
2216	add	x5, x5, x0
2217
2218	aese	v1.16b, v29.16b                                    //AES block 1 - round 11
2219	cmp	x0, x5                  //check if we have <= 4 blocks
2220
2221	aese	v0.16b, v29.16b                                    //AES block 0 - round 11
2222	add	w12, w12, #1                           //CTR block 3
2223
2224	aese	v3.16b, v29.16b                                    //AES block 3 - round 11
2225	b.ge	.L192_enc_tail                                   //handle tail
2226
2227	rev	w9, w12                                //CTR block 4
2228	ldp	x6, x7, [x0, #0]           //AES block 0 - load plaintext
2229#ifdef __AARCH64EB__
2230	rev	x6, x6
2231	rev	x7, x7
2232#endif
2233	orr	x9, x11, x9, lsl #32           //CTR block 4
2234	ldp	x21, x22, [x0, #32]          //AES block 2 - load plaintext
2235#ifdef __AARCH64EB__
2236	rev	x21, x21
2237	rev	x22, x22
2238#endif
2239	ldp	x23, x24, [x0, #48]          //AES block 3 - load plaintext
2240#ifdef __AARCH64EB__
2241	rev	x23, x23
2242	rev	x24, x24
2243#endif
2244	ldp	x19, x20, [x0, #16]          //AES block 1 - load plaintext
2245#ifdef __AARCH64EB__
2246	rev	x19, x19
2247	rev	x20, x20
2248#endif
2249	add	x0, x0, #64                      //AES input_ptr update
2250	cmp	x0, x5                  //check if we have <= 8 blocks
2251
2252	eor	x6, x6, x13                    //AES block 0 - round 12 low
2253
2254	eor	x7, x7, x14                    //AES block 0 - round 12 high
2255	eor	x22, x22, x14                    //AES block 2 - round 12 high
2256	fmov	d4, x6                              //AES block 0 - mov low
2257
2258	eor	x24, x24, x14                    //AES block 3 - round 12 high
2259	fmov	v4.d[1], x7                          //AES block 0 - mov high
2260
2261	eor	x21, x21, x13                    //AES block 2 - round 12 low
2262	eor	x19, x19, x13                    //AES block 1 - round 12 low
2263
2264	fmov	d5, x19                              //AES block 1 - mov low
2265	eor	x20, x20, x14                    //AES block 1 - round 12 high
2266
2267	fmov	v5.d[1], x20                          //AES block 1 - mov high
2268
2269	eor	x23, x23, x13                    //AES block 3 - round 12 low
2270	fmov	d6, x21                              //AES block 2 - mov low
2271
2272	add	w12, w12, #1                           //CTR block 4
2273	eor	v4.16b, v4.16b, v0.16b                         //AES block 0 - result
2274	fmov	d0, x10                              //CTR block 4
2275
2276	fmov	v0.d[1], x9                              //CTR block 4
2277	rev	w9, w12                                //CTR block 5
2278
2279	orr	x9, x11, x9, lsl #32           //CTR block 5
2280	add	w12, w12, #1                           //CTR block 5
2281
2282	fmov	d7, x23                              //AES block 3 - mov low
2283	st1	{ v4.16b}, [x2], #16                    //AES block 0 - store result
2284
2285	fmov	v6.d[1], x22                          //AES block 2 - mov high
2286
2287	eor	v5.16b, v5.16b, v1.16b                         //AES block 1 - result
2288	fmov	d1, x10                              //CTR block 5
2289	st1	{ v5.16b}, [x2], #16                    //AES block 1 - store result
2290
2291	fmov	v7.d[1], x24                          //AES block 3 - mov high
2292
2293	fmov	v1.d[1], x9                              //CTR block 5
2294	rev	w9, w12                                //CTR block 6
2295
2296	orr	x9, x11, x9, lsl #32           //CTR block 6
2297
2298	add	w12, w12, #1                           //CTR block 6
2299	eor	v6.16b, v6.16b, v2.16b                         //AES block 2 - result
2300	fmov	d2, x10                              //CTR block 6
2301
2302	fmov	v2.d[1], x9                              //CTR block 6
2303	rev	w9, w12                                //CTR block 7
2304
2305	orr	x9, x11, x9, lsl #32           //CTR block 7
2306	st1	{ v6.16b}, [x2], #16                    //AES block 2 - store result
2307
2308	eor	v7.16b, v7.16b, v3.16b                         //AES block 3 - result
2309	st1	{ v7.16b}, [x2], #16                    //AES block 3 - store result
2310	b.ge	.L192_enc_prepretail                             //do prepretail
2311
2312.L192_enc_main_loop:	//main	loop start
2313	aese	v2.16b, v18.16b
2314	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 0
2315	rev64	v5.16b, v5.16b                                   //GHASH block 4k+1 (t0 and t1 free)
2316
2317	aese	v1.16b, v18.16b
2318	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 0
2319	ldp	x19, x20, [x0, #16]          //AES block 4k+5 - load plaintext
2320#ifdef __AARCH64EB__
2321	rev	x19, x19
2322	rev	x20, x20
2323#endif
2324	ext	v11.16b, v11.16b, v11.16b, #8                    //PRE 0
2325	fmov	d3, x10                              //CTR block 4k+3
2326	rev64	v4.16b, v4.16b                                   //GHASH block 4k (only t0 is free)
2327
2328	aese	v2.16b, v19.16b
2329	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 1
2330	fmov	v3.d[1], x9                              //CTR block 4k+3
2331
2332	pmull2	v30.1q, v5.2d, v14.2d                         //GHASH block 4k+1 - high
2333	rev64	v7.16b, v7.16b                                   //GHASH block 4k+3 (t0, t1, t2 and t3 free)
2334	ldp	x21, x22, [x0, #32]          //AES block 4k+6 - load plaintext
2335#ifdef __AARCH64EB__
2336	rev	x21, x21
2337	rev	x22, x22
2338#endif
2339	aese	v0.16b, v18.16b
2340	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 0
2341	ldp	x23, x24, [x0, #48]          //AES block 4k+3 - load plaintext
2342#ifdef __AARCH64EB__
2343	rev	x23, x23
2344	rev	x24, x24
2345#endif
2346	pmull	v31.1q, v5.1d, v14.1d                         //GHASH block 4k+1 - low
2347	eor	v4.16b, v4.16b, v11.16b                          //PRE 1
2348
2349	aese	v1.16b, v19.16b
2350	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 1
2351
2352	aese	v0.16b, v19.16b
2353	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 1
2354	rev64	v6.16b, v6.16b                                   //GHASH block 4k+2 (t0, t1, and t2 free)
2355
2356	aese	v3.16b, v18.16b
2357	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 0
2358	eor	x24, x24, x14                    //AES block 4k+3 - round 12 high
2359
2360	pmull	v11.1q, v4.1d, v15.1d                      //GHASH block 4k - low
2361	mov	d8, v4.d[1]                                 //GHASH block 4k - mid
2362
2363	aese	v0.16b, v20.16b
2364	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 2
2365
2366	aese	v3.16b, v19.16b
2367	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 1
2368	eor	x21, x21, x13                    //AES block 4k+6 - round 12 low
2369
2370	eor	v8.8b, v8.8b, v4.8b                         //GHASH block 4k - mid
2371	eor	v11.16b, v11.16b, v31.16b                        //GHASH block 4k+1 - low
2372
2373	aese	v0.16b, v21.16b
2374	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 3
2375	eor	x19, x19, x13                    //AES block 4k+5 - round 12 low
2376
2377	aese	v1.16b, v20.16b
2378	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 2
2379	mov	d31, v6.d[1]                                 //GHASH block 4k+2 - mid
2380
2381	pmull2	v9.1q, v4.2d, v15.2d                      //GHASH block 4k - high
2382	mov	d4, v5.d[1]                                 //GHASH block 4k+1 - mid
2383
2384	aese	v2.16b, v20.16b
2385	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 2
2386
2387	aese	v1.16b, v21.16b
2388	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 3
2389
2390	mov	d10, v17.d[1]                              //GHASH block 4k - mid
2391	eor	v9.16b, v9.16b, v30.16b                        //GHASH block 4k+1 - high
2392
2393	aese	v3.16b, v20.16b
2394	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 2
2395	eor	v31.8b, v31.8b, v6.8b                         //GHASH block 4k+2 - mid
2396
2397	pmull2	v30.1q, v6.2d, v13.2d                         //GHASH block 4k+2 - high
2398
2399	aese	v0.16b, v22.16b
2400	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 4
2401	eor	v4.8b, v4.8b, v5.8b                         //GHASH block 4k+1 - mid
2402
2403	aese	v3.16b, v21.16b
2404	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 3
2405
2406	pmull2	v5.1q, v7.2d, v12.2d                         //GHASH block 4k+3 - high
2407	eor	x20, x20, x14                    //AES block 4k+5 - round 12 high
2408	ins	v31.d[1], v31.d[0]                               //GHASH block 4k+2 - mid
2409
2410	aese	v0.16b, v23.16b
2411	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 5
2412	add	w12, w12, #1                           //CTR block 4k+3
2413
2414	aese	v3.16b, v22.16b
2415	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 4
2416	eor	v9.16b, v9.16b, v30.16b                        //GHASH block 4k+2 - high
2417
2418	pmull	v4.1q, v4.1d, v17.1d                         //GHASH block 4k+1 - mid
2419	eor	x22, x22, x14                    //AES block 4k+6 - round 12 high
2420
2421	pmull2	v31.1q, v31.2d, v16.2d                         //GHASH block 4k+2 - mid
2422	eor	x23, x23, x13                    //AES block 4k+3 - round 12 low
2423	mov	d30, v7.d[1]                                 //GHASH block 4k+3 - mid
2424
2425	pmull	v10.1q, v8.1d, v10.1d                     //GHASH block 4k - mid
2426	rev	w9, w12                                //CTR block 4k+8
2427
2428	pmull	v8.1q, v6.1d, v13.1d                         //GHASH block 4k+2 - low
2429	orr	x9, x11, x9, lsl #32           //CTR block 4k+8
2430
2431	aese	v2.16b, v21.16b
2432	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 3
2433	eor	v30.8b, v30.8b, v7.8b                         //GHASH block 4k+3 - mid
2434
2435	aese	v1.16b, v22.16b
2436	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 4
2437	ldp	x6, x7, [x0, #0]           //AES block 4k+4 - load plaintext
2438#ifdef __AARCH64EB__
2439	rev	x6, x6
2440	rev	x7, x7
2441#endif
2442	aese	v0.16b, v24.16b
2443	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 6
2444	eor	v11.16b, v11.16b, v8.16b                        //GHASH block 4k+2 - low
2445
2446	aese	v2.16b, v22.16b
2447	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 4
2448	add	x0, x0, #64                      //AES input_ptr update
2449
2450	aese	v1.16b, v23.16b
2451	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 5
2452	movi	v8.8b, #0xc2
2453
2454	pmull	v6.1q, v7.1d, v12.1d                         //GHASH block 4k+3 - low
2455	eor	x7, x7, x14                    //AES block 4k+4 - round 12 high
2456	eor	v10.16b, v10.16b, v4.16b                        //GHASH block 4k+1 - mid
2457
2458	aese	v2.16b, v23.16b
2459	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 5
2460	eor	x6, x6, x13                    //AES block 4k+4 - round 12 low
2461
2462	aese	v1.16b, v24.16b
2463	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 6
2464	shl	d8, d8, #56              //mod_constant
2465
2466	aese	v3.16b, v23.16b
2467	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 5
2468	eor	v9.16b, v9.16b, v5.16b                        //GHASH block 4k+3 - high
2469
2470	aese	v0.16b, v25.16b
2471	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 7
2472	fmov	d5, x19                              //AES block 4k+5 - mov low
2473
2474	aese	v1.16b, v25.16b
2475	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 7
2476	eor	v10.16b, v10.16b, v31.16b                        //GHASH block 4k+2 - mid
2477
2478	aese	v3.16b, v24.16b
2479	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 6
2480	fmov	v5.d[1], x20                          //AES block 4k+5 - mov high
2481
2482	aese	v0.16b, v26.16b
2483	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 8
2484	eor	v11.16b, v11.16b, v6.16b                        //GHASH block 4k+3 - low
2485
2486	pmull	v30.1q, v30.1d, v16.1d                         //GHASH block 4k+3 - mid
2487	cmp	x0, x5                  //.LOOP CONTROL
2488	fmov	d4, x6                              //AES block 4k+4 - mov low
2489
2490	aese	v2.16b, v24.16b
2491	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 6
2492	fmov	v4.d[1], x7                          //AES block 4k+4 - mov high
2493
2494	aese	v1.16b, v26.16b
2495	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 8
2496	fmov	d7, x23                              //AES block 4k+3 - mov low
2497
2498	eor	v10.16b, v10.16b, v30.16b                        //GHASH block 4k+3 - mid
2499	eor	v30.16b, v11.16b, v9.16b                        //MODULO - karatsuba tidy up
2500	add	w12, w12, #1                           //CTR block 4k+8
2501
2502	aese	v2.16b, v25.16b
2503	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 7
2504	fmov	v7.d[1], x24                          //AES block 4k+3 - mov high
2505
2506	pmull	v31.1q, v9.1d, v8.1d           //MODULO - top 64b align with mid
2507	ext	v9.16b, v9.16b, v9.16b, #8                    //MODULO - other top alignment
2508	fmov	d6, x21                              //AES block 4k+6 - mov low
2509
2510	aese	v3.16b, v25.16b
2511	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 7
2512
2513	aese	v0.16b, v27.16b
2514	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 9
2515	eor	v10.16b, v10.16b, v30.16b                        //MODULO - karatsuba tidy up
2516
2517	aese	v2.16b, v26.16b
2518	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 8
2519
2520	aese	v3.16b, v26.16b
2521	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 8
2522
2523	aese	v1.16b, v27.16b
2524	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 9
2525
2526	aese	v0.16b, v28.16b
2527	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 10
2528	eor	v10.16b, v10.16b, v31.16b                     //MODULO - fold into mid
2529
2530	aese	v3.16b, v27.16b
2531	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 9
2532
2533	aese	v2.16b, v27.16b
2534	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 9
2535
2536	aese	v0.16b, v29.16b                                    //AES block 4k+4 - round 11
2537
2538	aese	v1.16b, v28.16b
2539	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 10
2540	eor	v10.16b, v10.16b, v9.16b                        //MODULO - fold into mid
2541
2542	aese	v2.16b, v28.16b
2543	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 10
2544
2545	eor	v4.16b, v4.16b, v0.16b                         //AES block 4k+4 - result
2546	fmov	d0, x10                              //CTR block 4k+8
2547
2548	aese	v1.16b, v29.16b                                    //AES block 4k+5 - round 11
2549	fmov	v0.d[1], x9                              //CTR block 4k+8
2550	rev	w9, w12                                //CTR block 4k+9
2551
2552	pmull	v9.1q, v10.1d, v8.1d           //MODULO - mid 64b align with low
2553	fmov	v6.d[1], x22                          //AES block 4k+6 - mov high
2554	st1	{ v4.16b}, [x2], #16                    //AES block 4k+4 - store result
2555
2556	aese	v3.16b, v28.16b
2557	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 10
2558	orr	x9, x11, x9, lsl #32           //CTR block 4k+9
2559
2560	eor	v5.16b, v5.16b, v1.16b                         //AES block 4k+5 - result
2561	add	w12, w12, #1                           //CTR block 4k+9
2562	fmov	d1, x10                              //CTR block 4k+9
2563
2564	aese	v2.16b, v29.16b                                    //AES block 4k+6 - round 11
2565	fmov	v1.d[1], x9                              //CTR block 4k+9
2566	rev	w9, w12                                //CTR block 4k+10
2567
2568	add	w12, w12, #1                           //CTR block 4k+10
2569	ext	v10.16b, v10.16b, v10.16b, #8                    //MODULO - other mid alignment
2570	orr	x9, x11, x9, lsl #32           //CTR block 4k+10
2571
2572	st1	{ v5.16b}, [x2], #16                    //AES block 4k+5 - store result
2573	eor	v11.16b, v11.16b, v9.16b                        //MODULO - fold into low
2574
2575	aese	v3.16b, v29.16b                                    //AES block 4k+7 - round 11
2576	eor	v6.16b, v6.16b, v2.16b                         //AES block 4k+6 - result
2577	fmov	d2, x10                              //CTR block 4k+10
2578
2579	st1	{ v6.16b}, [x2], #16                    //AES block 4k+6 - store result
2580	fmov	v2.d[1], x9                              //CTR block 4k+10
2581	rev	w9, w12                                //CTR block 4k+11
2582
2583	eor	v11.16b, v11.16b, v10.16b                        //MODULO - fold into low
2584	orr	x9, x11, x9, lsl #32           //CTR block 4k+11
2585
2586	eor	v7.16b, v7.16b, v3.16b                         //AES block 4k+3 - result
2587	st1	{ v7.16b}, [x2], #16                    //AES block 4k+3 - store result
2588	b.lt	.L192_enc_main_loop
2589
2590.L192_enc_prepretail:	//PREPRETAIL
	// Section summary: fold the four blocks held in v4-v7 into the GHASH
	// accumulator v11 while running AES rounds 0-11 (round keys v18-v29) on the
	// four counter blocks v0-v3. In the main loop above, v4-v7 are the output
	// blocks that were just stored.
	// Nothing is loaded from the input or stored to the output in this section;
	// execution falls through into .L192_enc_tail.
	// v12-v17 are the hash-key operands of the pmull/pmull2 instructions;
	// v9/v10/v11 collect the high/mid/low partial products, v8/v30/v31 are
	// temporaries.
2591	aese	v0.16b, v18.16b
2592	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 0
2593	rev64	v4.16b, v4.16b                                   //GHASH block 4k (only t0 is free)
2594
2595	fmov	d3, x10                              //CTR block 4k+3
2596	ext	v11.16b, v11.16b, v11.16b, #8                    //PRE 0
2597	add	w12, w12, #1                           //CTR block 4k+3
2598
2599	aese	v1.16b, v18.16b
2600	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 0
2601	rev64	v5.16b, v5.16b                                   //GHASH block 4k+1 (t0 and t1 free)
2602
2603	aese	v2.16b, v18.16b
2604	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 0
2605
2606	fmov	v3.d[1], x9                              //CTR block 4k+3
2607	eor	v4.16b, v4.16b, v11.16b                          //PRE 1
2608	mov	d10, v17.d[1]                              //GHASH block 4k - mid
2609
2610	aese	v1.16b, v19.16b
2611	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 1
2612	rev64	v6.16b, v6.16b                                   //GHASH block 4k+2 (t0, t1, and t2 free)
2613
2614	pmull2	v30.1q, v5.2d, v14.2d                         //GHASH block 4k+1 - high
2615
2616	pmull	v11.1q, v4.1d, v15.1d                      //GHASH block 4k - low
2617	mov	d8, v4.d[1]                                 //GHASH block 4k - mid
2618
2619	pmull	v31.1q, v5.1d, v14.1d                         //GHASH block 4k+1 - low
2620	rev64	v7.16b, v7.16b                                   //GHASH block 4k+3 (t0, t1, t2 and t3 free)
2621
2622	pmull2	v9.1q, v4.2d, v15.2d                      //GHASH block 4k - high
2623
2624	eor	v8.8b, v8.8b, v4.8b                         //GHASH block 4k - mid
2625	mov	d4, v5.d[1]                                 //GHASH block 4k+1 - mid
2626
2627	eor	v11.16b, v11.16b, v31.16b                        //GHASH block 4k+1 - low
2628	mov	d31, v6.d[1]                                 //GHASH block 4k+2 - mid
2629
2630	aese	v3.16b, v18.16b
2631	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 0
2632	eor	v9.16b, v9.16b, v30.16b                        //GHASH block 4k+1 - high
2633
2634	pmull2	v30.1q, v6.2d, v13.2d                         //GHASH block 4k+2 - high
2635
2636	eor	v4.8b, v4.8b, v5.8b                         //GHASH block 4k+1 - mid
2637	eor	v31.8b, v31.8b, v6.8b                         //GHASH block 4k+2 - mid
2638
2639	aese	v3.16b, v19.16b
2640	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 1
2641
2642	aese	v2.16b, v19.16b
2643	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 1
2644	eor	v9.16b, v9.16b, v30.16b                        //GHASH block 4k+2 - high
2645
2646	aese	v0.16b, v19.16b
2647	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 1
2648
2649	aese	v1.16b, v20.16b
2650	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 2
2651	mov	d30, v7.d[1]                                 //GHASH block 4k+3 - mid
2652
2653	pmull2	v5.1q, v7.2d, v12.2d                         //GHASH block 4k+3 - high
2654	ins	v31.d[1], v31.d[0]                               //GHASH block 4k+2 - mid
2655
2656	aese	v0.16b, v20.16b
2657	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 2
2658
2659	pmull	v10.1q, v8.1d, v10.1d                     //GHASH block 4k - mid
2660	eor	v30.8b, v30.8b, v7.8b                         //GHASH block 4k+3 - mid
2661
2662	aese	v1.16b, v21.16b
2663	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 3
2664
2665	pmull2	v31.1q, v31.2d, v16.2d                         //GHASH block 4k+2 - mid
2666
2667	pmull	v4.1q, v4.1d, v17.1d                         //GHASH block 4k+1 - mid
2668
2669	pmull	v30.1q, v30.1d, v16.1d                         //GHASH block 4k+3 - mid
2670	eor	v9.16b, v9.16b, v5.16b                        //GHASH block 4k+3 - high
2671
2672	pmull	v8.1q, v6.1d, v13.1d                         //GHASH block 4k+2 - low
2673
2674	aese	v0.16b, v21.16b
2675	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 3
2676	eor	v10.16b, v10.16b, v4.16b                        //GHASH block 4k+1 - mid
2677
2678	aese	v3.16b, v20.16b
2679	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 2
2680
2681	aese	v2.16b, v20.16b
2682	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 2
2683	eor	v11.16b, v11.16b, v8.16b                        //GHASH block 4k+2 - low
2684
2685	aese	v0.16b, v22.16b
2686	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 4
2687
2688	aese	v3.16b, v21.16b
2689	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 3
2690	eor	v10.16b, v10.16b, v31.16b                        //GHASH block 4k+2 - mid
2691
2692	aese	v2.16b, v21.16b
2693	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 3
2694
2695	pmull	v6.1q, v7.1d, v12.1d                         //GHASH block 4k+3 - low
2696	movi	v8.8b, #0xc2                                    //v8 is free again; becomes mod_constant after the shl #56 below
2697
2698	aese	v3.16b, v22.16b
2699	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 4
2700
2701	aese	v2.16b, v22.16b
2702	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 4
2703
2704	aese	v1.16b, v22.16b
2705	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 4
2706	eor	v10.16b, v10.16b, v30.16b                        //GHASH block 4k+3 - mid
2707
2708	aese	v3.16b, v23.16b
2709	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 5
2710
2711	aese	v2.16b, v23.16b
2712	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 5
2713
2714	aese	v1.16b, v23.16b
2715	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 5
2716	eor	v11.16b, v11.16b, v6.16b                        //GHASH block 4k+3 - low
2717
2718	aese	v0.16b, v23.16b
2719	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 5
2720
2721	aese	v3.16b, v24.16b
2722	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 6
2723	eor	v10.16b, v10.16b, v9.16b                        //karatsuba tidy up
2724
2725	aese	v1.16b, v24.16b
2726	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 6
2727
2728	aese	v0.16b, v24.16b
2729	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 6
2730	shl	d8, d8, #56              //mod_constant
2731
2732	aese	v3.16b, v25.16b
2733	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 7
2734
2735	aese	v1.16b, v25.16b
2736	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 7
2737	eor	v10.16b, v10.16b, v11.16b                        //karatsuba tidy up (mid ^= low; high was added above)
2738
2739	aese	v0.16b, v25.16b
2740	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 7
2741
2742	pmull	v30.1q, v9.1d, v8.1d           //MODULO - top 64b align with mid
2743
2744	aese	v2.16b, v24.16b
2745	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 6
2746	ext	v9.16b, v9.16b, v9.16b, #8                    //MODULO - other top alignment
2747
2748	aese	v0.16b, v26.16b
2749	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 8
2750
2751	aese	v1.16b, v26.16b
2752	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 8
2753	eor	v10.16b, v10.16b, v30.16b                        //MODULO - fold into mid
2754
2755	aese	v2.16b, v25.16b
2756	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 7
2757
2758	aese	v3.16b, v26.16b
2759	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 8
2760
2761	aese	v0.16b, v27.16b
2762	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 9
2763
2764	aese	v2.16b, v26.16b
2765	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 8
2766	eor	v10.16b, v10.16b, v9.16b                        //MODULO - fold into mid
2767
2768	aese	v3.16b, v27.16b
2769	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 9
2770
2771	aese	v1.16b, v27.16b
2772	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 9
2773
2774	aese	v2.16b, v27.16b
2775	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 9
2776
2777	pmull	v30.1q, v10.1d, v8.1d           //MODULO - mid 64b align with low
2778
2779	ext	v10.16b, v10.16b, v10.16b, #8                    //MODULO - other mid alignment
2780
2781	aese	v3.16b, v28.16b
2782	aesmc	v3.16b, v3.16b         //AES block 4k+7 - round 10
2783
2784	aese	v0.16b, v28.16b
2785	aesmc	v0.16b, v0.16b         //AES block 4k+4 - round 10
2786
2787	aese	v2.16b, v28.16b
2788	aesmc	v2.16b, v2.16b         //AES block 4k+6 - round 10
2789
2790	aese	v1.16b, v28.16b
2791	aesmc	v1.16b, v1.16b         //AES block 4k+5 - round 10
2792	eor	v11.16b, v11.16b, v30.16b                        //MODULO - fold into low
2793
2794	aese	v0.16b, v29.16b                                    //AES block 4k+4 - round 11
2795
2796	aese	v3.16b, v29.16b                                    //AES block 4k+7 - round 11
2797
2798	aese	v2.16b, v29.16b                                    //AES block 4k+6 - round 11
2799
2800	aese	v1.16b, v29.16b                                    //AES block 4k+5 - round 11
2801	eor	v11.16b, v11.16b, v10.16b                        //MODULO - fold into low
2802.L192_enc_tail:	//TAIL
	// Encrypt the first remaining block and dispatch on how many input bytes
	// are left (x5 = x4 - x0; x4 is presumably end_input_ptr from the prologue,
	// which is outside this view - confirm).
	// When reached by fall-through from the prepretail, v0-v3 have been through
	// AES rounds 0-11. The last round key is applied by XORing x13/x14 into the
	// plaintext words before they are XORed with the keystream block.
	// For every keystream block that will not be consumed, w12 is decremented
	// once and the keystream registers are shifted up (v1 -> v2 -> v3), so the
	// later sections can always use v2 for the final-1 block and v3 for the
	// final block.
2803
2804	sub	x5, x4, x0  //x5 (main_end_input_ptr until now) becomes the number of bytes left to process
2805	ldp	x6, x7, [x0], #16          //AES block 4k+4 - load plaintext
2806#ifdef __AARCH64EB__
2807	rev	x6, x6
2808	rev	x7, x7
2809#endif
2810	eor	x6, x6, x13                    //AES block 4k+4 - round 12 low
2811	eor	x7, x7, x14                    //AES block 4k+4 - round 12 high
2812
2813	fmov	d4, x6                              //AES block 4k+4 - mov low
2814
2815	fmov	v4.d[1], x7                          //AES block 4k+4 - mov high
2816	cmp	x5, #48
2817
2818	eor	v5.16b, v4.16b, v0.16b                         //AES block 4k+4 - result
2819
2820	ext	v8.16b, v11.16b, v11.16b, #8                    //prepare final partial tag
2821	b.gt	.L192_enc_blocks_more_than_3                    //more than 48 bytes left: all of v0-v3 are needed
2822
2823	sub	w12, w12, #1                                    //at most 3 blocks left: one counter block goes unused
2824	movi	v10.8b, #0                                      //clear GHASH mid accumulator
2825
2826	mov	v3.16b, v2.16b                                  //shift keystream up: v3 = old v2
2827	movi	v9.8b, #0                                       //clear GHASH high accumulator
2828	cmp	x5, #32
2829
2830	mov	v2.16b, v1.16b                                  //shift keystream up: v2 = old v1
2831	movi	v11.8b, #0                                      //clear GHASH low accumulator (the tag was saved in v8 above)
2832	b.gt	.L192_enc_blocks_more_than_2
2833
2834	sub	w12, w12, #1                                    //at most 2 blocks left: a second counter block goes unused
2835
2836	mov	v3.16b, v1.16b                                  //v3 = old v1, keystream for the final block
2837	cmp	x5, #16
2838	b.gt	.L192_enc_blocks_more_than_1
2839
2840	sub	w12, w12, #1                                    //at most 1 block left: a third counter block goes unused
2841	b	.L192_enc_blocks_less_than_1
2842.L192_enc_blocks_more_than_3:	//blocks	left >  3
	// Reached only by the b.gt in .L192_enc_tail (the tail ends in an
	// unconditional branch, so there is no fall-through into this label).
	// Stores the final-3 output block (v5), starts the GHASH accumulators
	// v11/v9/v10 (low/high/mid) from that block combined with the saved tag in
	// v8, and prepares the final-2 block in v5 using keystream v1.
	// v22 was an AES round key earlier in this function; it is reused as GHASH
	// scratch because no aese instruction follows in the tail sections.
2843	st1	{ v5.16b}, [x2], #16                    //AES final-3 block  - store result
2844
2845	ldp	x6, x7, [x0], #16          //AES final-2 block - load input low & high
2846#ifdef __AARCH64EB__
2847	rev	x6, x6
2848	rev	x7, x7
2849#endif
2850	rev64	v4.16b, v5.16b                                   //GHASH final-3 block
2851
2852	eor	x6, x6, x13                    //AES final-2 block - round 12 low
2853	eor	v4.16b, v4.16b, v8.16b                          //feed in partial tag
2854
2855	eor	x7, x7, x14                    //AES final-2 block - round 12 high
2856	fmov	d5, x6                                //AES final-2 block - mov low
2857
2858	fmov	v5.d[1], x7                            //AES final-2 block - mov high
2859
2860	mov	d22, v4.d[1]                                //GHASH final-3 block - mid
2861
2862	pmull	v11.1q, v4.1d, v15.1d                      //GHASH final-3 block - low
2863
2864	mov	d10, v17.d[1]                              //GHASH final-3 block - mid
2865
2866	eor	v22.8b, v22.8b, v4.8b                     //GHASH final-3 block - mid
2867
2868	movi	v8.8b, #0                                       //suppress further partial tag feed in
2869
2870	pmull2	v9.1q, v4.2d, v15.2d                      //GHASH final-3 block - high
2871
2872	pmull	v10.1q, v22.1d, v10.1d                   //GHASH final-3 block - mid
2873	eor	v5.16b, v5.16b, v1.16b                           //AES final-2 block - result
2874.L192_enc_blocks_more_than_2:	//blocks	left >  2
	// Reached by fall-through from .L192_enc_blocks_more_than_3 or by the b.gt
	// in .L192_enc_tail. Stores the output block currently in v5, folds it into
	// the GHASH accumulators v9/v11/v10 (high/low/mid) using v14 and v17, and
	// prepares the final-1 block in v5 using keystream v2.
	// v8 carries the saved tag only if it has not been fed in yet; it is zeroed
	// here so later sections do not add it a second time.
	// v20-v22 are used as GHASH scratch in this section.
2875
2876	st1	{ v5.16b}, [x2], #16                    //AES final-2 block - store result
2877
2878	rev64	v4.16b, v5.16b                                   //GHASH final-2 block
2879	ldp	x6, x7, [x0], #16          //AES final-1 block - load input low & high
2880#ifdef __AARCH64EB__
2881	rev	x6, x6
2882	rev	x7, x7
2883#endif
2884	eor	v4.16b, v4.16b, v8.16b                          //feed in partial tag
2885
2886	eor	x7, x7, x14                    //AES final-1 block - round 12 high
2887
2888	pmull2	v20.1q, v4.2d, v14.2d                         //GHASH final-2 block - high
2889	mov	d22, v4.d[1]                                //GHASH final-2 block - mid
2890
2891	pmull	v21.1q, v4.1d, v14.1d                         //GHASH final-2 block - low
2892	eor	x6, x6, x13                    //AES final-1 block - round 12 low
2893
2894	fmov	d5, x6                                //AES final-1 block - mov low
2895
2896	fmov	v5.d[1], x7                            //AES final-1 block - mov high
2897	eor	v9.16b, v9.16b, v20.16b                           //GHASH final-2 block - high
2898	eor	v22.8b, v22.8b, v4.8b                     //GHASH final-2 block - mid
2899
2900	eor	v11.16b, v11.16b, v21.16b                           //GHASH final-2 block - low
2901
2902	pmull	v22.1q, v22.1d, v17.1d                     //GHASH final-2 block - mid
2903
2904	movi	v8.8b, #0                                       //suppress further partial tag feed in
2905
2906	eor	v5.16b, v5.16b, v2.16b                           //AES final-1 block - result
2907
2908	eor	v10.16b, v10.16b, v22.16b                      //GHASH final-2 block - mid
2909.L192_enc_blocks_more_than_1:	//blocks	left >  1
	// Reached by fall-through from .L192_enc_blocks_more_than_2 or by the b.gt
	// in .L192_enc_tail. Stores the output block currently in v5, folds it into
	// the GHASH accumulators v9/v11/v10 (high/low/mid) using v13 and the upper
	// half of v16, and prepares the final block in v5 using keystream v3.
	// The final block is not stored here; .L192_enc_blocks_less_than_1 masks it
	// first because it may be partial.
2910
2911	st1	{ v5.16b}, [x2], #16                    //AES final-1 block - store result
2912
2913	ldp	x6, x7, [x0], #16          //AES final block - load input low & high
2914#ifdef __AARCH64EB__
2915	rev	x6, x6
2916	rev	x7, x7
2917#endif
2918	rev64	v4.16b, v5.16b                                   //GHASH final-1 block
2919
2920	eor	x6, x6, x13                    //AES final block - round 12 low
2921	eor	v4.16b, v4.16b, v8.16b                          //feed in partial tag
2922	movi	v8.8b, #0                                       //suppress further partial tag feed in
2923
2924	mov	d22, v4.d[1]                                //GHASH final-1 block - mid
2925
2926	eor	v22.8b, v22.8b, v4.8b                     //GHASH final-1 block - mid
2927	eor	x7, x7, x14                    //AES final block - round 12 high
2928	fmov	d5, x6                                //AES final block - mov low
2929
2930	pmull2	v20.1q, v4.2d, v13.2d                         //GHASH final-1 block - high
2931	fmov	v5.d[1], x7                            //AES final block - mov high
2932
2933	ins	v22.d[1], v22.d[0]                           //GHASH final-1 block - mid (copy to the upper lane so pmull2 can pair it with the upper half of v16)
2934
2935	eor	v9.16b, v9.16b, v20.16b                           //GHASH final-1 block - high
2936
2937	pmull	v21.1q, v4.1d, v13.1d                         //GHASH final-1 block - low
2938
2939	pmull2	v22.1q, v22.2d, v16.2d                     //GHASH final-1 block - mid
2940
2941	eor	v5.16b, v5.16b, v3.16b                           //AES final block - result
2942
2943	eor	v11.16b, v11.16b, v21.16b                           //GHASH final-1 block - low
2944
2945	eor	v10.16b, v10.16b, v22.16b                      //GHASH final-1 block - mid
2946.L192_enc_blocks_less_than_1:	//blocks	left <= 1
	// Final (possibly partial) block and function exit. Steps, in order:
	//  1. Build a 128-bit mask in v0 from the bit length in x1 that covers only
	//     the valid bits of the last block, and clear the rest of v5.
	//  2. Fold the masked block into the GHASH accumulators (v12 / v16) and
	//     reduce high:mid:low (v9:v10:v11) to a single value in v11.
	//  3. Merge the bytes already present at [x2] (loaded into v18) back into
	//     the masked-off part of v5, then store all 16 bytes.
	//  4. Write the updated 32-bit counter to [x16, #12] and the tag to [x3],
	//     restore the saved registers and return x15 in x0.
	// x13/x14 (last round key) and v18 (round key 0) are overwritten here; no
	// AES work follows in this function.
2947
2948	ld1	{ v18.16b}, [x2]                           //load existing bytes where the possibly partial last block is to be stored
2949#ifndef __AARCH64EB__
2950	rev	w9, w12
2951#else
2952	mov	w9, w12
2953#endif
2954	and	x1, x1, #127                   //bit_length %= 128
2955
2956	sub	x1, x1, #128                   //bit_length -= 128
2957	mvn	x14, xzr                                     //rk12_h = 0xffffffffffffffff
2958
2959	neg	x1, x1                         //bit_length = 128 - #bits in input (in range [1,128])
2960	mvn	x13, xzr                                     //rk12_l = 0xffffffffffffffff
2961
2962	and	x1, x1, #127                   //bit_length %= 128 (0 now means the last block is full)
2963
2964	lsr	x14, x14, x1                    //rk12_h is mask for top 64b of last block (register shift uses x1 modulo 64)
2965	cmp	x1, #64
2966
2967	csel	x6, x13, x14, lt                                //fewer than 64 bits unused: low half all ones, else low half is the shifted mask
2968	csel	x7, x14, xzr, lt                                //fewer than 64 bits unused: high half is the shifted mask, else high half is zero
2969
2970	fmov	d0, x6                                //ctr0b is mask for last block
2971
2972	fmov	v0.d[1], x7
2973
2974	and	v5.16b, v5.16b, v0.16b                           //possibly partial last block has zeroes in highest bits
2975
2976	rev64	v4.16b, v5.16b                                   //GHASH final block
2977
2978	eor	v4.16b, v4.16b, v8.16b                          //feed in partial tag
2979
2980	mov	d8, v4.d[1]                                 //GHASH final block - mid
2981
2982	pmull	v21.1q, v4.1d, v12.1d                         //GHASH final block - low
2983
2984	pmull2	v20.1q, v4.2d, v12.2d                         //GHASH final block - high
2985
2986	eor	v8.8b, v8.8b, v4.8b                         //GHASH final block - mid
2987
2988	eor	v11.16b, v11.16b, v21.16b                           //GHASH final block - low
2989
2990	eor	v9.16b, v9.16b, v20.16b                           //GHASH final block - high
2991
2992	pmull	v8.1q, v8.1d, v16.1d                         //GHASH final block - mid
2993
2994	eor	v10.16b, v10.16b, v8.16b                        //GHASH final block - mid
2995	movi	v8.8b, #0xc2                                    //with the shl #56 below: d8 = 0xc200000000000000
2996
2997	eor	v30.16b, v11.16b, v9.16b                        //MODULO - karatsuba tidy up
2998
2999	shl	d8, d8, #56              //mod_constant
3000
3001	bif	v5.16b, v18.16b, v0.16b                             //insert existing bytes in top end of result before storing
3002
3003	eor	v10.16b, v10.16b, v30.16b                        //MODULO - karatsuba tidy up
3004
3005	pmull	v31.1q, v9.1d, v8.1d           //MODULO - top 64b align with mid
3006
3007	ext	v9.16b, v9.16b, v9.16b, #8                    //MODULO - other top alignment
3008
3009	eor	v10.16b, v10.16b, v31.16b                     //MODULO - fold into mid
3010
3011	eor	v10.16b, v10.16b, v9.16b                        //MODULO - fold into mid
3012
3013	pmull	v9.1q, v10.1d, v8.1d           //MODULO - mid 64b align with low
3014
3015	ext	v10.16b, v10.16b, v10.16b, #8                    //MODULO - other mid alignment
3016
3017	eor	v11.16b, v11.16b, v9.16b                        //MODULO - fold into low
3018	str	w9, [x16, #12]                         //store the updated counter
3019
3020	st1	{ v5.16b}, [x2]                         //store all 16B
3021
3022	eor	v11.16b, v11.16b, v10.16b                        //MODULO - fold into low
3023	ext	v11.16b, v11.16b, v11.16b, #8                    //swap the 64-bit halves of the tag ...
3024	rev64	v11.16b, v11.16b                                 //... and byte-reverse each half before writing it back
3025	mov	x0, x15                                         //return value (presumably byte_len saved in the prologue, outside this view - confirm)
3026	st1	{ v11.16b }, [x3]                               //store the updated tag
3027
3028	ldp	x21, x22, [sp, #16]                             //restore saved registers from the 112-byte frame
3029	ldp	x23, x24, [sp, #32]
3030	ldp	d8, d9, [sp, #48]
3031	ldp	d10, d11, [sp, #64]
3032	ldp	d12, d13, [sp, #80]
3033	ldp	d14, d15, [sp, #96]
3034	ldp	x19, x20, [sp], #112                            //restore x19/x20 and release the frame
3035	ret
3036
3037.L192_enc_ret:
	// Early-exit stub: returns 0 in w0 and restores nothing from the stack.
	// NOTE(review): presumably reached from a zero-length check at function
	// entry (outside this view), before any registers are saved - confirm.
3038	mov	w0, #0x0
3039	ret
3040.size	aes_gcm_enc_192_kernel,.-aes_gcm_enc_192_kernel
3041.globl	aes_gcm_dec_192_kernel
3042.type	aes_gcm_dec_192_kernel,%function
3043.align	4
3044aes_gcm_dec_192_kernel:
3045	AARCH64_VALID_CALL_TARGET
3046	cbz	x1, .L192_dec_ret
3047	stp	x19, x20, [sp, #-112]!
3048	mov	x16, x4
3049	mov	x8, x5
3050	stp	x21, x22, [sp, #16]
3051	stp	x23, x24, [sp, #32]
3052	stp	d8, d9, [sp, #48]
3053	stp	d10, d11, [sp, #64]
3054	stp	d12, d13, [sp, #80]
3055	stp	d14, d15, [sp, #96]
3056
3057	add	x4, x0, x1, lsr #3   //end_input_ptr
3058	ldp	x10, x11, [x16]              //ctr96_b64, ctr96_t32
3059#ifdef __AARCH64EB__
3060	rev	x10, x10
3061	rev	x11, x11
3062#endif
3063	ldp	x13, x14, [x8, #192]                     //load rk12
3064#ifdef __AARCH64EB__
3065	ror	x13, x13, #32
3066	ror	x14, x14, #32
3067#endif
3068	ld1	{ v0.16b}, [x16]                             //special case vector load initial counter so we can start first AES block as quickly as possible
3069
3070	ld1	{v18.4s}, [x8], #16                                  //load rk0
3071
3072	lsr	x5, x1, #3              //byte_len
3073	mov	x15, x5
3074	ld1	{v19.4s}, [x8], #16                               //load rk1
3075
3076	lsr	x12, x11, #32
3077	orr	w11, w11, w11
3078	fmov	d3, x10                               //CTR block 3
3079
3080	rev	w12, w12                                //rev_ctr32
3081	fmov	d1, x10                               //CTR block 1
3082
3083	add	w12, w12, #1                            //increment rev_ctr32
3084	ld1	{v20.4s}, [x8], #16                               //load rk2
3085
3086	aese	v0.16b, v18.16b
3087	aesmc	v0.16b, v0.16b          //AES block 0 - round 0
3088	rev	w9, w12                                 //CTR block 1
3089
3090	add	w12, w12, #1                            //CTR block 1
3091	orr	x9, x11, x9, lsl #32            //CTR block 1
3092	ld1	{v21.4s}, [x8], #16                               //load rk3
3093
3094	fmov	v1.d[1], x9                               //CTR block 1
3095	rev	w9, w12                                 //CTR block 2
3096	add	w12, w12, #1                            //CTR block 2
3097
3098	fmov	d2, x10                               //CTR block 2
3099	orr	x9, x11, x9, lsl #32            //CTR block 2
3100
3101	fmov	v2.d[1], x9                               //CTR block 2
3102	rev	w9, w12                                 //CTR block 3
3103
3104	aese	v0.16b, v19.16b
3105	aesmc	v0.16b, v0.16b          //AES block 0 - round 1
3106	orr	x9, x11, x9, lsl #32            //CTR block 3
3107
3108	fmov	v3.d[1], x9                               //CTR block 3
3109
3110	ld1	{v22.4s}, [x8], #16                               //load rk4
3111
3112	aese	v0.16b, v20.16b
3113	aesmc	v0.16b, v0.16b          //AES block 0 - round 2
3114
3115	aese	v2.16b, v18.16b
3116	aesmc	v2.16b, v2.16b          //AES block 2 - round 0
3117	ld1	{v23.4s}, [x8], #16                               //load rk5
3118
3119	aese	v1.16b, v18.16b
3120	aesmc	v1.16b, v1.16b          //AES block 1 - round 0
3121	ldr	q15, [x3, #112]                        //load h4l | h4h
3122#ifndef __AARCH64EB__
3123	ext	v15.16b, v15.16b, v15.16b, #8
3124#endif
3125	aese	v3.16b, v18.16b
3126	aesmc	v3.16b, v3.16b          //AES block 3 - round 0
3127	ldr	q13, [x3, #64]                         //load h2l | h2h
3128#ifndef __AARCH64EB__
3129	ext	v13.16b, v13.16b, v13.16b, #8
3130#endif
3131	aese	v2.16b, v19.16b
3132	aesmc	v2.16b, v2.16b          //AES block 2 - round 1
3133	ldr	q14, [x3, #80]                         //load h3l | h3h
3134#ifndef __AARCH64EB__
3135	ext	v14.16b, v14.16b, v14.16b, #8
3136#endif
3137	aese	v1.16b, v19.16b
3138	aesmc	v1.16b, v1.16b          //AES block 1 - round 1
3139
3140	aese	v3.16b, v19.16b
3141	aesmc	v3.16b, v3.16b          //AES block 3 - round 1
3142	ldr	q12, [x3, #32]                         //load h1l | h1h
3143#ifndef __AARCH64EB__
3144	ext	v12.16b, v12.16b, v12.16b, #8
3145#endif
3146	aese	v2.16b, v20.16b
3147	aesmc	v2.16b, v2.16b          //AES block 2 - round 2
3148	ld1	{v24.4s}, [x8], #16                               //load rk6
3149
3150	aese	v0.16b, v21.16b
3151	aesmc	v0.16b, v0.16b          //AES block 0 - round 3
3152	ld1	{v25.4s}, [x8], #16                               //load rk7
3153
3154	aese	v1.16b, v20.16b
3155	aesmc	v1.16b, v1.16b          //AES block 1 - round 2
3156	ld1	{v26.4s}, [x8], #16                               //load rk8
3157
3158	aese	v3.16b, v20.16b
3159	aesmc	v3.16b, v3.16b          //AES block 3 - round 2
3160	ld1	{v27.4s}, [x8], #16                               //load rk9
3161
3162	aese	v2.16b, v21.16b
3163	aesmc	v2.16b, v2.16b          //AES block 2 - round 3
3164	ld1	{ v11.16b}, [x3]
3165	ext	v11.16b, v11.16b, v11.16b, #8
3166	rev64	v11.16b, v11.16b
3167
3168	aese	v1.16b, v21.16b
3169	aesmc	v1.16b, v1.16b          //AES block 1 - round 3
3170	add	w12, w12, #1                            //CTR block 3
3171
3172	aese	v3.16b, v21.16b
3173	aesmc	v3.16b, v3.16b          //AES block 3 - round 3
3174	trn1	v9.2d, v14.2d,    v15.2d                      //h4h | h3h
3175
3176	aese	v0.16b, v22.16b
3177	aesmc	v0.16b, v0.16b          //AES block 0 - round 4
3178	ld1	{v28.4s}, [x8], #16                              //load rk10
3179
3180	aese	v1.16b, v22.16b
3181	aesmc	v1.16b, v1.16b          //AES block 1 - round 4
3182	trn2	v17.2d,  v14.2d,    v15.2d                      //h4l | h3l
3183
3184	aese	v2.16b, v22.16b
3185	aesmc	v2.16b, v2.16b          //AES block 2 - round 4
3186
3187	aese	v3.16b, v22.16b
3188	aesmc	v3.16b, v3.16b          //AES block 3 - round 4
3189	trn2	v16.2d,  v12.2d,    v13.2d                      //h2l | h1l
3190
3191	aese	v0.16b, v23.16b
3192	aesmc	v0.16b, v0.16b          //AES block 0 - round 5
3193	ld1	{v29.4s}, [x8], #16                              //load rk11
3194
3195	aese	v1.16b, v23.16b
3196	aesmc	v1.16b, v1.16b          //AES block 1 - round 5
3197
3198	aese	v2.16b, v23.16b
3199	aesmc	v2.16b, v2.16b          //AES block 2 - round 5
3200
3201	aese	v3.16b, v23.16b
3202	aesmc	v3.16b, v3.16b          //AES block 3 - round 5
3203
3204	aese	v0.16b, v24.16b
3205	aesmc	v0.16b, v0.16b          //AES block 0 - round 6
3206
3207	aese	v2.16b, v24.16b
3208	aesmc	v2.16b, v2.16b          //AES block 2 - round 6
3209
3210	aese	v3.16b, v24.16b
3211	aesmc	v3.16b, v3.16b          //AES block 3 - round 6
3212
3213	aese	v0.16b, v25.16b
3214	aesmc	v0.16b, v0.16b          //AES block 0 - round 7
3215
3216	aese	v2.16b, v25.16b
3217	aesmc	v2.16b, v2.16b          //AES block 2 - round 7
3218
3219	aese	v3.16b, v25.16b
3220	aesmc	v3.16b, v3.16b          //AES block 3 - round 7
3221
3222	aese	v1.16b, v24.16b
3223	aesmc	v1.16b, v1.16b          //AES block 1 - round 6
3224
3225	aese	v2.16b, v26.16b
3226	aesmc	v2.16b, v2.16b          //AES block 2 - round 8
3227
3228	aese	v3.16b, v26.16b
3229	aesmc	v3.16b, v3.16b          //AES block 3 - round 8
3230
3231	aese	v1.16b, v25.16b
3232	aesmc	v1.16b, v1.16b          //AES block 1 - round 7
3233
3234	aese	v2.16b, v27.16b
3235	aesmc	v2.16b, v2.16b          //AES block 2 - round 9
3236
3237	aese	v3.16b, v27.16b
3238	aesmc	v3.16b, v3.16b          //AES block 3 - round 9
3239
3240	aese	v1.16b, v26.16b
3241	aesmc	v1.16b, v1.16b          //AES block 1 - round 8
3242	sub	x5, x5, #1      //byte_len - 1
3243
3244	aese	v0.16b, v26.16b
3245	aesmc	v0.16b, v0.16b          //AES block 0 - round 8
3246	and	x5, x5, #0xffffffffffffffc0    //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
3247
3248	aese	v3.16b, v28.16b
3249	aesmc	v3.16b, v3.16b          //AES block 3 - round 10
3250	add	x5, x5, x0
3251
3252	aese	v1.16b, v27.16b
3253	aesmc	v1.16b, v1.16b          //AES block 1 - round 9
3254	cmp	x0, x5                   //check if we have <= 4 blocks
3255
3256	aese	v0.16b, v27.16b
3257	aesmc	v0.16b, v0.16b          //AES block 0 - round 9
3258	trn1	v8.2d,    v12.2d,    v13.2d                      //h2h | h1h
3259
3260	aese	v3.16b, v29.16b                                     //AES block 3 - round 11
3261
3262	aese	v2.16b, v28.16b
3263	aesmc	v2.16b, v2.16b          //AES block 2 - round 10
3264
3265	aese	v1.16b, v28.16b
3266	aesmc	v1.16b, v1.16b          //AES block 1 - round 10
3267
3268	aese	v0.16b, v28.16b
3269	aesmc	v0.16b, v0.16b          //AES block 0 - round 10
3270	eor	v16.16b, v16.16b, v8.16b                     //h2k | h1k
3271
3272	aese	v2.16b, v29.16b                                     //AES block 2 - round 11
3273
3274	aese	v1.16b, v29.16b                                     //AES block 1 - round 11
3275	eor	v17.16b, v17.16b, v9.16b                  //h4k | h3k
3276
3277	aese	v0.16b, v29.16b                                     //AES block 0 - round 11
3278	b.ge	.L192_dec_tail                                    //handle tail
3279
3280	ld1	{v4.16b, v5.16b}, [x0], #32               //AES block 0,1 - load ciphertext
3281
3282	eor	v1.16b, v5.16b, v1.16b                            //AES block 1 - result
3283
3284	eor	v0.16b, v4.16b, v0.16b                            //AES block 0 - result
3285	rev	w9, w12                                 //CTR block 4
3286	ld1	{v6.16b, v7.16b}, [x0], #32               //AES block 2,3 - load ciphertext
3287
3288	mov	x19, v1.d[0]                            //AES block 1 - mov low
3289
3290	mov	x20, v1.d[1]                            //AES block 1 - mov high
3291
3292	mov	x6, v0.d[0]                            //AES block 0 - mov low
3293	orr	x9, x11, x9, lsl #32            //CTR block 4
3294	add	w12, w12, #1                            //CTR block 4
3295
3296	mov	x7, v0.d[1]                            //AES block 0 - mov high
3297	rev64	v4.16b, v4.16b                                    //GHASH block 0
3298
3299	fmov	d0, x10                               //CTR block 4
3300	rev64	v5.16b, v5.16b                                    //GHASH block 1
3301	cmp	x0, x5                   //check if we have <= 8 blocks
3302
3303	eor	x19, x19, x13                   //AES block 1 - round 12 low
3304#ifdef __AARCH64EB__
3305	rev	x19, x19
3306#endif
3307	fmov	v0.d[1], x9                               //CTR block 4
3308	rev	w9, w12                                 //CTR block 5
3309
3310	orr	x9, x11, x9, lsl #32            //CTR block 5
3311	fmov	d1, x10                               //CTR block 5
3312	eor	x20, x20, x14                   //AES block 1 - round 12 high
3313#ifdef __AARCH64EB__
3314	rev	x20, x20
3315#endif
3316	add	w12, w12, #1                            //CTR block 5
3317	fmov	v1.d[1], x9                               //CTR block 5
3318	eor	x6, x6, x13                   //AES block 0 - round 12 low
3319#ifdef __AARCH64EB__
3320	rev	x6, x6
3321#endif
3322	rev	w9, w12                                 //CTR block 6
3323	eor	x7, x7, x14                   //AES block 0 - round 12 high
3324#ifdef __AARCH64EB__
3325	rev	x7, x7
3326#endif
3327	stp	x6, x7, [x2], #16        //AES block 0 - store result
3328	orr	x9, x11, x9, lsl #32            //CTR block 6
3329
3330	stp	x19, x20, [x2], #16        //AES block 1 - store result
3331
3332	add	w12, w12, #1                            //CTR block 6
3333	eor	v2.16b, v6.16b, v2.16b                            //AES block 2 - result
3334	b.ge	.L192_dec_prepretail                              //do prepretail
3335
3336.L192_dec_main_loop:	//main loop start - each pass runs the AES rounds for counter blocks 4k+4..4k+7, loads 4 ciphertext blocks, stores 4 plaintext blocks (4k+2..4k+5) and GHASHes the 4 ciphertext blocks held in v4-v7 on entry
3337	aese	v1.16b, v18.16b
3338	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 0
3339	ext	v11.16b, v11.16b, v11.16b, #8                     //PRE 0
3340
3341	pmull	v31.1q, v5.1d, v14.1d                          //GHASH block 4k+1 - low
3342	mov	x21, v2.d[0]                            //AES block 4k+2 - mov low
3343
3344	mov	x22, v2.d[1]                            //AES block 4k+2 - mov high
3345	eor	v3.16b, v7.16b, v3.16b                            //AES block 4k+3 - result
3346	rev64	v7.16b, v7.16b                                    //GHASH block 4k+3
3347
3348	aese	v1.16b, v19.16b
3349	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 1
3350	fmov	d2, x10                               //CTR block 4k+6
3351
3352	aese	v0.16b, v18.16b
3353	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 0
3354	eor	v4.16b, v4.16b, v11.16b                           //PRE 1
3355
3356	pmull2	v30.1q, v5.2d, v14.2d                          //GHASH block 4k+1 - high
3357	fmov	v2.d[1], x9                               //CTR block 4k+6
3358
3359	aese	v1.16b, v20.16b
3360	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 2
3361	mov	x24, v3.d[1]                            //AES block 4k+3 - mov high
3362
3363	aese	v0.16b, v19.16b
3364	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 1
3365	mov	x23, v3.d[0]                            //AES block 4k+3 - mov low
3366
3367	pmull2	v9.1q, v4.2d, v15.2d                       //GHASH block 4k - high
3368	fmov	d3, x10                               //CTR block 4k+7
3369	mov	d8, v4.d[1]                                  //GHASH block 4k - mid
3370
3371	pmull	v11.1q, v4.1d, v15.1d                       //GHASH block 4k - low
3372	mov	d10, v17.d[1]                               //GHASH block 4k - mid
3373	rev	w9, w12                                 //CTR block 4k+7
3374
3375	aese	v2.16b, v18.16b
3376	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 0
3377	orr	x9, x11, x9, lsl #32            //CTR block 4k+7
3378
3379	fmov	v3.d[1], x9                               //CTR block 4k+7
3380	eor	v8.8b, v8.8b, v4.8b                          //GHASH block 4k - mid
3381	mov	d4, v5.d[1]                                  //GHASH block 4k+1 - mid
3382
3383	aese	v1.16b, v21.16b
3384	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 3
3385
3386	aese	v0.16b, v20.16b
3387	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 2
3388	eor	x22, x22, x14                   //AES block 4k+2 - round 12 high
3389#ifdef __AARCH64EB__
3390	rev	x22, x22
3391#endif
3392	aese	v2.16b, v19.16b
3393	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 1
3394	eor	v4.8b, v4.8b, v5.8b                          //GHASH block 4k+1 - mid
3395
3396	pmull	v10.1q, v8.1d, v10.1d                      //GHASH block 4k - mid
3397
3398	aese	v3.16b, v18.16b
3399	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 0
3400	rev64	v6.16b, v6.16b                                    //GHASH block 4k+2
3401
3402	aese	v2.16b, v20.16b
3403	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 2
3404
3405	pmull	v4.1q, v4.1d, v17.1d                          //GHASH block 4k+1 - mid
3406	eor	v11.16b, v11.16b, v31.16b                         //GHASH block 4k+1 - low
3407	eor	x21, x21, x13                   //AES block 4k+2 - round 12 low
3408#ifdef __AARCH64EB__
3409	rev	x21, x21
3410#endif
3411	aese	v1.16b, v22.16b
3412	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 4
3413
3414	aese	v0.16b, v21.16b
3415	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 3
3416
3417	eor	v10.16b, v10.16b, v4.16b                         //GHASH block 4k+1 - mid
3418	mov	d31, v6.d[1]                                  //GHASH block 4k+2 - mid
3419
3420	aese	v3.16b, v19.16b
3421	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 1
3422	eor	v9.16b, v9.16b, v30.16b                         //GHASH block 4k+1 - high
3423
3424	aese	v0.16b, v22.16b
3425	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 4
3426
3427	pmull2	v30.1q, v6.2d, v13.2d                          //GHASH block 4k+2 - high
3428	eor	v31.8b, v31.8b, v6.8b                          //GHASH block 4k+2 - mid
3429
3430	pmull	v8.1q, v6.1d, v13.1d                          //GHASH block 4k+2 - low
3431
3432	aese	v0.16b, v23.16b
3433	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 5
3434
3435	eor	v9.16b, v9.16b, v30.16b                         //GHASH block 4k+2 - high
3436	mov	d30, v7.d[1]                                  //GHASH block 4k+3 - mid
3437
3438	aese	v1.16b, v23.16b
3439	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 5
3440
3441	pmull2	v5.1q, v7.2d, v12.2d                          //GHASH block 4k+3 - high
3442
3443	aese	v3.16b, v20.16b
3444	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 2
3445	eor	v30.8b, v30.8b, v7.8b                          //GHASH block 4k+3 - mid
3446
3447	aese	v1.16b, v24.16b
3448	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 6
3449
3450	aese	v0.16b, v24.16b
3451	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 6
3452	ins	v31.d[1], v31.d[0]                                //GHASH block 4k+2 - mid
3453
3454	aese	v3.16b, v21.16b
3455	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 3
3456
3457	pmull	v30.1q, v30.1d, v16.1d                          //GHASH block 4k+3 - mid
3458	eor	v11.16b, v11.16b, v8.16b                         //GHASH block 4k+2 - low
3459
3460	aese	v0.16b, v25.16b
3461	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 7
3462
3463	pmull2	v31.1q, v31.2d, v16.2d                          //GHASH block 4k+2 - mid
3464	eor	v9.16b, v9.16b, v5.16b                         //GHASH block 4k+3 - high
3465
3466	aese	v1.16b, v25.16b
3467	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 7
3468
3469	aese	v0.16b, v26.16b
3470	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 8
3471	movi	v8.8b, #0xc2                                      //d8 = 0xc2 in every byte; the shl #56 below turns it into mod_constant 0xc200000000000000
3472
3473	pmull	v6.1q, v7.1d, v12.1d                          //GHASH block 4k+3 - low
3474
3475	aese	v1.16b, v26.16b
3476	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 8
3477	eor	v10.16b, v10.16b, v31.16b                         //GHASH block 4k+2 - mid
3478
3479	aese	v2.16b, v21.16b
3480	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 3
3481
3482	aese	v0.16b, v27.16b
3483	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 9
3484	eor	v11.16b, v11.16b, v6.16b                         //GHASH block 4k+3 - low
3485
3486	aese	v3.16b, v22.16b
3487	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 4
3488
3489	aese	v2.16b, v22.16b
3490	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 4
3491	eor	v10.16b, v10.16b, v30.16b                         //GHASH block 4k+3 - mid
3492
3493	aese	v0.16b, v28.16b
3494	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 10
3495
3496	aese	v1.16b, v27.16b
3497	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 9
3498	eor	v30.16b, v11.16b, v9.16b                         //MODULO - karatsuba tidy up
3499
3500	aese	v2.16b, v23.16b
3501	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 5
3502
3503	aese	v3.16b, v23.16b
3504	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 5
3505	shl	d8, d8, #56               //mod_constant
3506
3507	aese	v1.16b, v28.16b
3508	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 10
3509
3510	aese	v2.16b, v24.16b
3511	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 6
3512	ld1	{v4.16b}, [x0], #16                       //AES block 4k+4 - load ciphertext
3513
3514	aese	v3.16b, v24.16b
3515	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 6
3516	eor	v10.16b, v10.16b, v30.16b                         //MODULO - karatsuba tidy up
3517
3518	pmull	v31.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid
3519	ld1	{v5.16b}, [x0], #16                       //AES block 4k+5 - load ciphertext
3520	eor	x23, x23, x13                   //AES block 4k+3 - round 12 low
3521#ifdef __AARCH64EB__
3522	rev	x23, x23
3523#endif
3524	aese	v2.16b, v25.16b
3525	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 7
3526	ext	v9.16b, v9.16b, v9.16b, #8                     //MODULO - other top alignment
3527
3528	aese	v0.16b, v29.16b                                     //AES block 4k+4 - round 11
3529	add	w12, w12, #1                            //CTR block 4k+7
3530
3531	aese	v3.16b, v25.16b
3532	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 7
3533	eor	v10.16b, v10.16b, v31.16b                      //MODULO - fold into mid
3534
3535	aese	v2.16b, v26.16b
3536	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 8
3537	ld1	{v6.16b}, [x0], #16                       //AES block 4k+6 - load ciphertext
3538
3539	aese	v1.16b, v29.16b                                     //AES block 4k+5 - round 11
3540	ld1	{v7.16b}, [x0], #16                       //AES block 4k+7 - load ciphertext
3541	rev	w9, w12                                 //CTR block 4k+8
3542
3543	aese	v3.16b, v26.16b
3544	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 8
3545	stp	x21, x22, [x2], #16        //AES block 4k+2 - store result
3546
3547	aese	v2.16b, v27.16b
3548	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 9
3549	eor	v10.16b, v10.16b, v9.16b                         //MODULO - fold into mid
3550
3551	cmp	x0, x5                   //LOOP CONTROL - flags are consumed by the b.lt at the bottom; nothing in between sets flags
3552
3553	eor	v0.16b, v4.16b, v0.16b                            //AES block 4k+4 - result
3554	eor	x24, x24, x14                   //AES block 4k+3 - round 12 high
3555#ifdef __AARCH64EB__
3556	rev	x24, x24
3557#endif
3558	eor	v1.16b, v5.16b, v1.16b                            //AES block 4k+5 - result
3559
3560	aese	v2.16b, v28.16b
3561	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 10
3562	orr	x9, x11, x9, lsl #32            //CTR block 4k+8
3563
3564	aese	v3.16b, v27.16b
3565	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 9
3566
3567	pmull	v8.1q, v10.1d, v8.1d     //MODULO - mid 64b align with low
3568	mov	x19, v1.d[0]                            //AES block 4k+5 - mov low
3569
3570	mov	x6, v0.d[0]                            //AES block 4k+4 - mov low
3571	stp	x23, x24, [x2], #16        //AES block 4k+3 - store result
3572	rev64	v5.16b, v5.16b                                    //GHASH block 4k+5
3573
3574	aese	v2.16b, v29.16b                                     //AES block 4k+6 - round 11
3575	mov	x7, v0.d[1]                            //AES block 4k+4 - mov high
3576
3577	aese	v3.16b, v28.16b
3578	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 10
3579	mov	x20, v1.d[1]                            //AES block 4k+5 - mov high
3580
3581	fmov	d0, x10                               //CTR block 4k+8
3582	add	w12, w12, #1                            //CTR block 4k+8
3583	ext	v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment
3584
3585	eor	v2.16b, v6.16b, v2.16b                            //AES block 4k+6 - result
3586	fmov	v0.d[1], x9                               //CTR block 4k+8
3587	rev	w9, w12                                 //CTR block 4k+9
3588
3589	eor	x6, x6, x13                   //AES block 4k+4 - round 12 low
3590#ifdef __AARCH64EB__
3591	rev	x6, x6
3592#endif
3593	orr	x9, x11, x9, lsl #32            //CTR block 4k+9
3594	eor	v11.16b, v11.16b, v8.16b               //MODULO - fold into low
3595
3596	fmov	d1, x10                               //CTR block 4k+9
3597	add	w12, w12, #1                            //CTR block 4k+9
3598	eor	x19, x19, x13                   //AES block 4k+5 - round 12 low
3599#ifdef __AARCH64EB__
3600	rev	x19, x19
3601#endif
3602	fmov	v1.d[1], x9                               //CTR block 4k+9
3603	rev	w9, w12                                 //CTR block 4k+10
3604	eor	x20, x20, x14                   //AES block 4k+5 - round 12 high
3605#ifdef __AARCH64EB__
3606	rev	x20, x20
3607#endif
3608	eor	x7, x7, x14                   //AES block 4k+4 - round 12 high
3609#ifdef __AARCH64EB__
3610	rev	x7, x7
3611#endif
3612	stp	x6, x7, [x2], #16        //AES block 4k+4 - store result
3613	eor	v11.16b, v11.16b, v10.16b                         //MODULO - fold into low
3614
3615	add	w12, w12, #1                            //CTR block 4k+10
3616	rev64	v4.16b, v4.16b                                    //GHASH block 4k+4
3617	orr	x9, x11, x9, lsl #32            //CTR block 4k+10
3618
3619	aese	v3.16b, v29.16b                                     //AES block 4k+7 - round 11
3620	stp	x19, x20, [x2], #16        //AES block 4k+5 - store result
3621	b.lt	.L192_dec_main_loop                               //repeat while x0 < x5 (signed compare from LOOP CONTROL above)
3622
3623.L192_dec_prepretail:	//PREPRETAIL - same work as a main loop pass but with no new ciphertext loads: stores plaintext blocks 4k+2 and 4k+3, GHASHes the 4 ciphertext blocks in v4-v7, and runs the AES rounds for counter blocks 4k+4..4k+7 so the tail has keystream ready in v0-v3
3624	mov	x22, v2.d[1]                            //AES block 4k+2 - mov high
3625	ext	v11.16b, v11.16b, v11.16b, #8                     //PRE 0
3626	eor	v3.16b, v7.16b, v3.16b                            //AES block 4k+3 - result
3627
3628	aese	v1.16b, v18.16b
3629	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 0
3630	mov	x21, v2.d[0]                            //AES block 4k+2 - mov low
3631
3632	aese	v0.16b, v18.16b
3633	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 0
3634	mov	d10, v17.d[1]                               //GHASH block 4k - mid
3635
3636	eor	v4.16b, v4.16b, v11.16b                           //PRE 1
3637	fmov	d2, x10                               //CTR block 4k+6
3638
3639	aese	v1.16b, v19.16b
3640	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 1
3641	mov	x23, v3.d[0]                            //AES block 4k+3 - mov low
3642
3643	aese	v0.16b, v19.16b
3644	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 1
3645	mov	x24, v3.d[1]                            //AES block 4k+3 - mov high
3646
3647	pmull	v11.1q, v4.1d, v15.1d                       //GHASH block 4k - low
3648	mov	d8, v4.d[1]                                  //GHASH block 4k - mid
3649	fmov	d3, x10                               //CTR block 4k+7
3650
3651	aese	v1.16b, v20.16b
3652	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 2
3653	rev64	v6.16b, v6.16b                                    //GHASH block 4k+2
3654
3655	pmull2	v9.1q, v4.2d, v15.2d                       //GHASH block 4k - high
3656	fmov	v2.d[1], x9                               //CTR block 4k+6
3657	rev	w9, w12                                 //CTR block 4k+7
3658
3659	orr	x9, x11, x9, lsl #32            //CTR block 4k+7
3660	eor	v8.8b, v8.8b, v4.8b                          //GHASH block 4k - mid
3661	mov	d4, v5.d[1]                                  //GHASH block 4k+1 - mid
3662
3663	pmull	v31.1q, v5.1d, v14.1d                          //GHASH block 4k+1 - low
3664	eor	x24, x24, x14                   //AES block 4k+3 - round 12 high
3665#ifdef __AARCH64EB__
3666	rev	x24, x24
3667#endif
3668	fmov	v3.d[1], x9                               //CTR block 4k+7
3669
3670	aese	v0.16b, v20.16b
3671	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 2
3672	eor	x21, x21, x13                   //AES block 4k+2 - round 12 low
3673#ifdef __AARCH64EB__
3674	rev	x21, x21
3675#endif
3676	pmull2	v30.1q, v5.2d, v14.2d                          //GHASH block 4k+1 - high
3677	eor	x22, x22, x14                   //AES block 4k+2 - round 12 high
3678#ifdef __AARCH64EB__
3679	rev	x22, x22
3680#endif
3681	eor	v4.8b, v4.8b, v5.8b                          //GHASH block 4k+1 - mid
3682
3683	pmull	v10.1q, v8.1d, v10.1d                      //GHASH block 4k - mid
3684	eor	x23, x23, x13                   //AES block 4k+3 - round 12 low
3685#ifdef __AARCH64EB__
3686	rev	x23, x23
3687#endif
3688	stp	x21, x22, [x2], #16        //AES block 4k+2 - store result
3689
3690	rev64	v7.16b, v7.16b                                    //GHASH block 4k+3
3691	stp	x23, x24, [x2], #16        //AES block 4k+3 - store result
3692
3693	aese	v3.16b, v18.16b
3694	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 0
3695	eor	v9.16b, v9.16b, v30.16b                         //GHASH block 4k+1 - high
3696
3697	pmull	v4.1q, v4.1d, v17.1d                          //GHASH block 4k+1 - mid
3698	add	w12, w12, #1                            //CTR block 4k+7
3699
3700	pmull2	v30.1q, v6.2d, v13.2d                          //GHASH block 4k+2 - high
3701	eor	v11.16b, v11.16b, v31.16b                         //GHASH block 4k+1 - low
3702
3703	aese	v2.16b, v18.16b
3704	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 0
3705
3706	eor	v10.16b, v10.16b, v4.16b                         //GHASH block 4k+1 - mid
3707	mov	d31, v6.d[1]                                  //GHASH block 4k+2 - mid
3708
3709	aese	v3.16b, v19.16b
3710	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 1
3711
3712	aese	v2.16b, v19.16b
3713	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 1
3714	eor	v9.16b, v9.16b, v30.16b                         //GHASH block 4k+2 - high
3715
3716	eor	v31.8b, v31.8b, v6.8b                          //GHASH block 4k+2 - mid
3717
3718	pmull	v8.1q, v6.1d, v13.1d                          //GHASH block 4k+2 - low
3719
3720	aese	v2.16b, v20.16b
3721	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 2
3722	mov	d30, v7.d[1]                                  //GHASH block 4k+3 - mid
3723
3724	aese	v3.16b, v20.16b
3725	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 2
3726	ins	v31.d[1], v31.d[0]                                //GHASH block 4k+2 - mid
3727
3728	pmull	v6.1q, v7.1d, v12.1d                          //GHASH block 4k+3 - low
3729
3730	aese	v0.16b, v21.16b
3731	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 3
3732	eor	v30.8b, v30.8b, v7.8b                          //GHASH block 4k+3 - mid
3733
3734	aese	v1.16b, v21.16b
3735	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 3
3736
3737	pmull2	v31.1q, v31.2d, v16.2d                          //GHASH block 4k+2 - mid
3738	eor	v11.16b, v11.16b, v8.16b                         //GHASH block 4k+2 - low
3739
3740	aese	v0.16b, v22.16b
3741	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 4
3742
3743	pmull2	v5.1q, v7.2d, v12.2d                          //GHASH block 4k+3 - high
3744	movi	v8.8b, #0xc2                                      //d8 = 0xc2 in every byte; the shl #56 below turns it into mod_constant 0xc200000000000000
3745
3746	pmull	v30.1q, v30.1d, v16.1d                          //GHASH block 4k+3 - mid
3747
3748	aese	v2.16b, v21.16b
3749	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 3
3750
3751	shl	d8, d8, #56               //mod_constant
3752	eor	v9.16b, v9.16b, v5.16b                         //GHASH block 4k+3 - high
3753
3754	aese	v0.16b, v23.16b
3755	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 5
3756	eor	v10.16b, v10.16b, v31.16b                         //GHASH block 4k+2 - mid
3757
3758	aese	v2.16b, v22.16b
3759	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 4
3760
3761	pmull	v31.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid
3762	eor	v11.16b, v11.16b, v6.16b                         //GHASH block 4k+3 - low
3763
3764	aese	v0.16b, v24.16b
3765	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 6
3766
3767	aese	v3.16b, v21.16b
3768	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 3
3769	eor	v10.16b, v10.16b, v30.16b                         //GHASH block 4k+3 - mid
3770
3771	aese	v2.16b, v23.16b
3772	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 5
3773
3774	aese	v0.16b, v25.16b
3775	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 7
3776	eor	v30.16b, v11.16b, v9.16b                         //MODULO - karatsuba tidy up
3777
3778	aese	v3.16b, v22.16b
3779	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 4
3780
3781	aese	v2.16b, v24.16b
3782	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 6
3783	ext	v9.16b, v9.16b, v9.16b, #8                     //MODULO - other top alignment
3784
3785	aese	v0.16b, v26.16b
3786	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 8
3787
3788	aese	v3.16b, v23.16b
3789	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 5
3790	eor	v10.16b, v10.16b, v30.16b                         //MODULO - karatsuba tidy up
3791
3792	aese	v1.16b, v22.16b
3793	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 4
3794
3795	aese	v2.16b, v25.16b
3796	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 7
3797
3798	aese	v0.16b, v27.16b
3799	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 9
3800
3801	aese	v1.16b, v23.16b
3802	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 5
3803
3804	aese	v3.16b, v24.16b
3805	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 6
3806	eor	v10.16b, v10.16b, v31.16b                      //MODULO - fold into mid
3807
3808	aese	v0.16b, v28.16b
3809	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 10
3810
3811	aese	v1.16b, v24.16b
3812	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 6
3813
3814	aese	v3.16b, v25.16b
3815	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 7
3816
3817	aese	v2.16b, v26.16b
3818	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 8
3819	eor	v10.16b, v10.16b, v9.16b                         //MODULO - fold into mid
3820
3821	aese	v1.16b, v25.16b
3822	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 7
3823
3824	aese	v3.16b, v26.16b
3825	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 8
3826
3827	aese	v2.16b, v27.16b
3828	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 9
3829
3830	aese	v1.16b, v26.16b
3831	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 8
3832
3833	aese	v3.16b, v27.16b
3834	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 9
3835
3836	pmull	v8.1q, v10.1d, v8.1d     //MODULO - mid 64b align with low
3837
3838	aese	v1.16b, v27.16b
3839	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 9
3840
3841	aese	v2.16b, v28.16b
3842	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 10
3843
3844	aese	v3.16b, v28.16b
3845	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 10
3846	ext	v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment
3847
3848	aese	v1.16b, v28.16b
3849	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 10
3850
3851	aese	v0.16b, v29.16b                                     //AES block 4k+4 - round 11 (no aesmc after this one)
3852	eor	v11.16b, v11.16b, v8.16b               //MODULO - fold into low
3853
3854	aese	v2.16b, v29.16b                                     //AES block 4k+6 - round 11
3855
3856	aese	v1.16b, v29.16b                                     //AES block 4k+5 - round 11
3857
3858	aese	v3.16b, v29.16b                                     //AES block 4k+7 - round 11
3859
3860	eor	v11.16b, v11.16b, v10.16b                         //MODULO - fold into low
3861.L192_dec_tail:	//TAIL - decrypt the next ciphertext block with the keystream in v0, then dispatch on how many bytes (x5) are left
3862
3863	sub	x5, x4, x0   //x5 (used as main_end_input_ptr until here) is reused: x5 = x4 - x0 = number of bytes left to process
3864	ld1	{ v5.16b}, [x0], #16                      //AES block 4k+4 - load ciphertext
3865
3866	eor	v0.16b, v5.16b, v0.16b                            //AES block 4k+4 - result
3867
3868	mov	x7, v0.d[1]                            //AES block 4k+4 - mov high
3869
3870	mov	x6, v0.d[0]                            //AES block 4k+4 - mov low
3871
3872	ext	v8.16b, v11.16b, v11.16b, #8                     //prepare final partial tag
3873
3874	cmp	x5, #48                                 //more than 3 blocks (48 bytes) left? - flags consumed by b.gt below, the eor/rev in between do not set flags
3875
3876	eor	x7, x7, x14                   //AES block 4k+4 - round 12 high
3877#ifdef __AARCH64EB__
3878	rev	x7, x7
3879#endif
3880	eor	x6, x6, x13                   //AES block 4k+4 - round 12 low
3881#ifdef __AARCH64EB__
3882	rev	x6, x6
3883#endif
3884	b.gt	.L192_dec_blocks_more_than_3
3885
3886	movi	v11.8b, #0                                       //more_than_3 path skipped: zero the GHASH low accumulator it would have initialised
3887	movi	v9.8b, #0                                        //zero the GHASH high accumulator
3888
3889	mov	v3.16b, v2.16b                          //shift keystream down one slot: later stages read v2 (final-1 block) and v3 (final block)
3890	mov	v2.16b, v1.16b
3891	sub	w12, w12, #1                            //presumably un-counts a counter block that will not be consumed - confirm against the code after the tail
3892
3893	movi	v10.8b, #0                                       //zero the GHASH mid accumulator
3894	cmp	x5, #32                                 //more than 2 blocks (32 bytes) left?
3895	b.gt	.L192_dec_blocks_more_than_2
3896
3897	mov	v3.16b, v1.16b                          //shift again: final block keystream (read from v3 in more_than_1) is the old v1
3898	cmp	x5, #16                                 //more than 1 block (16 bytes) left?
3899	sub	w12, w12, #1                            //sub does not set flags, so the cmp result survives to b.gt
3900
3901	b.gt	.L192_dec_blocks_more_than_1
3902
3903	sub	w12, w12, #1
3904	b	.L192_dec_blocks_less_than_1
3905.L192_dec_blocks_more_than_3:	//blocks left > 3 - store the final-3 plaintext (x6/x7 from the tail), start the GHASH accumulators with the final-3 ciphertext, decrypt the final-2 block; falls through to more_than_2
3906	rev64	v4.16b, v5.16b                                    //GHASH final-3 block
3907	ld1	{ v5.16b}, [x0], #16                      //AES final-2 block - load ciphertext
3908
3909	stp	x6, x7, [x2], #16        //AES final-3 block  - store result
3910
3911	eor	v4.16b, v4.16b, v8.16b                           //feed in partial tag
3912
3913	eor	v0.16b, v5.16b, v1.16b                            //AES final-2 block - result
3914
3915	pmull	v11.1q, v4.1d, v15.1d                       //GHASH final-3 block - low
3916	mov	x6, v0.d[0]                            //AES final-2 block - mov low
3917	mov	d22, v4.d[1]                                 //GHASH final-3 block - mid (v22, the round 4 key in the main loop, is reused as scratch)
3918
3919	mov	x7, v0.d[1]                            //AES final-2 block - mov high
3920
3921	mov	d10, v17.d[1]                               //GHASH final-3 block - mid
3922	eor	v22.8b, v22.8b, v4.8b                      //GHASH final-3 block - mid
3923
3924	pmull2	v9.1q, v4.2d, v15.2d                       //GHASH final-3 block - high
3925
3926	eor	x6, x6, x13                   //AES final-2 block - round 12 low
3927#ifdef __AARCH64EB__
3928	rev	x6, x6
3929#endif
3930	movi	v8.8b, #0                                        //suppress further partial tag feed in
3931
3932	pmull	v10.1q, v22.1d, v10.1d                    //GHASH final-3 block - mid
3933	eor	x7, x7, x14                   //AES final-2 block - round 12 high
3934#ifdef __AARCH64EB__
3935	rev	x7, x7
3936#endif
3937.L192_dec_blocks_more_than_2:	//blocks left > 2 - store the final-2 plaintext (x6/x7), fold the final-2 ciphertext into the GHASH accumulators v11/v9/v10, decrypt the final-1 block with keystream v2; falls through to more_than_1
3938
3939	rev64	v4.16b, v5.16b                                    //GHASH final-2 block
3940	ld1	{ v5.16b}, [x0], #16                      //AES final-1 block - load ciphertext
3941
3942	eor	v4.16b, v4.16b, v8.16b                           //feed in partial tag
3943
3944	movi	v8.8b, #0                                        //suppress further partial tag feed in
3945
3946	eor	v0.16b, v5.16b, v2.16b                            //AES final-1 block - result
3947
3948	mov	d22, v4.d[1]                                 //GHASH final-2 block - mid
3949
3950	pmull	v21.1q, v4.1d, v14.1d                          //GHASH final-2 block - low (v21, a round key in the main loop, is reused as scratch)
3951
3952	stp	x6, x7, [x2], #16        //AES final-2 block  - store result
3953
3954	eor	v22.8b, v22.8b, v4.8b                      //GHASH final-2 block - mid
3955	mov	x7, v0.d[1]                            //AES final-1 block - mov high
3956
3957	eor	v11.16b, v11.16b, v21.16b                            //GHASH final-2 block - low
3958	mov	x6, v0.d[0]                            //AES final-1 block - mov low
3959
3960	pmull2	v20.1q, v4.2d, v14.2d                          //GHASH final-2 block - high (v20 likewise reused as scratch)
3961
3962	pmull	v22.1q, v22.1d, v17.1d                      //GHASH final-2 block - mid
3963
3964	eor	v9.16b, v9.16b, v20.16b                            //GHASH final-2 block - high
3965	eor	x7, x7, x14                   //AES final-1 block - round 12 high
3966#ifdef __AARCH64EB__
3967	rev	x7, x7
3968#endif
3969	eor	x6, x6, x13                   //AES final-1 block - round 12 low
3970#ifdef __AARCH64EB__
3971	rev	x6, x6
3972#endif
3973	eor	v10.16b, v10.16b, v22.16b                       //GHASH final-2 block - mid
// Decrypt-tail stage for "more than 1 block left". It is reached by
// fall-through from the stage above; any branch into it is outside this excerpt.
//   1. GHASH the ciphertext block currently in v5: rev64 into v4, XOR in the
//      pending partial tag (v8), then multiply by v13 (low and high products).
//      The middle product uses the upper half of v16 via pmull2. Results are
//      accumulated into v11 (low), v9 (high) and v10 (mid).
//   2. Store the x6:x7 pair produced by the previous stage to [x2].
//   3. Load the last ciphertext block into v5, XOR it with v3, and XOR the
//      two halves with x13/x14. The result stays in x6:x7 for the final stage,
//      which masks and stores it.
3974.L192_dec_blocks_more_than_1:	//blocks	left >  1
3975
3976	rev64	v4.16b, v5.16b                                    //GHASH final-1 block
3977
3978	eor	v4.16b, v4.16b, v8.16b                           //feed in partial tag
3979	ld1	{ v5.16b}, [x0], #16                      //AES final block - load ciphertext
3980
3981	mov	d22, v4.d[1]                                 //GHASH final-1 block - mid
3982
3983	pmull2	v20.1q, v4.2d, v13.2d                          //GHASH final-1 block - high
3984
3985	eor	v0.16b, v5.16b, v3.16b                            //AES final block - result
3986	stp	x6, x7, [x2], #16        //AES final-1 block  - store result
3987
3988	eor	v22.8b, v22.8b, v4.8b                      //GHASH final-1 block - mid
3989
3990	eor	v9.16b, v9.16b, v20.16b                            //GHASH final-1 block - high
3991
3992	pmull	v21.1q, v4.1d, v13.1d                          //GHASH final-1 block - low
3993	mov	x7, v0.d[1]                            //AES final block - mov high
3994
	// Duplicate the middle operand into the upper lane so that pmull2 can
	// multiply it by the upper half of v16.
3995	ins	v22.d[1], v22.d[0]                            //GHASH final-1 block - mid
3996	mov	x6, v0.d[0]                            //AES final block - mov low
3997
3998	pmull2	v22.1q, v22.2d, v16.2d                      //GHASH final-1 block - mid
3999
	// A 64-bit (8b) movi also clears the upper half of v8.
4000	movi	v8.8b, #0                                        //suppress further partial tag feed in
4001	eor	v11.16b, v11.16b, v21.16b                            //GHASH final-1 block - low
4002	eor	x7, x7, x14                   //AES final block - round 12 high
4003#ifdef __AARCH64EB__
4004	rev	x7, x7
4005#endif
4006	eor	x6, x6, x13                   //AES final block - round 12 low
4007#ifdef __AARCH64EB__
4008	rev	x6, x6
4009#endif
4010	eor	v10.16b, v10.16b, v22.16b                       //GHASH final-1 block - mid
// Final stage of the decrypt tail: handles the last, possibly partial, block.
//   - Builds a 128-bit validity mask from the bit length in x1
//     (x9 = mask for the low doubleword, x10 = mask for the high doubleword).
//   - Merges the decrypted bytes in x6:x7 with the bytes already at [x2], so
//     output bytes beyond the message length are written back unchanged.
//   - Masks the ciphertext block in v5 the same way and folds it into the
//     GHASH accumulators (v9 high, v10 mid, v11 low) using v12 and v16.1d.
//   - Reduces the accumulators into v11 and writes it to [x3].
//   - Stores the 32-bit counter word to [x16, #12], sets x0 from x15, restores
//     the saved registers and returns.
// NOTE(review): x15 and x16 are set before this excerpt; the other kernels
// visible in this file use x15 = byte_len and x16 = counter pointer - confirm
// for this kernel.
4011.L192_dec_blocks_less_than_1:	//blocks	left <= 1
4012
	// x13/x14 are no longer needed as round-key halves here; they are
	// reused as all-ones seeds for the byte masks.
4013	mvn	x13, xzr                                      //rk12_l = 0xffffffffffffffff
4014	ldp	x4, x5, [x2]  //load existing bytes we need to not overwrite
4015	and	x1, x1, #127                    //bit_length %= 128
4016
4017	sub	x1, x1, #128                    //bit_length -= 128
4018
4019	neg	x1, x1                          //bit_length = 128 - #bits in input (in range [1,128])
4020
	// After this AND, x1 is the number of unused bits in the last block
	// (0..127); a full block gives 0.
4021	and	x1, x1, #127                    //bit_length %= 128
4022	mvn	x14, xzr                                      //rk12_h = 0xffffffffffffffff
4023
	// A register-specified LSR uses the shift count modulo 64, so for
	// x1 >= 64 this yields all-ones >> (x1 - 64).
4024	lsr	x14, x14, x1                     //rk12_h is mask for top 64b of last block
4025	cmp	x1, #64
4026
	// x1 <  64: low doubleword fully valid (x9 = all ones), high doubleword
	//           partially valid (x10 = x14).
	// x1 >= 64: low doubleword partially valid (x9 = x14), high doubleword
	//           entirely unused (x10 = 0).
4027	csel	x9, x13, x14, lt
4028	csel	x10, x14, xzr, lt
4029
4030	fmov	d0, x9                                   //ctr0b is mask for last block
	// x6 = (x6 & mask) | (existing & ~mask): keep existing output bytes
	// outside the message.
4031	and	x6, x6, x9
4032	bic	x4, x4, x9           //mask out low existing bytes
4033
4034	orr	x6, x6, x4
4035	mov	v0.d[1], x10
	// x9 is free again; it now carries the counter word to store, byte-swapped
	// on little-endian builds and copied as-is on big-endian builds.
4036#ifndef __AARCH64EB__
4037	rev	w9, w12
4038#else
4039	mov	w9, w12
4040#endif
4041
4042	and	v5.16b, v5.16b, v0.16b                            //possibly partial last block has zeroes in highest bits
4043	str	w9, [x16, #12]                          //store the updated counter
4044
4045	rev64	v4.16b, v5.16b                                    //GHASH final block
4046
4047	eor	v4.16b, v4.16b, v8.16b                           //feed in partial tag
4048	bic	x5, x5, x10 //mask out high existing bytes
4049
4050	and	x7, x7, x10
4051
4052	pmull2	v20.1q, v4.2d, v12.2d                          //GHASH final block - high
4053	mov	d8, v4.d[1]                                  //GHASH final block - mid
4054
4055	pmull	v21.1q, v4.1d, v12.1d                          //GHASH final block - low
4056
4057	eor	v8.8b, v8.8b, v4.8b                          //GHASH final block - mid
4058
4059	eor	v9.16b, v9.16b, v20.16b                            //GHASH final block - high
4060
4061	pmull	v8.1q, v8.1d, v16.1d                          //GHASH final block - mid
4062
4063	eor	v11.16b, v11.16b, v21.16b                            //GHASH final block - low
4064
4065	eor	v10.16b, v10.16b, v8.16b                         //GHASH final block - mid
	// From here on: fold the high and mid accumulators into v11 using the
	// constant 0xc2 << 56 built in d8.
4066	movi	v8.8b, #0xc2
4067
4068	eor	v30.16b, v11.16b, v9.16b                         //MODULO - karatsuba tidy up
4069
4070	shl	d8, d8, #56               //mod_constant
4071
4072	eor	v10.16b, v10.16b, v30.16b                         //MODULO - karatsuba tidy up
4073
4074	pmull	v31.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid
	// x7 = (x7 & mask) | (existing & ~mask); then write the merged final block.
4075	orr	x7, x7, x5
4076	stp	x6, x7, [x2]
4077
4078	ext	v9.16b, v9.16b, v9.16b, #8                     //MODULO - other top alignment
4079
4080	eor	v10.16b, v10.16b, v31.16b                      //MODULO - fold into mid
4081
4082	eor	v10.16b, v10.16b, v9.16b                         //MODULO - fold into mid
4083
4084	pmull	v8.1q, v10.1d, v8.1d     //MODULO - mid 64b align with low
4085
4086	eor	v11.16b, v11.16b, v8.16b               //MODULO - fold into low
4087
4088	ext	v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment
4089
4090	eor	v11.16b, v11.16b, v10.16b                         //MODULO - fold into low
	// Swap the two halves and byte-reverse each before writing v11 to [x3];
	// this is the same ext #8 / rev64 pair the kernels apply when loading
	// from [x3].
4091	ext	v11.16b, v11.16b, v11.16b, #8
4092	rev64	v11.16b, v11.16b
	// Return value: x0 = x15.
4093	mov	x0, x15
4094	st1	{ v11.16b }, [x3]
4095
	// Epilogue: restore x19-x24 and d8-d15 from the 112-byte frame; the last
	// ldp post-increments sp to release it.
4096	ldp	x21, x22, [sp, #16]
4097	ldp	x23, x24, [sp, #32]
4098	ldp	d8, d9, [sp, #48]
4099	ldp	d10, d11, [sp, #64]
4100	ldp	d12, d13, [sp, #80]
4101	ldp	d14, d15, [sp, #96]
4102	ldp	x19, x20, [sp], #112
4103	ret
4104
// Early-return path of aes_gcm_dec_192_kernel: returns 0. Writing w0 also
// clears the upper 32 bits of x0. No registers are restored and sp is not
// adjusted here, so this label must only be reached before the frame is
// pushed.
// NOTE(review): the branch to this label is outside this excerpt; the
// 256-bit encrypt kernel uses `cbz x1, ...` as its first instruction for the
// equivalent path - confirm this kernel does the same.
4105.L192_dec_ret:
4106	mov	w0, #0x0
4107	ret
// Symbol size = distance from the function label to this point.
4108.size	aes_gcm_dec_192_kernel,.-aes_gcm_dec_192_kernel
// aes_gcm_enc_256_kernel - entry, key/hash-key loading and first four blocks.
//
// Register use established by the code in this section:
//   x0   input (plaintext) pointer; advanced by 64 once four blocks are loaded
//   x1   message length in bits (x1 >> 3 is used as byte_len); if zero the
//        function branches straight to .L256_enc_ret
//   x2   output pointer, post-incremented by every st1
//   x3   base of the hash data: 16 bytes at [x3] are loaded into v11, and
//        hash-key powers are read from [x3,#32], #64, #80 and #112
//   x4   on entry the counter-block pointer (copied to x16); then reused as
//        end_input_ptr = x0 + byte_len
//   x5   on entry the round-key pointer (copied to x8); then reused as the
//        main-loop end pointer = x0 + ((byte_len - 1) & ~63)
//   x8   round-key pointer, post-incremented by 16 per key loaded
//   x10  first 8 bytes of the counter block (shared by every counter block)
//   x11  low 32 bits of the second 8 bytes of the counter block
//   x12  32-bit block counter in byte-reversed form (rev_ctr32)
//   x9   scratch: high doubleword of the next counter block
//   x13,x14  the two halves of rk14 (last round key) kept in general registers
//   x15  byte_len
//   v18-v31  round keys rk0-rk13
//   v12-v15  hash-key powers h1-h4; v16 = h2k|h1k, v17 = h4k|h3k
//   v0-v3    AES state for counter blocks 0-3
//   v4-v7    ciphertext of blocks 0-3 (consumed by GHASH in the next stage)
//
// The frame is 112 bytes and holds x19-x24 and d8-d15.
// AARCH64_VALID_CALL_TARGET comes from arm_arch.h - presumably a BTI landing
// pad; confirm against that header.
4109.globl	aes_gcm_enc_256_kernel
4110.type	aes_gcm_enc_256_kernel,%function
4111.align	4
4112aes_gcm_enc_256_kernel:
4113	AARCH64_VALID_CALL_TARGET
4114	cbz	x1, .L256_enc_ret
	// Pre-indexed store: allocates the 112-byte frame and saves x19/x20 at
	// its base.
4115	stp	x19, x20, [sp, #-112]!
4116	mov	x16, x4
4117	mov	x8, x5
4118	stp	x21, x22, [sp, #16]
4119	stp	x23, x24, [sp, #32]
4120	stp	d8, d9, [sp, #48]
4121	stp	d10, d11, [sp, #64]
4122	stp	d12, d13, [sp, #80]
4123	stp	d14, d15, [sp, #96]
4124
4125	add	x4, x0, x1, lsr #3   //end_input_ptr
4126	lsr	x5, x1, #3              //byte_len
4127	mov	x15, x5
4128	ldp	x10, x11, [x16]              //ctr96_b64, ctr96_t32
4129#ifdef __AARCH64EB__
4130	rev	x10, x10
4131	rev	x11, x11
4132#endif
4133	ldp	x13, x14, [x8, #224]                     //load rk14
4134#ifdef __AARCH64EB__
4135	ror	x13, x13, #32
4136	ror	x14, x14, #32
4137#endif
4138	ld1	{ v0.16b}, [x16]                             //special case vector load initial counter so we can start first AES block as quickly as possible
4139	sub	x5, x5, #1      //byte_len - 1
4140
4141	ld1	{v18.4s}, [x8], #16                               //load rk0
4142	and	x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
4143
4144	ld1	{v19.4s}, [x8], #16                               //load rk1
4145	add	x5, x5, x0
4146
	// Split x11: the upper word goes to x12, then the 32-bit self-OR clears
	// bits 63:32 of x11 so only the low word remains.
4147	lsr	x12, x11, #32
4148	fmov	d2, x10                               //CTR block 2
4149	orr	w11, w11, w11
4150
4151	rev	w12, w12                                //rev_ctr32
	// The flags set here are consumed by the `b.ge .L256_enc_tail` at the end
	// of the key-loading sequence; no instruction in between writes NZCV.
4152	cmp	x0, x5                   //check if we have <= 4 blocks
4153	fmov	d1, x10                               //CTR block 1
4154
4155	aese	v0.16b, v18.16b
4156	aesmc	v0.16b, v0.16b          //AES block 0 - round 0
4157	add	w12, w12, #1                            //increment rev_ctr32
4158
	// Counter block N: low doubleword = x10, high doubleword =
	// x11 | (byte-swapped counter << 32).
4159	rev	w9, w12                                 //CTR block 1
4160	fmov	d3, x10                               //CTR block 3
4161
4162	orr	x9, x11, x9, lsl #32            //CTR block 1
4163	add	w12, w12, #1                            //CTR block 1
4164	ld1	{v20.4s}, [x8], #16                               //load rk2
4165
4166	fmov	v1.d[1], x9                               //CTR block 1
4167	rev	w9, w12                                 //CTR block 2
4168	add	w12, w12, #1                            //CTR block 2
4169
4170	orr	x9, x11, x9, lsl #32            //CTR block 2
4171	ld1	{v21.4s}, [x8], #16                               //load rk3
4172
4173	fmov	v2.d[1], x9                               //CTR block 2
4174	rev	w9, w12                                 //CTR block 3
4175
4176	aese	v0.16b, v19.16b
4177	aesmc	v0.16b, v0.16b          //AES block 0 - round 1
4178	orr	x9, x11, x9, lsl #32            //CTR block 3
4179
4180	fmov	v3.d[1], x9                               //CTR block 3
4181
4182	aese	v1.16b, v18.16b
4183	aesmc	v1.16b, v1.16b          //AES block 1 - round 0
4184	ld1	{v22.4s}, [x8], #16                               //load rk4
4185
4186	aese	v0.16b, v20.16b
4187	aesmc	v0.16b, v0.16b          //AES block 0 - round 2
4188	ld1	{v23.4s}, [x8], #16                               //load rk5
4189
4190	aese	v2.16b, v18.16b
4191	aesmc	v2.16b, v2.16b          //AES block 2 - round 0
4192	ld1	{v24.4s}, [x8], #16                               //load rk6
4193
4194	aese	v1.16b, v19.16b
4195	aesmc	v1.16b, v1.16b          //AES block 1 - round 1
4196	ldr	q14, [x3, #80]                         //load h3l | h3h
4197#ifndef __AARCH64EB__
4198	ext	v14.16b, v14.16b, v14.16b, #8
4199#endif
4200	aese	v3.16b, v18.16b
4201	aesmc	v3.16b, v3.16b          //AES block 3 - round 0
4202	ld1	{v25.4s}, [x8], #16                               //load rk7
4203
4204	aese	v2.16b, v19.16b
4205	aesmc	v2.16b, v2.16b          //AES block 2 - round 1
4206	ld1	{v26.4s}, [x8], #16                               //load rk8
4207
4208	aese	v1.16b, v20.16b
4209	aesmc	v1.16b, v1.16b          //AES block 1 - round 2
4210	ldr	q13, [x3, #64]                         //load h2l | h2h
4211#ifndef __AARCH64EB__
4212	ext	v13.16b, v13.16b, v13.16b, #8
4213#endif
4214	aese	v3.16b, v19.16b
4215	aesmc	v3.16b, v3.16b          //AES block 3 - round 1
4216	ld1	{v27.4s}, [x8], #16                               //load rk9
4217
4218	aese	v2.16b, v20.16b
4219	aesmc	v2.16b, v2.16b          //AES block 2 - round 2
4220	ldr	q15, [x3, #112]                        //load h4l | h4h
4221#ifndef __AARCH64EB__
4222	ext	v15.16b, v15.16b, v15.16b, #8
4223#endif
4224	aese	v1.16b, v21.16b
4225	aesmc	v1.16b, v1.16b          //AES block 1 - round 3
4226	ld1	{v28.4s}, [x8], #16                              //load rk10
4227
4228	aese	v3.16b, v20.16b
4229	aesmc	v3.16b, v3.16b          //AES block 3 - round 2
4230	ld1	{v29.4s}, [x8], #16                              //load rk11
4231
4232	aese	v2.16b, v21.16b
4233	aesmc	v2.16b, v2.16b          //AES block 2 - round 3
4234	add	w12, w12, #1                            //CTR block 3
4235
4236	aese	v0.16b, v21.16b
4237	aesmc	v0.16b, v0.16b          //AES block 0 - round 3
4238
4239	aese	v3.16b, v21.16b
4240	aesmc	v3.16b, v3.16b          //AES block 3 - round 3
	// v11 = 16 bytes at [x3] with the halves swapped (ext #8) and the bytes
	// of each half reversed (rev64); the main loop XORs it into the first
	// GHASH block (PRE 0 / PRE 1).
4241	ld1	{ v11.16b}, [x3]
4242	ext	v11.16b, v11.16b, v11.16b, #8
4243	rev64	v11.16b, v11.16b
4244
4245	aese	v2.16b, v22.16b
4246	aesmc	v2.16b, v2.16b          //AES block 2 - round 4
4247
4248	aese	v0.16b, v22.16b
4249	aesmc	v0.16b, v0.16b          //AES block 0 - round 4
4250
4251	aese	v1.16b, v22.16b
4252	aesmc	v1.16b, v1.16b          //AES block 1 - round 4
4253
4254	aese	v3.16b, v22.16b
4255	aesmc	v3.16b, v3.16b          //AES block 3 - round 4
4256
4257	aese	v0.16b, v23.16b
4258	aesmc	v0.16b, v0.16b          //AES block 0 - round 5
4259
4260	aese	v1.16b, v23.16b
4261	aesmc	v1.16b, v1.16b          //AES block 1 - round 5
4262
4263	aese	v3.16b, v23.16b
4264	aesmc	v3.16b, v3.16b          //AES block 3 - round 5
4265
4266	aese	v2.16b, v23.16b
4267	aesmc	v2.16b, v2.16b          //AES block 2 - round 5
4268
4269	aese	v1.16b, v24.16b
4270	aesmc	v1.16b, v1.16b          //AES block 1 - round 6
4271	trn2	v17.2d,  v14.2d,    v15.2d                      //h4l | h3l
4272
4273	aese	v3.16b, v24.16b
4274	aesmc	v3.16b, v3.16b          //AES block 3 - round 6
4275	ld1	{v30.4s}, [x8], #16                              //load rk12
4276
4277	aese	v0.16b, v24.16b
4278	aesmc	v0.16b, v0.16b          //AES block 0 - round 6
4279	ldr	q12, [x3, #32]                         //load h1l | h1h
4280#ifndef __AARCH64EB__
4281	ext	v12.16b, v12.16b, v12.16b, #8
4282#endif
4283	aese	v2.16b, v24.16b
4284	aesmc	v2.16b, v2.16b          //AES block 2 - round 6
4285	ld1	{v31.4s}, [x8], #16                              //load rk13
4286
4287	aese	v1.16b, v25.16b
4288	aesmc	v1.16b, v1.16b          //AES block 1 - round 7
4289	trn1	v9.2d, v14.2d,    v15.2d                      //h4h | h3h
4290
4291	aese	v0.16b, v25.16b
4292	aesmc	v0.16b, v0.16b          //AES block 0 - round 7
4293
4294	aese	v2.16b, v25.16b
4295	aesmc	v2.16b, v2.16b          //AES block 2 - round 7
4296
4297	aese	v3.16b, v25.16b
4298	aesmc	v3.16b, v3.16b          //AES block 3 - round 7
4299	trn2	v16.2d,  v12.2d,    v13.2d                      //h2l | h1l
4300
4301	aese	v1.16b, v26.16b
4302	aesmc	v1.16b, v1.16b          //AES block 1 - round 8
4303
4304	aese	v2.16b, v26.16b
4305	aesmc	v2.16b, v2.16b          //AES block 2 - round 8
4306
4307	aese	v3.16b, v26.16b
4308	aesmc	v3.16b, v3.16b          //AES block 3 - round 8
4309
4310	aese	v1.16b, v27.16b
4311	aesmc	v1.16b, v1.16b          //AES block 1 - round 9
4312
4313	aese	v2.16b, v27.16b
4314	aesmc	v2.16b, v2.16b          //AES block 2 - round 9
4315
4316	aese	v0.16b, v26.16b
4317	aesmc	v0.16b, v0.16b          //AES block 0 - round 8
4318
4319	aese	v1.16b, v28.16b
4320	aesmc	v1.16b, v1.16b          //AES block 1 - round 10
4321
4322	aese	v3.16b, v27.16b
4323	aesmc	v3.16b, v3.16b          //AES block 3 - round 9
4324
4325	aese	v0.16b, v27.16b
4326	aesmc	v0.16b, v0.16b          //AES block 0 - round 9
4327
4328	aese	v2.16b, v28.16b
4329	aesmc	v2.16b, v2.16b          //AES block 2 - round 10
4330
4331	aese	v3.16b, v28.16b
4332	aesmc	v3.16b, v3.16b          //AES block 3 - round 10
4333
4334	aese	v1.16b, v29.16b
4335	aesmc	v1.16b, v1.16b          //AES block 1 - round 11
4336
4337	aese	v2.16b, v29.16b
4338	aesmc	v2.16b, v2.16b          //AES block 2 - round 11
4339
4340	aese	v0.16b, v28.16b
4341	aesmc	v0.16b, v0.16b          //AES block 0 - round 10
4342
4343	aese	v1.16b, v30.16b
4344	aesmc	v1.16b, v1.16b          //AES block 1 - round 12
4345
4346	aese	v2.16b, v30.16b
4347	aesmc	v2.16b, v2.16b          //AES block 2 - round 12
4348
4349	aese	v0.16b, v29.16b
4350	aesmc	v0.16b, v0.16b          //AES block 0 - round 11
4351	eor	v17.16b, v17.16b, v9.16b                  //h4k | h3k
4352
4353	aese	v3.16b, v29.16b
4354	aesmc	v3.16b, v3.16b          //AES block 3 - round 11
4355
	// Round 13 is the last aese and has no aesmc. The final AddRoundKey
	// (rk14) is not applied in the vector unit; it is XORed into the
	// plaintext in general registers instead (x13/x14, below).
4356	aese	v2.16b, v31.16b                                     //AES block 2 - round 13
4357	trn1	v8.2d,    v12.2d,    v13.2d                      //h2h | h1h
4358
4359	aese	v0.16b, v30.16b
4360	aesmc	v0.16b, v0.16b          //AES block 0 - round 12
4361
4362	aese	v3.16b, v30.16b
4363	aesmc	v3.16b, v3.16b          //AES block 3 - round 12
4364
4365	aese	v1.16b, v31.16b                                     //AES block 1 - round 13
4366
4367	aese	v0.16b, v31.16b                                     //AES block 0 - round 13
4368
4369	aese	v3.16b, v31.16b                                     //AES block 3 - round 13
4370	eor	v16.16b, v16.16b, v8.16b                     //h2k | h1k
	// Uses the flags of the earlier `cmp x0, x5`. b.ge is a signed
	// comparison: taken when x0 >= x5, i.e. no full 64-byte group is left
	// for the main loop.
4371	b.ge	.L256_enc_tail                                    //handle tail
4372
	// First four plaintext blocks: load each as two 64-bit halves, XOR with
	// the rk14 halves (x13 low, x14 high), move into v4-v7 and XOR with the
	// AES state, so ciphertext = (plaintext ^ rk14) ^ state.
4373	ldp	x19, x20, [x0, #16]           //AES block 1 - load plaintext
4374#ifdef __AARCH64EB__
4375	rev	x19, x19
4376	rev	x20, x20
4377#endif
4378	rev	w9, w12                                 //CTR block 4
4379	ldp	x6, x7, [x0, #0]            //AES block 0 - load plaintext
4380#ifdef __AARCH64EB__
4381	rev	x6, x6
4382	rev	x7, x7
4383#endif
4384	ldp	x23, x24, [x0, #48]           //AES block 3 - load plaintext
4385#ifdef __AARCH64EB__
4386	rev	x23, x23
4387	rev	x24, x24
4388#endif
4389	ldp	x21, x22, [x0, #32]           //AES block 2 - load plaintext
4390#ifdef __AARCH64EB__
4391	rev	x21, x21
4392	rev	x22, x22
4393#endif
4394	add	x0, x0, #64                       //AES input_ptr update
4395
4396	eor	x19, x19, x13                     //AES block 1 - round 14 low
4397	eor	x20, x20, x14                     //AES block 1 - round 14 high
4398
4399	fmov	d5, x19                               //AES block 1 - mov low
4400	eor	x6, x6, x13                     //AES block 0 - round 14 low
4401
4402	eor	x7, x7, x14                     //AES block 0 - round 14 high
4403	eor	x24, x24, x14                     //AES block 3 - round 14 high
4404	fmov	d4, x6                               //AES block 0 - mov low
4405
	// The flags set here are consumed by the `b.ge .L256_enc_prepretail` at
	// the end of this section; nothing in between writes NZCV.
4406	cmp	x0, x5                   //check if we have <= 8 blocks
4407	fmov	v4.d[1], x7                           //AES block 0 - mov high
4408	eor	x23, x23, x13                     //AES block 3 - round 14 low
4409
4410	eor	x21, x21, x13                     //AES block 2 - round 14 low
4411	fmov	v5.d[1], x20                           //AES block 1 - mov high
4412
4413	fmov	d6, x21                               //AES block 2 - mov low
4414	add	w12, w12, #1                            //CTR block 4
4415
4416	orr	x9, x11, x9, lsl #32            //CTR block 4
4417	fmov	d7, x23                               //AES block 3 - mov low
4418	eor	x22, x22, x14                     //AES block 2 - round 14 high
4419
4420	fmov	v6.d[1], x22                           //AES block 2 - mov high
4421
	// Once a block's state has been consumed, its register is immediately
	// re-seeded with the next counter block (blocks 4-6 here).
4422	eor	v4.16b, v4.16b, v0.16b                          //AES block 0 - result
4423	fmov	d0, x10                               //CTR block 4
4424
4425	fmov	v0.d[1], x9                               //CTR block 4
4426	rev	w9, w12                                 //CTR block 5
4427	add	w12, w12, #1                            //CTR block 5
4428
4429	eor	v5.16b, v5.16b, v1.16b                          //AES block 1 - result
4430	fmov	d1, x10                               //CTR block 5
4431	orr	x9, x11, x9, lsl #32            //CTR block 5
4432
4433	fmov	v1.d[1], x9                               //CTR block 5
4434	rev	w9, w12                                 //CTR block 6
4435	st1	{ v4.16b}, [x2], #16                     //AES block 0 - store result
4436
4437	fmov	v7.d[1], x24                           //AES block 3 - mov high
4438	orr	x9, x11, x9, lsl #32            //CTR block 6
4439	eor	v6.16b, v6.16b, v2.16b                          //AES block 2 - result
4440
4441	st1	{ v5.16b}, [x2], #16                     //AES block 1 - store result
4442
4443	add	w12, w12, #1                            //CTR block 6
4444	fmov	d2, x10                               //CTR block 6
4445
4446	fmov	v2.d[1], x9                               //CTR block 6
4447	st1	{ v6.16b}, [x2], #16                     //AES block 2 - store result
4448	rev	w9, w12                                 //CTR block 7
4449
	// x9 is left holding the high doubleword of counter block 7; the next
	// stage moves it into v3.
4450	orr	x9, x11, x9, lsl #32            //CTR block 7
4451
4452	eor	v7.16b, v7.16b, v3.16b                          //AES block 3 - result
4453	st1	{ v7.16b}, [x2], #16                     //AES block 3 - store result
4454	b.ge	.L256_enc_prepretail                               //do prepretail
4455
// Main loop of aes_gcm_enc_256_kernel: one iteration per 64 bytes of input.
// Two independent streams are interleaved instruction by instruction:
//   AES    - runs rounds 0-13 (round keys v18-v31) on the four counter
//            blocks in v0-v3. The next four plaintext blocks are loaded into
//            general registers, XORed with the rk14 halves (x13/x14), moved
//            into v4-v7, XORed with the AES state and stored through x2.
//   GHASH  - byte-reverses the four ciphertext blocks left in v4-v7 by the
//            previous iteration (or by the entry code) and XORs the running
//            value v11 into the first one. They are multiplied by v15, v14,
//            v13, v12 (low/high products) and by the halves of v17/v16
//            (middle products) into v9 (high), v10 (mid), v11 (low), then
//            reduced into v11 with the constant 0xc2 << 56.
// v4-v8 double as GHASH temporaries, so each is only overwritten with new
// AES data once its GHASH use is finished.
// On entry x9 holds the high doubleword of counter block 4k+3, computed at the
// end of the previous iteration or by the entry code.
// The loop repeats while x0 < x5 (signed compare).
4456.L256_enc_main_loop:	//main	loop start
4457	aese	v0.16b, v18.16b
4458	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 0
4459	rev64	v4.16b, v4.16b                                    //GHASH block 4k (only t0 is free)
4460
4461	aese	v1.16b, v18.16b
4462	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 0
4463	fmov	d3, x10                               //CTR block 4k+3
4464
4465	aese	v2.16b, v18.16b
4466	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 0
4467	ext	v11.16b, v11.16b, v11.16b, #8                     //PRE 0
4468
4469	aese	v0.16b, v19.16b
4470	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 1
4471	fmov	v3.d[1], x9                               //CTR block 4k+3
4472
4473	aese	v1.16b, v19.16b
4474	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 1
4475	ldp	x23, x24, [x0, #48]           //AES block 4k+7 - load plaintext
4476#ifdef __AARCH64EB__
4477	rev	x23, x23
4478	rev	x24, x24
4479#endif
4480	aese	v2.16b, v19.16b
4481	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 1
4482	ldp	x21, x22, [x0, #32]           //AES block 4k+6 - load plaintext
4483#ifdef __AARCH64EB__
4484	rev	x21, x21
4485	rev	x22, x22
4486#endif
4487	aese	v0.16b, v20.16b
4488	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 2
4489	eor	v4.16b, v4.16b, v11.16b                           //PRE 1
4490
4491	aese	v1.16b, v20.16b
4492	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 2
4493
4494	aese	v3.16b, v18.16b
4495	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 0
4496	eor	x23, x23, x13                     //AES block 4k+7 - round 14 low
4497
4498	aese	v0.16b, v21.16b
4499	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 3
4500	mov	d10, v17.d[1]                               //GHASH block 4k - mid
4501
4502	pmull2	v9.1q, v4.2d, v15.2d                       //GHASH block 4k - high
4503	eor	x22, x22, x14                     //AES block 4k+6 - round 14 high
4504	mov	d8, v4.d[1]                                  //GHASH block 4k - mid
4505
4506	aese	v3.16b, v19.16b
4507	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 1
4508	rev64	v5.16b, v5.16b                                    //GHASH block 4k+1 (t0 and t1 free)
4509
4510	aese	v0.16b, v22.16b
4511	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 4
4512
4513	pmull	v11.1q, v4.1d, v15.1d                       //GHASH block 4k - low
4514	eor	v8.8b, v8.8b, v4.8b                          //GHASH block 4k - mid
4515
4516	aese	v2.16b, v20.16b
4517	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 2
4518
4519	aese	v0.16b, v23.16b
4520	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 5
4521	rev64	v7.16b, v7.16b                                    //GHASH block 4k+3 (t0, t1, t2 and t3 free)
4522
4523	pmull2	v4.1q, v5.2d, v14.2d                          //GHASH block 4k+1 - high
4524
4525	pmull	v10.1q, v8.1d, v10.1d                      //GHASH block 4k - mid
4526	rev64	v6.16b, v6.16b                                    //GHASH block 4k+2 (t0, t1, and t2 free)
4527
4528	pmull	v8.1q, v5.1d, v14.1d                          //GHASH block 4k+1 - low
4529
4530	eor	v9.16b, v9.16b, v4.16b                         //GHASH block 4k+1 - high
4531	mov	d4, v5.d[1]                                  //GHASH block 4k+1 - mid
4532
4533	aese	v1.16b, v21.16b
4534	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 3
4535
4536	aese	v3.16b, v20.16b
4537	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 2
4538	eor	v11.16b, v11.16b, v8.16b                         //GHASH block 4k+1 - low
4539
4540	aese	v2.16b, v21.16b
4541	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 3
4542
4543	aese	v1.16b, v22.16b
4544	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 4
4545	mov	d8, v6.d[1]                                  //GHASH block 4k+2 - mid
4546
4547	aese	v3.16b, v21.16b
4548	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 3
4549	eor	v4.8b, v4.8b, v5.8b                          //GHASH block 4k+1 - mid
4550
4551	aese	v2.16b, v22.16b
4552	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 4
4553
4554	aese	v0.16b, v24.16b
4555	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 6
4556	eor	v8.8b, v8.8b, v6.8b                          //GHASH block 4k+2 - mid
4557
4558	aese	v3.16b, v22.16b
4559	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 4
4560
4561	pmull	v4.1q, v4.1d, v17.1d                          //GHASH block 4k+1 - mid
4562
4563	aese	v0.16b, v25.16b
4564	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 7
4565
4566	aese	v3.16b, v23.16b
4567	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 5
	// Duplicate the middle operand into the upper lane so the pmull2 below
	// can multiply it by the upper half of v16.
4568	ins	v8.d[1], v8.d[0]                                //GHASH block 4k+2 - mid
4569
4570	aese	v1.16b, v23.16b
4571	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 5
4572
4573	aese	v0.16b, v26.16b
4574	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 8
4575
4576	aese	v2.16b, v23.16b
4577	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 5
4578
4579	aese	v1.16b, v24.16b
4580	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 6
4581	eor	v10.16b, v10.16b, v4.16b                         //GHASH block 4k+1 - mid
4582
4583	pmull2	v4.1q, v6.2d, v13.2d                          //GHASH block 4k+2 - high
4584
4585	pmull	v5.1q, v6.1d, v13.1d                          //GHASH block 4k+2 - low
4586
4587	aese	v1.16b, v25.16b
4588	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 7
4589
4590	pmull	v6.1q, v7.1d, v12.1d                          //GHASH block 4k+3 - low
4591	eor	v9.16b, v9.16b, v4.16b                         //GHASH block 4k+2 - high
4592
4593	aese	v3.16b, v24.16b
4594	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 6
4595	ldp	x19, x20, [x0, #16]           //AES block 4k+5 - load plaintext
4596#ifdef __AARCH64EB__
4597	rev	x19, x19
4598	rev	x20, x20
4599#endif
4600	aese	v1.16b, v26.16b
4601	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 8
4602	mov	d4, v7.d[1]                                  //GHASH block 4k+3 - mid
4603
4604	aese	v2.16b, v24.16b
4605	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 6
4606	eor	v11.16b, v11.16b, v5.16b                         //GHASH block 4k+2 - low
4607
4608	pmull2	v8.1q, v8.2d, v16.2d                          //GHASH block 4k+2 - mid
4609
4610	pmull2	v5.1q, v7.2d, v12.2d                          //GHASH block 4k+3 - high
4611	eor	v4.8b, v4.8b, v7.8b                          //GHASH block 4k+3 - mid
4612
4613	aese	v2.16b, v25.16b
4614	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 7
4615	eor	x19, x19, x13                     //AES block 4k+5 - round 14 low
4616
4617	aese	v1.16b, v27.16b
4618	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 9
4619	eor	v10.16b, v10.16b, v8.16b                         //GHASH block 4k+2 - mid
4620
4621	aese	v3.16b, v25.16b
4622	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 7
4623	eor	x21, x21, x13                     //AES block 4k+6 - round 14 low
4624
4625	aese	v0.16b, v27.16b
4626	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 9
	// v8 is free as a GHASH temporary from here; together with the shl
	// below it becomes the reduction constant 0xc2 << 56.
4627	movi	v8.8b, #0xc2
4628
4629	pmull	v4.1q, v4.1d, v16.1d                          //GHASH block 4k+3 - mid
4630	eor	v9.16b, v9.16b, v5.16b                         //GHASH block 4k+3 - high
4631	fmov	d5, x19                               //AES block 4k+5 - mov low
4632
4633	aese	v2.16b, v26.16b
4634	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 8
4635	ldp	x6, x7, [x0, #0]            //AES block 4k+4 - load plaintext
4636#ifdef __AARCH64EB__
4637	rev	x6, x6
4638	rev	x7, x7
4639#endif
4640	aese	v0.16b, v28.16b
4641	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 10
4642	shl	d8, d8, #56               //mod_constant
4643
4644	aese	v3.16b, v26.16b
4645	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 8
4646	eor	v11.16b, v11.16b, v6.16b                         //GHASH block 4k+3 - low
4647
4648	aese	v2.16b, v27.16b
4649	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 9
4650
4651	aese	v1.16b, v28.16b
4652	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 10
4653	eor	v10.16b, v10.16b, v4.16b                         //GHASH block 4k+3 - mid
4654
4655	aese	v3.16b, v27.16b
4656	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 9
4657	add	w12, w12, #1                            //CTR block 4k+3
4658
4659	aese	v0.16b, v29.16b
4660	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 11
4661	eor	v4.16b, v11.16b, v9.16b                         //MODULO - karatsuba tidy up
4662
4663	aese	v1.16b, v29.16b
4664	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 11
	// All four ldp loads of this iteration have been issued, so the input
	// pointer can advance.
4665	add	x0, x0, #64                       //AES input_ptr update
4666
4667	pmull	v7.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid
4668	rev	w9, w12                                 //CTR block 4k+8
4669	ext	v9.16b, v9.16b, v9.16b, #8                     //MODULO - other top alignment
4670
4671	aese	v2.16b, v28.16b
4672	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 10
4673	eor	x6, x6, x13                     //AES block 4k+4 - round 14 low
4674
4675	aese	v1.16b, v30.16b
4676	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 12
4677	eor	v10.16b, v10.16b, v4.16b                         //MODULO - karatsuba tidy up
4678
4679	aese	v3.16b, v28.16b
4680	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 10
4681	eor	x7, x7, x14                     //AES block 4k+4 - round 14 high
4682
4683	fmov	d4, x6                               //AES block 4k+4 - mov low
4684	orr	x9, x11, x9, lsl #32            //CTR block 4k+8
4685	eor	v7.16b, v9.16b, v7.16b                   //MODULO - fold into mid
4686
4687	aese	v0.16b, v30.16b
4688	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 12
4689	eor	x20, x20, x14                     //AES block 4k+5 - round 14 high
4690
4691	aese	v2.16b, v29.16b
4692	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 11
4693	eor	x24, x24, x14                     //AES block 4k+7 - round 14 high
4694
4695	aese	v3.16b, v29.16b
4696	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 11
4697	add	w12, w12, #1                            //CTR block 4k+8
4698
	// Round 13 is the last aese and has no aesmc; rk14 was already XORed
	// into the plaintext halves in general registers.
4699	aese	v0.16b, v31.16b                                     //AES block 4k+4 - round 13
4700	fmov	v4.d[1], x7                           //AES block 4k+4 - mov high
4701	eor	v10.16b, v10.16b, v7.16b                      //MODULO - fold into mid
4702
4703	aese	v2.16b, v30.16b
4704	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 12
4705	fmov	d7, x23                               //AES block 4k+7 - mov low
4706
4707	aese	v1.16b, v31.16b                                     //AES block 4k+5 - round 13
4708	fmov	v5.d[1], x20                           //AES block 4k+5 - mov high
4709
4710	fmov	d6, x21                               //AES block 4k+6 - mov low
	// The flags set here are consumed by the `b.lt` at the bottom of the
	// loop; none of the instructions in between writes NZCV.
4711	cmp	x0, x5                   //.LOOP CONTROL
4712
4713	fmov	v6.d[1], x22                           //AES block 4k+6 - mov high
4714
4715	pmull	v9.1q, v10.1d, v8.1d            //MODULO - mid 64b align with low
4716	eor	v4.16b, v4.16b, v0.16b                          //AES block 4k+4 - result
4717	fmov	d0, x10                               //CTR block 4k+8
4718
4719	fmov	v0.d[1], x9                               //CTR block 4k+8
4720	rev	w9, w12                                 //CTR block 4k+9
4721	add	w12, w12, #1                            //CTR block 4k+9
4722
4723	eor	v5.16b, v5.16b, v1.16b                          //AES block 4k+5 - result
4724	fmov	d1, x10                               //CTR block 4k+9
4725	orr	x9, x11, x9, lsl #32            //CTR block 4k+9
4726
4727	aese	v3.16b, v30.16b
4728	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 12
4729	fmov	v1.d[1], x9                               //CTR block 4k+9
4730
4731	aese	v2.16b, v31.16b                                     //AES block 4k+6 - round 13
4732	rev	w9, w12                                 //CTR block 4k+10
4733	st1	{ v4.16b}, [x2], #16                     //AES block 4k+4 - store result
4734
4735	orr	x9, x11, x9, lsl #32            //CTR block 4k+10
4736	eor	v11.16b, v11.16b, v9.16b                         //MODULO - fold into low
4737	fmov	v7.d[1], x24                           //AES block 4k+7 - mov high
4738
4739	ext	v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment
4740	st1	{ v5.16b}, [x2], #16                     //AES block 4k+5 - store result
4741	add	w12, w12, #1                            //CTR block 4k+10
4742
4743	aese	v3.16b, v31.16b                                     //AES block 4k+7 - round 13
4744	eor	v6.16b, v6.16b, v2.16b                          //AES block 4k+6 - result
4745	fmov	d2, x10                               //CTR block 4k+10
4746
4747	st1	{ v6.16b}, [x2], #16                     //AES block 4k+6 - store result
4748	fmov	v2.d[1], x9                               //CTR block 4k+10
4749	rev	w9, w12                                 //CTR block 4k+11
4750
4751	eor	v11.16b, v11.16b, v10.16b                         //MODULO - fold into low
	// x9 is left holding the high doubleword of counter block 4k+11; the
	// next iteration (or the code after the loop) moves it into v3.
4752	orr	x9, x11, x9, lsl #32            //CTR block 4k+11
4753
4754	eor	v7.16b, v7.16b, v3.16b                          //AES block 4k+7 - result
4755	st1	{ v7.16b}, [x2], #16                     //AES block 4k+7 - store result
4756	b.lt	.L256_enc_main_loop
4757
// PREPRETAIL (aes_gcm_enc_256_kernel)
// Entered by fall-through when the b.lt at the bottom of the main loop is not taken.
// This section performs no loads or stores. It interleaves two independent jobs:
//   * AES: finishes building the counter block in v3 (from x10 and x9), bumps w12, then
//     runs v0-v3 through aese/aesmc with v18..v30 (rounds 0-12) and a final aese with
//     v31 (round 13, no aesmc). The last round-key XOR is not applied here.
//   * GHASH: byte-reverses the four blocks in v4-v7 (rev64), folds the running value
//     v11 into v4, and accumulates low/mid/high partial products in v11/v10/v9 using
//     v15, v14, v13, v12 (low and high) and v17, v16 (mid), followed by the modular
//     reduction with the constant built in v8. The reduced value ends up in v11.
// NOTE(review): v12-v17 are presumably the hash-key powers and their folded halves set
// up at function entry, which is outside this section - confirm there.
// Falls through into .L256_enc_tail.
4758.L256_enc_prepretail:	//PREPRETAIL
4759	aese	v1.16b, v18.16b
4760	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 0
4761	rev64	v6.16b, v6.16b                                    //GHASH block 4k+2 (t0, t1, and t2 free)
4762
4763	aese	v2.16b, v18.16b
4764	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 0
4765	fmov	d3, x10                               //CTR block 4k+3
4766
4767	aese	v0.16b, v18.16b
4768	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 0
4769	rev64	v4.16b, v4.16b                                    //GHASH block 4k (only t0 is free)
4770
4771	fmov	v3.d[1], x9                               //CTR block 4k+3
4772	ext	v11.16b, v11.16b, v11.16b, #8                     //PRE 0
4773
4774	aese	v2.16b, v19.16b
4775	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 1
4776
4777	aese	v0.16b, v19.16b
4778	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 1
4779
4780	eor	v4.16b, v4.16b, v11.16b                           //PRE 1
4781	rev64	v5.16b, v5.16b                                    //GHASH block 4k+1 (t0 and t1 free)
4782
4783	aese	v2.16b, v20.16b
4784	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 2
4785
4786	aese	v3.16b, v18.16b
4787	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 0
4788	mov	d10, v17.d[1]                               //GHASH block 4k - mid
4789
4790	aese	v1.16b, v19.16b
4791	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 1
4792
4793	pmull	v11.1q, v4.1d, v15.1d                       //GHASH block 4k - low
4794	mov	d8, v4.d[1]                                  //GHASH block 4k - mid
4795
4796	pmull2	v9.1q, v4.2d, v15.2d                       //GHASH block 4k - high
4797
4798	aese	v2.16b, v21.16b
4799	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 3
4800
4801	aese	v1.16b, v20.16b
4802	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 2
4803	eor	v8.8b, v8.8b, v4.8b                          //GHASH block 4k - mid
4804
4805	aese	v0.16b, v20.16b
4806	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 2
4807
4808	aese	v3.16b, v19.16b
4809	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 1
4810
4811	aese	v1.16b, v21.16b
4812	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 3
4813
4814	pmull	v10.1q, v8.1d, v10.1d                      //GHASH block 4k - mid
4815
4816	pmull2	v4.1q, v5.2d, v14.2d                          //GHASH block 4k+1 - high
4817
4818	pmull	v8.1q, v5.1d, v14.1d                          //GHASH block 4k+1 - low
4819
4820	aese	v3.16b, v20.16b
4821	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 2
4822
4823	eor	v9.16b, v9.16b, v4.16b                         //GHASH block 4k+1 - high
4824	mov	d4, v5.d[1]                                  //GHASH block 4k+1 - mid
4825
4826	aese	v0.16b, v21.16b
4827	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 3
4828	eor	v11.16b, v11.16b, v8.16b                         //GHASH block 4k+1 - low
4829
4830	aese	v3.16b, v21.16b
4831	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 3
4832
4833	eor	v4.8b, v4.8b, v5.8b                          //GHASH block 4k+1 - mid
4834	mov	d8, v6.d[1]                                  //GHASH block 4k+2 - mid
4835
4836	aese	v0.16b, v22.16b
4837	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 4
4838	rev64	v7.16b, v7.16b                                    //GHASH block 4k+3 (t0, t1, t2 and t3 free)
4839
4840	aese	v3.16b, v22.16b
4841	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 4
4842
4843	pmull	v4.1q, v4.1d, v17.1d                          //GHASH block 4k+1 - mid
4844	eor	v8.8b, v8.8b, v6.8b                          //GHASH block 4k+2 - mid
4845	add	w12, w12, #1                            //CTR block 4k+3
4846
4847	pmull	v5.1q, v6.1d, v13.1d                          //GHASH block 4k+2 - low
4848
4849	aese	v3.16b, v23.16b
4850	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 5
4851
4852	aese	v2.16b, v22.16b
4853	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 4
4854	eor	v10.16b, v10.16b, v4.16b                         //GHASH block 4k+1 - mid
4855
4856	pmull2	v4.1q, v6.2d, v13.2d                          //GHASH block 4k+2 - high
4857
4858	eor	v11.16b, v11.16b, v5.16b                         //GHASH block 4k+2 - low
4859	ins	v8.d[1], v8.d[0]                                //GHASH block 4k+2 - mid
4860
4861	aese	v2.16b, v23.16b
4862	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 5
4863
4864	eor	v9.16b, v9.16b, v4.16b                         //GHASH block 4k+2 - high
4865	mov	d4, v7.d[1]                                  //GHASH block 4k+3 - mid
4866
4867	aese	v1.16b, v22.16b
4868	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 4
4869
4870	pmull2	v8.1q, v8.2d, v16.2d                          //GHASH block 4k+2 - mid
4871
4872	eor	v4.8b, v4.8b, v7.8b                          //GHASH block 4k+3 - mid
4873
4874	pmull2	v5.1q, v7.2d, v12.2d                          //GHASH block 4k+3 - high
4875
4876	aese	v1.16b, v23.16b
4877	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 5
4878
4879	pmull	v4.1q, v4.1d, v16.1d                          //GHASH block 4k+3 - mid
4880	eor	v10.16b, v10.16b, v8.16b                         //GHASH block 4k+2 - mid
4881
4882	aese	v0.16b, v23.16b
4883	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 5
4884
4885	aese	v1.16b, v24.16b
4886	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 6
4887
4888	aese	v2.16b, v24.16b
4889	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 6
4890
4891	aese	v0.16b, v24.16b
4892	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 6
4893	movi	v8.8b, #0xc2                                      //MODULO - start of mod_constant (shifted left by 56 below)
4894
4895	aese	v3.16b, v24.16b
4896	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 6
4897
4898	aese	v1.16b, v25.16b
4899	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 7
4900	eor	v9.16b, v9.16b, v5.16b                         //GHASH block 4k+3 - high
4901
4902	aese	v0.16b, v25.16b
4903	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 7
4904
4905	aese	v3.16b, v25.16b
4906	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 7
4907	shl	d8, d8, #56               //mod_constant
4908
4909	aese	v1.16b, v26.16b
4910	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 8
4911	eor	v10.16b, v10.16b, v4.16b                         //GHASH block 4k+3 - mid
4912
4913	pmull	v6.1q, v7.1d, v12.1d                          //GHASH block 4k+3 - low
4914
4915	aese	v3.16b, v26.16b
4916	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 8
4917
4918	aese	v1.16b, v27.16b
4919	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 9
4920
4921	aese	v0.16b, v26.16b
4922	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 8
4923	eor	v11.16b, v11.16b, v6.16b                         //GHASH block 4k+3 - low
4924
4925	aese	v3.16b, v27.16b
4926	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 9
4927
4928	eor	v10.16b, v10.16b, v9.16b                         //karatsuba tidy up
4929
4930	pmull	v4.1q, v9.1d, v8.1d                               //MODULO - top 64b align with mid
4931	ext	v9.16b, v9.16b, v9.16b, #8                        //MODULO - other top alignment
4932
4933	aese	v3.16b, v28.16b
4934	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 10
4935
4936	aese	v2.16b, v25.16b
4937	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 7
4938	eor	v10.16b, v10.16b, v11.16b                         //karatsuba tidy up (mid also absorbs low)
4939
4940	aese	v1.16b, v28.16b
4941	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 10
4942
4943	aese	v0.16b, v27.16b
4944	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 9
4945
4946	aese	v2.16b, v26.16b
4947	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 8
4948
4949	aese	v1.16b, v29.16b
4950	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 11
4951	eor	v10.16b, v10.16b, v4.16b                          //MODULO - fold into mid
4952
4953	aese	v0.16b, v28.16b
4954	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 10
4955
4956	aese	v2.16b, v27.16b
4957	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 9
4958
4959	aese	v1.16b, v30.16b
4960	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 12
4961
4962	aese	v0.16b, v29.16b
4963	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 11
4964	eor	v10.16b, v10.16b, v9.16b                          //MODULO - fold into mid
4965
4966	aese	v3.16b, v29.16b
4967	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 11
4968
4969	aese	v2.16b, v28.16b
4970	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 10
4971
4972	aese	v0.16b, v30.16b
4973	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 12
4974
4975	pmull	v4.1q, v10.1d, v8.1d                              //MODULO - mid 64b align with low
4976
4977	aese	v2.16b, v29.16b
4978	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 11
4979	ext	v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment
4980
4981	aese	v3.16b, v30.16b
4982	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 12
4983
4984	aese	v1.16b, v31.16b                                     //AES block 4k+5 - round 13
4985	eor	v11.16b, v11.16b, v4.16b                          //MODULO - fold into low
4986
4987	aese	v2.16b, v30.16b
4988	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 12
4989
4990	aese	v3.16b, v31.16b                                     //AES block 4k+7 - round 13
4991
4992	aese	v0.16b, v31.16b                                     //AES block 4k+4 - round 13
4993
4994	aese	v2.16b, v31.16b                                     //AES block 4k+6 - round 13
4995	eor	v11.16b, v11.16b, v10.16b                         //MODULO - fold into low
// TAIL (aes_gcm_enc_256_kernel): dispatch on the number of bytes still to process.
//   v8 = v11 with its two 64-bit halves swapped; it is XORed into the first block that
//        gets hashed below and then zeroed so that it is only fed in once.
//   x5 = x4 - x0 = bytes left.
// The first remaining input block is always loaded as a full 16 bytes from x0, XORed with
// x13/x14 (the halves the existing comments call round 14) and with v0, giving v5.
// Dispatch (signed compares on the byte count in x5):
//   x5 > 48  -> .L256_enc_blocks_more_than_3 (v1, v2, v3 used as they are)
//   x5 > 32  -> .L256_enc_blocks_more_than_2 after v3 = v2, v2 = v1, w12 -= 1
//   x5 > 16  -> .L256_enc_blocks_more_than_1 after v3 = v1, w12 -= 2 in total
//   else     -> .L256_enc_blocks_less_than_1 after w12 -= 3 in total
// On every path except the first, the accumulators v11, v9 and v10 are zeroed here. The
// more_than_3 path needs no zeroing because it overwrites all three with fresh products.
// NOTE(review): the w12 decrements presumably undo counter increments for blocks that
// were prepared but are not consumed on the shorter paths - confirm against the main loop.
4996.L256_enc_tail:	//TAIL
4997
4998	ext	v8.16b, v11.16b, v11.16b, #8                     //prepare final partial tag
4999	sub	x5, x4, x0   //main_end_input_ptr is number of bytes left to process
5000	ldp	x6, x7, [x0], #16           //AES block 4k+4 - load plaintext
5001#ifdef __AARCH64EB__
5002	rev	x6, x6
5003	rev	x7, x7
5004#endif
5005	eor	x6, x6, x13                     //AES block 4k+4 - round 14 low
5006	eor	x7, x7, x14                     //AES block 4k+4 - round 14 high
5007
5008	cmp	x5, #48
5009	fmov	d4, x6                               //AES block 4k+4 - mov low
5010
5011	fmov	v4.d[1], x7                           //AES block 4k+4 - mov high
5012
5013	eor	v5.16b, v4.16b, v0.16b                          //AES block 4k+4 - result
5014	b.gt	.L256_enc_blocks_more_than_3
5015
5016	cmp	x5, #32
5017	mov	v3.16b, v2.16b                                  //shift the unused keystream registers up by one slot
5018	movi	v11.8b, #0                                      //clear GHASH low accumulator
5019
5020	movi	v9.8b, #0                                       //clear GHASH high accumulator
5021	sub	w12, w12, #1
5022
5023	mov	v2.16b, v1.16b
5024	movi	v10.8b, #0                                      //clear GHASH mid accumulator
5025	b.gt	.L256_enc_blocks_more_than_2
5026
5027	mov	v3.16b, v1.16b                                  //only one more block follows, and it is combined with v3
5028	sub	w12, w12, #1
5029	cmp	x5, #16
5030
5031	b.gt	.L256_enc_blocks_more_than_1
5032
5033	sub	w12, w12, #1
5034	b	.L256_enc_blocks_less_than_1
// More than three blocks left: emit the block already in v5, hash it, prepare the next one.
//   * stores v5 to the output at x2 (post-incremented by 16)
//   * loads the next 16 input bytes from x0, XORs them with x13/x14 and then with v1 -> new v5
//   * GHASH of the stored block: v4 = rev64(v5) ^ v8, then v8 is zeroed; the low and high
//     products with v15 overwrite v11 and v9, and the mid product
//     (v4.hi ^ v4.lo) x v17.d[1] overwrites v10
// v22 is used as scratch here; no aese instruction follows in the rest of this function.
// Falls through into .L256_enc_blocks_more_than_2.
5035.L256_enc_blocks_more_than_3:	//blocks	left >  3
5036	st1	{ v5.16b}, [x2], #16                    //AES final-3 block  - store result
5037
5038	ldp	x6, x7, [x0], #16          //AES final-2 block - load input low & high
5039#ifdef __AARCH64EB__
5040	rev	x6, x6
5041	rev	x7, x7
5042#endif
5043	rev64	v4.16b, v5.16b                                   //GHASH final-3 block
5044
5045	eor	x6, x6, x13                    //AES final-2 block - round 14 low
5046	eor	v4.16b, v4.16b, v8.16b                          //feed in partial tag
5047
5048	eor	x7, x7, x14                    //AES final-2 block - round 14 high
5049
5050	mov	d22, v4.d[1]                                //GHASH final-3 block - mid
5051	fmov	d5, x6                                //AES final-2 block - mov low
5052
5053	fmov	v5.d[1], x7                            //AES final-2 block - mov high
5054
5055	eor	v22.8b, v22.8b, v4.8b                     //GHASH final-3 block - mid
5056	movi	v8.8b, #0                                       //suppress further partial tag feed in
5057
5058	mov	d10, v17.d[1]                              //GHASH final-3 block - mid
5059
5060	pmull	v11.1q, v4.1d, v15.1d                      //GHASH final-3 block - low
5061
5062	pmull2	v9.1q, v4.2d, v15.2d                      //GHASH final-3 block - high
5063
5064	pmull	v10.1q, v22.1d, v10.1d                   //GHASH final-3 block - mid
5065	eor	v5.16b, v5.16b, v1.16b                           //AES final-2 block - result
// More than two blocks left: emit the block in v5, hash it, prepare the next one.
//   * stores v5 to the output at x2 (post-incremented by 16)
//   * loads the next 16 input bytes from x0, XORs them with x13/x14 and then with v2 -> new v5
//   * GHASH of the stored block: v4 = rev64(v5) ^ v8 (v8 is zero unless this is the first
//     block hashed in the tail), then v8 is zeroed; the products with v14 (low and high)
//     and with the low half of v17 (mid) are XORed into v11, v9 and v10
// v20, v21 and v22 are scratch. Falls through into .L256_enc_blocks_more_than_1.
5066.L256_enc_blocks_more_than_2:	//blocks	left >  2
5067
5068	st1	{ v5.16b}, [x2], #16                    //AES final-2 block - store result
5069
5070	ldp	x6, x7, [x0], #16          //AES final-1 block - load input low & high
5071#ifdef __AARCH64EB__
5072	rev	x6, x6
5073	rev	x7, x7
5074#endif
5075	rev64	v4.16b, v5.16b                                   //GHASH final-2 block
5076
5077	eor	x6, x6, x13                    //AES final-1 block - round 14 low
5078	eor	v4.16b, v4.16b, v8.16b                          //feed in partial tag
5079
5080	fmov	d5, x6                                //AES final-1 block - mov low
5081	eor	x7, x7, x14                    //AES final-1 block - round 14 high
5082
5083	fmov	v5.d[1], x7                            //AES final-1 block - mov high
5084
5085	movi	v8.8b, #0                                       //suppress further partial tag feed in
5086
5087	pmull2	v20.1q, v4.2d, v14.2d                         //GHASH final-2 block - high
5088	mov	d22, v4.d[1]                                //GHASH final-2 block - mid
5089
5090	pmull	v21.1q, v4.1d, v14.1d                         //GHASH final-2 block - low
5091
5092	eor	v22.8b, v22.8b, v4.8b                     //GHASH final-2 block - mid
5093
5094	eor	v5.16b, v5.16b, v2.16b                           //AES final-1 block - result
5095
5096	eor	v9.16b, v9.16b, v20.16b                           //GHASH final-2 block - high
5097
5098	pmull	v22.1q, v22.1d, v17.1d                     //GHASH final-2 block - mid
5099
5100	eor	v11.16b, v11.16b, v21.16b                           //GHASH final-2 block - low
5101
5102	eor	v10.16b, v10.16b, v22.16b                      //GHASH final-2 block - mid
// More than one block left: emit the block in v5, hash it, prepare the final block.
//   * stores v5 to the output at x2 (post-incremented by 16)
//   * loads the final 16 input bytes from x0 (always a full 16-byte load), XORs them
//     with x13/x14 and then with v3 -> new v5
//   * GHASH of the stored block: v4 = rev64(v5) ^ v8, then v8 is zeroed; the products with
//     v13 (low and high) are XORed into v11 and v9. For the mid term, (v4.hi ^ v4.lo) is
//     copied into the upper lane of v22 so that pmull2 can multiply it by the upper half
//     of v16; the result is XORed into v10.
// v20, v21 and v22 are scratch. Falls through into .L256_enc_blocks_less_than_1.
5103.L256_enc_blocks_more_than_1:	//blocks	left >  1
5104
5105	st1	{ v5.16b}, [x2], #16                    //AES final-1 block - store result
5106
5107	rev64	v4.16b, v5.16b                                   //GHASH final-1 block
5108
5109	ldp	x6, x7, [x0], #16          //AES final block - load input low & high
5110#ifdef __AARCH64EB__
5111	rev	x6, x6
5112	rev	x7, x7
5113#endif
5114	eor	v4.16b, v4.16b, v8.16b                          //feed in partial tag
5115
5116	movi	v8.8b, #0                                       //suppress further partial tag feed in
5117
5118	eor	x6, x6, x13                    //AES final block - round 14 low
5119	mov	d22, v4.d[1]                                //GHASH final-1 block - mid
5120
5121	pmull2	v20.1q, v4.2d, v13.2d                         //GHASH final-1 block - high
5122	eor	x7, x7, x14                    //AES final block - round 14 high
5123
5124	eor	v22.8b, v22.8b, v4.8b                     //GHASH final-1 block - mid
5125
5126	eor	v9.16b, v9.16b, v20.16b                           //GHASH final-1 block - high
5127
5128	ins	v22.d[1], v22.d[0]                           //GHASH final-1 block - mid
5129	fmov	d5, x6                                //AES final block - mov low
5130
5131	fmov	v5.d[1], x7                            //AES final block - mov high
5132
5133	pmull2	v22.1q, v22.2d, v16.2d                     //GHASH final-1 block - mid
5134
5135	pmull	v21.1q, v4.1d, v13.1d                         //GHASH final-1 block - low
5136
5137	eor	v5.16b, v5.16b, v3.16b                           //AES final block - result
5138	eor	v10.16b, v10.16b, v22.16b                      //GHASH final-1 block - mid
5139
5140	eor	v11.16b, v11.16b, v21.16b                           //GHASH final-1 block - low
// Final (possibly partial) block, tag update, counter write-back and function epilogue.
//   1. Build a mask of the valid bits of the last block in v0. x1 becomes the number of
//      unused bits in that block (0 for a full block). x13 and x14, which held the
//      round-14 key halves until now, are overwritten with all-ones and used to build it.
//   2. v5 is ANDed with the mask and hashed: the products with v12 (low and high) and with
//      the low half of v16 (mid) are XORed into v11, v9 and v10.
//   3. bif then refills the masked-out bits of v5 from v18, the 16 bytes currently in
//      memory at x2, and all 16 bytes are stored back.
//      NOTE(review): both the ld1 and the st1 at x2 are full 16-byte accesses even for a
//      partial block, as is the input load that produced v5 - callers presumably
//      guarantee that this memory is addressable; confirm.
//   4. The 32-bit counter in w12 is stored at offset 12 of the counter block (x16),
//      byte-reversed unless this is a big-endian build.
//   5. The modular reduction leaves the hash in v11; ext by 8 plus rev64 put it back into
//      memory layout before it is stored to x3.
//   6. x0 = x15 is the return value (x15 is set outside this section; the entry code of
//      the sibling kernels in this file sets it to the bit length shifted right by 3).
//      x19-x24 and d8-d15 are restored and the 112-byte frame is released.
5141.L256_enc_blocks_less_than_1:	//blocks	left <= 1
5142
5143	and	x1, x1, #127                   //bit_length %= 128
5144
5145	mvn	x13, xzr                                     //rk14_l = 0xffffffffffffffff
5146	sub	x1, x1, #128                   //bit_length -= 128
5147
5148	neg	x1, x1                         //bit_length = 128 - #bits in input (in range [1,128])
5149	ld1	{ v18.16b}, [x2]                           //load existing bytes where the possibly partial last block is to be stored
5150
5151	mvn	x14, xzr                                     //rk14_h = 0xffffffffffffffff
5152	and	x1, x1, #127                   //bit_length %= 128
5153
5154	lsr	x14, x14, x1                    //rk14_h is mask for top 64b of last block
5155	cmp	x1, #64
5156
5157	csel	x6, x13, x14, lt                             //low mask: all ones if fewer than 64 bits are unused, else the shifted mask
5158	csel	x7, x14, xzr, lt                             //high mask: the shifted mask if fewer than 64 bits are unused, else zero
5159
5160	fmov	d0, x6                                //ctr0b is mask for last block
5161
5162	fmov	v0.d[1], x7
5163
5164	and	v5.16b, v5.16b, v0.16b                           //possibly partial last block has zeroes in highest bits
5165
5166	rev64	v4.16b, v5.16b                                   //GHASH final block
5167
5168	eor	v4.16b, v4.16b, v8.16b                          //feed in partial tag
5169
5170	bif	v5.16b, v18.16b, v0.16b                             //insert existing bytes in top end of result before storing
5171
5172	pmull2	v20.1q, v4.2d, v12.2d                         //GHASH final block - high
5173	mov	d8, v4.d[1]                                 //GHASH final block - mid
5174#ifndef __AARCH64EB__
5175	rev	w9, w12                                      //w9 = w12 byte-reversed, for the counter store below
5176#else
5177	mov	w9, w12                                      //big-endian build: w12 is stored without byte reversal
5178#endif
5179
5180	pmull	v21.1q, v4.1d, v12.1d                         //GHASH final block - low
5181
5182	eor	v9.16b, v9.16b, v20.16b                           //GHASH final block - high
5183	eor	v8.8b, v8.8b, v4.8b                         //GHASH final block - mid
5184
5185	pmull	v8.1q, v8.1d, v16.1d                         //GHASH final block - mid
5186
5187	eor	v11.16b, v11.16b, v21.16b                           //GHASH final block - low
5188
5189	eor	v10.16b, v10.16b, v8.16b                        //GHASH final block - mid
5190	movi	v8.8b, #0xc2
5191
5192	eor	v4.16b, v11.16b, v9.16b                        //MODULO - karatsuba tidy up
5193
5194	shl	d8, d8, #56              //mod_constant
5195
5196	eor	v10.16b, v10.16b, v4.16b                        //MODULO - karatsuba tidy up
5197
5198	pmull	v7.1q, v9.1d, v8.1d           //MODULO - top 64b align with mid
5199
5200	ext	v9.16b, v9.16b, v9.16b, #8                    //MODULO - other top alignment
5201
5202	eor	v10.16b, v10.16b, v7.16b                     //MODULO - fold into mid
5203
5204	eor	v10.16b, v10.16b, v9.16b                        //MODULO - fold into mid
5205
5206	pmull	v9.1q, v10.1d, v8.1d           //MODULO - mid 64b align with low
5207
5208	ext	v10.16b, v10.16b, v10.16b, #8                    //MODULO - other mid alignment
5209
5210	str	w9, [x16, #12]                         //store the updated counter
5211
5212	st1	{ v5.16b}, [x2]                         //store all 16B
5213	eor	v11.16b, v11.16b, v9.16b                        //MODULO - fold into low
5214
5215	eor	v11.16b, v11.16b, v10.16b                        //MODULO - fold into low
5216	ext	v11.16b, v11.16b, v11.16b, #8                    //swap the 64-bit halves and
5217	rev64	v11.16b, v11.16b                                 //byte-reverse each half: back to the layout stored at x3
5218	mov	x0, x15                                      //return value
5219	st1	{ v11.16b }, [x3]                            //write the updated hash back
5220
5221	ldp	x21, x22, [sp, #16]                          //epilogue: restore the callee-saved registers
5222	ldp	x23, x24, [sp, #32]
5223	ldp	d8, d9, [sp, #48]
5224	ldp	d10, d11, [sp, #64]
5225	ldp	d12, d13, [sp, #80]
5226	ldp	d14, d15, [sp, #96]
5227	ldp	x19, x20, [sp], #112                         //also releases the 112-byte frame
5228	ret
5229
// Early exit of aes_gcm_enc_256_kernel: returns 0 in w0. Nothing is restored and sp is
// not adjusted on this path.
// NOTE(review): presumably reached from a zero-length check at function entry, before
// the frame is pushed, the way aes_gcm_dec_256_kernel reaches .L256_dec_ret - the
// branch itself is outside this section; confirm.
5230.L256_enc_ret:
5231	mov	w0, #0x0
5232	ret
5233.size	aes_gcm_enc_256_kernel,.-aes_gcm_enc_256_kernel
// aes_gcm_dec_256_kernel - entry and set-up, i.e. everything up to the main loop.
// Arguments, as used by the code below:
//   x0  input pointer (ciphertext is loaded from it with post-increment)
//   x1  length in bits; zero branches straight to .L256_dec_ret before any register is saved
//   x2  output pointer (results are stored to it with post-increment)
//   x3  pointer to the current hash value at offset 0 and to the hash-key values read
//       from offsets 32, 64, 80 and 112
//   x4  pointer to the 16-byte counter block (kept in x16)
//   x5  pointer to the round keys (kept in x8): rk0..rk13 are loaded into v18..v31 and
//       rk14 into x13/x14
// x19-x24 and d8-d15 are saved in a 112-byte frame.
// Values established here:
//   x15 = byte length (x1 shifted right by 3); x4 = end_input_ptr = x0 + byte length
//   x5  = x0 + ((byte length - 1) rounded down to a multiple of 64): the main loop runs
//         while x0 is below it
//   x10 = low 64 bits of the counter block; w11 = the following 32 bits (the upper half
//         of x11 is cleared); w12 = byte-reversed 32-bit counter; x9 = scratch high half
//   v12..v15 = hash-key values h1..h4 (halves swapped unless big-endian);
//   v16, v17 = their XOR-folded halves (h2k | h1k and h4k | h3k)
//   v11 = hash value loaded from x3, halves swapped and byte-reversed
// Flow: counter blocks 0-3 are built in v0-v3 and run through rounds 0-13. If x0 >= x5
// (flags from the cmp in the middle of the rounds; nothing in between alters them),
// control goes to .L256_dec_tail. Otherwise four ciphertext blocks are loaded into
// v4-v7, blocks 0 and 1 are completed with the rk14 XOR and stored, block 2 is XORed
// into v2, v4 and v5 are byte-reversed for GHASH, and counter blocks 4 and 5 are built
// in v0 and v1 (x9 already holds the high half for block 6). A second x0 >= x5 check
// selects .L256_dec_prepretail, or falls into the main loop.
// NOTE(review): b.ge is a signed condition applied to pointer values here - harmless as
// long as a buffer never straddles the sign boundary of the address space; confirm.
5234.globl	aes_gcm_dec_256_kernel
5235.type	aes_gcm_dec_256_kernel,%function
5236.align	4
5237aes_gcm_dec_256_kernel:
5238	AARCH64_VALID_CALL_TARGET
5239	cbz	x1, .L256_dec_ret
5240	stp	x19, x20, [sp, #-112]!
5241	mov	x16, x4
5242	mov	x8, x5
5243	stp	x21, x22, [sp, #16]
5244	stp	x23, x24, [sp, #32]
5245	stp	d8, d9, [sp, #48]
5246	stp	d10, d11, [sp, #64]
5247	stp	d12, d13, [sp, #80]
5248	stp	d14, d15, [sp, #96]
5249
5250	lsr	x5, x1, #3              //byte_len
5251	mov	x15, x5
5252	ldp	x10, x11, [x16]              //ctr96_b64, ctr96_t32
5253#ifdef __AARCH64EB__
5254	rev	x10, x10
5255	rev	x11, x11
5256#endif
5257	ldp	x13, x14, [x8, #224]                     //load rk14
5258#ifdef __AARCH64EB__
5259	ror	x14, x14, #32
5260	ror	x13, x13, #32
5261#endif
5262	ld1	{v18.4s}, [x8], #16                               //load rk0
5263	sub	x5, x5, #1      //byte_len - 1
5264
5265	ld1	{v19.4s}, [x8], #16                               //load rk1
5266	and	x5, x5, #0xffffffffffffffc0 //number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
5267
5268	add	x4, x0, x1, lsr #3   //end_input_ptr
5269	ld1	{v20.4s}, [x8], #16                               //load rk2
5270
5271	lsr	x12, x11, #32
5272	ld1	{v21.4s}, [x8], #16                               //load rk3
5273	orr	w11, w11, w11
5274
5275	ld1	{v22.4s}, [x8], #16                               //load rk4
5276	add	x5, x5, x0
5277	rev	w12, w12                                //rev_ctr32
5278
5279	add	w12, w12, #1                            //increment rev_ctr32
5280	fmov	d3, x10                               //CTR block 3
5281
5282	rev	w9, w12                                 //CTR block 1
5283	add	w12, w12, #1                            //CTR block 1
5284	fmov	d1, x10                               //CTR block 1
5285
5286	orr	x9, x11, x9, lsl #32            //CTR block 1
5287	ld1	{ v0.16b}, [x16]                             //special case vector load initial counter so we can start first AES block as quickly as possible
5288
5289	fmov	v1.d[1], x9                               //CTR block 1
5290	rev	w9, w12                                 //CTR block 2
5291	add	w12, w12, #1                            //CTR block 2
5292
5293	fmov	d2, x10                               //CTR block 2
5294	orr	x9, x11, x9, lsl #32            //CTR block 2
5295
5296	fmov	v2.d[1], x9                               //CTR block 2
5297	rev	w9, w12                                 //CTR block 3
5298
5299	orr	x9, x11, x9, lsl #32            //CTR block 3
5300	ld1	{v23.4s}, [x8], #16                               //load rk5
5301
5302	fmov	v3.d[1], x9                               //CTR block 3
5303	add	w12, w12, #1                            //CTR block 3
5304
5305	ld1	{v24.4s}, [x8], #16                               //load rk6
5306
5307	ld1	{v25.4s}, [x8], #16                               //load rk7
5308
5309	ld1	{v26.4s}, [x8], #16                               //load rk8
5310
5311	aese	v0.16b, v18.16b
5312	aesmc	v0.16b, v0.16b          //AES block 0 - round 0
5313	ldr	q14, [x3, #80]                         //load h3l | h3h
5314#ifndef __AARCH64EB__
5315	ext	v14.16b, v14.16b, v14.16b, #8
5316#endif
5317
5318	aese	v3.16b, v18.16b
5319	aesmc	v3.16b, v3.16b          //AES block 3 - round 0
5320	ldr	q15, [x3, #112]                        //load h4l | h4h
5321#ifndef __AARCH64EB__
5322	ext	v15.16b, v15.16b, v15.16b, #8
5323#endif
5324
5325	aese	v1.16b, v18.16b
5326	aesmc	v1.16b, v1.16b          //AES block 1 - round 0
5327	ldr	q13, [x3, #64]                         //load h2l | h2h
5328#ifndef __AARCH64EB__
5329	ext	v13.16b, v13.16b, v13.16b, #8
5330#endif
5331
5332	aese	v2.16b, v18.16b
5333	aesmc	v2.16b, v2.16b          //AES block 2 - round 0
5334	ld1	{v27.4s}, [x8], #16                                 //load rk9
5335
5336	aese	v0.16b, v19.16b
5337	aesmc	v0.16b, v0.16b          //AES block 0 - round 1
5338
5339	aese	v1.16b, v19.16b
5340	aesmc	v1.16b, v1.16b          //AES block 1 - round 1
5341	ld1	{ v11.16b}, [x3]
5342	ext	v11.16b, v11.16b, v11.16b, #8
5343	rev64	v11.16b, v11.16b
5344
5345	aese	v2.16b, v19.16b
5346	aesmc	v2.16b, v2.16b          //AES block 2 - round 1
5347	ld1	{v28.4s}, [x8], #16                              //load rk10
5348
5349	aese	v3.16b, v19.16b
5350	aesmc	v3.16b, v3.16b          //AES block 3 - round 1
5351	ld1	{v29.4s}, [x8], #16                              //load rk11
5352
5353	aese	v0.16b, v20.16b
5354	aesmc	v0.16b, v0.16b          //AES block 0 - round 2
5355	ldr	q12, [x3, #32]                         //load h1l | h1h
5356#ifndef __AARCH64EB__
5357	ext	v12.16b, v12.16b, v12.16b, #8
5358#endif
5359	aese	v2.16b, v20.16b
5360	aesmc	v2.16b, v2.16b          //AES block 2 - round 2
5361	ld1	{v30.4s}, [x8], #16                              //load rk12
5362
5363	aese	v3.16b, v20.16b
5364	aesmc	v3.16b, v3.16b          //AES block 3 - round 2
5365
5366	aese	v0.16b, v21.16b
5367	aesmc	v0.16b, v0.16b          //AES block 0 - round 3
5368
5369	aese	v1.16b, v20.16b
5370	aesmc	v1.16b, v1.16b          //AES block 1 - round 2
5371
5372	aese	v3.16b, v21.16b
5373	aesmc	v3.16b, v3.16b          //AES block 3 - round 3
5374
5375	aese	v0.16b, v22.16b
5376	aesmc	v0.16b, v0.16b          //AES block 0 - round 4
5377	cmp	x0, x5                   //check if we have <= 4 blocks
5378
5379	aese	v2.16b, v21.16b
5380	aesmc	v2.16b, v2.16b          //AES block 2 - round 3
5381
5382	aese	v1.16b, v21.16b
5383	aesmc	v1.16b, v1.16b          //AES block 1 - round 3
5384
5385	aese	v3.16b, v22.16b
5386	aesmc	v3.16b, v3.16b          //AES block 3 - round 4
5387
5388	aese	v2.16b, v22.16b
5389	aesmc	v2.16b, v2.16b          //AES block 2 - round 4
5390
5391	aese	v1.16b, v22.16b
5392	aesmc	v1.16b, v1.16b          //AES block 1 - round 4
5393
5394	aese	v3.16b, v23.16b
5395	aesmc	v3.16b, v3.16b          //AES block 3 - round 5
5396
5397	aese	v0.16b, v23.16b
5398	aesmc	v0.16b, v0.16b          //AES block 0 - round 5
5399
5400	aese	v1.16b, v23.16b
5401	aesmc	v1.16b, v1.16b          //AES block 1 - round 5
5402
5403	aese	v2.16b, v23.16b
5404	aesmc	v2.16b, v2.16b          //AES block 2 - round 5
5405
5406	aese	v0.16b, v24.16b
5407	aesmc	v0.16b, v0.16b          //AES block 0 - round 6
5408
5409	aese	v3.16b, v24.16b
5410	aesmc	v3.16b, v3.16b          //AES block 3 - round 6
5411
5412	aese	v1.16b, v24.16b
5413	aesmc	v1.16b, v1.16b          //AES block 1 - round 6
5414
5415	aese	v2.16b, v24.16b
5416	aesmc	v2.16b, v2.16b          //AES block 2 - round 6
5417
5418	aese	v0.16b, v25.16b
5419	aesmc	v0.16b, v0.16b          //AES block 0 - round 7
5420
5421	aese	v1.16b, v25.16b
5422	aesmc	v1.16b, v1.16b          //AES block 1 - round 7
5423
5424	aese	v3.16b, v25.16b
5425	aesmc	v3.16b, v3.16b          //AES block 3 - round 7
5426
5427	aese	v0.16b, v26.16b
5428	aesmc	v0.16b, v0.16b          //AES block 0 - round 8
5429
5430	aese	v2.16b, v25.16b
5431	aesmc	v2.16b, v2.16b          //AES block 2 - round 7
5432
5433	aese	v3.16b, v26.16b
5434	aesmc	v3.16b, v3.16b          //AES block 3 - round 8
5435
5436	aese	v1.16b, v26.16b
5437	aesmc	v1.16b, v1.16b          //AES block 1 - round 8
5438
5439	aese	v0.16b, v27.16b
5440	aesmc	v0.16b, v0.16b          //AES block 0 - round 9
5441
5442	aese	v2.16b, v26.16b
5443	aesmc	v2.16b, v2.16b          //AES block 2 - round 8
5444	ld1	{v31.4s}, [x8], #16                             //load rk13
5445
5446	aese	v1.16b, v27.16b
5447	aesmc	v1.16b, v1.16b          //AES block 1 - round 9
5448
5449	aese	v0.16b, v28.16b
5450	aesmc	v0.16b, v0.16b          //AES block 0 - round 10
5451
5452	aese	v3.16b, v27.16b
5453	aesmc	v3.16b, v3.16b          //AES block 3 - round 9
5454
5455	aese	v1.16b, v28.16b
5456	aesmc	v1.16b, v1.16b          //AES block 1 - round 10
5457
5458	aese	v2.16b, v27.16b
5459	aesmc	v2.16b, v2.16b          //AES block 2 - round 9
5460
5461	aese	v3.16b, v28.16b
5462	aesmc	v3.16b, v3.16b          //AES block 3 - round 10
5463
5464	aese	v0.16b, v29.16b
5465	aesmc	v0.16b, v0.16b          //AES block 0 - round 11
5466
5467	aese	v2.16b, v28.16b
5468	aesmc	v2.16b, v2.16b          //AES block 2 - round 10
5469
5470	aese	v3.16b, v29.16b
5471	aesmc	v3.16b, v3.16b          //AES block 3 - round 11
5472
5473	aese	v1.16b, v29.16b
5474	aesmc	v1.16b, v1.16b          //AES block 1 - round 11
5475
5476	aese	v2.16b, v29.16b
5477	aesmc	v2.16b, v2.16b          //AES block 2 - round 11
5478
5479	trn1	v9.2d, v14.2d,    v15.2d                      //h4h | h3h
5480
5481	trn2	v17.2d,  v14.2d,    v15.2d                      //h4l | h3l
5482
5483	trn1	v8.2d,    v12.2d,    v13.2d                      //h2h | h1h
5484	trn2	v16.2d,  v12.2d,    v13.2d                      //h2l | h1l
5485
5486	aese	v1.16b, v30.16b
5487	aesmc	v1.16b, v1.16b          //AES block 1 - round 12
5488
5489	aese	v0.16b, v30.16b
5490	aesmc	v0.16b, v0.16b          //AES block 0 - round 12
5491
5492	aese	v2.16b, v30.16b
5493	aesmc	v2.16b, v2.16b          //AES block 2 - round 12
5494
5495	aese	v3.16b, v30.16b
5496	aesmc	v3.16b, v3.16b          //AES block 3 - round 12
5497	eor	v17.16b, v17.16b, v9.16b                  //h4k | h3k
5498
5499	aese	v1.16b, v31.16b                                     //AES block 1 - round 13
5500
5501	aese	v2.16b, v31.16b                                     //AES block 2 - round 13
5502	eor	v16.16b, v16.16b, v8.16b                     //h2k | h1k
5503
5504	aese	v3.16b, v31.16b                                     //AES block 3 - round 13
5505
5506	aese	v0.16b, v31.16b                                     //AES block 0 - round 13
5507	b.ge	.L256_dec_tail                                    //handle tail
5508
5509	ld1	{v4.16b, v5.16b}, [x0], #32               //AES block 0,1 - load ciphertext
5510
5511	rev	w9, w12                                 //CTR block 4
5512
5513	eor	v0.16b, v4.16b, v0.16b                            //AES block 0 - result
5514
5515	eor	v1.16b, v5.16b, v1.16b                            //AES block 1 - result
5516	rev64	v5.16b, v5.16b                                    //GHASH block 1
5517	ld1	{v6.16b}, [x0], #16                       //AES block 2 - load ciphertext
5518
5519	mov	x7, v0.d[1]                            //AES block 0 - mov high
5520
5521	mov	x6, v0.d[0]                            //AES block 0 - mov low
5522	rev64	v4.16b, v4.16b                                    //GHASH block 0
5523	add	w12, w12, #1                            //CTR block 4
5524
5525	fmov	d0, x10                               //CTR block 4
5526	orr	x9, x11, x9, lsl #32            //CTR block 4
5527
5528	fmov	v0.d[1], x9                               //CTR block 4
5529	rev	w9, w12                                 //CTR block 5
5530	add	w12, w12, #1                            //CTR block 5
5531
5532	mov	x19, v1.d[0]                            //AES block 1 - mov low
5533
5534	orr	x9, x11, x9, lsl #32            //CTR block 5
5535	mov	x20, v1.d[1]                            //AES block 1 - mov high
5536	eor	x7, x7, x14                   //AES block 0 - round 14 high
5537#ifdef __AARCH64EB__
5538	rev	x7, x7
5539#endif
5540	eor	x6, x6, x13                   //AES block 0 - round 14 low
5541#ifdef __AARCH64EB__
5542	rev	x6, x6
5543#endif
5544	stp	x6, x7, [x2], #16        //AES block 0 - store result
5545	fmov	d1, x10                               //CTR block 5
5546
5547	ld1	{v7.16b}, [x0], #16                       //AES block 3 - load ciphertext
5548
5549	fmov	v1.d[1], x9                               //CTR block 5
5550	rev	w9, w12                                 //CTR block 6
5551	add	w12, w12, #1                            //CTR block 6
5552
5553	eor	x19, x19, x13                   //AES block 1 - round 14 low
5554#ifdef __AARCH64EB__
5555	rev	x19, x19
5556#endif
5557	orr	x9, x11, x9, lsl #32            //CTR block 6
5558
5559	eor	x20, x20, x14                   //AES block 1 - round 14 high
5560#ifdef __AARCH64EB__
5561	rev	x20, x20
5562#endif
5563	stp	x19, x20, [x2], #16        //AES block 1 - store result
5564
5565	eor	v2.16b, v6.16b, v2.16b                            //AES block 2 - result
5566	cmp	x0, x5                   //check if we have <= 8 blocks
5567	b.ge	.L256_dec_prepretail                              //do prepretail
5568
// .L256_dec_main_loop - steady-state loop of the 256-bit-key GCM decrypt
// kernel; every pass deals with four 16-byte blocks (k = pass index, as in
// the per-line comments).
//
// Work interleaved in one pass:
//   * finish blocks 4k+2 / 4k+3: v2 (on entry) and v3 (after the first eor)
//     hold ciphertext ^ AES state; their halves are moved to x21..x24, XORed
//     with x13 / x14 (the "round 14" key halves) and stored through x2;
//   * run AES rounds 0-13 with v18..v31 on the counter blocks 4k+4..4k+7
//     held in v0..v3 (round 13 is a bare aese, no aesmc);
//   * GHASH the four ciphertext blocks in v4..v7 - v4 first absorbs the
//     running hash v11 - against v15, v14, v13, v12, with v17 / v16 feeding
//     the Karatsuba middle terms; v9 / v10 / v11 collect high / mid / low and
//     are reduced into v11 with the constant built by movi 0xc2 + shl #56;
//   * load the next four ciphertext blocks from x0 into v4..v7 and already
//     finish and store blocks 4k+4 / 4k+5 (x6/x7 and x19/x20).
// Counter blocks are assembled as x10 (low 64 bits) and
// x11 | (rev(w12) << 32) (high 64 bits); w12 is incremented once per block.
// On __AARCH64EB__ builds each result half is byte-reversed before the store.
// The loop repeats while x0 < x5.
// NOTE(review): b.lt is a signed condition applied to pointer values.
5569.L256_dec_main_loop:	//main	loop start
5570	mov	x21, v2.d[0]                            //AES block 4k+2 - mov low
5571	ext	v11.16b, v11.16b, v11.16b, #8                     //PRE 0
5572	eor	v3.16b, v7.16b, v3.16b                            //AES block 4k+3 - result
5573
5574	aese	v0.16b, v18.16b
5575	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 0
5576	mov	x22, v2.d[1]                            //AES block 4k+2 - mov high
5577
5578	aese	v1.16b, v18.16b
5579	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 0
5580	fmov	d2, x10                               //CTR block 4k+6
5581
5582	fmov	v2.d[1], x9                               //CTR block 4k+6
5583	eor	v4.16b, v4.16b, v11.16b                           //PRE 1
5584	rev	w9, w12                                 //CTR block 4k+7
5585
5586	aese	v0.16b, v19.16b
5587	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 1
5588	mov	x24, v3.d[1]                            //AES block 4k+3 - mov high
5589
5590	aese	v1.16b, v19.16b
5591	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 1
5592	mov	x23, v3.d[0]                            //AES block 4k+3 - mov low
5593
5594	pmull2	v9.1q, v4.2d, v15.2d                       //GHASH block 4k - high
5595	mov	d8, v4.d[1]                                  //GHASH block 4k - mid
5596	fmov	d3, x10                               //CTR block 4k+7
5597
5598	aese	v0.16b, v20.16b
5599	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 2
5600	orr	x9, x11, x9, lsl #32            //CTR block 4k+7
5601
5602	aese	v2.16b, v18.16b
5603	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 0
5604	fmov	v3.d[1], x9                               //CTR block 4k+7
5605
5606	aese	v1.16b, v20.16b
5607	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 2
5608	eor	v8.8b, v8.8b, v4.8b                          //GHASH block 4k - mid
5609
5610	aese	v0.16b, v21.16b
5611	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 3
5612	eor	x22, x22, x14                   //AES block 4k+2 - round 14 high
5613#ifdef __AARCH64EB__
5614	rev	x22, x22
5615#endif
5616	aese	v2.16b, v19.16b
5617	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 1
5618	mov	d10, v17.d[1]                               //GHASH block 4k - mid
5619
5620	aese	v1.16b, v21.16b
5621	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 3
5622	rev64	v6.16b, v6.16b                                    //GHASH block 4k+2
5623
5624	aese	v3.16b, v18.16b
5625	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 0
5626	eor	x21, x21, x13                   //AES block 4k+2 - round 14 low
5627#ifdef __AARCH64EB__
5628	rev	x21, x21
5629#endif
5630	aese	v2.16b, v20.16b
5631	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 2
5632	stp	x21, x22, [x2], #16        //AES block 4k+2 - store result
5633
5634	pmull	v11.1q, v4.1d, v15.1d                       //GHASH block 4k - low
5635
5636	pmull2	v4.1q, v5.2d, v14.2d                          //GHASH block 4k+1 - high
5637
5638	aese	v2.16b, v21.16b
5639	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 3
5640	rev64	v7.16b, v7.16b                                    //GHASH block 4k+3
5641
5642	pmull	v10.1q, v8.1d, v10.1d                      //GHASH block 4k - mid
5643	eor	x23, x23, x13                   //AES block 4k+3 - round 14 low
5644#ifdef __AARCH64EB__
5645	rev	x23, x23
5646#endif
5647	pmull	v8.1q, v5.1d, v14.1d                          //GHASH block 4k+1 - low
5648	eor	x24, x24, x14                   //AES block 4k+3 - round 14 high
5649#ifdef __AARCH64EB__
5650	rev	x24, x24
5651#endif
5652	eor	v9.16b, v9.16b, v4.16b                         //GHASH block 4k+1 - high
5653
5654	aese	v2.16b, v22.16b
5655	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 4
5656
5657	aese	v3.16b, v19.16b
5658	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 1
5659	mov	d4, v5.d[1]                                  //GHASH block 4k+1 - mid
5660
5661	aese	v0.16b, v22.16b
5662	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 4
5663	eor	v11.16b, v11.16b, v8.16b                         //GHASH block 4k+1 - low
5664
5665	aese	v2.16b, v23.16b
5666	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 5
5667	add	w12, w12, #1                            //CTR block 4k+7
5668
5669	aese	v3.16b, v20.16b
5670	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 2
5671	mov	d8, v6.d[1]                                  //GHASH block 4k+2 - mid
5672
5673	aese	v1.16b, v22.16b
5674	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 4
5675	eor	v4.8b, v4.8b, v5.8b                          //GHASH block 4k+1 - mid
5676
5677	pmull	v5.1q, v6.1d, v13.1d                          //GHASH block 4k+2 - low
5678
5679	aese	v3.16b, v21.16b
5680	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 3
5681	eor	v8.8b, v8.8b, v6.8b                          //GHASH block 4k+2 - mid
5682
5683	aese	v1.16b, v23.16b
5684	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 5
5685
5686	aese	v0.16b, v23.16b
5687	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 5
5688	eor	v11.16b, v11.16b, v5.16b                         //GHASH block 4k+2 - low
5689
5690	pmull	v4.1q, v4.1d, v17.1d                          //GHASH block 4k+1 - mid
5691	rev	w9, w12                                 //CTR block 4k+8
5692
5693	aese	v1.16b, v24.16b
5694	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 6
5695	ins	v8.d[1], v8.d[0]                                //GHASH block 4k+2 - mid
5696
5697	aese	v0.16b, v24.16b
5698	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 6
5699	add	w12, w12, #1                            //CTR block 4k+8
5700
5701	aese	v3.16b, v22.16b
5702	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 4
5703
5704	aese	v1.16b, v25.16b
5705	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 7
5706	eor	v10.16b, v10.16b, v4.16b                         //GHASH block 4k+1 - mid
5707
5708	aese	v0.16b, v25.16b
5709	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 7
5710
5711	pmull2	v4.1q, v6.2d, v13.2d                          //GHASH block 4k+2 - high
5712	mov	d6, v7.d[1]                                  //GHASH block 4k+3 - mid
5713
5714	aese	v3.16b, v23.16b
5715	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 5
5716
5717	pmull2	v8.1q, v8.2d, v16.2d                          //GHASH block 4k+2 - mid
5718
5719	aese	v0.16b, v26.16b
5720	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 8
5721	eor	v9.16b, v9.16b, v4.16b                         //GHASH block 4k+2 - high
5722
5723	aese	v3.16b, v24.16b
5724	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 6
5725
5726	pmull	v4.1q, v7.1d, v12.1d                          //GHASH block 4k+3 - low
5727	orr	x9, x11, x9, lsl #32            //CTR block 4k+8
5728	eor	v10.16b, v10.16b, v8.16b                         //GHASH block 4k+2 - mid
5729
5730	pmull2	v5.1q, v7.2d, v12.2d                          //GHASH block 4k+3 - high
5731
5732	aese	v0.16b, v27.16b
5733	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 9
5734	eor	v6.8b, v6.8b, v7.8b                          //GHASH block 4k+3 - mid
5735
5736	aese	v1.16b, v26.16b
5737	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 8
5738
5739	aese	v2.16b, v24.16b
5740	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 6
5741	eor	v9.16b, v9.16b, v5.16b                         //GHASH block 4k+3 - high
5742
5743	aese	v0.16b, v28.16b
5744	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 10
5745
5746	pmull	v6.1q, v6.1d, v16.1d                          //GHASH block 4k+3 - mid
// v8 is reused from here on for the reduction constant: 0xc2 in each byte of
// the low doubleword now, turned into 0xc200000000000000 by the shl #56 below.
5747	movi	v8.8b, #0xc2
5748
5749	aese	v2.16b, v25.16b
5750	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 7
5751	eor	v11.16b, v11.16b, v4.16b                         //GHASH block 4k+3 - low
5752
5753	aese	v0.16b, v29.16b
5754	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 11
5755
5756	aese	v3.16b, v25.16b
5757	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 7
5758	shl	d8, d8, #56               //mod_constant
5759
5760	aese	v2.16b, v26.16b
5761	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 8
5762	eor	v10.16b, v10.16b, v6.16b                         //GHASH block 4k+3 - mid
5763
5764	aese	v0.16b, v30.16b
5765	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 12
5766
5767	pmull	v7.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid
5768	eor	v6.16b, v11.16b, v9.16b                         //MODULO - karatsuba tidy up
5769
5770	aese	v1.16b, v27.16b
5771	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 9
5772	ld1	{v4.16b}, [x0], #16                       //AES block 4k+4 - load ciphertext
5773
5774	aese	v0.16b, v31.16b                                     //AES block 4k+4 - round 13
5775	ext	v9.16b, v9.16b, v9.16b, #8                     //MODULO - other top alignment
5776
5777	aese	v1.16b, v28.16b
5778	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 10
5779	eor	v10.16b, v10.16b, v6.16b                         //MODULO - karatsuba tidy up
5780
5781	aese	v2.16b, v27.16b
5782	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 9
5783	ld1	{v5.16b}, [x0], #16                       //AES block 4k+5 - load ciphertext
5784
5785	aese	v3.16b, v26.16b
5786	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 8
5787	eor	v0.16b, v4.16b, v0.16b                            //AES block 4k+4 - result
5788
5789	aese	v1.16b, v29.16b
5790	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 11
5791	stp	x23, x24, [x2], #16        //AES block 4k+3 - store result
5792
5793	aese	v2.16b, v28.16b
5794	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 10
5795	eor	v10.16b, v10.16b, v7.16b                      //MODULO - fold into mid
5796
5797	aese	v3.16b, v27.16b
5798	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 9
5799	ld1	{v6.16b}, [x0], #16                       //AES block 4k+6 - load ciphertext
5800
5801	aese	v1.16b, v30.16b
5802	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 12
5803	ld1	{v7.16b}, [x0], #16                       //AES block 4k+7 - load ciphertext
5804
5805	aese	v2.16b, v29.16b
5806	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 11
5807	mov	x7, v0.d[1]                            //AES block 4k+4 - mov high
5808
5809	aese	v3.16b, v28.16b
5810	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 10
5811	eor	v10.16b, v10.16b, v9.16b                         //MODULO - fold into mid
5812
5813	aese	v1.16b, v31.16b                                     //AES block 4k+5 - round 13
5814	mov	x6, v0.d[0]                            //AES block 4k+4 - mov low
5815
5816	aese	v2.16b, v30.16b
5817	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 12
5818	fmov	d0, x10                               //CTR block 4k+8
5819
5820	aese	v3.16b, v29.16b
5821	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 11
5822	fmov	v0.d[1], x9                               //CTR block 4k+8
5823
5824	pmull	v8.1q, v10.1d, v8.1d     //MODULO - mid 64b align with low
5825	eor	v1.16b, v5.16b, v1.16b                            //AES block 4k+5 - result
5826	rev	w9, w12                                 //CTR block 4k+9
5827
5828	aese	v2.16b, v31.16b                                     //AES block 4k+6 - round 13
5829	orr	x9, x11, x9, lsl #32            //CTR block 4k+9
5830	cmp	x0, x5                   //.LOOP CONTROL
5831
5832	add	w12, w12, #1                            //CTR block 4k+9
5833
5834	eor	x6, x6, x13                   //AES block 4k+4 - round 14 low
5835#ifdef __AARCH64EB__
5836	rev	x6, x6
5837#endif
5838	eor	x7, x7, x14                   //AES block 4k+4 - round 14 high
5839#ifdef __AARCH64EB__
5840	rev	x7, x7
5841#endif
5842	mov	x20, v1.d[1]                            //AES block 4k+5 - mov high
5843	eor	v2.16b, v6.16b, v2.16b                            //AES block 4k+6 - result
5844	eor	v11.16b, v11.16b, v8.16b               //MODULO - fold into low
5845
5846	aese	v3.16b, v30.16b
5847	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 12
5848	mov	x19, v1.d[0]                            //AES block 4k+5 - mov low
5849
5850	fmov	d1, x10                               //CTR block 4k+9
5851	ext	v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment
5852
5853	fmov	v1.d[1], x9                               //CTR block 4k+9
5854	rev	w9, w12                                 //CTR block 4k+10
5855	add	w12, w12, #1                            //CTR block 4k+10
5856
5857	aese	v3.16b, v31.16b                                     //AES block 4k+7 - round 13
5858	orr	x9, x11, x9, lsl #32            //CTR block 4k+10
5859
5860	rev64	v5.16b, v5.16b                                    //GHASH block 4k+5
5861	eor	x20, x20, x14                   //AES block 4k+5 - round 14 high
5862#ifdef __AARCH64EB__
5863	rev	x20, x20
5864#endif
5865	stp	x6, x7, [x2], #16        //AES block 4k+4 - store result
5866
5867	eor	x19, x19, x13                   //AES block 4k+5 - round 14 low
5868#ifdef __AARCH64EB__
5869	rev	x19, x19
5870#endif
5871	stp	x19, x20, [x2], #16        //AES block 4k+5 - store result
5872
5873	rev64	v4.16b, v4.16b                                    //GHASH block 4k+4
5874	eor	v11.16b, v11.16b, v10.16b                         //MODULO - fold into low
// The flags tested here were set by "cmp x0, x5" above; none of the
// instructions in between writes the condition flags.
5875	b.lt	.L256_dec_main_loop
5876
5877
// .L256_dec_prepretail - runs once between the main loop (or the setup code
// that branches here directly) and the tail.
//
// It performs the same per-pass work as the main loop, minus everything that
// touches new input:
//   * blocks 4k+2 / 4k+3 are finished (x21..x24 ^ x13 / x14) and stored
//     through x2;
//   * AES rounds 0-13 (v18..v31) are run on the counter blocks in v0..v3, so
//     four blocks of AES state are ready when the tail starts;
//   * the four ciphertext blocks in v4..v7 are GHASHed (v4 absorbs the
//     running hash v11; multiplicands v15, v14, v13, v12 plus v17 / v16 for
//     the middle terms) and reduced into v11.
// No ciphertext is loaded from x0 here and no result is XORed into v0..v3;
// that is left to the tail. Only v6 and v7 are byte-reversed (rev64) in this
// section; v4 / v5 are used as they arrive.
// Control falls through into .L256_dec_tail.
5878.L256_dec_prepretail:	//PREPRETAIL
5879	ext	v11.16b, v11.16b, v11.16b, #8                     //PRE 0
5880	mov	x21, v2.d[0]                            //AES block 4k+2 - mov low
5881	eor	v3.16b, v7.16b, v3.16b                            //AES block 4k+3 - result
5882
5883	aese	v0.16b, v18.16b
5884	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 0
5885	mov	x22, v2.d[1]                            //AES block 4k+2 - mov high
5886
5887	aese	v1.16b, v18.16b
5888	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 0
5889	fmov	d2, x10                               //CTR block 4k+6
5890
5891	fmov	v2.d[1], x9                               //CTR block 4k+6
5892	rev	w9, w12                                 //CTR block 4k+7
5893	eor	v4.16b, v4.16b, v11.16b                           //PRE 1
5894
5895	rev64	v6.16b, v6.16b                                    //GHASH block 4k+2
5896	orr	x9, x11, x9, lsl #32            //CTR block 4k+7
5897	mov	x23, v3.d[0]                            //AES block 4k+3 - mov low
5898
5899	aese	v1.16b, v19.16b
5900	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 1
5901	mov	x24, v3.d[1]                            //AES block 4k+3 - mov high
5902
5903	pmull	v11.1q, v4.1d, v15.1d                       //GHASH block 4k - low
5904	mov	d8, v4.d[1]                                  //GHASH block 4k - mid
5905	fmov	d3, x10                               //CTR block 4k+7
5906
5907	pmull2	v9.1q, v4.2d, v15.2d                       //GHASH block 4k - high
5908	fmov	v3.d[1], x9                               //CTR block 4k+7
5909
5910	aese	v2.16b, v18.16b
5911	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 0
5912	mov	d10, v17.d[1]                               //GHASH block 4k - mid
5913
5914	aese	v0.16b, v19.16b
5915	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 1
5916	eor	v8.8b, v8.8b, v4.8b                          //GHASH block 4k - mid
5917
5918	pmull2	v4.1q, v5.2d, v14.2d                          //GHASH block 4k+1 - high
5919
5920	aese	v2.16b, v19.16b
5921	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 1
5922	rev64	v7.16b, v7.16b                                    //GHASH block 4k+3
5923
5924	aese	v3.16b, v18.16b
5925	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 0
5926
5927	pmull	v10.1q, v8.1d, v10.1d                      //GHASH block 4k - mid
5928	eor	v9.16b, v9.16b, v4.16b                         //GHASH block 4k+1 - high
5929
5930	pmull	v8.1q, v5.1d, v14.1d                          //GHASH block 4k+1 - low
5931
5932	aese	v3.16b, v19.16b
5933	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 1
5934	mov	d4, v5.d[1]                                  //GHASH block 4k+1 - mid
5935
5936	aese	v0.16b, v20.16b
5937	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 2
5938
5939	aese	v1.16b, v20.16b
5940	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 2
5941	eor	v11.16b, v11.16b, v8.16b                         //GHASH block 4k+1 - low
5942
5943	aese	v2.16b, v20.16b
5944	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 2
5945
5946	aese	v0.16b, v21.16b
5947	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 3
5948	mov	d8, v6.d[1]                                  //GHASH block 4k+2 - mid
5949
5950	aese	v3.16b, v20.16b
5951	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 2
5952	eor	v4.8b, v4.8b, v5.8b                          //GHASH block 4k+1 - mid
5953
5954	pmull	v5.1q, v6.1d, v13.1d                          //GHASH block 4k+2 - low
5955
5956	aese	v0.16b, v22.16b
5957	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 4
5958
5959	aese	v3.16b, v21.16b
5960	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 3
5961	eor	v8.8b, v8.8b, v6.8b                          //GHASH block 4k+2 - mid
5962
5963	pmull	v4.1q, v4.1d, v17.1d                          //GHASH block 4k+1 - mid
5964
5965	aese	v0.16b, v23.16b
5966	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 5
5967	eor	v11.16b, v11.16b, v5.16b                         //GHASH block 4k+2 - low
5968
5969	aese	v3.16b, v22.16b
5970	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 4
5971
5972	pmull2	v5.1q, v7.2d, v12.2d                          //GHASH block 4k+3 - high
5973	eor	v10.16b, v10.16b, v4.16b                         //GHASH block 4k+1 - mid
5974
5975	pmull2	v4.1q, v6.2d, v13.2d                          //GHASH block 4k+2 - high
5976
5977	aese	v3.16b, v23.16b
5978	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 5
5979	ins	v8.d[1], v8.d[0]                                //GHASH block 4k+2 - mid
5980
5981	aese	v2.16b, v21.16b
5982	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 3
5983
5984	aese	v1.16b, v21.16b
5985	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 3
5986	eor	v9.16b, v9.16b, v4.16b                         //GHASH block 4k+2 - high
5987
5988	pmull	v4.1q, v7.1d, v12.1d                          //GHASH block 4k+3 - low
5989
5990	aese	v2.16b, v22.16b
5991	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 4
5992	mov	d6, v7.d[1]                                  //GHASH block 4k+3 - mid
5993
5994	aese	v1.16b, v22.16b
5995	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 4
5996
5997	pmull2	v8.1q, v8.2d, v16.2d                          //GHASH block 4k+2 - mid
5998
5999	aese	v2.16b, v23.16b
6000	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 5
6001	eor	v6.8b, v6.8b, v7.8b                          //GHASH block 4k+3 - mid
6002
6003	aese	v1.16b, v23.16b
6004	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 5
6005
6006	aese	v3.16b, v24.16b
6007	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 6
6008	eor	v10.16b, v10.16b, v8.16b                         //GHASH block 4k+2 - mid
6009
6010	aese	v2.16b, v24.16b
6011	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 6
6012
6013	aese	v0.16b, v24.16b
6014	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 6
// v8 is reused from here on for the reduction constant (finished by the
// shl #56 further down: 0xc200000000000000 in d8).
6015	movi	v8.8b, #0xc2
6016
6017	aese	v1.16b, v24.16b
6018	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 6
6019	eor	v11.16b, v11.16b, v4.16b                         //GHASH block 4k+3 - low
6020
6021	pmull	v6.1q, v6.1d, v16.1d                          //GHASH block 4k+3 - mid
6022
6023	aese	v3.16b, v25.16b
6024	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 7
6025	eor	v9.16b, v9.16b, v5.16b                         //GHASH block 4k+3 - high
6026
6027	aese	v1.16b, v25.16b
6028	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 7
6029
6030	aese	v0.16b, v25.16b
6031	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 7
6032	eor	v10.16b, v10.16b, v6.16b                         //GHASH block 4k+3 - mid
6033
6034	aese	v3.16b, v26.16b
6035	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 8
6036
6037	aese	v2.16b, v25.16b
6038	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 7
6039	eor	v6.16b, v11.16b, v9.16b                         //MODULO - karatsuba tidy up
6040
6041	aese	v1.16b, v26.16b
6042	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 8
6043
6044	aese	v0.16b, v26.16b
6045	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 8
6046	shl	d8, d8, #56               //mod_constant
6047
6048	aese	v2.16b, v26.16b
6049	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 8
6050
6051	aese	v1.16b, v27.16b
6052	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 9
6053	eor	v10.16b, v10.16b, v6.16b                         //MODULO - karatsuba tidy up
6054
6055	pmull	v7.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid
6056
6057	aese	v2.16b, v27.16b
6058	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 9
6059	ext	v9.16b, v9.16b, v9.16b, #8                     //MODULO - other top alignment
6060
6061	aese	v3.16b, v27.16b
6062	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 9
6063
6064	aese	v0.16b, v27.16b
6065	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 9
6066	eor	v10.16b, v10.16b, v7.16b                      //MODULO - fold into mid
6067
6068	aese	v2.16b, v28.16b
6069	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 10
6070
6071	aese	v3.16b, v28.16b
6072	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 10
6073
6074	aese	v0.16b, v28.16b
6075	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 10
6076	eor	x22, x22, x14                   //AES block 4k+2 - round 14 high
6077#ifdef __AARCH64EB__
6078	rev	x22, x22
6079#endif
6080	aese	v1.16b, v28.16b
6081	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 10
6082	eor	x23, x23, x13                   //AES block 4k+3 - round 14 low
6083#ifdef __AARCH64EB__
6084	rev	x23, x23
6085#endif
6086	aese	v2.16b, v29.16b
6087	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 11
6088	eor	v10.16b, v10.16b, v9.16b                         //MODULO - fold into mid
6089
6090	aese	v0.16b, v29.16b
6091	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 11
6092	add	w12, w12, #1                            //CTR block 4k+7
6093
6094	aese	v1.16b, v29.16b
6095	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 11
6096	eor	x21, x21, x13                   //AES block 4k+2 - round 14 low
6097#ifdef __AARCH64EB__
6098	rev	x21, x21
6099#endif
6100
6101	aese	v2.16b, v30.16b
6102	aesmc	v2.16b, v2.16b          //AES block 4k+6 - round 12
6103
6104	pmull	v8.1q, v10.1d, v8.1d     //MODULO - mid 64b align with low
6105	eor	x24, x24, x14                   //AES block 4k+3 - round 14 high
6106#ifdef __AARCH64EB__
6107	rev	x24, x24
6108#endif
6109
6110	aese	v3.16b, v29.16b
6111	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 11
6112	stp	x21, x22, [x2], #16        //AES block 4k+2 - store result
6113
6114	aese	v1.16b, v30.16b
6115	aesmc	v1.16b, v1.16b          //AES block 4k+5 - round 12
6116	ext	v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment
6117
6118	aese	v0.16b, v30.16b
6119	aesmc	v0.16b, v0.16b          //AES block 4k+4 - round 12
6120	stp	x23, x24, [x2], #16        //AES block 4k+3 - store result
6121
6122	aese	v3.16b, v30.16b
6123	aesmc	v3.16b, v3.16b          //AES block 4k+7 - round 12
6124	eor	v11.16b, v11.16b, v8.16b               //MODULO - fold into low
6125
6126	aese	v1.16b, v31.16b                                     //AES block 4k+5 - round 13
6127
6128	aese	v0.16b, v31.16b                                     //AES block 4k+4 - round 13
6129
6130	aese	v3.16b, v31.16b                                     //AES block 4k+7 - round 13
6131
6132	aese	v2.16b, v31.16b                                     //AES block 4k+6 - round 13
6133	eor	v11.16b, v11.16b, v10.16b                         //MODULO - fold into low
// .L256_dec_tail - entry of the tail; decides how many of the four prepared
// AES states (v0..v3) are still needed.
//
// x5 = x4 - x0 is treated as the number of input bytes left (x4 is set before
// this section - presumably the end-of-input pointer, confirm in the setup
// code). The first remaining ciphertext block is loaded and decrypted
// unconditionally into x6 / x7. v8 receives the running hash v11 with its
// 64-bit halves swapped, to be fed into the first block that gets GHASHed.
//
// Dispatch on x5 (signed b.gt):
//   > 48 : .L256_dec_blocks_more_than_3   (v0, v1, v2, v3 all used)
//   > 32 : .L256_dec_blocks_more_than_2   (v0, then v2 <- v1, v3 <- v2)
//   > 16 : .L256_dec_blocks_more_than_1   (v0, then v3 <- v1)
//   else : .L256_dec_blocks_less_than_1   (v0 only)
// The register moves line the unused paths up with the registers the later
// labels read. For every prepared block that ends up unused, w12 is
// decremented once so the counter stored at the end matches the blocks
// actually consumed. On the three shorter paths v9 / v10 / v11 are cleared,
// because the later labels accumulate into them with eor.
6134.L256_dec_tail:	//TAIL
6135
6136	sub	x5, x4, x0   //main_end_input_ptr is number of bytes left to process
6137	ld1	{ v5.16b}, [x0], #16                      //AES block 4k+4 - load ciphertext
6138
6139	eor	v0.16b, v5.16b, v0.16b                            //AES block 4k+4 - result
6140
6141	mov	x6, v0.d[0]                            //AES block 4k+4 - mov low
6142
6143	mov	x7, v0.d[1]                            //AES block 4k+4 - mov high
6144	ext	v8.16b, v11.16b, v11.16b, #8                     //prepare final partial tag
6145
6146	cmp	x5, #48
6147
6148	eor	x6, x6, x13                   //AES block 4k+4 - round 14 low
6149#ifdef __AARCH64EB__
6150	rev	x6, x6
6151#endif
6152
6153	eor	x7, x7, x14                   //AES block 4k+4 - round 14 high
6154#ifdef __AARCH64EB__
6155	rev	x7, x7
6156#endif
// flags are still those of "cmp x5, #48"; eor / rev do not write them
6157	b.gt	.L256_dec_blocks_more_than_3
6158
// at most 3 blocks left: one prepared counter block is unused
6159	sub	w12, w12, #1
6160	mov	v3.16b, v2.16b
6161	movi	v10.8b, #0
6162
6163	movi	v11.8b, #0
6164	cmp	x5, #32
6165
6166	movi	v9.8b, #0
6167	mov	v2.16b, v1.16b
6168	b.gt	.L256_dec_blocks_more_than_2
6169
// at most 2 blocks left: a second prepared counter block is unused
6170	sub	w12, w12, #1
6171
6172	mov	v3.16b, v1.16b
6173	cmp	x5, #16
6174	b.gt	.L256_dec_blocks_more_than_1
6175
// at most 1 block left: a third prepared counter block is unused
6176	sub	w12, w12, #1
6177	b	.L256_dec_blocks_less_than_1
// .L256_dec_blocks_more_than_3 - more than 48 bytes were left at the tail.
//
// Stores the already finished "final-3" plaintext block (x6 / x7), GHASHes
// its ciphertext (v5, byte-reversed into v4 and XORed with the partial tag
// v8) against v15 - middle term via v17.d[1] - and writes the products
// straight into v9 (high), v10 (mid) and v11 (low), so no prior clearing of
// the accumulators is needed on this path. In parallel the next ciphertext
// block is loaded into v5 and decrypted with v1 into x6 / x7.
// v8 is zeroed afterwards so the tag is fed in only once; v22 is used as a
// scratch register here. Falls through into .L256_dec_blocks_more_than_2.
6178.L256_dec_blocks_more_than_3:	//blocks	left >  3
6179	rev64	v4.16b, v5.16b                                   //GHASH final-3 block
6180	ld1	{ v5.16b}, [x0], #16                     //AES final-2 block - load ciphertext
6181
6182	stp	x6, x7, [x2], #16       //AES final-3 block  - store result
6183
6184	mov	d10, v17.d[1]                              //GHASH final-3 block - mid
6185
6186	eor	v4.16b, v4.16b, v8.16b                          //feed in partial tag
6187
6188	eor	v0.16b, v5.16b, v1.16b                           //AES final-2 block - result
6189
6190	mov	d22, v4.d[1]                                //GHASH final-3 block - mid
6191
6192	mov	x6, v0.d[0]                           //AES final-2 block - mov low
6193
6194	mov	x7, v0.d[1]                           //AES final-2 block - mov high
6195
6196	eor	v22.8b, v22.8b, v4.8b                     //GHASH final-3 block - mid
6197
6198	movi	v8.8b, #0                                       //suppress further partial tag feed in
6199
6200	pmull2	v9.1q, v4.2d, v15.2d                      //GHASH final-3 block - high
6201
6202	pmull	v10.1q, v22.1d, v10.1d                   //GHASH final-3 block - mid
6203	eor	x6, x6, x13                  //AES final-2 block - round 14 low
6204#ifdef __AARCH64EB__
6205	rev	x6, x6
6206#endif
6207
6208	pmull	v11.1q, v4.1d, v15.1d                      //GHASH final-3 block - low
6209	eor	x7, x7, x14                  //AES final-2 block - round 14 high
6210#ifdef __AARCH64EB__
6211	rev	x7, x7
6212#endif
// .L256_dec_blocks_more_than_2 - reached by fall-through or directly from
// the tail dispatch when more than 32 bytes were left.
//
// Stores the finished "final-2" plaintext block (x6 / x7) and GHASHes its
// ciphertext (v5 -> v4, XORed with v8, which is the partial tag on direct
// entry and zero after fall-through) against v14, middle term against the
// low half of v17. The products go through the scratch registers
// v20 / v21 / v22 and are accumulated into v9 / v11 / v10 with eor.
// Meanwhile the next ciphertext block is loaded into v5 and decrypted with
// v2 into x6 / x7. Falls through into .L256_dec_blocks_more_than_1.
6213.L256_dec_blocks_more_than_2:	//blocks	left >  2
6214
6215	rev64	v4.16b, v5.16b                                   //GHASH final-2 block
6216	ld1	{ v5.16b}, [x0], #16                     //AES final-1 block - load ciphertext
6217
6218	eor	v4.16b, v4.16b, v8.16b                          //feed in partial tag
6219	stp	x6, x7, [x2], #16       //AES final-2 block  - store result
6220
6221	eor	v0.16b, v5.16b, v2.16b                           //AES final-1 block - result
6222
6223	mov	d22, v4.d[1]                                //GHASH final-2 block - mid
6224
6225	pmull	v21.1q, v4.1d, v14.1d                         //GHASH final-2 block - low
6226
6227	pmull2	v20.1q, v4.2d, v14.2d                         //GHASH final-2 block - high
6228
6229	eor	v22.8b, v22.8b, v4.8b                     //GHASH final-2 block - mid
6230	mov	x6, v0.d[0]                           //AES final-1 block - mov low
6231
6232	mov	x7, v0.d[1]                           //AES final-1 block - mov high
6233	eor	v11.16b, v11.16b, v21.16b                           //GHASH final-2 block - low
6234	movi	v8.8b, #0                                       //suppress further partial tag feed in
6235
6236	pmull	v22.1q, v22.1d, v17.1d                     //GHASH final-2 block - mid
6237
6238	eor	v9.16b, v9.16b, v20.16b                           //GHASH final-2 block - high
6239	eor	x6, x6, x13                  //AES final-1 block - round 14 low
6240#ifdef __AARCH64EB__
6241	rev	x6, x6
6242#endif
6243
6244	eor	v10.16b, v10.16b, v22.16b                      //GHASH final-2 block - mid
6245	eor	x7, x7, x14                  //AES final-1 block - round 14 high
6246#ifdef __AARCH64EB__
6247	rev	x7, x7
6248#endif
// .L256_dec_blocks_more_than_1 - reached by fall-through or directly from
// the tail dispatch when more than 16 bytes were left.
//
// Stores the finished "final-1" plaintext block (x6 / x7) and GHASHes its
// ciphertext (v5 -> v4, XORed with v8) against v13; the middle term is
// duplicated into both halves of v22 so that pmull2 can multiply it by the
// upper half of v16. Products are accumulated into v9 / v10 / v11.
// The final ciphertext block is then loaded into v5 and decrypted with v3
// into x6 / x7; it is NOT stored here - .L256_dec_blocks_less_than_1 masks
// and stores it.
// NOTE(review): the ld1 always reads 16 bytes, even if the final block is
// partial; the surplus bytes are masked off in the following section.
6249.L256_dec_blocks_more_than_1:	//blocks	left >  1
6250
6251	stp	x6, x7, [x2], #16       //AES final-1 block  - store result
6252	rev64	v4.16b, v5.16b                                   //GHASH final-1 block
6253
6254	ld1	{ v5.16b}, [x0], #16                     //AES final block - load ciphertext
6255
6256	eor	v4.16b, v4.16b, v8.16b                          //feed in partial tag
6257	movi	v8.8b, #0                                       //suppress further partial tag feed in
6258
6259	mov	d22, v4.d[1]                                //GHASH final-1 block - mid
6260
6261	eor	v0.16b, v5.16b, v3.16b                           //AES final block - result
6262
6263	pmull2	v20.1q, v4.2d, v13.2d                         //GHASH final-1 block - high
6264
6265	eor	v22.8b, v22.8b, v4.8b                     //GHASH final-1 block - mid
6266
6267	pmull	v21.1q, v4.1d, v13.1d                         //GHASH final-1 block - low
6268	mov	x6, v0.d[0]                           //AES final block - mov low
6269
6270	ins	v22.d[1], v22.d[0]                           //GHASH final-1 block - mid
6271
6272	mov	x7, v0.d[1]                           //AES final block - mov high
6273
6274	pmull2	v22.1q, v22.2d, v16.2d                     //GHASH final-1 block - mid
6275	eor	x6, x6, x13                  //AES final block - round 14 low
6276#ifdef __AARCH64EB__
6277	rev	x6, x6
6278#endif
6279	eor	v11.16b, v11.16b, v21.16b                           //GHASH final-1 block - low
6280
6281	eor	v9.16b, v9.16b, v20.16b                           //GHASH final-1 block - high
6282
6283	eor	v10.16b, v10.16b, v22.16b                      //GHASH final-1 block - mid
6284	eor	x7, x7, x14                  //AES final block - round 14 high
6285#ifdef __AARCH64EB__
6286	rev	x7, x7
6287#endif
// .L256_dec_blocks_less_than_1 - last (possibly partial) block, final GHASH
// reduction, state write-back and function epilogue.
//
// State on entry: x6 / x7 = decrypted last block, v5 = its ciphertext,
// v8 = partial tag still to be fed in (or zero), v9 / v10 / v11 = GHASH
// high / mid / low accumulators, w12 = counter value to store.
//
// Steps:
//   1. Build a 128-bit mask in x9 (first doubleword) / x10 (second
//      doubleword) that keeps only the bits belonging to the message;
//      x13 / x14 are reused as all-ones scratch and no longer hold key
//      material afterwards.
//   2. Merge the masked plaintext with the 16 bytes already present at [x2],
//      so bytes behind the end of the message are written back unchanged.
//   3. Mask the ciphertext block the same way, GHASH it against v12 (middle
//      term against the low half of v16) and reduce v9:v10:v11 into v11.
//   4. Store the output block, the counter word at [x16, #12] and the
//      byte-reordered hash state at [x3]; return x15 in x0 (x15 is set
//      before this section - presumably the processed byte count, confirm
//      in the setup code).
//   5. Restore x19..x24 and d8..d15 and release the 112-byte frame
//      (the matching prologue is outside this section).
6288.L256_dec_blocks_less_than_1:	//blocks	left <= 1
6289
// x1 = (128 - (bit_length mod 128)) mod 128 = number of unused bits in the
// last block; 0 when the message is a whole number of blocks.
6290	and	x1, x1, #127                   //bit_length %= 128
6291	mvn	x14, xzr                                     //rk14_h = 0xffffffffffffffff
6292
6293	sub	x1, x1, #128                   //bit_length -= 128
6294	mvn	x13, xzr                                     //rk14_l = 0xffffffffffffffff
6295
6296	ldp	x4, x5, [x2] //load existing bytes we need to not overwrite
6297	neg	x1, x1                         //bit_length = 128 - #bits in input (in range [1,128])
6298
6299	and	x1, x1, #127                   //bit_length %= 128
6300
// A register-specified lsr uses the shift amount modulo 64, so x14 becomes
// all-ones >> (x1 mod 64). The cmp / csel pair then places it:
//   x1 <  64: x9 = all ones, x10 = x14   (only the second doubleword is cut)
//   x1 >= 64: x9 = x14,      x10 = 0     (second doubleword fully masked)
6301	lsr	x14, x14, x1                    //rk14_h is mask for top 64b of last block
6302	cmp	x1, #64
6303
6304	csel	x9, x13, x14, lt
6305	csel	x10, x14, xzr, lt
6306
6307	fmov	d0, x9                                  //ctr0b is mask for last block
6308	and	x6, x6, x9
6309
6310	mov	v0.d[1], x10
6311	bic	x4, x4, x9          //mask out low existing bytes
6312
// x9 is free again after the bic above; it now carries the counter word that
// is stored to [x16, #12] below (byte-reversed on little-endian builds).
6313#ifndef __AARCH64EB__
6314	rev	w9, w12
6315#else
6316	mov	w9, w12
6317#endif
6318
6319	bic	x5, x5, x10      //mask out high existing bytes
6320
6321	orr	x6, x6, x4
6322
6323	and	x7, x7, x10
6324
6325	orr	x7, x7, x5
6326
6327	and	v5.16b, v5.16b, v0.16b                            //possibly partial last block has zeroes in highest bits
6328
6329	rev64	v4.16b, v5.16b                                    //GHASH final block
6330
6331	eor	v4.16b, v4.16b, v8.16b                           //feed in partial tag
6332
6333	pmull	v21.1q, v4.1d, v12.1d                          //GHASH final block - low
6334
6335	mov	d8, v4.d[1]                                  //GHASH final block - mid
6336
6337	eor	v8.8b, v8.8b, v4.8b                          //GHASH final block - mid
6338
6339	pmull2	v20.1q, v4.2d, v12.2d                          //GHASH final block - high
6340
6341	pmull	v8.1q, v8.1d, v16.1d                          //GHASH final block - mid
6342
6343	eor	v9.16b, v9.16b, v20.16b                            //GHASH final block - high
6344
6345	eor	v11.16b, v11.16b, v21.16b                            //GHASH final block - low
6346
6347	eor	v10.16b, v10.16b, v8.16b                         //GHASH final block - mid
// reduction constant: 0xc2 in every byte of d8, then shl #56 leaves
// 0xc200000000000000
6348	movi	v8.8b, #0xc2
6349
6350	eor	v6.16b, v11.16b, v9.16b                         //MODULO - karatsuba tidy up
6351
6352	shl	d8, d8, #56               //mod_constant
6353
6354	eor	v10.16b, v10.16b, v6.16b                         //MODULO - karatsuba tidy up
6355
6356	pmull	v7.1q, v9.1d, v8.1d            //MODULO - top 64b align with mid
6357
6358	ext	v9.16b, v9.16b, v9.16b, #8                     //MODULO - other top alignment
6359
6360	eor	v10.16b, v10.16b, v7.16b                      //MODULO - fold into mid
6361
6362	eor	v10.16b, v10.16b, v9.16b                         //MODULO - fold into mid
6363
6364	pmull	v8.1q, v10.1d, v8.1d     //MODULO - mid 64b align with low
6365
6366	ext	v10.16b, v10.16b, v10.16b, #8                     //MODULO - other mid alignment
6367
6368	eor	v11.16b, v11.16b, v8.16b               //MODULO - fold into low
6369
// last output block: merged plaintext + preserved bytes, no post-increment
6370	stp	x6, x7, [x2]
6371
6372	str	w9, [x16, #12]                          //store the updated counter
6373
6374	eor	v11.16b, v11.16b, v10.16b                         //MODULO - fold into low
// swap the 64-bit halves and reverse the bytes within each half before
// writing the hash state to [x3]
6375	ext	v11.16b, v11.16b, v11.16b, #8
6376	rev64	v11.16b, v11.16b
// return value
6377	mov	x0, x15
6378	st1	{ v11.16b }, [x3]
6379
// epilogue: reload the saved registers and pop the 112-byte frame
6380	ldp	x21, x22, [sp, #16]
6381	ldp	x23, x24, [sp, #32]
6382	ldp	d8, d9, [sp, #48]
6383	ldp	d10, d11, [sp, #64]
6384	ldp	d12, d13, [sp, #80]
6385	ldp	d14, d15, [sp, #96]
6386	ldp	x19, x20, [sp], #112
6387	ret
6388
// .L256_dec_ret - early exit: returns 0 in w0 without restoring anything
// from the stack. The branch that targets this label is outside this
// section - presumably the zero-length check at function entry, taken
// before the frame is pushed (confirm against the prologue).
6389.L256_dec_ret:
6390	mov	w0, #0x0
6391	ret
6392.size	aes_gcm_dec_256_kernel,.-aes_gcm_dec_256_kernel
// NUL-terminated ASCII identification string:
// "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
6393.byte	71,72,65,83,72,32,102,111,114,32,65,82,77,118,56,44,32,67,82,89,80,84,79,71,65,77,83,32,98,121,32,60,97,112,112,114,111,64,111,112,101,110,115,115,108,46,111,114,103,62,0
// the .align directive is emitted twice by the generator; the second one
// has nothing left to pad
6394.align	2
6395.align	2
6396#endif
6397