xref: /freebsd/crypto/openssl/crypto/modes/asm/aes-gcm-armv8_64.pl (revision 1bd9ca8b7548e5f573ae8186f3519f4bedff3a92)
1#! /usr/bin/env perl
2# Copyright 2019-2023 The OpenSSL Project Authors. All Rights Reserved.
3#
4# Licensed under the Apache License 2.0 (the "License").  You may not use
5# this file except in compliance with the License.  You can obtain a copy
6# in the file LICENSE in the source distribution or at
7# https://www.openssl.org/source/license.html
8
9#
10#========================================================================
11# Written by Fangming Fang <fangming.fang@arm.com> for the OpenSSL project,
12# derived from https://github.com/ARM-software/AArch64cryptolib, original
13# author Samuel Lee <Samuel.Lee@arm.com>. The module is, however, dual
14# licensed under OpenSSL and CRYPTOGAMS licenses depending on where you
15# obtain it. For further details see http://www.openssl.org/~appro/cryptogams/.
16#========================================================================
17#
18# Approach - assume we don't want to reload constants, so reserve ~half of vector register file for constants
19#
20# main loop to act on 4 16B blocks per iteration, and then do modulo of the accumulated intermediate hashes from the 4 blocks
21#
22#  ____________________________________________________
23# |                                                    |
24# | PRE                                                |
25# |____________________________________________________|
26# |                |                |                  |
27# | CTR block 4k+8 | AES block 4k+4 | GHASH block 4k+0 |
28# |________________|________________|__________________|
29# |                |                |                  |
30# | CTR block 4k+9 | AES block 4k+5 | GHASH block 4k+1 |
31# |________________|________________|__________________|
32# |                |                |                  |
33# | CTR block 4k+10| AES block 4k+6 | GHASH block 4k+2 |
34# |________________|________________|__________________|
35# |                |                |                  |
36# | CTR block 4k+11| AES block 4k+7 | GHASH block 4k+3 |
37# |________________|____(mostly)____|__________________|
38# |                                                    |
39# | MODULO                                             |
40# |____________________________________________________|
41#
42# PRE:
43#     Ensure the previously generated intermediate hash is aligned and merged with result for GHASH 4k+0
44# EXT low_acc, low_acc, low_acc, #8
45# EOR res_curr (4k+0), res_curr (4k+0), low_acc
46#
47# CTR block:
48#     Increment and byte reverse counter in scalar registers and transfer to SIMD registers
49# REV     ctr32, rev_ctr32
50# ORR     ctr64, constctr96_top32, ctr32, LSL #32
51# INS     ctr_next.d[0], constctr96_bottom64      // Keeping this in scalar registers to free up space in SIMD RF
52# INS     ctr_next.d[1], ctr64X
53# ADD     rev_ctr32, #1
54#
55# AES block:
56#     Do AES encryption/decryption on CTR block X and EOR it with input block X. Take a 256-bit key (round keys k0..k14) below as an example.
57#     Doing small trick here of loading input in scalar registers, EORing with last key and then transferring
58#     Given we are very constrained in our ASIMD registers this is quite important
59#
60#     Encrypt:
61# LDR     input_low, [ input_ptr  ], #8
62# LDR     input_high, [ input_ptr  ], #8
63# EOR     input_low, k14_low
64# EOR     input_high, k14_high
65# INS     res_curr.d[0], input_low
66# INS     res_curr.d[1], input_high
67# AESE    ctr_curr, k0; AESMC ctr_curr, ctr_curr
68# AESE    ctr_curr, k1; AESMC ctr_curr, ctr_curr
69# AESE    ctr_curr, k2; AESMC ctr_curr, ctr_curr
70# AESE    ctr_curr, k3; AESMC ctr_curr, ctr_curr
71# AESE    ctr_curr, k4; AESMC ctr_curr, ctr_curr
72# AESE    ctr_curr, k5; AESMC ctr_curr, ctr_curr
73# AESE    ctr_curr, k6; AESMC ctr_curr, ctr_curr
74# AESE    ctr_curr, k7; AESMC ctr_curr, ctr_curr
75# AESE    ctr_curr, k8; AESMC ctr_curr, ctr_curr
76# AESE    ctr_curr, k9; AESMC ctr_curr, ctr_curr
77# AESE    ctr_curr, k10; AESMC ctr_curr, ctr_curr
78# AESE    ctr_curr, k11; AESMC ctr_curr, ctr_curr
79# AESE    ctr_curr, k12; AESMC ctr_curr, ctr_curr
80# AESE    ctr_curr, k13
81# EOR     res_curr, res_curr, ctr_curr
82# ST1     { res_curr.16b  }, [ output_ptr  ], #16
83#
84#     Decrypt:
85# AESE    ctr_curr, k0; AESMC ctr_curr, ctr_curr
86# AESE    ctr_curr, k1; AESMC ctr_curr, ctr_curr
87# AESE    ctr_curr, k2; AESMC ctr_curr, ctr_curr
88# AESE    ctr_curr, k3; AESMC ctr_curr, ctr_curr
89# AESE    ctr_curr, k4; AESMC ctr_curr, ctr_curr
90# AESE    ctr_curr, k5; AESMC ctr_curr, ctr_curr
91# AESE    ctr_curr, k6; AESMC ctr_curr, ctr_curr
92# AESE    ctr_curr, k7; AESMC ctr_curr, ctr_curr
93# AESE    ctr_curr, k8; AESMC ctr_curr, ctr_curr
94# AESE    ctr_curr, k9; AESMC ctr_curr, ctr_curr
95# AESE    ctr_curr, k10; AESMC ctr_curr, ctr_curr
96# AESE    ctr_curr, k11; AESMC ctr_curr, ctr_curr
97# AESE    ctr_curr, k12; AESMC ctr_curr, ctr_curr
98# AESE    ctr_curr, k13
99# LDR     res_curr, [ input_ptr  ], #16
100# EOR     res_curr, res_curr, ctr_curr
101# MOV     output_low, res_curr.d[0]
102# MOV     output_high, res_curr.d[1]
103# EOR     output_low, k14_low
104# EOR     output_high, k14_high
105# STP     output_low, output_high, [ output_ptr  ], #16
106#
107# GHASH block X:
108#     do 128b karatsuba polynomial multiplication on block
109#     We only have 64b->128b polynomial multipliers, naively that means we need to do 4 64b multiplies to generate a 128b
110#
111# multiplication:
112#     Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ (Pmull(Ah,Bl) ^ Pmull(Al,Bh))<<64
113#
114#     The idea behind Karatsuba multiplication is that we can do just 3 64b multiplies:
115#     Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ (Pmull(Ah^Al,Bh^Bl) ^ Pmull(Ah,Bh) ^ Pmull(Al,Bl))<<64
116#
117#     There is some complication here because the bit order of GHASH's PMULL is reversed compared to elsewhere, so we are
118#     multiplying with "twisted" powers of H
119#
120# Note: We can PMULL directly into the acc_x in first GHASH of the loop
121# Note: For scheduling big cores we want to split the processing to happen over two loop iterations - otherwise the critical
122#       path latency dominates the performance
123#
124#       This has a knock on effect on register pressure, so we have to be a bit more clever with our temporary registers
125#       than indicated here
126# REV64   res_curr, res_curr
127# INS     t_m.d[0], res_curr.d[1]
128# EOR     t_m.8B, t_m.8B, res_curr.8B
129# PMULL2  t_h, res_curr, HX
130# PMULL   t_l, res_curr, HX
131# PMULL   t_m, t_m, HX_k
132# EOR     acc_h, acc_h, t_h
133# EOR     acc_l, acc_l, t_l
134# EOR     acc_m, acc_m, t_m
135#
136# MODULO: take the partial accumulators (~representing sum of 256b multiplication results), from GHASH and do modulo reduction on them
137#         There is some complication here because the bit order of GHASH's PMULL is reversed compared to elsewhere, so we are doing modulo
138#         with a reversed constant
139# EOR     acc_m, acc_m, acc_h
140# EOR     acc_m, acc_m, acc_l                     // Finish off karatsuba processing
141# PMULL   t_mod, acc_h, mod_constant
142# EXT     acc_h, acc_h, acc_h, #8
143# EOR     acc_m, acc_m, acc_h
144# EOR     acc_m, acc_m, t_mod
145# PMULL   acc_h, acc_m, mod_constant
146# EXT     acc_m, acc_m, acc_m, #8
147# EOR     acc_l, acc_l, acc_h
148# EOR     acc_l, acc_l, acc_m
149
# Command-line handling:
#   - if the last argument ends in ".<word>" it is popped and used as $output;
#   - then, if the first remaining argument contains no ".", it is shifted and
#     used as $flavour.
# Either variable is undef when its argument is absent.
150$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
151$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
152
# Locate arm-xlate.pl: first in the directory part of $0 (captured with its
# trailing "/" or "\" into $dir), then in ../../perlasm/ relative to it.
# Each candidate is assigned to $xlate and accepted only if it is a plain file;
# the script dies if neither exists.
153$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
154( $xlate="${dir}arm-xlate.pl" and -f $xlate  ) or
155( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate ) or
156die "can't locate arm-xlate.pl";
157
# Run arm-xlate.pl under the current perl interpreter ($^X) and pipe this
# script's output into it, passing $flavour and $output as its arguments.
# STDOUT is then aliased to that pipe so later prints reach the translator.
# Abort with a diagnostic if the pipe cannot be created; otherwise STDOUT
# would point at an unopened handle and the generated code would be lost.
158open OUT,"| \"$^X\" $xlate $flavour $output"
    or die "can't call $xlate: $!";
159*STDOUT=*OUT;
160
# Scalar register names interpolated into the assembly templates below.
# x0-x3 are used as they arrive; the aes_gcm_enc_128_kernel prologue copies
# x4 into x16 ($counter) and x5 into x8 ($cc) before using them.
160$input_ptr="x0";  #argument block
161$bit_length="x1";     # input length in bits (the kernel shifts it right by 3 for bytes)
162$output_ptr="x2";     # destination of the stored result blocks
163$current_tag="x3";    # base for the tag load and the h1..h4 loads at fixed offsets
164$counter="x16";       # address the 16-byte initial counter block is loaded from
165$cc="x8";             # base address the round keys rk0..rk10 are loaded from
167
168{
# Register allocation for the assembly templates in this scope. Every variable
# holds the textual name of an AArch64 register (or a view of one) and is
# interpolated into the generated code. Several names map to the same physical
# register, so the values they stand for cannot all be live at the same time.

# x4-x7: end-of-input pointers and the two 64-bit halves of input block 0.
# x19-x24: halves of blocks 1-3; the $input_* and $output_* names share them,
# and $output_l0/$output_h0 share x6/x7 with $input_l0/$input_h0.
169my ($end_input_ptr,$main_end_input_ptr,$input_l0,$input_h0)=map("x$_",(4..7));
170my ($input_l1,$input_h1,$input_l2,$input_h2,$input_l3,$input_h3)=map("x$_",(19..24));
171my ($output_l1,$output_h1,$output_l2,$output_h2,$output_l3,$output_h3)=map("x$_",(19..24));
172my ($output_l0,$output_h0)=map("x$_",(6..7));
173
# x9-x15: counter bookkeeping, the last round key kept in scalar registers
# ($rk10_l/$rk10_h = x13/x14) and the byte length. The w9/w11/w12 names are the
# 32-bit views of x9/x11/x12.
174my $ctr32w="w9";
175my ($ctr32x,$ctr96_b64x,$ctr96_t32x,$rctr32x,$rk10_l,$rk10_h,$len)=map("x$_",(9..15));
176my ($ctr96_t32w,$rctr32w)=map("w$_",(11..12));
177
# v0-v3: counter blocks 0-3; v4-v7: result blocks 0-3.
# The *b / bare / *d / *q names are the .16b, vN, dN and qN spellings of the
# same registers.
178my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$res0b,$res1b,$res2b,$res3b)=map("v$_.16b",(0..7));
179my ($ctr0,$ctr1,$ctr2,$ctr3,$res0,$res1,$res2,$res3)=map("v$_",(0..7));
180my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$res0d,$res1d,$res2d,$res3d)=map("d$_",(0..7));
181my ($res0q,$res1q,$res2q,$res3q)=map("q$_",(4..7));
182
# v9-v11: GHASH accumulators (high, mid, low).
183my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(9..11));
184my ($acc_h,$acc_m,$acc_l)=map("v$_",(9..11));
185my ($acc_hd,$acc_md,$acc_ld)=map("d$_",(9..11));
186
# v12-v15: h1..h4; v16/v17: the combined $h12k / $h34k values.
187my ($h1,$h2,$h3,$h4,$h12k,$h34k)=map("v$_",(12..17));
188my ($h1q,$h2q,$h3q,$h4q)=map("q$_",(12..15));
189my ($h1b,$h2b,$h3b,$h4b)=map("v$_.16b",(12..15));
190
# Temporaries. Note the sharing: $t0, $t4 and $mod_constant are all v8;
# $t1/$t5 are v28; $t2/$t8 are v29; $t3/$t9 are v30; $t6/$mod_t are v31;
# $t7 is v4, i.e. the same register as $res0 and $ctr_t0.
191my $t0="v8";
192my $t0d="d8";
193
194my ($t1,$t2,$t3)=map("v$_",(28..30));
195my ($t1d,$t2d,$t3d)=map("d$_",(28..30));
196
197my $t4="v8";
198my $t4d="d8";
199my $t5="v28";
200my $t5d="d28";
201my $t6="v31";
202my $t6d="d31";
203
204my $t7="v4";
205my $t7d="d4";
206my $t8="v29";
207my $t8d="d29";
208my $t9="v30";
209my $t9d="d30";
210
# $ctr_t0..$ctr_t3 are v4-v7, the same registers as $res0..$res3.
211my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3)=map("v$_",(4..7));
212my ($ctr_t0d,$ctr_t1d,$ctr_t2d,$ctr_t3d)=map("d$_",(4..7));
213my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b)=map("v$_.16b",(4..7));
214
# Reduction constant and its scratch register (shared with $t0/$t4 and $t6).
215my $mod_constantd="d8";
216my $mod_constant="v8";
217my $mod_t="v31";
218
# v18-v27: round keys rk0..rk9 in .16b, .4s and qN spellings, plus a few extra
# spellings of v20/v21/v22.
219my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9)=map("v$_.16b",(18..27));
220my ($rk0s,$rk1s,$rk2s,$rk3s,$rk4s,$rk5s,$rk6s,$rk7s,$rk8s,$rk9s)=map("v$_.4s",(18..27));
221my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q)=map("q$_",(18..27));
222my $rk2q1="v20.1q";
223my $rk3q1="v21.1q";
224my $rk4v="v22";
225my $rk4d="d22";
226
# Start of the generated output accumulated in $code: include arm_arch.h and
# open an "#if __ARM_MAX_ARCH__>=8" guard (its matching #endif is not in this
# part of the script). Flavours matching /64/ then get ".arch armv8-a+crypto"
# and ".text"; all other flavours get the NEON / Thumb-2 setup and an INST()
# macro instead.
# NOTE(review): the INST() lines interpolate $_byte, which is not assigned in
# the visible part of this script - confirm where it is expected to come from.
227$code=<<___;
228#include "arm_arch.h"
229
230#if __ARM_MAX_ARCH__>=8
231___
232$code.=".arch   armv8-a+crypto\n.text\n"    if ($flavour =~ /64/);
233$code.=<<___                    if ($flavour !~ /64/);
234.fpu    neon
235#ifdef __thumb2__
236.syntax        unified
237.thumb
238# define INST(a,b,c,d) $_byte  c,0xef,a,b
239#else
240.code  32
241# define INST(a,b,c,d) $_byte  a,b,c,0xf2
242#endif
243
244.text
245___
246
247#########################################################################################
248# size_t aes_gcm_enc_128_kernel(const unsigned char *in,
249#                               size_t bit_len,
250#                               unsigned char *out,
251#                               u64 *Xi,
252#                               unsigned char ivec[16],
253#                               const void *key);
254#
255$code.=<<___;
256.global aes_gcm_enc_128_kernel
257.type   aes_gcm_enc_128_kernel,%function
258.align  4
259aes_gcm_enc_128_kernel:
260	AARCH64_VALID_CALL_TARGET
261	cbz     x1, .L128_enc_ret
262	stp     x19, x20, [sp, #-112]!
263	mov     x16, x4
264	mov     x8, x5
265	stp     x21, x22, [sp, #16]
266	stp     x23, x24, [sp, #32]
267	stp     d8, d9, [sp, #48]
268	stp     d10, d11, [sp, #64]
269	stp     d12, d13, [sp, #80]
270	stp     d14, d15, [sp, #96]
271
272	ldp     $ctr96_b64x, $ctr96_t32x, [$counter]              @ ctr96_b64, ctr96_t32
273#ifdef __AARCH64EB__
274	rev     $ctr96_b64x, $ctr96_b64x
275	rev     $ctr96_t32x, $ctr96_t32x
276#endif
277	ldp     $rk10_l, $rk10_h, [$cc, #160]                     @ load rk10
278#ifdef __AARCH64EB__
279	ror     $rk10_l, $rk10_l, #32
280	ror     $rk10_h, $rk10_h, #32
281#endif
282	ld1     {$acc_lb}, [$current_tag]
283	ext     $acc_lb, $acc_lb, $acc_lb, #8
284	rev64   $acc_lb, $acc_lb
285	lsr     $main_end_input_ptr, $bit_length, #3              @ byte_len
286	mov     $len, $main_end_input_ptr
287
288	ld1     {$rk0s}, [$cc], #16								  @ load rk0
289	add     $end_input_ptr, $input_ptr, $bit_length, lsr #3   @ end_input_ptr
290	sub     $main_end_input_ptr, $main_end_input_ptr, #1      @ byte_len - 1
291
292	lsr     $rctr32x, $ctr96_t32x, #32
293	ldr     $h4q, [$current_tag, #112]                        @ load h4l | h4h
294#ifndef __AARCH64EB__
295	ext     $h4b, $h4b, $h4b, #8
296#endif
297	fmov    $ctr1d, $ctr96_b64x                               @ CTR block 1
298	rev     $rctr32w, $rctr32w                                @ rev_ctr32
299
300	add     $rctr32w, $rctr32w, #1                            @ increment rev_ctr32
301	orr     $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
302	ld1     {$rk1s}, [$cc], #16								  @ load rk1
303
304	rev     $ctr32w, $rctr32w                                 @ CTR block 1
305	add     $rctr32w, $rctr32w, #1                            @ CTR block 1
306	fmov    $ctr3d, $ctr96_b64x                               @ CTR block 3
307
308	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 1
309	ld1     { $ctr0b}, [$counter]                             @ special case vector load initial counter so we can start first AES block as quickly as possible
310
311	fmov    $ctr1.d[1], $ctr32x                               @ CTR block 1
312	rev     $ctr32w, $rctr32w                                 @ CTR block 2
313
314	fmov    $ctr2d, $ctr96_b64x                               @ CTR block 2
315	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 2
316	add     $rctr32w, $rctr32w, #1                            @ CTR block 2
317
318	fmov    $ctr2.d[1], $ctr32x                               @ CTR block 2
319	rev     $ctr32w, $rctr32w                                 @ CTR block 3
320
321	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 3
322	ld1     {$rk2s}, [$cc], #16								  @ load rk2
323
324	add     $rctr32w, $rctr32w, #1                            @ CTR block 3
325	fmov    $ctr3.d[1], $ctr32x                               @ CTR block 3
326
327	ldr     $h3q, [$current_tag, #80]                         @ load h3l | h3h
328#ifndef __AARCH64EB__
329	ext     $h3b, $h3b, $h3b, #8
330#endif
331	aese    $ctr1b, $rk0  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 0
332	ld1     {$rk3s}, [$cc], #16								  @ load rk3
333
334	aese    $ctr2b, $rk0  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 0
335	ldr     $h1q, [$current_tag, #32]                         @ load h1l | h1h
336#ifndef __AARCH64EB__
337	ext     $h1b, $h1b, $h1b, #8
338#endif
339
340	aese    $ctr0b, $rk0  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 0
341	ld1     {$rk4s}, [$cc], #16								  @ load rk4
342
343	aese    $ctr3b, $rk0  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 0
344	ld1     {$rk5s}, [$cc], #16								  @ load rk5
345
346	aese    $ctr2b, $rk1  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 1
347	trn2    $h34k.2d,  $h3.2d,    $h4.2d                      @ h4l | h3l
348
349	aese    $ctr0b, $rk1  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 1
350	ld1     {$rk6s}, [$cc], #16								  @ load rk6
351
352	aese    $ctr1b, $rk1  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 1
353	ld1     {$rk7s}, [$cc], #16								  @ load rk7
354
355	aese    $ctr3b, $rk1  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 1
356	trn1    $acc_h.2d, $h3.2d,    $h4.2d                      @ h4h | h3h
357
358	aese    $ctr0b, $rk2  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 2
359	ld1     {$rk8s}, [$cc], #16								  @ load rk8
360
361	aese    $ctr1b, $rk2  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 2
362	ldr     $h2q, [$current_tag, #64]                         @ load h2l | h2h
363#ifndef __AARCH64EB__
364	ext     $h2b, $h2b, $h2b, #8
365#endif
366
367	aese    $ctr3b, $rk2  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 2
368
369	aese    $ctr2b, $rk2  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 2
370	eor     $h34k.16b, $h34k.16b, $acc_h.16b                  @ h4k | h3k
371
372	aese    $ctr0b, $rk3  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 3
373
374	aese    $ctr1b, $rk3  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 3
375
376	aese    $ctr2b, $rk3  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 3
377	ld1     {$rk9s}, [$cc], #16								  @ load rk9
378
379	aese    $ctr3b, $rk3  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 3
380
381	and     $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0    @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
382	trn2    $h12k.2d,  $h1.2d,    $h2.2d                      @ h2l | h1l
383
384	aese    $ctr3b, $rk4  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 4
385	add     $main_end_input_ptr, $main_end_input_ptr, $input_ptr
386
387	aese    $ctr2b, $rk4  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 4
388	cmp     $input_ptr, $main_end_input_ptr                   @ check if we have <= 4 blocks
389
390	aese    $ctr0b, $rk4  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 4
391
392	aese    $ctr3b, $rk5  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 5
393
394	aese    $ctr2b, $rk5  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 5
395
396	aese    $ctr0b, $rk5  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 5
397
398	aese    $ctr3b, $rk6  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 6
399
400	aese    $ctr1b, $rk4  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 4
401
402	aese    $ctr2b, $rk6  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 6
403	trn1    $t0.2d,    $h1.2d,    $h2.2d                      @ h2h | h1h
404
405	aese    $ctr0b, $rk6  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 6
406
407	aese    $ctr1b, $rk5  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 5
408
409	aese    $ctr3b, $rk7  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 7
410
411	aese    $ctr0b, $rk7  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 7
412
413	aese    $ctr1b, $rk6  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 6
414
415	aese    $ctr2b, $rk7  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 7
416
417	aese    $ctr0b, $rk8  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 8
418
419	aese    $ctr1b, $rk7  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 7
420
421	aese    $ctr2b, $rk8  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 8
422
423	aese    $ctr3b, $rk8  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 8
424
425	aese    $ctr1b, $rk8  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 8
426
427	aese    $ctr2b, $rk9                                      @ AES block 2 - round 9
428
429	aese    $ctr0b, $rk9                                      @ AES block 0 - round 9
430
431	eor     $h12k.16b, $h12k.16b, $t0.16b                     @ h2k | h1k
432
433	aese    $ctr1b, $rk9                                      @ AES block 1 - round 9
434
435	aese    $ctr3b, $rk9                                      @ AES block 3 - round 9
436	b.ge    .L128_enc_tail                                    @ handle tail
437
438	ldp     $input_l0, $input_h0, [$input_ptr, #0]            @ AES block 0 - load plaintext
439#ifdef __AARCH64EB__
440	rev     $input_l0, $input_l0
441	rev     $input_h0, $input_h0
442#endif
443	ldp     $input_l2, $input_h2, [$input_ptr, #32]           @ AES block 2 - load plaintext
444#ifdef __AARCH64EB__
445	rev     $input_l2, $input_l2
446	rev     $input_h2, $input_h2
447#endif
448	ldp     $input_l1, $input_h1, [$input_ptr, #16]           @ AES block 1 - load plaintext
449#ifdef __AARCH64EB__
450	rev     $input_l1, $input_l1
451	rev     $input_h1, $input_h1
452#endif
453	ldp     $input_l3, $input_h3, [$input_ptr, #48]           @ AES block 3 - load plaintext
454#ifdef __AARCH64EB__
455	rev     $input_l3, $input_l3
456	rev     $input_h3, $input_h3
457#endif
458	eor     $input_l0, $input_l0, $rk10_l                     @ AES block 0 - round 10 low
459	eor     $input_h0, $input_h0, $rk10_h                     @ AES block 0 - round 10 high
460
461	eor     $input_l2, $input_l2, $rk10_l                     @ AES block 2 - round 10 low
462	fmov    $ctr_t0d, $input_l0                               @ AES block 0 - mov low
463
464	eor     $input_l1, $input_l1, $rk10_l                     @ AES block 1 - round 10 low
465	eor     $input_h2, $input_h2, $rk10_h                     @ AES block 2 - round 10 high
466	fmov    $ctr_t0.d[1], $input_h0                           @ AES block 0 - mov high
467
468	fmov    $ctr_t1d, $input_l1                               @ AES block 1 - mov low
469	eor     $input_h1, $input_h1, $rk10_h                     @ AES block 1 - round 10 high
470
471	eor     $input_l3, $input_l3, $rk10_l                     @ AES block 3 - round 10 low
472	fmov    $ctr_t1.d[1], $input_h1                           @ AES block 1 - mov high
473
474	fmov    $ctr_t2d, $input_l2                               @ AES block 2 - mov low
475	eor     $input_h3, $input_h3, $rk10_h                     @ AES block 3 - round 10 high
476	rev     $ctr32w, $rctr32w                                 @ CTR block 4
477
478	fmov    $ctr_t2.d[1], $input_h2                           @ AES block 2 - mov high
479	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 4
480
481	eor     $res0b, $ctr_t0b, $ctr0b                          @ AES block 0 - result
482	fmov    $ctr0d, $ctr96_b64x                               @ CTR block 4
483	add     $rctr32w, $rctr32w, #1                            @ CTR block 4
484
485	fmov    $ctr0.d[1], $ctr32x                               @ CTR block 4
486	rev     $ctr32w, $rctr32w                                 @ CTR block 5
487
488	eor     $res1b, $ctr_t1b, $ctr1b                          @ AES block 1 - result
489	fmov    $ctr1d, $ctr96_b64x                               @ CTR block 5
490	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 5
491
492	add     $rctr32w, $rctr32w, #1                            @ CTR block 5
493	add     $input_ptr, $input_ptr, #64                       @ AES input_ptr update
494	fmov    $ctr1.d[1], $ctr32x                               @ CTR block 5
495
496	fmov    $ctr_t3d, $input_l3                               @ AES block 3 - mov low
497	rev     $ctr32w, $rctr32w                                 @ CTR block 6
498	st1     { $res0b}, [$output_ptr], #16                     @ AES block 0 - store result
499
500	fmov    $ctr_t3.d[1], $input_h3                           @ AES block 3 - mov high
501	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 6
502
503	add     $rctr32w, $rctr32w, #1                            @ CTR block 6
504	eor     $res2b, $ctr_t2b, $ctr2b                          @ AES block 2 - result
505	st1     { $res1b}, [$output_ptr], #16                     @ AES block 1 - store result
506
507	fmov    $ctr2d, $ctr96_b64x                               @ CTR block 6
508	cmp     $input_ptr, $main_end_input_ptr                   @ check if we have <= 8 blocks
509
510	fmov    $ctr2.d[1], $ctr32x                               @ CTR block 6
511	rev     $ctr32w, $rctr32w                                 @ CTR block 7
512	st1     { $res2b}, [$output_ptr], #16                     @ AES block 2 - store result
513
514	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 7
515
516	eor     $res3b, $ctr_t3b, $ctr3b                          @ AES block 3 - result
517	st1     { $res3b}, [$output_ptr], #16                     @ AES block 3 - store result
518	b.ge    .L128_enc_prepretail                              @ do prepretail
519
520	.L128_enc_main_loop:                                      @ main loop start
521	ldp     $input_l3, $input_h3, [$input_ptr, #48]           @ AES block 4k+3 - load plaintext
522#ifdef __AARCH64EB__
523	rev     $input_l3, $input_l3
524	rev     $input_h3, $input_h3
525#endif
526	rev64   $res0b, $res0b                                    @ GHASH block 4k (only t0 is free)
527	rev64   $res2b, $res2b                                    @ GHASH block 4k+2 (t0, t1, and t2 free)
528
529	aese    $ctr2b, $rk0  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 0
530	fmov    $ctr3d, $ctr96_b64x                               @ CTR block 4k+3
531
532	ext     $acc_lb, $acc_lb, $acc_lb, #8                     @ PRE 0
533	rev64   $res1b, $res1b                                    @ GHASH block 4k+1 (t0 and t1 free)
534
535	aese    $ctr1b, $rk0  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 0
536	add     $rctr32w, $rctr32w, #1                            @ CTR block 4k+3
537	fmov    $ctr3.d[1], $ctr32x                               @ CTR block 4k+3
538
539	aese    $ctr0b, $rk0  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 0
540	mov     $t6d, $res2.d[1]                                  @ GHASH block 4k+2 - mid
541
542	aese    $ctr2b, $rk1  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 1
543	mov     $t3d, $res1.d[1]                                  @ GHASH block 4k+1 - mid
544
545	aese    $ctr1b, $rk1  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 1
546	eor     $res0b, $res0b, $acc_lb                           @ PRE 1
547
548	aese    $ctr3b, $rk0  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 0
549	eor     $input_h3, $input_h3, $rk10_h                     @ AES block 4k+3 - round 10 high
550
551	pmull2  $t1.1q, $res1.2d, $h3.2d                          @ GHASH block 4k+1 - high
552	eor     $t6.8b, $t6.8b, $res2.8b                          @ GHASH block 4k+2 - mid
553	ldp     $input_l0, $input_h0, [$input_ptr, #0]            @ AES block 4k+4 - load plaintext
554#ifdef __AARCH64EB__
555	rev     $input_l0, $input_l0
556	rev     $input_h0, $input_h0
557#endif
558	aese    $ctr0b, $rk1  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 1
559	rev     $ctr32w, $rctr32w                                 @ CTR block 4k+8
560
561	eor     $t3.8b, $t3.8b, $res1.8b                          @ GHASH block 4k+1 - mid
562	mov     $t0d, $res0.d[1]                                  @ GHASH block 4k - mid
563	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 4k+8
564
565	pmull2  $acc_h.1q, $res0.2d, $h4.2d                       @ GHASH block 4k - high
566	add     $rctr32w, $rctr32w, #1                            @ CTR block 4k+8
567	mov     $acc_md, $h34k.d[1]                               @ GHASH block 4k - mid
568
569	aese    $ctr0b, $rk2  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 2
570
571	pmull   $acc_l.1q, $res0.1d, $h4.1d                       @ GHASH block 4k - low
572	eor     $t0.8b, $t0.8b, $res0.8b                          @ GHASH block 4k - mid
573
574	aese    $ctr1b, $rk2  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 2
575
576	aese    $ctr0b, $rk3  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 3
577	eor     $acc_hb, $acc_hb, $t1.16b                         @ GHASH block 4k+1 - high
578
579	pmull   $t5.1q, $res2.1d, $h2.1d                          @ GHASH block 4k+2 - low
580
581	pmull   $acc_m.1q, $t0.1d, $acc_m.1d                      @ GHASH block 4k - mid
582	rev64   $res3b, $res3b                                    @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
583
584	pmull   $t3.1q, $t3.1d, $h34k.1d                          @ GHASH block 4k+1 - mid
585
586	pmull   $t2.1q, $res1.1d, $h3.1d                          @ GHASH block 4k+1 - low
587	ins     $t6.d[1], $t6.d[0]                                @ GHASH block 4k+2 - mid
588
589	pmull2  $t4.1q, $res2.2d, $h2.2d                          @ GHASH block 4k+2 - high
590	eor     $input_h0, $input_h0, $rk10_h                     @ AES block 4k+4 - round 10 high
591
592	eor     $acc_mb, $acc_mb, $t3.16b                         @ GHASH block 4k+1 - mid
593	mov     $t9d, $res3.d[1]                                  @ GHASH block 4k+3 - mid
594
595	aese    $ctr3b, $rk1  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 1
596	eor     $acc_lb, $acc_lb, $t2.16b                         @ GHASH block 4k+1 - low
597
598	aese    $ctr2b, $rk2  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 2
599	eor     $input_l0, $input_l0, $rk10_l                     @ AES block 4k+4 - round 10 low
600
601	aese    $ctr1b, $rk3  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 3
602	eor     $t9.8b, $t9.8b, $res3.8b                          @ GHASH block 4k+3 - mid
603
604	pmull2  $t7.1q, $res3.2d, $h1.2d                          @ GHASH block 4k+3 - high
605
606	aese    $ctr2b, $rk3  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 3
607	eor     $acc_hb, $acc_hb, $t4.16b                         @ GHASH block 4k+2 - high
608
609	pmull2  $t6.1q, $t6.2d, $h12k.2d                          @ GHASH block 4k+2 - mid
610
611	pmull   $t8.1q, $res3.1d, $h1.1d                          @ GHASH block 4k+3 - low
612	movi    $mod_constant.8b, #0xc2
613
614	pmull   $t9.1q, $t9.1d, $h12k.1d                          @ GHASH block 4k+3 - mid
615	eor     $acc_lb, $acc_lb, $t5.16b                         @ GHASH block 4k+2 - low
616
617	aese    $ctr1b, $rk4  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 4
618
619	aese    $ctr3b, $rk2  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 2
620	shl     $mod_constantd, $mod_constantd, #56               @ mod_constant
621
622	aese    $ctr0b, $rk4  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 4
623	eor     $acc_hb, $acc_hb, $t7.16b                         @ GHASH block 4k+3 - high
624
625	aese    $ctr1b, $rk5  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 5
626	ldp     $input_l1, $input_h1, [$input_ptr, #16]           @ AES block 4k+5 - load plaintext
627#ifdef __AARCH64EB__
628	rev     $input_l1, $input_l1
629	rev     $input_h1, $input_h1
630#endif
631	aese    $ctr3b, $rk3  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 3
632	eor     $acc_mb, $acc_mb, $t6.16b                         @ GHASH block 4k+2 - mid
633
634	aese    $ctr0b, $rk5  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 5
635	ldp     $input_l2, $input_h2, [$input_ptr, #32]           @ AES block 4k+6 - load plaintext
636#ifdef __AARCH64EB__
637	rev     $input_l2, $input_l2
638	rev     $input_h2, $input_h2
639#endif
640	pmull   $mod_t.1q, $acc_h.1d, $mod_constant.1d            @ MODULO - top 64b align with mid
641	eor     $acc_lb, $acc_lb, $t8.16b                         @ GHASH block 4k+3 - low
642
643	aese    $ctr2b, $rk4  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 4
644	eor     $input_l1, $input_l1, $rk10_l                     @ AES block 4k+5 - round 10 low
645
646	aese    $ctr3b, $rk4  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 4
647	eor     $acc_mb, $acc_mb, $t9.16b                         @ GHASH block 4k+3 - mid
648
649	aese    $ctr1b, $rk6  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 6
650	eor     $input_l3, $input_l3, $rk10_l                     @ AES block 4k+3 - round 10 low
651
652	aese    $ctr2b, $rk5  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 5
653	eor     $t9.16b, $acc_lb, $acc_hb                         @ MODULO - karatsuba tidy up
654
655	fmov    $ctr_t0d, $input_l0                               @ AES block 4k+4 - mov low
656	aese    $ctr0b, $rk6  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 6
657	fmov    $ctr_t0.d[1], $input_h0                           @ AES block 4k+4 - mov high
658
659	add     $input_ptr, $input_ptr, #64                       @ AES input_ptr update
660	fmov    $ctr_t3d, $input_l3                               @ AES block 4k+3 - mov low
661	ext     $acc_hb, $acc_hb, $acc_hb, #8                     @ MODULO - other top alignment
662
663	aese    $ctr3b, $rk5  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 5
664	fmov    $ctr_t1d, $input_l1                               @ AES block 4k+5 - mov low
665
666	aese    $ctr0b, $rk7  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 7
667	eor     $acc_mb, $acc_mb, $t9.16b                         @ MODULO - karatsuba tidy up
668
669	aese    $ctr2b, $rk6  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 6
670	eor     $input_h1, $input_h1, $rk10_h                     @ AES block 4k+5 - round 10 high
671
672	aese    $ctr1b, $rk7  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 7
673	fmov    $ctr_t1.d[1], $input_h1                           @ AES block 4k+5 - mov high
674
675	aese    $ctr0b, $rk8  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 8
676	fmov    $ctr_t3.d[1], $input_h3                           @ AES block 4k+3 - mov high
677
678	aese    $ctr3b, $rk6  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 6
679	cmp     $input_ptr, $main_end_input_ptr                   @ LOOP CONTROL
680
681	aese    $ctr1b, $rk8  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 8
682	eor     $acc_mb, $acc_mb, $mod_t.16b                      @ MODULO - fold into mid
683
684	aese    $ctr0b, $rk9                                      @ AES block 4k+4 - round 9
685	eor     $input_l2, $input_l2, $rk10_l                     @ AES block 4k+6 - round 10 low
686	eor     $input_h2, $input_h2, $rk10_h                     @ AES block 4k+6 - round 10 high
687
688	aese    $ctr3b, $rk7  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 7
689	fmov    $ctr_t2d, $input_l2                               @ AES block 4k+6 - mov low
690
691	aese    $ctr1b, $rk9                                      @ AES block 4k+5 - round 9
692	fmov    $ctr_t2.d[1], $input_h2                           @ AES block 4k+6 - mov high
693
694	aese    $ctr2b, $rk7  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 7
695	eor     $res0b, $ctr_t0b, $ctr0b                          @ AES block 4k+4 - result
696
697	fmov    $ctr0d, $ctr96_b64x                               @ CTR block 4k+8
698	aese    $ctr3b, $rk8  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 8
699
700	fmov    $ctr0.d[1], $ctr32x                               @ CTR block 4k+8
701	rev     $ctr32w, $rctr32w                                 @ CTR block 4k+9
702	eor     $acc_mb, $acc_mb, $acc_hb                         @ MODULO - fold into mid
703
704	aese    $ctr2b, $rk8  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 8
705	eor     $res1b, $ctr_t1b, $ctr1b                          @ AES block 4k+5 - result
706
707	add     $rctr32w, $rctr32w, #1                            @ CTR block 4k+9
708	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 4k+9
709	fmov    $ctr1d, $ctr96_b64x                               @ CTR block 4k+9
710
711	pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d            @ MODULO - mid 64b align with low
712	fmov    $ctr1.d[1], $ctr32x                               @ CTR block 4k+9
713	rev     $ctr32w, $rctr32w                                 @ CTR block 4k+10
714
715	aese    $ctr2b, $rk9                                      @ AES block 4k+6 - round 9
716	st1     { $res0b}, [$output_ptr], #16                     @ AES block 4k+4 - store result
717	eor     $res2b, $ctr_t2b, $ctr2b                          @ AES block 4k+6 - result
718	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 4k+10
719
720	aese    $ctr3b, $rk9                                      @ AES block 4k+7 - round 9
721	add     $rctr32w, $rctr32w, #1                            @ CTR block 4k+10
722	ext     $acc_mb, $acc_mb, $acc_mb, #8                     @ MODULO - other mid alignment
723	fmov    $ctr2d, $ctr96_b64x                               @ CTR block 4k+10
724
725	eor     $acc_lb, $acc_lb, $acc_hb                         @ MODULO - fold into low
726	st1     { $res1b}, [$output_ptr], #16                     @ AES block 4k+5 - store result
727
728	fmov    $ctr2.d[1], $ctr32x                               @ CTR block 4k+10
729	st1     { $res2b}, [$output_ptr], #16                     @ AES block 4k+6 - store result
730	rev     $ctr32w, $rctr32w                                 @ CTR block 4k+11
731
732	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 4k+11
733	eor     $res3b, $ctr_t3b, $ctr3b                          @ AES block 4k+3 - result
734
735	eor     $acc_lb, $acc_lb, $acc_mb                         @ MODULO - fold into low
736	st1     { $res3b}, [$output_ptr], #16                     @ AES block 4k+3 - store result
737	b.lt    .L128_enc_main_loop
738
739	.L128_enc_prepretail:                                     @ PREPRETAIL
740	rev64   $res0b, $res0b                                    @ GHASH block 4k (only t0 is free)
741	fmov    $ctr3d, $ctr96_b64x                               @ CTR block 4k+3
742	rev64   $res1b, $res1b                                    @ GHASH block 4k+1 (t0 and t1 free)
743
744	ext     $acc_lb, $acc_lb, $acc_lb, #8                     @ PRE 0
745	add     $rctr32w, $rctr32w, #1                            @ CTR block 4k+3
746	fmov    $ctr3.d[1], $ctr32x                               @ CTR block 4k+3
747
748	aese    $ctr1b, $rk0  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 0
749	rev64   $res2b, $res2b                                    @ GHASH block 4k+2 (t0, t1, and t2 free)
750
751	pmull   $t2.1q, $res1.1d, $h3.1d                          @ GHASH block 4k+1 - low
752
753	rev64   $res3b, $res3b                                    @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
754	eor     $res0b, $res0b, $acc_lb                           @ PRE 1
755
756	pmull2  $t1.1q, $res1.2d, $h3.2d                          @ GHASH block 4k+1 - high
757
758	aese    $ctr3b, $rk0  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 0
759	mov     $t3d, $res1.d[1]                                  @ GHASH block 4k+1 - mid
760
761	pmull   $acc_l.1q, $res0.1d, $h4.1d                       @ GHASH block 4k - low
762	mov     $t0d, $res0.d[1]                                  @ GHASH block 4k - mid
763
764	mov     $t6d, $res2.d[1]                                  @ GHASH block 4k+2 - mid
765	mov     $acc_md, $h34k.d[1]                               @ GHASH block 4k - mid
766
767	aese    $ctr1b, $rk1  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 1
768	eor     $t3.8b, $t3.8b, $res1.8b                          @ GHASH block 4k+1 - mid
769
770	eor     $t0.8b, $t0.8b, $res0.8b                          @ GHASH block 4k - mid
771
772	pmull2  $acc_h.1q, $res0.2d, $h4.2d                       @ GHASH block 4k - high
773	eor     $t6.8b, $t6.8b, $res2.8b                          @ GHASH block 4k+2 - mid
774
775	aese    $ctr3b, $rk1  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 1
776
777	pmull   $t3.1q, $t3.1d, $h34k.1d                          @ GHASH block 4k+1 - mid
778	eor     $acc_lb, $acc_lb, $t2.16b                         @ GHASH block 4k+1 - low
779
780	pmull   $acc_m.1q, $t0.1d, $acc_m.1d                      @ GHASH block 4k - mid
781
782	aese    $ctr0b, $rk0  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 0
783	ins     $t6.d[1], $t6.d[0]                                @ GHASH block 4k+2 - mid
784
785	aese    $ctr2b, $rk0  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 0
786
787	eor     $acc_mb, $acc_mb, $t3.16b                         @ GHASH block 4k+1 - mid
788	mov     $t9d, $res3.d[1]                                  @ GHASH block 4k+3 - mid
789
790	aese    $ctr0b, $rk1  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 1
791	eor     $acc_hb, $acc_hb, $t1.16b                         @ GHASH block 4k+1 - high
792
793	pmull2  $t6.1q, $t6.2d, $h12k.2d                          @ GHASH block 4k+2 - mid
794
795	pmull2  $t4.1q, $res2.2d, $h2.2d                          @ GHASH block 4k+2 - high
796	eor     $t9.8b, $t9.8b, $res3.8b                          @ GHASH block 4k+3 - mid
797
798	pmull2  $t7.1q, $res3.2d, $h1.2d                          @ GHASH block 4k+3 - high
799
800	pmull   $t5.1q, $res2.1d, $h2.1d                          @ GHASH block 4k+2 - low
801
802	aese    $ctr2b, $rk1  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 1
803	eor     $acc_hb, $acc_hb, $t4.16b                         @ GHASH block 4k+2 - high
804
805	aese    $ctr0b, $rk2  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 2
806
807	pmull   $t8.1q, $res3.1d, $h1.1d                          @ GHASH block 4k+3 - low
808	movi    $mod_constant.8b, #0xc2
809
810	aese    $ctr2b, $rk2  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 2
811	eor     $acc_lb, $acc_lb, $t5.16b                         @ GHASH block 4k+2 - low
812
813	aese    $ctr3b, $rk2  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 2
814
815	pmull   $t9.1q, $t9.1d, $h12k.1d                          @ GHASH block 4k+3 - mid
816	eor     $acc_mb, $acc_mb, $t6.16b                         @ GHASH block 4k+2 - mid
817
818	aese    $ctr2b, $rk3  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 3
819
820	aese    $ctr1b, $rk2  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 2
821	eor     $acc_hb, $acc_hb, $t7.16b                         @ GHASH block 4k+3 - high
822
823	aese    $ctr0b, $rk3  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 3
824
825	eor     $acc_mb, $acc_mb, $t9.16b                         @ GHASH block 4k+3 - mid
826	shl     $mod_constantd, $mod_constantd, #56               @ mod_constant
827
828	aese    $ctr1b, $rk3  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 3
829	eor     $acc_lb, $acc_lb, $t8.16b                         @ GHASH block 4k+3 - low
830
831	aese    $ctr0b, $rk4  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 4
832
833	pmull   $t1.1q, $acc_h.1d, $mod_constant.1d
834	eor     $acc_mb, $acc_mb, $acc_hb                         @ karatsuba tidy up
835
836	aese    $ctr1b, $rk4  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 4
837
838	aese    $ctr0b, $rk5  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 5
839	ext     $acc_hb, $acc_hb, $acc_hb, #8
840
841	aese    $ctr3b, $rk3  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 3
842
843	aese    $ctr2b, $rk4  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 4
844	eor     $acc_mb, $acc_mb, $acc_lb
845
846	aese    $ctr0b, $rk6  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 6
847
848	aese    $ctr3b, $rk4  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 4
849
850	aese    $ctr1b, $rk5  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 5
851
852	aese    $ctr2b, $rk5  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 5
853	eor     $acc_mb, $acc_mb, $t1.16b
854
855	aese    $ctr3b, $rk5  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 5
856
857	aese    $ctr1b, $rk6  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 6
858
859	aese    $ctr2b, $rk6  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 6
860
861	aese    $ctr3b, $rk6  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 6
862	eor     $acc_mb, $acc_mb, $acc_hb
863
864	aese    $ctr0b, $rk7  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 7
865
866	aese    $ctr2b, $rk7  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 7
867
868	aese    $ctr3b, $rk7  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 7
869
870	pmull   $t1.1q, $acc_m.1d, $mod_constant.1d
871
872	aese    $ctr1b, $rk7  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 7
873	ext     $acc_mb, $acc_mb, $acc_mb, #8
874
875	aese    $ctr3b, $rk8  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 8
876
877	aese    $ctr0b, $rk8  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 8
878	eor     $acc_lb, $acc_lb, $t1.16b
879
880	aese    $ctr1b, $rk8  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 8
881
882	aese    $ctr3b, $rk9                                      @ AES block 4k+7 - round 9
883
884	aese    $ctr2b, $rk8  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 8
885
886	aese    $ctr0b, $rk9                                      @ AES block 4k+4 - round 9
887
888	aese    $ctr1b, $rk9                                      @ AES block 4k+5 - round 9
889	eor     $acc_lb, $acc_lb, $acc_mb
890
891	aese    $ctr2b, $rk9                                      @ AES block 4k+6 - round 9
892	.L128_enc_tail:                                           @ TAIL
893
894	sub     $main_end_input_ptr, $end_input_ptr, $input_ptr   @ main_end_input_ptr is number of bytes left to process
895	ldp     $input_l0, $input_h0, [$input_ptr], #16           @ AES block 4k+4 - load plaintext
896#ifdef __AARCH64EB__
897	rev     $input_l0, $input_l0
898	rev     $input_h0, $input_h0
899#endif
900	cmp     $main_end_input_ptr, #48
901
902	ext     $t0.16b, $acc_lb, $acc_lb, #8                     @ prepare final partial tag
903	eor     $input_l0, $input_l0, $rk10_l                     @ AES block 4k+4 - round 10 low
904	eor     $input_h0, $input_h0, $rk10_h                     @ AES block 4k+4 - round 10 high
905
906	fmov    $ctr_t0d, $input_l0                               @ AES block 4k+4 - mov low
907
908	fmov    $ctr_t0.d[1], $input_h0                           @ AES block 4k+4 - mov high
909
910	eor     $res1b, $ctr_t0b, $ctr0b                          @ AES block 4k+4 - result
911
912	b.gt    .L128_enc_blocks_more_than_3
913
914	sub     $rctr32w, $rctr32w, #1
915	movi    $acc_l.8b, #0
916	mov     $ctr3b, $ctr2b
917
918	cmp     $main_end_input_ptr, #32
919	mov     $ctr2b, $ctr1b
920	movi    $acc_h.8b, #0
921
922	movi    $acc_m.8b, #0
923	b.gt    .L128_enc_blocks_more_than_2
924
925	mov     $ctr3b, $ctr1b
926	cmp     $main_end_input_ptr, #16
927
928	sub     $rctr32w, $rctr32w, #1
929	b.gt    .L128_enc_blocks_more_than_1
930
931	sub     $rctr32w, $rctr32w, #1
932	b       .L128_enc_blocks_less_than_1
933	.L128_enc_blocks_more_than_3:                             @ blocks left >  3
934	st1     { $res1b}, [$output_ptr], #16                     @ AES final-3 block  - store result
935
936	ldp     $input_l0, $input_h0, [$input_ptr], #16           @ AES final-2 block - load input low & high
937#ifdef __AARCH64EB__
938	rev     $input_l0, $input_l0
939	rev     $input_h0, $input_h0
940#endif
941	rev64   $res0b, $res1b                                    @ GHASH final-3 block
942
943	eor     $res0b, $res0b, $t0.16b                           @ feed in partial tag
944	eor     $input_h0, $input_h0, $rk10_h                     @ AES final-2 block - round 10 high
945	eor     $input_l0, $input_l0, $rk10_l                     @ AES final-2 block - round 10 low
946
947	fmov    $res1d, $input_l0                                 @ AES final-2 block - mov low
948
949	movi    $t0.8b, #0                                        @ suppress further partial tag feed in
950	fmov    $res1.d[1], $input_h0                             @ AES final-2 block - mov high
951
952	pmull   $acc_l.1q, $res0.1d, $h4.1d                       @ GHASH final-3 block - low
953	mov     $rk4d, $res0.d[1]                                 @ GHASH final-3 block - mid
954
955	pmull2  $acc_h.1q, $res0.2d, $h4.2d                       @ GHASH final-3 block - high
956
957	mov     $acc_md, $h34k.d[1]                               @ GHASH final-3 block - mid
958
959	eor     $res1b, $res1b, $ctr1b                            @ AES final-2 block - result
960	eor     $rk4v.8b, $rk4v.8b, $res0.8b                      @ GHASH final-3 block - mid
961
962	pmull   $acc_m.1q, $rk4v.1d, $acc_m.1d                    @ GHASH final-3 block - mid
963	.L128_enc_blocks_more_than_2:                             @ blocks left >  2
964
965	st1     { $res1b}, [$output_ptr], #16                     @ AES final-2 block - store result
966
967	rev64   $res0b, $res1b                                    @ GHASH final-2 block
968	ldp     $input_l0, $input_h0, [$input_ptr], #16           @ AES final-1 block - load input low & high
969#ifdef __AARCH64EB__
970	rev     $input_l0, $input_l0
971	rev     $input_h0, $input_h0
972#endif
973	eor     $res0b, $res0b, $t0.16b                           @ feed in partial tag
974
975	eor     $input_l0, $input_l0, $rk10_l                     @ AES final-1 block - round 10 low
976
977	fmov    $res1d, $input_l0                                 @ AES final-1 block - mov low
978	eor     $input_h0, $input_h0, $rk10_h                     @ AES final-1 block - round 10 high
979
980	pmull2  $rk2q1, $res0.2d, $h3.2d                          @ GHASH final-2 block - high
981	fmov    $res1.d[1], $input_h0                             @ AES final-1 block - mov high
982
983	mov     $rk4d, $res0.d[1]                                 @ GHASH final-2 block - mid
984
985	pmull   $rk3q1, $res0.1d, $h3.1d                          @ GHASH final-2 block - low
986
987	eor     $acc_hb, $acc_hb, $rk2                            @ GHASH final-2 block - high
988
989	eor     $rk4v.8b, $rk4v.8b, $res0.8b                      @ GHASH final-2 block - mid
990
991	eor     $res1b, $res1b, $ctr2b                            @ AES final-1 block - result
992
993	eor     $acc_lb, $acc_lb, $rk3                            @ GHASH final-2 block - low
994
995	pmull   $rk4v.1q, $rk4v.1d, $h34k.1d                      @ GHASH final-2 block - mid
996
997	movi    $t0.8b, #0                                        @ suppress further partial tag feed in
998
999	eor     $acc_mb, $acc_mb, $rk4v.16b                       @ GHASH final-2 block - mid
1000	.L128_enc_blocks_more_than_1:                             @ blocks left >  1
1001
1002	st1     { $res1b}, [$output_ptr], #16                     @ AES final-1 block - store result
1003
1004	rev64   $res0b, $res1b                                    @ GHASH final-1 block
1005	ldp     $input_l0, $input_h0, [$input_ptr], #16           @ AES final block - load input low & high
1006#ifdef __AARCH64EB__
1007	rev     $input_l0, $input_l0
1008	rev     $input_h0, $input_h0
1009#endif
1010	eor     $res0b, $res0b, $t0.16b                           @ feed in partial tag
1011
1012	eor     $input_h0, $input_h0, $rk10_h                     @ AES final block - round 10 high
1013	eor     $input_l0, $input_l0, $rk10_l                     @ AES final block - round 10 low
1014
1015	fmov    $res1d, $input_l0                                 @ AES final block - mov low
1016
1017	pmull2  $rk2q1, $res0.2d, $h2.2d                          @ GHASH final-1 block - high
1018	fmov    $res1.d[1], $input_h0                             @ AES final block - mov high
1019
1020	mov     $rk4d, $res0.d[1]                                 @ GHASH final-1 block - mid
1021
1022	pmull   $rk3q1, $res0.1d, $h2.1d                          @ GHASH final-1 block - low
1023
1024	eor     $rk4v.8b, $rk4v.8b, $res0.8b                      @ GHASH final-1 block - mid
1025
1026	eor     $res1b, $res1b, $ctr3b                            @ AES final block - result
1027
1028	ins     $rk4v.d[1], $rk4v.d[0]                            @ GHASH final-1 block - mid
1029
1030	pmull2  $rk4v.1q, $rk4v.2d, $h12k.2d                      @ GHASH final-1 block - mid
1031
1032	eor     $acc_lb, $acc_lb, $rk3                            @ GHASH final-1 block - low
1033
1034	eor     $acc_hb, $acc_hb, $rk2                            @ GHASH final-1 block - high
1035
1036	eor     $acc_mb, $acc_mb, $rk4v.16b                       @ GHASH final-1 block - mid
1037	movi    $t0.8b, #0                                        @ suppress further partial tag feed in
1038	.L128_enc_blocks_less_than_1:                             @ blocks left <= 1
1039
1040	and     $bit_length, $bit_length, #127                    @ bit_length %= 128
1041	mvn     $rk10_l, xzr                                      @ rk10_l = 0xffffffffffffffff
1042
1043	mvn     $rk10_h, xzr                                      @ rk10_h = 0xffffffffffffffff
1044	sub     $bit_length, $bit_length, #128                    @ bit_length -= 128
1045
1046	neg     $bit_length, $bit_length                          @ bit_length = 128 - #bits in input (in range [1,128])
1047
1048	and     $bit_length, $bit_length, #127                    @ bit_length %= 128
1049
1050	lsr     $rk10_h, $rk10_h, $bit_length                     @ rk10_h is mask for top 64b of last block
1051	cmp     $bit_length, #64
1052
1053	csel    $input_l0, $rk10_l, $rk10_h, lt
1054	csel    $input_h0, $rk10_h, xzr, lt
1055
1056	fmov    $ctr0d, $input_l0                                 @ ctr0b is mask for last block
1057
1058	fmov    $ctr0.d[1], $input_h0
1059
1060	and     $res1b, $res1b, $ctr0b                            @ possibly partial last block has zeroes in highest bits
1061
1062	rev64   $res0b, $res1b                                    @ GHASH final block
1063
1064	eor     $res0b, $res0b, $t0.16b                           @ feed in partial tag
1065
1066	mov     $t0d, $res0.d[1]                                  @ GHASH final block - mid
1067
1068	pmull   $rk3q1, $res0.1d, $h1.1d                          @ GHASH final block - low
1069	ld1     { $rk0}, [$output_ptr]                            @ load existing bytes where the possibly partial last block is to be stored
1070
1071	eor     $t0.8b, $t0.8b, $res0.8b                          @ GHASH final block - mid
1072#ifndef __AARCH64EB__
1073	rev     $ctr32w, $rctr32w
1074#else
1075	mov     $ctr32w, $rctr32w
1076#endif
1077	pmull2  $rk2q1, $res0.2d, $h1.2d                          @ GHASH final block - high
1078
1079	pmull   $t0.1q, $t0.1d, $h12k.1d                          @ GHASH final block - mid
1080
1081	eor     $acc_lb, $acc_lb, $rk3                            @ GHASH final block - low
1082
1083	eor     $acc_hb, $acc_hb, $rk2                            @ GHASH final block - high
1084
1085	eor     $acc_mb, $acc_mb, $t0.16b                         @ GHASH final block - mid
1086	movi    $mod_constant.8b, #0xc2
1087
1088	eor     $t9.16b, $acc_lb, $acc_hb                         @ MODULO - karatsuba tidy up
1089
1090	shl     $mod_constantd, $mod_constantd, #56               @ mod_constant
1091
1092	eor     $acc_mb, $acc_mb, $t9.16b                         @ MODULO - karatsuba tidy up
1093
1094	pmull   $mod_t.1q, $acc_h.1d, $mod_constant.1d            @ MODULO - top 64b align with mid
1095
1096	ext     $acc_hb, $acc_hb, $acc_hb, #8                     @ MODULO - other top alignment
1097
1098	eor     $acc_mb, $acc_mb, $mod_t.16b                      @ MODULO - fold into mid
1099
1100	eor     $acc_mb, $acc_mb, $acc_hb                         @ MODULO - fold into mid
1101
1102	pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d            @ MODULO - mid 64b align with low
1103
1104	ext     $acc_mb, $acc_mb, $acc_mb, #8                     @ MODULO - other mid alignment
1105
1106	bif     $res1b, $rk0, $ctr0b                              @ insert existing bytes in top end of result before storing
1107
1108	eor     $acc_lb, $acc_lb, $acc_hb                         @ MODULO - fold into low
1109	st1     { $res1b}, [$output_ptr]                          @ store all 16B
1110
1111	str     $ctr32w, [$counter, #12]                          @ store the updated counter
1112
1113	eor     $acc_lb, $acc_lb, $acc_mb                         @ MODULO - fold into low
1114	ext     $acc_lb, $acc_lb, $acc_lb, #8
1115	rev64   $acc_lb, $acc_lb
1116	mov     x0, $len
1117	st1     { $acc_l.16b }, [$current_tag]
1118	ldp     x21, x22, [sp, #16]
1119	ldp     x23, x24, [sp, #32]
1120	ldp     d8, d9, [sp, #48]
1121	ldp     d10, d11, [sp, #64]
1122	ldp     d12, d13, [sp, #80]
1123	ldp     d14, d15, [sp, #96]
1124	ldp     x19, x20, [sp], #112
1125	ret
1126
1127.L128_enc_ret:
1128	mov w0, #0x0
1129	ret
1130.size aes_gcm_enc_128_kernel,.-aes_gcm_enc_128_kernel
1131___
1132
1133#########################################################################################
1134# size_t aes_gcm_dec_128_kernel(const unsigned char *in,
1135#                               size_t len,            /* length in bits */
1136#                               unsigned char *out,
1137#                               u64 *Xi,
1138#                               unsigned char ivec[16],
1139#                               const void *key);
1140#
1141$code.=<<___;
1142.global aes_gcm_dec_128_kernel
1143.type   aes_gcm_dec_128_kernel,%function
1144.align  4
1145aes_gcm_dec_128_kernel:
1146	AARCH64_VALID_CALL_TARGET
1147	cbz     x1, .L128_dec_ret
1148	stp     x19, x20, [sp, #-112]!
1149	mov     x16, x4
1150	mov     x8, x5
1151	stp     x21, x22, [sp, #16]
1152	stp     x23, x24, [sp, #32]
1153	stp     d8, d9, [sp, #48]
1154	stp     d10, d11, [sp, #64]
1155	stp     d12, d13, [sp, #80]
1156	stp     d14, d15, [sp, #96]
1157
1158	lsr     $main_end_input_ptr, $bit_length, #3              @ byte_len
1159	mov     $len, $main_end_input_ptr
1160	ldp     $ctr96_b64x, $ctr96_t32x, [$counter]              @ ctr96_b64, ctr96_t32
1161#ifdef __AARCH64EB__
1162	rev     $ctr96_b64x, $ctr96_b64x
1163	rev     $ctr96_t32x, $ctr96_t32x
1164#endif
1165	ldp     $rk10_l, $rk10_h, [$cc, #160]                     @ load rk10
1166#ifdef __AARCH64EB__
1167	ror     $rk10_h, $rk10_h, 32
1168	ror     $rk10_l, $rk10_l, 32
1169#endif
1170	sub     $main_end_input_ptr, $main_end_input_ptr, #1      @ byte_len - 1
1171	ld1     {$rk0s}, [$cc], #16                                @ load rk0
1172
1173	and     $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
1174	ld1     { $ctr0b}, [$counter]                             @ special case vector load initial counter so we can start first AES block as quickly as possible
1175
1176	ldr     $h2q, [$current_tag, #64]                         @ load h2l | h2h
1177#ifndef __AARCH64EB__
1178	ext     $h2b, $h2b, $h2b, #8
1179#endif
1180	lsr     $rctr32x, $ctr96_t32x, #32
1181	fmov    $ctr2d, $ctr96_b64x                               @ CTR block 2
1182
1183	ld1     {$rk1s}, [$cc], #16                                @ load rk1
1184	orr     $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
1185	rev     $rctr32w, $rctr32w                                @ rev_ctr32
1186
1187	fmov    $ctr1d, $ctr96_b64x                               @ CTR block 1
1188	add     $rctr32w, $rctr32w, #1                            @ increment rev_ctr32
1189
1190	aese    $ctr0b, $rk0  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 0
1191	rev     $ctr32w, $rctr32w                                 @ CTR block 1
1192
1193	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 1
1194	ld1     {$rk2s}, [$cc], #16                                @ load rk2
1195	add     $rctr32w, $rctr32w, #1                            @ CTR block 1
1196
1197	fmov    $ctr1.d[1], $ctr32x                               @ CTR block 1
1198	rev     $ctr32w, $rctr32w                                 @ CTR block 2
1199	add     $rctr32w, $rctr32w, #1                            @ CTR block 2
1200
1201	aese    $ctr0b, $rk1  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 1
1202	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 2
1203
1204	fmov    $ctr2.d[1], $ctr32x                               @ CTR block 2
1205	rev     $ctr32w, $rctr32w                                 @ CTR block 3
1206
1207	fmov    $ctr3d, $ctr96_b64x                               @ CTR block 3
1208	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 3
1209	add     $rctr32w, $rctr32w, #1                            @ CTR block 3
1210
1211	fmov    $ctr3.d[1], $ctr32x                               @ CTR block 3
1212	add     $end_input_ptr, $input_ptr, $bit_length, lsr #3   @ end_input_ptr
1213
1214	aese    $ctr1b, $rk0  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 0
1215	ld1     {$rk3s}, [$cc], #16                                @ load rk3
1216
1217	aese    $ctr0b, $rk2  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 2
1218	ld1     {$rk4s}, [$cc], #16                                @ load rk4
1219
1220	aese    $ctr2b, $rk0  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 0
1221	ld1     {$rk5s}, [$cc], #16                                @ load rk5
1222
1223	aese    $ctr1b, $rk1  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 1
1224	ld1     {$rk6s}, [$cc], #16                                @ load rk6
1225
1226	aese    $ctr3b, $rk0  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 0
1227
1228	aese    $ctr2b, $rk1  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 1
1229
1230	aese    $ctr1b, $rk2  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 2
1231
1232	aese    $ctr3b, $rk1  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 1
1233	ld1     { $acc_lb}, [$current_tag]
1234	ext     $acc_lb, $acc_lb, $acc_lb, #8
1235	rev64   $acc_lb, $acc_lb
1236
1237	aese    $ctr0b, $rk3  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 3
1238	ld1     {$rk7s}, [$cc], #16                                @ load rk7
1239
1240	aese    $ctr1b, $rk3  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 3
1241
1242	aese    $ctr3b, $rk2  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 2
1243
1244	aese    $ctr2b, $rk2  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 2
1245	ld1     {$rk8s}, [$cc], #16                                @ load rk8
1246
1247	aese    $ctr1b, $rk4  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 4
1248
1249	aese    $ctr3b, $rk3  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 3
1250
1251	aese    $ctr2b, $rk3  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 3
1252	ldr     $h3q, [$current_tag, #80]                         @ load h3l | h3h
1253#ifndef __AARCH64EB__
1254	ext     $h3b, $h3b, $h3b, #8
1255#endif
1256	aese    $ctr0b, $rk4  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 4
1257	ld1     {$rk9s}, [$cc], #16                                @ load rk9
1258
1259	aese    $ctr1b, $rk5  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 5
1260
1261	aese    $ctr2b, $rk4  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 4
1262
1263	aese    $ctr3b, $rk4  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 4
1264
1265	aese    $ctr0b, $rk5  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 5
1266
1267	aese    $ctr2b, $rk5  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 5
1268	ldr     $h1q, [$current_tag, #32]                         @ load h1l | h1h
1269#ifndef __AARCH64EB__
1270	ext     $h1b, $h1b, $h1b, #8
1271#endif
1272	aese    $ctr3b, $rk5  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 5
1273
1274	aese    $ctr0b, $rk6  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 6
1275
1276	aese    $ctr1b, $rk6  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 6
1277
1278	aese    $ctr3b, $rk6  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 6
1279
1280	aese    $ctr2b, $rk6  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 6
1281	trn1    $t0.2d,    $h1.2d,    $h2.2d                      @ h2h | h1h
1282
1283	ldr     $h4q, [$current_tag, #112]                        @ load h4l | h4h
1284#ifndef __AARCH64EB__
1285	ext     $h4b, $h4b, $h4b, #8
1286#endif
1287	trn2    $h12k.2d,  $h1.2d,    $h2.2d                      @ h2l | h1l
1288	add     $main_end_input_ptr, $main_end_input_ptr, $input_ptr
1289
1290	aese    $ctr1b, $rk7  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 7
1291
1292	aese    $ctr2b, $rk7  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 7
1293
1294	aese    $ctr0b, $rk7  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 7
1295	eor     $h12k.16b, $h12k.16b, $t0.16b                     @ h2k | h1k
1296
1297	aese    $ctr3b, $rk7  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 7
1298
1299	aese    $ctr1b, $rk8  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 8
1300	trn2    $h34k.2d,  $h3.2d,    $h4.2d                      @ h4l | h3l
1301
1302	aese    $ctr2b, $rk8  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 8
1303
1304	aese    $ctr3b, $rk8  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 8
1305
1306	aese    $ctr0b, $rk8  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 8
1307	trn1    $acc_h.2d, $h3.2d,    $h4.2d                      @ h4h | h3h
1308
1309	aese    $ctr2b, $rk9                                      @ AES block 2 - round 9
1310
1311	aese    $ctr3b, $rk9                                      @ AES block 3 - round 9
1312
1313	aese    $ctr0b, $rk9                                      @ AES block 0 - round 9
1314	cmp     $input_ptr, $main_end_input_ptr                   @ check if we have <= 4 blocks
1315
1316	aese    $ctr1b, $rk9                                      @ AES block 1 - round 9
1317	eor     $h34k.16b, $h34k.16b, $acc_h.16b                  @ h4k | h3k
1318	b.ge    .L128_dec_tail                                    @ handle tail
1319
1320	ld1     {$res0b, $res1b}, [$input_ptr], #32               @ AES block 0 - load ciphertext; AES block 1 - load ciphertext
1321
1322	eor     $ctr1b, $res1b, $ctr1b                            @ AES block 1 - result
1323	ld1     {$res2b}, [$input_ptr], #16                       @ AES block 2 - load ciphertext
1324
1325	eor     $ctr0b, $res0b, $ctr0b                            @ AES block 0 - result
1326	rev64   $res0b, $res0b                                    @ GHASH block 0
1327	rev     $ctr32w, $rctr32w                                 @ CTR block 4
1328
1329	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 4
1330	add     $rctr32w, $rctr32w, #1                            @ CTR block 4
1331	ld1     {$res3b}, [$input_ptr], #16                       @ AES block 3 - load ciphertext
1332
1333	rev64   $res1b, $res1b                                    @ GHASH block 1
1334	mov     $output_l1, $ctr1.d[0]                            @ AES block 1 - mov low
1335
1336	mov     $output_h1, $ctr1.d[1]                            @ AES block 1 - mov high
1337
1338	mov     $output_l0, $ctr0.d[0]                            @ AES block 0 - mov low
1339	cmp     $input_ptr, $main_end_input_ptr                   @ check if we have <= 8 blocks
1340
1341	mov     $output_h0, $ctr0.d[1]                            @ AES block 0 - mov high
1342
1343	fmov    $ctr0d, $ctr96_b64x                               @ CTR block 4
1344
1345	fmov    $ctr0.d[1], $ctr32x                               @ CTR block 4
1346	rev     $ctr32w, $rctr32w                                 @ CTR block 5
1347	eor     $output_l1, $output_l1, $rk10_l                   @ AES block 1 - round 10 low
1348#ifdef __AARCH64EB__
1349	rev     $output_l1, $output_l1
1350#endif
1351	fmov    $ctr1d, $ctr96_b64x                               @ CTR block 5
1352	add     $rctr32w, $rctr32w, #1                            @ CTR block 5
1353	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 5
1354
1355	fmov    $ctr1.d[1], $ctr32x                               @ CTR block 5
1356	rev     $ctr32w, $rctr32w                                 @ CTR block 6
1357	add     $rctr32w, $rctr32w, #1                            @ CTR block 6
1358
1359	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 6
1360
1361	eor     $output_h1, $output_h1, $rk10_h                   @ AES block 1 - round 10 high
1362#ifdef __AARCH64EB__
1363	rev     $output_h1, $output_h1
1364#endif
1365	eor     $output_l0, $output_l0, $rk10_l                   @ AES block 0 - round 10 low
1366#ifdef __AARCH64EB__
1367	rev     $output_l0, $output_l0
1368#endif
1369	eor     $ctr2b, $res2b, $ctr2b                            @ AES block 2 - result
1370
1371	eor     $output_h0, $output_h0, $rk10_h                   @ AES block 0 - round 10 high
1372#ifdef __AARCH64EB__
1373	rev     $output_h0, $output_h0
1374#endif
1375	stp     $output_l0, $output_h0, [$output_ptr], #16        @ AES block 0 - store result
1376
1377	stp     $output_l1, $output_h1, [$output_ptr], #16        @ AES block 1 - store result
1378	b.ge    .L128_dec_prepretail                              @ do prepretail
1379
1380	.L128_dec_main_loop:                                      @ main loop start
1381	eor     $ctr3b, $res3b, $ctr3b                            @ AES block 4k+3 - result
1382	ext     $acc_lb, $acc_lb, $acc_lb, #8                     @ PRE 0
1383	mov     $output_l2, $ctr2.d[0]                            @ AES block 4k+2 - mov low
1384
1385	pmull2  $t1.1q, $res1.2d, $h3.2d                          @ GHASH block 4k+1 - high
1386	mov     $output_h2, $ctr2.d[1]                            @ AES block 4k+2 - mov high
1387
1388	aese    $ctr1b, $rk0  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 0
1389	fmov    $ctr2d, $ctr96_b64x                               @ CTR block 4k+6
1390
1391	rev64   $res2b, $res2b                                    @ GHASH block 4k+2
1392	fmov    $ctr2.d[1], $ctr32x                               @ CTR block 4k+6
1393	rev     $ctr32w, $rctr32w                                 @ CTR block 4k+7
1394
1395	mov     $output_l3, $ctr3.d[0]                            @ AES block 4k+3 - mov low
1396	eor     $res0b, $res0b, $acc_lb                           @ PRE 1
1397	mov     $t3d, $res1.d[1]                                  @ GHASH block 4k+1 - mid
1398
1399	aese    $ctr1b, $rk1  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 1
1400	rev64   $res3b, $res3b                                    @ GHASH block 4k+3
1401
1402	pmull   $t2.1q, $res1.1d, $h3.1d                          @ GHASH block 4k+1 - low
1403	mov     $output_h3, $ctr3.d[1]                            @ AES block 4k+3 - mov high
1404	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 4k+7
1405
1406	pmull   $acc_l.1q, $res0.1d, $h4.1d                       @ GHASH block 4k - low
1407	fmov    $ctr3d, $ctr96_b64x                               @ CTR block 4k+7
1408	eor     $t3.8b, $t3.8b, $res1.8b                          @ GHASH block 4k+1 - mid
1409
1410	aese    $ctr1b, $rk2  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 2
1411	fmov    $ctr3.d[1], $ctr32x                               @ CTR block 4k+7
1412
1413	aese    $ctr2b, $rk0  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 0
1414	mov     $acc_md, $h34k.d[1]                               @ GHASH block 4k - mid
1415
1416	pmull2  $acc_h.1q, $res0.2d, $h4.2d                       @ GHASH block 4k - high
1417	eor     $acc_lb, $acc_lb, $t2.16b                         @ GHASH block 4k+1 - low
1418
1419	pmull   $t8.1q, $res3.1d, $h1.1d                          @ GHASH block 4k+3 - low
1420
1421	aese    $ctr1b, $rk3  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 3
1422	mov     $t0d, $res0.d[1]                                  @ GHASH block 4k - mid
1423
1424	aese    $ctr3b, $rk0  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 0
1425	eor     $acc_hb, $acc_hb, $t1.16b                         @ GHASH block 4k+1 - high
1426
1427	aese    $ctr0b, $rk0  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 0
1428
1429	pmull   $t5.1q, $res2.1d, $h2.1d                          @ GHASH block 4k+2 - low
1430	eor     $t0.8b, $t0.8b, $res0.8b                          @ GHASH block 4k - mid
1431
1432	aese    $ctr3b, $rk1  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 1
1433	eor     $output_l3, $output_l3, $rk10_l                   @ AES block 4k+3 - round 10 low
1434#ifdef __AARCH64EB__
1435	rev     $output_l3, $output_l3
1436#endif
1437	pmull   $t3.1q, $t3.1d, $h34k.1d                          @ GHASH block 4k+1 - mid
1438	eor     $output_h2, $output_h2, $rk10_h                   @ AES block 4k+2 - round 10 high
1439#ifdef __AARCH64EB__
1440	rev     $output_h2, $output_h2
1441#endif
1442	mov     $t6d, $res2.d[1]                                  @ GHASH block 4k+2 - mid
1443
1444	aese    $ctr0b, $rk1  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 1
1445	eor     $acc_lb, $acc_lb, $t5.16b                         @ GHASH block 4k+2 - low
1446
1447	pmull   $acc_m.1q, $t0.1d, $acc_m.1d                      @ GHASH block 4k - mid
1448
1449	aese    $ctr3b, $rk2  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 2
1450	eor     $t6.8b, $t6.8b, $res2.8b                          @ GHASH block 4k+2 - mid
1451
1452	aese    $ctr0b, $rk2  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 2
1453
1454	aese    $ctr1b, $rk4  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 4
1455	eor     $acc_mb, $acc_mb, $t3.16b                         @ GHASH block 4k+1 - mid
1456
1457	pmull2  $t4.1q, $res2.2d, $h2.2d                          @ GHASH block 4k+2 - high
1458
1459	aese    $ctr0b, $rk3  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 3
1460	ins     $t6.d[1], $t6.d[0]                                @ GHASH block 4k+2 - mid
1461
1462	pmull2  $t7.1q, $res3.2d, $h1.2d                          @ GHASH block 4k+3 - high
1463
1464	aese    $ctr2b, $rk1  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 1
1465	mov     $t9d, $res3.d[1]                                  @ GHASH block 4k+3 - mid
1466
1467	aese    $ctr0b, $rk4  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 4
1468	eor     $acc_hb, $acc_hb, $t4.16b                         @ GHASH block 4k+2 - high
1469
1470	pmull2  $t6.1q, $t6.2d, $h12k.2d                          @ GHASH block 4k+2 - mid
1471	eor     $output_h3, $output_h3, $rk10_h                   @ AES block 4k+3 - round 10 high
1472#ifdef __AARCH64EB__
1473	rev     $output_h3, $output_h3
1474#endif
1475	aese    $ctr2b, $rk2  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 2
1476	eor     $t9.8b, $t9.8b, $res3.8b                          @ GHASH block 4k+3 - mid
1477
1478	aese    $ctr1b, $rk5  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 5
1479	eor     $output_l2, $output_l2, $rk10_l                   @ AES block 4k+2 - round 10 low
1480#ifdef __AARCH64EB__
1481	rev     $output_l2, $output_l2
1482#endif
1483	aese    $ctr0b, $rk5  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 5
1484	movi    $mod_constant.8b, #0xc2
1485
1486	aese    $ctr2b, $rk3  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 3
1487	eor     $acc_lb, $acc_lb, $t8.16b                         @ GHASH block 4k+3 - low
1488
1489	aese    $ctr1b, $rk6  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 6
1490
1491	aese    $ctr0b, $rk6  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 6
1492	eor     $acc_mb, $acc_mb, $t6.16b                         @ GHASH block 4k+2 - mid
1493
1494	aese    $ctr2b, $rk4  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 4
1495	stp     $output_l2, $output_h2, [$output_ptr], #16        @ AES block 4k+2 - store result
1496
1497	pmull   $t9.1q, $t9.1d, $h12k.1d                          @ GHASH block 4k+3 - mid
1498	eor     $acc_hb, $acc_hb, $t7.16b                         @ GHASH block 4k+3 - high
1499	ld1     {$res0b}, [$input_ptr], #16                       @ AES block 4k+3 - load ciphertext
1500
1501	aese    $ctr1b, $rk7  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 7
1502	add     $rctr32w, $rctr32w, #1                            @ CTR block 4k+7
1503
1504	aese    $ctr0b, $rk7  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 7
1505	shl     $mod_constantd, $mod_constantd, #56               @ mod_constant
1506
1507	aese    $ctr2b, $rk5  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 5
1508	eor     $acc_mb, $acc_mb, $t9.16b                         @ GHASH block 4k+3 - mid
1509
1510	aese    $ctr1b, $rk8  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 8
1511	stp     $output_l3, $output_h3, [$output_ptr], #16        @ AES block 4k+3 - store result
1512
1513	aese    $ctr0b, $rk8  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 8
1514	eor     $t9.16b, $acc_lb, $acc_hb                         @ MODULO - karatsuba tidy up
1515
1516	aese    $ctr3b, $rk3  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 3
1517	rev     $ctr32w, $rctr32w                                 @ CTR block 4k+8
1518
1519	pmull   $mod_t.1q, $acc_h.1d, $mod_constant.1d            @ MODULO - top 64b align with mid
1520	ld1     {$res1b}, [$input_ptr], #16                       @ AES block 4k+4 - load ciphertext
1521	ext     $acc_hb, $acc_hb, $acc_hb, #8                     @ MODULO - other top alignment
1522
1523	aese    $ctr0b, $rk9                                      @ AES block 4k+4 - round 9
1524	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 4k+8
1525
1526	aese    $ctr3b, $rk4  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 4
1527	eor     $acc_mb, $acc_mb, $t9.16b                         @ MODULO - karatsuba tidy up
1528
1529	aese    $ctr1b, $rk9                                      @ AES block 4k+5 - round 9
1530
1531	aese    $ctr2b, $rk6  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 6
1532	eor     $ctr0b, $res0b, $ctr0b                            @ AES block 4k+4 - result
1533
1534	aese    $ctr3b, $rk5  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 5
1535	ld1     {$res2b}, [$input_ptr], #16                       @ AES block 4k+5 - load ciphertext
1536
1537	add     $rctr32w, $rctr32w, #1                            @ CTR block 4k+8
1538	eor     $acc_mb, $acc_mb, $mod_t.16b                      @ MODULO - fold into mid
1539	eor     $ctr1b, $res1b, $ctr1b                            @ AES block 4k+5 - result
1540
1541	aese    $ctr2b, $rk7  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 7
1542	ld1     {$res3b}, [$input_ptr], #16                       @ AES block 4k+6 - load ciphertext
1543
1544	aese    $ctr3b, $rk6  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 6
1545
1546	rev64   $res1b, $res1b                                    @ GHASH block 4k+5
1547	eor     $acc_mb, $acc_mb, $acc_hb                         @ MODULO - fold into mid
1548	mov     $output_h0, $ctr0.d[1]                            @ AES block 4k+4 - mov high
1549
1550	aese    $ctr2b, $rk8  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 8
1551	mov     $output_l0, $ctr0.d[0]                            @ AES block 4k+4 - mov low
1552
1553	aese    $ctr3b, $rk7  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 7
1554	fmov    $ctr0d, $ctr96_b64x                               @ CTR block 4k+8
1555
1556	pmull   $mod_constant.1q, $acc_m.1d, $mod_constant.1d     @ MODULO - mid 64b align with low
1557	fmov    $ctr0.d[1], $ctr32x                               @ CTR block 4k+8
1558	rev     $ctr32w, $rctr32w                                 @ CTR block 4k+9
1559
1560	aese    $ctr2b, $rk9                                      @ AES block 4k+6 - round 9
1561	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 4k+9
1562	ext     $acc_mb, $acc_mb, $acc_mb, #8                     @ MODULO - other mid alignment
1563
1564	aese    $ctr3b, $rk8  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 8
1565	eor     $output_h0, $output_h0, $rk10_h                   @ AES block 4k+4 - round 10 high
1566#ifdef __AARCH64EB__
1567	rev     $output_h0, $output_h0
1568#endif
1569	eor     $acc_lb, $acc_lb, $mod_constant.16b               @ MODULO - fold into low
1570	mov     $output_h1, $ctr1.d[1]                            @ AES block 4k+5 - mov high
1571	eor     $output_l0, $output_l0, $rk10_l                   @ AES block 4k+4 - round 10 low
1572#ifdef __AARCH64EB__
1573	rev     $output_l0, $output_l0
1574#endif
1575	eor     $ctr2b, $res2b, $ctr2b                            @ AES block 4k+6 - result
1576	mov     $output_l1, $ctr1.d[0]                            @ AES block 4k+5 - mov low
1577	add     $rctr32w, $rctr32w, #1                            @ CTR block 4k+9
1578
1579	aese    $ctr3b, $rk9                                      @ AES block 4k+7 - round 9
1580	fmov    $ctr1d, $ctr96_b64x                               @ CTR block 4k+9
1581	cmp     $input_ptr, $main_end_input_ptr                   @ LOOP CONTROL
1582
1583	rev64   $res0b, $res0b                                    @ GHASH block 4k+4
1584	eor     $acc_lb, $acc_lb, $acc_mb                         @ MODULO - fold into low
1585	fmov    $ctr1.d[1], $ctr32x                               @ CTR block 4k+9
1586
1587	rev     $ctr32w, $rctr32w                                 @ CTR block 4k+10
1588	add     $rctr32w, $rctr32w, #1                            @ CTR block 4k+10
1589
1590	eor     $output_h1, $output_h1, $rk10_h                   @ AES block 4k+5 - round 10 high
1591#ifdef __AARCH64EB__
1592	rev     $output_h1, $output_h1
1593#endif
1594	stp     $output_l0, $output_h0, [$output_ptr], #16        @ AES block 4k+4 - store result
1595
1596	eor     $output_l1, $output_l1, $rk10_l                   @ AES block 4k+5 - round 10 low
1597#ifdef __AARCH64EB__
1598	rev     $output_l1, $output_l1
1599#endif
1600	stp     $output_l1, $output_h1, [$output_ptr], #16        @ AES block 4k+5 - store result
1601
1602	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 4k+10
1603	b.lt    .L128_dec_main_loop                               @ LOOP CONTROL - branch back while input_ptr < main_end_input_ptr (flags from the cmp above)
1604
1605	.L128_dec_prepretail:                                     @ PREPRETAIL
1606	ext     $acc_lb, $acc_lb, $acc_lb, #8                     @ PRE 0
1607	mov     $output_l2, $ctr2.d[0]                            @ AES block 4k+2 - mov low
1608	mov     $t3d, $res1.d[1]                                  @ GHASH block 4k+1 - mid
1609
1610	aese    $ctr0b, $rk0  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 0
1611	eor     $ctr3b, $res3b, $ctr3b                            @ AES block 4k+3 - result
1612
1613	aese    $ctr1b, $rk0  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 0
1614	mov     $output_h2, $ctr2.d[1]                            @ AES block 4k+2 - mov high
1615
1616	eor     $res0b, $res0b, $acc_lb                           @ PRE 1
1617	fmov    $ctr2d, $ctr96_b64x                               @ CTR block 4k+6
1618	rev64   $res2b, $res2b                                    @ GHASH block 4k+2
1619
1620	aese    $ctr0b, $rk1  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 1
1621	fmov    $ctr2.d[1], $ctr32x                               @ CTR block 4k+6
1622
1623	rev     $ctr32w, $rctr32w                                 @ CTR block 4k+7
1624	mov     $output_l3, $ctr3.d[0]                            @ AES block 4k+3 - mov low
1625	eor     $t3.8b, $t3.8b, $res1.8b                          @ GHASH block 4k+1 - mid
1626
1627	pmull   $acc_l.1q, $res0.1d, $h4.1d                       @ GHASH block 4k - low
1628	mov     $acc_md, $h34k.d[1]                               @ GHASH block 4k - mid
1629	mov     $output_h3, $ctr3.d[1]                            @ AES block 4k+3 - mov high
1630
1631	aese    $ctr1b, $rk1  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 1
1632	mov     $t6d, $res2.d[1]                                  @ GHASH block 4k+2 - mid
1633
1634	aese    $ctr0b, $rk2  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 2
1635	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 4k+7
1636
1637	pmull   $t2.1q, $res1.1d, $h3.1d                          @ GHASH block 4k+1 - low
1638	mov     $t0d, $res0.d[1]                                  @ GHASH block 4k - mid
1639	fmov    $ctr3d, $ctr96_b64x                               @ CTR block 4k+7
1640
1641	aese    $ctr2b, $rk0  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 0
1642	fmov    $ctr3.d[1], $ctr32x                               @ CTR block 4k+7
1643
1644	pmull   $t3.1q, $t3.1d, $h34k.1d                          @ GHASH block 4k+1 - mid
1645	eor     $t6.8b, $t6.8b, $res2.8b                          @ GHASH block 4k+2 - mid
1646
1647	rev64   $res3b, $res3b                                    @ GHASH block 4k+3
1648
1649	aese    $ctr2b, $rk1  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 1
1650	eor     $t0.8b, $t0.8b, $res0.8b                          @ GHASH block 4k - mid
1651
1652	pmull2  $acc_h.1q, $res0.2d, $h4.2d                       @ GHASH block 4k - high
1653
1654	aese    $ctr3b, $rk0  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 0
1655	ins     $t6.d[1], $t6.d[0]                                @ GHASH block 4k+2 - mid
1656
1657	pmull2  $t1.1q, $res1.2d, $h3.2d                          @ GHASH block 4k+1 - high
1658
1659	pmull   $acc_m.1q, $t0.1d, $acc_m.1d                      @ GHASH block 4k - mid
1660	eor     $acc_lb, $acc_lb, $t2.16b                         @ GHASH block 4k+1 - low
1661
1662	pmull   $t8.1q, $res3.1d, $h1.1d                          @ GHASH block 4k+3 - low
1663
1664	pmull2  $t6.1q, $t6.2d, $h12k.2d                          @ GHASH block 4k+2 - mid
1665	eor     $acc_hb, $acc_hb, $t1.16b                         @ GHASH block 4k+1 - high
1666
1667	eor     $acc_mb, $acc_mb, $t3.16b                         @ GHASH block 4k+1 - mid
1668
1669	pmull2  $t7.1q, $res3.2d, $h1.2d                          @ GHASH block 4k+3 - high
1670
1671	pmull2  $t4.1q, $res2.2d, $h2.2d                          @ GHASH block 4k+2 - high
1672	mov     $t9d, $res3.d[1]                                  @ GHASH block 4k+3 - mid
1673
1674	aese    $ctr1b, $rk2  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 2
1675	eor     $acc_mb, $acc_mb, $t6.16b                         @ GHASH block 4k+2 - mid
1676
1677	pmull   $t5.1q, $res2.1d, $h2.1d                          @ GHASH block 4k+2 - low
1678
1679	eor     $acc_hb, $acc_hb, $t4.16b                         @ GHASH block 4k+2 - high
1680	movi    $mod_constant.8b, #0xc2
1681
1682	aese    $ctr3b, $rk1  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 1
1683	eor     $t9.8b, $t9.8b, $res3.8b                          @ GHASH block 4k+3 - mid
1684
1685	eor     $acc_lb, $acc_lb, $t5.16b                         @ GHASH block 4k+2 - low
1686
1687	aese    $ctr2b, $rk2  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 2
1688	eor     $acc_hb, $acc_hb, $t7.16b                         @ GHASH block 4k+3 - high
1689
1690	aese    $ctr3b, $rk2  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 2
1691	eor     $output_l3, $output_l3, $rk10_l                   @ AES block 4k+3 - round 10 low
1692#ifdef __AARCH64EB__
1693	rev     $output_l3, $output_l3
1694#endif
1695	pmull   $t9.1q, $t9.1d, $h12k.1d                          @ GHASH block 4k+3 - mid
1696	eor     $output_l2, $output_l2, $rk10_l                   @ AES block 4k+2 - round 10 low
1697#ifdef __AARCH64EB__
1698	rev     $output_l2, $output_l2
1699#endif
1700	eor     $acc_lb, $acc_lb, $t8.16b                         @ GHASH block 4k+3 - low
1701
1702	aese    $ctr2b, $rk3  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 3
1703
1704	aese    $ctr1b, $rk3  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 3
1705	shl     $mod_constantd, $mod_constantd, #56               @ mod_constant
1706
1707	aese    $ctr0b, $rk3  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 3
1708
1709	aese    $ctr2b, $rk4  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 4
1710	eor     $acc_mb, $acc_mb, $t9.16b                         @ GHASH block 4k+3 - mid
1711
1712	aese    $ctr1b, $rk4  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 4
1713
1714	aese    $ctr3b, $rk3  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 3
1715	eor     $t9.16b, $acc_lb, $acc_hb                         @ MODULO - karatsuba tidy up
1716
1717	aese    $ctr2b, $rk5  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 5
1718
1719	aese    $ctr1b, $rk5  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 5
1720
1721	aese    $ctr3b, $rk4  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 4
1722
1723	aese    $ctr0b, $rk4  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 4
1724	eor     $acc_mb, $acc_mb, $t9.16b                         @ MODULO - karatsuba tidy up
1725
1726	pmull   $mod_t.1q, $acc_h.1d, $mod_constant.1d            @ MODULO - top 64b align with mid
1727
1728	aese    $ctr1b, $rk6  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 6
1729	ext     $acc_hb, $acc_hb, $acc_hb, #8                     @ MODULO - other top alignment
1730
1731	aese    $ctr3b, $rk5  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 5
1732
1733	aese    $ctr0b, $rk5  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 5
1734	eor     $acc_mb, $acc_mb, $mod_t.16b                      @ MODULO - fold into mid
1735
1736	aese    $ctr1b, $rk7  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 7
1737
1738	aese    $ctr2b, $rk6  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 6
1739
1740	aese    $ctr0b, $rk6  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 6
1741
1742	aese    $ctr1b, $rk8  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 8
1743	eor     $acc_mb, $acc_mb, $acc_hb                         @ MODULO - fold into mid
1744
1745	aese    $ctr3b, $rk6  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 6
1746
1747	aese    $ctr0b, $rk7  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 7
1748
1749	aese    $ctr1b, $rk9                                      @ AES block 4k+5 - round 9
1750
1751	pmull   $mod_constant.1q, $acc_m.1d, $mod_constant.1d     @ MODULO - mid 64b align with low
1752	eor     $output_h3, $output_h3, $rk10_h                   @ AES block 4k+3 - round 10 high
1753#ifdef __AARCH64EB__
1754	rev     $output_h3, $output_h3
1755#endif
1756	aese    $ctr2b, $rk7  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 7
1757	ext     $acc_mb, $acc_mb, $acc_mb, #8                     @ MODULO - other mid alignment
1758
1759	aese    $ctr3b, $rk7  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 7
1760
1761	aese    $ctr0b, $rk8  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 8
1762	eor     $acc_lb, $acc_lb, $mod_constant.16b               @ MODULO - fold into low
1763
1764	aese    $ctr2b, $rk8  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 8
1765
1766	aese    $ctr3b, $rk8  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 8
1767	eor     $output_h2, $output_h2, $rk10_h                   @ AES block 4k+2 - round 10 high
1768#ifdef __AARCH64EB__
1769	rev     $output_h2, $output_h2
1770#endif
1771	aese    $ctr0b, $rk9                                      @ AES block 4k+4 - round 9
1772	stp     $output_l2, $output_h2, [$output_ptr], #16        @ AES block 4k+2 - store result
1773
1774	aese    $ctr2b, $rk9                                      @ AES block 4k+6 - round 9
1775	add     $rctr32w, $rctr32w, #1                            @ CTR block 4k+7
1776	stp     $output_l3, $output_h3, [$output_ptr], #16        @ AES block 4k+3 - store result
1777
1778	aese    $ctr3b, $rk9                                      @ AES block 4k+7 - round 9
1779	eor     $acc_lb, $acc_lb, $acc_mb                         @ MODULO - fold into low
1780	.L128_dec_tail:                                           @ TAIL
1781
1782	sub     $main_end_input_ptr, $end_input_ptr, $input_ptr   @ main_end_input_ptr is number of bytes left to process
1783	ld1     { $res1b}, [$input_ptr], #16                      @ AES block 4k+4 - load ciphertext
1784
1785	eor     $ctr0b, $res1b, $ctr0b                            @ AES block 4k+4 - result
1786
1787	mov     $output_h0, $ctr0.d[1]                            @ AES block 4k+4 - mov high
1788
1789	mov     $output_l0, $ctr0.d[0]                            @ AES block 4k+4 - mov low
1790
1791	cmp     $main_end_input_ptr, #48
1792
1793	eor     $output_h0, $output_h0, $rk10_h                   @ AES block 4k+4 - round 10 high
1794#ifdef __AARCH64EB__
1795	rev     $output_h0, $output_h0
1796#endif
1797	ext     $t0.16b, $acc_lb, $acc_lb, #8                     @ prepare final partial tag
1798	eor     $output_l0, $output_l0, $rk10_l                   @ AES block 4k+4 - round 10 low
1799#ifdef __AARCH64EB__
1800	rev     $output_l0, $output_l0
1801#endif
1802	b.gt    .L128_dec_blocks_more_than_3
1803
1804	mov     $ctr3b, $ctr2b
1805	sub     $rctr32w, $rctr32w, #1
1806	movi    $acc_l.8b, #0
1807
1808	movi    $acc_h.8b, #0
1809	mov     $ctr2b, $ctr1b
1810
1811	movi    $acc_m.8b, #0
1812	cmp     $main_end_input_ptr, #32
1813	b.gt     .L128_dec_blocks_more_than_2
1814
1815	cmp     $main_end_input_ptr, #16
1816
1817	mov     $ctr3b, $ctr1b
1818	sub     $rctr32w, $rctr32w, #1
1819	b.gt    .L128_dec_blocks_more_than_1
1820
1821	sub     $rctr32w, $rctr32w, #1
1822	b       .L128_dec_blocks_less_than_1
1823	.L128_dec_blocks_more_than_3:                             @ blocks left >  3
1824	rev64   $res0b, $res1b                                    @ GHASH final-3 block
1825	ld1     { $res1b}, [$input_ptr], #16                      @ AES final-2 block - load ciphertext
1826
1827	eor     $res0b, $res0b, $t0.16b                           @ feed in partial tag
1828
1829	mov     $acc_md, $h34k.d[1]                               @ GHASH final-3 block - mid
1830	stp     $output_l0, $output_h0, [$output_ptr], #16        @ AES final-3 block  - store result
1831	eor     $ctr0b, $res1b, $ctr1b                            @ AES final-2 block - result
1832
1833	mov     $rk4d, $res0.d[1]                                 @ GHASH final-3 block - mid
1834	mov     $output_h0, $ctr0.d[1]                            @ AES final-2 block - mov high
1835
1836	pmull   $acc_l.1q, $res0.1d, $h4.1d                       @ GHASH final-3 block - low
1837	mov     $output_l0, $ctr0.d[0]                            @ AES final-2 block - mov low
1838
1839	pmull2  $acc_h.1q, $res0.2d, $h4.2d                       @ GHASH final-3 block - high
1840
1841	eor     $rk4v.8b, $rk4v.8b, $res0.8b                      @ GHASH final-3 block - mid
1842
1843	movi    $t0.8b, #0                                        @ suppress further partial tag feed in
1844	eor     $output_h0, $output_h0, $rk10_h                   @ AES final-2 block - round 10 high
1845#ifdef __AARCH64EB__
1846	rev     $output_h0, $output_h0
1847#endif
1848	pmull   $acc_m.1q, $rk4v.1d, $acc_m.1d                    @ GHASH final-3 block - mid
1849	eor     $output_l0, $output_l0, $rk10_l                   @ AES final-2 block - round 10 low
1850#ifdef __AARCH64EB__
1851	rev     $output_l0, $output_l0
1852#endif
1853	.L128_dec_blocks_more_than_2:                             @ blocks left >  2
1854
1855	rev64   $res0b, $res1b                                    @ GHASH final-2 block
1856	ld1     { $res1b}, [$input_ptr], #16                      @ AES final-1 block - load ciphertext
1857
1858	eor     $res0b, $res0b, $t0.16b                           @ feed in partial tag
1859
1860	eor     $ctr0b, $res1b, $ctr2b                            @ AES final-1 block - result
1861	stp     $output_l0, $output_h0, [$output_ptr], #16        @ AES final-2 block  - store result
1862
1863	mov     $rk4d, $res0.d[1]                                 @ GHASH final-2 block - mid
1864
1865	pmull   $rk3q1, $res0.1d, $h3.1d                          @ GHASH final-2 block - low
1866
1867	pmull2  $rk2q1, $res0.2d, $h3.2d                          @ GHASH final-2 block - high
1868	mov     $output_l0, $ctr0.d[0]                            @ AES final-1 block - mov low
1869
1870	mov     $output_h0, $ctr0.d[1]                            @ AES final-1 block - mov high
1871	eor     $rk4v.8b, $rk4v.8b, $res0.8b                      @ GHASH final-2 block - mid
1872
1873	movi    $t0.8b, #0                                        @ suppress further partial tag feed in
1874
1875	pmull   $rk4v.1q, $rk4v.1d, $h34k.1d                      @ GHASH final-2 block - mid
1876
1877	eor     $output_l0, $output_l0, $rk10_l                   @ AES final-1 block - round 10 low
1878#ifdef __AARCH64EB__
1879	rev     $output_l0, $output_l0
1880#endif
1881	eor     $acc_lb, $acc_lb, $rk3                            @ GHASH final-2 block - low
1882
1883	eor     $acc_hb, $acc_hb, $rk2                            @ GHASH final-2 block - high
1884
1885	eor     $acc_mb, $acc_mb, $rk4v.16b                       @ GHASH final-2 block - mid
1886	eor     $output_h0, $output_h0, $rk10_h                   @ AES final-1 block - round 10 high
1887#ifdef __AARCH64EB__
1888	rev     $output_h0, $output_h0
1889#endif
1890	.L128_dec_blocks_more_than_1:                             @ blocks left >  1
1891
1892	rev64   $res0b, $res1b                                    @ GHASH final-1 block
1893
1894	ld1     { $res1b}, [$input_ptr], #16                      @ AES final block - load ciphertext
1895	eor     $res0b, $res0b, $t0.16b                           @ feed in partial tag
1896
1897	mov     $rk4d, $res0.d[1]                                 @ GHASH final-1 block - mid
1898
1899	eor     $ctr0b, $res1b, $ctr3b                            @ AES final block - result
1900
1901	eor     $rk4v.8b, $rk4v.8b, $res0.8b                      @ GHASH final-1 block - mid
1902
1903	stp     $output_l0, $output_h0, [$output_ptr], #16        @ AES final-1 block  - store result
1904	mov     $output_l0, $ctr0.d[0]                            @ AES final block - mov low
1905
1906	mov     $output_h0, $ctr0.d[1]                            @ AES final block - mov high
1907	ins     $rk4v.d[1], $rk4v.d[0]                            @ GHASH final-1 block - mid
1908
1909	pmull   $rk3q1, $res0.1d, $h2.1d                          @ GHASH final-1 block - low
1910
1911	pmull2  $rk2q1, $res0.2d, $h2.2d                          @ GHASH final-1 block - high
1912
1913	pmull2  $rk4v.1q, $rk4v.2d, $h12k.2d                      @ GHASH final-1 block - mid
1914	movi    $t0.8b, #0                                        @ suppress further partial tag feed in
1915
1916	eor     $acc_lb, $acc_lb, $rk3                            @ GHASH final-1 block - low
1917
1918	eor     $acc_hb, $acc_hb, $rk2                            @ GHASH final-1 block - high
1919	eor     $output_h0, $output_h0, $rk10_h                   @ AES final block - round 10 high
1920#ifdef __AARCH64EB__
1921	rev     $output_h0, $output_h0
1922#endif
1923	eor     $output_l0, $output_l0, $rk10_l                   @ AES final block - round 10 low
1924#ifdef __AARCH64EB__
1925	rev     $output_l0, $output_l0
1926#endif
1927	eor     $acc_mb, $acc_mb, $rk4v.16b                       @ GHASH final-1 block - mid
1928	.L128_dec_blocks_less_than_1:                                            @ blocks left <= 1
1929
1930	mvn     $rk10_h, xzr                                      @ rk10_h = 0xffffffffffffffff
1931	and     $bit_length, $bit_length, #127                    @ bit_length %= 128
1932
1933	mvn     $rk10_l, xzr                                      @ rk10_l = 0xffffffffffffffff
1934	sub     $bit_length, $bit_length, #128                    @ bit_length -= 128
1935
1936	neg     $bit_length, $bit_length                          @ bit_length = 128 - #bits in input (in range [1,128])
1937
1938	and     $bit_length, $bit_length, #127                    @ bit_length %= 128
1939
1940	lsr     $rk10_h, $rk10_h, $bit_length                     @ rk10_h is mask for top 64b of last block
1941	cmp     $bit_length, #64
1942
1943	csel    $ctr96_b64x, $rk10_h, xzr, lt
1944	csel    $ctr32x, $rk10_l, $rk10_h, lt
1945
1946	fmov    $ctr0d, $ctr32x                                   @ ctr0b is mask for last block
1947
1948	mov     $ctr0.d[1], $ctr96_b64x
1949
1950	and     $res1b, $res1b, $ctr0b                            @ possibly partial last block has zeroes in highest bits
1951
1952	rev64   $res0b, $res1b                                    @ GHASH final block
1953
1954	eor     $res0b, $res0b, $t0.16b                           @ feed in partial tag
1955
1956	ldp     $end_input_ptr, $main_end_input_ptr, [$output_ptr] @ load existing bytes we need to not overwrite
1957
1958	and     $output_h0, $output_h0, $ctr96_b64x
1959
1960	pmull2  $rk2q1, $res0.2d, $h1.2d                          @ GHASH final block - high
1961	mov     $t0d, $res0.d[1]                                  @ GHASH final block - mid
1962
1963	eor     $t0.8b, $t0.8b, $res0.8b                          @ GHASH final block - mid
1964	eor     $acc_hb, $acc_hb, $rk2                            @ GHASH final block - high
1965
1966	pmull   $t0.1q, $t0.1d, $h12k.1d                          @ GHASH final block - mid
1967
1968	pmull   $rk3q1, $res0.1d, $h1.1d                          @ GHASH final block - low
1969	bic     $end_input_ptr, $end_input_ptr, $ctr32x           @ mask out low existing bytes
1970	and     $output_l0, $output_l0, $ctr32x
1971
1972#ifndef __AARCH64EB__
1973	rev     $ctr32w, $rctr32w
1974#else
1975	mov     $ctr32w, $rctr32w
1976#endif
1977
1978	eor     $acc_mb, $acc_mb, $t0.16b                         @ GHASH final block - mid
1979	movi    $mod_constant.8b, #0xc2
1980
1981	eor     $acc_lb, $acc_lb, $rk3                            @ GHASH final block - low
1982
1983	bic     $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x   @ mask out high existing bytes
1984	shl     $mod_constantd, $mod_constantd, #56               @ mod_constant
1985
1986	eor     $t9.16b, $acc_lb, $acc_hb                         @ MODULO - karatsuba tidy up
1987
1988	pmull   $mod_t.1q, $acc_h.1d, $mod_constant.1d            @ MODULO - top 64b align with mid
1989
1990	eor     $acc_mb, $acc_mb, $t9.16b                         @ MODULO - karatsuba tidy up
1991
1992	orr     $output_l0, $output_l0, $end_input_ptr
1993	str     $ctr32w, [$counter, #12]                          @ store the updated counter
1994
1995	orr     $output_h0, $output_h0, $main_end_input_ptr
1996	stp     $output_l0, $output_h0, [$output_ptr]
1997	ext     $acc_hb, $acc_hb, $acc_hb, #8                     @ MODULO - other top alignment
1998
1999	eor     $acc_mb, $acc_mb, $mod_t.16b                      @ MODULO - fold into mid
2000
2001	eor     $acc_mb, $acc_mb, $acc_hb                         @ MODULO - fold into mid
2002
2003	pmull   $mod_constant.1q, $acc_m.1d, $mod_constant.1d     @ MODULO - mid 64b align with low
2004	ext     $acc_mb, $acc_mb, $acc_mb, #8                     @ MODULO - other mid alignment
2005
2006	eor     $acc_lb, $acc_lb, $mod_constant.16b               @ MODULO - fold into low
2007
2008	eor     $acc_lb, $acc_lb, $acc_mb                         @ MODULO - fold into low
2009	ext     $acc_lb, $acc_lb, $acc_lb, #8
2010	rev64   $acc_lb, $acc_lb
2011	mov     x0, $len
2012	st1     { $acc_l.16b }, [$current_tag]
2013
2014	ldp     x21, x22, [sp, #16]
2015	ldp     x23, x24, [sp, #32]
2016	ldp     d8, d9, [sp, #48]
2017	ldp     d10, d11, [sp, #64]
2018	ldp     d12, d13, [sp, #80]
2019	ldp     d14, d15, [sp, #96]
2020	ldp     x19, x20, [sp], #112
2021	ret
2022
2023	.L128_dec_ret:
2024	mov w0, #0x0
2025	ret
2026.size aes_gcm_dec_128_kernel,.-aes_gcm_dec_128_kernel
2027___
2028}
2029
2030{
# Register aliases for the 192-bit encrypt kernel. Each variable holds an
# AArch64 register name as a string and is interpolated into the assembly
# text appended to $code. Several aliases name the same physical register;
# the overlaps are called out below.

# General-purpose registers x4..x7.
2031my ($end_input_ptr,$main_end_input_ptr,$input_l0,$input_h0)=map("x$_",(4..7));
# x19..x24: low/high 64-bit halves for blocks 1..3.
2032my ($input_l1,$input_h1,$input_l2,$input_h2,$input_l3,$input_h3)=map("x$_",(19..24));
# Same x19..x24 as the input_*1..3 aliases above.
2033my ($output_l1,$output_h1,$output_l2,$output_h2,$output_l3,$output_h3)=map("x$_",(19..24));
# x6/x7: same registers as input_l0/input_h0.
2034my ($output_l0,$output_h0)=map("x$_",(6..7));
2035
# w9 is the 32-bit view of x9 (ctr32x).
2036my $ctr32w="w9";
# x9..x15, in the order listed.
2037my ($ctr32x,$ctr96_b64x,$ctr96_t32x,$rctr32x,$rk12_l,$rk12_h,$len)=map("x$_",(9..15));
# w11/w12 are the 32-bit views of x11 (ctr96_t32x) and x12 (rctr32x).
2038my ($ctr96_t32w,$rctr32w)=map("w$_",(11..12));
2039
# Vector registers v0..v7: ctr0..ctr3 are v0..v3, res0..res3 are v4..v7.
# The b / bare / d / q suffixed names are different views of the same registers.
2040my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$res0b,$res1b,$res2b,$res3b)=map("v$_.16b",(0..7));
2041my ($ctr0,$ctr1,$ctr2,$ctr3,$res0,$res1,$res2,$res3)=map("v$_",(0..7));
2042my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$res0d,$res1d,$res2d,$res3d)=map("d$_",(0..7));
2043my ($res0q,$res1q,$res2q,$res3q)=map("q$_",(4..7));
2044
# Accumulators: acc_h = v9, acc_m = v10, acc_l = v11.
2045my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(9..11));
2046my ($acc_h,$acc_m,$acc_l)=map("v$_",(9..11));
2047my ($acc_hd,$acc_md,$acc_ld)=map("d$_",(9..11));
2048
# h1..h4 = v12..v15, h12k = v16, h34k = v17.
2049my ($h1,$h2,$h3,$h4,$h12k,$h34k)=map("v$_",(12..17));
2050my ($h1q,$h2q,$h3q,$h4q)=map("q$_",(12..15));
2051my ($h1b,$h2b,$h3b,$h4b)=map("v$_.16b",(12..15));
2052
# Temporaries. t0 = v8; t3 = v4, which is also res0 / ctr_t0.
2053my $t0="v8";
2054my $t0d="d8";
2055my $t3="v4";
2056my $t3d="d4";
2057
# t1 = v30, t2 = v31.
2058my ($t1,$t2)=map("v$_",(30..31));
2059my ($t1d,$t2d)=map("d$_",(30..31));
2060
# t4 = v30 (same as t1), t5 = v8 (same as t0), t6 = v31 (same as t2).
2061my $t4="v30";
2062my $t4d="d30";
2063my $t5="v8";
2064my $t5d="d8";
2065my $t6="v31";
2066my $t6d="d31";
2067
# t7 = v5 (same as res1), t8 = v6 (same as res2), t9 = v30 (same as t1/t4).
2068my $t7="v5";
2069my $t7d="d5";
2070my $t8="v6";
2071my $t8d="d6";
2072my $t9="v30";
2073my $t9d="d30";
2074
# ctr_t0..ctr_t3 = v4..v7, the same registers as res0..res3.
2075my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3)=map("v$_",(4..7));
2076my ($ctr_t0d,$ctr_t1d,$ctr_t2d,$ctr_t3d)=map("d$_",(4..7));
2077my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b)=map("v$_.16b",(4..7));
2078
# mod_constant = v8 (same as t0/t5); mod_t = v31 (same as t2/t6).
2079my $mod_constantd="d8";
2080my $mod_constant="v8";
2081my $mod_t="v31";
2082
# rk0..rk11 = v18..v29, as .16b, q and .4s views.
2083my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9,$rk10,$rk11)=map("v$_.16b",(18..29));
2084my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q,$rk10q,$rk11q)=map("q$_",(18..29));
2085my ($rk0s,$rk1s,$rk2s,$rk3s,$rk4s,$rk5s,$rk6s,$rk7s,$rk8s,$rk9s,$rk10s,$rk11s)=map("v$_.4s",(18..29));
# Extra views of rk registers: v20 is rk2, v21 is rk3, v22 is rk4.
2086my $rk2q1="v20.1q";
2087my $rk3q1="v21.1q";
2088my $rk4v="v22";
2089my $rk4d="d22";
2090
2091#########################################################################################
2092# size_t aes_gcm_enc_192_kernel(const unsigned char *in,
2093#                               size_t len,
2094#                               unsigned char *out,
2095#                               const void *key,
2096#                               unsigned char ivec[16],
2097#                               u64 *Xi);
2098#
2099$code.=<<___;
2100.global aes_gcm_enc_192_kernel
2101.type   aes_gcm_enc_192_kernel,%function
2102.align  4
2103aes_gcm_enc_192_kernel:
2104	AARCH64_VALID_CALL_TARGET
2105	cbz     x1, .L192_enc_ret
2106	stp     x19, x20, [sp, #-112]!
2107	mov     x16, x4
2108	mov     x8, x5
2109	stp     x21, x22, [sp, #16]
2110	stp     x23, x24, [sp, #32]
2111	stp     d8, d9, [sp, #48]
2112	stp     d10, d11, [sp, #64]
2113	stp     d12, d13, [sp, #80]
2114	stp     d14, d15, [sp, #96]
2115
2116	ldp     $ctr96_b64x, $ctr96_t32x, [$counter]             @ ctr96_b64, ctr96_t32
2117#ifdef __AARCH64EB__
2118	rev     $ctr96_b64x, $ctr96_b64x
2119	rev     $ctr96_t32x, $ctr96_t32x
2120#endif
2121	ldp     $rk12_l, $rk12_h, [$cc, #192]                     @ load rk12
2122#ifdef __AARCH64EB__
2123	ror     $rk12_l, $rk12_l, #32
2124	ror     $rk12_h, $rk12_h, #32
2125#endif
2126	ld1     {$rk0s}, [$cc], #16	                             @ load rk0
2127
2128	ld1     {$rk1s}, [$cc], #16	                             @ load rk1
2129
2130	ld1     {$rk2s}, [$cc], #16	                             @ load rk2
2131
2132	lsr     $rctr32x, $ctr96_t32x, #32
2133	ld1     {$rk3s}, [$cc], #16	                             @ load rk3
2134	orr     $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
2135
2136	ld1     {$rk4s}, [$cc], #16	                             @ load rk4
2137	rev     $rctr32w, $rctr32w                               @ rev_ctr32
2138
2139	add     $rctr32w, $rctr32w, #1                           @ increment rev_ctr32
2140	fmov    $ctr3d, $ctr96_b64x                              @ CTR block 3
2141
2142	rev     $ctr32w, $rctr32w                                @ CTR block 1
2143	add     $rctr32w, $rctr32w, #1                           @ CTR block 1
2144	fmov    $ctr1d, $ctr96_b64x                              @ CTR block 1
2145
2146	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32           @ CTR block 1
2147	ld1     { $ctr0b}, [$counter]                            @ special case vector load initial counter so we can start first AES block as quickly as possible
2148
2149	fmov    $ctr1.d[1], $ctr32x                              @ CTR block 1
2150	rev     $ctr32w, $rctr32w                                @ CTR block 2
2151	add     $rctr32w, $rctr32w, #1                           @ CTR block 2
2152
2153	fmov    $ctr2d, $ctr96_b64x                              @ CTR block 2
2154	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32           @ CTR block 2
2155
2156	fmov    $ctr2.d[1], $ctr32x                              @ CTR block 2
2157	rev     $ctr32w, $rctr32w                                @ CTR block 3
2158
2159	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32           @ CTR block 3
2160	ld1     {$rk5s}, [$cc], #16	                             @ load rk5
2161
2162	fmov    $ctr3.d[1], $ctr32x                              @ CTR block 3
2163
2164	ld1     {$rk6s}, [$cc], #16	                             @ load rk6
2165
2166	ld1     {$rk7s}, [$cc], #16	                             @ load rk7
2167
2168	aese    $ctr0b, $rk0  \n  aesmc   $ctr0b, $ctr0b         @ AES block 0 - round 0
2169	ld1     { $acc_lb}, [$current_tag]
2170	ext     $acc_lb, $acc_lb, $acc_lb, #8
2171	rev64   $acc_lb, $acc_lb
2172
2173	aese    $ctr3b, $rk0  \n  aesmc   $ctr3b, $ctr3b         @ AES block 3 - round 0
2174	ld1     {$rk8s}, [$cc], #16	                             @ load rk8
2175
2176	aese    $ctr1b, $rk0  \n  aesmc   $ctr1b, $ctr1b         @ AES block 1 - round 0
2177	ldr     $h4q, [$current_tag, #112]                       @ load h4l | h4h
2178#ifndef __AARCH64EB__
2179	ext     $h4b, $h4b, $h4b, #8
2180#endif
2181	aese    $ctr2b, $rk0  \n  aesmc   $ctr2b, $ctr2b         @ AES block 2 - round 0
2182	ld1     {$rk9s}, [$cc], #16	                             @ load rk9
2183
2184	aese    $ctr0b, $rk1  \n  aesmc   $ctr0b, $ctr0b         @ AES block 0 - round 1
2185	ld1     {$rk10s}, [$cc], #16	                         @ load rk10
2186
2187	aese    $ctr1b, $rk1  \n  aesmc   $ctr1b, $ctr1b         @ AES block 1 - round 1
2188	ldr     $h1q, [$current_tag, #32]                        @ load h1l | h1h
2189#ifndef __AARCH64EB__
2190	ext     $h1b, $h1b, $h1b, #8
2191#endif
2192	aese    $ctr2b, $rk1  \n  aesmc   $ctr2b, $ctr2b         @ AES block 2 - round 1
2193	ld1     {$rk11s}, [$cc], #16	                         @ load rk11
2194
2195	aese    $ctr3b, $rk1  \n  aesmc   $ctr3b, $ctr3b         @ AES block 3 - round 1
2196	ldr     $h3q, [$current_tag, #80]                        @ load h3l | h3h
2197#ifndef __AARCH64EB__
2198	ext     $h3b, $h3b, $h3b, #8
2199#endif
2200	aese    $ctr0b, $rk2  \n  aesmc   $ctr0b, $ctr0b         @ AES block 0 - round 2
2201
2202	aese    $ctr2b, $rk2  \n  aesmc   $ctr2b, $ctr2b         @ AES block 2 - round 2
2203
2204	aese    $ctr3b, $rk2  \n  aesmc   $ctr3b, $ctr3b         @ AES block 3 - round 2
2205
2206	aese    $ctr0b, $rk3  \n  aesmc   $ctr0b, $ctr0b         @ AES block 0 - round 3
2207	trn1    $acc_h.2d, $h3.2d,    $h4.2d                     @ h4h | h3h
2208
2209	aese    $ctr2b, $rk3  \n  aesmc   $ctr2b, $ctr2b         @ AES block 2 - round 3
2210
2211	aese    $ctr1b, $rk2  \n  aesmc   $ctr1b, $ctr1b         @ AES block 1 - round 2
2212	trn2    $h34k.2d,  $h3.2d,    $h4.2d                     @ h4l | h3l
2213
2214	aese    $ctr0b, $rk4  \n  aesmc   $ctr0b, $ctr0b         @ AES block 0 - round 4
2215
2216	aese    $ctr3b, $rk3  \n  aesmc   $ctr3b, $ctr3b         @ AES block 3 - round 3
2217
2218	aese    $ctr1b, $rk3  \n  aesmc   $ctr1b, $ctr1b         @ AES block 1 - round 3
2219
2220	aese    $ctr0b, $rk5  \n  aesmc   $ctr0b, $ctr0b         @ AES block 0 - round 5
2221
2222	aese    $ctr2b, $rk4  \n  aesmc   $ctr2b, $ctr2b         @ AES block 2 - round 4
2223
2224	aese    $ctr1b, $rk4  \n  aesmc   $ctr1b, $ctr1b         @ AES block 1 - round 4
2225
2226	aese    $ctr0b, $rk6  \n  aesmc   $ctr0b, $ctr0b         @ AES block 0 - round 6
2227
2228	aese    $ctr3b, $rk4  \n  aesmc   $ctr3b, $ctr3b         @ AES block 3 - round 4
2229
2230	aese    $ctr2b, $rk5  \n  aesmc   $ctr2b, $ctr2b         @ AES block 2 - round 5
2231
2232	aese    $ctr1b, $rk5  \n  aesmc   $ctr1b, $ctr1b         @ AES block 1 - round 5
2233
2234	aese    $ctr3b, $rk5  \n  aesmc   $ctr3b, $ctr3b         @ AES block 3 - round 5
2235
2236	aese    $ctr2b, $rk6  \n  aesmc   $ctr2b, $ctr2b         @ AES block 2 - round 6
2237	ldr     $h2q, [$current_tag, #64]                        @ load h2l | h2h
2238#ifndef __AARCH64EB__
2239	ext     $h2b, $h2b, $h2b, #8
2240#endif
2241	aese    $ctr1b, $rk6  \n  aesmc   $ctr1b, $ctr1b         @ AES block 1 - round 6
2242
2243	aese    $ctr3b, $rk6  \n  aesmc   $ctr3b, $ctr3b         @ AES block 3 - round 6
2244
2245	aese    $ctr0b, $rk7  \n  aesmc   $ctr0b, $ctr0b         @ AES block 0 - round 7
2246
2247	aese    $ctr1b, $rk7  \n  aesmc   $ctr1b, $ctr1b         @ AES block 1 - round 7
2248	trn2    $h12k.2d,  $h1.2d,    $h2.2d                     @ h2l | h1l
2249
2250	aese    $ctr3b, $rk7  \n  aesmc   $ctr3b, $ctr3b         @ AES block 3 - round 7
2251
2252	aese    $ctr0b, $rk8  \n  aesmc   $ctr0b, $ctr0b         @ AES block 0 - round 8
2253
2254	aese    $ctr2b, $rk7  \n  aesmc   $ctr2b, $ctr2b         @ AES block 2 - round 7
2255	trn1    $t0.2d,    $h1.2d,    $h2.2d                     @ h2h | h1h
2256
2257	aese    $ctr1b, $rk8  \n  aesmc   $ctr1b, $ctr1b         @ AES block 1 - round 8
2258
2259	aese    $ctr3b, $rk8  \n  aesmc   $ctr3b, $ctr3b         @ AES block 3 - round 8
2260
2261	aese    $ctr2b, $rk8  \n  aesmc   $ctr2b, $ctr2b         @ AES block 2 - round 8
2262
2263	aese    $ctr0b, $rk9  \n  aesmc   $ctr0b, $ctr0b         @ AES block 0 - round 9
2264
2265	aese    $ctr3b, $rk9  \n  aesmc   $ctr3b, $ctr3b         @ AES block 3 - round 9
2266
2267	aese    $ctr2b, $rk9  \n  aesmc   $ctr2b, $ctr2b         @ AES block 2 - round 9
2268
2269	aese    $ctr1b, $rk9  \n  aesmc   $ctr1b, $ctr1b         @ AES block 1 - round 9
2270
2271	aese    $ctr0b, $rk10 \n  aesmc   $ctr0b, $ctr0b         @ AES block 0 - round 10
2272
2273	aese    $ctr2b, $rk10 \n  aesmc   $ctr2b, $ctr2b         @ AES block 2 - round 10
2274
2275	aese    $ctr1b, $rk10 \n  aesmc   $ctr1b, $ctr1b         @ AES block 1 - round 10
2276	lsr     $main_end_input_ptr, $bit_length, #3             @ byte_len
2277	mov     $len, $main_end_input_ptr
2278
2279	aese    $ctr3b, $rk10 \n  aesmc   $ctr3b, $ctr3b         @ AES block 3 - round 10
2280	sub     $main_end_input_ptr, $main_end_input_ptr, #1     @ byte_len - 1
2281
2282	eor     $h12k.16b, $h12k.16b, $t0.16b                    @ h2k | h1k
2283	and     $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0   @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
2284
2285	eor     $h34k.16b, $h34k.16b, $acc_h.16b                 @ h4k | h3k
2286
2287	aese    $ctr2b, $rk11                                    @ AES block 2 - round 11
2288	add     $end_input_ptr, $input_ptr, $bit_length, lsr #3  @ end_input_ptr
2289	add     $main_end_input_ptr, $main_end_input_ptr, $input_ptr
2290
2291	aese    $ctr1b, $rk11                                    @ AES block 1 - round 11
2292	cmp     $input_ptr, $main_end_input_ptr                  @ check if we have <= 4 blocks
2293
2294	aese    $ctr0b, $rk11                                    @ AES block 0 - round 11
2295	add     $rctr32w, $rctr32w, #1                           @ CTR block 3
2296
2297	aese    $ctr3b, $rk11                                    @ AES block 3 - round 11
2298	b.ge    .L192_enc_tail                                   @ handle tail
2299
2300	rev     $ctr32w, $rctr32w                                @ CTR block 4
2301	ldp     $input_l0, $input_h0, [$input_ptr, #0]           @ AES block 0 - load plaintext
2302#ifdef __AARCH64EB__
2303	rev     $input_l0, $input_l0
2304	rev     $input_h0, $input_h0
2305#endif
2306	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32           @ CTR block 4
2307	ldp     $input_l2, $input_h2, [$input_ptr, #32]          @ AES block 2 - load plaintext
2308#ifdef __AARCH64EB__
2309	rev     $input_l2, $input_l2
2310	rev     $input_h2, $input_h2
2311#endif
2312	ldp     $input_l3, $input_h3, [$input_ptr, #48]          @ AES block 3 - load plaintext
2313#ifdef __AARCH64EB__
2314	rev     $input_l3, $input_l3
2315	rev     $input_h3, $input_h3
2316#endif
2317	ldp     $input_l1, $input_h1, [$input_ptr, #16]          @ AES block 1 - load plaintext
2318#ifdef __AARCH64EB__
2319	rev     $input_l1, $input_l1
2320	rev     $input_h1, $input_h1
2321#endif
2322	add     $input_ptr, $input_ptr, #64                      @ AES input_ptr update
2323	cmp     $input_ptr, $main_end_input_ptr                  @ check if we have <= 8 blocks
2324
2325	eor     $input_l0, $input_l0, $rk12_l                    @ AES block 0 - round 12 low
2326
2327	eor     $input_h0, $input_h0, $rk12_h                    @ AES block 0 - round 12 high
2328	eor     $input_h2, $input_h2, $rk12_h                    @ AES block 2 - round 12 high
2329	fmov    $ctr_t0d, $input_l0                              @ AES block 0 - mov low
2330
2331	eor     $input_h3, $input_h3, $rk12_h                    @ AES block 3 - round 12 high
2332	fmov    $ctr_t0.d[1], $input_h0                          @ AES block 0 - mov high
2333
2334	eor     $input_l2, $input_l2, $rk12_l                    @ AES block 2 - round 12 low
2335	eor     $input_l1, $input_l1, $rk12_l                    @ AES block 1 - round 12 low
2336
2337	fmov    $ctr_t1d, $input_l1                              @ AES block 1 - mov low
2338	eor     $input_h1, $input_h1, $rk12_h                    @ AES block 1 - round 12 high
2339
2340	fmov    $ctr_t1.d[1], $input_h1                          @ AES block 1 - mov high
2341
2342	eor     $input_l3, $input_l3, $rk12_l                    @ AES block 3 - round 12 low
2343	fmov    $ctr_t2d, $input_l2                              @ AES block 2 - mov low
2344
2345	add     $rctr32w, $rctr32w, #1                           @ CTR block 4
2346	eor     $res0b, $ctr_t0b, $ctr0b                         @ AES block 0 - result
2347	fmov    $ctr0d, $ctr96_b64x                              @ CTR block 4
2348
2349	fmov    $ctr0.d[1], $ctr32x                              @ CTR block 4
2350	rev     $ctr32w, $rctr32w                                @ CTR block 5
2351
2352	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32           @ CTR block 5
2353	add     $rctr32w, $rctr32w, #1                           @ CTR block 5
2354
2355	fmov    $ctr_t3d, $input_l3                              @ AES block 3 - mov low
2356	st1     { $res0b}, [$output_ptr], #16                    @ AES block 0 - store result
2357
2358	fmov    $ctr_t2.d[1], $input_h2                          @ AES block 2 - mov high
2359
2360	eor     $res1b, $ctr_t1b, $ctr1b                         @ AES block 1 - result
2361	fmov    $ctr1d, $ctr96_b64x                              @ CTR block 5
2362	st1     { $res1b}, [$output_ptr], #16                    @ AES block 1 - store result
2363
2364	fmov    $ctr_t3.d[1], $input_h3                          @ AES block 3 - mov high
2365
2366	fmov    $ctr1.d[1], $ctr32x                              @ CTR block 5
2367	rev     $ctr32w, $rctr32w                                @ CTR block 6
2368
2369	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32           @ CTR block 6
2370
2371	add     $rctr32w, $rctr32w, #1                           @ CTR block 6
2372	eor     $res2b, $ctr_t2b, $ctr2b                         @ AES block 2 - result
2373	fmov    $ctr2d, $ctr96_b64x                              @ CTR block 6
2374
2375	fmov    $ctr2.d[1], $ctr32x                              @ CTR block 6
2376	rev     $ctr32w, $rctr32w                                @ CTR block 7
2377
2378	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32           @ CTR block 7
2379	st1     { $res2b}, [$output_ptr], #16                    @ AES block 2 - store result
2380
2381	eor     $res3b, $ctr_t3b, $ctr3b                         @ AES block 3 - result
2382	st1     { $res3b}, [$output_ptr], #16                    @ AES block 3 - store result
2383	b.ge    .L192_enc_prepretail                             @ do prepretail
2384
2385	.L192_enc_main_loop:                                     @ main loop start
2386	aese    $ctr2b, $rk0  \n  aesmc   $ctr2b, $ctr2b         @ AES block 4k+6 - round 0
2387	rev64   $res1b, $res1b                                   @ GHASH block 4k+1 (t0 and t1 free)
2388
2389	aese    $ctr1b, $rk0  \n  aesmc   $ctr1b, $ctr1b         @ AES block 4k+5 - round 0
2390	ldp     $input_l1, $input_h1, [$input_ptr, #16]          @ AES block 4k+5 - load plaintext
2391#ifdef __AARCH64EB__
2392	rev     $input_l1, $input_l1
2393	rev     $input_h1, $input_h1
2394#endif
2395	ext     $acc_lb, $acc_lb, $acc_lb, #8                    @ PRE 0
2396	fmov    $ctr3d, $ctr96_b64x                              @ CTR block 4k+3
2397	rev64   $res0b, $res0b                                   @ GHASH block 4k (only t0 is free)
2398
2399	aese    $ctr2b, $rk1  \n  aesmc   $ctr2b, $ctr2b         @ AES block 4k+6 - round 1
2400	fmov    $ctr3.d[1], $ctr32x                              @ CTR block 4k+3
2401
2402	pmull2  $t1.1q, $res1.2d, $h3.2d                         @ GHASH block 4k+1 - high
2403	rev64   $res3b, $res3b                                   @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
2404	ldp     $input_l2, $input_h2, [$input_ptr, #32]          @ AES block 4k+6 - load plaintext
2405#ifdef __AARCH64EB__
2406	rev     $input_l2, $input_l2
2407	rev     $input_h2, $input_h2
2408#endif
2409	aese    $ctr0b, $rk0  \n  aesmc   $ctr0b, $ctr0b         @ AES block 4k+4 - round 0
2410	ldp     $input_l3, $input_h3, [$input_ptr, #48]          @ AES block 4k+3 - load plaintext
2411#ifdef __AARCH64EB__
2412	rev     $input_l3, $input_l3
2413	rev     $input_h3, $input_h3
2414#endif
2415	pmull   $t2.1q, $res1.1d, $h3.1d                         @ GHASH block 4k+1 - low
2416	eor     $res0b, $res0b, $acc_lb                          @ PRE 1
2417
2418	aese    $ctr1b, $rk1  \n  aesmc   $ctr1b, $ctr1b         @ AES block 4k+5 - round 1
2419
2420	aese    $ctr0b, $rk1  \n  aesmc   $ctr0b, $ctr0b         @ AES block 4k+4 - round 1
2421	rev64   $res2b, $res2b                                   @ GHASH block 4k+2 (t0, t1, and t2 free)
2422
2423	aese    $ctr3b, $rk0  \n  aesmc   $ctr3b, $ctr3b         @ AES block 4k+7 - round 0
2424	eor     $input_h3, $input_h3, $rk12_h                    @ AES block 4k+3 - round 12 high
2425
2426	pmull   $acc_l.1q, $res0.1d, $h4.1d                      @ GHASH block 4k - low
2427	mov     $t0d, $res0.d[1]                                 @ GHASH block 4k - mid
2428
2429	aese    $ctr0b, $rk2  \n  aesmc   $ctr0b, $ctr0b         @ AES block 4k+4 - round 2
2430
2431	aese    $ctr3b, $rk1  \n  aesmc   $ctr3b, $ctr3b         @ AES block 4k+7 - round 1
2432	eor     $input_l2, $input_l2, $rk12_l                    @ AES block 4k+6 - round 12 low
2433
2434	eor     $t0.8b, $t0.8b, $res0.8b                         @ GHASH block 4k - mid
2435	eor     $acc_lb, $acc_lb, $t2.16b                        @ GHASH block 4k+1 - low
2436
2437	aese    $ctr0b, $rk3  \n  aesmc   $ctr0b, $ctr0b         @ AES block 4k+4 - round 3
2438	eor     $input_l1, $input_l1, $rk12_l                    @ AES block 4k+5 - round 12 low
2439
2440	aese    $ctr1b, $rk2  \n  aesmc   $ctr1b, $ctr1b         @ AES block 4k+5 - round 2
2441	mov     $t6d, $res2.d[1]                                 @ GHASH block 4k+2 - mid
2442
2443	pmull2  $acc_h.1q, $res0.2d, $h4.2d                      @ GHASH block 4k - high
2444	mov     $t3d, $res1.d[1]                                 @ GHASH block 4k+1 - mid
2445
2446	aese    $ctr2b, $rk2  \n  aesmc   $ctr2b, $ctr2b         @ AES block 4k+6 - round 2
2447
2448	aese    $ctr1b, $rk3  \n  aesmc   $ctr1b, $ctr1b         @ AES block 4k+5 - round 3
2449
2450	mov     $acc_md, $h34k.d[1]                              @ GHASH block 4k - mid
2451	eor     $acc_hb, $acc_hb, $t1.16b                        @ GHASH block 4k+1 - high
2452
2453	aese    $ctr3b, $rk2  \n  aesmc   $ctr3b, $ctr3b         @ AES block 4k+7 - round 2
2454	eor     $t6.8b, $t6.8b, $res2.8b                         @ GHASH block 4k+2 - mid
2455
2456	pmull2  $t4.1q, $res2.2d, $h2.2d                         @ GHASH block 4k+2 - high
2457
2458	aese    $ctr0b, $rk4  \n  aesmc   $ctr0b, $ctr0b         @ AES block 4k+4 - round 4
2459	eor     $t3.8b, $t3.8b, $res1.8b                         @ GHASH block 4k+1 - mid
2460
2461	aese    $ctr3b, $rk3  \n  aesmc   $ctr3b, $ctr3b         @ AES block 4k+7 - round 3
2462
2463	pmull2  $t7.1q, $res3.2d, $h1.2d                         @ GHASH block 4k+3 - high
2464	eor     $input_h1, $input_h1, $rk12_h                    @ AES block 4k+5 - round 12 high
2465	ins     $t6.d[1], $t6.d[0]                               @ GHASH block 4k+2 - mid
2466
2467	aese    $ctr0b, $rk5  \n  aesmc   $ctr0b, $ctr0b         @ AES block 4k+4 - round 5
2468	add     $rctr32w, $rctr32w, #1                           @ CTR block 4k+3
2469
2470	aese    $ctr3b, $rk4  \n  aesmc   $ctr3b, $ctr3b         @ AES block 4k+7 - round 4
2471	eor     $acc_hb, $acc_hb, $t4.16b                        @ GHASH block 4k+2 - high
2472
2473	pmull   $t3.1q, $t3.1d, $h34k.1d                         @ GHASH block 4k+1 - mid
2474	eor     $input_h2, $input_h2, $rk12_h                    @ AES block 4k+6 - round 12 high
2475
2476	pmull2  $t6.1q, $t6.2d, $h12k.2d                         @ GHASH block 4k+2 - mid
2477	eor     $input_l3, $input_l3, $rk12_l                    @ AES block 4k+3 - round 12 low
2478	mov     $t9d, $res3.d[1]                                 @ GHASH block 4k+3 - mid
2479
2480	pmull   $acc_m.1q, $t0.1d, $acc_m.1d                     @ GHASH block 4k - mid
2481	rev     $ctr32w, $rctr32w                                @ CTR block 4k+8
2482
2483	pmull   $t5.1q, $res2.1d, $h2.1d                         @ GHASH block 4k+2 - low
2484	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32           @ CTR block 4k+8
2485
2486	aese    $ctr2b, $rk3  \n  aesmc   $ctr2b, $ctr2b         @ AES block 4k+6 - round 3
2487	eor     $t9.8b, $t9.8b, $res3.8b                         @ GHASH block 4k+3 - mid
2488
2489	aese    $ctr1b, $rk4  \n  aesmc   $ctr1b, $ctr1b         @ AES block 4k+5 - round 4
2490	ldp     $input_l0, $input_h0, [$input_ptr, #0]           @ AES block 4k+4 - load plaintext
2491#ifdef __AARCH64EB__
2492	rev     $input_l0, $input_l0
2493	rev     $input_h0, $input_h0
2494#endif
2495	aese    $ctr0b, $rk6  \n  aesmc   $ctr0b, $ctr0b         @ AES block 4k+4 - round 6
2496	eor     $acc_lb, $acc_lb, $t5.16b                        @ GHASH block 4k+2 - low
2497
2498	aese    $ctr2b, $rk4  \n  aesmc   $ctr2b, $ctr2b         @ AES block 4k+6 - round 4
2499	add     $input_ptr, $input_ptr, #64                      @ AES input_ptr update
2500
2501	aese    $ctr1b, $rk5  \n  aesmc   $ctr1b, $ctr1b         @ AES block 4k+5 - round 5
2502	movi    $mod_constant.8b, #0xc2
2503
2504	pmull   $t8.1q, $res3.1d, $h1.1d                         @ GHASH block 4k+3 - low
2505	eor     $input_h0, $input_h0, $rk12_h                    @ AES block 4k+4 - round 12 high
2506	eor     $acc_mb, $acc_mb, $t3.16b                        @ GHASH block 4k+1 - mid
2507
2508	aese    $ctr2b, $rk5  \n  aesmc   $ctr2b, $ctr2b         @ AES block 4k+6 - round 5
2509	eor     $input_l0, $input_l0, $rk12_l                    @ AES block 4k+4 - round 12 low
2510
2511	aese    $ctr1b, $rk6  \n  aesmc   $ctr1b, $ctr1b         @ AES block 4k+5 - round 6
2512	shl     $mod_constantd, $mod_constantd, #56              @ mod_constant
2513
2514	aese    $ctr3b, $rk5  \n  aesmc   $ctr3b, $ctr3b         @ AES block 4k+7 - round 5
2515	eor     $acc_hb, $acc_hb, $t7.16b                        @ GHASH block 4k+3 - high
2516
2517	aese    $ctr0b, $rk7  \n  aesmc   $ctr0b, $ctr0b         @ AES block 4k+4 - round 7
2518	fmov    $ctr_t1d, $input_l1                              @ AES block 4k+5 - mov low
2519
2520	aese    $ctr1b, $rk7  \n  aesmc   $ctr1b, $ctr1b         @ AES block 4k+5 - round 7
2521	eor     $acc_mb, $acc_mb, $t6.16b                        @ GHASH block 4k+2 - mid
2522
2523	aese    $ctr3b, $rk6  \n  aesmc   $ctr3b, $ctr3b         @ AES block 4k+7 - round 6
2524	fmov    $ctr_t1.d[1], $input_h1                          @ AES block 4k+5 - mov high
2525
2526	aese    $ctr0b, $rk8  \n  aesmc   $ctr0b, $ctr0b         @ AES block 4k+4 - round 8
2527	eor     $acc_lb, $acc_lb, $t8.16b                        @ GHASH block 4k+3 - low
2528
2529	pmull   $t9.1q, $t9.1d, $h12k.1d                         @ GHASH block 4k+3 - mid
2530	cmp     $input_ptr, $main_end_input_ptr                  @ LOOP CONTROL
2531	fmov    $ctr_t0d, $input_l0                              @ AES block 4k+4 - mov low
2532
2533	aese    $ctr2b, $rk6  \n  aesmc   $ctr2b, $ctr2b         @ AES block 4k+6 - round 6
2534	fmov    $ctr_t0.d[1], $input_h0                          @ AES block 4k+4 - mov high
2535
2536	aese    $ctr1b, $rk8  \n  aesmc   $ctr1b, $ctr1b         @ AES block 4k+5 - round 8
2537	fmov    $ctr_t3d, $input_l3                              @ AES block 4k+3 - mov low
2538
2539	eor     $acc_mb, $acc_mb, $t9.16b                        @ GHASH block 4k+3 - mid
2540	eor     $t9.16b, $acc_lb, $acc_hb                        @ MODULO - karatsuba tidy up
2541	add     $rctr32w, $rctr32w, #1                           @ CTR block 4k+8
2542
2543	aese    $ctr2b, $rk7  \n  aesmc   $ctr2b, $ctr2b         @ AES block 4k+6 - round 7
2544	fmov    $ctr_t3.d[1], $input_h3                          @ AES block 4k+3 - mov high
2545
2546	pmull   $mod_t.1q, $acc_h.1d, $mod_constant.1d           @ MODULO - top 64b align with mid
2547	ext     $acc_hb, $acc_hb, $acc_hb, #8                    @ MODULO - other top alignment
2548	fmov    $ctr_t2d, $input_l2                              @ AES block 4k+6 - mov low
2549
2550	aese    $ctr3b, $rk7  \n  aesmc   $ctr3b, $ctr3b         @ AES block 4k+7 - round 7
2551
2552	aese    $ctr0b, $rk9  \n  aesmc   $ctr0b, $ctr0b         @ AES block 4k+4 - round 9
2553	eor     $acc_mb, $acc_mb, $t9.16b                        @ MODULO - karatsuba tidy up
2554
2555	aese    $ctr2b, $rk8  \n  aesmc   $ctr2b, $ctr2b         @ AES block 4k+6 - round 8
2556
2557	aese    $ctr3b, $rk8  \n  aesmc   $ctr3b, $ctr3b         @ AES block 4k+7 - round 8
2558
2559	aese    $ctr1b, $rk9  \n  aesmc   $ctr1b, $ctr1b         @ AES block 4k+5 - round 9
2560
2561	aese    $ctr0b, $rk10 \n  aesmc   $ctr0b, $ctr0b         @ AES block 4k+4 - round 10
2562	eor     $acc_mb, $acc_mb, $mod_t.16b                     @ MODULO - fold into mid
2563
2564	aese    $ctr3b, $rk9  \n  aesmc   $ctr3b, $ctr3b         @ AES block 4k+7 - round 9
2565
2566	aese    $ctr2b, $rk9  \n  aesmc   $ctr2b, $ctr2b         @ AES block 4k+6 - round 9
2567
2568	aese    $ctr0b, $rk11                                    @ AES block 4k+4 - round 11
2569
2570	aese    $ctr1b, $rk10 \n  aesmc   $ctr1b, $ctr1b         @ AES block 4k+5 - round 10
2571	eor     $acc_mb, $acc_mb, $acc_hb                        @ MODULO - fold into mid
2572
2573	aese    $ctr2b, $rk10 \n  aesmc   $ctr2b, $ctr2b         @ AES block 4k+6 - round 10
2574
2575	eor     $res0b, $ctr_t0b, $ctr0b                         @ AES block 4k+4 - result
2576	fmov    $ctr0d, $ctr96_b64x                              @ CTR block 4k+8
2577
2578	aese    $ctr1b, $rk11                                    @ AES block 4k+5 - round 11
2579	fmov    $ctr0.d[1], $ctr32x                              @ CTR block 4k+8
2580	rev     $ctr32w, $rctr32w                                @ CTR block 4k+9
2581
2582	pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d           @ MODULO - mid 64b align with low
2583	fmov    $ctr_t2.d[1], $input_h2                          @ AES block 4k+6 - mov high
2584	st1     { $res0b}, [$output_ptr], #16                    @ AES block 4k+4 - store result
2585
2586	aese    $ctr3b, $rk10 \n  aesmc   $ctr3b, $ctr3b         @ AES block 4k+7 - round 10
2587	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32           @ CTR block 4k+9
2588
2589	eor     $res1b, $ctr_t1b, $ctr1b                         @ AES block 4k+5 - result
2590	add     $rctr32w, $rctr32w, #1                           @ CTR block 4k+9
2591	fmov    $ctr1d, $ctr96_b64x                              @ CTR block 4k+9
2592
2593	aese    $ctr2b, $rk11                                    @ AES block 4k+6 - round 11
2594	fmov    $ctr1.d[1], $ctr32x                              @ CTR block 4k+9
2595	rev     $ctr32w, $rctr32w                                @ CTR block 4k+10
2596
2597	add     $rctr32w, $rctr32w, #1                           @ CTR block 4k+10
2598	ext     $acc_mb, $acc_mb, $acc_mb, #8                    @ MODULO - other mid alignment
2599	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32           @ CTR block 4k+10
2600
2601	st1     { $res1b}, [$output_ptr], #16                    @ AES block 4k+5 - store result
2602	eor     $acc_lb, $acc_lb, $acc_hb                        @ MODULO - fold into low
2603
2604	aese    $ctr3b, $rk11                                    @ AES block 4k+7 - round 11
2605	eor     $res2b, $ctr_t2b, $ctr2b                         @ AES block 4k+6 - result
2606	fmov    $ctr2d, $ctr96_b64x                              @ CTR block 4k+10
2607
2608	st1     { $res2b}, [$output_ptr], #16                    @ AES block 4k+6 - store result
2609	fmov    $ctr2.d[1], $ctr32x                              @ CTR block 4k+10
2610	rev     $ctr32w, $rctr32w                                @ CTR block 4k+11
2611
2612	eor     $acc_lb, $acc_lb, $acc_mb                        @ MODULO - fold into low
2613	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32           @ CTR block 4k+11
2614
2615	eor     $res3b, $ctr_t3b, $ctr3b                         @ AES block 4k+3 - result
2616	st1     { $res3b}, [$output_ptr], #16                    @ AES block 4k+3 - store result
2617	b.lt    .L192_enc_main_loop
2618
2619	.L192_enc_prepretail:                                    @ PREPRETAIL
2620	aese    $ctr0b, $rk0  \n  aesmc   $ctr0b, $ctr0b         @ AES block 4k+4 - round 0
2621	rev64   $res0b, $res0b                                   @ GHASH block 4k (only t0 is free)
2622
2623	fmov    $ctr3d, $ctr96_b64x                              @ CTR block 4k+3
2624	ext     $acc_lb, $acc_lb, $acc_lb, #8                    @ PRE 0
2625	add     $rctr32w, $rctr32w, #1                           @ CTR block 4k+3
2626
2627	aese    $ctr1b, $rk0  \n  aesmc   $ctr1b, $ctr1b         @ AES block 4k+5 - round 0
2628	rev64   $res1b, $res1b                                   @ GHASH block 4k+1 (t0 and t1 free)
2629
2630	aese    $ctr2b, $rk0  \n  aesmc   $ctr2b, $ctr2b         @ AES block 4k+6 - round 0
2631
2632	fmov    $ctr3.d[1], $ctr32x                              @ CTR block 4k+3
2633	eor     $res0b, $res0b, $acc_lb                          @ PRE 1
2634	mov     $acc_md, $h34k.d[1]                              @ GHASH block 4k - mid
2635
2636	aese    $ctr1b, $rk1  \n  aesmc   $ctr1b, $ctr1b         @ AES block 4k+5 - round 1
2637	rev64   $res2b, $res2b                                   @ GHASH block 4k+2 (t0, t1, and t2 free)
2638
2639	pmull2  $t1.1q, $res1.2d, $h3.2d                         @ GHASH block 4k+1 - high
2640
2641	pmull   $acc_l.1q, $res0.1d, $h4.1d                      @ GHASH block 4k - low
2642	mov     $t0d, $res0.d[1]                                 @ GHASH block 4k - mid
2643
2644	pmull   $t2.1q, $res1.1d, $h3.1d                         @ GHASH block 4k+1 - low
2645	rev64   $res3b, $res3b                                   @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
2646
2647	pmull2  $acc_h.1q, $res0.2d, $h4.2d                      @ GHASH block 4k - high
2648
2649	eor     $t0.8b, $t0.8b, $res0.8b                         @ GHASH block 4k - mid
2650	mov     $t3d, $res1.d[1]                                 @ GHASH block 4k+1 - mid
2651
2652	eor     $acc_lb, $acc_lb, $t2.16b                        @ GHASH block 4k+1 - low
2653	mov     $t6d, $res2.d[1]                                 @ GHASH block 4k+2 - mid
2654
2655	aese    $ctr3b, $rk0  \n  aesmc   $ctr3b, $ctr3b         @ AES block 4k+7 - round 0
2656	eor     $acc_hb, $acc_hb, $t1.16b                        @ GHASH block 4k+1 - high
2657
2658	pmull2  $t4.1q, $res2.2d, $h2.2d                         @ GHASH block 4k+2 - high
2659
2660	eor     $t3.8b, $t3.8b, $res1.8b                         @ GHASH block 4k+1 - mid
2661	eor     $t6.8b, $t6.8b, $res2.8b                         @ GHASH block 4k+2 - mid
2662
2663	aese    $ctr3b, $rk1  \n  aesmc   $ctr3b, $ctr3b         @ AES block 4k+7 - round 1
2664
2665	aese    $ctr2b, $rk1  \n  aesmc   $ctr2b, $ctr2b         @ AES block 4k+6 - round 1
2666	eor     $acc_hb, $acc_hb, $t4.16b                        @ GHASH block 4k+2 - high
2667
2668	aese    $ctr0b, $rk1  \n  aesmc   $ctr0b, $ctr0b         @ AES block 4k+4 - round 1
2669
2670	aese    $ctr1b, $rk2  \n  aesmc   $ctr1b, $ctr1b         @ AES block 4k+5 - round 2
2671	mov     $t9d, $res3.d[1]                                 @ GHASH block 4k+3 - mid
2672
2673	pmull2  $t7.1q, $res3.2d, $h1.2d                         @ GHASH block 4k+3 - high
2674	ins     $t6.d[1], $t6.d[0]                               @ GHASH block 4k+2 - mid
2675
2676	aese    $ctr0b, $rk2  \n  aesmc   $ctr0b, $ctr0b         @ AES block 4k+4 - round 2
2677
2678	pmull   $acc_m.1q, $t0.1d, $acc_m.1d                     @ GHASH block 4k - mid
2679	eor     $t9.8b, $t9.8b, $res3.8b                         @ GHASH block 4k+3 - mid
2680
2681	aese    $ctr1b, $rk3  \n  aesmc   $ctr1b, $ctr1b         @ AES block 4k+5 - round 3
2682
2683	pmull2  $t6.1q, $t6.2d, $h12k.2d                         @ GHASH block 4k+2 - mid
2684
2685	pmull   $t3.1q, $t3.1d, $h34k.1d                         @ GHASH block 4k+1 - mid
2686
2687	pmull   $t9.1q, $t9.1d, $h12k.1d                         @ GHASH block 4k+3 - mid
2688	eor     $acc_hb, $acc_hb, $t7.16b                        @ GHASH block 4k+3 - high
2689
2690	pmull   $t5.1q, $res2.1d, $h2.1d                         @ GHASH block 4k+2 - low
2691
2692	aese    $ctr0b, $rk3  \n  aesmc   $ctr0b, $ctr0b         @ AES block 4k+4 - round 3
2693	eor     $acc_mb, $acc_mb, $t3.16b                        @ GHASH block 4k+1 - mid
2694
2695	aese    $ctr3b, $rk2  \n  aesmc   $ctr3b, $ctr3b         @ AES block 4k+7 - round 2
2696
2697	aese    $ctr2b, $rk2  \n  aesmc   $ctr2b, $ctr2b         @ AES block 4k+6 - round 2
2698	eor     $acc_lb, $acc_lb, $t5.16b                        @ GHASH block 4k+2 - low
2699
2700	aese    $ctr0b, $rk4  \n  aesmc   $ctr0b, $ctr0b         @ AES block 4k+4 - round 4
2701
2702	aese    $ctr3b, $rk3  \n  aesmc   $ctr3b, $ctr3b         @ AES block 4k+7 - round 3
2703	eor     $acc_mb, $acc_mb, $t6.16b                        @ GHASH block 4k+2 - mid
2704
2705	aese    $ctr2b, $rk3  \n  aesmc   $ctr2b, $ctr2b         @ AES block 4k+6 - round 3
2706
2707	pmull   $t8.1q, $res3.1d, $h1.1d                         @ GHASH block 4k+3 - low
2708	movi    $mod_constant.8b, #0xc2
2709
2710	aese    $ctr3b, $rk4  \n  aesmc   $ctr3b, $ctr3b         @ AES block 4k+7 - round 4
2711
2712	aese    $ctr2b, $rk4  \n  aesmc   $ctr2b, $ctr2b         @ AES block 4k+6 - round 4
2713
2714	aese    $ctr1b, $rk4  \n  aesmc   $ctr1b, $ctr1b         @ AES block 4k+5 - round 4
2715	eor     $acc_mb, $acc_mb, $t9.16b                        @ GHASH block 4k+3 - mid
2716
2717	aese    $ctr3b, $rk5  \n  aesmc   $ctr3b, $ctr3b         @ AES block 4k+7 - round 5
2718
2719	aese    $ctr2b, $rk5  \n  aesmc   $ctr2b, $ctr2b         @ AES block 4k+6 - round 5
2720
2721	aese    $ctr1b, $rk5  \n  aesmc   $ctr1b, $ctr1b         @ AES block 4k+5 - round 5
2722	eor     $acc_lb, $acc_lb, $t8.16b                        @ GHASH block 4k+3 - low
2723
2724	aese    $ctr0b, $rk5  \n  aesmc   $ctr0b, $ctr0b         @ AES block 4k+4 - round 5
2725
2726	aese    $ctr3b, $rk6  \n  aesmc   $ctr3b, $ctr3b         @ AES block 4k+7 - round 6
2727	eor     $acc_mb, $acc_mb, $acc_hb                        @ karatsuba tidy up
2728
2729	aese    $ctr1b, $rk6  \n  aesmc   $ctr1b, $ctr1b         @ AES block 4k+5 - round 6
2730
2731	aese    $ctr0b, $rk6  \n  aesmc   $ctr0b, $ctr0b         @ AES block 4k+4 - round 6
2732	shl     $mod_constantd, $mod_constantd, #56              @ mod_constant
2733
2734	aese    $ctr3b, $rk7  \n  aesmc   $ctr3b, $ctr3b         @ AES block 4k+7 - round 7
2735
2736	aese    $ctr1b, $rk7  \n  aesmc   $ctr1b, $ctr1b         @ AES block 4k+5 - round 7
2737	eor     $acc_mb, $acc_mb, $acc_lb
2738
2739	aese    $ctr0b, $rk7  \n  aesmc   $ctr0b, $ctr0b         @ AES block 4k+4 - round 7
2740
2741	pmull   $t1.1q, $acc_h.1d, $mod_constant.1d
2742
2743	aese    $ctr2b, $rk6  \n  aesmc   $ctr2b, $ctr2b         @ AES block 4k+6 - round 6
2744	ext     $acc_hb, $acc_hb, $acc_hb, #8
2745
2746	aese    $ctr0b, $rk8  \n  aesmc   $ctr0b, $ctr0b         @ AES block 4k+4 - round 8
2747
2748	aese    $ctr1b, $rk8  \n  aesmc   $ctr1b, $ctr1b         @ AES block 4k+5 - round 8
2749	eor     $acc_mb, $acc_mb, $t1.16b
2750
2751	aese    $ctr2b, $rk7  \n  aesmc   $ctr2b, $ctr2b         @ AES block 4k+6 - round 7
2752
2753	aese    $ctr3b, $rk8  \n  aesmc   $ctr3b, $ctr3b         @ AES block 4k+7 - round 8
2754
2755	aese    $ctr0b, $rk9  \n  aesmc   $ctr0b, $ctr0b         @ AES block 4k+4 - round 9
2756
2757	aese    $ctr2b, $rk8  \n  aesmc   $ctr2b, $ctr2b         @ AES block 4k+6 - round 8
2758	eor     $acc_mb, $acc_mb, $acc_hb
2759
2760	aese    $ctr3b, $rk9  \n  aesmc   $ctr3b, $ctr3b         @ AES block 4k+7 - round 9
2761
2762	aese    $ctr1b, $rk9  \n  aesmc   $ctr1b, $ctr1b         @ AES block 4k+5 - round 9
2763
2764	aese    $ctr2b, $rk9  \n  aesmc   $ctr2b, $ctr2b         @ AES block 4k+6 - round 9
2765
2766	pmull   $t1.1q, $acc_m.1d, $mod_constant.1d
2767
2768	ext     $acc_mb, $acc_mb, $acc_mb, #8
2769
2770	aese    $ctr3b, $rk10 \n  aesmc   $ctr3b, $ctr3b         @ AES block 4k+7 - round 10
2771
2772	aese    $ctr0b, $rk10 \n  aesmc   $ctr0b, $ctr0b         @ AES block 4k+4 - round 10
2773
2774	aese    $ctr2b, $rk10 \n  aesmc   $ctr2b, $ctr2b         @ AES block 4k+6 - round 10
2775
2776	aese    $ctr1b, $rk10 \n  aesmc   $ctr1b, $ctr1b         @ AES block 4k+5 - round 10
2777	eor     $acc_lb, $acc_lb, $t1.16b
2778
2779	aese    $ctr0b, $rk11                                    @ AES block 4k+4 - round 11
2780
2781	aese    $ctr3b, $rk11                                    @ AES block 4k+7 - round 11
2782
2783	aese    $ctr2b, $rk11                                    @ AES block 4k+6 - round 11
2784
2785	aese    $ctr1b, $rk11                                    @ AES block 4k+5 - round 11
2786	eor     $acc_lb, $acc_lb, $acc_mb
2787	.L192_enc_tail:                                          @ TAIL
2788
2789	sub     $main_end_input_ptr, $end_input_ptr, $input_ptr  @ main_end_input_ptr is number of bytes left to process
2790	ldp     $input_l0, $input_h0, [$input_ptr], #16          @ AES block 4k+4 - load plaintext
2791#ifdef __AARCH64EB__
2792	rev     $input_l0, $input_l0
2793	rev     $input_h0, $input_h0
2794#endif
2795	eor     $input_l0, $input_l0, $rk12_l                    @ AES block 4k+4 - round 12 low
2796	eor     $input_h0, $input_h0, $rk12_h                    @ AES block 4k+4 - round 12 high
2797
2798	fmov    $ctr_t0d, $input_l0                              @ AES block 4k+4 - mov low
2799
2800	fmov    $ctr_t0.d[1], $input_h0                          @ AES block 4k+4 - mov high
2801	cmp     $main_end_input_ptr, #48
2802
2803	eor     $res1b, $ctr_t0b, $ctr0b                         @ AES block 4k+4 - result
2804
2805	ext     $t0.16b, $acc_lb, $acc_lb, #8                    @ prepare final partial tag
2806	b.gt    .L192_enc_blocks_more_than_3
2807
2808	sub     $rctr32w, $rctr32w, #1
2809	movi    $acc_m.8b, #0
2810
2811	mov     $ctr3b, $ctr2b
2812	movi    $acc_h.8b, #0
2813	cmp     $main_end_input_ptr, #32
2814
2815	mov     $ctr2b, $ctr1b
2816	movi    $acc_l.8b, #0
2817	b.gt    .L192_enc_blocks_more_than_2
2818
2819	sub     $rctr32w, $rctr32w, #1
2820
2821	mov     $ctr3b, $ctr1b
2822	cmp     $main_end_input_ptr, #16
2823	b.gt    .L192_enc_blocks_more_than_1
2824
2825	sub     $rctr32w, $rctr32w, #1
2826	b       .L192_enc_blocks_less_than_1
2827	.L192_enc_blocks_more_than_3:                            @ blocks left >  3
2828	st1     { $res1b}, [$output_ptr], #16                    @ AES final-3 block  - store result
2829
2830	ldp     $input_l0, $input_h0, [$input_ptr], #16          @ AES final-2 block - load input low & high
2831#ifdef __AARCH64EB__
2832	rev     $input_l0, $input_l0
2833	rev     $input_h0, $input_h0
2834#endif
2835	rev64   $res0b, $res1b                                   @ GHASH final-3 block
2836
2837	eor     $input_l0, $input_l0, $rk12_l                    @ AES final-2 block - round 12 low
2838	eor     $res0b, $res0b, $t0.16b                          @ feed in partial tag
2839
2840	eor     $input_h0, $input_h0, $rk12_h                    @ AES final-2 block - round 12 high
2841	fmov    $res1d, $input_l0                                @ AES final-2 block - mov low
2842
2843	fmov    $res1.d[1], $input_h0                            @ AES final-2 block - mov high
2844
2845	mov     $rk4d, $res0.d[1]                                @ GHASH final-3 block - mid
2846
2847	pmull   $acc_l.1q, $res0.1d, $h4.1d                      @ GHASH final-3 block - low
2848
2849	mov     $acc_md, $h34k.d[1]                              @ GHASH final-3 block - mid
2850
2851	eor     $rk4v.8b, $rk4v.8b, $res0.8b                     @ GHASH final-3 block - mid
2852
2853	movi    $t0.8b, #0                                       @ suppress further partial tag feed in
2854
2855	pmull2  $acc_h.1q, $res0.2d, $h4.2d                      @ GHASH final-3 block - high
2856
2857	pmull   $acc_m.1q, $rk4v.1d, $acc_m.1d                   @ GHASH final-3 block - mid
2858	eor     $res1b, $res1b, $ctr1b                           @ AES final-2 block - result
2859	.L192_enc_blocks_more_than_2:                            @ blocks left >  2
2860
2861	st1     { $res1b}, [$output_ptr], #16                    @ AES final-2 block - store result
2862
2863	rev64   $res0b, $res1b                                   @ GHASH final-2 block
2864	ldp     $input_l0, $input_h0, [$input_ptr], #16          @ AES final-1 block - load input low & high
2865#ifdef __AARCH64EB__
2866	rev     $input_l0, $input_l0
2867	rev     $input_h0, $input_h0
2868#endif
2869	eor     $res0b, $res0b, $t0.16b                          @ feed in partial tag
2870
2871	eor     $input_h0, $input_h0, $rk12_h                    @ AES final-1 block - round 12 high
2872
2873	pmull2  $rk2q1, $res0.2d, $h3.2d                         @ GHASH final-2 block - high
2874	mov     $rk4d, $res0.d[1]                                @ GHASH final-2 block - mid
2875
2876	pmull   $rk3q1, $res0.1d, $h3.1d                         @ GHASH final-2 block - low
2877	eor     $input_l0, $input_l0, $rk12_l                    @ AES final-1 block - round 12 low
2878
2879	fmov    $res1d, $input_l0                                @ AES final-1 block - mov low
2880
2881	fmov    $res1.d[1], $input_h0                            @ AES final-1 block - mov high
2882	eor     $acc_hb, $acc_hb, $rk2                           @ GHASH final-2 block - high
2883	eor     $rk4v.8b, $rk4v.8b, $res0.8b                     @ GHASH final-2 block - mid
2884
2885	eor     $acc_lb, $acc_lb, $rk3                           @ GHASH final-2 block - low
2886
2887	pmull   $rk4v.1q, $rk4v.1d, $h34k.1d                     @ GHASH final-2 block - mid
2888
2889	movi    $t0.8b, #0                                       @ suppress further partial tag feed in
2890
2891	eor     $res1b, $res1b, $ctr2b                           @ AES final-1 block - result
2892
2893	eor     $acc_mb, $acc_mb, $rk4v.16b                      @ GHASH final-2 block - mid
2894	.L192_enc_blocks_more_than_1:                            @ blocks left >  1
2895
2896	st1     { $res1b}, [$output_ptr], #16                    @ AES final-1 block - store result
2897
2898	ldp     $input_l0, $input_h0, [$input_ptr], #16          @ AES final block - load input low & high
2899#ifdef __AARCH64EB__
2900	rev     $input_l0, $input_l0
2901	rev     $input_h0, $input_h0
2902#endif
2903	rev64   $res0b, $res1b                                   @ GHASH final-1 block
2904
2905	eor     $input_l0, $input_l0, $rk12_l                    @ AES final block - round 12 low
2906	eor     $res0b, $res0b, $t0.16b                          @ feed in partial tag
2907	movi    $t0.8b, #0                                       @ suppress further partial tag feed in
2908
2909	mov     $rk4d, $res0.d[1]                                @ GHASH final-1 block - mid
2910
2911	eor     $rk4v.8b, $rk4v.8b, $res0.8b                     @ GHASH final-1 block - mid
2912	eor     $input_h0, $input_h0, $rk12_h                    @ AES final block - round 12 high
2913	fmov    $res1d, $input_l0                                @ AES final block - mov low
2914
2915	pmull2  $rk2q1, $res0.2d, $h2.2d                         @ GHASH final-1 block - high
2916	fmov    $res1.d[1], $input_h0                            @ AES final block - mov high
2917
2918	ins     $rk4v.d[1], $rk4v.d[0]                           @ GHASH final-1 block - mid
2919
2920	eor     $acc_hb, $acc_hb, $rk2                           @ GHASH final-1 block - high
2921
2922	pmull   $rk3q1, $res0.1d, $h2.1d                         @ GHASH final-1 block - low
2923
2924	pmull2  $rk4v.1q, $rk4v.2d, $h12k.2d                     @ GHASH final-1 block - mid
2925
2926	eor     $res1b, $res1b, $ctr3b                           @ AES final block - result
2927
2928	eor     $acc_lb, $acc_lb, $rk3                           @ GHASH final-1 block - low
2929
2930	eor     $acc_mb, $acc_mb, $rk4v.16b                      @ GHASH final-1 block - mid
2931	.L192_enc_blocks_less_than_1:                            @ blocks left <= 1
2932
2933	ld1     { $rk0}, [$output_ptr]                           @ load existing bytes where the possibly partial last block is to be stored
2934#ifndef __AARCH64EB__
2935	rev     $ctr32w, $rctr32w
2936#else
2937	mov     $ctr32w, $rctr32w
2938#endif
2939	and     $bit_length, $bit_length, #127                   @ bit_length %= 128
2940
2941	sub     $bit_length, $bit_length, #128                   @ bit_length -= 128
2942	mvn     $rk12_h, xzr                                     @ rk12_h = 0xffffffffffffffff
2943
2944	neg     $bit_length, $bit_length                         @ bit_length = 128 - #bits in input (in range [1,128])
2945	mvn     $rk12_l, xzr                                     @ rk12_l = 0xffffffffffffffff
2946
2947	and     $bit_length, $bit_length, #127                   @ bit_length %= 128
2948
2949	lsr     $rk12_h, $rk12_h, $bit_length                    @ rk12_h is mask for top 64b of last block
2950	cmp     $bit_length, #64
2951
2952	csel    $input_l0, $rk12_l, $rk12_h, lt
2953	csel    $input_h0, $rk12_h, xzr, lt
2954
2955	fmov    $ctr0d, $input_l0                                @ ctr0b is mask for last block
2956
2957	fmov    $ctr0.d[1], $input_h0
2958
2959	and     $res1b, $res1b, $ctr0b                           @ possibly partial last block has zeroes in highest bits
2960
2961	rev64   $res0b, $res1b                                   @ GHASH final block
2962
2963	eor     $res0b, $res0b, $t0.16b                          @ feed in partial tag
2964
2965	mov     $t0d, $res0.d[1]                                 @ GHASH final block - mid
2966
2967	pmull   $rk3q1, $res0.1d, $h1.1d                         @ GHASH final block - low
2968
2969	pmull2  $rk2q1, $res0.2d, $h1.2d                         @ GHASH final block - high
2970
2971	eor     $t0.8b, $t0.8b, $res0.8b                         @ GHASH final block - mid
2972
2973	eor     $acc_lb, $acc_lb, $rk3                           @ GHASH final block - low
2974
2975	eor     $acc_hb, $acc_hb, $rk2                           @ GHASH final block - high
2976
2977	pmull   $t0.1q, $t0.1d, $h12k.1d                         @ GHASH final block - mid
2978
2979	eor     $acc_mb, $acc_mb, $t0.16b                        @ GHASH final block - mid
2980	movi    $mod_constant.8b, #0xc2
2981
2982	eor     $t9.16b, $acc_lb, $acc_hb                        @ MODULO - karatsuba tidy up
2983
2984	shl     $mod_constantd, $mod_constantd, #56              @ mod_constant
2985
2986	bif     $res1b, $rk0, $ctr0b                             @ insert existing bytes in top end of result before storing
2987
2988	eor     $acc_mb, $acc_mb, $t9.16b                        @ MODULO - karatsuba tidy up
2989
2990	pmull   $mod_t.1q, $acc_h.1d, $mod_constant.1d           @ MODULO - top 64b align with mid
2991
2992	ext     $acc_hb, $acc_hb, $acc_hb, #8                    @ MODULO - other top alignment
2993
2994	eor     $acc_mb, $acc_mb, $mod_t.16b                     @ MODULO - fold into mid
2995
2996	eor     $acc_mb, $acc_mb, $acc_hb                        @ MODULO - fold into mid
2997
2998	pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d           @ MODULO - mid 64b align with low
2999
3000	ext     $acc_mb, $acc_mb, $acc_mb, #8                    @ MODULO - other mid alignment
3001
3002	eor     $acc_lb, $acc_lb, $acc_hb                        @ MODULO - fold into low
3003	str     $ctr32w, [$counter, #12]                         @ store the updated counter
3004
3005	st1     { $res1b}, [$output_ptr]                         @ store all 16B
3006
3007	eor     $acc_lb, $acc_lb, $acc_mb                        @ MODULO - fold into low
3008	ext     $acc_lb, $acc_lb, $acc_lb, #8
3009	rev64   $acc_lb, $acc_lb
3010	mov     x0, $len
3011	st1     { $acc_l.16b }, [$current_tag]
3012
3013	ldp     x21, x22, [sp, #16]
3014	ldp     x23, x24, [sp, #32]
3015	ldp     d8, d9, [sp, #48]
3016	ldp     d10, d11, [sp, #64]
3017	ldp     d12, d13, [sp, #80]
3018	ldp     d14, d15, [sp, #96]
3019	ldp     x19, x20, [sp], #112
3020	ret
3021
3022.L192_enc_ret:
3023	mov w0, #0x0
3024	ret
3025.size aes_gcm_enc_192_kernel,.-aes_gcm_enc_192_kernel
3026___
3027
3028#########################################################################################
3029# size_t aes_gcm_dec_192_kernel(const unsigned char *in,
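3030#                               size_t len,   /* NOTE(review): appears to be a length in BITS - the body shifts $bit_length right by 3 to obtain byte_len; confirm against the caller */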
3031#                               unsigned char *out,
3032#                               const void *key,
3033#                               unsigned char ivec[16],
3034#                               u64 *Xi);
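3035# NOTE(review): the body copies x4 to x16 and x5 to x8, then loads the round keys through $cc and the hash state through $current_tag. If $cc is x8 and $current_tag is x3 (those bindings are defined outside this excerpt), key is really the 6th argument and Xi the 4th - verify the parameter order above against the C declaration.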
3036$code.=<<___;
3037.global aes_gcm_dec_192_kernel
3038.type   aes_gcm_dec_192_kernel,%function
3039.align  4
3040aes_gcm_dec_192_kernel:
3041	AARCH64_VALID_CALL_TARGET
3042	cbz     x1, .L192_dec_ret
3043	stp     x19, x20, [sp, #-112]!
3044	mov     x16, x4
3045	mov     x8, x5
3046	stp     x21, x22, [sp, #16]
3047	stp     x23, x24, [sp, #32]
3048	stp     d8, d9, [sp, #48]
3049	stp     d10, d11, [sp, #64]
3050	stp     d12, d13, [sp, #80]
3051	stp     d14, d15, [sp, #96]
3052
3053	add     $end_input_ptr, $input_ptr, $bit_length, lsr #3   @ end_input_ptr
3054	ldp     $ctr96_b64x, $ctr96_t32x, [$counter]              @ ctr96_b64, ctr96_t32
3055#ifdef __AARCH64EB__
3056	rev     $ctr96_b64x, $ctr96_b64x
3057	rev     $ctr96_t32x, $ctr96_t32x
3058#endif
3059	ldp     $rk12_l, $rk12_h, [$cc, #192]                     @ load rk12
3060#ifdef __AARCH64EB__
3061	ror     $rk12_l, $rk12_l, #32
3062	ror     $rk12_h, $rk12_h, #32
3063#endif
3064	ld1     { $ctr0b}, [$counter]                             @ special case vector load initial counter so we can start first AES block as quickly as possible
3065
3066	ld1     {$rk0s}, [$cc], #16                                  @ load rk0
3067
3068	lsr     $main_end_input_ptr, $bit_length, #3              @ byte_len
3069	mov     $len, $main_end_input_ptr
3070	ld1     {$rk1s}, [$cc], #16                               @ load rk1
3071
3072	lsr     $rctr32x, $ctr96_t32x, #32
3073	orr     $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
3074	fmov    $ctr3d, $ctr96_b64x                               @ CTR block 3
3075
3076	rev     $rctr32w, $rctr32w                                @ rev_ctr32
3077	fmov    $ctr1d, $ctr96_b64x                               @ CTR block 1
3078
3079	add     $rctr32w, $rctr32w, #1                            @ increment rev_ctr32
3080	ld1     {$rk2s}, [$cc], #16                               @ load rk2
3081
3082	aese    $ctr0b, $rk0  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 0
3083	rev     $ctr32w, $rctr32w                                 @ CTR block 1
3084
3085	add     $rctr32w, $rctr32w, #1                            @ CTR block 1
3086	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 1
3087	ld1     {$rk3s}, [$cc], #16                               @ load rk3
3088
3089	fmov    $ctr1.d[1], $ctr32x                               @ CTR block 1
3090	rev     $ctr32w, $rctr32w                                 @ CTR block 2
3091	add     $rctr32w, $rctr32w, #1                            @ CTR block 2
3092
3093	fmov    $ctr2d, $ctr96_b64x                               @ CTR block 2
3094	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 2
3095
3096	fmov    $ctr2.d[1], $ctr32x                               @ CTR block 2
3097	rev     $ctr32w, $rctr32w                                 @ CTR block 3
3098
3099	aese    $ctr0b, $rk1  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 1
3100	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 3
3101
3102	fmov    $ctr3.d[1], $ctr32x                               @ CTR block 3
3103
3104	ld1     {$rk4s}, [$cc], #16                               @ load rk4
3105
3106	aese    $ctr0b, $rk2  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 2
3107
3108	aese    $ctr2b, $rk0  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 0
3109	ld1     {$rk5s}, [$cc], #16                               @ load rk5
3110
3111	aese    $ctr1b, $rk0  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 0
3112	ldr     $h4q, [$current_tag, #112]                        @ load h4l | h4h
3113#ifndef __AARCH64EB__
3114	ext     $h4b, $h4b, $h4b, #8
3115#endif
3116	aese    $ctr3b, $rk0  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 0
3117	ldr     $h2q, [$current_tag, #64]                         @ load h2l | h2h
3118#ifndef __AARCH64EB__
3119	ext     $h2b, $h2b, $h2b, #8
3120#endif
3121	aese    $ctr2b, $rk1  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 1
3122	ldr     $h3q, [$current_tag, #80]                         @ load h3l | h3h
3123#ifndef __AARCH64EB__
3124	ext     $h3b, $h3b, $h3b, #8
3125#endif
3126	aese    $ctr1b, $rk1  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 1
3127
3128	aese    $ctr3b, $rk1  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 1
3129	ldr     $h1q, [$current_tag, #32]                         @ load h1l | h1h
3130#ifndef __AARCH64EB__
3131	ext     $h1b, $h1b, $h1b, #8
3132#endif
3133	aese    $ctr2b, $rk2  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 2
3134	ld1     {$rk6s}, [$cc], #16                               @ load rk6
3135
3136	aese    $ctr0b, $rk3  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 3
3137	ld1     {$rk7s}, [$cc], #16                               @ load rk7
3138
3139	aese    $ctr1b, $rk2  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 2
3140	ld1     {$rk8s}, [$cc], #16                               @ load rk8
3141
3142	aese    $ctr3b, $rk2  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 2
3143	ld1     {$rk9s}, [$cc], #16                               @ load rk9
3144
3145	aese    $ctr2b, $rk3  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 3
3146	ld1     { $acc_lb}, [$current_tag]
3147	ext     $acc_lb, $acc_lb, $acc_lb, #8
3148	rev64   $acc_lb, $acc_lb
3149
3150	aese    $ctr1b, $rk3  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 3
3151	add     $rctr32w, $rctr32w, #1                            @ CTR block 3
3152
3153	aese    $ctr3b, $rk3  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 3
3154	trn1    $acc_h.2d, $h3.2d,    $h4.2d                      @ h4h | h3h
3155
3156	aese    $ctr0b, $rk4  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 4
3157	ld1     {$rk10s}, [$cc], #16                              @ load rk10
3158
3159	aese    $ctr1b, $rk4  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 4
3160	trn2    $h34k.2d,  $h3.2d,    $h4.2d                      @ h4l | h3l
3161
3162	aese    $ctr2b, $rk4  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 4
3163
3164	aese    $ctr3b, $rk4  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 4
3165	trn2    $h12k.2d,  $h1.2d,    $h2.2d                      @ h2l | h1l
3166
3167	aese    $ctr0b, $rk5  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 5
3168	ld1     {$rk11s}, [$cc], #16                              @ load rk11
3169
3170	aese    $ctr1b, $rk5  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 5
3171
3172	aese    $ctr2b, $rk5  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 5
3173
3174	aese    $ctr3b, $rk5  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 5
3175
3176	aese    $ctr0b, $rk6  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 6
3177
3178	aese    $ctr2b, $rk6  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 6
3179
3180	aese    $ctr3b, $rk6  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 6
3181
3182	aese    $ctr0b, $rk7  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 7
3183
3184	aese    $ctr2b, $rk7  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 7
3185
3186	aese    $ctr3b, $rk7  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 7
3187
3188	aese    $ctr1b, $rk6  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 6
3189
3190	aese    $ctr2b, $rk8  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 8
3191
3192	aese    $ctr3b, $rk8  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 8
3193
3194	aese    $ctr1b, $rk7  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 7
3195
3196	aese    $ctr2b, $rk9  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 9
3197
3198	aese    $ctr3b, $rk9  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 9
3199
3200	aese    $ctr1b, $rk8  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 8
3201	sub     $main_end_input_ptr, $main_end_input_ptr, #1      @ byte_len - 1
3202
3203	aese    $ctr0b, $rk8  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 8
3204	and     $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0    @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
3205
3206	aese    $ctr3b, $rk10 \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 10
3207	add     $main_end_input_ptr, $main_end_input_ptr, $input_ptr
3208
3209	aese    $ctr1b, $rk9  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 9
3210	cmp     $input_ptr, $main_end_input_ptr                   @ check if we have <= 4 blocks
3211
3212	aese    $ctr0b, $rk9  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 9
3213	trn1    $t0.2d,    $h1.2d,    $h2.2d                      @ h2h | h1h
3214
3215	aese    $ctr3b, $rk11                                     @ AES block 3 - round 11
3216
3217	aese    $ctr2b, $rk10 \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 10
3218
3219	aese    $ctr1b, $rk10 \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 10
3220
3221	aese    $ctr0b, $rk10 \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 10
3222	eor     $h12k.16b, $h12k.16b, $t0.16b                     @ h2k | h1k
3223
3224	aese    $ctr2b, $rk11                                     @ AES block 2 - round 11
3225
3226	aese    $ctr1b, $rk11                                     @ AES block 1 - round 11
3227	eor     $h34k.16b, $h34k.16b, $acc_h.16b                  @ h4k | h3k
3228
3229	aese    $ctr0b, $rk11                                     @ AES block 0 - round 11
3230	b.ge    .L192_dec_tail                                    @ handle tail
3231
3232	ld1     {$res0b, $res1b}, [$input_ptr], #32               @ AES block 0,1 - load ciphertext
3233
3234	eor     $ctr1b, $res1b, $ctr1b                            @ AES block 1 - result
3235
3236	eor     $ctr0b, $res0b, $ctr0b                            @ AES block 0 - result
3237	rev     $ctr32w, $rctr32w                                 @ CTR block 4
3238	ld1     {$res2b, $res3b}, [$input_ptr], #32               @ AES block 2,3 - load ciphertext
3239
3240	mov     $output_l1, $ctr1.d[0]                            @ AES block 1 - mov low
3241
3242	mov     $output_h1, $ctr1.d[1]                            @ AES block 1 - mov high
3243
3244	mov     $output_l0, $ctr0.d[0]                            @ AES block 0 - mov low
3245	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 4
3246	add     $rctr32w, $rctr32w, #1                            @ CTR block 4
3247
3248	mov     $output_h0, $ctr0.d[1]                            @ AES block 0 - mov high
3249	rev64   $res0b, $res0b                                    @ GHASH block 0
3250
3251	fmov    $ctr0d, $ctr96_b64x                               @ CTR block 4
3252	rev64   $res1b, $res1b                                    @ GHASH block 1
3253	cmp     $input_ptr, $main_end_input_ptr                   @ check if we have <= 8 blocks
3254
3255	eor     $output_l1, $output_l1, $rk12_l                   @ AES block 1 - round 12 low
3256#ifdef __AARCH64EB__
3257	rev     $output_l1, $output_l1
3258#endif
3259	fmov    $ctr0.d[1], $ctr32x                               @ CTR block 4
3260	rev     $ctr32w, $rctr32w                                 @ CTR block 5
3261
3262	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 5
3263	fmov    $ctr1d, $ctr96_b64x                               @ CTR block 5
3264	eor     $output_h1, $output_h1, $rk12_h                   @ AES block 1 - round 12 high
3265#ifdef __AARCH64EB__
3266	rev     $output_h1, $output_h1
3267#endif
3268	add     $rctr32w, $rctr32w, #1                            @ CTR block 5
3269	fmov    $ctr1.d[1], $ctr32x                               @ CTR block 5
3270	eor     $output_l0, $output_l0, $rk12_l                   @ AES block 0 - round 12 low
3271#ifdef __AARCH64EB__
3272	rev     $output_l0, $output_l0
3273#endif
3274	rev     $ctr32w, $rctr32w                                 @ CTR block 6
3275	eor     $output_h0, $output_h0, $rk12_h                   @ AES block 0 - round 12 high
3276#ifdef __AARCH64EB__
3277	rev     $output_h0, $output_h0
3278#endif
3279	stp     $output_l0, $output_h0, [$output_ptr], #16        @ AES block 0 - store result
3280	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 6
3281
3282	stp     $output_l1, $output_h1, [$output_ptr], #16        @ AES block 1 - store result
3283
3284	add     $rctr32w, $rctr32w, #1                            @ CTR block 6
3285	eor     $ctr2b, $res2b, $ctr2b                            @ AES block 2 - result
3286	b.ge    .L192_dec_prepretail                              @ do prepretail
3287
3288	.L192_dec_main_loop:                                      @ main loop start
	@ Steady-state loop. One pass consumes the four ciphertext blocks already
	@ held in res0-res3 and sets up the state for the next pass:
	@  - finishes output blocks 4k+2 and 4k+3 (final eor with rk12_l/rk12_h is
	@    done in general registers) and stores them
	@  - folds res0-res3 into the GHASH accumulators acc_l/acc_m/acc_h using
	@    h4..h1 plus the h34k/h12k mid terms, then runs the MODULO steps
	@  - runs AES rounds 0-11 on counter blocks 4k+4..4k+7 (ctr0-ctr3)
	@  - loads ciphertext 4k+4..4k+7, then finishes and stores blocks 4k+4, 4k+5
	@  - builds counter blocks 4k+8 and 4k+9 in ctr0/ctr1 and stages the
	@    counter word for 4k+10 in ctr32x
	@ The flags from the LOOP CONTROL cmp are consumed by the closing b.lt;
	@ none of the instructions in between sets flags.
3289	aese    $ctr1b, $rk0  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 0
3290	ext     $acc_lb, $acc_lb, $acc_lb, #8                     @ PRE 0
3291
3292	pmull   $t2.1q, $res1.1d, $h3.1d                          @ GHASH block 4k+1 - low
3293	mov     $output_l2, $ctr2.d[0]                            @ AES block 4k+2 - mov low
3294
3295	mov     $output_h2, $ctr2.d[1]                            @ AES block 4k+2 - mov high
3296	eor     $ctr3b, $res3b, $ctr3b                            @ AES block 4k+3 - result
3297	rev64   $res3b, $res3b                                    @ GHASH block 4k+3
3298
3299	aese    $ctr1b, $rk1  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 1
3300	fmov    $ctr2d, $ctr96_b64x                               @ CTR block 4k+6
3301
3302	aese    $ctr0b, $rk0  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 0
3303	eor     $res0b, $res0b, $acc_lb                           @ PRE 1
3304
3305	pmull2  $t1.1q, $res1.2d, $h3.2d                          @ GHASH block 4k+1 - high
3306	fmov    $ctr2.d[1], $ctr32x                               @ CTR block 4k+6
3307
3308	aese    $ctr1b, $rk2  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 2
3309	mov     $output_h3, $ctr3.d[1]                            @ AES block 4k+3 - mov high
3310
3311	aese    $ctr0b, $rk1  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 1
3312	mov     $output_l3, $ctr3.d[0]                            @ AES block 4k+3 - mov low
3313
3314	pmull2  $acc_h.1q, $res0.2d, $h4.2d                       @ GHASH block 4k - high
3315	fmov    $ctr3d, $ctr96_b64x                               @ CTR block 4k+7
3316	mov     $t0d, $res0.d[1]                                  @ GHASH block 4k - mid
3317
3318	pmull   $acc_l.1q, $res0.1d, $h4.1d                       @ GHASH block 4k - low
3319	mov     $acc_md, $h34k.d[1]                               @ GHASH block 4k - mid
3320	rev     $ctr32w, $rctr32w                                 @ CTR block 4k+7
3321
3322	aese    $ctr2b, $rk0  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 0
3323	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 4k+7
3324
3325	fmov    $ctr3.d[1], $ctr32x                               @ CTR block 4k+7
3326	eor     $t0.8b, $t0.8b, $res0.8b                          @ GHASH block 4k - mid
3327	mov     $t3d, $res1.d[1]                                  @ GHASH block 4k+1 - mid
3328
3329	aese    $ctr1b, $rk3  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 3
3330
3331	aese    $ctr0b, $rk2  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 2
3332	eor     $output_h2, $output_h2, $rk12_h                   @ AES block 4k+2 - round 12 high
3333#ifdef __AARCH64EB__
3334	rev     $output_h2, $output_h2                            @ big-endian build only: byte-swap output word
3335#endif
3336	aese    $ctr2b, $rk1  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 1
3337	eor     $t3.8b, $t3.8b, $res1.8b                          @ GHASH block 4k+1 - mid
3338
3339	pmull   $acc_m.1q, $t0.1d, $acc_m.1d                      @ GHASH block 4k - mid
3340
3341	aese    $ctr3b, $rk0  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 0
3342	rev64   $res2b, $res2b                                    @ GHASH block 4k+2
3343
3344	aese    $ctr2b, $rk2  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 2
3345
3346	pmull   $t3.1q, $t3.1d, $h34k.1d                          @ GHASH block 4k+1 - mid
3347	eor     $acc_lb, $acc_lb, $t2.16b                         @ GHASH block 4k+1 - low
3348	eor     $output_l2, $output_l2, $rk12_l                   @ AES block 4k+2 - round 12 low
3349#ifdef __AARCH64EB__
3350	rev     $output_l2, $output_l2                            @ big-endian build only: byte-swap output word
3351#endif
3352	aese    $ctr1b, $rk4  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 4
3353
3354	aese    $ctr0b, $rk3  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 3
3355
3356	eor     $acc_mb, $acc_mb, $t3.16b                         @ GHASH block 4k+1 - mid
3357	mov     $t6d, $res2.d[1]                                  @ GHASH block 4k+2 - mid
3358
3359	aese    $ctr3b, $rk1  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 1
3360	eor     $acc_hb, $acc_hb, $t1.16b                         @ GHASH block 4k+1 - high
3361
3362	aese    $ctr0b, $rk4  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 4
3363
3364	pmull2  $t4.1q, $res2.2d, $h2.2d                          @ GHASH block 4k+2 - high
3365	eor     $t6.8b, $t6.8b, $res2.8b                          @ GHASH block 4k+2 - mid
3366
3367	pmull   $t5.1q, $res2.1d, $h2.1d                          @ GHASH block 4k+2 - low
3368
3369	aese    $ctr0b, $rk5  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 5
3370
3371	eor     $acc_hb, $acc_hb, $t4.16b                         @ GHASH block 4k+2 - high
3372	mov     $t9d, $res3.d[1]                                  @ GHASH block 4k+3 - mid
3373
3374	aese    $ctr1b, $rk5  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 5
3375
3376	pmull2  $t7.1q, $res3.2d, $h1.2d                          @ GHASH block 4k+3 - high
3377
3378	aese    $ctr3b, $rk2  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 2
3379	eor     $t9.8b, $t9.8b, $res3.8b                          @ GHASH block 4k+3 - mid
3380
3381	aese    $ctr1b, $rk6  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 6
3382
3383	aese    $ctr0b, $rk6  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 6
3384	ins     $t6.d[1], $t6.d[0]                                @ GHASH block 4k+2 - mid
3385
3386	aese    $ctr3b, $rk3  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 3
3387
3388	pmull   $t9.1q, $t9.1d, $h12k.1d                          @ GHASH block 4k+3 - mid
3389	eor     $acc_lb, $acc_lb, $t5.16b                         @ GHASH block 4k+2 - low
3390
3391	aese    $ctr0b, $rk7  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 7
3392
3393	pmull2  $t6.1q, $t6.2d, $h12k.2d                          @ GHASH block 4k+2 - mid
3394	eor     $acc_hb, $acc_hb, $t7.16b                         @ GHASH block 4k+3 - high
3395
3396	aese    $ctr1b, $rk7  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 7
3397
3398	aese    $ctr0b, $rk8  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 8
3399	movi    $mod_constant.8b, #0xc2                           @ MODULO - fill low 64b with 0xc2 bytes (shl by 56 follows)
3400
3401	pmull   $t8.1q, $res3.1d, $h1.1d                          @ GHASH block 4k+3 - low
3402
3403	aese    $ctr1b, $rk8  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 8
3404	eor     $acc_mb, $acc_mb, $t6.16b                         @ GHASH block 4k+2 - mid
3405
3406	aese    $ctr2b, $rk3  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 3
3407
3408	aese    $ctr0b, $rk9  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 9
3409	eor     $acc_lb, $acc_lb, $t8.16b                         @ GHASH block 4k+3 - low
3410
3411	aese    $ctr3b, $rk4  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 4
3412
3413	aese    $ctr2b, $rk4  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 4
3414	eor     $acc_mb, $acc_mb, $t9.16b                         @ GHASH block 4k+3 - mid
3415
3416	aese    $ctr0b, $rk10 \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 10
3417
3418	aese    $ctr1b, $rk9  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 9
3419	eor     $t9.16b, $acc_lb, $acc_hb                         @ MODULO - karatsuba tidy up
3420
3421	aese    $ctr2b, $rk5  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 5
3422
3423	aese    $ctr3b, $rk5  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 5
3424	shl     $mod_constantd, $mod_constantd, #56               @ mod_constant
3425
3426	aese    $ctr1b, $rk10 \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 10
3427
3428	aese    $ctr2b, $rk6  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 6
3429	ld1     {$res0b}, [$input_ptr], #16                       @ AES block 4k+4 - load ciphertext
3430
3431	aese    $ctr3b, $rk6  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 6
3432	eor     $acc_mb, $acc_mb, $t9.16b                         @ MODULO - karatsuba tidy up
3433
3434	pmull   $mod_t.1q, $acc_h.1d, $mod_constant.1d            @ MODULO - top 64b align with mid
3435	ld1     {$res1b}, [$input_ptr], #16                       @ AES block 4k+5 - load ciphertext
3436	eor     $output_l3, $output_l3, $rk12_l                   @ AES block 4k+3 - round 12 low
3437#ifdef __AARCH64EB__
3438	rev     $output_l3, $output_l3                            @ big-endian build only: byte-swap output word
3439#endif
3440	aese    $ctr2b, $rk7  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 7
3441	ext     $acc_hb, $acc_hb, $acc_hb, #8                     @ MODULO - other top alignment
3442
3443	aese    $ctr0b, $rk11                                     @ AES block 4k+4 - round 11
3444	add     $rctr32w, $rctr32w, #1                            @ CTR block 4k+7
3445
3446	aese    $ctr3b, $rk7  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 7
3447	eor     $acc_mb, $acc_mb, $mod_t.16b                      @ MODULO - fold into mid
3448
3449	aese    $ctr2b, $rk8  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 8
3450	ld1     {$res2b}, [$input_ptr], #16                       @ AES block 4k+6 - load ciphertext
3451
3452	aese    $ctr1b, $rk11                                     @ AES block 4k+5 - round 11
3453	ld1     {$res3b}, [$input_ptr], #16                       @ AES block 4k+7 - load ciphertext
3454	rev     $ctr32w, $rctr32w                                 @ CTR block 4k+8
3455
3456	aese    $ctr3b, $rk8  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 8
3457	stp     $output_l2, $output_h2, [$output_ptr], #16        @ AES block 4k+2 - store result
3458
3459	aese    $ctr2b, $rk9  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 9
3460	eor     $acc_mb, $acc_mb, $acc_hb                         @ MODULO - fold into mid
3461
3462	cmp     $input_ptr, $main_end_input_ptr                   @ LOOP CONTROL
3463
3464	eor     $ctr0b, $res0b, $ctr0b                            @ AES block 4k+4 - result
3465	eor     $output_h3, $output_h3, $rk12_h                   @ AES block 4k+3 - round 12 high
3466#ifdef __AARCH64EB__
3467	rev     $output_h3, $output_h3                            @ big-endian build only: byte-swap output word
3468#endif
3469	eor     $ctr1b, $res1b, $ctr1b                            @ AES block 4k+5 - result
3470
3471	aese    $ctr2b, $rk10 \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 10
3472	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 4k+8
3473
3474	aese    $ctr3b, $rk9  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 9
3475
3476	pmull   $mod_constant.1q, $acc_m.1d, $mod_constant.1d     @ MODULO - mid 64b align with low
3477	mov     $output_l1, $ctr1.d[0]                            @ AES block 4k+5 - mov low
3478
3479	mov     $output_l0, $ctr0.d[0]                            @ AES block 4k+4 - mov low
3480	stp     $output_l3, $output_h3, [$output_ptr], #16        @ AES block 4k+3 - store result
3481	rev64   $res1b, $res1b                                    @ GHASH block 4k+5
3482
3483	aese    $ctr2b, $rk11                                     @ AES block 4k+6 - round 11
3484	mov     $output_h0, $ctr0.d[1]                            @ AES block 4k+4 - mov high
3485
3486	aese    $ctr3b, $rk10 \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 10
3487	mov     $output_h1, $ctr1.d[1]                            @ AES block 4k+5 - mov high
3488
3489	fmov    $ctr0d, $ctr96_b64x                               @ CTR block 4k+8
3490	add     $rctr32w, $rctr32w, #1                            @ CTR block 4k+8
3491	ext     $acc_mb, $acc_mb, $acc_mb, #8                     @ MODULO - other mid alignment
3492
3493	eor     $ctr2b, $res2b, $ctr2b                            @ AES block 4k+6 - result
3494	fmov    $ctr0.d[1], $ctr32x                               @ CTR block 4k+8
3495	rev     $ctr32w, $rctr32w                                 @ CTR block 4k+9
3496
3497	eor     $output_l0, $output_l0, $rk12_l                   @ AES block 4k+4 - round 12 low
3498#ifdef __AARCH64EB__
3499	rev     $output_l0, $output_l0                            @ big-endian build only: byte-swap output word
3500#endif
3501	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 4k+9
3502	eor     $acc_lb, $acc_lb, $mod_constant.16b               @ MODULO - fold into low
3503
3504	fmov    $ctr1d, $ctr96_b64x                               @ CTR block 4k+9
3505	add     $rctr32w, $rctr32w, #1                            @ CTR block 4k+9
3506	eor     $output_l1, $output_l1, $rk12_l                   @ AES block 4k+5 - round 12 low
3507#ifdef __AARCH64EB__
3508	rev     $output_l1, $output_l1                            @ big-endian build only: byte-swap output word
3509#endif
3510	fmov    $ctr1.d[1], $ctr32x                               @ CTR block 4k+9
3511	rev     $ctr32w, $rctr32w                                 @ CTR block 4k+10
3512	eor     $output_h1, $output_h1, $rk12_h                   @ AES block 4k+5 - round 12 high
3513#ifdef __AARCH64EB__
3514	rev     $output_h1, $output_h1                            @ big-endian build only: byte-swap output word
3515#endif
3516	eor     $output_h0, $output_h0, $rk12_h                   @ AES block 4k+4 - round 12 high
3517#ifdef __AARCH64EB__
3518	rev     $output_h0, $output_h0                            @ big-endian build only: byte-swap output word
3519#endif
3520	stp     $output_l0, $output_h0, [$output_ptr], #16        @ AES block 4k+4 - store result
3521	eor     $acc_lb, $acc_lb, $acc_mb                         @ MODULO - fold into low
3522
3523	add     $rctr32w, $rctr32w, #1                            @ CTR block 4k+10
3524	rev64   $res0b, $res0b                                    @ GHASH block 4k+4
3525	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 4k+10
3526
3527	aese    $ctr3b, $rk11                                     @ AES block 4k+7 - round 11
3528	stp     $output_l1, $output_h1, [$output_ptr], #16        @ AES block 4k+5 - store result
3529	b.lt    .L192_dec_main_loop
3530
3531	.L192_dec_prepretail:                                     @ PREPRETAIL
	@ Entered by the b.ge ahead of the main loop or by falling out of the loop.
	@ Drains the pipeline without loading any new input:
	@  - finishes output blocks 4k+2 and 4k+3 (final eor with rk12_l/rk12_h in
	@    general registers) and stores them
	@  - folds res0-res3 into the GHASH accumulators and runs the MODULO steps,
	@    ending with the folded value in acc_l
	@  - runs AES rounds 0-11 on counter blocks 4k+4..4k+7 (ctr0-ctr3)
	@ There is no branch at the end; execution continues with the code that
	@ follows this section.
3532	mov     $output_h2, $ctr2.d[1]                            @ AES block 4k+2 - mov high
3533	ext     $acc_lb, $acc_lb, $acc_lb, #8                     @ PRE 0
3534	eor     $ctr3b, $res3b, $ctr3b                            @ AES block 4k+3 - result
3535
3536	aese    $ctr1b, $rk0  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 0
3537	mov     $output_l2, $ctr2.d[0]                            @ AES block 4k+2 - mov low
3538
3539	aese    $ctr0b, $rk0  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 0
3540	mov     $acc_md, $h34k.d[1]                               @ GHASH block 4k - mid
3541
3542	eor     $res0b, $res0b, $acc_lb                           @ PRE 1
3543	fmov    $ctr2d, $ctr96_b64x                               @ CTR block 4k+6
3544
3545	aese    $ctr1b, $rk1  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 1
3546	mov     $output_l3, $ctr3.d[0]                            @ AES block 4k+3 - mov low
3547
3548	aese    $ctr0b, $rk1  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 1
3549	mov     $output_h3, $ctr3.d[1]                            @ AES block 4k+3 - mov high
3550
3551	pmull   $acc_l.1q, $res0.1d, $h4.1d                       @ GHASH block 4k - low
3552	mov     $t0d, $res0.d[1]                                  @ GHASH block 4k - mid
3553	fmov    $ctr3d, $ctr96_b64x                               @ CTR block 4k+7
3554
3555	aese    $ctr1b, $rk2  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 2
3556	rev64   $res2b, $res2b                                    @ GHASH block 4k+2
3557
3558	pmull2  $acc_h.1q, $res0.2d, $h4.2d                       @ GHASH block 4k - high
3559	fmov    $ctr2.d[1], $ctr32x                               @ CTR block 4k+6
3560	rev     $ctr32w, $rctr32w                                 @ CTR block 4k+7
3561
3562	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 4k+7
3563	eor     $t0.8b, $t0.8b, $res0.8b                          @ GHASH block 4k - mid
3564	mov     $t3d, $res1.d[1]                                  @ GHASH block 4k+1 - mid
3565
3566	pmull   $t2.1q, $res1.1d, $h3.1d                          @ GHASH block 4k+1 - low
3567	eor     $output_h3, $output_h3, $rk12_h                   @ AES block 4k+3 - round 12 high
3568#ifdef __AARCH64EB__
3569	rev     $output_h3, $output_h3                            @ big-endian build only: byte-swap output word
3570#endif
3571	fmov    $ctr3.d[1], $ctr32x                               @ CTR block 4k+7
3572
3573	aese    $ctr0b, $rk2  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 2
3574	eor     $output_l2, $output_l2, $rk12_l                   @ AES block 4k+2 - round 12 low
3575#ifdef __AARCH64EB__
3576	rev     $output_l2, $output_l2                            @ big-endian build only: byte-swap output word
3577#endif
3578	pmull2  $t1.1q, $res1.2d, $h3.2d                          @ GHASH block 4k+1 - high
3579	eor     $output_h2, $output_h2, $rk12_h                   @ AES block 4k+2 - round 12 high
3580#ifdef __AARCH64EB__
3581	rev     $output_h2, $output_h2                            @ big-endian build only: byte-swap output word
3582#endif
3583	eor     $t3.8b, $t3.8b, $res1.8b                          @ GHASH block 4k+1 - mid
3584
3585	pmull   $acc_m.1q, $t0.1d, $acc_m.1d                      @ GHASH block 4k - mid
3586	eor     $output_l3, $output_l3, $rk12_l                   @ AES block 4k+3 - round 12 low
3587#ifdef __AARCH64EB__
3588	rev     $output_l3, $output_l3                            @ big-endian build only: byte-swap output word
3589#endif
3590	stp     $output_l2, $output_h2, [$output_ptr], #16        @ AES block 4k+2 - store result
3591
3592	rev64   $res3b, $res3b                                    @ GHASH block 4k+3
3593	stp     $output_l3, $output_h3, [$output_ptr], #16        @ AES block 4k+3 - store result
3594
3595	aese    $ctr3b, $rk0  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 0
3596	eor     $acc_hb, $acc_hb, $t1.16b                         @ GHASH block 4k+1 - high
3597
3598	pmull   $t3.1q, $t3.1d, $h34k.1d                          @ GHASH block 4k+1 - mid
3599	add     $rctr32w, $rctr32w, #1                            @ CTR block 4k+7
3600
3601	pmull2  $t4.1q, $res2.2d, $h2.2d                          @ GHASH block 4k+2 - high
3602	eor     $acc_lb, $acc_lb, $t2.16b                         @ GHASH block 4k+1 - low
3603
3604	aese    $ctr2b, $rk0  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 0
3605
3606	eor     $acc_mb, $acc_mb, $t3.16b                         @ GHASH block 4k+1 - mid
3607	mov     $t6d, $res2.d[1]                                  @ GHASH block 4k+2 - mid
3608
3609	aese    $ctr3b, $rk1  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 1
3610
3611	aese    $ctr2b, $rk1  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 1
3612	eor     $acc_hb, $acc_hb, $t4.16b                         @ GHASH block 4k+2 - high
3613
3614	eor     $t6.8b, $t6.8b, $res2.8b                          @ GHASH block 4k+2 - mid
3615
3616	pmull   $t5.1q, $res2.1d, $h2.1d                          @ GHASH block 4k+2 - low
3617
3618	aese    $ctr2b, $rk2  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 2
3619	mov     $t9d, $res3.d[1]                                  @ GHASH block 4k+3 - mid
3620
3621	aese    $ctr3b, $rk2  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 2
3622	ins     $t6.d[1], $t6.d[0]                                @ GHASH block 4k+2 - mid
3623
3624	pmull   $t8.1q, $res3.1d, $h1.1d                          @ GHASH block 4k+3 - low
3625
3626	aese    $ctr0b, $rk3  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 3
3627	eor     $t9.8b, $t9.8b, $res3.8b                          @ GHASH block 4k+3 - mid
3628
3629	aese    $ctr1b, $rk3  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 3
3630
3631	pmull2  $t6.1q, $t6.2d, $h12k.2d                          @ GHASH block 4k+2 - mid
3632	eor     $acc_lb, $acc_lb, $t5.16b                         @ GHASH block 4k+2 - low
3633
3634	aese    $ctr0b, $rk4  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 4
3635
3636	pmull2  $t7.1q, $res3.2d, $h1.2d                          @ GHASH block 4k+3 - high
3637	movi    $mod_constant.8b, #0xc2                           @ MODULO - fill low 64b with 0xc2 bytes (shl by 56 follows)
3638
3639	pmull   $t9.1q, $t9.1d, $h12k.1d                          @ GHASH block 4k+3 - mid
3640
3641	aese    $ctr2b, $rk3  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 3
3642
3643	shl     $mod_constantd, $mod_constantd, #56               @ mod_constant
3644	eor     $acc_hb, $acc_hb, $t7.16b                         @ GHASH block 4k+3 - high
3645
3646	aese    $ctr0b, $rk5  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 5
3647	eor     $acc_mb, $acc_mb, $t6.16b                         @ GHASH block 4k+2 - mid
3648
3649	aese    $ctr2b, $rk4  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 4
3650
3651	pmull   $mod_t.1q, $acc_h.1d, $mod_constant.1d            @ MODULO - top 64b align with mid
3652	eor     $acc_lb, $acc_lb, $t8.16b                         @ GHASH block 4k+3 - low
3653
3654	aese    $ctr0b, $rk6  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 6
3655
3656	aese    $ctr3b, $rk3  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 3
3657	eor     $acc_mb, $acc_mb, $t9.16b                         @ GHASH block 4k+3 - mid
3658
3659	aese    $ctr2b, $rk5  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 5
3660
3661	aese    $ctr0b, $rk7  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 7
3662	eor     $t9.16b, $acc_lb, $acc_hb                         @ MODULO - karatsuba tidy up
3663
3664	aese    $ctr3b, $rk4  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 4
3665
3666	aese    $ctr2b, $rk6  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 6
3667	ext     $acc_hb, $acc_hb, $acc_hb, #8                     @ MODULO - other top alignment
3668
3669	aese    $ctr0b, $rk8  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 8
3670
3671	aese    $ctr3b, $rk5  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 5
3672	eor     $acc_mb, $acc_mb, $t9.16b                         @ MODULO - karatsuba tidy up
3673
3674	aese    $ctr1b, $rk4  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 4
3675
3676	aese    $ctr2b, $rk7  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 7
3677
3678	aese    $ctr0b, $rk9  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 9
3679
3680	aese    $ctr1b, $rk5  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 5
3681
3682	aese    $ctr3b, $rk6  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 6
3683	eor     $acc_mb, $acc_mb, $mod_t.16b                      @ MODULO - fold into mid
3684
3685	aese    $ctr0b, $rk10 \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 10
3686
3687	aese    $ctr1b, $rk6  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 6
3688
3689	aese    $ctr3b, $rk7  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 7
3690
3691	aese    $ctr2b, $rk8  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 8
3692	eor     $acc_mb, $acc_mb, $acc_hb                         @ MODULO - fold into mid
3693
3694	aese    $ctr1b, $rk7  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 7
3695
3696	aese    $ctr3b, $rk8  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 8
3697
3698	aese    $ctr2b, $rk9  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 9
3699
3700	aese    $ctr1b, $rk8  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 8
3701
3702	aese    $ctr3b, $rk9  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 9
3703
3704	pmull   $mod_constant.1q, $acc_m.1d, $mod_constant.1d     @ MODULO - mid 64b align with low
3705
3706	aese    $ctr1b, $rk9  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 9
3707
3708	aese    $ctr2b, $rk10 \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 10
3709
3710	aese    $ctr3b, $rk10 \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 10
3711	ext     $acc_mb, $acc_mb, $acc_mb, #8                     @ MODULO - other mid alignment
3712
3713	aese    $ctr1b, $rk10 \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 10
3714
3715	aese    $ctr0b, $rk11                                     @ AES block 4k+4 - round 11
3716	eor     $acc_lb, $acc_lb, $mod_constant.16b               @ MODULO - fold into low
3717
3718	aese    $ctr2b, $rk11                                     @ AES block 4k+6 - round 11
3719
3720	aese    $ctr1b, $rk11                                     @ AES block 4k+5 - round 11
3721
3722	aese    $ctr3b, $rk11                                     @ AES block 4k+7 - round 11
3723
3724	eor     $acc_lb, $acc_lb, $acc_mb                         @ MODULO - fold into low
3725	.L192_dec_tail:                                           @ TAIL
	@ Start of the tail. ctr0 is combined with the next ciphertext block and
	@ rk12_l/rk12_h to give output_l0/output_h0 (not stored in this section),
	@ a half-swapped copy of acc_l is saved in t0, and control is dispatched on
	@ the number of bytes still to process:
	@   more than 48 -> .L192_dec_blocks_more_than_3
	@   more than 32 -> .L192_dec_blocks_more_than_2
	@   more than 16 -> .L192_dec_blocks_more_than_1
	@   otherwise    -> .L192_dec_blocks_less_than_1
	@ Every path other than more_than_3 first zeroes acc_l/acc_h/acc_m, copies
	@ counter-block registers upward and steps rctr32w back once per skipped
	@ block.
	@ NOTE(review): the register copies and counter decrements presumably line
	@ the unused keystream blocks up for the shared code at the branch targets;
	@ confirm against the sections past this one.
3726
3727	sub     $main_end_input_ptr, $end_input_ptr, $input_ptr   @ main_end_input_ptr is number of bytes left to process
3728	ld1     { $res1b}, [$input_ptr], #16                      @ AES block 4k+4 - load ciphertext
3729
3730	eor     $ctr0b, $res1b, $ctr0b                            @ AES block 4k+4 - result
3731
3732	mov     $output_h0, $ctr0.d[1]                            @ AES block 4k+4 - mov high
3733
3734	mov     $output_l0, $ctr0.d[0]                            @ AES block 4k+4 - mov low
3735
3736	ext     $t0.16b, $acc_lb, $acc_lb, #8                     @ prepare final partial tag
3737
3738	cmp     $main_end_input_ptr, #48                          @ more than 48 bytes left? (tested by b.gt below)
3739
3740	eor     $output_h0, $output_h0, $rk12_h                   @ AES block 4k+4 - round 12 high
3741#ifdef __AARCH64EB__
3742	rev     $output_h0, $output_h0                            @ big-endian build only: byte-swap output word
3743#endif
3744	eor     $output_l0, $output_l0, $rk12_l                   @ AES block 4k+4 - round 12 low
3745#ifdef __AARCH64EB__
3746	rev     $output_l0, $output_l0                            @ big-endian build only: byte-swap output word
3747#endif
3748	b.gt    .L192_dec_blocks_more_than_3                      @ uses the flags from the cmp with 48 above
3749
3750	movi    $acc_l.8b, #0                                     @ at most 48 bytes left - clear acc_l (copy kept in t0)
3751	movi    $acc_h.8b, #0                                     @ clear acc_h
3752
3753	mov     $ctr3b, $ctr2b                                    @ ctr3 takes the ctr2 block
3754	mov     $ctr2b, $ctr1b                                    @ ctr2 takes the ctr1 block
3755	sub     $rctr32w, $rctr32w, #1                            @ step the counter back by one
3756
3757	movi    $acc_m.8b, #0                                     @ clear acc_m
3758	cmp     $main_end_input_ptr, #32                          @ more than 32 bytes left?
3759	b.gt    .L192_dec_blocks_more_than_2
3760
3761	mov     $ctr3b, $ctr1b                                    @ at most 32 bytes left - ctr3 takes the ctr1 block
3762	cmp     $main_end_input_ptr, #16                          @ more than 16 bytes left?
3763	sub     $rctr32w, $rctr32w, #1                            @ step the counter back again (sub leaves the flags alone)
3764
3765	b.gt    .L192_dec_blocks_more_than_1
3766
3767	sub     $rctr32w, $rctr32w, #1                            @ at most 16 bytes left - third counter step back
3768	b       .L192_dec_blocks_less_than_1
3769	.L192_dec_blocks_more_than_3:                             @ blocks left >  3
3770	rev64   $res0b, $res1b                                    @ GHASH final-3 block
3771	ld1     { $res1b}, [$input_ptr], #16                      @ AES final-2 block - load ciphertext
3772
3773	stp     $output_l0, $output_h0, [$output_ptr], #16        @ AES final-3 block  - store result
3774
3775	eor     $res0b, $res0b, $t0.16b                           @ feed in partial tag
3776
3777	eor     $ctr0b, $res1b, $ctr1b                            @ AES final-2 block - result
3778
3779	pmull   $acc_l.1q, $res0.1d, $h4.1d                       @ GHASH final-3 block - low
3780	mov     $output_l0, $ctr0.d[0]                            @ AES final-2 block - mov low
3781	mov     $rk4d, $res0.d[1]                                 @ GHASH final-3 block - mid
3782
3783	mov     $output_h0, $ctr0.d[1]                            @ AES final-2 block - mov high
3784
3785	mov     $acc_md, $h34k.d[1]                               @ GHASH final-3 block - mid
3786	eor     $rk4v.8b, $rk4v.8b, $res0.8b                      @ GHASH final-3 block - mid
3787
3788	pmull2  $acc_h.1q, $res0.2d, $h4.2d                       @ GHASH final-3 block - high
3789
3790	eor     $output_l0, $output_l0, $rk12_l                   @ AES final-2 block - round 12 low
3791#ifdef __AARCH64EB__
3792	rev     $output_l0, $output_l0
3793#endif
3794	movi    $t0.8b, #0                                        @ suppress further partial tag feed in
3795
3796	pmull   $acc_m.1q, $rk4v.1d, $acc_m.1d                    @ GHASH final-3 block - mid
3797	eor     $output_h0, $output_h0, $rk12_h                   @ AES final-2 block - round 12 high
3798#ifdef __AARCH64EB__
3799	rev     $output_h0, $output_h0
3800#endif
3801	.L192_dec_blocks_more_than_2:                             @ blocks left >  2
3802
3803	rev64   $res0b, $res1b                                    @ GHASH final-2 block
3804	ld1     { $res1b}, [$input_ptr], #16                      @ AES final-1 block - load ciphertext
3805
3806	eor     $res0b, $res0b, $t0.16b                           @ feed in partial tag
3807
3808	movi    $t0.8b, #0                                        @ suppress further partial tag feed in
3809
3810	eor     $ctr0b, $res1b, $ctr2b                            @ AES final-1 block - result
3811
3812	mov     $rk4d, $res0.d[1]                                 @ GHASH final-2 block - mid
3813
3814	pmull   $rk3q1, $res0.1d, $h3.1d                          @ GHASH final-2 block - low
3815
3816	stp     $output_l0, $output_h0, [$output_ptr], #16        @ AES final-2 block  - store result
3817
3818	eor     $rk4v.8b, $rk4v.8b, $res0.8b                      @ GHASH final-2 block - mid
3819	mov     $output_h0, $ctr0.d[1]                            @ AES final-1 block - mov high
3820
3821	eor     $acc_lb, $acc_lb, $rk3                            @ GHASH final-2 block - low
3822	mov     $output_l0, $ctr0.d[0]                            @ AES final-1 block - mov low
3823
3824	pmull2  $rk2q1, $res0.2d, $h3.2d                          @ GHASH final-2 block - high
3825
3826	pmull   $rk4v.1q, $rk4v.1d, $h34k.1d                      @ GHASH final-2 block - mid
3827
3828	eor     $acc_hb, $acc_hb, $rk2                            @ GHASH final-2 block - high
3829	eor     $output_h0, $output_h0, $rk12_h                   @ AES final-1 block - round 12 high
3830#ifdef __AARCH64EB__
3831	rev     $output_h0, $output_h0
3832#endif
3833	eor     $output_l0, $output_l0, $rk12_l                   @ AES final-1 block - round 12 low
3834#ifdef __AARCH64EB__
3835	rev     $output_l0, $output_l0
3836#endif
3837	eor     $acc_mb, $acc_mb, $rk4v.16b                       @ GHASH final-2 block - mid
3838	.L192_dec_blocks_more_than_1:                             @ blocks left >  1
3839
3840	rev64   $res0b, $res1b                                    @ GHASH final-1 block
3841
3842	eor     $res0b, $res0b, $t0.16b                           @ feed in partial tag
3843	ld1     { $res1b}, [$input_ptr], #16                      @ AES final block - load ciphertext
3844
3845	mov     $rk4d, $res0.d[1]                                 @ GHASH final-1 block - mid
3846
3847	pmull2  $rk2q1, $res0.2d, $h2.2d                          @ GHASH final-1 block - high
3848
3849	eor     $ctr0b, $res1b, $ctr3b                            @ AES final block - result
3850	stp     $output_l0, $output_h0, [$output_ptr], #16        @ AES final-1 block  - store result
3851
3852	eor     $rk4v.8b, $rk4v.8b, $res0.8b                      @ GHASH final-1 block - mid
3853
3854	eor     $acc_hb, $acc_hb, $rk2                            @ GHASH final-1 block - high
3855
3856	pmull   $rk3q1, $res0.1d, $h2.1d                          @ GHASH final-1 block - low
3857	mov     $output_h0, $ctr0.d[1]                            @ AES final block - mov high
3858
3859	ins     $rk4v.d[1], $rk4v.d[0]                            @ GHASH final-1 block - mid
3860	mov     $output_l0, $ctr0.d[0]                            @ AES final block - mov low
3861
3862	pmull2  $rk4v.1q, $rk4v.2d, $h12k.2d                      @ GHASH final-1 block - mid
3863
3864	movi    $t0.8b, #0                                        @ suppress further partial tag feed in
3865	eor     $acc_lb, $acc_lb, $rk3                            @ GHASH final-1 block - low
3866	eor     $output_h0, $output_h0, $rk12_h                   @ AES final block - round 12 high
3867#ifdef __AARCH64EB__
3868	rev     $output_h0, $output_h0
3869#endif
3870	eor     $output_l0, $output_l0, $rk12_l                   @ AES final block - round 12 low
3871#ifdef __AARCH64EB__
3872	rev     $output_l0, $output_l0
3873#endif
3874	eor     $acc_mb, $acc_mb, $rk4v.16b                       @ GHASH final-1 block - mid
3875	.L192_dec_blocks_less_than_1:                             @ blocks left <= 1
3876
3877	mvn     $rk12_l, xzr                                      @ rk12_l = 0xffffffffffffffff
3878	ldp     $end_input_ptr, $main_end_input_ptr, [$output_ptr]  @ load existing bytes we need to not overwrite
3879	and     $bit_length, $bit_length, #127                    @ bit_length %= 128
3880
3881	sub     $bit_length, $bit_length, #128                    @ bit_length -= 128
3882
3883	neg     $bit_length, $bit_length                          @ bit_length = 128 - #bits in input (in range [1,128])
3884
3885	and     $bit_length, $bit_length, #127                    @ bit_length %= 128
3886	mvn     $rk12_h, xzr                                      @ rk12_h = 0xffffffffffffffff
3887
3888	lsr     $rk12_h, $rk12_h, $bit_length                     @ rk12_h is mask for top 64b of last block
3889	cmp     $bit_length, #64
3890
3891	csel    $ctr32x, $rk12_l, $rk12_h, lt
3892	csel    $ctr96_b64x, $rk12_h, xzr, lt
3893
3894	fmov    $ctr0d, $ctr32x                                   @ ctr0b is mask for last block
3895	and     $output_l0, $output_l0, $ctr32x
3896	bic     $end_input_ptr, $end_input_ptr, $ctr32x           @ mask out low existing bytes
3897
3898	orr     $output_l0, $output_l0, $end_input_ptr
3899	mov     $ctr0.d[1], $ctr96_b64x
3900#ifndef __AARCH64EB__
3901	rev     $ctr32w, $rctr32w
3902#else
3903	mov     $ctr32w, $rctr32w
3904#endif
3905
3906	and     $res1b, $res1b, $ctr0b                            @ possibly partial last block has zeroes in highest bits
3907	str     $ctr32w, [$counter, #12]                          @ store the updated counter
3908
3909	rev64   $res0b, $res1b                                    @ GHASH final block
3910
3911	eor     $res0b, $res0b, $t0.16b                           @ feed in partial tag
3912	bic     $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x @ mask out high existing bytes
3913
3914	and     $output_h0, $output_h0, $ctr96_b64x
3915
3916	pmull2  $rk2q1, $res0.2d, $h1.2d                          @ GHASH final block - high
3917	mov     $t0d, $res0.d[1]                                  @ GHASH final block - mid
3918
3919	pmull   $rk3q1, $res0.1d, $h1.1d                          @ GHASH final block - low
3920
3921	eor     $t0.8b, $t0.8b, $res0.8b                          @ GHASH final block - mid
3922
3923	eor     $acc_hb, $acc_hb, $rk2                            @ GHASH final block - high
3924
3925	pmull   $t0.1q, $t0.1d, $h12k.1d                          @ GHASH final block - mid
3926
3927	eor     $acc_lb, $acc_lb, $rk3                            @ GHASH final block - low
3928
3929	eor     $acc_mb, $acc_mb, $t0.16b                         @ GHASH final block - mid
3930	movi    $mod_constant.8b, #0xc2
3931
3932	eor     $t9.16b, $acc_lb, $acc_hb                         @ MODULO - karatsuba tidy up
3933
3934	shl     $mod_constantd, $mod_constantd, #56               @ mod_constant
3935
3936	eor     $acc_mb, $acc_mb, $t9.16b                         @ MODULO - karatsuba tidy up
3937
3938	pmull   $mod_t.1q, $acc_h.1d, $mod_constant.1d            @ MODULO - top 64b align with mid
3939	orr     $output_h0, $output_h0, $main_end_input_ptr
3940	stp     $output_l0, $output_h0, [$output_ptr]
3941
3942	ext     $acc_hb, $acc_hb, $acc_hb, #8                     @ MODULO - other top alignment
3943
3944	eor     $acc_mb, $acc_mb, $mod_t.16b                      @ MODULO - fold into mid
3945
3946	eor     $acc_mb, $acc_mb, $acc_hb                         @ MODULO - fold into mid
3947
3948	pmull   $mod_constant.1q, $acc_m.1d, $mod_constant.1d     @ MODULO - mid 64b align with low
3949
3950	eor     $acc_lb, $acc_lb, $mod_constant.16b               @ MODULO - fold into low
3951
3952	ext     $acc_mb, $acc_mb, $acc_mb, #8                     @ MODULO - other mid alignment
3953
3954	eor     $acc_lb, $acc_lb, $acc_mb                         @ MODULO - fold into low
3955	ext     $acc_lb, $acc_lb, $acc_lb, #8
3956	rev64   $acc_lb, $acc_lb
3957	mov     x0, $len
3958	st1     { $acc_l.16b }, [$current_tag]
3959
3960	ldp     x21, x22, [sp, #16]
3961	ldp     x23, x24, [sp, #32]
3962	ldp     d8, d9, [sp, #48]
3963	ldp     d10, d11, [sp, #64]
3964	ldp     d12, d13, [sp, #80]
3965	ldp     d14, d15, [sp, #96]
3966	ldp     x19, x20, [sp], #112
3967	ret
3968
3969.L192_dec_ret:
3970	mov w0, #0x0
3971	ret
3972.size aes_gcm_dec_192_kernel,.-aes_gcm_dec_192_kernel
3973___
3974}
3975
3976{
# Symbolic register names for the kernel(s) generated in this scope.
# Every variable below is only a string naming an AArch64 register (or a
# particular view of one); several names deliberately map to the SAME
# physical register, so they cannot all be live at once.

# General purpose x4..x7.
3977my ($end_input_ptr,$main_end_input_ptr,$input_l0,$input_h0)=map("x$_",(4..7));
# $input_l1..$input_h3 and $output_l1..$output_h3 both name x19..x24.
3978my ($input_l1,$input_h1,$input_l2,$input_h2,$input_l3,$input_h3)=map("x$_",(19..24));
3979my ($output_l1,$output_h1,$output_l2,$output_h2,$output_l3,$output_h3)=map("x$_",(19..24));
# $output_l0/$output_h0 are x6/x7, i.e. the same registers as $input_l0/$input_h0.
3980my ($output_l0,$output_h0)=map("x$_",(6..7));
3981
# w9 is the 32-bit view of x9 ($ctr32x).
3982my $ctr32w="w9";
# x9..x15, in order: ctr32x, ctr96_b64x, ctr96_t32x, rctr32x, rk14_l, rk14_h, len.
3983my ($ctr32x,$ctr96_b64x,$ctr96_t32x,$rctr32x,$rk14_l,$rk14_h,$len)=map("x$_",(9..15));
# w11/w12 are the 32-bit views of x11 ($ctr96_t32x) and x12 ($rctr32x).
3984my ($ctr96_t32w,$rctr32w)=map("w$_",(11..12));
3985
# v0..v3 name the four counter blocks, v4..v7 the four result blocks; the
# next three lines give the .16b, bare-vector and d views of the same
# registers, followed by the q views of v4..v7 only.
3986my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$res0b,$res1b,$res2b,$res3b)=map("v$_.16b",(0..7));
3987my ($ctr0,$ctr1,$ctr2,$ctr3,$res0,$res1,$res2,$res3)=map("v$_",(0..7));
3988my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$res0d,$res1d,$res2d,$res3d)=map("d$_",(0..7));
3989my ($res0q,$res1q,$res2q,$res3q)=map("q$_",(4..7));
3990
# GHASH accumulators: high, mid, low in v9, v10, v11.
3991my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(9..11));
3992my ($acc_h,$acc_m,$acc_l)=map("v$_",(9..11));
3993my ($acc_hd,$acc_md,$acc_ld)=map("d$_",(9..11));
3994
# v12..v15 name h1..h4; v16 and v17 name h12k and h34k.
3995my ($h1,$h2,$h3,$h4,$h12k,$h34k)=map("v$_",(12..17));
3996my ($h1q,$h2q,$h3q,$h4q)=map("q$_",(12..15));
3997my ($h1b,$h2b,$h3b,$h4b)=map("v$_.16b",(12..15));
3998
# Temporaries. These reuse only four registers:
#   v8: t0, t2, t6 (and $mod_constant below)
#   v4: t1, t3, t4, t9 (also res0 / ctr_t0)
#   v5: t5, t7        (also res1 / ctr_t1)
#   v6: t8            (also res2 / ctr_t2)
3999my $t0="v8";
4000my $t0d="d8";
4001my $t1="v4";
4002my $t1d="d4";
4003my $t2="v8";
4004my $t2d="d8";
4005my $t3="v4";
4006my $t3d="d4";
4007my $t4="v4";
4008my $t4d="d4";
4009my $t5="v5";
4010my $t5d="d5";
4011my $t6="v8";
4012my $t6d="d8";
4013my $t7="v5";
4014my $t7d="d5";
4015my $t8="v6";
4016my $t8d="d6";
4017my $t9="v4";
4018my $t9d="d4";
4019
# ctr_t0..ctr_t3 name v4..v7, the same registers as res0..res3.
4020my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3)=map("v$_",(4..7));
4021my ($ctr_t0d,$ctr_t1d,$ctr_t2d,$ctr_t3d)=map("d$_",(4..7));
4022my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b)=map("v$_.16b",(4..7));
4023
# Reduction scratch: $mod_constant is v8 (shared with t0/t2/t6) and $mod_t
# is v7 (shared with res3 / ctr_t3).
4024my $mod_constantd="d8";
4025my $mod_constant="v8";
4026my $mod_t="v7";
4027
# Round keys rk0..rk13 name v18..v31 (.16b, .4s and q views).
4028my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9,$rk10,$rk11,$rk12,$rk13)=map("v$_.16b",(18..31));
4029my ($rk0s,$rk1s,$rk2s,$rk3s,$rk4s,$rk5s,$rk6s,$rk7s,$rk8s,$rk9s,$rk10s,$rk11s,$rk12s,$rk13s)=map("v$_.4s",(18..31));
4030my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q,$rk10q,$rk11q,$rk12q,$rk13q)=map("q$_",(18..31));
# Extra views of v20/v21/v22, i.e. the registers that also hold rk2/rk3/rk4.
# NOTE(review): presumably used as scratch once those round keys are no
# longer needed - confirm against the tail code of each kernel.
4031my $rk2q1="v20.1q";
4032my $rk3q1="v21.1q";
4033my $rk4v="v22";
4034my $rk4d="d22";
4035
4036#########################################################################################
4037# size_t aes_gcm_enc_256_kernel(const unsigned char *in,
4038#                               size_t len,
4039#                               unsigned char *out,
4040#                               const void *key,
4041#                               unsigned char ivec[16],
4042#                               u64 *Xi);
4043#
4044$code.=<<___;
4045.global aes_gcm_enc_256_kernel
4046.type   aes_gcm_enc_256_kernel,%function
4047.align  4
4048aes_gcm_enc_256_kernel:
4049	AARCH64_VALID_CALL_TARGET
4050	cbz     x1, .L256_enc_ret
4051	stp     x19, x20, [sp, #-112]!
4052	mov     x16, x4
4053	mov     x8, x5
4054	stp     x21, x22, [sp, #16]
4055	stp     x23, x24, [sp, #32]
4056	stp     d8, d9, [sp, #48]
4057	stp     d10, d11, [sp, #64]
4058	stp     d12, d13, [sp, #80]
4059	stp     d14, d15, [sp, #96]
4060
4061	add     $end_input_ptr, $input_ptr, $bit_length, lsr #3   @ end_input_ptr
4062	lsr     $main_end_input_ptr, $bit_length, #3              @ byte_len
4063	mov     $len, $main_end_input_ptr
4064	ldp     $ctr96_b64x, $ctr96_t32x, [$counter]              @ ctr96_b64, ctr96_t32
4065#ifdef __AARCH64EB__
4066	rev     $ctr96_b64x, $ctr96_b64x
4067	rev     $ctr96_t32x, $ctr96_t32x
4068#endif
4069	ldp     $rk14_l, $rk14_h, [$cc, #224]                     @ load rk14
4070#ifdef __AARCH64EB__
4071	ror     $rk14_l, $rk14_l, #32
4072	ror     $rk14_h, $rk14_h, #32
4073#endif
4074	ld1     { $ctr0b}, [$counter]                             @ special case vector load initial counter so we can start first AES block as quickly as possible
4075	sub     $main_end_input_ptr, $main_end_input_ptr, #1      @ byte_len - 1
4076
4077	ld1     {$rk0s}, [$cc], #16                               @ load rk0
4078	and     $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
4079
4080	ld1     {$rk1s}, [$cc], #16                               @ load rk1
4081	add     $main_end_input_ptr, $main_end_input_ptr, $input_ptr
4082
4083	lsr     $rctr32x, $ctr96_t32x, #32
4084	fmov    $ctr2d, $ctr96_b64x                               @ CTR block 2
4085	orr     $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
4086
4087	rev     $rctr32w, $rctr32w                                @ rev_ctr32
4088	cmp     $input_ptr, $main_end_input_ptr                   @ check if we have <= 4 blocks
4089	fmov    $ctr1d, $ctr96_b64x                               @ CTR block 1
4090
4091	aese    $ctr0b, $rk0  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 0
4092	add     $rctr32w, $rctr32w, #1                            @ increment rev_ctr32
4093
4094	rev     $ctr32w, $rctr32w                                 @ CTR block 1
4095	fmov    $ctr3d, $ctr96_b64x                               @ CTR block 3
4096
4097	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 1
4098	add     $rctr32w, $rctr32w, #1                            @ CTR block 1
4099	ld1     {$rk2s}, [$cc], #16                               @ load rk2
4100
4101	fmov    $ctr1.d[1], $ctr32x                               @ CTR block 1
4102	rev     $ctr32w, $rctr32w                                 @ CTR block 2
4103	add     $rctr32w, $rctr32w, #1                            @ CTR block 2
4104
4105	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 2
4106	ld1     {$rk3s}, [$cc], #16                               @ load rk3
4107
4108	fmov    $ctr2.d[1], $ctr32x                               @ CTR block 2
4109	rev     $ctr32w, $rctr32w                                 @ CTR block 3
4110
4111	aese    $ctr0b, $rk1  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 1
4112	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 3
4113
4114	fmov    $ctr3.d[1], $ctr32x                               @ CTR block 3
4115
4116	aese    $ctr1b, $rk0  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 0
4117	ld1     {$rk4s}, [$cc], #16                               @ load rk4
4118
4119	aese    $ctr0b, $rk2  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 2
4120	ld1     {$rk5s}, [$cc], #16                               @ load rk5
4121
4122	aese    $ctr2b, $rk0  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 0
4123	ld1     {$rk6s}, [$cc], #16                               @ load rk6
4124
4125	aese    $ctr1b, $rk1  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 1
4126	ldr     $h3q, [$current_tag, #80]                         @ load h3l | h3h
4127#ifndef __AARCH64EB__
4128	ext     $h3b, $h3b, $h3b, #8
4129#endif
4130	aese    $ctr3b, $rk0  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 0
4131	ld1     {$rk7s}, [$cc], #16                               @ load rk7
4132
4133	aese    $ctr2b, $rk1  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 1
4134	ld1     {$rk8s}, [$cc], #16                               @ load rk8
4135
4136	aese    $ctr1b, $rk2  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 2
4137	ldr     $h2q, [$current_tag, #64]                         @ load h2l | h2h
4138#ifndef __AARCH64EB__
4139	ext     $h2b, $h2b, $h2b, #8
4140#endif
4141	aese    $ctr3b, $rk1  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 1
4142	ld1     {$rk9s}, [$cc], #16                               @ load rk9
4143
4144	aese    $ctr2b, $rk2  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 2
4145	ldr     $h4q, [$current_tag, #112]                        @ load h4l | h4h
4146#ifndef __AARCH64EB__
4147	ext     $h4b, $h4b, $h4b, #8
4148#endif
4149	aese    $ctr1b, $rk3  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 3
4150	ld1     {$rk10s}, [$cc], #16                              @ load rk10
4151
4152	aese    $ctr3b, $rk2  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 2
4153	ld1     {$rk11s}, [$cc], #16                              @ load rk11
4154
4155	aese    $ctr2b, $rk3  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 3
4156	add     $rctr32w, $rctr32w, #1                            @ CTR block 3
4157
4158	aese    $ctr0b, $rk3  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 3
4159
4160	aese    $ctr3b, $rk3  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 3
4161	ld1     { $acc_lb}, [$current_tag]
4162	ext     $acc_lb, $acc_lb, $acc_lb, #8
4163	rev64   $acc_lb, $acc_lb
4164
4165	aese    $ctr2b, $rk4  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 4
4166
4167	aese    $ctr0b, $rk4  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 4
4168
4169	aese    $ctr1b, $rk4  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 4
4170
4171	aese    $ctr3b, $rk4  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 4
4172
4173	aese    $ctr0b, $rk5  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 5
4174
4175	aese    $ctr1b, $rk5  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 5
4176
4177	aese    $ctr3b, $rk5  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 5
4178
4179	aese    $ctr2b, $rk5  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 5
4180
4181	aese    $ctr1b, $rk6  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 6
4182	trn2    $h34k.2d,  $h3.2d,    $h4.2d                      @ h4l | h3l
4183
4184	aese    $ctr3b, $rk6  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 6
4185	ld1     {$rk12s}, [$cc], #16                              @ load rk12
4186
4187	aese    $ctr0b, $rk6  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 6
4188	ldr     $h1q, [$current_tag, #32]                         @ load h1l | h1h
4189#ifndef __AARCH64EB__
4190	ext     $h1b, $h1b, $h1b, #8
4191#endif
4192	aese    $ctr2b, $rk6  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 6
4193	ld1     {$rk13s}, [$cc], #16                              @ load rk13
4194
4195	aese    $ctr1b, $rk7  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 7
4196	trn1    $acc_h.2d, $h3.2d,    $h4.2d                      @ h4h | h3h
4197
4198	aese    $ctr0b, $rk7  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 7
4199
4200	aese    $ctr2b, $rk7  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 7
4201
4202	aese    $ctr3b, $rk7  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 7
4203	trn2    $h12k.2d,  $h1.2d,    $h2.2d                      @ h2l | h1l
4204
4205	aese    $ctr1b, $rk8  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 8
4206
4207	aese    $ctr2b, $rk8  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 8
4208
4209	aese    $ctr3b, $rk8  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 8
4210
4211	aese    $ctr1b, $rk9  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 9
4212
4213	aese    $ctr2b, $rk9  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 9
4214
4215	aese    $ctr0b, $rk8  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 8
4216
4217	aese    $ctr1b, $rk10 \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 10
4218
4219	aese    $ctr3b, $rk9  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 9
4220
4221	aese    $ctr0b, $rk9  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 9
4222
4223	aese    $ctr2b, $rk10 \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 10
4224
4225	aese    $ctr3b, $rk10 \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 10
4226
4227	aese    $ctr1b, $rk11 \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 11
4228
4229	aese    $ctr2b, $rk11 \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 11
4230
4231	aese    $ctr0b, $rk10 \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 10
4232
4233	aese    $ctr1b, $rk12 \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 12
4234
4235	aese    $ctr2b, $rk12 \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 12
4236
4237	aese    $ctr0b, $rk11 \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 11
4238	eor     $h34k.16b, $h34k.16b, $acc_h.16b                  @ h4k | h3k
4239
4240	aese    $ctr3b, $rk11 \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 11
4241
4242	aese    $ctr2b, $rk13                                     @ AES block 2 - round 13
4243	trn1    $t0.2d,    $h1.2d,    $h2.2d                      @ h2h | h1h
4244
4245	aese    $ctr0b, $rk12 \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 12
4246
4247	aese    $ctr3b, $rk12 \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 12
4248
4249	aese    $ctr1b, $rk13                                     @ AES block 1 - round 13
4250
4251	aese    $ctr0b, $rk13                                     @ AES block 0 - round 13
4252
4253	aese    $ctr3b, $rk13                                     @ AES block 3 - round 13
4254	eor     $h12k.16b, $h12k.16b, $t0.16b                     @ h2k | h1k
4255	b.ge    .L256_enc_tail                                    @ handle tail
4256
4257	ldp     $input_l1, $input_h1, [$input_ptr, #16]           @ AES block 1 - load plaintext
4258#ifdef __AARCH64EB__
4259	rev     $input_l1, $input_l1
4260	rev     $input_h1, $input_h1
4261#endif
4262	rev     $ctr32w, $rctr32w                                 @ CTR block 4
4263	ldp     $input_l0, $input_h0, [$input_ptr, #0]            @ AES block 0 - load plaintext
4264#ifdef __AARCH64EB__
4265	rev     $input_l0, $input_l0
4266	rev     $input_h0, $input_h0
4267#endif
4268	ldp     $input_l3, $input_h3, [$input_ptr, #48]           @ AES block 3 - load plaintext
4269#ifdef __AARCH64EB__
4270	rev     $input_l3, $input_l3
4271	rev     $input_h3, $input_h3
4272#endif
4273	ldp     $input_l2, $input_h2, [$input_ptr, #32]           @ AES block 2 - load plaintext
4274#ifdef __AARCH64EB__
4275	rev     $input_l2, $input_l2
4276	rev     $input_h2, $input_h2
4277#endif
4278	add     $input_ptr, $input_ptr, #64                       @ AES input_ptr update
4279
4280	eor     $input_l1, $input_l1, $rk14_l                     @ AES block 1 - round 14 low
4281	eor     $input_h1, $input_h1, $rk14_h                     @ AES block 1 - round 14 high
4282
4283	fmov    $ctr_t1d, $input_l1                               @ AES block 1 - mov low
4284	eor     $input_l0, $input_l0, $rk14_l                     @ AES block 0 - round 14 low
4285
4286	eor     $input_h0, $input_h0, $rk14_h                     @ AES block 0 - round 14 high
4287	eor     $input_h3, $input_h3, $rk14_h                     @ AES block 3 - round 14 high
4288	fmov    $ctr_t0d, $input_l0                               @ AES block 0 - mov low
4289
4290	cmp     $input_ptr, $main_end_input_ptr                   @ check if we have <= 8 blocks
4291	fmov    $ctr_t0.d[1], $input_h0                           @ AES block 0 - mov high
4292	eor     $input_l3, $input_l3, $rk14_l                     @ AES block 3 - round 14 low
4293
4294	eor     $input_l2, $input_l2, $rk14_l                     @ AES block 2 - round 14 low
4295	fmov    $ctr_t1.d[1], $input_h1                           @ AES block 1 - mov high
4296
4297	fmov    $ctr_t2d, $input_l2                               @ AES block 2 - mov low
4298	add     $rctr32w, $rctr32w, #1                            @ CTR block 4
4299
4300	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 4
4301	fmov    $ctr_t3d, $input_l3                               @ AES block 3 - mov low
4302	eor     $input_h2, $input_h2, $rk14_h                     @ AES block 2 - round 14 high
4303
4304	fmov    $ctr_t2.d[1], $input_h2                           @ AES block 2 - mov high
4305
4306	eor     $res0b, $ctr_t0b, $ctr0b                          @ AES block 0 - result
4307	fmov    $ctr0d, $ctr96_b64x                               @ CTR block 4
4308
4309	fmov    $ctr0.d[1], $ctr32x                               @ CTR block 4
4310	rev     $ctr32w, $rctr32w                                 @ CTR block 5
4311	add     $rctr32w, $rctr32w, #1                            @ CTR block 5
4312
4313	eor     $res1b, $ctr_t1b, $ctr1b                          @ AES block 1 - result
4314	fmov    $ctr1d, $ctr96_b64x                               @ CTR block 5
4315	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 5
4316
4317	fmov    $ctr1.d[1], $ctr32x                               @ CTR block 5
4318	rev     $ctr32w, $rctr32w                                 @ CTR block 6
4319	st1     { $res0b}, [$output_ptr], #16                     @ AES block 0 - store result
4320
4321	fmov    $ctr_t3.d[1], $input_h3                           @ AES block 3 - mov high
4322	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 6
4323	eor     $res2b, $ctr_t2b, $ctr2b                          @ AES block 2 - result
4324
4325	st1     { $res1b}, [$output_ptr], #16                     @ AES block 1 - store result
4326
4327	add     $rctr32w, $rctr32w, #1                            @ CTR block 6
4328	fmov    $ctr2d, $ctr96_b64x                               @ CTR block 6
4329
4330	fmov    $ctr2.d[1], $ctr32x                               @ CTR block 6
4331	st1     { $res2b}, [$output_ptr], #16                     @ AES block 2 - store result
4332	rev     $ctr32w, $rctr32w                                 @ CTR block 7
4333
4334	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 7
4335
4336	eor     $res3b, $ctr_t3b, $ctr3b                          @ AES block 3 - result
4337	st1     { $res3b}, [$output_ptr], #16                     @ AES block 3 - store result
4338	b.ge    .L256_enc_prepretail                              @ do prepretail - skip the main loop
4339
4340	.L256_enc_main_loop:                                      @ main loop start
4341	aese    $ctr0b, $rk0  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 0
4342	rev64   $res0b, $res0b                                    @ GHASH block 4k (only t0 is free)
4343
4344	aese    $ctr1b, $rk0  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 0
4345	fmov    $ctr3d, $ctr96_b64x                               @ CTR block 4k+3
4346
4347	aese    $ctr2b, $rk0  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 0
4348	ext     $acc_lb, $acc_lb, $acc_lb, #8                     @ PRE 0
4349
4350	aese    $ctr0b, $rk1  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 1
4351	fmov    $ctr3.d[1], $ctr32x                               @ CTR block 4k+3
4352
4353	aese    $ctr1b, $rk1  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 1
4354	ldp     $input_l3, $input_h3, [$input_ptr, #48]           @ AES block 4k+7 - load plaintext
4355#ifdef __AARCH64EB__
4356	rev     $input_l3, $input_l3
4357	rev     $input_h3, $input_h3
4358#endif
4359	aese    $ctr2b, $rk1  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 1
4360	ldp     $input_l2, $input_h2, [$input_ptr, #32]           @ AES block 4k+6 - load plaintext
4361#ifdef __AARCH64EB__
4362	rev     $input_l2, $input_l2
4363	rev     $input_h2, $input_h2
4364#endif
4365	aese    $ctr0b, $rk2  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 2
4366	eor     $res0b, $res0b, $acc_lb                           @ PRE 1
4367
4368	aese    $ctr1b, $rk2  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 2
4369
4370	aese    $ctr3b, $rk0  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 0
4371	eor     $input_l3, $input_l3, $rk14_l                     @ AES block 4k+7 - round 14 low
4372
4373	aese    $ctr0b, $rk3  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 3
4374	mov     $acc_md, $h34k.d[1]                               @ GHASH block 4k - mid
4375
4376	pmull2  $acc_h.1q, $res0.2d, $h4.2d                       @ GHASH block 4k - high
4377	eor     $input_h2, $input_h2, $rk14_h                     @ AES block 4k+6 - round 14 high
4378	mov     $t0d, $res0.d[1]                                  @ GHASH block 4k - mid
4379
4380	aese    $ctr3b, $rk1  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 1
4381	rev64   $res1b, $res1b                                    @ GHASH block 4k+1 (t0 and t1 free)
4382
4383	aese    $ctr0b, $rk4  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 4
4384
4385	pmull   $acc_l.1q, $res0.1d, $h4.1d                       @ GHASH block 4k - low
4386	eor     $t0.8b, $t0.8b, $res0.8b                          @ GHASH block 4k - mid
4387
4388	aese    $ctr2b, $rk2  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 2
4389
4390	aese    $ctr0b, $rk5  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 5
4391	rev64   $res3b, $res3b                                    @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
4392
4393	pmull2  $t1.1q, $res1.2d, $h3.2d                          @ GHASH block 4k+1 - high
4394
4395	pmull   $acc_m.1q, $t0.1d, $acc_m.1d                      @ GHASH block 4k - mid
4396	rev64   $res2b, $res2b                                    @ GHASH block 4k+2 (t0, t1, and t2 free)
4397
4398	pmull   $t2.1q, $res1.1d, $h3.1d                          @ GHASH block 4k+1 - low
4399
4400	eor     $acc_hb, $acc_hb, $t1.16b                         @ GHASH block 4k+1 - high
4401	mov     $t3d, $res1.d[1]                                  @ GHASH block 4k+1 - mid
4402
4403	aese    $ctr1b, $rk3  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 3
4404
4405	aese    $ctr3b, $rk2  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 2
4406	eor     $acc_lb, $acc_lb, $t2.16b                         @ GHASH block 4k+1 - low
4407
4408	aese    $ctr2b, $rk3  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 3
4409
4410	aese    $ctr1b, $rk4  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 4
4411	mov     $t6d, $res2.d[1]                                  @ GHASH block 4k+2 - mid
4412
4413	aese    $ctr3b, $rk3  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 3
4414	eor     $t3.8b, $t3.8b, $res1.8b                          @ GHASH block 4k+1 - mid
4415
4416	aese    $ctr2b, $rk4  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 4
4417
4418	aese    $ctr0b, $rk6  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 6
4419	eor     $t6.8b, $t6.8b, $res2.8b                          @ GHASH block 4k+2 - mid
4420
4421	aese    $ctr3b, $rk4  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 4
4422
4423	pmull   $t3.1q, $t3.1d, $h34k.1d                          @ GHASH block 4k+1 - mid
4424
4425	aese    $ctr0b, $rk7  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 7
4426
4427	aese    $ctr3b, $rk5  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 5
4428	ins     $t6.d[1], $t6.d[0]                                @ GHASH block 4k+2 - mid
4429
4430	aese    $ctr1b, $rk5  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 5
4431
4432	aese    $ctr0b, $rk8  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 8
4433
4434	aese    $ctr2b, $rk5  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 5
4435
4436	aese    $ctr1b, $rk6  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 6
4437	eor     $acc_mb, $acc_mb, $t3.16b                         @ GHASH block 4k+1 - mid
4438
4439	pmull2  $t4.1q, $res2.2d, $h2.2d                          @ GHASH block 4k+2 - high
4440
4441	pmull   $t5.1q, $res2.1d, $h2.1d                          @ GHASH block 4k+2 - low
4442
4443	aese    $ctr1b, $rk7  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 7
4444
4445	pmull   $t8.1q, $res3.1d, $h1.1d                          @ GHASH block 4k+3 - low
4446	eor     $acc_hb, $acc_hb, $t4.16b                         @ GHASH block 4k+2 - high
4447
4448	aese    $ctr3b, $rk6  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 6
4449	ldp     $input_l1, $input_h1, [$input_ptr, #16]           @ AES block 4k+5 - load plaintext
4450#ifdef __AARCH64EB__
4451	rev     $input_l1, $input_l1
4452	rev     $input_h1, $input_h1
4453#endif
4454	aese    $ctr1b, $rk8  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 8
4455	mov     $t9d, $res3.d[1]                                  @ GHASH block 4k+3 - mid
4456
4457	aese    $ctr2b, $rk6  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 6
4458	eor     $acc_lb, $acc_lb, $t5.16b                         @ GHASH block 4k+2 - low
4459
4460	pmull2  $t6.1q, $t6.2d, $h12k.2d                          @ GHASH block 4k+2 - mid
4461
4462	pmull2  $t7.1q, $res3.2d, $h1.2d                          @ GHASH block 4k+3 - high
4463	eor     $t9.8b, $t9.8b, $res3.8b                          @ GHASH block 4k+3 - mid
4464
4465	aese    $ctr2b, $rk7  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 7
4466	eor     $input_l1, $input_l1, $rk14_l                     @ AES block 4k+5 - round 14 low
4467
4468	aese    $ctr1b, $rk9  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 9
4469	eor     $acc_mb, $acc_mb, $t6.16b                         @ GHASH block 4k+2 - mid
4470
4471	aese    $ctr3b, $rk7  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 7
4472	eor     $input_l2, $input_l2, $rk14_l                     @ AES block 4k+6 - round 14 low
4473
4474	aese    $ctr0b, $rk9  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 9
4475	movi    $mod_constant.8b, #0xc2
4476
4477	pmull   $t9.1q, $t9.1d, $h12k.1d                          @ GHASH block 4k+3 - mid
4478	eor     $acc_hb, $acc_hb, $t7.16b                         @ GHASH block 4k+3 - high
4479	fmov    $ctr_t1d, $input_l1                               @ AES block 4k+5 - mov low
4480
4481	aese    $ctr2b, $rk8  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 8
4482	ldp     $input_l0, $input_h0, [$input_ptr, #0]            @ AES block 4k+4 - load plaintext
4483#ifdef __AARCH64EB__
4484	rev     $input_l0, $input_l0
4485	rev     $input_h0, $input_h0
4486#endif
4487	aese    $ctr0b, $rk10 \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 10
4488	shl     $mod_constantd, $mod_constantd, #56               @ mod_constant
4489
4490	aese    $ctr3b, $rk8  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 8
4491	eor     $acc_lb, $acc_lb, $t8.16b                         @ GHASH block 4k+3 - low
4492
4493	aese    $ctr2b, $rk9  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 9
4494
4495	aese    $ctr1b, $rk10 \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 10
4496	eor     $acc_mb, $acc_mb, $t9.16b                         @ GHASH block 4k+3 - mid
4497
4498	aese    $ctr3b, $rk9  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 9
4499	add     $rctr32w, $rctr32w, #1                            @ CTR block 4k+3
4500
4501	aese    $ctr0b, $rk11 \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 11
4502	eor     $t9.16b, $acc_lb, $acc_hb                         @ MODULO - karatsuba tidy up
4503
4504	aese    $ctr1b, $rk11 \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 11
4505	add     $input_ptr, $input_ptr, #64                       @ AES input_ptr update
4506
4507	pmull   $mod_t.1q, $acc_h.1d, $mod_constant.1d            @ MODULO - top 64b align with mid
4508	rev     $ctr32w, $rctr32w                                 @ CTR block 4k+8
4509	ext     $acc_hb, $acc_hb, $acc_hb, #8                     @ MODULO - other top alignment
4510
4511	aese    $ctr2b, $rk10 \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 10
4512	eor     $input_l0, $input_l0, $rk14_l                     @ AES block 4k+4 - round 14 low
4513
4514	aese    $ctr1b, $rk12 \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 12
4515	eor     $acc_mb, $acc_mb, $t9.16b                         @ MODULO - karatsuba tidy up
4516
4517	aese    $ctr3b, $rk10 \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 10
4518	eor     $input_h0, $input_h0, $rk14_h                     @ AES block 4k+4 - round 14 high
4519
4520	fmov    $ctr_t0d, $input_l0                               @ AES block 4k+4 - mov low
4521	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 4k+8
4522	eor     $mod_t.16b, $acc_hb, $mod_t.16b                   @ MODULO - fold into mid
4523
4524	aese    $ctr0b, $rk12 \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 12
4525	eor     $input_h1, $input_h1, $rk14_h                     @ AES block 4k+5 - round 14 high
4526
4527	aese    $ctr2b, $rk11 \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 11
4528	eor     $input_h3, $input_h3, $rk14_h                     @ AES block 4k+7 - round 14 high
4529
4530	aese    $ctr3b, $rk11 \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 11
4531	add     $rctr32w, $rctr32w, #1                            @ CTR block 4k+8
4532
4533	aese    $ctr0b, $rk13                                     @ AES block 4k+4 - round 13
4534	fmov    $ctr_t0.d[1], $input_h0                           @ AES block 4k+4 - mov high
4535	eor     $acc_mb, $acc_mb, $mod_t.16b                      @ MODULO - fold into mid
4536
4537	aese    $ctr2b, $rk12 \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 12
4538	fmov    $ctr_t3d, $input_l3                               @ AES block 4k+7 - mov low
4539
4540	aese    $ctr1b, $rk13                                     @ AES block 4k+5 - round 13
4541	fmov    $ctr_t1.d[1], $input_h1                           @ AES block 4k+5 - mov high
4542
4543	fmov    $ctr_t2d, $input_l2                               @ AES block 4k+6 - mov low
4544	cmp     $input_ptr, $main_end_input_ptr                   @ LOOP CONTROL
4545
4546	fmov    $ctr_t2.d[1], $input_h2                           @ AES block 4k+6 - mov high
4547
4548	pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d            @ MODULO - mid 64b align with low
4549	eor     $res0b, $ctr_t0b, $ctr0b                          @ AES block 4k+4 - result
4550	fmov    $ctr0d, $ctr96_b64x                               @ CTR block 4k+8
4551
4552	fmov    $ctr0.d[1], $ctr32x                               @ CTR block 4k+8
4553	rev     $ctr32w, $rctr32w                                 @ CTR block 4k+9
4554	add     $rctr32w, $rctr32w, #1                            @ CTR block 4k+9
4555
4556	eor     $res1b, $ctr_t1b, $ctr1b                          @ AES block 4k+5 - result
4557	fmov    $ctr1d, $ctr96_b64x                               @ CTR block 4k+9
4558	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 4k+9
4559
4560	aese    $ctr3b, $rk12 \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 12
4561	fmov    $ctr1.d[1], $ctr32x                               @ CTR block 4k+9
4562
4563	aese    $ctr2b, $rk13                                     @ AES block 4k+6 - round 13
4564	rev     $ctr32w, $rctr32w                                 @ CTR block 4k+10
4565	st1     { $res0b}, [$output_ptr], #16                     @ AES block 4k+4 - store result
4566
4567	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 4k+10
4568	eor     $acc_lb, $acc_lb, $acc_hb                         @ MODULO - fold into low
4569	fmov    $ctr_t3.d[1], $input_h3                           @ AES block 4k+7 - mov high
4570
4571	ext     $acc_mb, $acc_mb, $acc_mb, #8                     @ MODULO - other mid alignment
4572	st1     { $res1b}, [$output_ptr], #16                     @ AES block 4k+5 - store result
4573	add     $rctr32w, $rctr32w, #1                            @ CTR block 4k+10
4574
4575	aese    $ctr3b, $rk13                                     @ AES block 4k+7 - round 13
4576	eor     $res2b, $ctr_t2b, $ctr2b                          @ AES block 4k+6 - result
4577	fmov    $ctr2d, $ctr96_b64x                               @ CTR block 4k+10
4578
4579	st1     { $res2b}, [$output_ptr], #16                     @ AES block 4k+6 - store result
4580	fmov    $ctr2.d[1], $ctr32x                               @ CTR block 4k+10
4581	rev     $ctr32w, $rctr32w                                 @ CTR block 4k+11
4582
4583	eor     $acc_lb, $acc_lb, $acc_mb                         @ MODULO - fold into low
4584	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 4k+11
4585
4586	eor     $res3b, $ctr_t3b, $ctr3b                          @ AES block 4k+7 - result
4587	st1     { $res3b}, [$output_ptr], #16                     @ AES block 4k+7 - store result
4588	b.lt    .L256_enc_main_loop                               @ LOOP CONTROL - repeat while input_ptr < main_end_input_ptr (flags from the cmp above)
4589
4590	.L256_enc_prepretail:                                     @ PREPRETAIL
4591	aese    $ctr1b, $rk0  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 0
4592	rev64   $res2b, $res2b                                    @ GHASH block 4k+2 (t0, t1, and t2 free)
4593
4594	aese    $ctr2b, $rk0  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 0
4595	fmov    $ctr3d, $ctr96_b64x                               @ CTR block 4k+3
4596
4597	aese    $ctr0b, $rk0  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 0
4598	rev64   $res0b, $res0b                                    @ GHASH block 4k (only t0 is free)
4599
4600	fmov    $ctr3.d[1], $ctr32x                               @ CTR block 4k+3
4601	ext     $acc_lb, $acc_lb, $acc_lb, #8                     @ PRE 0
4602
4603	aese    $ctr2b, $rk1  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 1
4604
4605	aese    $ctr0b, $rk1  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 1
4606
4607	eor     $res0b, $res0b, $acc_lb                           @ PRE 1
4608	rev64   $res1b, $res1b                                    @ GHASH block 4k+1 (t0 and t1 free)
4609
4610	aese    $ctr2b, $rk2  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 2
4611
4612	aese    $ctr3b, $rk0  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 0
4613	mov     $acc_md, $h34k.d[1]                               @ GHASH block 4k - mid
4614
4615	aese    $ctr1b, $rk1  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 1
4616
4617	pmull   $acc_l.1q, $res0.1d, $h4.1d                       @ GHASH block 4k - low
4618	mov     $t0d, $res0.d[1]                                  @ GHASH block 4k - mid
4619
4620	pmull2  $acc_h.1q, $res0.2d, $h4.2d                       @ GHASH block 4k - high
4621
4622	aese    $ctr2b, $rk3  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 3
4623
4624	aese    $ctr1b, $rk2  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 2
4625	eor     $t0.8b, $t0.8b, $res0.8b                          @ GHASH block 4k - mid
4626
4627	aese    $ctr0b, $rk2  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 2
4628
4629	aese    $ctr3b, $rk1  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 1
4630
4631	aese    $ctr1b, $rk3  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 3
4632
4633	pmull   $acc_m.1q, $t0.1d, $acc_m.1d                      @ GHASH block 4k - mid
4634
4635	pmull2  $t1.1q, $res1.2d, $h3.2d                          @ GHASH block 4k+1 - high
4636
4637	pmull   $t2.1q, $res1.1d, $h3.1d                          @ GHASH block 4k+1 - low
4638
4639	aese    $ctr3b, $rk2  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 2
4640
4641	eor     $acc_hb, $acc_hb, $t1.16b                         @ GHASH block 4k+1 - high
4642	mov     $t3d, $res1.d[1]                                  @ GHASH block 4k+1 - mid
4643
4644	aese    $ctr0b, $rk3  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 3
4645	eor     $acc_lb, $acc_lb, $t2.16b                         @ GHASH block 4k+1 - low
4646
4647	aese    $ctr3b, $rk3  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 3
4648
4649	eor     $t3.8b, $t3.8b, $res1.8b                          @ GHASH block 4k+1 - mid
4650	mov     $t6d, $res2.d[1]                                  @ GHASH block 4k+2 - mid
4651
4652	aese    $ctr0b, $rk4  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 4
4653	rev64   $res3b, $res3b                                    @ GHASH block 4k+3 (t0, t1, t2 and t3 free)
4654
4655	aese    $ctr3b, $rk4  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 4
4656
4657	pmull   $t3.1q, $t3.1d, $h34k.1d                          @ GHASH block 4k+1 - mid
4658	eor     $t6.8b, $t6.8b, $res2.8b                          @ GHASH block 4k+2 - mid
4659	add     $rctr32w, $rctr32w, #1                            @ CTR block 4k+3
4660
4661	pmull   $t5.1q, $res2.1d, $h2.1d                          @ GHASH block 4k+2 - low
4662
4663	aese    $ctr3b, $rk5  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 5
4664
4665	aese    $ctr2b, $rk4  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 4
4666	eor     $acc_mb, $acc_mb, $t3.16b                         @ GHASH block 4k+1 - mid
4667
4668	pmull2  $t4.1q, $res2.2d, $h2.2d                          @ GHASH block 4k+2 - high
4669
4670	eor     $acc_lb, $acc_lb, $t5.16b                         @ GHASH block 4k+2 - low
4671	ins     $t6.d[1], $t6.d[0]                                @ GHASH block 4k+2 - mid
4672
4673	aese    $ctr2b, $rk5  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 5
4674
4675	eor     $acc_hb, $acc_hb, $t4.16b                         @ GHASH block 4k+2 - high
4676	mov     $t9d, $res3.d[1]                                  @ GHASH block 4k+3 - mid
4677
4678	aese    $ctr1b, $rk4  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 4
4679
4680	pmull2  $t6.1q, $t6.2d, $h12k.2d                          @ GHASH block 4k+2 - mid
4681
4682	eor     $t9.8b, $t9.8b, $res3.8b                          @ GHASH block 4k+3 - mid
4683
4684	pmull2  $t7.1q, $res3.2d, $h1.2d                          @ GHASH block 4k+3 - high
4685
4686	aese    $ctr1b, $rk5  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 5
4687
4688	pmull   $t9.1q, $t9.1d, $h12k.1d                          @ GHASH block 4k+3 - mid
4689	eor     $acc_mb, $acc_mb, $t6.16b                         @ GHASH block 4k+2 - mid
4690
4691	aese    $ctr0b, $rk5  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 5
4692
4693	aese    $ctr1b, $rk6  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 6
4694
4695	aese    $ctr2b, $rk6  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 6
4696
4697	aese    $ctr0b, $rk6  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 6
4698	movi    $mod_constant.8b, #0xc2
4699
4700	aese    $ctr3b, $rk6  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 6
4701
4702	aese    $ctr1b, $rk7  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 7
4703	eor     $acc_hb, $acc_hb, $t7.16b                         @ GHASH block 4k+3 - high
4704
4705	aese    $ctr0b, $rk7  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 7
4706
4707	aese    $ctr3b, $rk7  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 7
4708	shl     $mod_constantd, $mod_constantd, #56               @ mod_constant
4709
4710	aese    $ctr1b, $rk8  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 8
4711	eor     $acc_mb, $acc_mb, $t9.16b                         @ GHASH block 4k+3 - mid
4712
4713	pmull   $t8.1q, $res3.1d, $h1.1d                          @ GHASH block 4k+3 - low
4714
4715	aese    $ctr3b, $rk8  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 8
4716
4717	aese    $ctr1b, $rk9  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 9
4718
4719	aese    $ctr0b, $rk8  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 8
4720	eor     $acc_lb, $acc_lb, $t8.16b                         @ GHASH block 4k+3 - low
4721
4722	aese    $ctr3b, $rk9  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 9
4723
4724	eor     $acc_mb, $acc_mb, $acc_hb                         @ karatsuba tidy up
4725
4726	pmull   $t1.1q, $acc_h.1d, $mod_constant.1d
4727	ext     $acc_hb, $acc_hb, $acc_hb, #8
4728
4729	aese    $ctr3b, $rk10 \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 10
4730
4731	aese    $ctr2b, $rk7  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 7
4732	eor     $acc_mb, $acc_mb, $acc_lb
4733
4734	aese    $ctr1b, $rk10 \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 10
4735
4736	aese    $ctr0b, $rk9  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 9
4737
4738	aese    $ctr2b, $rk8  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 8
4739
4740	aese    $ctr1b, $rk11 \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 11
4741	eor     $acc_mb, $acc_mb, $t1.16b
4742
4743	aese    $ctr0b, $rk10 \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 10
4744
4745	aese    $ctr2b, $rk9  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 9
4746
4747	aese    $ctr1b, $rk12 \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 12
4748
4749	aese    $ctr0b, $rk11 \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 11
4750	eor     $acc_mb, $acc_mb, $acc_hb
4751
4752	aese    $ctr3b, $rk11 \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 11
4753
4754	aese    $ctr2b, $rk10 \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 10
4755
4756	aese    $ctr0b, $rk12 \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 12
4757
4758	pmull   $t1.1q, $acc_m.1d, $mod_constant.1d
4759
4760	aese    $ctr2b, $rk11 \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 11
4761	ext     $acc_mb, $acc_mb, $acc_mb, #8
4762
4763	aese    $ctr3b, $rk12 \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 12
4764
4765	aese    $ctr1b, $rk13                                     @ AES block 4k+5 - round 13
4766	eor     $acc_lb, $acc_lb, $t1.16b
4767
4768	aese    $ctr2b, $rk12 \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 12
4769
4770	aese    $ctr3b, $rk13                                     @ AES block 4k+7 - round 13
4771
4772	aese    $ctr0b, $rk13                                     @ AES block 4k+4 - round 13
4773
4774	aese    $ctr2b, $rk13                                     @ AES block 4k+6 - round 13
4775	eor     $acc_lb, $acc_lb, $acc_mb
4776	.L256_enc_tail:                                           @ TAIL
4777
4778	ext     $t0.16b, $acc_lb, $acc_lb, #8                     @ prepare final partial tag
4779	sub     $main_end_input_ptr, $end_input_ptr, $input_ptr   @ main_end_input_ptr is number of bytes left to process
4780	ldp     $input_l0, $input_h0, [$input_ptr], #16           @ AES block 4k+4 - load plaintext
4781#ifdef __AARCH64EB__
4782	rev     $input_l0, $input_l0
4783	rev     $input_h0, $input_h0
4784#endif
4785	eor     $input_l0, $input_l0, $rk14_l                     @ AES block 4k+4 - round 14 low
4786	eor     $input_h0, $input_h0, $rk14_h                     @ AES block 4k+4 - round 14 high
4787
4788	cmp     $main_end_input_ptr, #48
4789	fmov    $ctr_t0d, $input_l0                               @ AES block 4k+4 - mov low
4790
4791	fmov    $ctr_t0.d[1], $input_h0                           @ AES block 4k+4 - mov high
4792
4793	eor     $res1b, $ctr_t0b, $ctr0b                          @ AES block 4k+4 - result
4794	b.gt    .L256_enc_blocks_more_than_3
4795
4796	cmp     $main_end_input_ptr, #32
4797	mov     $ctr3b, $ctr2b
4798	movi    $acc_l.8b, #0
4799
4800	movi    $acc_h.8b, #0
4801	sub     $rctr32w, $rctr32w, #1
4802
4803	mov     $ctr2b, $ctr1b
4804	movi    $acc_m.8b, #0
4805	b.gt    .L256_enc_blocks_more_than_2
4806
4807	mov     $ctr3b, $ctr1b
4808	sub     $rctr32w, $rctr32w, #1
4809	cmp     $main_end_input_ptr, #16
4810
4811	b.gt    .L256_enc_blocks_more_than_1
4812
4813	sub     $rctr32w, $rctr32w, #1
4814	b       .L256_enc_blocks_less_than_1
4815	.L256_enc_blocks_more_than_3:                            @ blocks left >  3
4816	st1     { $res1b}, [$output_ptr], #16                    @ AES final-3 block  - store result
4817
4818	ldp     $input_l0, $input_h0, [$input_ptr], #16          @ AES final-2 block - load input low & high
4819#ifdef __AARCH64EB__
4820	rev     $input_l0, $input_l0
4821	rev     $input_h0, $input_h0
4822#endif
4823	rev64   $res0b, $res1b                                   @ GHASH final-3 block
4824
4825	eor     $input_l0, $input_l0, $rk14_l                    @ AES final-2 block - round 14 low
4826	eor     $res0b, $res0b, $t0.16b                          @ feed in partial tag
4827
4828	eor     $input_h0, $input_h0, $rk14_h                    @ AES final-2 block - round 14 high
4829
4830	mov     $rk4d, $res0.d[1]                                @ GHASH final-3 block - mid
4831	fmov    $res1d, $input_l0                                @ AES final-2 block - mov low
4832
4833	fmov    $res1.d[1], $input_h0                            @ AES final-2 block - mov high
4834
4835	eor     $rk4v.8b, $rk4v.8b, $res0.8b                     @ GHASH final-3 block - mid
4836	movi    $t0.8b, #0                                       @ suppress further partial tag feed in
4837
4838	mov     $acc_md, $h34k.d[1]                              @ GHASH final-3 block - mid
4839
4840	pmull   $acc_l.1q, $res0.1d, $h4.1d                      @ GHASH final-3 block - low
4841
4842	pmull2  $acc_h.1q, $res0.2d, $h4.2d                      @ GHASH final-3 block - high
4843
4844	pmull   $acc_m.1q, $rk4v.1d, $acc_m.1d                   @ GHASH final-3 block - mid
4845	eor     $res1b, $res1b, $ctr1b                           @ AES final-2 block - result
4846	.L256_enc_blocks_more_than_2:                            @ blocks left >  2
4847
4848	st1     { $res1b}, [$output_ptr], #16                    @ AES final-2 block - store result
4849
4850	ldp     $input_l0, $input_h0, [$input_ptr], #16          @ AES final-1 block - load input low & high
4851#ifdef __AARCH64EB__
4852	rev     $input_l0, $input_l0
4853	rev     $input_h0, $input_h0
4854#endif
4855	rev64   $res0b, $res1b                                   @ GHASH final-2 block
4856
4857	eor     $input_l0, $input_l0, $rk14_l                    @ AES final-1 block - round 14 low
4858	eor     $res0b, $res0b, $t0.16b                          @ feed in partial tag
4859
4860	fmov    $res1d, $input_l0                                @ AES final-1 block - mov low
4861	eor     $input_h0, $input_h0, $rk14_h                    @ AES final-1 block - round 14 high
4862
4863	fmov    $res1.d[1], $input_h0                            @ AES final-1 block - mov high
4864
4865	movi    $t0.8b, #0                                       @ suppress further partial tag feed in
4866
4867	pmull2  $rk2q1, $res0.2d, $h3.2d                         @ GHASH final-2 block - high
4868	mov     $rk4d, $res0.d[1]                                @ GHASH final-2 block - mid
4869
4870	pmull   $rk3q1, $res0.1d, $h3.1d                         @ GHASH final-2 block - low
4871
4872	eor     $rk4v.8b, $rk4v.8b, $res0.8b                     @ GHASH final-2 block - mid
4873
4874	eor     $res1b, $res1b, $ctr2b                           @ AES final-1 block - result
4875
4876	eor     $acc_hb, $acc_hb, $rk2                           @ GHASH final-2 block - high
4877
4878	pmull   $rk4v.1q, $rk4v.1d, $h34k.1d                     @ GHASH final-2 block - mid
4879
4880	eor     $acc_lb, $acc_lb, $rk3                           @ GHASH final-2 block - low
4881
4882	eor     $acc_mb, $acc_mb, $rk4v.16b                      @ GHASH final-2 block - mid
4883	.L256_enc_blocks_more_than_1:                            @ blocks left >  1
4884
4885	st1     { $res1b}, [$output_ptr], #16                    @ AES final-1 block - store result
4886
4887	rev64   $res0b, $res1b                                   @ GHASH final-1 block
4888
4889	ldp     $input_l0, $input_h0, [$input_ptr], #16          @ AES final block - load input low & high
4890#ifdef __AARCH64EB__
4891	rev     $input_l0, $input_l0
4892	rev     $input_h0, $input_h0
4893#endif
4894	eor     $res0b, $res0b, $t0.16b                          @ feed in partial tag
4895
4896	movi    $t0.8b, #0                                       @ suppress further partial tag feed in
4897
4898	eor     $input_l0, $input_l0, $rk14_l                    @ AES final block - round 14 low
4899	mov     $rk4d, $res0.d[1]                                @ GHASH final-1 block - mid
4900
4901	pmull2  $rk2q1, $res0.2d, $h2.2d                         @ GHASH final-1 block - high
4902	eor     $input_h0, $input_h0, $rk14_h                    @ AES final block - round 14 high
4903
4904	eor     $rk4v.8b, $rk4v.8b, $res0.8b                     @ GHASH final-1 block - mid
4905
4906	eor     $acc_hb, $acc_hb, $rk2                           @ GHASH final-1 block - high
4907
4908	ins     $rk4v.d[1], $rk4v.d[0]                           @ GHASH final-1 block - mid
4909	fmov    $res1d, $input_l0                                @ AES final block - mov low
4910
4911	fmov    $res1.d[1], $input_h0                            @ AES final block - mov high
4912
4913	pmull2  $rk4v.1q, $rk4v.2d, $h12k.2d                     @ GHASH final-1 block - mid
4914
4915	pmull   $rk3q1, $res0.1d, $h2.1d                         @ GHASH final-1 block - low
4916
4917	eor     $res1b, $res1b, $ctr3b                           @ AES final block - result
4918	eor     $acc_mb, $acc_mb, $rk4v.16b                      @ GHASH final-1 block - mid
4919
4920	eor     $acc_lb, $acc_lb, $rk3                           @ GHASH final-1 block - low
4921	.L256_enc_blocks_less_than_1:                            @ blocks left <= 1
4922
4923	and     $bit_length, $bit_length, #127                   @ bit_length %= 128
4924
4925	mvn     $rk14_l, xzr                                     @ rk14_l = 0xffffffffffffffff
4926	sub     $bit_length, $bit_length, #128                   @ bit_length -= 128
4927
4928	neg     $bit_length, $bit_length                         @ bit_length = 128 - #bits in input (in range [1,128])
4929	ld1     { $rk0}, [$output_ptr]                           @ load existing bytes where the possibly partial last block is to be stored
4930
4931	mvn     $rk14_h, xzr                                     @ rk14_h = 0xffffffffffffffff
4932	and     $bit_length, $bit_length, #127                   @ bit_length %= 128
4933
4934	lsr     $rk14_h, $rk14_h, $bit_length                    @ rk14_h is mask for top 64b of last block
4935	cmp     $bit_length, #64
4936
4937	csel    $input_l0, $rk14_l, $rk14_h, lt
4938	csel    $input_h0, $rk14_h, xzr, lt
4939
4940	fmov    $ctr0d, $input_l0                                @ ctr0b is mask for last block
4941
4942	fmov    $ctr0.d[1], $input_h0
4943
4944	and     $res1b, $res1b, $ctr0b                           @ possibly partial last block has zeroes in highest bits
4945
4946	rev64   $res0b, $res1b                                   @ GHASH final block
4947
4948	eor     $res0b, $res0b, $t0.16b                          @ feed in partial tag
4949
4950	bif     $res1b, $rk0, $ctr0b                             @ insert existing bytes in top end of result before storing
4951
4952	pmull2  $rk2q1, $res0.2d, $h1.2d                         @ GHASH final block - high
4953	mov     $t0d, $res0.d[1]                                 @ GHASH final block - mid
4954#ifndef __AARCH64EB__
4955	rev     $ctr32w, $rctr32w
4956#else
4957	mov     $ctr32w, $rctr32w
4958#endif
4959
4960	pmull   $rk3q1, $res0.1d, $h1.1d                         @ GHASH final block - low
4961
4962	eor     $acc_hb, $acc_hb, $rk2                           @ GHASH final block - high
4963	eor     $t0.8b, $t0.8b, $res0.8b                         @ GHASH final block - mid
4964
4965	pmull   $t0.1q, $t0.1d, $h12k.1d                         @ GHASH final block - mid
4966
4967	eor     $acc_lb, $acc_lb, $rk3                           @ GHASH final block - low
4968
4969	eor     $acc_mb, $acc_mb, $t0.16b                        @ GHASH final block - mid
4970	movi    $mod_constant.8b, #0xc2
4971
4972	eor     $t9.16b, $acc_lb, $acc_hb                        @ MODULO - karatsuba tidy up
4973
4974	shl     $mod_constantd, $mod_constantd, #56              @ mod_constant
4975
4976	eor     $acc_mb, $acc_mb, $t9.16b                        @ MODULO - karatsuba tidy up
4977
4978	pmull   $mod_t.1q, $acc_h.1d, $mod_constant.1d           @ MODULO - top 64b align with mid
4979
4980	ext     $acc_hb, $acc_hb, $acc_hb, #8                    @ MODULO - other top alignment
4981
4982	eor     $acc_mb, $acc_mb, $mod_t.16b                     @ MODULO - fold into mid
4983
4984	eor     $acc_mb, $acc_mb, $acc_hb                        @ MODULO - fold into mid
4985
4986	pmull   $acc_h.1q, $acc_m.1d, $mod_constant.1d           @ MODULO - mid 64b align with low
4987
4988	ext     $acc_mb, $acc_mb, $acc_mb, #8                    @ MODULO - other mid alignment
4989
4990	str     $ctr32w, [$counter, #12]                         @ store the updated counter
4991
4992	st1     { $res1b}, [$output_ptr]                         @ store all 16B
4993	eor     $acc_lb, $acc_lb, $acc_hb                        @ MODULO - fold into low
4994
4995	eor     $acc_lb, $acc_lb, $acc_mb                        @ MODULO - fold into low
4996	ext     $acc_lb, $acc_lb, $acc_lb, #8
4997	rev64   $acc_lb, $acc_lb
4998	mov     x0, $len
4999	st1     { $acc_l.16b }, [$current_tag]
5000
5001	ldp     x21, x22, [sp, #16]
5002	ldp     x23, x24, [sp, #32]
5003	ldp     d8, d9, [sp, #48]
5004	ldp     d10, d11, [sp, #64]
5005	ldp     d12, d13, [sp, #80]
5006	ldp     d14, d15, [sp, #96]
5007	ldp     x19, x20, [sp], #112
5008	ret
5009
5010.L256_enc_ret:
5011	mov w0, #0x0
5012	ret
5013.size aes_gcm_enc_256_kernel,.-aes_gcm_enc_256_kernel
5014___
5015
5016{
5017my $t8="v4";
5018my $t8d="d4";
5019my $t9="v6";
5020my $t9d="d6";
5021#########################################################################################
5022# size_t aes_gcm_dec_256_kernel(const unsigned char *in,
5023#                               size_t len,
5024#                               unsigned char *out,
5025#                               const void *key,
5026#                               unsigned char ivec[16],
5027#                               u64 *Xi);
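5028#
# NOTE(review): the prototype above may not match the register usage below --
# confirm against the C caller and the register aliases defined earlier in
# this file:
#   - the length is consumed as a bit count: the prologue derives
#     byte_len as ($bit_length >> 3);
#   - on entry x4 is copied to x16 and x5 to x8; if those are $counter and
#     $cc respectively, the last three arguments are really
#     (u64 *Xi, unsigned char ivec[16], const void *key), in that order.
#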
5029$code.=<<___;
5030.global aes_gcm_dec_256_kernel
5031.type   aes_gcm_dec_256_kernel,%function
5032.align  4
5033aes_gcm_dec_256_kernel:
5034	AARCH64_VALID_CALL_TARGET
5035	cbz     x1, .L256_dec_ret
5036	stp     x19, x20, [sp, #-112]!
5037	mov     x16, x4
5038	mov     x8, x5
5039	stp     x21, x22, [sp, #16]
5040	stp     x23, x24, [sp, #32]
5041	stp     d8, d9, [sp, #48]
5042	stp     d10, d11, [sp, #64]
5043	stp     d12, d13, [sp, #80]
5044	stp     d14, d15, [sp, #96]
5045
5046	lsr     $main_end_input_ptr, $bit_length, #3              @ byte_len
5047	mov     $len, $main_end_input_ptr
5048	ldp     $ctr96_b64x, $ctr96_t32x, [$counter]              @ ctr96_b64, ctr96_t32
5049#ifdef __AARCH64EB__
5050	rev     $ctr96_b64x, $ctr96_b64x
5051	rev     $ctr96_t32x, $ctr96_t32x
5052#endif
5053	ldp     $rk14_l, $rk14_h, [$cc, #224]                     @ load rk14
5054#ifdef __AARCH64EB__
5055	ror     $rk14_h, $rk14_h, #32
5056	ror     $rk14_l, $rk14_l, #32
5057#endif
5058	ld1     {$rk0s}, [$cc], #16                               @ load rk0
5059	sub     $main_end_input_ptr, $main_end_input_ptr, #1      @ byte_len - 1
5060
5061	ld1     {$rk1s}, [$cc], #16                               @ load rk1
5062	and     $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail)
5063
5064	add     $end_input_ptr, $input_ptr, $bit_length, lsr #3   @ end_input_ptr
5065	ld1     {$rk2s}, [$cc], #16                               @ load rk2
5066
5067	lsr     $rctr32x, $ctr96_t32x, #32
5068	ld1     {$rk3s}, [$cc], #16                               @ load rk3
5069	orr     $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
5070
5071	ld1     {$rk4s}, [$cc], #16                               @ load rk4
5072	add     $main_end_input_ptr, $main_end_input_ptr, $input_ptr
5073	rev     $rctr32w, $rctr32w                                @ rev_ctr32
5074
5075	add     $rctr32w, $rctr32w, #1                            @ increment rev_ctr32
5076	fmov    $ctr3d, $ctr96_b64x                               @ CTR block 3
5077
5078	rev     $ctr32w, $rctr32w                                 @ CTR block 1
5079	add     $rctr32w, $rctr32w, #1                            @ CTR block 1
5080	fmov    $ctr1d, $ctr96_b64x                               @ CTR block 1
5081
5082	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 1
5083	ld1     { $ctr0b}, [$counter]                             @ special case vector load initial counter so we can start first AES block as quickly as possible
5084
5085	fmov    $ctr1.d[1], $ctr32x                               @ CTR block 1
5086	rev     $ctr32w, $rctr32w                                 @ CTR block 2
5087	add     $rctr32w, $rctr32w, #1                            @ CTR block 2
5088
5089	fmov    $ctr2d, $ctr96_b64x                               @ CTR block 2
5090	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 2
5091
5092	fmov    $ctr2.d[1], $ctr32x                               @ CTR block 2
5093	rev     $ctr32w, $rctr32w                                 @ CTR block 3
5094
5095	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 3
5096	ld1     {$rk5s}, [$cc], #16                               @ load rk5
5097
5098	fmov    $ctr3.d[1], $ctr32x                               @ CTR block 3
5099	add     $rctr32w, $rctr32w, #1                            @ CTR block 3
5100
5101	ld1     {$rk6s}, [$cc], #16                               @ load rk6
5102
5103	ld1     {$rk7s}, [$cc], #16                               @ load rk7
5104
5105	ld1     {$rk8s}, [$cc], #16                               @ load rk8
5106
5107	aese    $ctr0b, $rk0  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 0
5108	ldr     $h3q, [$current_tag, #80]                         @ load h3l | h3h
5109#ifndef __AARCH64EB__
5110	ext     $h3b, $h3b, $h3b, #8
5111#endif
5112
5113	aese    $ctr3b, $rk0  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 0
5114	ldr     $h4q, [$current_tag, #112]                        @ load h4l | h4h
5115#ifndef __AARCH64EB__
5116	ext     $h4b, $h4b, $h4b, #8
5117#endif
5118
5119	aese    $ctr1b, $rk0  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 0
5120	ldr     $h2q, [$current_tag, #64]                         @ load h2l | h2h
5121#ifndef __AARCH64EB__
5122	ext     $h2b, $h2b, $h2b, #8
5123#endif
5124
5125	aese    $ctr2b, $rk0  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 0
5126	ld1     {$rk9s}, [$cc], #16                                 @ load rk9
5127
5128	aese    $ctr0b, $rk1  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 1
5129
5130	aese    $ctr1b, $rk1  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 1
5131	ld1     { $acc_lb}, [$current_tag]
5132	ext     $acc_lb, $acc_lb, $acc_lb, #8
5133	rev64   $acc_lb, $acc_lb
5134
5135	aese    $ctr2b, $rk1  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 1
5136	ld1     {$rk10s}, [$cc], #16                              @ load rk10
5137
5138	aese    $ctr3b, $rk1  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 1
5139	ld1     {$rk11s}, [$cc], #16                              @ load rk11
5140
5141	aese    $ctr0b, $rk2  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 2
5142	ldr     $h1q, [$current_tag, #32]                         @ load h1l | h1h
5143#ifndef __AARCH64EB__
5144	ext     $h1b, $h1b, $h1b, #8
5145#endif
5146	aese    $ctr2b, $rk2  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 2
5147	ld1     {$rk12s}, [$cc], #16                              @ load rk12
5148
5149	aese    $ctr3b, $rk2  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 2
5150
5151	aese    $ctr0b, $rk3  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 3
5152
5153	aese    $ctr1b, $rk2  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 2
5154
5155	aese    $ctr3b, $rk3  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 3
5156
5157	aese    $ctr0b, $rk4  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 4
5158	cmp     $input_ptr, $main_end_input_ptr                   @ check if we have <= 4 blocks
5159
5160	aese    $ctr2b, $rk3  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 3
5161
5162	aese    $ctr1b, $rk3  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 3
5163
5164	aese    $ctr3b, $rk4  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 4
5165
5166	aese    $ctr2b, $rk4  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 4
5167
5168	aese    $ctr1b, $rk4  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 4
5169
5170	aese    $ctr3b, $rk5  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 5
5171
5172	aese    $ctr0b, $rk5  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 5
5173
5174	aese    $ctr1b, $rk5  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 5
5175
5176	aese    $ctr2b, $rk5  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 5
5177
5178	aese    $ctr0b, $rk6  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 6
5179
5180	aese    $ctr3b, $rk6  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 6
5181
5182	aese    $ctr1b, $rk6  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 6
5183
5184	aese    $ctr2b, $rk6  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 6
5185
5186	aese    $ctr0b, $rk7  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 7
5187
5188	aese    $ctr1b, $rk7  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 7
5189
5190	aese    $ctr3b, $rk7  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 7
5191
5192	aese    $ctr0b, $rk8  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 8
5193
5194	aese    $ctr2b, $rk7  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 7
5195
5196	aese    $ctr3b, $rk8  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 8
5197
5198	aese    $ctr1b, $rk8  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 8
5199
5200	aese    $ctr0b, $rk9  \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 9
5201
5202	aese    $ctr2b, $rk8  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 8
5203	ld1     {$rk13s}, [$cc], #16                             @ load rk13
5204
5205	aese    $ctr1b, $rk9  \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 9
5206
5207	aese    $ctr0b, $rk10 \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 10
5208
5209	aese    $ctr3b, $rk9  \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 9
5210
5211	aese    $ctr1b, $rk10 \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 10
5212
5213	aese    $ctr2b, $rk9  \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 9
5214
5215	aese    $ctr3b, $rk10 \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 10
5216
5217	aese    $ctr0b, $rk11 \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 11
5218
5219	aese    $ctr2b, $rk10 \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 10
5220
5221	aese    $ctr3b, $rk11 \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 11
5222
5223	aese    $ctr1b, $rk11 \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 11
5224
5225	aese    $ctr2b, $rk11 \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 11
5226
5227	trn1    $acc_h.2d, $h3.2d,    $h4.2d                      @ h4h | h3h
5228
5229	trn2    $h34k.2d,  $h3.2d,    $h4.2d                      @ h4l | h3l
5230
5231	trn1    $t0.2d,    $h1.2d,    $h2.2d                      @ h2h | h1h
5232	trn2    $h12k.2d,  $h1.2d,    $h2.2d                      @ h2l | h1l
5233
5234	aese    $ctr1b, $rk12 \n  aesmc   $ctr1b, $ctr1b          @ AES block 1 - round 12
5235
5236	aese    $ctr0b, $rk12 \n  aesmc   $ctr0b, $ctr0b          @ AES block 0 - round 12
5237
5238	aese    $ctr2b, $rk12 \n  aesmc   $ctr2b, $ctr2b          @ AES block 2 - round 12
5239
5240	aese    $ctr3b, $rk12 \n  aesmc   $ctr3b, $ctr3b          @ AES block 3 - round 12
5241	eor     $h34k.16b, $h34k.16b, $acc_h.16b                  @ h4k | h3k
5242
5243	aese    $ctr1b, $rk13                                     @ AES block 1 - round 13
5244
5245	aese    $ctr2b, $rk13                                     @ AES block 2 - round 13
5246	eor     $h12k.16b, $h12k.16b, $t0.16b                     @ h2k | h1k
5247
5248	aese    $ctr3b, $rk13                                     @ AES block 3 - round 13
5249
5250	aese    $ctr0b, $rk13                                     @ AES block 0 - round 13
5251	b.ge    .L256_dec_tail                                    @ handle tail
5252
5253	ld1     {$res0b, $res1b}, [$input_ptr], #32               @ AES block 0,1 - load ciphertext
5254
5255	rev     $ctr32w, $rctr32w                                 @ CTR block 4
5256
5257	eor     $ctr0b, $res0b, $ctr0b                            @ AES block 0 - result
5258
5259	eor     $ctr1b, $res1b, $ctr1b                            @ AES block 1 - result
5260	rev64   $res1b, $res1b                                    @ GHASH block 1
5261	ld1     {$res2b}, [$input_ptr], #16                       @ AES block 2 - load ciphertext
5262
5263	mov     $output_h0, $ctr0.d[1]                            @ AES block 0 - mov high
5264
5265	mov     $output_l0, $ctr0.d[0]                            @ AES block 0 - mov low
5266	rev64   $res0b, $res0b                                    @ GHASH block 0
5267	add     $rctr32w, $rctr32w, #1                            @ CTR block 4
5268
5269	fmov    $ctr0d, $ctr96_b64x                               @ CTR block 4
5270	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 4
5271
5272	fmov    $ctr0.d[1], $ctr32x                               @ CTR block 4
5273	rev     $ctr32w, $rctr32w                                 @ CTR block 5
5274	add     $rctr32w, $rctr32w, #1                            @ CTR block 5
5275
5276	mov     $output_l1, $ctr1.d[0]                            @ AES block 1 - mov low
5277
5278	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 5
5279	mov     $output_h1, $ctr1.d[1]                            @ AES block 1 - mov high
5280	eor     $output_h0, $output_h0, $rk14_h                   @ AES block 0 - round 14 high
5281#ifdef __AARCH64EB__
5282	rev     $output_h0, $output_h0
5283#endif
5284	eor     $output_l0, $output_l0, $rk14_l                   @ AES block 0 - round 14 low
5285#ifdef __AARCH64EB__
5286	rev     $output_l0, $output_l0
5287#endif
5288	stp     $output_l0, $output_h0, [$output_ptr], #16        @ AES block 0 - store result
5289	fmov    $ctr1d, $ctr96_b64x                               @ CTR block 5
5290
5291	ld1     {$res3b}, [$input_ptr], #16                       @ AES block 3 - load ciphertext
5292
5293	fmov    $ctr1.d[1], $ctr32x                               @ CTR block 5
5294	rev     $ctr32w, $rctr32w                                 @ CTR block 6
5295	add     $rctr32w, $rctr32w, #1                            @ CTR block 6
5296
5297	eor     $output_l1, $output_l1, $rk14_l                   @ AES block 1 - round 14 low
5298#ifdef __AARCH64EB__
5299	rev     $output_l1, $output_l1
5300#endif
5301	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 6
5302
5303	eor     $output_h1, $output_h1, $rk14_h                   @ AES block 1 - round 14 high
5304#ifdef __AARCH64EB__
5305	rev     $output_h1, $output_h1
5306#endif
5307	stp     $output_l1, $output_h1, [$output_ptr], #16        @ AES block 1 - store result
5308
5309	eor     $ctr2b, $res2b, $ctr2b                            @ AES block 2 - result
5310	cmp     $input_ptr, $main_end_input_ptr                   @ check if we have <= 8 blocks
5311	b.ge    .L256_dec_prepretail                              @ do prepretail
5312
5313	.L256_dec_main_loop:                                      @ main loop start
5314	mov     $output_l2, $ctr2.d[0]                            @ AES block 4k+2 - mov low
5315	ext     $acc_lb, $acc_lb, $acc_lb, #8                     @ PRE 0
5316	eor     $ctr3b, $res3b, $ctr3b                            @ AES block 4k+3 - result
5317
5318	aese    $ctr0b, $rk0  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 0
5319	mov     $output_h2, $ctr2.d[1]                            @ AES block 4k+2 - mov high
5320
5321	aese    $ctr1b, $rk0  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 0
5322	fmov    $ctr2d, $ctr96_b64x                               @ CTR block 4k+6
5323
5324	fmov    $ctr2.d[1], $ctr32x                               @ CTR block 4k+6
5325	eor     $res0b, $res0b, $acc_lb                           @ PRE 1
5326	rev     $ctr32w, $rctr32w                                 @ CTR block 4k+7
5327
5328	aese    $ctr0b, $rk1  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 1
5329	mov     $output_h3, $ctr3.d[1]                            @ AES block 4k+3 - mov high
5330
5331	aese    $ctr1b, $rk1  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 1
5332	mov     $output_l3, $ctr3.d[0]                            @ AES block 4k+3 - mov low
5333
5334	pmull2  $acc_h.1q, $res0.2d, $h4.2d                       @ GHASH block 4k - high
5335	mov     $t0d, $res0.d[1]                                  @ GHASH block 4k - mid
5336	fmov    $ctr3d, $ctr96_b64x                               @ CTR block 4k+7
5337
5338	aese    $ctr0b, $rk2  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 2
5339	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 4k+7
5340
5341	aese    $ctr2b, $rk0  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 0
5342	fmov    $ctr3.d[1], $ctr32x                               @ CTR block 4k+7
5343
5344	aese    $ctr1b, $rk2  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 2
5345	eor     $t0.8b, $t0.8b, $res0.8b                          @ GHASH block 4k - mid
5346
5347	aese    $ctr0b, $rk3  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 3
5348	eor     $output_h2, $output_h2, $rk14_h                   @ AES block 4k+2 - round 14 high
5349#ifdef __AARCH64EB__
5350	rev     $output_h2, $output_h2
5351#endif
5352	aese    $ctr2b, $rk1  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 1
5353	mov     $acc_md, $h34k.d[1]                               @ GHASH block 4k - mid
5354
5355	aese    $ctr1b, $rk3  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 3
5356	rev64   $res2b, $res2b                                    @ GHASH block 4k+2
5357
5358	aese    $ctr3b, $rk0  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 0
5359	eor     $output_l2, $output_l2, $rk14_l                   @ AES block 4k+2 - round 14 low
5360#ifdef __AARCH64EB__
5361	rev     $output_l2, $output_l2
5362#endif
5363	aese    $ctr2b, $rk2  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 2
5364	stp     $output_l2, $output_h2, [$output_ptr], #16        @ AES block 4k+2 - store result
5365
5366	pmull   $acc_l.1q, $res0.1d, $h4.1d                       @ GHASH block 4k - low
5367
5368	pmull2  $t1.1q, $res1.2d, $h3.2d                          @ GHASH block 4k+1 - high
5369
5370	aese    $ctr2b, $rk3  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 3
5371	rev64   $res3b, $res3b                                    @ GHASH block 4k+3
5372
5373	pmull   $acc_m.1q, $t0.1d, $acc_m.1d                      @ GHASH block 4k - mid
5374	eor     $output_l3, $output_l3, $rk14_l                   @ AES block 4k+3 - round 14 low
5375#ifdef __AARCH64EB__
5376	rev     $output_l3, $output_l3
5377#endif
5378	pmull   $t2.1q, $res1.1d, $h3.1d                          @ GHASH block 4k+1 - low
5379	eor     $output_h3, $output_h3, $rk14_h                   @ AES block 4k+3 - round 14 high
5380#ifdef __AARCH64EB__
5381	rev     $output_h3, $output_h3
5382#endif
5383	eor     $acc_hb, $acc_hb, $t1.16b                         @ GHASH block 4k+1 - high
5384
5385	aese    $ctr2b, $rk4  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 4
5386
5387	aese    $ctr3b, $rk1  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 1
5388	mov     $t3d, $res1.d[1]                                  @ GHASH block 4k+1 - mid
5389
5390	aese    $ctr0b, $rk4  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 4
5391	eor     $acc_lb, $acc_lb, $t2.16b                         @ GHASH block 4k+1 - low
5392
5393	aese    $ctr2b, $rk5  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 5
5394	add     $rctr32w, $rctr32w, #1                            @ CTR block 4k+7
5395
5396	aese    $ctr3b, $rk2  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 2
5397	mov     $t6d, $res2.d[1]                                  @ GHASH block 4k+2 - mid
5398
5399	aese    $ctr1b, $rk4  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 4
5400	eor     $t3.8b, $t3.8b, $res1.8b                          @ GHASH block 4k+1 - mid
5401
5402	pmull   $t5.1q, $res2.1d, $h2.1d                          @ GHASH block 4k+2 - low
5403
5404	aese    $ctr3b, $rk3  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 3
5405	eor     $t6.8b, $t6.8b, $res2.8b                          @ GHASH block 4k+2 - mid
5406
5407	aese    $ctr1b, $rk5  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 5
5408
5409	aese    $ctr0b, $rk5  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 5
5410	eor     $acc_lb, $acc_lb, $t5.16b                         @ GHASH block 4k+2 - low
5411
5412	pmull   $t3.1q, $t3.1d, $h34k.1d                          @ GHASH block 4k+1 - mid
5413	rev     $ctr32w, $rctr32w                                 @ CTR block 4k+8
5414
5415	aese    $ctr1b, $rk6  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 6
5416	ins     $t6.d[1], $t6.d[0]                                @ GHASH block 4k+2 - mid
5417
5418	aese    $ctr0b, $rk6  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 6
5419	add     $rctr32w, $rctr32w, #1                            @ CTR block 4k+8
5420
5421	aese    $ctr3b, $rk4  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 4
5422
5423	aese    $ctr1b, $rk7  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 7
5424	eor     $acc_mb, $acc_mb, $t3.16b                         @ GHASH block 4k+1 - mid
5425
5426	aese    $ctr0b, $rk7  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 7
5427
5428	pmull2  $t4.1q, $res2.2d, $h2.2d                          @ GHASH block 4k+2 - high
5429	mov     $t9d, $res3.d[1]                                  @ GHASH block 4k+3 - mid
5430
5431	aese    $ctr3b, $rk5  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 5
5432
5433	pmull2  $t6.1q, $t6.2d, $h12k.2d                          @ GHASH block 4k+2 - mid
5434
5435	aese    $ctr0b, $rk8  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 8
5436	eor     $acc_hb, $acc_hb, $t4.16b                         @ GHASH block 4k+2 - high
5437
5438	aese    $ctr3b, $rk6  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 6
5439
5440	pmull   $t8.1q, $res3.1d, $h1.1d                          @ GHASH block 4k+3 - low
5441	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 4k+8
5442	eor     $acc_mb, $acc_mb, $t6.16b                         @ GHASH block 4k+2 - mid
5443
5444	pmull2  $t7.1q, $res3.2d, $h1.2d                          @ GHASH block 4k+3 - high
5445
5446	aese    $ctr0b, $rk9  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 9
5447	eor     $t9.8b, $t9.8b, $res3.8b                          @ GHASH block 4k+3 - mid
5448
5449	aese    $ctr1b, $rk8  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 8
5450
5451	aese    $ctr2b, $rk6  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 6
5452	eor     $acc_hb, $acc_hb, $t7.16b                         @ GHASH block 4k+3 - high
5453
5454	aese    $ctr0b, $rk10 \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 10
5455
5456	pmull   $t9.1q, $t9.1d, $h12k.1d                          @ GHASH block 4k+3 - mid
5457	movi    $mod_constant.8b, #0xc2
5458
5459	aese    $ctr2b, $rk7  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 7
5460	eor     $acc_lb, $acc_lb, $t8.16b                         @ GHASH block 4k+3 - low
5461
5462	aese    $ctr0b, $rk11 \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 11
5463
5464	aese    $ctr3b, $rk7  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 7
5465	shl     $mod_constantd, $mod_constantd, #56               @ mod_constant
5466
5467	aese    $ctr2b, $rk8  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 8
5468	eor     $acc_mb, $acc_mb, $t9.16b                         @ GHASH block 4k+3 - mid
5469
5470	aese    $ctr0b, $rk12 \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 12
5471
5472	pmull   $mod_t.1q, $acc_h.1d, $mod_constant.1d            @ MODULO - top 64b align with mid
5473	eor     $t9.16b, $acc_lb, $acc_hb                         @ MODULO - karatsuba tidy up
5474
5475	aese    $ctr1b, $rk9  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 9
5476	ld1     {$res0b}, [$input_ptr], #16                       @ AES block 4k+4 - load ciphertext
5477
5478	aese    $ctr0b, $rk13                                     @ AES block 4k+4 - round 13
5479	ext     $acc_hb, $acc_hb, $acc_hb, #8                     @ MODULO - other top alignment
5480
5481	aese    $ctr1b, $rk10 \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 10
5482	eor     $acc_mb, $acc_mb, $t9.16b                         @ MODULO - karatsuba tidy up
5483
5484	aese    $ctr2b, $rk9  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 9
5485	ld1     {$res1b}, [$input_ptr], #16                       @ AES block 4k+5 - load ciphertext
5486
5487	aese    $ctr3b, $rk8  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 8
5488	eor     $ctr0b, $res0b, $ctr0b                            @ AES block 4k+4 - result
5489
5490	aese    $ctr1b, $rk11 \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 11
5491	stp     $output_l3, $output_h3, [$output_ptr], #16        @ AES block 4k+3 - store result
5492
5493	aese    $ctr2b, $rk10 \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 10
5494	eor     $acc_mb, $acc_mb, $mod_t.16b                      @ MODULO - fold into mid
5495
5496	aese    $ctr3b, $rk9  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 9
5497	ld1     {$res2b}, [$input_ptr], #16                       @ AES block 4k+6 - load ciphertext
5498
5499	aese    $ctr1b, $rk12 \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 12
5500	ld1     {$res3b}, [$input_ptr], #16                       @ AES block 4k+7 - load ciphertext
5501
5502	aese    $ctr2b, $rk11 \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 11
5503	mov     $output_h0, $ctr0.d[1]                            @ AES block 4k+4 - mov high
5504
5505	aese    $ctr3b, $rk10 \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 10
5506	eor     $acc_mb, $acc_mb, $acc_hb                         @ MODULO - fold into mid
5507
5508	aese    $ctr1b, $rk13                                     @ AES block 4k+5 - round 13
5509	mov     $output_l0, $ctr0.d[0]                            @ AES block 4k+4 - mov low
5510
5511	aese    $ctr2b, $rk12 \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 12
5512	fmov    $ctr0d, $ctr96_b64x                               @ CTR block 4k+8
5513
5514	aese    $ctr3b, $rk11 \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 11
5515	fmov    $ctr0.d[1], $ctr32x                               @ CTR block 4k+8
5516
5517	pmull   $mod_constant.1q, $acc_m.1d, $mod_constant.1d     @ MODULO - mid 64b align with low
5518	eor     $ctr1b, $res1b, $ctr1b                            @ AES block 4k+5 - result
5519	rev     $ctr32w, $rctr32w                                 @ CTR block 4k+9
5520
5521	aese    $ctr2b, $rk13                                     @ AES block 4k+6 - round 13
5522	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 4k+9
5523	cmp     $input_ptr, $main_end_input_ptr                   @ LOOP CONTROL
5524
5525	add     $rctr32w, $rctr32w, #1                            @ CTR block 4k+9
5526
5527	eor     $output_l0, $output_l0, $rk14_l                   @ AES block 4k+4 - round 14 low
5528#ifdef __AARCH64EB__
5529	rev     $output_l0, $output_l0
5530#endif
5531	eor     $output_h0, $output_h0, $rk14_h                   @ AES block 4k+4 - round 14 high
5532#ifdef __AARCH64EB__
5533	rev     $output_h0, $output_h0
5534#endif
5535	mov     $output_h1, $ctr1.d[1]                            @ AES block 4k+5 - mov high
5536	eor     $ctr2b, $res2b, $ctr2b                            @ AES block 4k+6 - result
5537	eor     $acc_lb, $acc_lb, $mod_constant.16b               @ MODULO - fold into low
5538
5539	aese    $ctr3b, $rk12 \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 12
5540	mov     $output_l1, $ctr1.d[0]                            @ AES block 4k+5 - mov low
5541
5542	fmov    $ctr1d, $ctr96_b64x                               @ CTR block 4k+9
5543	ext     $acc_mb, $acc_mb, $acc_mb, #8                     @ MODULO - other mid alignment
5544
5545	fmov    $ctr1.d[1], $ctr32x                               @ CTR block 4k+9
5546	rev     $ctr32w, $rctr32w                                 @ CTR block 4k+10
5547	add     $rctr32w, $rctr32w, #1                            @ CTR block 4k+10
5548
5549	aese    $ctr3b, $rk13                                     @ AES block 4k+7 - round 13
5550	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 4k+10
5551
5552	rev64   $res1b, $res1b                                    @ GHASH block 4k+5
5553	eor     $output_h1, $output_h1, $rk14_h                   @ AES block 4k+5 - round 14 high
5554#ifdef __AARCH64EB__
5555	rev     $output_h1, $output_h1
5556#endif
5557	stp     $output_l0, $output_h0, [$output_ptr], #16        @ AES block 4k+4 - store result
5558
5559	eor     $output_l1, $output_l1, $rk14_l                   @ AES block 4k+5 - round 14 low
5560#ifdef __AARCH64EB__
5561	rev     $output_l1, $output_l1
5562#endif
5563	stp     $output_l1, $output_h1, [$output_ptr], #16        @ AES block 4k+5 - store result
5564
5565	rev64   $res0b, $res0b                                    @ GHASH block 4k+4
5566	eor     $acc_lb, $acc_lb, $acc_mb                         @ MODULO - fold into low
5567	b.lt    .L256_dec_main_loop
5568
5569
5570	.L256_dec_prepretail:                                     @ PREPRETAIL
5571	ext     $acc_lb, $acc_lb, $acc_lb, #8                     @ PRE 0
5572	mov     $output_l2, $ctr2.d[0]                            @ AES block 4k+2 - mov low
5573	eor     $ctr3b, $res3b, $ctr3b                            @ AES block 4k+3 - result
5574
5575	aese    $ctr0b, $rk0  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 0
5576	mov     $output_h2, $ctr2.d[1]                            @ AES block 4k+2 - mov high
5577
5578	aese    $ctr1b, $rk0  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 0
5579	fmov    $ctr2d, $ctr96_b64x                               @ CTR block 4k+6
5580
5581	fmov    $ctr2.d[1], $ctr32x                               @ CTR block 4k+6
5582	rev     $ctr32w, $rctr32w                                 @ CTR block 4k+7
5583	eor     $res0b, $res0b, $acc_lb                           @ PRE 1
5584
5585	rev64   $res2b, $res2b                                    @ GHASH block 4k+2
5586	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32            @ CTR block 4k+7
5587	mov     $output_l3, $ctr3.d[0]                            @ AES block 4k+3 - mov low
5588
5589	aese    $ctr1b, $rk1  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 1
5590	mov     $output_h3, $ctr3.d[1]                            @ AES block 4k+3 - mov high
5591
5592	pmull   $acc_l.1q, $res0.1d, $h4.1d                       @ GHASH block 4k - low
5593	mov     $t0d, $res0.d[1]                                  @ GHASH block 4k - mid
5594	fmov    $ctr3d, $ctr96_b64x                               @ CTR block 4k+7
5595
5596	pmull2  $acc_h.1q, $res0.2d, $h4.2d                       @ GHASH block 4k - high
5597	fmov    $ctr3.d[1], $ctr32x                               @ CTR block 4k+7
5598
5599	aese    $ctr2b, $rk0  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 0
5600	mov     $acc_md, $h34k.d[1]                               @ GHASH block 4k - mid
5601
5602	aese    $ctr0b, $rk1  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 1
5603	eor     $t0.8b, $t0.8b, $res0.8b                          @ GHASH block 4k - mid
5604
5605	pmull2  $t1.1q, $res1.2d, $h3.2d                          @ GHASH block 4k+1 - high
5606
5607	aese    $ctr2b, $rk1  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 1
5608	rev64   $res3b, $res3b                                    @ GHASH block 4k+3
5609
5610	aese    $ctr3b, $rk0  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 0
5611
5612	pmull   $acc_m.1q, $t0.1d, $acc_m.1d                      @ GHASH block 4k - mid
5613	eor     $acc_hb, $acc_hb, $t1.16b                         @ GHASH block 4k+1 - high
5614
5615	pmull   $t2.1q, $res1.1d, $h3.1d                          @ GHASH block 4k+1 - low
5616
5617	aese    $ctr3b, $rk1  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 1
5618	mov     $t3d, $res1.d[1]                                  @ GHASH block 4k+1 - mid
5619
5620	aese    $ctr0b, $rk2  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 2
5621
5622	aese    $ctr1b, $rk2  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 2
5623	eor     $acc_lb, $acc_lb, $t2.16b                         @ GHASH block 4k+1 - low
5624
5625	aese    $ctr2b, $rk2  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 2
5626
5627	aese    $ctr0b, $rk3  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 3
5628	mov     $t6d, $res2.d[1]                                  @ GHASH block 4k+2 - mid
5629
5630	aese    $ctr3b, $rk2  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 2
5631	eor     $t3.8b, $t3.8b, $res1.8b                          @ GHASH block 4k+1 - mid
5632
5633	pmull   $t5.1q, $res2.1d, $h2.1d                          @ GHASH block 4k+2 - low
5634
5635	aese    $ctr0b, $rk4  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 4
5636
5637	aese    $ctr3b, $rk3  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 3
5638	eor     $t6.8b, $t6.8b, $res2.8b                          @ GHASH block 4k+2 - mid
5639
5640	pmull   $t3.1q, $t3.1d, $h34k.1d                          @ GHASH block 4k+1 - mid
5641
5642	aese    $ctr0b, $rk5  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 5
5643	eor     $acc_lb, $acc_lb, $t5.16b                         @ GHASH block 4k+2 - low
5644
5645	aese    $ctr3b, $rk4  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 4
5646
5647	pmull2  $t7.1q, $res3.2d, $h1.2d                          @ GHASH block 4k+3 - high
5648	eor     $acc_mb, $acc_mb, $t3.16b                         @ GHASH block 4k+1 - mid
5649
5650	pmull2  $t4.1q, $res2.2d, $h2.2d                          @ GHASH block 4k+2 - high
5651
5652	aese    $ctr3b, $rk5  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 5
5653	ins     $t6.d[1], $t6.d[0]                                @ GHASH block 4k+2 - mid
5654
5655	aese    $ctr2b, $rk3  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 3
5656
5657	aese    $ctr1b, $rk3  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 3
5658	eor     $acc_hb, $acc_hb, $t4.16b                         @ GHASH block 4k+2 - high
5659
5660	pmull   $t8.1q, $res3.1d, $h1.1d                          @ GHASH block 4k+3 - low
5661
5662	aese    $ctr2b, $rk4  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 4
5663	mov     $t9d, $res3.d[1]                                  @ GHASH block 4k+3 - mid
5664
5665	aese    $ctr1b, $rk4  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 4
5666
5667	pmull2  $t6.1q, $t6.2d, $h12k.2d                          @ GHASH block 4k+2 - mid
5668
5669	aese    $ctr2b, $rk5  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 5
5670	eor     $t9.8b, $t9.8b, $res3.8b                          @ GHASH block 4k+3 - mid
5671
5672	aese    $ctr1b, $rk5  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 5
5673
5674	aese    $ctr3b, $rk6  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 6
5675	eor     $acc_mb, $acc_mb, $t6.16b                         @ GHASH block 4k+2 - mid
5676
5677	aese    $ctr2b, $rk6  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 6
5678
5679	aese    $ctr0b, $rk6  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 6
5680	movi    $mod_constant.8b, #0xc2
5681
5682	aese    $ctr1b, $rk6  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 6
5683	eor     $acc_lb, $acc_lb, $t8.16b                         @ GHASH block 4k+3 - low
5684
5685	pmull   $t9.1q, $t9.1d, $h12k.1d                          @ GHASH block 4k+3 - mid
5686
5687	aese    $ctr3b, $rk7  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 7
5688	eor     $acc_hb, $acc_hb, $t7.16b                         @ GHASH block 4k+3 - high
5689
5690	aese    $ctr1b, $rk7  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 7
5691
5692	aese    $ctr0b, $rk7  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 7
5693	eor     $acc_mb, $acc_mb, $t9.16b                         @ GHASH block 4k+3 - mid
5694
5695	aese    $ctr3b, $rk8  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 8
5696
5697	aese    $ctr2b, $rk7  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 7
5698	eor     $t9.16b, $acc_lb, $acc_hb                         @ MODULO - karatsuba tidy up
5699
5700	aese    $ctr1b, $rk8  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 8
5701
5702	aese    $ctr0b, $rk8  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 8
5703	shl     $mod_constantd, $mod_constantd, #56               @ mod_constant
5704
5705	aese    $ctr2b, $rk8  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 8
5706
5707	aese    $ctr1b, $rk9  \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 9
5708	eor     $acc_mb, $acc_mb, $t9.16b                         @ MODULO - karatsuba tidy up
5709
5710	pmull   $mod_t.1q, $acc_h.1d, $mod_constant.1d            @ MODULO - top 64b align with mid
5711
5712	aese    $ctr2b, $rk9  \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 9
5713	ext     $acc_hb, $acc_hb, $acc_hb, #8                     @ MODULO - other top alignment
5714
5715	aese    $ctr3b, $rk9  \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 9
5716
5717	aese    $ctr0b, $rk9  \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 9
5718	eor     $acc_mb, $acc_mb, $mod_t.16b                      @ MODULO - fold into mid
5719
5720	aese    $ctr2b, $rk10 \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 10
5721
5722	aese    $ctr3b, $rk10 \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 10
5723
5724	aese    $ctr0b, $rk10 \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 10
5725	eor     $output_h2, $output_h2, $rk14_h                   @ AES block 4k+2 - round 14 high
5726#ifdef __AARCH64EB__
5727	rev     $output_h2, $output_h2
5728#endif
5729	aese    $ctr1b, $rk10 \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 10
5730	eor     $output_l3, $output_l3, $rk14_l                   @ AES block 4k+3 - round 14 low
5731#ifdef __AARCH64EB__
5732	rev     $output_l3, $output_l3
5733#endif
5734	aese    $ctr2b, $rk11 \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 11
5735	eor     $acc_mb, $acc_mb, $acc_hb                         @ MODULO - fold into mid
5736
5737	aese    $ctr0b, $rk11 \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 11
5738	add     $rctr32w, $rctr32w, #1                            @ CTR block 4k+7
5739
5740	aese    $ctr1b, $rk11 \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 11
5741	eor     $output_l2, $output_l2, $rk14_l                   @ AES block 4k+2 - round 14 low
5742#ifdef __AARCH64EB__
5743	rev     $output_l2, $output_l2
5744#endif
5745
5746	aese    $ctr2b, $rk12 \n  aesmc   $ctr2b, $ctr2b          @ AES block 4k+6 - round 12
5747
5748	pmull   $mod_constant.1q, $acc_m.1d, $mod_constant.1d     @ MODULO - mid 64b align with low
5749	eor     $output_h3, $output_h3, $rk14_h                   @ AES block 4k+3 - round 14 high
5750#ifdef __AARCH64EB__
5751	rev     $output_h3, $output_h3
5752#endif
5753
5754	aese    $ctr3b, $rk11 \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 11
5755	stp     $output_l2, $output_h2, [$output_ptr], #16        @ AES block 4k+2 - store result
5756
5757	aese    $ctr1b, $rk12 \n  aesmc   $ctr1b, $ctr1b          @ AES block 4k+5 - round 12
5758	ext     $acc_mb, $acc_mb, $acc_mb, #8                     @ MODULO - other mid alignment
5759
5760	aese    $ctr0b, $rk12 \n  aesmc   $ctr0b, $ctr0b          @ AES block 4k+4 - round 12
5761	stp     $output_l3, $output_h3, [$output_ptr], #16        @ AES block 4k+3 - store result
5762
5763	aese    $ctr3b, $rk12 \n  aesmc   $ctr3b, $ctr3b          @ AES block 4k+7 - round 12
5764	eor     $acc_lb, $acc_lb, $mod_constant.16b               @ MODULO - fold into low
5765
5766	aese    $ctr1b, $rk13                                     @ AES block 4k+5 - round 13
5767
5768	aese    $ctr0b, $rk13                                     @ AES block 4k+4 - round 13
5769
5770	aese    $ctr3b, $rk13                                     @ AES block 4k+7 - round 13
5771
5772	aese    $ctr2b, $rk13                                     @ AES block 4k+6 - round 13
5773	eor     $acc_lb, $acc_lb, $acc_mb                         @ MODULO - fold into low
5774	.L256_dec_tail:                                           @ TAIL
5775
5776	sub     $main_end_input_ptr, $end_input_ptr, $input_ptr   @ main_end_input_ptr is number of bytes left to process
5777	ld1     { $res1b}, [$input_ptr], #16                      @ AES block 4k+4 - load ciphertext
5778
5779	eor     $ctr0b, $res1b, $ctr0b                            @ AES block 4k+4 - result
5780
5781	mov     $output_l0, $ctr0.d[0]                            @ AES block 4k+4 - mov low
5782
5783	mov     $output_h0, $ctr0.d[1]                            @ AES block 4k+4 - mov high
5784	ext     $t0.16b, $acc_lb, $acc_lb, #8                     @ prepare final partial tag
5785
5786	cmp     $main_end_input_ptr, #48
5787
5788	eor     $output_l0, $output_l0, $rk14_l                   @ AES block 4k+4 - round 14 low
5789#ifdef __AARCH64EB__
5790	rev     $output_l0, $output_l0
5791#endif
5792
5793	eor     $output_h0, $output_h0, $rk14_h                   @ AES block 4k+4 - round 14 high
5794#ifdef __AARCH64EB__
5795	rev     $output_h0, $output_h0
5796#endif
5797	b.gt    .L256_dec_blocks_more_than_3
5798
5799	sub     $rctr32w, $rctr32w, #1
5800	mov     $ctr3b, $ctr2b
5801	movi    $acc_m.8b, #0
5802
5803	movi    $acc_l.8b, #0
5804	cmp     $main_end_input_ptr, #32
5805
5806	movi    $acc_h.8b, #0
5807	mov     $ctr2b, $ctr1b
5808	b.gt    .L256_dec_blocks_more_than_2
5809
5810	sub     $rctr32w, $rctr32w, #1
5811
5812	mov     $ctr3b, $ctr1b
5813	cmp     $main_end_input_ptr, #16
5814	b.gt    .L256_dec_blocks_more_than_1
5815
5816	sub     $rctr32w, $rctr32w, #1
5817	b       .L256_dec_blocks_less_than_1
5818	.L256_dec_blocks_more_than_3:                            @ blocks left >  3
5819	rev64   $res0b, $res1b                                   @ GHASH final-3 block
5820	ld1     { $res1b}, [$input_ptr], #16                     @ AES final-2 block - load ciphertext
5821
5822	stp     $output_l0, $output_h0, [$output_ptr], #16       @ AES final-3 block  - store result
5823
5824	mov     $acc_md, $h34k.d[1]                              @ GHASH final-3 block - mid
5825
5826	eor     $res0b, $res0b, $t0.16b                          @ feed in partial tag
5827
5828	eor     $ctr0b, $res1b, $ctr1b                           @ AES final-2 block - result
5829
5830	mov     $rk4d, $res0.d[1]                                @ GHASH final-3 block - mid
5831
5832	mov     $output_l0, $ctr0.d[0]                           @ AES final-2 block - mov low
5833
5834	mov     $output_h0, $ctr0.d[1]                           @ AES final-2 block - mov high
5835
5836	eor     $rk4v.8b, $rk4v.8b, $res0.8b                     @ GHASH final-3 block - mid
5837
5838	movi    $t0.8b, #0                                       @ suppress further partial tag feed in
5839
5840	pmull2  $acc_h.1q, $res0.2d, $h4.2d                      @ GHASH final-3 block - high
5841
5842	pmull   $acc_m.1q, $rk4v.1d, $acc_m.1d                   @ GHASH final-3 block - mid
5843	eor     $output_l0, $output_l0, $rk14_l                  @ AES final-2 block - round 14 low
5844#ifdef __AARCH64EB__
5845	rev     $output_l0, $output_l0
5846#endif
5847
5848	pmull   $acc_l.1q, $res0.1d, $h4.1d                      @ GHASH final-3 block - low
5849	eor     $output_h0, $output_h0, $rk14_h                  @ AES final-2 block - round 14 high
5850#ifdef __AARCH64EB__
5851	rev     $output_h0, $output_h0
5852#endif
5853	.L256_dec_blocks_more_than_2:                            @ blocks left >  2
5854
5855	rev64   $res0b, $res1b                                   @ GHASH final-2 block
5856	ld1     { $res1b}, [$input_ptr], #16                     @ AES final-1 block - load ciphertext
5857
5858	eor     $res0b, $res0b, $t0.16b                          @ feed in partial tag
5859	stp     $output_l0, $output_h0, [$output_ptr], #16       @ AES final-2 block  - store result
5860
5861	eor     $ctr0b, $res1b, $ctr2b                           @ AES final-1 block - result
5862
5863	mov     $rk4d, $res0.d[1]                                @ GHASH final-2 block - mid
5864
5865	pmull   $rk3q1, $res0.1d, $h3.1d                         @ GHASH final-2 block - low
5866
5867	pmull2  $rk2q1, $res0.2d, $h3.2d                         @ GHASH final-2 block - high
5868
5869	eor     $rk4v.8b, $rk4v.8b, $res0.8b                     @ GHASH final-2 block - mid
5870	mov     $output_l0, $ctr0.d[0]                           @ AES final-1 block - mov low
5871
5872	mov     $output_h0, $ctr0.d[1]                           @ AES final-1 block - mov high
5873	eor     $acc_lb, $acc_lb, $rk3                           @ GHASH final-2 block - low
5874	movi    $t0.8b, #0                                       @ suppress further partial tag feed in
5875
5876	pmull   $rk4v.1q, $rk4v.1d, $h34k.1d                     @ GHASH final-2 block - mid
5877
5878	eor     $acc_hb, $acc_hb, $rk2                           @ GHASH final-2 block - high
5879	eor     $output_l0, $output_l0, $rk14_l                  @ AES final-1 block - round 14 low
5880#ifdef __AARCH64EB__
5881	rev     $output_l0, $output_l0
5882#endif
5883
5884	eor     $acc_mb, $acc_mb, $rk4v.16b                      @ GHASH final-2 block - mid
5885	eor     $output_h0, $output_h0, $rk14_h                  @ AES final-1 block - round 14 high
5886#ifdef __AARCH64EB__
5887	rev     $output_h0, $output_h0
5888#endif
5889	.L256_dec_blocks_more_than_1:                            @ blocks left >  1
5890
5891	stp     $output_l0, $output_h0, [$output_ptr], #16       @ AES final-1 block  - store result
5892	rev64   $res0b, $res1b                                   @ GHASH final-1 block
5893
5894	ld1     { $res1b}, [$input_ptr], #16                     @ AES final block - load ciphertext
5895
5896	eor     $res0b, $res0b, $t0.16b                          @ feed in partial tag
5897	movi    $t0.8b, #0                                       @ suppress further partial tag feed in
5898
5899	mov     $rk4d, $res0.d[1]                                @ GHASH final-1 block - mid
5900
5901	eor     $ctr0b, $res1b, $ctr3b                           @ AES final block - result
5902
5903	pmull2  $rk2q1, $res0.2d, $h2.2d                         @ GHASH final-1 block - high
5904
5905	eor     $rk4v.8b, $rk4v.8b, $res0.8b                     @ GHASH final-1 block - mid
5906
5907	pmull   $rk3q1, $res0.1d, $h2.1d                         @ GHASH final-1 block - low
5908	mov     $output_l0, $ctr0.d[0]                           @ AES final block - mov low
5909
5910	ins     $rk4v.d[1], $rk4v.d[0]                           @ GHASH final-1 block - mid
5911
5912	mov     $output_h0, $ctr0.d[1]                           @ AES final block - mov high
5913
5914	pmull2  $rk4v.1q, $rk4v.2d, $h12k.2d                     @ GHASH final-1 block - mid
5915	eor     $output_l0, $output_l0, $rk14_l                  @ AES final block - round 14 low
5916#ifdef __AARCH64EB__
5917	rev     $output_l0, $output_l0
5918#endif
5919	eor     $acc_lb, $acc_lb, $rk3                           @ GHASH final-1 block - low
5920
5921	eor     $acc_hb, $acc_hb, $rk2                           @ GHASH final-1 block - high
5922
5923	eor     $acc_mb, $acc_mb, $rk4v.16b                      @ GHASH final-1 block - mid
5924	eor     $output_h0, $output_h0, $rk14_h                  @ AES final block - round 14 high
5925#ifdef __AARCH64EB__
5926	rev     $output_h0, $output_h0
5927#endif
5928	.L256_dec_blocks_less_than_1:                            @ blocks left <= 1
5929
5930	and     $bit_length, $bit_length, #127                   @ bit_length %= 128
5931	mvn     $rk14_h, xzr                                     @ rk14_h = 0xffffffffffffffff
5932
5933	sub     $bit_length, $bit_length, #128                   @ bit_length -= 128
5934	mvn     $rk14_l, xzr                                     @ rk14_l = 0xffffffffffffffff
5935
5936	ldp     $end_input_ptr, $main_end_input_ptr, [$output_ptr] @ load existing bytes we need to not overwrite
5937	neg     $bit_length, $bit_length                         @ bit_length = 128 - #bits in input (in range [1,128])
5938
5939	and     $bit_length, $bit_length, #127                   @ bit_length %= 128
5940
5941	lsr     $rk14_h, $rk14_h, $bit_length                    @ rk14_h is mask for top 64b of last block
5942	cmp     $bit_length, #64
5943
5944	csel    $ctr32x, $rk14_l, $rk14_h, lt
5945	csel    $ctr96_b64x, $rk14_h, xzr, lt
5946
5947	fmov    $ctr0d, $ctr32x                                  @ ctr0b is mask for last block
5948	and     $output_l0, $output_l0, $ctr32x
5949
5950	mov     $ctr0.d[1], $ctr96_b64x
5951	bic     $end_input_ptr, $end_input_ptr, $ctr32x          @ mask out low existing bytes
5952
5953#ifndef __AARCH64EB__
5954	rev     $ctr32w, $rctr32w
5955#else
5956	mov     $ctr32w, $rctr32w
5957#endif
5958
5959	bic     $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x      @ mask out high existing bytes
5960
5961	orr     $output_l0, $output_l0, $end_input_ptr
5962
5963	and     $output_h0, $output_h0, $ctr96_b64x
5964
5965	orr     $output_h0, $output_h0, $main_end_input_ptr
5966
5967	and     $res1b, $res1b, $ctr0b                            @ possibly partial last block has zeroes in highest bits
5968
5969	rev64   $res0b, $res1b                                    @ GHASH final block
5970
5971	eor     $res0b, $res0b, $t0.16b                           @ feed in partial tag
5972
5973	pmull   $rk3q1, $res0.1d, $h1.1d                          @ GHASH final block - low
5974
5975	mov     $t0d, $res0.d[1]                                  @ GHASH final block - mid
5976
5977	eor     $t0.8b, $t0.8b, $res0.8b                          @ GHASH final block - mid
5978
5979	pmull2  $rk2q1, $res0.2d, $h1.2d                          @ GHASH final block - high
5980
5981	pmull   $t0.1q, $t0.1d, $h12k.1d                          @ GHASH final block - mid
5982
5983	eor     $acc_hb, $acc_hb, $rk2                            @ GHASH final block - high
5984
5985	eor     $acc_lb, $acc_lb, $rk3                            @ GHASH final block - low
5986
5987	eor     $acc_mb, $acc_mb, $t0.16b                         @ GHASH final block - mid
5988	movi    $mod_constant.8b, #0xc2
5989
5990	eor     $t9.16b, $acc_lb, $acc_hb                         @ MODULO - karatsuba tidy up
5991
5992	shl     $mod_constantd, $mod_constantd, #56               @ mod_constant
5993
5994	eor     $acc_mb, $acc_mb, $t9.16b                         @ MODULO - karatsuba tidy up
5995
5996	pmull   $mod_t.1q, $acc_h.1d, $mod_constant.1d            @ MODULO - top 64b align with mid
5997
5998	ext     $acc_hb, $acc_hb, $acc_hb, #8                     @ MODULO - other top alignment
5999
6000	eor     $acc_mb, $acc_mb, $mod_t.16b                      @ MODULO - fold into mid
6001
6002	eor     $acc_mb, $acc_mb, $acc_hb                         @ MODULO - fold into mid
6003
6004	pmull   $mod_constant.1q, $acc_m.1d, $mod_constant.1d     @ MODULO - mid 64b align with low
6005
6006	ext     $acc_mb, $acc_mb, $acc_mb, #8                     @ MODULO - other mid alignment
6007
6008	eor     $acc_lb, $acc_lb, $mod_constant.16b               @ MODULO - fold into low
6009
6010	stp     $output_l0, $output_h0, [$output_ptr]
6011
6012	str     $ctr32w, [$counter, #12]                          @ store the updated counter
6013
6014	eor     $acc_lb, $acc_lb, $acc_mb                         @ MODULO - fold into low
6015	ext     $acc_lb, $acc_lb, $acc_lb, #8
6016	rev64   $acc_lb, $acc_lb
6017	mov     x0, $len
6018	st1     { $acc_l.16b }, [$current_tag]
6019
6020	ldp     x21, x22, [sp, #16]
6021	ldp     x23, x24, [sp, #32]
6022	ldp     d8, d9, [sp, #48]
6023	ldp     d10, d11, [sp, #64]
6024	ldp     d12, d13, [sp, #80]
6025	ldp     d14, d15, [sp, #96]
6026	ldp     x19, x20, [sp], #112
6027	ret
6028
6029.L256_dec_ret:
6030	mov w0, #0x0
6031	ret
6032.size aes_gcm_dec_256_kernel,.-aes_gcm_dec_256_kernel
6033___
6034}
6035}
6036
# Tail of the generated assembly: an identification string, an alignment
# directive and an #endif (its matching #if is presumably emitted earlier
# in the file -- not visible from here).  The heredoc is non-interpolating,
# so the "@" in the address needs no escaping.
$code .= <<'END_TRAILER';
.asciz  "GHASH for ARMv8, CRYPTOGAMS by <appro@openssl.org>"
.align  2
#endif
END_TRAILER
6042
6043if ($flavour =~ /64/) {         ######## 64-bit code
6044    sub unvmov {
6045        my $arg=shift;
6046
6047        $arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o &&
6048        sprintf "ins    v%d.d[%d],v%d.d[%d]",$1<8?$1:$1+8,($2 eq "lo")?0:1,
6049                             $3<8?$3:$3+8,($4 eq "lo")?0:1;
6050    }
6051    foreach(split("\n",$code)) {
6052        s/@\s/\/\//o;               # old->new style commentary
6053        print $_,"\n";
6054    }
6055} else {                ######## 32-bit code
6056    sub unvdup32 {
6057        my $arg=shift;
6058
6059        $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
6060        sprintf "vdup.32    q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
6061    }
6062    sub unvpmullp64 {
6063        my ($mnemonic,$arg)=@_;
6064
6065        if ($arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) {
6066            my $word = 0xf2a00e00|(($1&7)<<13)|(($1&8)<<19)
6067                       |(($2&7)<<17)|(($2&8)<<4)
6068                       |(($3&7)<<1) |(($3&8)<<2);
6069            $word |= 0x00010001  if ($mnemonic =~ "2");
6070            # since ARMv7 instructions are always encoded little-endian.
6071            # correct solution is to use .inst directive, but older%%%%
6072            # assemblers don't implement it:-(
6073            sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
6074                    $word&0xff,($word>>8)&0xff,
6075                    ($word>>16)&0xff,($word>>24)&0xff,
6076                    $mnemonic,$arg;
6077        }
6078    }
6079
6080    foreach(split("\n",$code)) {
6081        s/\b[wx]([0-9]+)\b/r$1/go;      # new->old registers
6082        s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;   # new->old registers
6083        s/\/\/\s?/@ /o;             # new->old style commentary
6084
6085        # fix up remaining new-style suffixes
6086        s/\],#[0-9]+/]!/o;
6087
6088        s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2 $1,#0/o         or
6089        s/vdup\.32\s+(.*)/unvdup32($1)/geo              or
6090        s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo        or
6091        s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo   or
6092        s/^(\s+)b\./$1b/o                       or
6093        s/^(\s+)ret/$1bx\tlr/o;
6094
6095        if (s/^(\s+)mov\.([a-z]+)/$1mov$2/) {
6096            print "     it      $2\n";
6097        }
6098        s/__AARCH64E([BL])__/__ARME$1__/go;
6099        print $_,"\n";
6100    }
6101}
6102
# Close STDOUT explicitly to enforce a flush; a failure to close is fatal.
unless (close STDOUT) {
    die "error closing STDOUT: $!";
}
6104