#! /usr/bin/env perl
# Copyright 2019-2023 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

#
#========================================================================
# Written by Fangming Fang <fangming.fang@arm.com> for the OpenSSL project,
# derived from https://github.com/ARM-software/AArch64cryptolib, original
# author Samuel Lee <Samuel.Lee@arm.com>. The module is, however, dual
# licensed under OpenSSL and CRYPTOGAMS licenses depending on where you
# obtain it. For further details see http://www.openssl.org/~appro/cryptogams/.
#========================================================================
#
# Approach - assume we don't want to reload constants, so reserve ~half of vector register file for constants
#
# main loop to act on 4 16B blocks per iteration, and then do modulo of the accumulated intermediate hashes from the 4 blocks
#
#  ____________________________________________________
# |                                                    |
# | PRE                                                |
# |____________________________________________________|
# |                |                |                  |
# | CTR block 4k+8 | AES block 4k+4 | GHASH block 4k+0 |
# |________________|________________|__________________|
# |                |                |                  |
# | CTR block 4k+9 | AES block 4k+5 | GHASH block 4k+1 |
# |________________|________________|__________________|
# |                |                |                  |
# | CTR block 4k+10| AES block 4k+6 | GHASH block 4k+2 |
# |________________|________________|__________________|
# |                |                |                  |
# | CTR block 4k+11| AES block 4k+7 | GHASH block 4k+3 |
# |________________|____(mostly)____|__________________|
# |                                                    |
# | MODULO                                             |
# |____________________________________________________|
#
# PRE:
#     Ensure the previously generated intermediate hash is aligned and merged with result for GHASH 4k+0
#         EXT low_acc, low_acc, low_acc, #8
#         EOR res_curr (4k+0), res_curr (4k+0), low_acc
#
# CTR block:
#     Increment and byte reverse counter in scalar registers and transfer to SIMD registers
#         REV ctr32, rev_ctr32
#         ORR ctr64, constctr96_top32, ctr32, LSL #32
#         INS ctr_next.d[0], constctr96_bottom64 // Keeping this in scalar registers to free up space in SIMD RF
#         INS ctr_next.d[1], ctr64X
#         ADD rev_ctr32, #1
#
# AES block:
#     Do AES encryption/decryption on CTR block X and EOR it with input block X. The listings below use a 256-bit key
#     (round keys k0..k14) as an example.
#     Doing small trick here of loading input in scalar registers, EORing with last key and then transferring
#     Given we are very constrained in our ASIMD registers this is quite important
#
#     Encrypt:
#         LDR input_low, [ input_ptr ], #8
#         LDR input_high, [ input_ptr ], #8
#         EOR input_low, k14_low
#         EOR input_high, k14_high
#         INS res_curr.d[0], input_low
#         INS res_curr.d[1], input_high
#         AESE ctr_curr, k0; AESMC ctr_curr, ctr_curr
#         AESE ctr_curr, k1; AESMC ctr_curr, ctr_curr
#         AESE ctr_curr, k2; AESMC ctr_curr, ctr_curr
#         AESE ctr_curr, k3; AESMC ctr_curr, ctr_curr
#         AESE ctr_curr, k4; AESMC ctr_curr, ctr_curr
#         AESE ctr_curr, k5; AESMC ctr_curr, ctr_curr
#         AESE ctr_curr, k6; AESMC ctr_curr, ctr_curr
#         AESE ctr_curr, k7; AESMC ctr_curr, ctr_curr
#         AESE ctr_curr, k8; AESMC ctr_curr, ctr_curr
#         AESE ctr_curr, k9; AESMC ctr_curr, ctr_curr
#         AESE ctr_curr, k10; AESMC ctr_curr, ctr_curr
#         AESE ctr_curr, k11; AESMC ctr_curr, ctr_curr
#         AESE ctr_curr, k12; AESMC ctr_curr, ctr_curr
#         AESE ctr_curr, k13
#         EOR res_curr, res_curr, ctr_curr
#         ST1 { res_curr.16b }, [ output_ptr ], #16
#
#     Decrypt:
#         AESE ctr_curr, k0; AESMC ctr_curr, ctr_curr
#         AESE ctr_curr, k1; AESMC ctr_curr, ctr_curr
#         AESE ctr_curr, k2; AESMC ctr_curr, ctr_curr
#         AESE ctr_curr, k3; AESMC ctr_curr, ctr_curr
#         AESE ctr_curr, k4; AESMC ctr_curr, ctr_curr
#         AESE ctr_curr, k5; AESMC ctr_curr, ctr_curr
#         AESE ctr_curr, k6; AESMC ctr_curr, ctr_curr
#         AESE ctr_curr, k7; AESMC ctr_curr, ctr_curr
#         AESE ctr_curr, k8; AESMC ctr_curr, ctr_curr
#         AESE ctr_curr, k9; AESMC ctr_curr, ctr_curr
#         AESE ctr_curr, k10; AESMC ctr_curr, ctr_curr
#         AESE ctr_curr, k11; AESMC ctr_curr, ctr_curr
#         AESE ctr_curr, k12; AESMC ctr_curr, ctr_curr
#         AESE ctr_curr, k13
#         LDR res_curr, [ input_ptr ], #16
#         EOR res_curr, res_curr, ctr_curr
#         MOV output_low, res_curr.d[0]
#         MOV output_high, res_curr.d[1]
#         EOR output_low, k14_low
#         EOR output_high, k14_high
#         STP output_low, output_high, [ output_ptr ], #16
#
# GHASH block X:
#     do 128b karatsuba polynomial multiplication on block
#     We only have 64b->128b polynomial multipliers, naively that means we need to do 4 64b multiplies to generate a 128b
#
#     multiplication:
#         Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ (Pmull(Ah,Bl) ^ Pmull(Al,Bh))<<64
#
#     The idea behind Karatsuba multiplication is that we can do just 3 64b multiplies:
#         Pmull(A,B) == (Pmull(Ah,Bh)<<128 | Pmull(Al,Bl)) ^ (Pmull(Ah^Al,Bh^Bl) ^ Pmull(Ah,Bh) ^ Pmull(Al,Bl))<<64
#
#     There is some complication here because the bit order of GHASH's PMULL is reversed compared to elsewhere, so we are
#     multiplying with "twisted" powers of H
#
# Note: We can PMULL directly into the acc_x in first GHASH of the loop
# Note: For scheduling big cores we want to split the processing to happen over two loop iterations - otherwise the critical
#       path latency dominates the performance
#
#       This has a knock on effect on register pressure, so we have to be a bit more clever with our temporary registers
#       than indicated here
#         REV64 res_curr, res_curr
#         INS t_m.d[0], res_curr.d[1]
#         EOR t_m.8B, t_m.8B, res_curr.8B
#         PMULL2 t_h, res_curr, HX
#         PMULL t_l, res_curr, HX
#         PMULL t_m, t_m, HX_k
#         EOR acc_h, acc_h, t_h
#         EOR acc_l, acc_l, t_l
#         EOR acc_m, acc_m, t_m
#
# MODULO: take the partial accumulators (~representing sum of 256b multiplication results), from GHASH and do modulo reduction on them
#         There is some complication here because the bit order of GHASH's PMULL is reversed compared to elsewhere, so we are doing modulo
#         with a reversed constant
#         EOR acc_m, acc_m, acc_h
#         EOR acc_m, acc_m, acc_l // Finish off karatsuba processing
#         PMULL t_mod, acc_h, mod_constant
#         EXT acc_h, acc_h, acc_h, #8
#         EOR acc_m, acc_m, acc_h
#         EOR acc_m, acc_m, t_mod
#         PMULL acc_h, acc_m, mod_constant
#         EXT acc_m, acc_m, acc_m, #8
#         EOR acc_l, acc_l, acc_h
#         EOR acc_l, acc_l, acc_m

# Command line: [flavour] [output].
# The last argument is taken as the output file when it ends in a ".ext"-style
# suffix; the first remaining argument is taken as the flavour when it contains
# no dot. Either may be absent, in which case the variable is undef.
$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;

# Locate arm-xlate.pl: first next to this script, then in ../../perlasm/
# relative to this script's directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate ) or
die "can't locate arm-xlate.pl";

# Everything printed to STDOUT from here on is piped through arm-xlate.pl,
# run with the same perl interpreter ($^X). Fail loudly if the pipe cannot be
# started instead of silently discarding the generated code.
open OUT,"| \"$^X\" $xlate $flavour $output"
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Symbolic names for the registers used by the generated assembly.
# x0-x3 are used as they arrive; the kernel prologue copies x4 into x16
# ($counter) and x5 into x8 ($cc) before using them.
$input_ptr="x0";  #argument block
$bit_length="x1";
$output_ptr="x2";
$current_tag="x3";
$counter="x16";
$cc="x8";

{
# Scalar registers. Note that the input_* and output_* names below are
# deliberate aliases of the same registers (x6-x7 and x19-x24).
my ($end_input_ptr,$main_end_input_ptr,$input_l0,$input_h0)=map("x$_",(4..7));
my ($input_l1,$input_h1,$input_l2,$input_h2,$input_l3,$input_h3)=map("x$_",(19..24));
my ($output_l1,$output_h1,$output_l2,$output_h2,$output_l3,$output_h3)=map("x$_",(19..24));
my ($output_l0,$output_h0)=map("x$_",(6..7));

# Counter handling and last-round-key registers; the w-names are the 32-bit
# views of x9, x11 and x12.
my $ctr32w="w9";
my ($ctr32x,$ctr96_b64x,$ctr96_t32x,$rctr32x,$rk10_l,$rk10_h,$len)=map("x$_",(9..15));
my ($ctr96_t32w,$rctr32w)=map("w$_",(11..12));

# v0-v3: counter blocks, v4-v7: result blocks, each under several
# register-view spellings (.16b, plain vN, dN, qN).
my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$res0b,$res1b,$res2b,$res3b)=map("v$_.16b",(0..7));
my ($ctr0,$ctr1,$ctr2,$ctr3,$res0,$res1,$res2,$res3)=map("v$_",(0..7));
my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$res0d,$res1d,$res2d,$res3d)=map("d$_",(0..7));
my ($res0q,$res1q,$res2q,$res3q)=map("q$_",(4..7));

# v9-v11: GHASH accumulators (high, mid, low).
my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(9..11));
my ($acc_h,$acc_m,$acc_l)=map("v$_",(9..11));
my ($acc_hd,$acc_md,$acc_ld)=map("d$_",(9..11));

# v12-v15: powers of H; v16-v17: the h12k/h34k Karatsuba helper values.
my ($h1,$h2,$h3,$h4,$h12k,$h34k)=map("v$_",(12..17));
my ($h1q,$h2q,$h3q,$h4q)=map("q$_",(12..15));
my ($h1b,$h2b,$h3b,$h4b)=map("v$_.16b",(12..15));

# Temporaries. Several names share one register (t0/t4/mod_constant = v8,
# t1/t5 = v28, t2/t8 = v29, t3/t9 = v30, t6/mod_t = v31, t7/ctr_t0 = v4);
# presumably their live ranges do not overlap - verify before reordering code.
my $t0="v8";
my $t0d="d8";

my ($t1,$t2,$t3)=map("v$_",(28..30));
my ($t1d,$t2d,$t3d)=map("d$_",(28..30));

my $t4="v8";
my $t4d="d8";
my $t5="v28";
my $t5d="d28";
my $t6="v31";
my $t6d="d31";

my $t7="v4";
my $t7d="d4";
my $t8="v29";
my $t8d="d29";
my $t9="v30";
my $t9d="d30";

# ctr_t0..ctr_t3 are the same registers as res0..res3 (v4-v7).
my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3)=map("v$_",(4..7));
my ($ctr_t0d,$ctr_t1d,$ctr_t2d,$ctr_t3d)=map("d$_",(4..7));
my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b)=map("v$_.16b",(4..7));

my $mod_constantd="d8";
my $mod_constant="v8";
my $mod_t="v31";

# v18-v27: AES round keys rk0..rk9, under several register-view spellings.
my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9)=map("v$_.16b",(18..27));
my ($rk0s,$rk1s,$rk2s,$rk3s,$rk4s,$rk5s,$rk6s,$rk7s,$rk8s,$rk9s)=map("v$_.4s",(18..27));
my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q)=map("q$_",(18..27));
my $rk2q1="v20.1q";
my $rk3q1="v21.1q";
my $rk4v="v22";
my $rk4d="d22";

# Common file header of the generated assembly, then a flavour-specific
# preamble: 64-bit flavours get the .arch/.text directives, all others get the
# NEON/Thumb-2 INST() definitions.
$code=<<___;
#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=8
___
$code.=".arch armv8-a+crypto\n.text\n" if ($flavour =~ /64/);
$code.=<<___ if ($flavour !~ /64/);
.fpu neon
#ifdef __thumb2__
.syntax unified
.thumb
# define INST(a,b,c,d) $_byte c,0xef,a,b
#else
.code 32
# define INST(a,b,c,d) $_byte a,b,c,0xf2
#endif

.text
___

#########################################################################################
# size_t aes_gcm_enc_128_kernel(const unsigned char *in,    /* x0 */
#                               size_t len,                  /* x1: length in bits */
#                               unsigned char *out,          /* x2 */
#                               u64 *Xi,                     /* x3: current tag; H powers read at offsets 32..112 */
#                               unsigned char ivec[16],      /* x4: counter block */
#                               const void *key);            /* x5: round keys */
#
# The argument order above follows the register usage in the code below:
# x1 is shifted right by 3 to obtain the byte length, x3 is read as the current
# tag / H table, x4 is copied to x16 and read as the counter block, and x5 is
# copied to x8 and read as the key schedule.
# NOTE(review): C types are taken from the original comment - confirm against
# the C declaration.
#
$code.=<<___;
.global aes_gcm_enc_128_kernel
.type   aes_gcm_enc_128_kernel,%function
.align  4
aes_gcm_enc_128_kernel:
	AARCH64_VALID_CALL_TARGET
	cbz     x1, .L128_enc_ret
	stp     x19, x20, [sp, #-112]!
	mov     x16, x4
	mov     x8, x5
	stp     x21, x22, [sp, #16]
	stp     x23, x24, [sp, #32]
	stp     d8, d9, [sp, #48]
	stp     d10, d11, [sp, #64]
	stp     d12, d13, [sp, #80]
	stp     d14, d15, [sp, #96]

	ldp     $ctr96_b64x, $ctr96_t32x, [$counter]    @ ctr96_b64, ctr96_t32
#ifdef __AARCH64EB__
	rev     $ctr96_b64x, $ctr96_b64x
	rev     $ctr96_t32x, $ctr96_t32x
#endif
	ldp     $rk10_l, $rk10_h, [$cc, #160]    @ load rk10
#ifdef __AARCH64EB__
	ror     $rk10_l, $rk10_l, #32
	ror     $rk10_h, $rk10_h, #32
#endif
	ld1     {$acc_lb}, [$current_tag]
	ext     $acc_lb, $acc_lb, $acc_lb, #8
	rev64   $acc_lb, $acc_lb
	lsr     $main_end_input_ptr, $bit_length, #3    @ byte_len
	mov     $len, $main_end_input_ptr

	ld1     {$rk0s}, [$cc], #16    @ load rk0
	add     $end_input_ptr, $input_ptr, $bit_length, lsr #3    @ end_input_ptr
	sub     $main_end_input_ptr, $main_end_input_ptr, #1    @ byte_len - 1

	lsr     $rctr32x, $ctr96_t32x, #32
	ldr     $h4q, [$current_tag, #112]    @ load h4l | h4h
#ifndef __AARCH64EB__
	ext     $h4b, $h4b, $h4b, #8
#endif
	fmov    $ctr1d, $ctr96_b64x    @ CTR block 1
	rev     $rctr32w, $rctr32w    @ rev_ctr32

	add     $rctr32w, $rctr32w, #1    @ increment rev_ctr32
	orr     $ctr96_t32w, $ctr96_t32w, $ctr96_t32w
	ld1     {$rk1s}, [$cc], #16    @ load rk1

	rev     $ctr32w, $rctr32w    @ CTR block 1
	add     $rctr32w, $rctr32w, #1    @ CTR block 1
	fmov    $ctr3d, $ctr96_b64x    @ CTR block 3

	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32    @ CTR block 1
	ld1     { $ctr0b}, [$counter]    @ special case vector load initial counter so we can start first AES block as quickly as possible

	fmov    $ctr1.d[1], $ctr32x    @ CTR block 1
	rev     $ctr32w, $rctr32w    @ CTR block 2

	fmov    $ctr2d, $ctr96_b64x    @ CTR block 2
	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32    @ CTR block 2
	add     $rctr32w, $rctr32w, #1    @ CTR block 2

fmov $ctr2.d[1], $ctr32x @ CTR block 2 319 rev $ctr32w, $rctr32w @ CTR block 3 320 321 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3 322 ld1 {$rk2s}, [$cc], #16 @ load rk2 323 324 add $rctr32w, $rctr32w, #1 @ CTR block 3 325 fmov $ctr3.d[1], $ctr32x @ CTR block 3 326 327 ldr $h3q, [$current_tag, #80] @ load h3l | h3h 328#ifndef __AARCH64EB__ 329 ext $h3b, $h3b, $h3b, #8 330#endif 331 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0 332 ld1 {$rk3s}, [$cc], #16 @ load rk3 333 334 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0 335 ldr $h1q, [$current_tag, #32] @ load h1l | h1h 336#ifndef __AARCH64EB__ 337 ext $h1b, $h1b, $h1b, #8 338#endif 339 340 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0 341 ld1 {$rk4s}, [$cc], #16 @ load rk4 342 343 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0 344 ld1 {$rk5s}, [$cc], #16 @ load rk5 345 346 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1 347 trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l 348 349 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1 350 ld1 {$rk6s}, [$cc], #16 @ load rk6 351 352 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1 353 ld1 {$rk7s}, [$cc], #16 @ load rk7 354 355 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1 356 trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h 357 358 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2 359 ld1 {$rk8s}, [$cc], #16 @ load rk8 360 361 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2 362 ldr $h2q, [$current_tag, #64] @ load h2l | h2h 363#ifndef __AARCH64EB__ 364 ext $h2b, $h2b, $h2b, #8 365#endif 366 367 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2 368 369 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2 370 eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k 371 372 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3 373 374 aese $ctr1b, $rk3 \n aesmc 
$ctr1b, $ctr1b @ AES block 1 - round 3 375 376 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3 377 ld1 {$rk9s}, [$cc], #16 @ load rk9 378 379 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3 380 381 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) 382 trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l 383 384 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4 385 add $main_end_input_ptr, $main_end_input_ptr, $input_ptr 386 387 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4 388 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks 389 390 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4 391 392 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5 393 394 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5 395 396 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5 397 398 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6 399 400 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4 401 402 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6 403 trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h 404 405 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6 406 407 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5 408 409 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7 410 411 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7 412 413 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6 414 415 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7 416 417 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8 418 419 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7 420 421 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8 422 423 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES 
block 3 - round 8 424 425 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8 426 427 aese $ctr2b, $rk9 @ AES block 2 - round 9 428 429 aese $ctr0b, $rk9 @ AES block 0 - round 9 430 431 eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k 432 433 aese $ctr1b, $rk9 @ AES block 1 - round 9 434 435 aese $ctr3b, $rk9 @ AES block 3 - round 9 436 b.ge .L128_enc_tail @ handle tail 437 438 ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 0 - load plaintext 439#ifdef __AARCH64EB__ 440 rev $input_l0, $input_l0 441 rev $input_h0, $input_h0 442#endif 443 ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 2 - load plaintext 444#ifdef __AARCH64EB__ 445 rev $input_l2, $input_l2 446 rev $input_h2, $input_h2 447#endif 448 ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 1 - load plaintext 449#ifdef __AARCH64EB__ 450 rev $input_l1, $input_l1 451 rev $input_h1, $input_h1 452#endif 453 ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 3 - load plaintext 454#ifdef __AARCH64EB__ 455 rev $input_l3, $input_l3 456 rev $input_h3, $input_h3 457#endif 458 eor $input_l0, $input_l0, $rk10_l @ AES block 0 - round 10 low 459 eor $input_h0, $input_h0, $rk10_h @ AES block 0 - round 10 high 460 461 eor $input_l2, $input_l2, $rk10_l @ AES block 2 - round 10 low 462 fmov $ctr_t0d, $input_l0 @ AES block 0 - mov low 463 464 eor $input_l1, $input_l1, $rk10_l @ AES block 1 - round 10 low 465 eor $input_h2, $input_h2, $rk10_h @ AES block 2 - round 10 high 466 fmov $ctr_t0.d[1], $input_h0 @ AES block 0 - mov high 467 468 fmov $ctr_t1d, $input_l1 @ AES block 1 - mov low 469 eor $input_h1, $input_h1, $rk10_h @ AES block 1 - round 10 high 470 471 eor $input_l3, $input_l3, $rk10_l @ AES block 3 - round 10 low 472 fmov $ctr_t1.d[1], $input_h1 @ AES block 1 - mov high 473 474 fmov $ctr_t2d, $input_l2 @ AES block 2 - mov low 475 eor $input_h3, $input_h3, $rk10_h @ AES block 3 - round 10 high 476 rev $ctr32w, $rctr32w @ CTR block 4 477 478 fmov $ctr_t2.d[1], $input_h2 @ AES block 2 - 
mov high 479 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4 480 481 eor $res0b, $ctr_t0b, $ctr0b @ AES block 0 - result 482 fmov $ctr0d, $ctr96_b64x @ CTR block 4 483 add $rctr32w, $rctr32w, #1 @ CTR block 4 484 485 fmov $ctr0.d[1], $ctr32x @ CTR block 4 486 rev $ctr32w, $rctr32w @ CTR block 5 487 488 eor $res1b, $ctr_t1b, $ctr1b @ AES block 1 - result 489 fmov $ctr1d, $ctr96_b64x @ CTR block 5 490 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5 491 492 add $rctr32w, $rctr32w, #1 @ CTR block 5 493 add $input_ptr, $input_ptr, #64 @ AES input_ptr update 494 fmov $ctr1.d[1], $ctr32x @ CTR block 5 495 496 fmov $ctr_t3d, $input_l3 @ AES block 3 - mov low 497 rev $ctr32w, $rctr32w @ CTR block 6 498 st1 { $res0b}, [$output_ptr], #16 @ AES block 0 - store result 499 500 fmov $ctr_t3.d[1], $input_h3 @ AES block 3 - mov high 501 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6 502 503 add $rctr32w, $rctr32w, #1 @ CTR block 6 504 eor $res2b, $ctr_t2b, $ctr2b @ AES block 2 - result 505 st1 { $res1b}, [$output_ptr], #16 @ AES block 1 - store result 506 507 fmov $ctr2d, $ctr96_b64x @ CTR block 6 508 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks 509 510 fmov $ctr2.d[1], $ctr32x @ CTR block 6 511 rev $ctr32w, $rctr32w @ CTR block 7 512 st1 { $res2b}, [$output_ptr], #16 @ AES block 2 - store result 513 514 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 7 515 516 eor $res3b, $ctr_t3b, $ctr3b @ AES block 3 - result 517 st1 { $res3b}, [$output_ptr], #16 @ AES block 3 - store result 518 b.ge .L128_enc_prepretail @ do prepretail 519 520 .L128_enc_main_loop: @ main loop start 521 ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 4k+3 - load plaintext 522#ifdef __AARCH64EB__ 523 rev $input_l3, $input_l3 524 rev $input_h3, $input_h3 525#endif 526 rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free) 527 rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free) 528 529 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 
4k+6 - round 0 530 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3 531 532 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 533 rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free) 534 535 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0 536 add $rctr32w, $rctr32w, #1 @ CTR block 4k+3 537 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3 538 539 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0 540 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid 541 542 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1 543 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid 544 545 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1 546 eor $res0b, $res0b, $acc_lb @ PRE 1 547 548 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0 549 eor $input_h3, $input_h3, $rk10_h @ AES block 4k+3 - round 10 high 550 551 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high 552 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid 553 ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 4k+4 - load plaintext 554#ifdef __AARCH64EB__ 555 rev $input_l0, $input_l0 556 rev $input_h0, $input_h0 557#endif 558 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1 559 rev $ctr32w, $rctr32w @ CTR block 4k+8 560 561 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid 562 mov $t0d, $res0.d[1] @ GHASH block 4k - mid 563 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8 564 565 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high 566 add $rctr32w, $rctr32w, #1 @ CTR block 4k+8 567 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid 568 569 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2 570 571 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low 572 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid 573 574 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2 575 576 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3 577 eor $acc_hb, $acc_hb, 
$t1.16b @ GHASH block 4k+1 - high 578 579 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low 580 581 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid 582 rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free) 583 584 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid 585 586 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low 587 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid 588 589 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high 590 eor $input_h0, $input_h0, $rk10_h @ AES block 4k+4 - round 10 high 591 592 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid 593 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid 594 595 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1 596 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low 597 598 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2 599 eor $input_l0, $input_l0, $rk10_l @ AES block 4k+4 - round 10 low 600 601 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3 602 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid 603 604 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high 605 606 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3 607 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high 608 609 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid 610 611 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low 612 movi $mod_constant.8b, #0xc2 613 614 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid 615 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low 616 617 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4 618 619 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2 620 shl $mod_constantd, $mod_constantd, #56 @ mod_constant 621 622 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4 623 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high 624 625 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5 626 
ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 4k+5 - load plaintext 627#ifdef __AARCH64EB__ 628 rev $input_l1, $input_l1 629 rev $input_h1, $input_h1 630#endif 631 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3 632 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid 633 634 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5 635 ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 4k+6 - load plaintext 636#ifdef __AARCH64EB__ 637 rev $input_l2, $input_l2 638 rev $input_h2, $input_h2 639#endif 640 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid 641 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low 642 643 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4 644 eor $input_l1, $input_l1, $rk10_l @ AES block 4k+5 - round 10 low 645 646 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4 647 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid 648 649 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6 650 eor $input_l3, $input_l3, $rk10_l @ AES block 4k+3 - round 10 low 651 652 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5 653 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up 654 655 fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low 656 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6 657 fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high 658 659 add $input_ptr, $input_ptr, #64 @ AES input_ptr update 660 fmov $ctr_t3d, $input_l3 @ AES block 4k+3 - mov low 661 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment 662 663 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5 664 fmov $ctr_t1d, $input_l1 @ AES block 4k+5 - mov low 665 666 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7 667 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up 668 669 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6 
670 eor $input_h1, $input_h1, $rk10_h @ AES block 4k+5 - round 10 high 671 672 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7 673 fmov $ctr_t1.d[1], $input_h1 @ AES block 4k+5 - mov high 674 675 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8 676 fmov $ctr_t3.d[1], $input_h3 @ AES block 4k+3 - mov high 677 678 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6 679 cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL 680 681 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8 682 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid 683 684 aese $ctr0b, $rk9 @ AES block 4k+4 - round 9 685 eor $input_l2, $input_l2, $rk10_l @ AES block 4k+6 - round 10 low 686 eor $input_h2, $input_h2, $rk10_h @ AES block 4k+6 - round 10 high 687 688 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7 689 fmov $ctr_t2d, $input_l2 @ AES block 4k+6 - mov low 690 691 aese $ctr1b, $rk9 @ AES block 4k+5 - round 9 692 fmov $ctr_t2.d[1], $input_h2 @ AES block 4k+6 - mov high 693 694 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7 695 eor $res0b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result 696 697 fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8 698 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8 699 700 fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8 701 rev $ctr32w, $rctr32w @ CTR block 4k+9 702 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid 703 704 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8 705 eor $res1b, $ctr_t1b, $ctr1b @ AES block 4k+5 - result 706 707 add $rctr32w, $rctr32w, #1 @ CTR block 4k+9 708 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9 709 fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9 710 711 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low 712 fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9 713 rev $ctr32w, $rctr32w @ CTR block 4k+10 714 715 aese $ctr2b, $rk9 @ AES block 4k+6 - round 9 
716 st1 { $res0b}, [$output_ptr], #16 @ AES block 4k+4 - store result 717 eor $res2b, $ctr_t2b, $ctr2b @ AES block 4k+6 - result 718 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10 719 720 aese $ctr3b, $rk9 @ AES block 4k+7 - round 9 721 add $rctr32w, $rctr32w, #1 @ CTR block 4k+10 722 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment 723 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+10 724 725 eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low 726 st1 { $res1b}, [$output_ptr], #16 @ AES block 4k+5 - store result 727 728 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+10 729 st1 { $res2b}, [$output_ptr], #16 @ AES block 4k+6 - store result 730 rev $ctr32w, $rctr32w @ CTR block 4k+11 731 732 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+11 733 eor $res3b, $ctr_t3b, $ctr3b @ AES block 4k+3 - result 734 735 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low 736 st1 { $res3b}, [$output_ptr], #16 @ AES block 4k+3 - store result 737 b.lt .L128_enc_main_loop 738 739 .L128_enc_prepretail: @ PREPRETAIL 740 rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free) 741 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3 742 rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free) 743 744 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 745 add $rctr32w, $rctr32w, #1 @ CTR block 4k+3 746 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3 747 748 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0 749 rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free) 750 751 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low 752 753 rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free) 754 eor $res0b, $res0b, $acc_lb @ PRE 1 755 756 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high 757 758 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0 759 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid 760 761 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low 762 mov $t0d, $res0.d[1] @ GHASH block 4k - mid 763 764 mov $t6d, 
$res2.d[1] @ GHASH block 4k+2 - mid 765 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid 766 767 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1 768 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid 769 770 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid 771 772 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high 773 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid 774 775 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1 776 777 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid 778 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low 779 780 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid 781 782 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0 783 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid 784 785 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0 786 787 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid 788 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid 789 790 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1 791 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high 792 793 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid 794 795 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high 796 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid 797 798 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high 799 800 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low 801 802 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1 803 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high 804 805 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2 806 807 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low 808 movi $mod_constant.8b, #0xc2 809 810 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2 811 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low 812 813 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2 814 815 pmull $t9.1q, 
$t9.1d, $h12k.1d @ GHASH block 4k+3 - mid 816 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid 817 818 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3 819 820 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2 821 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high 822 823 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3 824 825 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid 826 shl $mod_constantd, $mod_constantd, #56 @ mod_constant 827 828 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3 829 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low 830 831 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4 832 833 pmull $t1.1q, $acc_h.1d, $mod_constant.1d 834 eor $acc_mb, $acc_mb, $acc_hb @ karatsuba tidy up 835 836 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4 837 838 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5 839 ext $acc_hb, $acc_hb, $acc_hb, #8 840 841 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3 842 843 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4 844 eor $acc_mb, $acc_mb, $acc_lb 845 846 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6 847 848 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4 849 850 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5 851 852 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5 853 eor $acc_mb, $acc_mb, $t1.16b 854 855 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5 856 857 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6 858 859 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6 860 861 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6 862 eor $acc_mb, $acc_mb, $acc_hb 863 864 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7 865 866 aese $ctr2b, $rk7 
\n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7 867 868 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7 869 870 pmull $t1.1q, $acc_m.1d, $mod_constant.1d 871 872 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7 873 ext $acc_mb, $acc_mb, $acc_mb, #8 874 875 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8 876 877 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8 878 eor $acc_lb, $acc_lb, $t1.16b 879 880 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8 881 882 aese $ctr3b, $rk9 @ AES block 4k+7 - round 9 883 884 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8 885 886 aese $ctr0b, $rk9 @ AES block 4k+4 - round 9 887 888 aese $ctr1b, $rk9 @ AES block 4k+5 - round 9 889 eor $acc_lb, $acc_lb, $acc_mb 890 891 aese $ctr2b, $rk9 @ AES block 4k+6 - round 9 892 .L128_enc_tail: @ TAIL 893 894 sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process 895 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES block 4k+4 - load plaintext 896#ifdef __AARCH64EB__ 897 rev $input_l0, $input_l0 898 rev $input_h0, $input_h0 899#endif 900 cmp $main_end_input_ptr, #48 901 902 ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag 903 eor $input_l0, $input_l0, $rk10_l @ AES block 4k+4 - round 10 low 904 eor $input_h0, $input_h0, $rk10_h @ AES block 4k+4 - round 10 high 905 906 fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low 907 908 fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high 909 910 eor $res1b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result 911 912 b.gt .L128_enc_blocks_more_than_3 913 914 sub $rctr32w, $rctr32w, #1 915 movi $acc_l.8b, #0 916 mov $ctr3b, $ctr2b 917 918 cmp $main_end_input_ptr, #32 919 mov $ctr2b, $ctr1b 920 movi $acc_h.8b, #0 921 922 movi $acc_m.8b, #0 923 b.gt .L128_enc_blocks_more_than_2 924 925 mov $ctr3b, $ctr1b 926 cmp $main_end_input_ptr, #16 927 928 sub $rctr32w, $rctr32w, #1 929 
b.gt .L128_enc_blocks_more_than_1 930 931 sub $rctr32w, $rctr32w, #1 932 b .L128_enc_blocks_less_than_1 933 .L128_enc_blocks_more_than_3: @ blocks left > 3 934 st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result 935 936 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-2 block - load input low & high 937#ifdef __AARCH64EB__ 938 rev $input_l0, $input_l0 939 rev $input_h0, $input_h0 940#endif 941 rev64 $res0b, $res1b @ GHASH final-3 block 942 943 eor $res0b, $res0b, $t0.16b @ feed in partial tag 944 eor $input_h0, $input_h0, $rk10_h @ AES final-2 block - round 10 high 945 eor $input_l0, $input_l0, $rk10_l @ AES final-2 block - round 10 low 946 947 fmov $res1d, $input_l0 @ AES final-2 block - mov low 948 949 movi $t0.8b, #0 @ suppress further partial tag feed in 950 fmov $res1.d[1], $input_h0 @ AES final-2 block - mov high 951 952 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low 953 mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid 954 955 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high 956 957 mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid 958 959 eor $res1b, $res1b, $ctr1b @ AES final-2 block - result 960 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid 961 962 pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid 963 .L128_enc_blocks_more_than_2: @ blocks left > 2 964 965 st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result 966 967 rev64 $res0b, $res1b @ GHASH final-2 block 968 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-1 block - load input low & high 969#ifdef __AARCH64EB__ 970 rev $input_l0, $input_l0 971 rev $input_h0, $input_h0 972#endif 973 eor $res0b, $res0b, $t0.16b @ feed in partial tag 974 975 eor $input_l0, $input_l0, $rk10_l @ AES final-1 block - round 10 low 976 977 fmov $res1d, $input_l0 @ AES final-1 block - mov low 978 eor $input_h0, $input_h0, $rk10_h @ AES final-1 block - round 10 high 979 980 pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 
block - high 981 fmov $res1.d[1], $input_h0 @ AES final-1 block - mov high 982 983 mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid 984 985 pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low 986 987 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high 988 989 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid 990 991 eor $res1b, $res1b, $ctr2b @ AES final-1 block - result 992 993 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low 994 995 pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid 996 997 movi $t0.8b, #0 @ suppress further partial tag feed in 998 999 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid 1000 .L128_enc_blocks_more_than_1: @ blocks left > 1 1001 1002 st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result 1003 1004 rev64 $res0b, $res1b @ GHASH final-1 block 1005 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final block - load input low & high 1006#ifdef __AARCH64EB__ 1007 rev $input_l0, $input_l0 1008 rev $input_h0, $input_h0 1009#endif 1010 eor $res0b, $res0b, $t0.16b @ feed in partial tag 1011 1012 eor $input_h0, $input_h0, $rk10_h @ AES final block - round 10 high 1013 eor $input_l0, $input_l0, $rk10_l @ AES final block - round 10 low 1014 1015 fmov $res1d, $input_l0 @ AES final block - mov low 1016 1017 pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high 1018 fmov $res1.d[1], $input_h0 @ AES final block - mov high 1019 1020 mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid 1021 1022 pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low 1023 1024 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid 1025 1026 eor $res1b, $res1b, $ctr3b @ AES final block - result 1027 1028 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid 1029 1030 pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid 1031 1032 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low 1033 1034 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high 1035 1036 eor $acc_mb, $acc_mb, 
$rk4v.16b @ GHASH final-1 block - mid 1037 movi $t0.8b, #0 @ suppress further partial tag feed in 1038 .L128_enc_blocks_less_than_1: @ blocks left <= 1 1039 1040 and $bit_length, $bit_length, #127 @ bit_length %= 128 1041 mvn $rk10_l, xzr @ rk10_l = 0xffffffffffffffff 1042 1043 mvn $rk10_h, xzr @ rk10_h = 0xffffffffffffffff 1044 sub $bit_length, $bit_length, #128 @ bit_length -= 128 1045 1046 neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128]) 1047 1048 and $bit_length, $bit_length, #127 @ bit_length %= 128 1049 1050 lsr $rk10_h, $rk10_h, $bit_length @ rk10_h is mask for top 64b of last block 1051 cmp $bit_length, #64 1052 1053 csel $input_l0, $rk10_l, $rk10_h, lt 1054 csel $input_h0, $rk10_h, xzr, lt 1055 1056 fmov $ctr0d, $input_l0 @ ctr0b is mask for last block 1057 1058 fmov $ctr0.d[1], $input_h0 1059 1060 and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits 1061 1062 rev64 $res0b, $res1b @ GHASH final block 1063 1064 eor $res0b, $res0b, $t0.16b @ feed in partial tag 1065 1066 mov $t0d, $res0.d[1] @ GHASH final block - mid 1067 1068 pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low 1069 ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored 1070 1071 eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid 1072#ifndef __AARCH64EB__ 1073 rev $ctr32w, $rctr32w 1074#else 1075 mov $ctr32w, $rctr32w 1076#endif 1077 pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high 1078 1079 pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid 1080 1081 eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low 1082 1083 eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high 1084 1085 eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid 1086 movi $mod_constant.8b, #0xc2 1087 1088 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up 1089 1090 shl $mod_constantd, $mod_constantd, #56 @ mod_constant 1091 1092 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba 
tidy up 1093 1094 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid 1095 1096 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment 1097 1098 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid 1099 1100 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid 1101 1102 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low 1103 1104 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment 1105 1106 bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing 1107 1108 eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low 1109 st1 { $res1b}, [$output_ptr] @ store all 16B 1110 1111 str $ctr32w, [$counter, #12] @ store the updated counter 1112 1113 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low 1114 ext $acc_lb, $acc_lb, $acc_lb, #8 1115 rev64 $acc_lb, $acc_lb 1116 mov x0, $len 1117 st1 { $acc_l.16b }, [$current_tag] 1118 ldp x21, x22, [sp, #16] 1119 ldp x23, x24, [sp, #32] 1120 ldp d8, d9, [sp, #48] 1121 ldp d10, d11, [sp, #64] 1122 ldp d12, d13, [sp, #80] 1123 ldp d14, d15, [sp, #96] 1124 ldp x19, x20, [sp], #112 1125 ret 1126 1127.L128_enc_ret: 1128 mov w0, #0x0 1129 ret 1130.size aes_gcm_enc_128_kernel,.-aes_gcm_enc_128_kernel 1131___ 1132 1133######################################################################################### 1134# size_t aes_gcm_dec_128_kernel(const unsigned char *in, 1135# size_t len, 1136# unsigned char *out, 1137# const void *key, 1138# unsigned char ivec[16], 1139# u64 *Xi); 1140# 1141$code.=<<___; 1142.global aes_gcm_dec_128_kernel 1143.type aes_gcm_dec_128_kernel,%function 1144.align 4 1145aes_gcm_dec_128_kernel: 1146 AARCH64_VALID_CALL_TARGET 1147 cbz x1, .L128_dec_ret 1148 stp x19, x20, [sp, #-112]! 
1149 mov x16, x4 1150 mov x8, x5 1151 stp x21, x22, [sp, #16] 1152 stp x23, x24, [sp, #32] 1153 stp d8, d9, [sp, #48] 1154 stp d10, d11, [sp, #64] 1155 stp d12, d13, [sp, #80] 1156 stp d14, d15, [sp, #96] 1157 1158 lsr $main_end_input_ptr, $bit_length, #3 @ byte_len 1159 mov $len, $main_end_input_ptr 1160 ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32 1161#ifdef __AARCH64EB__ 1162 rev $ctr96_b64x, $ctr96_b64x 1163 rev $ctr96_t32x, $ctr96_t32x 1164#endif 1165 ldp $rk10_l, $rk10_h, [$cc, #160] @ load rk10 1166#ifdef __AARCH64EB__ 1167 ror $rk10_h, $rk10_h, 32 1168 ror $rk10_l, $rk10_l, 32 1169#endif 1170 sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1 1171 ld1 {$rk0s}, [$cc], #16 @ load rk0 1172 1173 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) 1174 ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible 1175 1176 ldr $h2q, [$current_tag, #64] @ load h2l | h2h 1177#ifndef __AARCH64EB__ 1178 ext $h2b, $h2b, $h2b, #8 1179#endif 1180 lsr $rctr32x, $ctr96_t32x, #32 1181 fmov $ctr2d, $ctr96_b64x @ CTR block 2 1182 1183 ld1 {$rk1s}, [$cc], #16 @ load rk1 1184 orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w 1185 rev $rctr32w, $rctr32w @ rev_ctr32 1186 1187 fmov $ctr1d, $ctr96_b64x @ CTR block 1 1188 add $rctr32w, $rctr32w, #1 @ increment rev_ctr32 1189 1190 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0 1191 rev $ctr32w, $rctr32w @ CTR block 1 1192 1193 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1 1194 ld1 {$rk2s}, [$cc], #16 @ load rk2 1195 add $rctr32w, $rctr32w, #1 @ CTR block 1 1196 1197 fmov $ctr1.d[1], $ctr32x @ CTR block 1 1198 rev $ctr32w, $rctr32w @ CTR block 2 1199 add $rctr32w, $rctr32w, #1 @ CTR block 2 1200 1201 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1 1202 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR 
block 2 1203 1204 fmov $ctr2.d[1], $ctr32x @ CTR block 2 1205 rev $ctr32w, $rctr32w @ CTR block 3 1206 1207 fmov $ctr3d, $ctr96_b64x @ CTR block 3 1208 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3 1209 add $rctr32w, $rctr32w, #1 @ CTR block 3 1210 1211 fmov $ctr3.d[1], $ctr32x @ CTR block 3 1212 add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr 1213 1214 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0 1215 ld1 {$rk3s}, [$cc], #16 @ load rk3 1216 1217 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2 1218 ld1 {$rk4s}, [$cc], #16 @ load rk4 1219 1220 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0 1221 ld1 {$rk5s}, [$cc], #16 @ load rk5 1222 1223 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1 1224 ld1 {$rk6s}, [$cc], #16 @ load rk6 1225 1226 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0 1227 1228 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1 1229 1230 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2 1231 1232 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1 1233 ld1 { $acc_lb}, [$current_tag] 1234 ext $acc_lb, $acc_lb, $acc_lb, #8 1235 rev64 $acc_lb, $acc_lb 1236 1237 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3 1238 ld1 {$rk7s}, [$cc], #16 @ load rk7 1239 1240 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3 1241 1242 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2 1243 1244 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2 1245 ld1 {$rk8s}, [$cc], #16 @ load rk8 1246 1247 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4 1248 1249 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3 1250 1251 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3 1252 ldr $h3q, [$current_tag, #80] @ load h3l | h3h 1253#ifndef __AARCH64EB__ 1254 ext $h3b, $h3b, $h3b, #8 1255#endif 1256 aese $ctr0b, 
$rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4 1257 ld1 {$rk9s}, [$cc], #16 @ load rk9 1258 1259 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5 1260 1261 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4 1262 1263 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4 1264 1265 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5 1266 1267 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5 1268 ldr $h1q, [$current_tag, #32] @ load h1l | h1h 1269#ifndef __AARCH64EB__ 1270 ext $h1b, $h1b, $h1b, #8 1271#endif 1272 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5 1273 1274 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6 1275 1276 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6 1277 1278 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6 1279 1280 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6 1281 trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h 1282 1283 ldr $h4q, [$current_tag, #112] @ load h4l | h4h 1284#ifndef __AARCH64EB__ 1285 ext $h4b, $h4b, $h4b, #8 1286#endif 1287 trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l 1288 add $main_end_input_ptr, $main_end_input_ptr, $input_ptr 1289 1290 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7 1291 1292 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7 1293 1294 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7 1295 eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k 1296 1297 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7 1298 1299 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8 1300 trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l 1301 1302 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8 1303 1304 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8 1305 1306 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8 1307 trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h 
1308 1309 aese $ctr2b, $rk9 @ AES block 2 - round 9 1310 1311 aese $ctr3b, $rk9 @ AES block 3 - round 9 1312 1313 aese $ctr0b, $rk9 @ AES block 0 - round 9 1314 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks 1315 1316 aese $ctr1b, $rk9 @ AES block 1 - round 9 1317 eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k 1318 b.ge .L128_dec_tail @ handle tail 1319 1320 ld1 {$res0b, $res1b}, [$input_ptr], #32 @ AES block 0 - load ciphertext; AES block 1 - load ciphertext 1321 1322 eor $ctr1b, $res1b, $ctr1b @ AES block 1 - result 1323 ld1 {$res2b}, [$input_ptr], #16 @ AES block 2 - load ciphertext 1324 1325 eor $ctr0b, $res0b, $ctr0b @ AES block 0 - result 1326 rev64 $res0b, $res0b @ GHASH block 0 1327 rev $ctr32w, $rctr32w @ CTR block 4 1328 1329 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4 1330 add $rctr32w, $rctr32w, #1 @ CTR block 4 1331 ld1 {$res3b}, [$input_ptr], #16 @ AES block 3 - load ciphertext 1332 1333 rev64 $res1b, $res1b @ GHASH block 1 1334 mov $output_l1, $ctr1.d[0] @ AES block 1 - mov low 1335 1336 mov $output_h1, $ctr1.d[1] @ AES block 1 - mov high 1337 1338 mov $output_l0, $ctr0.d[0] @ AES block 0 - mov low 1339 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks 1340 1341 mov $output_h0, $ctr0.d[1] @ AES block 0 - mov high 1342 1343 fmov $ctr0d, $ctr96_b64x @ CTR block 4 1344 1345 fmov $ctr0.d[1], $ctr32x @ CTR block 4 1346 rev $ctr32w, $rctr32w @ CTR block 5 1347 eor $output_l1, $output_l1, $rk10_l @ AES block 1 - round 10 low 1348#ifdef __AARCH64EB__ 1349 rev $output_l1, $output_l1 1350#endif 1351 fmov $ctr1d, $ctr96_b64x @ CTR block 5 1352 add $rctr32w, $rctr32w, #1 @ CTR block 5 1353 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5 1354 1355 fmov $ctr1.d[1], $ctr32x @ CTR block 5 1356 rev $ctr32w, $rctr32w @ CTR block 6 1357 add $rctr32w, $rctr32w, #1 @ CTR block 6 1358 1359 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6 1360 1361 eor $output_h1, $output_h1, $rk10_h @ AES block 1 - round 
10 high 1362#ifdef __AARCH64EB__ 1363 rev $output_h1, $output_h1 1364#endif 1365 eor $output_l0, $output_l0, $rk10_l @ AES block 0 - round 10 low 1366#ifdef __AARCH64EB__ 1367 rev $output_l0, $output_l0 1368#endif 1369 eor $ctr2b, $res2b, $ctr2b @ AES block 2 - result 1370 1371 eor $output_h0, $output_h0, $rk10_h @ AES block 0 - round 10 high 1372#ifdef __AARCH64EB__ 1373 rev $output_h0, $output_h0 1374#endif 1375 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 0 - store result 1376 1377 stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 1 - store result 1378 b.ge .L128_dec_prepretail @ do prepretail 1379 1380 .L128_dec_main_loop: @ main loop start 1381 eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result 1382 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 1383 mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low 1384 1385 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high 1386 mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high 1387 1388 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0 1389 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6 1390 1391 rev64 $res2b, $res2b @ GHASH block 4k+2 1392 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6 1393 rev $ctr32w, $rctr32w @ CTR block 4k+7 1394 1395 mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low 1396 eor $res0b, $res0b, $acc_lb @ PRE 1 1397 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid 1398 1399 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1 1400 rev64 $res3b, $res3b @ GHASH block 4k+3 1401 1402 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low 1403 mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high 1404 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7 1405 1406 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low 1407 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7 1408 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid 1409 1410 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2 1411 fmov $ctr3.d[1], $ctr32x @ CTR block 
4k+7 1412 1413 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0 1414 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid 1415 1416 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high 1417 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low 1418 1419 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low 1420 1421 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3 1422 mov $t0d, $res0.d[1] @ GHASH block 4k - mid 1423 1424 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0 1425 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high 1426 1427 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0 1428 1429 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low 1430 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid 1431 1432 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1 1433 eor $output_l3, $output_l3, $rk10_l @ AES block 4k+3 - round 10 low 1434#ifdef __AARCH64EB__ 1435 rev $output_l3, $output_l3 1436#endif 1437 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid 1438 eor $output_h2, $output_h2, $rk10_h @ AES block 4k+2 - round 10 high 1439#ifdef __AARCH64EB__ 1440 rev $output_h2, $output_h2 1441#endif 1442 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid 1443 1444 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1 1445 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low 1446 1447 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid 1448 1449 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2 1450 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid 1451 1452 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2 1453 1454 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4 1455 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid 1456 1457 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high 1458 1459 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3 1460 ins 
$t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid 1461 1462 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high 1463 1464 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1 1465 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid 1466 1467 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4 1468 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high 1469 1470 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid 1471 eor $output_h3, $output_h3, $rk10_h @ AES block 4k+3 - round 10 high 1472#ifdef __AARCH64EB__ 1473 rev $output_h3, $output_h3 1474#endif 1475 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2 1476 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid 1477 1478 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5 1479 eor $output_l2, $output_l2, $rk10_l @ AES block 4k+2 - round 10 low 1480#ifdef __AARCH64EB__ 1481 rev $output_l2, $output_l2 1482#endif 1483 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5 1484 movi $mod_constant.8b, #0xc2 1485 1486 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3 1487 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low 1488 1489 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6 1490 1491 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6 1492 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid 1493 1494 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4 1495 stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result 1496 1497 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid 1498 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high 1499 ld1 {$res0b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext 1500 1501 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7 1502 add $rctr32w, $rctr32w, #1 @ CTR block 4k+7 1503 1504 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7 1505 shl
$mod_constantd, $mod_constantd, #56 @ mod_constant 1506 1507 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5 1508 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid 1509 1510 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8 1511 stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result 1512 1513 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8 1514 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up 1515 1516 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3 1517 rev $ctr32w, $rctr32w @ CTR block 4k+8 1518 1519 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid 1520 ld1 {$res1b}, [$input_ptr], #16 @ AES block 4k+5 - load ciphertext 1521 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment 1522 1523 aese $ctr0b, $rk9 @ AES block 4k+4 - round 9 1524 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8 1525 1526 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4 1527 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up 1528 1529 aese $ctr1b, $rk9 @ AES block 4k+5 - round 9 1530 1531 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6 1532 eor $ctr0b, $res0b, $ctr0b @ AES block 4k+4 - result 1533 1534 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5 1535 ld1 {$res2b}, [$input_ptr], #16 @ AES block 4k+6 - load ciphertext 1536 1537 add $rctr32w, $rctr32w, #1 @ CTR block 4k+8 1538 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid 1539 eor $ctr1b, $res1b, $ctr1b @ AES block 4k+5 - result 1540 1541 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7 1542 ld1 {$res3b}, [$input_ptr], #16 @ AES block 4k+7 - load ciphertext 1543 1544 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6 1545 1546 rev64 $res1b, $res1b @ GHASH block 4k+5 1547 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid 1548 mov $output_h0,
$ctr0.d[1] @ AES block 4k+4 - mov high 1549 1550 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8 1551 mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low 1552 1553 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7 1554 fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8 1555 1556 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low 1557 fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8 1558 rev $ctr32w, $rctr32w @ CTR block 4k+9 1559 1560 aese $ctr2b, $rk9 @ AES block 4k+6 - round 9 1561 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9 1562 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment 1563 1564 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8 1565 eor $output_h0, $output_h0, $rk10_h @ AES block 4k+4 - round 10 high 1566#ifdef __AARCH64EB__ 1567 rev $output_h0, $output_h0 1568#endif 1569 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low 1570 mov $output_h1, $ctr1.d[1] @ AES block 4k+5 - mov high 1571 eor $output_l0, $output_l0, $rk10_l @ AES block 4k+4 - round 10 low 1572#ifdef __AARCH64EB__ 1573 rev $output_l0, $output_l0 1574#endif 1575 eor $ctr2b, $res2b, $ctr2b @ AES block 4k+6 - result 1576 mov $output_l1, $ctr1.d[0] @ AES block 4k+5 - mov low 1577 add $rctr32w, $rctr32w, #1 @ CTR block 4k+9 1578 1579 aese $ctr3b, $rk9 @ AES block 4k+7 - round 9 1580 fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9 1581 cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL 1582 1583 rev64 $res0b, $res0b @ GHASH block 4k+4 1584 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low 1585 fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9 1586 1587 rev $ctr32w, $rctr32w @ CTR block 4k+10 1588 add $rctr32w, $rctr32w, #1 @ CTR block 4k+10 1589 1590 eor $output_h1, $output_h1, $rk10_h @ AES block 4k+5 - round 10 high 1591#ifdef __AARCH64EB__ 1592 rev $output_h1, $output_h1 1593#endif 1594 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 4k+4 - store result 1595 1596 eor 
$output_l1, $output_l1, $rk10_l @ AES block 4k+5 - round 10 low 1597#ifdef __AARCH64EB__ 1598 rev $output_l1, $output_l1 1599#endif 1600 stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 4k+5 - store result 1601 1602 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10 1603 b.lt .L128_dec_main_loop @ LOOP CONTROL - repeat while input_ptr < main_end_input_ptr (flags from the earlier cmp) 1604 1605 .L128_dec_prepretail: @ PREPRETAIL 1606 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 1607 mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low 1608 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid 1609 1610 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0 1611 eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result 1612 1613 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0 1614 mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high 1615 1616 eor $res0b, $res0b, $acc_lb @ PRE 1 1617 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6 1618 rev64 $res2b, $res2b @ GHASH block 4k+2 1619 1620 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1 1621 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6 1622 1623 rev $ctr32w, $rctr32w @ CTR block 4k+7 1624 mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low 1625 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid 1626 1627 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low 1628 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid 1629 mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high 1630 1631 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1 1632 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid 1633 1634 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2 1635 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7 1636 1637 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low 1638 mov $t0d, $res0.d[1] @ GHASH block 4k - mid 1639 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7 1640 1641 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0 1642 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7 1643 1644 pmull $t3.1q, $t3.1d,
$h34k.1d @ GHASH block 4k+1 - mid 1645 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid 1646 1647 rev64 $res3b, $res3b @ GHASH block 4k+3 1648 1649 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1 1650 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid 1651 1652 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high 1653 1654 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0 1655 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid 1656 1657 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high 1658 1659 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid 1660 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low 1661 1662 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low 1663 1664 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid 1665 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high 1666 1667 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid 1668 1669 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high 1670 1671 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high 1672 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid 1673 1674 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2 1675 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid 1676 1677 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low 1678 1679 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high 1680 movi $mod_constant.8b, #0xc2 1681 1682 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1 1683 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid 1684 1685 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low 1686 1687 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2 1688 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high 1689 1690 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2 1691 eor $output_l3, $output_l3, $rk10_l @ AES block 4k+3 - round 10 low 1692#ifdef __AARCH64EB__ 1693 rev $output_l3, $output_l3 1694#endif 1695 
pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid 1696 eor $output_l2, $output_l2, $rk10_l @ AES block 4k+2 - round 10 low 1697#ifdef __AARCH64EB__ 1698 rev $output_l2, $output_l2 1699#endif 1700 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low 1701 1702 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3 1703 1704 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3 1705 shl $mod_constantd, $mod_constantd, #56 @ mod_constant 1706 1707 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3 1708 1709 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4 1710 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid 1711 1712 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4 1713 1714 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3 1715 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up 1716 1717 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5 1718 1719 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5 1720 1721 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4 1722 1723 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4 1724 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up 1725 1726 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid 1727 1728 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6 1729 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment 1730 1731 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5 1732 1733 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5 1734 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid 1735 1736 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7 1737 1738 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6 1739 1740 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6 1741 
1742 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8 1743 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid 1744 1745 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6 1746 1747 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7 1748 1749 aese $ctr1b, $rk9 @ AES block 4k+5 - round 9 1750 1751 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low 1752 eor $output_h3, $output_h3, $rk10_h @ AES block 4k+3 - round 10 high 1753#ifdef __AARCH64EB__ 1754 rev $output_h3, $output_h3 1755#endif 1756 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7 1757 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment 1758 1759 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7 1760 1761 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8 1762 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low 1763 1764 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8 1765 1766 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8 1767 eor $output_h2, $output_h2, $rk10_h @ AES block 4k+2 - round 10 high 1768#ifdef __AARCH64EB__ 1769 rev $output_h2, $output_h2 1770#endif 1771 aese $ctr0b, $rk9 @ AES block 4k+4 - round 9 1772 stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result 1773 1774 aese $ctr2b, $rk9 @ AES block 4k+6 - round 9 1775 add $rctr32w, $rctr32w, #1 @ CTR block 4k+7 1776 stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result 1777 1778 aese $ctr3b, $rk9 @ AES block 4k+7 - round 9 1779 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low 1780 .L128_dec_tail: @ TAIL 1781 1782 sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process 1783 ld1 { $res1b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext 1784 1785 eor $ctr0b, $res1b, $ctr0b @ AES block 4k+4 - result 1786 1787 mov 
$output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high 1788 1789 mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low 1790 1791 cmp $main_end_input_ptr, #48 1792 1793 eor $output_h0, $output_h0, $rk10_h @ AES block 4k+4 - round 10 high 1794#ifdef __AARCH64EB__ 1795 rev $output_h0, $output_h0 1796#endif 1797 ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag 1798 eor $output_l0, $output_l0, $rk10_l @ AES block 4k+4 - round 10 low 1799#ifdef __AARCH64EB__ 1800 rev $output_l0, $output_l0 1801#endif 1802 b.gt .L128_dec_blocks_more_than_3 1803 1804 mov $ctr3b, $ctr2b 1805 sub $rctr32w, $rctr32w, #1 1806 movi $acc_l.8b, #0 1807 1808 movi $acc_h.8b, #0 1809 mov $ctr2b, $ctr1b 1810 1811 movi $acc_m.8b, #0 1812 cmp $main_end_input_ptr, #32 1813 b.gt .L128_dec_blocks_more_than_2 1814 1815 cmp $main_end_input_ptr, #16 1816 1817 mov $ctr3b, $ctr1b 1818 sub $rctr32w, $rctr32w, #1 1819 b.gt .L128_dec_blocks_more_than_1 1820 1821 sub $rctr32w, $rctr32w, #1 1822 b .L128_dec_blocks_less_than_1 1823 .L128_dec_blocks_more_than_3: @ blocks left > 3 1824 rev64 $res0b, $res1b @ GHASH final-3 block 1825 ld1 { $res1b}, [$input_ptr], #16 @ AES final-2 block - load ciphertext 1826 1827 eor $res0b, $res0b, $t0.16b @ feed in partial tag 1828 1829 mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid 1830 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-3 block - store result 1831 eor $ctr0b, $res1b, $ctr1b @ AES final-2 block - result 1832 1833 mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid 1834 mov $output_h0, $ctr0.d[1] @ AES final-2 block - mov high 1835 1836 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low 1837 mov $output_l0, $ctr0.d[0] @ AES final-2 block - mov low 1838 1839 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high 1840 1841 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid 1842 1843 movi $t0.8b, #0 @ suppress further partial tag feed in 1844 eor $output_h0, $output_h0, $rk10_h @ AES final-2 block - round 10 high 
1845#ifdef __AARCH64EB__ 1846 rev $output_h0, $output_h0 1847#endif 1848 pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid 1849 eor $output_l0, $output_l0, $rk10_l @ AES final-2 block - round 10 low 1850#ifdef __AARCH64EB__ 1851 rev $output_l0, $output_l0 1852#endif 1853 .L128_dec_blocks_more_than_2: @ blocks left > 2 1854 1855 rev64 $res0b, $res1b @ GHASH final-2 block 1856 ld1 { $res1b}, [$input_ptr], #16 @ AES final-1 block - load ciphertext 1857 1858 eor $res0b, $res0b, $t0.16b @ feed in partial tag 1859 1860 eor $ctr0b, $res1b, $ctr2b @ AES final-1 block - result 1861 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-2 block - store result 1862 1863 mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid 1864 1865 pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low 1866 1867 pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high 1868 mov $output_l0, $ctr0.d[0] @ AES final-1 block - mov low 1869 1870 mov $output_h0, $ctr0.d[1] @ AES final-1 block - mov high 1871 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid 1872 1873 movi $t0.8b, #0 @ suppress further partial tag feed in 1874 1875 pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid 1876 1877 eor $output_l0, $output_l0, $rk10_l @ AES final-1 block - round 10 low 1878#ifdef __AARCH64EB__ 1879 rev $output_l0, $output_l0 1880#endif 1881 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low 1882 1883 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high 1884 1885 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid 1886 eor $output_h0, $output_h0, $rk10_h @ AES final-1 block - round 10 high 1887#ifdef __AARCH64EB__ 1888 rev $output_h0, $output_h0 1889#endif 1890 .L128_dec_blocks_more_than_1: @ blocks left > 1 1891 1892 rev64 $res0b, $res1b @ GHASH final-1 block 1893 1894 ld1 { $res1b}, [$input_ptr], #16 @ AES final block - load ciphertext 1895 eor $res0b, $res0b, $t0.16b @ feed in partial tag 1896 1897 mov $rk4d, $res0.d[1] @ GHASH final-1 block - 
mid 1898 1899 eor $ctr0b, $res1b, $ctr3b @ AES final block - result 1900 1901 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid 1902 1903 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-1 block - store result 1904 mov $output_l0, $ctr0.d[0] @ AES final block - mov low 1905 1906 mov $output_h0, $ctr0.d[1] @ AES final block - mov high 1907 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid 1908 1909 pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low 1910 1911 pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high 1912 1913 pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid 1914 movi $t0.8b, #0 @ suppress further partial tag feed in 1915 1916 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low 1917 1918 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high 1919 eor $output_h0, $output_h0, $rk10_h @ AES final block - round 10 high 1920#ifdef __AARCH64EB__ 1921 rev $output_h0, $output_h0 1922#endif 1923 eor $output_l0, $output_l0, $rk10_l @ AES final block - round 10 low 1924#ifdef __AARCH64EB__ 1925 rev $output_l0, $output_l0 1926#endif 1927 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid 1928 .L128_dec_blocks_less_than_1: @ blocks left <= 1 1929 1930 mvn $rk10_h, xzr @ rk10_h = 0xffffffffffffffff 1931 and $bit_length, $bit_length, #127 @ bit_length %= 128 1932 1933 mvn $rk10_l, xzr @ rk10_l = 0xffffffffffffffff 1934 sub $bit_length, $bit_length, #128 @ bit_length -= 128 1935 1936 neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128]) 1937 1938 and $bit_length, $bit_length, #127 @ bit_length %= 128 1939 1940 lsr $rk10_h, $rk10_h, $bit_length @ rk10_h is mask for top 64b of last block 1941 cmp $bit_length, #64 1942 1943 csel $ctr96_b64x, $rk10_h, xzr, lt 1944 csel $ctr32x, $rk10_l, $rk10_h, lt 1945 1946 fmov $ctr0d, $ctr32x @ ctr0b is mask for last block 1947 1948 mov $ctr0.d[1], $ctr96_b64x 1949 1950 and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in 
highest bits 1951 1952 rev64 $res0b, $res1b @ GHASH final block 1953 1954 eor $res0b, $res0b, $t0.16b @ feed in partial tag 1955 1956 ldp $end_input_ptr, $main_end_input_ptr, [$output_ptr] @ load existing bytes we need to not overwrite 1957 1958 and $output_h0, $output_h0, $ctr96_b64x 1959 1960 pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high 1961 mov $t0d, $res0.d[1] @ GHASH final block - mid 1962 1963 eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid 1964 eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high 1965 1966 pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid 1967 1968 pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low 1969 bic $end_input_ptr, $end_input_ptr, $ctr32x @ mask out low existing bytes 1970 and $output_l0, $output_l0, $ctr32x 1971 1972#ifndef __AARCH64EB__ 1973 rev $ctr32w, $rctr32w 1974#else 1975 mov $ctr32w, $rctr32w 1976#endif 1977 1978 eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid 1979 movi $mod_constant.8b, #0xc2 1980 1981 eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low 1982 1983 bic $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x @ mask out high existing bytes 1984 shl $mod_constantd, $mod_constantd, #56 @ mod_constant 1985 1986 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up 1987 1988 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid 1989 1990 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up 1991 1992 orr $output_l0, $output_l0, $end_input_ptr 1993 str $ctr32w, [$counter, #12] @ store the updated counter 1994 1995 orr $output_h0, $output_h0, $main_end_input_ptr 1996 stp $output_l0, $output_h0, [$output_ptr] 1997 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment 1998 1999 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid 2000 2001 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid 2002 2003 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low 2004 ext $acc_mb, $acc_mb, $acc_mb, #8 @ 
MODULO - other mid alignment 2005 2006 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low 2007 2008 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low 2009 ext $acc_lb, $acc_lb, $acc_lb, #8 2010 rev64 $acc_lb, $acc_lb 2011 mov x0, $len 2012 st1 { $acc_l.16b }, [$current_tag] 2013 2014 ldp x21, x22, [sp, #16] 2015 ldp x23, x24, [sp, #32] 2016 ldp d8, d9, [sp, #48] 2017 ldp d10, d11, [sp, #64] 2018 ldp d12, d13, [sp, #80] 2019 ldp d14, d15, [sp, #96] 2020 ldp x19, x20, [sp], #112 2021 ret 2022 2023 .L128_dec_ret: 2024 mov w0, #0x0 2025 ret 2026.size aes_gcm_dec_128_kernel,.-aes_gcm_dec_128_kernel 2027___ 2028} 2029 2030{ 2031my ($end_input_ptr,$main_end_input_ptr,$input_l0,$input_h0)=map("x$_",(4..7)); 2032my ($input_l1,$input_h1,$input_l2,$input_h2,$input_l3,$input_h3)=map("x$_",(19..24)); 2033my ($output_l1,$output_h1,$output_l2,$output_h2,$output_l3,$output_h3)=map("x$_",(19..24)); 2034my ($output_l0,$output_h0)=map("x$_",(6..7)); 2035 2036my $ctr32w="w9"; 2037my ($ctr32x,$ctr96_b64x,$ctr96_t32x,$rctr32x,$rk12_l,$rk12_h,$len)=map("x$_",(9..15)); 2038my ($ctr96_t32w,$rctr32w)=map("w$_",(11..12)); 2039 2040my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$res0b,$res1b,$res2b,$res3b)=map("v$_.16b",(0..7)); 2041my ($ctr0,$ctr1,$ctr2,$ctr3,$res0,$res1,$res2,$res3)=map("v$_",(0..7)); 2042my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$res0d,$res1d,$res2d,$res3d)=map("d$_",(0..7)); 2043my ($res0q,$res1q,$res2q,$res3q)=map("q$_",(4..7)); 2044 2045my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(9..11)); 2046my ($acc_h,$acc_m,$acc_l)=map("v$_",(9..11)); 2047my ($acc_hd,$acc_md,$acc_ld)=map("d$_",(9..11)); 2048 2049my ($h1,$h2,$h3,$h4,$h12k,$h34k)=map("v$_",(12..17)); 2050my ($h1q,$h2q,$h3q,$h4q)=map("q$_",(12..15)); 2051my ($h1b,$h2b,$h3b,$h4b)=map("v$_.16b",(12..15)); 2052 2053my $t0="v8"; 2054my $t0d="d8"; 2055my $t3="v4"; 2056my $t3d="d4"; 2057 2058my ($t1,$t2)=map("v$_",(30..31)); 2059my ($t1d,$t2d)=map("d$_",(30..31)); 2060 2061my $t4="v30"; 2062my $t4d="d30"; 2063my $t5="v8"; 2064my 
$t5d="d8"; 2065my $t6="v31"; 2066my $t6d="d31"; 2067 2068my $t7="v5"; 2069my $t7d="d5"; 2070my $t8="v6"; 2071my $t8d="d6"; 2072my $t9="v30"; 2073my $t9d="d30"; 2074 2075my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3)=map("v$_",(4..7)); 2076my ($ctr_t0d,$ctr_t1d,$ctr_t2d,$ctr_t3d)=map("d$_",(4..7)); 2077my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b)=map("v$_.16b",(4..7)); 2078 2079my $mod_constantd="d8"; 2080my $mod_constant="v8"; 2081my $mod_t="v31"; 2082 2083my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9,$rk10,$rk11)=map("v$_.16b",(18..29)); 2084my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q,$rk10q,$rk11q)=map("q$_",(18..29)); 2085my ($rk0s,$rk1s,$rk2s,$rk3s,$rk4s,$rk5s,$rk6s,$rk7s,$rk8s,$rk9s,$rk10s,$rk11s)=map("v$_.4s",(18..29)); 2086my $rk2q1="v20.1q"; 2087my $rk3q1="v21.1q"; 2088my $rk4v="v22"; 2089my $rk4d="d22"; 2090 2091######################################################################################### 2092# size_t aes_gcm_enc_192_kernel(const unsigned char *in, 2093# size_t len, 2094# unsigned char *out, 2095# const void *key, 2096# unsigned char ivec[16], 2097# u64 *Xi); 2098# 2099$code.=<<___; 2100.global aes_gcm_enc_192_kernel 2101.type aes_gcm_enc_192_kernel,%function 2102.align 4 2103aes_gcm_enc_192_kernel: 2104 AARCH64_VALID_CALL_TARGET 2105 cbz x1, .L192_enc_ret 2106 stp x19, x20, [sp, #-112]! 
2107 mov x16, x4 2108 mov x8, x5 2109 stp x21, x22, [sp, #16] 2110 stp x23, x24, [sp, #32] 2111 stp d8, d9, [sp, #48] 2112 stp d10, d11, [sp, #64] 2113 stp d12, d13, [sp, #80] 2114 stp d14, d15, [sp, #96] 2115 2116 ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32 2117#ifdef __AARCH64EB__ 2118 rev $ctr96_b64x, $ctr96_b64x 2119 rev $ctr96_t32x, $ctr96_t32x 2120#endif 2121 ldp $rk12_l, $rk12_h, [$cc, #192] @ load rk12 2122#ifdef __AARCH64EB__ 2123 ror $rk12_l, $rk12_l, #32 2124 ror $rk12_h, $rk12_h, #32 2125#endif 2126 ld1 {$rk0s}, [$cc], #16 @ load rk0 2127 2128 ld1 {$rk1s}, [$cc], #16 @ load rk1 2129 2130 ld1 {$rk2s}, [$cc], #16 @ load rk2 2131 2132 lsr $rctr32x, $ctr96_t32x, #32 2133 ld1 {$rk3s}, [$cc], #16 @ load rk3 2134 orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w 2135 2136 ld1 {$rk4s}, [$cc], #16 @ load rk4 2137 rev $rctr32w, $rctr32w @ rev_ctr32 2138 2139 add $rctr32w, $rctr32w, #1 @ increment rev_ctr32 2140 fmov $ctr3d, $ctr96_b64x @ CTR block 3 2141 2142 rev $ctr32w, $rctr32w @ CTR block 1 2143 add $rctr32w, $rctr32w, #1 @ CTR block 1 2144 fmov $ctr1d, $ctr96_b64x @ CTR block 1 2145 2146 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1 2147 ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible 2148 2149 fmov $ctr1.d[1], $ctr32x @ CTR block 1 2150 rev $ctr32w, $rctr32w @ CTR block 2 2151 add $rctr32w, $rctr32w, #1 @ CTR block 2 2152 2153 fmov $ctr2d, $ctr96_b64x @ CTR block 2 2154 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2 2155 2156 fmov $ctr2.d[1], $ctr32x @ CTR block 2 2157 rev $ctr32w, $rctr32w @ CTR block 3 2158 2159 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3 2160 ld1 {$rk5s}, [$cc], #16 @ load rk5 2161 2162 fmov $ctr3.d[1], $ctr32x @ CTR block 3 2163 2164 ld1 {$rk6s}, [$cc], #16 @ load rk6 2165 2166 ld1 {$rk7s}, [$cc], #16 @ load rk7 2167 2168 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0 2169 ld1 { $acc_lb}, 
[$current_tag] 2170 ext $acc_lb, $acc_lb, $acc_lb, #8 2171 rev64 $acc_lb, $acc_lb 2172 2173 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0 2174 ld1 {$rk8s}, [$cc], #16 @ load rk8 2175 2176 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0 2177 ldr $h4q, [$current_tag, #112] @ load h4l | h4h 2178#ifndef __AARCH64EB__ 2179 ext $h4b, $h4b, $h4b, #8 2180#endif 2181 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0 2182 ld1 {$rk9s}, [$cc], #16 @ load rk9 2183 2184 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1 2185 ld1 {$rk10s}, [$cc], #16 @ load rk10 2186 2187 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1 2188 ldr $h1q, [$current_tag, #32] @ load h1l | h1h 2189#ifndef __AARCH64EB__ 2190 ext $h1b, $h1b, $h1b, #8 2191#endif 2192 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1 2193 ld1 {$rk11s}, [$cc], #16 @ load rk11 2194 2195 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1 2196 ldr $h3q, [$current_tag, #80] @ load h3l | h3h 2197#ifndef __AARCH64EB__ 2198 ext $h3b, $h3b, $h3b, #8 2199#endif 2200 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2 2201 2202 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2 2203 2204 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2 2205 2206 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3 2207 trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h 2208 2209 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3 2210 2211 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2 2212 trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l 2213 2214 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4 2215 2216 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3 2217 2218 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3 2219 2220 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5 2221 2222 aese $ctr2b, 
$rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4 2223 2224 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4 2225 2226 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6 2227 2228 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4 2229 2230 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5 2231 2232 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5 2233 2234 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5 2235 2236 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6 2237 ldr $h2q, [$current_tag, #64] @ load h2l | h2h 2238#ifndef __AARCH64EB__ 2239 ext $h2b, $h2b, $h2b, #8 2240#endif 2241 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6 2242 2243 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6 2244 2245 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7 2246 2247 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7 2248 trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l 2249 2250 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7 2251 2252 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8 2253 2254 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7 2255 trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h 2256 2257 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8 2258 2259 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8 2260 2261 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8 2262 2263 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9 2264 2265 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9 2266 2267 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9 2268 2269 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9 2270 2271 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10 2272 2273 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 
- round 10 2274 2275 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10 2276 lsr $main_end_input_ptr, $bit_length, #3 @ byte_len 2277 mov $len, $main_end_input_ptr 2278 2279 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10 2280 sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1 2281 2282 eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k 2283 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) 2284 2285 eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k 2286 2287 aese $ctr2b, $rk11 @ AES block 2 - round 11 2288 add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr 2289 add $main_end_input_ptr, $main_end_input_ptr, $input_ptr 2290 2291 aese $ctr1b, $rk11 @ AES block 1 - round 11 2292 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks 2293 2294 aese $ctr0b, $rk11 @ AES block 0 - round 11 2295 add $rctr32w, $rctr32w, #1 @ CTR block 3 2296 2297 aese $ctr3b, $rk11 @ AES block 3 - round 11 2298 b.ge .L192_enc_tail @ handle tail 2299 2300 rev $ctr32w, $rctr32w @ CTR block 4 2301 ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 0 - load plaintext 2302#ifdef __AARCH64EB__ 2303 rev $input_l0, $input_l0 2304 rev $input_h0, $input_h0 2305#endif 2306 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4 2307 ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 2 - load plaintext 2308#ifdef __AARCH64EB__ 2309 rev $input_l2, $input_l2 2310 rev $input_h2, $input_h2 2311#endif 2312 ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 3 - load plaintext 2313#ifdef __AARCH64EB__ 2314 rev $input_l3, $input_l3 2315 rev $input_h3, $input_h3 2316#endif 2317 ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 1 - load plaintext 2318#ifdef __AARCH64EB__ 2319 rev $input_l1, $input_l1 2320 rev $input_h1, $input_h1 2321#endif 2322 add $input_ptr, $input_ptr, #64 @ AES input_ptr update 2323 cmp 
$input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks 2324 2325 eor $input_l0, $input_l0, $rk12_l @ AES block 0 - round 12 low 2326 2327 eor $input_h0, $input_h0, $rk12_h @ AES block 0 - round 12 high 2328 eor $input_h2, $input_h2, $rk12_h @ AES block 2 - round 12 high 2329 fmov $ctr_t0d, $input_l0 @ AES block 0 - mov low 2330 2331 eor $input_h3, $input_h3, $rk12_h @ AES block 3 - round 12 high 2332 fmov $ctr_t0.d[1], $input_h0 @ AES block 0 - mov high 2333 2334 eor $input_l2, $input_l2, $rk12_l @ AES block 2 - round 12 low 2335 eor $input_l1, $input_l1, $rk12_l @ AES block 1 - round 12 low 2336 2337 fmov $ctr_t1d, $input_l1 @ AES block 1 - mov low 2338 eor $input_h1, $input_h1, $rk12_h @ AES block 1 - round 12 high 2339 2340 fmov $ctr_t1.d[1], $input_h1 @ AES block 1 - mov high 2341 2342 eor $input_l3, $input_l3, $rk12_l @ AES block 3 - round 12 low 2343 fmov $ctr_t2d, $input_l2 @ AES block 2 - mov low 2344 2345 add $rctr32w, $rctr32w, #1 @ CTR block 4 2346 eor $res0b, $ctr_t0b, $ctr0b @ AES block 0 - result 2347 fmov $ctr0d, $ctr96_b64x @ CTR block 4 2348 2349 fmov $ctr0.d[1], $ctr32x @ CTR block 4 2350 rev $ctr32w, $rctr32w @ CTR block 5 2351 2352 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5 2353 add $rctr32w, $rctr32w, #1 @ CTR block 5 2354 2355 fmov $ctr_t3d, $input_l3 @ AES block 3 - mov low 2356 st1 { $res0b}, [$output_ptr], #16 @ AES block 0 - store result 2357 2358 fmov $ctr_t2.d[1], $input_h2 @ AES block 2 - mov high 2359 2360 eor $res1b, $ctr_t1b, $ctr1b @ AES block 1 - result 2361 fmov $ctr1d, $ctr96_b64x @ CTR block 5 2362 st1 { $res1b}, [$output_ptr], #16 @ AES block 1 - store result 2363 2364 fmov $ctr_t3.d[1], $input_h3 @ AES block 3 - mov high 2365 2366 fmov $ctr1.d[1], $ctr32x @ CTR block 5 2367 rev $ctr32w, $rctr32w @ CTR block 6 2368 2369 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6 2370 2371 add $rctr32w, $rctr32w, #1 @ CTR block 6 2372 eor $res2b, $ctr_t2b, $ctr2b @ AES block 2 - result 2373 fmov $ctr2d, 
$ctr96_b64x @ CTR block 6 2374 2375 fmov $ctr2.d[1], $ctr32x @ CTR block 6 2376 rev $ctr32w, $rctr32w @ CTR block 7 2377 2378 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 7 2379 st1 { $res2b}, [$output_ptr], #16 @ AES block 2 - store result 2380 2381 eor $res3b, $ctr_t3b, $ctr3b @ AES block 3 - result 2382 st1 { $res3b}, [$output_ptr], #16 @ AES block 3 - store result 2383 b.ge .L192_enc_prepretail @ do prepretail 2384 2385 .L192_enc_main_loop: @ main loop start 2386 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0 2387 rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free) 2388 2389 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0 2390 ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 4k+5 - load plaintext 2391#ifdef __AARCH64EB__ 2392 rev $input_l1, $input_l1 2393 rev $input_h1, $input_h1 2394#endif 2395 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 2396 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3 2397 rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free) 2398 2399 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1 2400 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3 2401 2402 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high 2403 rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free) 2404 ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 4k+6 - load plaintext 2405#ifdef __AARCH64EB__ 2406 rev $input_l2, $input_l2 2407 rev $input_h2, $input_h2 2408#endif 2409 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0 2410 ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 4k+3 - load plaintext 2411#ifdef __AARCH64EB__ 2412 rev $input_l3, $input_l3 2413 rev $input_h3, $input_h3 2414#endif 2415 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low 2416 eor $res0b, $res0b, $acc_lb @ PRE 1 2417 2418 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1 2419 2420 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1 2421 rev64 
$res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free) 2422 2423 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0 2424 eor $input_h3, $input_h3, $rk12_h @ AES block 4k+3 - round 12 high 2425 2426 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low 2427 mov $t0d, $res0.d[1] @ GHASH block 4k - mid 2428 2429 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2 2430 2431 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1 2432 eor $input_l2, $input_l2, $rk12_l @ AES block 4k+6 - round 12 low 2433 2434 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid 2435 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low 2436 2437 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3 2438 eor $input_l1, $input_l1, $rk12_l @ AES block 4k+5 - round 12 low 2439 2440 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2 2441 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid 2442 2443 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high 2444 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid 2445 2446 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2 2447 2448 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3 2449 2450 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid 2451 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high 2452 2453 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2 2454 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid 2455 2456 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high 2457 2458 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4 2459 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid 2460 2461 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3 2462 2463 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high 2464 eor $input_h1, $input_h1, $rk12_h @ AES block 4k+5 - round 12 high 2465 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid 2466 2467 aese $ctr0b, $rk5 \n 
aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5 2468 add $rctr32w, $rctr32w, #1 @ CTR block 4k+3 2469 2470 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4 2471 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high 2472 2473 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid 2474 eor $input_h2, $input_h2, $rk12_h @ AES block 4k+6 - round 12 high 2475 2476 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid 2477 eor $input_l3, $input_l3, $rk12_l @ AES block 4k+3 - round 12 low 2478 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid 2479 2480 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid 2481 rev $ctr32w, $rctr32w @ CTR block 4k+8 2482 2483 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low 2484 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8 2485 2486 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3 2487 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid 2488 2489 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4 2490 ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 4k+4 - load plaintext 2491#ifdef __AARCH64EB__ 2492 rev $input_l0, $input_l0 2493 rev $input_h0, $input_h0 2494#endif 2495 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6 2496 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low 2497 2498 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4 2499 add $input_ptr, $input_ptr, #64 @ AES input_ptr update 2500 2501 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5 2502 movi $mod_constant.8b, #0xc2 2503 2504 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low 2505 eor $input_h0, $input_h0, $rk12_h @ AES block 4k+4 - round 12 high 2506 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid 2507 2508 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5 2509 eor $input_l0, $input_l0, $rk12_l @ AES block 4k+4 - round 12 low 2510 2511 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 
4k+5 - round 6 2512 shl $mod_constantd, $mod_constantd, #56 @ mod_constant 2513 2514 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5 2515 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high 2516 2517 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7 2518 fmov $ctr_t1d, $input_l1 @ AES block 4k+5 - mov low 2519 2520 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7 2521 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid 2522 2523 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6 2524 fmov $ctr_t1.d[1], $input_h1 @ AES block 4k+5 - mov high 2525 2526 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8 2527 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low 2528 2529 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid 2530 cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL 2531 fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low 2532 2533 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6 2534 fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high 2535 2536 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8 2537 fmov $ctr_t3d, $input_l3 @ AES block 4k+3 - mov low 2538 2539 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid 2540 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up 2541 add $rctr32w, $rctr32w, #1 @ CTR block 4k+8 2542 2543 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7 2544 fmov $ctr_t3.d[1], $input_h3 @ AES block 4k+3 - mov high 2545 2546 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid 2547 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment 2548 fmov $ctr_t2d, $input_l2 @ AES block 4k+6 - mov low 2549 2550 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7 2551 2552 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9 2553 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up 2554 2555 aese 
$ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8 2556 2557 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8 2558 2559 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9 2560 2561 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10 2562 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid 2563 2564 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9 2565 2566 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9 2567 2568 aese $ctr0b, $rk11 @ AES block 4k+4 - round 11 2569 2570 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10 2571 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid 2572 2573 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10 2574 2575 eor $res0b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result 2576 fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8 2577 2578 aese $ctr1b, $rk11 @ AES block 4k+5 - round 11 2579 fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8 2580 rev $ctr32w, $rctr32w @ CTR block 4k+9 2581 2582 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low 2583 fmov $ctr_t2.d[1], $input_h2 @ AES block 4k+6 - mov high 2584 st1 { $res0b}, [$output_ptr], #16 @ AES block 4k+4 - store result 2585 2586 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10 2587 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9 2588 2589 eor $res1b, $ctr_t1b, $ctr1b @ AES block 4k+5 - result 2590 add $rctr32w, $rctr32w, #1 @ CTR block 4k+9 2591 fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9 2592 2593 aese $ctr2b, $rk11 @ AES block 4k+6 - round 11 2594 fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9 2595 rev $ctr32w, $rctr32w @ CTR block 4k+10 2596 2597 add $rctr32w, $rctr32w, #1 @ CTR block 4k+10 2598 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment 2599 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10 2600 2601 st1 { $res1b}, [$output_ptr], #16 @ AES block 4k+5 - 
store result 2602 eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low 2603 2604 aese $ctr3b, $rk11 @ AES block 4k+7 - round 11 2605 eor $res2b, $ctr_t2b, $ctr2b @ AES block 4k+6 - result 2606 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+10 2607 2608 st1 { $res2b}, [$output_ptr], #16 @ AES block 4k+6 - store result 2609 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+10 2610 rev $ctr32w, $rctr32w @ CTR block 4k+11 2611 2612 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low 2613 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+11 2614 2615 eor $res3b, $ctr_t3b, $ctr3b @ AES block 4k+3 - result 2616 st1 { $res3b}, [$output_ptr], #16 @ AES block 4k+3 - store result 2617 b.lt .L192_enc_main_loop 2618 2619 .L192_enc_prepretail: @ PREPRETAIL 2620 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0 2621 rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free) 2622 2623 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3 2624 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 2625 add $rctr32w, $rctr32w, #1 @ CTR block 4k+3 2626 2627 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0 2628 rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free) 2629 2630 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0 2631 2632 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3 2633 eor $res0b, $res0b, $acc_lb @ PRE 1 2634 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid 2635 2636 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1 2637 rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free) 2638 2639 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high 2640 2641 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low 2642 mov $t0d, $res0.d[1] @ GHASH block 4k - mid 2643 2644 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low 2645 rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free) 2646 2647 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high 2648 2649 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid 2650 
mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid 2651 2652 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low 2653 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid 2654 2655 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0 2656 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high 2657 2658 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high 2659 2660 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid 2661 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid 2662 2663 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1 2664 2665 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1 2666 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high 2667 2668 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1 2669 2670 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2 2671 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid 2672 2673 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high 2674 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid 2675 2676 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2 2677 2678 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid 2679 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid 2680 2681 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3 2682 2683 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid 2684 2685 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid 2686 2687 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid 2688 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high 2689 2690 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low 2691 2692 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3 2693 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid 2694 2695 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2 2696 2697 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2 2698 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 
4k+2 - low 2699 2700 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4 2701 2702 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3 2703 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid 2704 2705 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3 2706 2707 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low 2708 movi $mod_constant.8b, #0xc2 2709 2710 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4 2711 2712 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4 2713 2714 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4 2715 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid 2716 2717 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5 2718 2719 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5 2720 2721 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5 2722 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low 2723 2724 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5 2725 2726 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6 2727 eor $acc_mb, $acc_mb, $acc_hb @ karatsuba tidy up 2728 2729 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6 2730 2731 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6 2732 shl $mod_constantd, $mod_constantd, #56 @ mod_constant 2733 2734 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7 2735 2736 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7 2737 eor $acc_mb, $acc_mb, $acc_lb 2738 2739 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7 2740 2741 pmull $t1.1q, $acc_h.1d, $mod_constant.1d 2742 2743 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6 2744 ext $acc_hb, $acc_hb, $acc_hb, #8 2745 2746 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8 2747 2748 aese $ctr1b, $rk8 \n aesmc $ctr1b, 
$ctr1b @ AES block 4k+5 - round 8 2749 eor $acc_mb, $acc_mb, $t1.16b 2750 2751 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7 2752 2753 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8 2754 2755 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9 2756 2757 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8 2758 eor $acc_mb, $acc_mb, $acc_hb 2759 2760 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9 2761 2762 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9 2763 2764 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9 2765 2766 pmull $t1.1q, $acc_m.1d, $mod_constant.1d 2767 2768 ext $acc_mb, $acc_mb, $acc_mb, #8 2769 2770 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10 2771 2772 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10 2773 2774 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10 2775 2776 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10 2777 eor $acc_lb, $acc_lb, $t1.16b 2778 2779 aese $ctr0b, $rk11 @ AES block 4k+4 - round 11 2780 2781 aese $ctr3b, $rk11 @ AES block 4k+7 - round 11 2782 2783 aese $ctr2b, $rk11 @ AES block 4k+6 - round 11 2784 2785 aese $ctr1b, $rk11 @ AES block 4k+5 - round 11 2786 eor $acc_lb, $acc_lb, $acc_mb 2787 .L192_enc_tail: @ TAIL 2788 2789 sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process 2790 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES block 4k+4 - load plaintext 2791#ifdef __AARCH64EB__ 2792 rev $input_l0, $input_l0 2793 rev $input_h0, $input_h0 2794#endif 2795 eor $input_l0, $input_l0, $rk12_l @ AES block 4k+4 - round 12 low 2796 eor $input_h0, $input_h0, $rk12_h @ AES block 4k+4 - round 12 high 2797 2798 fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low 2799 2800 fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high 2801 cmp 
$main_end_input_ptr, #48 2802 2803 eor $res1b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result 2804 2805 ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag 2806 b.gt .L192_enc_blocks_more_than_3 2807 2808 sub $rctr32w, $rctr32w, #1 2809 movi $acc_m.8b, #0 2810 2811 mov $ctr3b, $ctr2b 2812 movi $acc_h.8b, #0 2813 cmp $main_end_input_ptr, #32 2814 2815 mov $ctr2b, $ctr1b 2816 movi $acc_l.8b, #0 2817 b.gt .L192_enc_blocks_more_than_2 2818 2819 sub $rctr32w, $rctr32w, #1 2820 2821 mov $ctr3b, $ctr1b 2822 cmp $main_end_input_ptr, #16 2823 b.gt .L192_enc_blocks_more_than_1 2824 2825 sub $rctr32w, $rctr32w, #1 2826 b .L192_enc_blocks_less_than_1 2827 .L192_enc_blocks_more_than_3: @ blocks left > 3 2828 st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result 2829 2830 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-2 block - load input low & high 2831#ifdef __AARCH64EB__ 2832 rev $input_l0, $input_l0 2833 rev $input_h0, $input_h0 2834#endif 2835 rev64 $res0b, $res1b @ GHASH final-3 block 2836 2837 eor $input_l0, $input_l0, $rk12_l @ AES final-2 block - round 12 low 2838 eor $res0b, $res0b, $t0.16b @ feed in partial tag 2839 2840 eor $input_h0, $input_h0, $rk12_h @ AES final-2 block - round 12 high 2841 fmov $res1d, $input_l0 @ AES final-2 block - mov low 2842 2843 fmov $res1.d[1], $input_h0 @ AES final-2 block - mov high 2844 2845 mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid 2846 2847 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low 2848 2849 mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid 2850 2851 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid 2852 2853 movi $t0.8b, #0 @ suppress further partial tag feed in 2854 2855 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high 2856 2857 pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid 2858 eor $res1b, $res1b, $ctr1b @ AES final-2 block - result 2859 .L192_enc_blocks_more_than_2: @ blocks left > 2 2860 2861 st1 { $res1b}, [$output_ptr], #16 
@ AES final-2 block - store result 2862 2863 rev64 $res0b, $res1b @ GHASH final-2 block 2864 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-1 block - load input low & high 2865#ifdef __AARCH64EB__ 2866 rev $input_l0, $input_l0 2867 rev $input_h0, $input_h0 2868#endif 2869 eor $res0b, $res0b, $t0.16b @ feed in partial tag 2870 2871 eor $input_h0, $input_h0, $rk12_h @ AES final-1 block - round 12 high 2872 2873 pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high 2874 mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid 2875 2876 pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low 2877 eor $input_l0, $input_l0, $rk12_l @ AES final-1 block - round 12 low 2878 2879 fmov $res1d, $input_l0 @ AES final-1 block - mov low 2880 2881 fmov $res1.d[1], $input_h0 @ AES final-1 block - mov high 2882 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high 2883 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid 2884 2885 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low 2886 2887 pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid 2888 2889 movi $t0.8b, #0 @ suppress further partial tag feed in 2890 2891 eor $res1b, $res1b, $ctr2b @ AES final-1 block - result 2892 2893 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid 2894 .L192_enc_blocks_more_than_1: @ blocks left > 1 2895 2896 st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result 2897 2898 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final block - load input low & high 2899#ifdef __AARCH64EB__ 2900 rev $input_l0, $input_l0 2901 rev $input_h0, $input_h0 2902#endif 2903 rev64 $res0b, $res1b @ GHASH final-1 block 2904 2905 eor $input_l0, $input_l0, $rk12_l @ AES final block - round 12 low 2906 eor $res0b, $res0b, $t0.16b @ feed in partial tag 2907 movi $t0.8b, #0 @ suppress further partial tag feed in 2908 2909 mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid 2910 2911 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid 2912 eor $input_h0, 
$input_h0, $rk12_h @ AES final block - round 12 high 2913 fmov $res1d, $input_l0 @ AES final block - mov low 2914 2915 pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high 2916 fmov $res1.d[1], $input_h0 @ AES final block - mov high 2917 2918 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid 2919 2920 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high 2921 2922 pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low 2923 2924 pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid 2925 2926 eor $res1b, $res1b, $ctr3b @ AES final block - result 2927 2928 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low 2929 2930 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid 2931 .L192_enc_blocks_less_than_1: @ blocks left <= 1 2932 2933 ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored 2934#ifndef __AARCH64EB__ 2935 rev $ctr32w, $rctr32w 2936#else 2937 mov $ctr32w, $rctr32w 2938#endif 2939 and $bit_length, $bit_length, #127 @ bit_length %= 128 2940 2941 sub $bit_length, $bit_length, #128 @ bit_length -= 128 2942 mvn $rk12_h, xzr @ rk12_h = 0xffffffffffffffff 2943 2944 neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128]) 2945 mvn $rk12_l, xzr @ rk12_l = 0xffffffffffffffff 2946 2947 and $bit_length, $bit_length, #127 @ bit_length %= 128 2948 2949 lsr $rk12_h, $rk12_h, $bit_length @ rk12_h is mask for top 64b of last block 2950 cmp $bit_length, #64 2951 2952 csel $input_l0, $rk12_l, $rk12_h, lt 2953 csel $input_h0, $rk12_h, xzr, lt 2954 2955 fmov $ctr0d, $input_l0 @ ctr0b is mask for last block 2956 2957 fmov $ctr0.d[1], $input_h0 2958 2959 and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits 2960 2961 rev64 $res0b, $res1b @ GHASH final block 2962 2963 eor $res0b, $res0b, $t0.16b @ feed in partial tag 2964 2965 mov $t0d, $res0.d[1] @ GHASH final block - mid 2966 2967 pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low 
2968 2969 pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high 2970 2971 eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid 2972 2973 eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low 2974 2975 eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high 2976 2977 pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid 2978 2979 eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid 2980 movi $mod_constant.8b, #0xc2 2981 2982 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up 2983 2984 shl $mod_constantd, $mod_constantd, #56 @ mod_constant 2985 2986 bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing 2987 2988 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up 2989 2990 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid 2991 2992 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment 2993 2994 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid 2995 2996 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid 2997 2998 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low 2999 3000 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment 3001 3002 eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low 3003 str $ctr32w, [$counter, #12] @ store the updated counter 3004 3005 st1 { $res1b}, [$output_ptr] @ store all 16B 3006 3007 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low 3008 ext $acc_lb, $acc_lb, $acc_lb, #8 3009 rev64 $acc_lb, $acc_lb 3010 mov x0, $len 3011 st1 { $acc_l.16b }, [$current_tag] 3012 3013 ldp x21, x22, [sp, #16] 3014 ldp x23, x24, [sp, #32] 3015 ldp d8, d9, [sp, #48] 3016 ldp d10, d11, [sp, #64] 3017 ldp d12, d13, [sp, #80] 3018 ldp d14, d15, [sp, #96] 3019 ldp x19, x20, [sp], #112 3020 ret 3021 3022.L192_enc_ret: 3023 mov w0, #0x0 3024 ret 3025.size aes_gcm_enc_192_kernel,.-aes_gcm_enc_192_kernel 3026___ 3027 3028######################################################################################### 3029# 
size_t aes_gcm_dec_192_kernel(const unsigned char *in, 3030# size_t len, 3031# unsigned char *out, 3032# const void *key, 3033# unsigned char ivec[16], 3034# u64 *Xi); 3035# 3036$code.=<<___; 3037.global aes_gcm_dec_192_kernel 3038.type aes_gcm_dec_192_kernel,%function 3039.align 4 3040aes_gcm_dec_192_kernel: 3041 AARCH64_VALID_CALL_TARGET 3042 cbz x1, .L192_dec_ret 3043 stp x19, x20, [sp, #-112]! 3044 mov x16, x4 3045 mov x8, x5 3046 stp x21, x22, [sp, #16] 3047 stp x23, x24, [sp, #32] 3048 stp d8, d9, [sp, #48] 3049 stp d10, d11, [sp, #64] 3050 stp d12, d13, [sp, #80] 3051 stp d14, d15, [sp, #96] 3052 3053 add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr 3054 ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32 3055#ifdef __AARCH64EB__ 3056 rev $ctr96_b64x, $ctr96_b64x 3057 rev $ctr96_t32x, $ctr96_t32x 3058#endif 3059 ldp $rk12_l, $rk12_h, [$cc, #192] @ load rk12 3060#ifdef __AARCH64EB__ 3061 ror $rk12_l, $rk12_l, #32 3062 ror $rk12_h, $rk12_h, #32 3063#endif 3064 ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible 3065 3066 ld1 {$rk0s}, [$cc], #16 @ load rk0 3067 3068 lsr $main_end_input_ptr, $bit_length, #3 @ byte_len 3069 mov $len, $main_end_input_ptr 3070 ld1 {$rk1s}, [$cc], #16 @ load rk1 3071 3072 lsr $rctr32x, $ctr96_t32x, #32 3073 orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w 3074 fmov $ctr3d, $ctr96_b64x @ CTR block 3 3075 3076 rev $rctr32w, $rctr32w @ rev_ctr32 3077 fmov $ctr1d, $ctr96_b64x @ CTR block 1 3078 3079 add $rctr32w, $rctr32w, #1 @ increment rev_ctr32 3080 ld1 {$rk2s}, [$cc], #16 @ load rk2 3081 3082 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0 3083 rev $ctr32w, $rctr32w @ CTR block 1 3084 3085 add $rctr32w, $rctr32w, #1 @ CTR block 1 3086 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1 3087 ld1 {$rk3s}, [$cc], #16 @ load rk3 3088 3089 fmov $ctr1.d[1], $ctr32x @ CTR block 1 3090 rev $ctr32w, $rctr32w @ CTR block 2 
3091 add $rctr32w, $rctr32w, #1 @ CTR block 2 3092 3093 fmov $ctr2d, $ctr96_b64x @ CTR block 2 3094 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2 3095 3096 fmov $ctr2.d[1], $ctr32x @ CTR block 2 3097 rev $ctr32w, $rctr32w @ CTR block 3 3098 3099 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1 3100 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3 3101 3102 fmov $ctr3.d[1], $ctr32x @ CTR block 3 3103 3104 ld1 {$rk4s}, [$cc], #16 @ load rk4 3105 3106 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2 3107 3108 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0 3109 ld1 {$rk5s}, [$cc], #16 @ load rk5 3110 3111 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0 3112 ldr $h4q, [$current_tag, #112] @ load h4l | h4h 3113#ifndef __AARCH64EB__ 3114 ext $h4b, $h4b, $h4b, #8 3115#endif 3116 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0 3117 ldr $h2q, [$current_tag, #64] @ load h2l | h2h 3118#ifndef __AARCH64EB__ 3119 ext $h2b, $h2b, $h2b, #8 3120#endif 3121 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1 3122 ldr $h3q, [$current_tag, #80] @ load h3l | h3h 3123#ifndef __AARCH64EB__ 3124 ext $h3b, $h3b, $h3b, #8 3125#endif 3126 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1 3127 3128 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1 3129 ldr $h1q, [$current_tag, #32] @ load h1l | h1h 3130#ifndef __AARCH64EB__ 3131 ext $h1b, $h1b, $h1b, #8 3132#endif 3133 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2 3134 ld1 {$rk6s}, [$cc], #16 @ load rk6 3135 3136 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3 3137 ld1 {$rk7s}, [$cc], #16 @ load rk7 3138 3139 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2 3140 ld1 {$rk8s}, [$cc], #16 @ load rk8 3141 3142 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2 3143 ld1 {$rk9s}, [$cc], #16 @ load rk9 3144 3145 aese $ctr2b, $rk3 
\n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3 3146 ld1 { $acc_lb}, [$current_tag] 3147 ext $acc_lb, $acc_lb, $acc_lb, #8 3148 rev64 $acc_lb, $acc_lb 3149 3150 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3 3151 add $rctr32w, $rctr32w, #1 @ CTR block 3 3152 3153 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3 3154 trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h 3155 3156 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4 3157 ld1 {$rk10s}, [$cc], #16 @ load rk10 3158 3159 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4 3160 trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l 3161 3162 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4 3163 3164 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4 3165 trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l 3166 3167 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5 3168 ld1 {$rk11s}, [$cc], #16 @ load rk11 3169 3170 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5 3171 3172 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5 3173 3174 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5 3175 3176 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6 3177 3178 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6 3179 3180 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6 3181 3182 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7 3183 3184 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7 3185 3186 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7 3187 3188 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6 3189 3190 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8 3191 3192 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8 3193 3194 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7 3195 3196 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ 
AES block 2 - round 9 3197 3198 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9 3199 3200 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8 3201 sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1 3202 3203 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8 3204 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) 3205 3206 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10 3207 add $main_end_input_ptr, $main_end_input_ptr, $input_ptr 3208 3209 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9 3210 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks 3211 3212 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9 3213 trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h 3214 3215 aese $ctr3b, $rk11 @ AES block 3 - round 11 3216 3217 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10 3218 3219 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10 3220 3221 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10 3222 eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k 3223 3224 aese $ctr2b, $rk11 @ AES block 2 - round 11 3225 3226 aese $ctr1b, $rk11 @ AES block 1 - round 11 3227 eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k 3228 3229 aese $ctr0b, $rk11 @ AES block 0 - round 11 3230 b.ge .L192_dec_tail @ handle tail 3231 3232 ld1 {$res0b, $res1b}, [$input_ptr], #32 @ AES block 0,1 - load ciphertext 3233 3234 eor $ctr1b, $res1b, $ctr1b @ AES block 1 - result 3235 3236 eor $ctr0b, $res0b, $ctr0b @ AES block 0 - result 3237 rev $ctr32w, $rctr32w @ CTR block 4 3238 ld1 {$res2b, $res3b}, [$input_ptr], #32 @ AES block 2,3 - load ciphertext 3239 3240 mov $output_l1, $ctr1.d[0] @ AES block 1 - mov low 3241 3242 mov $output_h1, $ctr1.d[1] @ AES block 1 - mov high 3243 3244 mov $output_l0, $ctr0.d[0] @ AES block 0 - mov low 3245 orr 
$ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4 3246 add $rctr32w, $rctr32w, #1 @ CTR block 4 3247 3248 mov $output_h0, $ctr0.d[1] @ AES block 0 - mov high 3249 rev64 $res0b, $res0b @ GHASH block 0 3250 3251 fmov $ctr0d, $ctr96_b64x @ CTR block 4 3252 rev64 $res1b, $res1b @ GHASH block 1 3253 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks 3254 3255 eor $output_l1, $output_l1, $rk12_l @ AES block 1 - round 12 low 3256#ifdef __AARCH64EB__ 3257 rev $output_l1, $output_l1 3258#endif 3259 fmov $ctr0.d[1], $ctr32x @ CTR block 4 3260 rev $ctr32w, $rctr32w @ CTR block 5 3261 3262 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5 3263 fmov $ctr1d, $ctr96_b64x @ CTR block 5 3264 eor $output_h1, $output_h1, $rk12_h @ AES block 1 - round 12 high 3265#ifdef __AARCH64EB__ 3266 rev $output_h1, $output_h1 3267#endif 3268 add $rctr32w, $rctr32w, #1 @ CTR block 5 3269 fmov $ctr1.d[1], $ctr32x @ CTR block 5 3270 eor $output_l0, $output_l0, $rk12_l @ AES block 0 - round 12 low 3271#ifdef __AARCH64EB__ 3272 rev $output_l0, $output_l0 3273#endif 3274 rev $ctr32w, $rctr32w @ CTR block 6 3275 eor $output_h0, $output_h0, $rk12_h @ AES block 0 - round 12 high 3276#ifdef __AARCH64EB__ 3277 rev $output_h0, $output_h0 3278#endif 3279 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 0 - store result 3280 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6 3281 3282 stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 1 - store result 3283 3284 add $rctr32w, $rctr32w, #1 @ CTR block 6 3285 eor $ctr2b, $res2b, $ctr2b @ AES block 2 - result 3286 b.ge .L192_dec_prepretail @ do prepretail 3287 3288 .L192_dec_main_loop: @ main loop start 3289 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0 3290 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 3291 3292 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low 3293 mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low 3294 3295 mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high 3296 
eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result 3297 rev64 $res3b, $res3b @ GHASH block 4k+3 3298 3299 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1 3300 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6 3301 3302 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0 3303 eor $res0b, $res0b, $acc_lb @ PRE 1 3304 3305 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high 3306 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6 3307 3308 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2 3309 mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high 3310 3311 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1 3312 mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low 3313 3314 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high 3315 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7 3316 mov $t0d, $res0.d[1] @ GHASH block 4k - mid 3317 3318 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low 3319 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid 3320 rev $ctr32w, $rctr32w @ CTR block 4k+7 3321 3322 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0 3323 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7 3324 3325 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7 3326 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid 3327 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid 3328 3329 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3 3330 3331 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2 3332 eor $output_h2, $output_h2, $rk12_h @ AES block 4k+2 - round 12 high 3333#ifdef __AARCH64EB__ 3334 rev $output_h2, $output_h2 3335#endif 3336 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1 3337 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid 3338 3339 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid 3340 3341 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0 3342 rev64 $res2b, $res2b @ GHASH block 4k+2 3343 
3344 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2 3345 3346 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid 3347 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low 3348 eor $output_l2, $output_l2, $rk12_l @ AES block 4k+2 - round 12 low 3349#ifdef __AARCH64EB__ 3350 rev $output_l2, $output_l2 3351#endif 3352 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4 3353 3354 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3 3355 3356 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid 3357 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid 3358 3359 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1 3360 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high 3361 3362 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4 3363 3364 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high 3365 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid 3366 3367 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low 3368 3369 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5 3370 3371 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high 3372 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid 3373 3374 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5 3375 3376 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high 3377 3378 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2 3379 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid 3380 3381 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6 3382 3383 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6 3384 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid 3385 3386 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3 3387 3388 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid 3389 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low 3390 3391 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7 
3392 3393 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid 3394 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high 3395 3396 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7 3397 3398 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8 3399 movi $mod_constant.8b, #0xc2 3400 3401 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low 3402 3403 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8 3404 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid 3405 3406 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3 3407 3408 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9 3409 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low 3410 3411 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4 3412 3413 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4 3414 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid 3415 3416 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10 3417 3418 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9 3419 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up 3420 3421 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5 3422 3423 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5 3424 shl $mod_constantd, $mod_constantd, #56 @ mod_constant 3425 3426 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10 3427 3428 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6 3429 ld1 {$res0b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext 3430 3431 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6 3432 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up 3433 3434 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid 3435 ld1 {$res1b}, [$input_ptr], #16 @ AES block 4k+5 - load ciphertext 3436 eor $output_l3, $output_l3, $rk12_l @ AES block 4k+3 - 
round 12 low 3437#ifdef __AARCH64EB__ 3438 rev $output_l3, $output_l3 3439#endif 3440 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7 3441 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment 3442 3443 aese $ctr0b, $rk11 @ AES block 4k+4 - round 11 3444 add $rctr32w, $rctr32w, #1 @ CTR block 4k+7 3445 3446 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7 3447 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid 3448 3449 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8 3450 ld1 {$res2b}, [$input_ptr], #16 @ AES block 4k+6 - load ciphertext 3451 3452 aese $ctr1b, $rk11 @ AES block 4k+5 - round 11 3453 ld1 {$res3b}, [$input_ptr], #16 @ AES block 4k+7 - load ciphertext 3454 rev $ctr32w, $rctr32w @ CTR block 4k+8 3455 3456 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8 3457 stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result 3458 3459 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9 3460 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid 3461 3462 cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL 3463 3464 eor $ctr0b, $res0b, $ctr0b @ AES block 4k+4 - result 3465 eor $output_h3, $output_h3, $rk12_h @ AES block 4k+3 - round 12 high 3466#ifdef __AARCH64EB__ 3467 rev $output_h3, $output_h3 3468#endif 3469 eor $ctr1b, $res1b, $ctr1b @ AES block 4k+5 - result 3470 3471 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10 3472 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8 3473 3474 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9 3475 3476 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low 3477 mov $output_l1, $ctr1.d[0] @ AES block 4k+5 - mov low 3478 3479 mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low 3480 stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result 3481 rev64 $res1b, $res1b @ GHASH block 4k+5 
3482 3483 aese $ctr2b, $rk11 @ AES block 4k+6 - round 11 3484 mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high 3485 3486 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10 3487 mov $output_h1, $ctr1.d[1] @ AES block 4k+5 - mov high 3488 3489 fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8 3490 add $rctr32w, $rctr32w, #1 @ CTR block 4k+8 3491 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment 3492 3493 eor $ctr2b, $res2b, $ctr2b @ AES block 4k+6 - result 3494 fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8 3495 rev $ctr32w, $rctr32w @ CTR block 4k+9 3496 3497 eor $output_l0, $output_l0, $rk12_l @ AES block 4k+4 - round 12 low 3498#ifdef __AARCH64EB__ 3499 rev $output_l0, $output_l0 3500#endif 3501 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9 3502 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low 3503 3504 fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9 3505 add $rctr32w, $rctr32w, #1 @ CTR block 4k+9 3506 eor $output_l1, $output_l1, $rk12_l @ AES block 4k+5 - round 12 low 3507#ifdef __AARCH64EB__ 3508 rev $output_l1, $output_l1 3509#endif 3510 fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9 3511 rev $ctr32w, $rctr32w @ CTR block 4k+10 3512 eor $output_h1, $output_h1, $rk12_h @ AES block 4k+5 - round 12 high 3513#ifdef __AARCH64EB__ 3514 rev $output_h1, $output_h1 3515#endif 3516 eor $output_h0, $output_h0, $rk12_h @ AES block 4k+4 - round 12 high 3517#ifdef __AARCH64EB__ 3518 rev $output_h0, $output_h0 3519#endif 3520 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 4k+4 - store result 3521 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low 3522 3523 add $rctr32w, $rctr32w, #1 @ CTR block 4k+10 3524 rev64 $res0b, $res0b @ GHASH block 4k+4 3525 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10 3526 3527 aese $ctr3b, $rk11 @ AES block 4k+7 - round 11 3528 stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 4k+5 - store result 3529 b.lt .L192_dec_main_loop 3530 3531 .L192_dec_prepretail: @ 
PREPRETAIL 3532 mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high 3533 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 3534 eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result 3535 3536 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0 3537 mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low 3538 3539 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0 3540 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid 3541 3542 eor $res0b, $res0b, $acc_lb @ PRE 1 3543 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6 3544 3545 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1 3546 mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low 3547 3548 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1 3549 mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high 3550 3551 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low 3552 mov $t0d, $res0.d[1] @ GHASH block 4k - mid 3553 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7 3554 3555 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2 3556 rev64 $res2b, $res2b @ GHASH block 4k+2 3557 3558 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high 3559 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6 3560 rev $ctr32w, $rctr32w @ CTR block 4k+7 3561 3562 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7 3563 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid 3564 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid 3565 3566 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low 3567 eor $output_h3, $output_h3, $rk12_h @ AES block 4k+3 - round 12 high 3568#ifdef __AARCH64EB__ 3569 rev $output_h3, $output_h3 3570#endif 3571 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7 3572 3573 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2 3574 eor $output_l2, $output_l2, $rk12_l @ AES block 4k+2 - round 12 low 3575#ifdef __AARCH64EB__ 3576 rev $output_l2, $output_l2 3577#endif 3578 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high 3579 eor $output_h2, 
$output_h2, $rk12_h @ AES block 4k+2 - round 12 high 3580#ifdef __AARCH64EB__ 3581 rev $output_h2, $output_h2 3582#endif 3583 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid 3584 3585 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid 3586 eor $output_l3, $output_l3, $rk12_l @ AES block 4k+3 - round 12 low 3587#ifdef __AARCH64EB__ 3588 rev $output_l3, $output_l3 3589#endif 3590 stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result 3591 3592 rev64 $res3b, $res3b @ GHASH block 4k+3 3593 stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result 3594 3595 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0 3596 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high 3597 3598 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid 3599 add $rctr32w, $rctr32w, #1 @ CTR block 4k+7 3600 3601 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high 3602 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low 3603 3604 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0 3605 3606 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid 3607 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid 3608 3609 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1 3610 3611 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1 3612 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high 3613 3614 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid 3615 3616 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low 3617 3618 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2 3619 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid 3620 3621 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2 3622 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid 3623 3624 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low 3625 3626 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3 3627 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid 
3628 3629 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3 3630 3631 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid 3632 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low 3633 3634 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4 3635 3636 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high 3637 movi $mod_constant.8b, #0xc2 3638 3639 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid 3640 3641 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3 3642 3643 shl $mod_constantd, $mod_constantd, #56 @ mod_constant 3644 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high 3645 3646 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5 3647 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid 3648 3649 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4 3650 3651 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid 3652 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low 3653 3654 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6 3655 3656 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3 3657 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid 3658 3659 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5 3660 3661 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7 3662 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up 3663 3664 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4 3665 3666 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6 3667 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment 3668 3669 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8 3670 3671 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5 3672 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up 3673 3674 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4 3675 
3676 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7 3677 3678 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9 3679 3680 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5 3681 3682 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6 3683 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid 3684 3685 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10 3686 3687 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6 3688 3689 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7 3690 3691 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8 3692 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid 3693 3694 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7 3695 3696 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8 3697 3698 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9 3699 3700 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8 3701 3702 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9 3703 3704 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low 3705 3706 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9 3707 3708 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10 3709 3710 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10 3711 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment 3712 3713 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10 3714 3715 aese $ctr0b, $rk11 3716 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low 3717 3718 aese $ctr2b, $rk11 3719 3720 aese $ctr1b, $rk11 3721 3722 aese $ctr3b, $rk11 3723 3724 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low 3725 .L192_dec_tail: @ TAIL 3726 3727 sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is 
number of bytes left to process 3728 ld1 { $res1b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext 3729 3730 eor $ctr0b, $res1b, $ctr0b @ AES block 4k+4 - result 3731 3732 mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high 3733 3734 mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low 3735 3736 ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag 3737 3738 cmp $main_end_input_ptr, #48 3739 3740 eor $output_h0, $output_h0, $rk12_h @ AES block 4k+4 - round 12 high 3741#ifdef __AARCH64EB__ 3742 rev $output_h0, $output_h0 3743#endif 3744 eor $output_l0, $output_l0, $rk12_l @ AES block 4k+4 - round 12 low 3745#ifdef __AARCH64EB__ 3746 rev $output_l0, $output_l0 3747#endif 3748 b.gt .L192_dec_blocks_more_than_3 3749 3750 movi $acc_l.8b, #0 3751 movi $acc_h.8b, #0 3752 3753 mov $ctr3b, $ctr2b 3754 mov $ctr2b, $ctr1b 3755 sub $rctr32w, $rctr32w, #1 3756 3757 movi $acc_m.8b, #0 3758 cmp $main_end_input_ptr, #32 3759 b.gt .L192_dec_blocks_more_than_2 3760 3761 mov $ctr3b, $ctr1b 3762 cmp $main_end_input_ptr, #16 3763 sub $rctr32w, $rctr32w, #1 3764 3765 b.gt .L192_dec_blocks_more_than_1 3766 3767 sub $rctr32w, $rctr32w, #1 3768 b .L192_dec_blocks_less_than_1 3769 .L192_dec_blocks_more_than_3: @ blocks left > 3 3770 rev64 $res0b, $res1b @ GHASH final-3 block 3771 ld1 { $res1b}, [$input_ptr], #16 @ AES final-2 block - load ciphertext 3772 3773 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-3 block - store result 3774 3775 eor $res0b, $res0b, $t0.16b @ feed in partial tag 3776 3777 eor $ctr0b, $res1b, $ctr1b @ AES final-2 block - result 3778 3779 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low 3780 mov $output_l0, $ctr0.d[0] @ AES final-2 block - mov low 3781 mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid 3782 3783 mov $output_h0, $ctr0.d[1] @ AES final-2 block - mov high 3784 3785 mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid 3786 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid 3787 3788 pmull2 $acc_h.1q, 
$res0.2d, $h4.2d @ GHASH final-3 block - high 3789 3790 eor $output_l0, $output_l0, $rk12_l @ AES final-2 block - round 12 low 3791#ifdef __AARCH64EB__ 3792 rev $output_l0, $output_l0 3793#endif 3794 movi $t0.8b, #0 @ suppress further partial tag feed in 3795 3796 pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid 3797 eor $output_h0, $output_h0, $rk12_h @ AES final-2 block - round 12 high 3798#ifdef __AARCH64EB__ 3799 rev $output_h0, $output_h0 3800#endif 3801 .L192_dec_blocks_more_than_2: @ blocks left > 2 3802 3803 rev64 $res0b, $res1b @ GHASH final-2 block 3804 ld1 { $res1b}, [$input_ptr], #16 @ AES final-1 block - load ciphertext 3805 3806 eor $res0b, $res0b, $t0.16b @ feed in partial tag 3807 3808 movi $t0.8b, #0 @ suppress further partial tag feed in 3809 3810 eor $ctr0b, $res1b, $ctr2b @ AES final-1 block - result 3811 3812 mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid 3813 3814 pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low 3815 3816 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-2 block - store result 3817 3818 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid 3819 mov $output_h0, $ctr0.d[1] @ AES final-1 block - mov high 3820 3821 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low 3822 mov $output_l0, $ctr0.d[0] @ AES final-1 block - mov low 3823 3824 pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high 3825 3826 pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid 3827 3828 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high 3829 eor $output_h0, $output_h0, $rk12_h @ AES final-1 block - round 12 high 3830#ifdef __AARCH64EB__ 3831 rev $output_h0, $output_h0 3832#endif 3833 eor $output_l0, $output_l0, $rk12_l @ AES final-1 block - round 12 low 3834#ifdef __AARCH64EB__ 3835 rev $output_l0, $output_l0 3836#endif 3837 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid 3838 .L192_dec_blocks_more_than_1: @ blocks left > 1 3839 3840 rev64 $res0b, $res1b @ GHASH final-1 block 
3841 3842 eor $res0b, $res0b, $t0.16b @ feed in partial tag 3843 ld1 { $res1b}, [$input_ptr], #16 @ AES final block - load ciphertext 3844 3845 mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid 3846 3847 pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high 3848 3849 eor $ctr0b, $res1b, $ctr3b @ AES final block - result 3850 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-1 block - store result 3851 3852 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid 3853 3854 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high 3855 3856 pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low 3857 mov $output_h0, $ctr0.d[1] @ AES final block - mov high 3858 3859 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid 3860 mov $output_l0, $ctr0.d[0] @ AES final block - mov low 3861 3862 pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid 3863 3864 movi $t0.8b, #0 @ suppress further partial tag feed in 3865 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low 3866 eor $output_h0, $output_h0, $rk12_h @ AES final block - round 12 high 3867#ifdef __AARCH64EB__ 3868 rev $output_h0, $output_h0 3869#endif 3870 eor $output_l0, $output_l0, $rk12_l @ AES final block - round 12 low 3871#ifdef __AARCH64EB__ 3872 rev $output_l0, $output_l0 3873#endif 3874 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid 3875 .L192_dec_blocks_less_than_1: @ blocks left <= 1 3876 3877 mvn $rk12_l, xzr @ rk12_l = 0xffffffffffffffff 3878 ldp $end_input_ptr, $main_end_input_ptr, [$output_ptr] @ load existing bytes we need to not overwrite 3879 and $bit_length, $bit_length, #127 @ bit_length %= 128 3880 3881 sub $bit_length, $bit_length, #128 @ bit_length -= 128 3882 3883 neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128]) 3884 3885 and $bit_length, $bit_length, #127 @ bit_length %= 128 3886 mvn $rk12_h, xzr @ rk12_h = 0xffffffffffffffff 3887 3888 lsr $rk12_h, $rk12_h, $bit_length @ rk12_h is mask for top 64b of last block 
3889 cmp $bit_length, #64 3890 3891 csel $ctr32x, $rk12_l, $rk12_h, lt 3892 csel $ctr96_b64x, $rk12_h, xzr, lt 3893 3894 fmov $ctr0d, $ctr32x @ ctr0b is mask for last block 3895 and $output_l0, $output_l0, $ctr32x 3896 bic $end_input_ptr, $end_input_ptr, $ctr32x @ mask out low existing bytes 3897 3898 orr $output_l0, $output_l0, $end_input_ptr 3899 mov $ctr0.d[1], $ctr96_b64x 3900#ifndef __AARCH64EB__ 3901 rev $ctr32w, $rctr32w 3902#else 3903 mov $ctr32w, $rctr32w 3904#endif 3905 3906 and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits 3907 str $ctr32w, [$counter, #12] @ store the updated counter 3908 3909 rev64 $res0b, $res1b @ GHASH final block 3910 3911 eor $res0b, $res0b, $t0.16b @ feed in partial tag 3912 bic $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x @ mask out high existing bytes 3913 3914 and $output_h0, $output_h0, $ctr96_b64x 3915 3916 pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high 3917 mov $t0d, $res0.d[1] @ GHASH final block - mid 3918 3919 pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low 3920 3921 eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid 3922 3923 eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high 3924 3925 pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid 3926 3927 eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low 3928 3929 eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid 3930 movi $mod_constant.8b, #0xc2 3931 3932 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up 3933 3934 shl $mod_constantd, $mod_constantd, #56 @ mod_constant 3935 3936 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up 3937 3938 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid 3939 orr $output_h0, $output_h0, $main_end_input_ptr 3940 stp $output_l0, $output_h0, [$output_ptr] 3941 3942 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment 3943 3944 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid 3945 3946 eor $acc_mb, 
$acc_mb, $acc_hb @ MODULO - fold into mid 3947 3948 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low 3949 3950 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low 3951 3952 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment 3953 3954 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low 3955 ext $acc_lb, $acc_lb, $acc_lb, #8 3956 rev64 $acc_lb, $acc_lb 3957 mov x0, $len 3958 st1 { $acc_l.16b }, [$current_tag] 3959 3960 ldp x21, x22, [sp, #16] 3961 ldp x23, x24, [sp, #32] 3962 ldp d8, d9, [sp, #48] 3963 ldp d10, d11, [sp, #64] 3964 ldp d12, d13, [sp, #80] 3965 ldp d14, d15, [sp, #96] 3966 ldp x19, x20, [sp], #112 3967 ret 3968 3969.L192_dec_ret: 3970 mov w0, #0x0 3971 ret 3972.size aes_gcm_dec_192_kernel,.-aes_gcm_dec_192_kernel 3973___ 3974} 3975 3976{ 3977my ($end_input_ptr,$main_end_input_ptr,$input_l0,$input_h0)=map("x$_",(4..7)); 3978my ($input_l1,$input_h1,$input_l2,$input_h2,$input_l3,$input_h3)=map("x$_",(19..24)); 3979my ($output_l1,$output_h1,$output_l2,$output_h2,$output_l3,$output_h3)=map("x$_",(19..24)); 3980my ($output_l0,$output_h0)=map("x$_",(6..7)); 3981 3982my $ctr32w="w9"; 3983my ($ctr32x,$ctr96_b64x,$ctr96_t32x,$rctr32x,$rk14_l,$rk14_h,$len)=map("x$_",(9..15)); 3984my ($ctr96_t32w,$rctr32w)=map("w$_",(11..12)); 3985 3986my ($ctr0b,$ctr1b,$ctr2b,$ctr3b,$res0b,$res1b,$res2b,$res3b)=map("v$_.16b",(0..7)); 3987my ($ctr0,$ctr1,$ctr2,$ctr3,$res0,$res1,$res2,$res3)=map("v$_",(0..7)); 3988my ($ctr0d,$ctr1d,$ctr2d,$ctr3d,$res0d,$res1d,$res2d,$res3d)=map("d$_",(0..7)); 3989my ($res0q,$res1q,$res2q,$res3q)=map("q$_",(4..7)); 3990 3991my ($acc_hb,$acc_mb,$acc_lb)=map("v$_.16b",(9..11)); 3992my ($acc_h,$acc_m,$acc_l)=map("v$_",(9..11)); 3993my ($acc_hd,$acc_md,$acc_ld)=map("d$_",(9..11)); 3994 3995my ($h1,$h2,$h3,$h4,$h12k,$h34k)=map("v$_",(12..17)); 3996my ($h1q,$h2q,$h3q,$h4q)=map("q$_",(12..15)); 3997my ($h1b,$h2b,$h3b,$h4b)=map("v$_.16b",(12..15)); 3998 3999my $t0="v8"; 4000my $t0d="d8"; 
4001my $t1="v4"; 4002my $t1d="d4"; 4003my $t2="v8"; 4004my $t2d="d8"; 4005my $t3="v4"; 4006my $t3d="d4"; 4007my $t4="v4"; 4008my $t4d="d4"; 4009my $t5="v5"; 4010my $t5d="d5"; 4011my $t6="v8"; 4012my $t6d="d8"; 4013my $t7="v5"; 4014my $t7d="d5"; 4015my $t8="v6"; 4016my $t8d="d6"; 4017my $t9="v4"; 4018my $t9d="d4"; 4019 4020my ($ctr_t0,$ctr_t1,$ctr_t2,$ctr_t3)=map("v$_",(4..7)); 4021my ($ctr_t0d,$ctr_t1d,$ctr_t2d,$ctr_t3d)=map("d$_",(4..7)); 4022my ($ctr_t0b,$ctr_t1b,$ctr_t2b,$ctr_t3b)=map("v$_.16b",(4..7)); 4023 4024my $mod_constantd="d8"; 4025my $mod_constant="v8"; 4026my $mod_t="v7"; 4027 4028my ($rk0,$rk1,$rk2,$rk3,$rk4,$rk5,$rk6,$rk7,$rk8,$rk9,$rk10,$rk11,$rk12,$rk13)=map("v$_.16b",(18..31)); 4029my ($rk0s,$rk1s,$rk2s,$rk3s,$rk4s,$rk5s,$rk6s,$rk7s,$rk8s,$rk9s,$rk10s,$rk11s,$rk12s,$rk13s)=map("v$_.4s",(18..31)); 4030my ($rk0q,$rk1q,$rk2q,$rk3q,$rk4q,$rk5q,$rk6q,$rk7q,$rk8q,$rk9q,$rk10q,$rk11q,$rk12q,$rk13q)=map("q$_",(18..31)); 4031my $rk2q1="v20.1q"; 4032my $rk3q1="v21.1q"; 4033my $rk4v="v22"; 4034my $rk4d="d22"; 4035 4036######################################################################################### 4037# size_t aes_gcm_enc_256_kernel(const unsigned char *in, 4038# size_t len, 4039# unsigned char *out, 4040# const void *key, 4041# unsigned char ivec[16], 4042# u64 *Xi); 4043# 4044$code.=<<___; 4045.global aes_gcm_enc_256_kernel 4046.type aes_gcm_enc_256_kernel,%function 4047.align 4 4048aes_gcm_enc_256_kernel: 4049 AARCH64_VALID_CALL_TARGET 4050 cbz x1, .L256_enc_ret 4051 stp x19, x20, [sp, #-112]! 
4052 mov x16, x4 4053 mov x8, x5 4054 stp x21, x22, [sp, #16] 4055 stp x23, x24, [sp, #32] 4056 stp d8, d9, [sp, #48] 4057 stp d10, d11, [sp, #64] 4058 stp d12, d13, [sp, #80] 4059 stp d14, d15, [sp, #96] 4060 4061 add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr 4062 lsr $main_end_input_ptr, $bit_length, #3 @ byte_len 4063 mov $len, $main_end_input_ptr 4064 ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32 4065#ifdef __AARCH64EB__ 4066 rev $ctr96_b64x, $ctr96_b64x 4067 rev $ctr96_t32x, $ctr96_t32x 4068#endif 4069 ldp $rk14_l, $rk14_h, [$cc, #224] @ load rk14 4070#ifdef __AARCH64EB__ 4071 ror $rk14_l, $rk14_l, #32 4072 ror $rk14_h, $rk14_h, #32 4073#endif 4074 ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible 4075 sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1 4076 4077 ld1 {$rk0s}, [$cc], #16 @ load rk0 4078 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) 4079 4080 ld1 {$rk1s}, [$cc], #16 @ load rk1 4081 add $main_end_input_ptr, $main_end_input_ptr, $input_ptr 4082 4083 lsr $rctr32x, $ctr96_t32x, #32 4084 fmov $ctr2d, $ctr96_b64x @ CTR block 2 4085 orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w 4086 4087 rev $rctr32w, $rctr32w @ rev_ctr32 4088 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks 4089 fmov $ctr1d, $ctr96_b64x @ CTR block 1 4090 4091 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0 4092 add $rctr32w, $rctr32w, #1 @ increment rev_ctr32 4093 4094 rev $ctr32w, $rctr32w @ CTR block 1 4095 fmov $ctr3d, $ctr96_b64x @ CTR block 3 4096 4097 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1 4098 add $rctr32w, $rctr32w, #1 @ CTR block 1 4099 ld1 {$rk2s}, [$cc], #16 @ load rk2 4100 4101 fmov $ctr1.d[1], $ctr32x @ CTR block 1 4102 rev $ctr32w, $rctr32w @ CTR block 2 4103 add $rctr32w, $rctr32w, #1 @ 
CTR block 2 4104 4105 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2 4106 ld1 {$rk3s}, [$cc], #16 @ load rk3 4107 4108 fmov $ctr2.d[1], $ctr32x @ CTR block 2 4109 rev $ctr32w, $rctr32w @ CTR block 3 4110 4111 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1 4112 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3 4113 4114 fmov $ctr3.d[1], $ctr32x @ CTR block 3 4115 4116 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0 4117 ld1 {$rk4s}, [$cc], #16 @ load rk4 4118 4119 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2 4120 ld1 {$rk5s}, [$cc], #16 @ load rk5 4121 4122 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0 4123 ld1 {$rk6s}, [$cc], #16 @ load rk6 4124 4125 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1 4126 ldr $h3q, [$current_tag, #80] @ load h3l | h3h 4127#ifndef __AARCH64EB__ 4128 ext $h3b, $h3b, $h3b, #8 4129#endif 4130 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0 4131 ld1 {$rk7s}, [$cc], #16 @ load rk7 4132 4133 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1 4134 ld1 {$rk8s}, [$cc], #16 @ load rk8 4135 4136 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2 4137 ldr $h2q, [$current_tag, #64] @ load h2l | h2h 4138#ifndef __AARCH64EB__ 4139 ext $h2b, $h2b, $h2b, #8 4140#endif 4141 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1 4142 ld1 {$rk9s}, [$cc], #16 @ load rk9 4143 4144 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2 4145 ldr $h4q, [$current_tag, #112] @ load h4l | h4h 4146#ifndef __AARCH64EB__ 4147 ext $h4b, $h4b, $h4b, #8 4148#endif 4149 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3 4150 ld1 {$rk10s}, [$cc], #16 @ load rk10 4151 4152 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2 4153 ld1 {$rk11s}, [$cc], #16 @ load rk11 4154 4155 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3 4156 add $rctr32w, 
$rctr32w, #1 @ CTR block 3 4157 4158 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3 4159 4160 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3 4161 ld1 { $acc_lb}, [$current_tag] 4162 ext $acc_lb, $acc_lb, $acc_lb, #8 4163 rev64 $acc_lb, $acc_lb 4164 4165 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4 4166 4167 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4 4168 4169 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4 4170 4171 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4 4172 4173 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5 4174 4175 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5 4176 4177 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5 4178 4179 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5 4180 4181 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6 4182 trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l 4183 4184 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6 4185 ld1 {$rk12s}, [$cc], #16 @ load rk12 4186 4187 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6 4188 ldr $h1q, [$current_tag, #32] @ load h1l | h1h 4189#ifndef __AARCH64EB__ 4190 ext $h1b, $h1b, $h1b, #8 4191#endif 4192 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6 4193 ld1 {$rk13s}, [$cc], #16 @ load rk13 4194 4195 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 7 4196 trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h 4197 4198 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7 4199 4200 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7 4201 4202 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7 4203 trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l 4204 4205 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8 4206 4207 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8 4208 4209 
aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8 4210 4211 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9 4212 4213 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9 4214 4215 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8 4216 4217 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10 4218 4219 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9 4220 4221 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9 4222 4223 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10 4224 4225 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10 4226 4227 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 11 4228 4229 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 11 4230 4231 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10 4232 4233 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 12 4234 4235 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 12 4236 4237 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 11 4238 eor $h34k.16b, $h34k.16b, $acc_h.16b @ h4k | h3k 4239 4240 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 11 4241 4242 aese $ctr2b, $rk13 @ AES block 2 - round 13 4243 trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h 4244 4245 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 12 4246 4247 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 12 4248 4249 aese $ctr1b, $rk13 @ AES block 1 - round 13 4250 4251 aese $ctr0b, $rk13 @ AES block 0 - round 13 4252 4253 aese $ctr3b, $rk13 @ AES block 3 - round 13 4254 eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k 4255 b.ge .L256_enc_tail @ handle tail 4256 4257 ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 1 - load plaintext 4258#ifdef __AARCH64EB__ 4259 rev $input_l1, $input_l1 4260 rev $input_h1, $input_h1 4261#endif 4262 rev $ctr32w, $rctr32w @ 
CTR block 4 4263 ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 0 - load plaintext 4264#ifdef __AARCH64EB__ 4265 rev $input_l0, $input_l0 4266 rev $input_h0, $input_h0 4267#endif 4268 ldp $input_l3, $input_h3, [$input_ptr, #48] @ AES block 3 - load plaintext 4269#ifdef __AARCH64EB__ 4270 rev $input_l3, $input_l3 4271 rev $input_h3, $input_h3 4272#endif 4273 ldp $input_l2, $input_h2, [$input_ptr, #32] @ AES block 2 - load plaintext 4274#ifdef __AARCH64EB__ 4275 rev $input_l2, $input_l2 4276 rev $input_h2, $input_h2 4277#endif 4278 add $input_ptr, $input_ptr, #64 @ AES input_ptr update 4279 4280 eor $input_l1, $input_l1, $rk14_l @ AES block 1 - round 14 low 4281 eor $input_h1, $input_h1, $rk14_h @ AES block 1 - round 14 high 4282 4283 fmov $ctr_t1d, $input_l1 @ AES block 1 - mov low 4284 eor $input_l0, $input_l0, $rk14_l @ AES block 0 - round 14 low 4285 4286 eor $input_h0, $input_h0, $rk14_h @ AES block 0 - round 14 high 4287 eor $input_h3, $input_h3, $rk14_h @ AES block 3 - round 14 high 4288 fmov $ctr_t0d, $input_l0 @ AES block 0 - mov low 4289 4290 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks 4291 fmov $ctr_t0.d[1], $input_h0 @ AES block 0 - mov high 4292 eor $input_l3, $input_l3, $rk14_l @ AES block 3 - round 14 low 4293 4294 eor $input_l2, $input_l2, $rk14_l @ AES block 2 - round 14 low 4295 fmov $ctr_t1.d[1], $input_h1 @ AES block 1 - mov high 4296 4297 fmov $ctr_t2d, $input_l2 @ AES block 2 - mov low 4298 add $rctr32w, $rctr32w, #1 @ CTR block 4 4299 4300 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4 4301 fmov $ctr_t3d, $input_l3 @ AES block 3 - mov low 4302 eor $input_h2, $input_h2, $rk14_h @ AES block 2 - round 14 high 4303 4304 fmov $ctr_t2.d[1], $input_h2 @ AES block 2 - mov high 4305 4306 eor $res0b, $ctr_t0b, $ctr0b @ AES block 0 - result 4307 fmov $ctr0d, $ctr96_b64x @ CTR block 4 4308 4309 fmov $ctr0.d[1], $ctr32x @ CTR block 4 4310 rev $ctr32w, $rctr32w @ CTR block 5 4311 add $rctr32w, $rctr32w, #1 @ CTR block 
5

	@ Blocks 1..3 of the first batch: XOR the plaintext that was already
	@ combined with round key 14 (held in the ctr_t aliases of v4-v7, which
	@ are the same registers as res0-res3) with the round-13 AES state,
	@ store the ciphertext, and rebuild counter blocks 5..7.
	eor     $res1b, $ctr_t1b, $ctr1b                         @ AES block 1 - result
	fmov    $ctr1d, $ctr96_b64x                              @ CTR block 5
	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32           @ CTR block 5

	fmov    $ctr1.d[1], $ctr32x                              @ CTR block 5
	rev     $ctr32w, $rctr32w                                @ CTR block 6
	st1     { $res0b}, [$output_ptr], #16                    @ AES block 0 - store result

	fmov    $ctr_t3.d[1], $input_h3                          @ AES block 3 - mov high
	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32           @ CTR block 6
	eor     $res2b, $ctr_t2b, $ctr2b                         @ AES block 2 - result

	st1     { $res1b}, [$output_ptr], #16                    @ AES block 1 - store result

	add     $rctr32w, $rctr32w, #1                           @ CTR block 6
	fmov    $ctr2d, $ctr96_b64x                              @ CTR block 6

	fmov    $ctr2.d[1], $ctr32x                              @ CTR block 6
	st1     { $res2b}, [$output_ptr], #16                    @ AES block 2 - store result
	rev     $ctr32w, $rctr32w                                @ CTR block 7

	orr     $ctr32x, $ctr96_t32x, $ctr32x, lsl #32           @ CTR block 7

	eor     $res3b, $ctr_t3b, $ctr3b                         @ AES block 3 - result
	st1     { $res3b}, [$output_ptr], #16                    @ AES block 3 - store result
	@ The condition flags are still those of the earlier cmp of input_ptr
	@ against main_end_input_ptr (no instruction in between sets flags).
	@ The target is an assembler-local label, so it needs the .L prefix
	@ used by every other branch target in this kernel.
	b.ge    .L256_enc_prepretail                             @ do prepretail

	@ Main loop: per the per-line annotations, each pass interleaves the AES
	@ rounds of blocks 4k+4..4k+7 with the GHASH of blocks 4k..4k+3.
	.L256_enc_main_loop:                                     @ main loop start
	aese    $ctr0b, $rk0  \n  aesmc   $ctr0b, $ctr0b         @ AES block 4k+4 - round 0
	rev64   $res0b, $res0b                                   @ GHASH block 4k (only t0 is free)

	aese    $ctr1b, $rk0  \n  aesmc   $ctr1b, $ctr1b         @ AES block 4k+5 - round 0
	fmov    $ctr3d, $ctr96_b64x                              @ CTR block 4k+3

	aese    $ctr2b, $rk0  \n  aesmc   $ctr2b, $ctr2b         @ AES block 4k+6 - round 0
	ext     $acc_lb, $acc_lb, $acc_lb, #8                    @ PRE 0

	aese    $ctr0b, $rk1  \n  aesmc   $ctr0b, $ctr0b         @ AES block 4k+4 - round 1
	fmov    $ctr3.d[1], $ctr32x                              @ CTR block 4k+3

	aese    $ctr1b, $rk1  \n  aesmc   $ctr1b, $ctr1b         @ AES block 4k+5 - round 1
	ldp     $input_l3, $input_h3, [$input_ptr, #48]          @ AES block 4k+7 - load plaintext
	@ Big-endian builds byte-swap the two words just loaded.
#ifdef __AARCH64EB__
	rev     $input_l3, $input_l3
	rev     $input_h3, $input_h3
#endif
	aese    $ctr2b, $rk1  \n  aesmc   $ctr2b, $ctr2b         @ AES block 4k+6 - round 1
	ldp     $input_l2,
$input_h2, [$input_ptr, #32] @ AES block 4k+6 - load plaintext 4361#ifdef __AARCH64EB__ 4362 rev $input_l2, $input_l2 4363 rev $input_h2, $input_h2 4364#endif 4365 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2 4366 eor $res0b, $res0b, $acc_lb @ PRE 1 4367 4368 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2 4369 4370 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0 4371 eor $input_l3, $input_l3, $rk14_l @ AES block 4k+7 - round 14 low 4372 4373 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3 4374 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid 4375 4376 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high 4377 eor $input_h2, $input_h2, $rk14_h @ AES block 4k+6 - round 14 high 4378 mov $t0d, $res0.d[1] @ GHASH block 4k - mid 4379 4380 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1 4381 rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free) 4382 4383 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4 4384 4385 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low 4386 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid 4387 4388 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2 4389 4390 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5 4391 rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free) 4392 4393 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high 4394 4395 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid 4396 rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free) 4397 4398 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low 4399 4400 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high 4401 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid 4402 4403 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3 4404 4405 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2 4406 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low 
4407 4408 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3 4409 4410 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4 4411 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid 4412 4413 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3 4414 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid 4415 4416 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4 4417 4418 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6 4419 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid 4420 4421 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4 4422 4423 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid 4424 4425 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7 4426 4427 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5 4428 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid 4429 4430 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5 4431 4432 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8 4433 4434 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5 4435 4436 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6 4437 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid 4438 4439 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high 4440 4441 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low 4442 4443 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7 4444 4445 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low 4446 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high 4447 4448 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6 4449 ldp $input_l1, $input_h1, [$input_ptr, #16] @ AES block 4k+5 - load plaintext 4450#ifdef __AARCH64EB__ 4451 rev $input_l1, $input_l1 4452 rev $input_h1, $input_h1 4453#endif 4454 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8 4455 mov $t9d, $res3.d[1] @ 
GHASH block 4k+3 - mid 4456 4457 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6 4458 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low 4459 4460 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid 4461 4462 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high 4463 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid 4464 4465 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7 4466 eor $input_l1, $input_l1, $rk14_l @ AES block 4k+5 - round 14 low 4467 4468 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9 4469 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid 4470 4471 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7 4472 eor $input_l2, $input_l2, $rk14_l @ AES block 4k+6 - round 14 low 4473 4474 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9 4475 movi $mod_constant.8b, #0xc2 4476 4477 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid 4478 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high 4479 fmov $ctr_t1d, $input_l1 @ AES block 4k+5 - mov low 4480 4481 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8 4482 ldp $input_l0, $input_h0, [$input_ptr, #0] @ AES block 4k+4 - load plaintext 4483#ifdef __AARCH64EB__ 4484 rev $input_l0, $input_l0 4485 rev $input_h0, $input_h0 4486#endif 4487 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10 4488 shl $mod_constantd, $mod_constantd, #56 @ mod_constant 4489 4490 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8 4491 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low 4492 4493 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9 4494 4495 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10 4496 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid 4497 4498 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9 4499 add $rctr32w, $rctr32w, #1 @ CTR block 4k+3 4500 4501 aese $ctr0b, 
$rk11 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 11 4502 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up 4503 4504 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11 4505 add $input_ptr, $input_ptr, #64 @ AES input_ptr update 4506 4507 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid 4508 rev $ctr32w, $rctr32w @ CTR block 4k+8 4509 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment 4510 4511 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10 4512 eor $input_l0, $input_l0, $rk14_l @ AES block 4k+4 - round 14 low 4513 4514 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12 4515 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up 4516 4517 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10 4518 eor $input_h0, $input_h0, $rk14_h @ AES block 4k+4 - round 14 high 4519 4520 fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low 4521 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8 4522 eor $mod_t.16b, $acc_hb, $mod_t.16b @ MODULO - fold into mid 4523 4524 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 12 4525 eor $input_h1, $input_h1, $rk14_h @ AES block 4k+5 - round 14 high 4526 4527 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11 4528 eor $input_h3, $input_h3, $rk14_h @ AES block 4k+7 - round 14 high 4529 4530 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11 4531 add $rctr32w, $rctr32w, #1 @ CTR block 4k+8 4532 4533 aese $ctr0b, $rk13 @ AES block 4k+4 - round 13 4534 fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high 4535 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid 4536 4537 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12 4538 fmov $ctr_t3d, $input_l3 @ AES block 4k+7 - mov low 4539 4540 aese $ctr1b, $rk13 @ AES block 4k+5 - round 13 4541 fmov $ctr_t1.d[1], $input_h1 @ AES block 4k+5 - mov high 4542 4543 fmov 
$ctr_t2d, $input_l2 @ AES block 4k+6 - mov low 4544 cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL 4545 4546 fmov $ctr_t2.d[1], $input_h2 @ AES block 4k+6 - mov high 4547 4548 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low 4549 eor $res0b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result 4550 fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8 4551 4552 fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8 4553 rev $ctr32w, $rctr32w @ CTR block 4k+9 4554 add $rctr32w, $rctr32w, #1 @ CTR block 4k+9 4555 4556 eor $res1b, $ctr_t1b, $ctr1b @ AES block 4k+5 - result 4557 fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9 4558 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9 4559 4560 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 12 4561 fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9 4562 4563 aese $ctr2b, $rk13 @ AES block 4k+6 - round 13 4564 rev $ctr32w, $rctr32w @ CTR block 4k+10 4565 st1 { $res0b}, [$output_ptr], #16 @ AES block 4k+4 - store result 4566 4567 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10 4568 eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low 4569 fmov $ctr_t3.d[1], $input_h3 @ AES block 4k+7 - mov high 4570 4571 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment 4572 st1 { $res1b}, [$output_ptr], #16 @ AES block 4k+5 - store result 4573 add $rctr32w, $rctr32w, #1 @ CTR block 4k+10 4574 4575 aese $ctr3b, $rk13 @ AES block 4k+7 - round 13 4576 eor $res2b, $ctr_t2b, $ctr2b @ AES block 4k+6 - result 4577 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+10 4578 4579 st1 { $res2b}, [$output_ptr], #16 @ AES block 4k+6 - store result 4580 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+10 4581 rev $ctr32w, $rctr32w @ CTR block 4k+11 4582 4583 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low 4584 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+11 4585 4586 eor $res3b, $ctr_t3b, $ctr3b @ AES block 4k+7 - result 4587 st1 { $res3b}, [$output_ptr], #16 @ AES block 4k+7 - store result 4588 b.lt 
L256_enc_main_loop 4589 4590 .L256_enc_prepretail: @ PREPRETAIL 4591 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0 4592 rev64 $res2b, $res2b @ GHASH block 4k+2 (t0, t1, and t2 free) 4593 4594 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0 4595 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+3 4596 4597 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0 4598 rev64 $res0b, $res0b @ GHASH block 4k (only t0 is free) 4599 4600 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+3 4601 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 4602 4603 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1 4604 4605 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1 4606 4607 eor $res0b, $res0b, $acc_lb @ PRE 1 4608 rev64 $res1b, $res1b @ GHASH block 4k+1 (t0 and t1 free) 4609 4610 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2 4611 4612 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0 4613 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid 4614 4615 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1 4616 4617 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low 4618 mov $t0d, $res0.d[1] @ GHASH block 4k - mid 4619 4620 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high 4621 4622 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3 4623 4624 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2 4625 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid 4626 4627 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2 4628 4629 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1 4630 4631 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3 4632 4633 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid 4634 4635 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high 4636 4637 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low 4638 4639 aese $ctr3b, $rk2 \n 
aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2 4640 4641 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high 4642 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid 4643 4644 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3 4645 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low 4646 4647 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3 4648 4649 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid 4650 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid 4651 4652 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4 4653 rev64 $res3b, $res3b @ GHASH block 4k+3 (t0, t1, t2 and t3 free) 4654 4655 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4 4656 4657 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid 4658 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid 4659 add $rctr32w, $rctr32w, #1 @ CTR block 4k+3 4660 4661 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low 4662 4663 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5 4664 4665 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4 4666 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid 4667 4668 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high 4669 4670 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low 4671 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid 4672 4673 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5 4674 4675 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high 4676 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid 4677 4678 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4 4679 4680 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid 4681 4682 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid 4683 4684 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high 4685 4686 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5 4687 4688 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid 4689 eor 
$acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid 4690 4691 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5 4692 4693 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6 4694 4695 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6 4696 4697 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6 4698 movi $mod_constant.8b, #0xc2 4699 4700 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6 4701 4702 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7 4703 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high 4704 4705 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7 4706 4707 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7 4708 shl $mod_constantd, $mod_constantd, #56 @ mod_constant 4709 4710 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8 4711 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid 4712 4713 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low 4714 4715 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8 4716 4717 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9 4718 4719 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8 4720 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low 4721 4722 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9 4723 4724 eor $acc_mb, $acc_mb, $acc_hb @ karatsuba tidy up 4725 4726 pmull $t1.1q, $acc_h.1d, $mod_constant.1d 4727 ext $acc_hb, $acc_hb, $acc_hb, #8 4728 4729 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10 4730 4731 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7 4732 eor $acc_mb, $acc_mb, $acc_lb 4733 4734 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10 4735 4736 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9 4737 4738 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - 
round 8 4739 4740 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11 4741 eor $acc_mb, $acc_mb, $t1.16b 4742 4743 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10 4744 4745 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9 4746 4747 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12 4748 4749 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 11 4750 eor $acc_mb, $acc_mb, $acc_hb 4751 4752 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11 4753 4754 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10 4755 4756 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 12 4757 4758 pmull $t1.1q, $acc_m.1d, $mod_constant.1d 4759 4760 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11 4761 ext $acc_mb, $acc_mb, $acc_mb, #8 4762 4763 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 12 4764 4765 aese $ctr1b, $rk13 @ AES block 4k+5 - round 13 4766 eor $acc_lb, $acc_lb, $t1.16b 4767 4768 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12 4769 4770 aese $ctr3b, $rk13 @ AES block 4k+7 - round 13 4771 4772 aese $ctr0b, $rk13 @ AES block 4k+4 - round 13 4773 4774 aese $ctr2b, $rk13 @ AES block 4k+6 - round 13 4775 eor $acc_lb, $acc_lb, $acc_mb 4776 .L256_enc_tail: @ TAIL 4777 4778 ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag 4779 sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process 4780 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES block 4k+4 - load plaintext 4781#ifdef __AARCH64EB__ 4782 rev $input_l0, $input_l0 4783 rev $input_h0, $input_h0 4784#endif 4785 eor $input_l0, $input_l0, $rk14_l @ AES block 4k+4 - round 14 low 4786 eor $input_h0, $input_h0, $rk14_h @ AES block 4k+4 - round 14 high 4787 4788 cmp $main_end_input_ptr, #48 4789 fmov $ctr_t0d, $input_l0 @ AES block 4k+4 - mov low 4790 4791 
fmov $ctr_t0.d[1], $input_h0 @ AES block 4k+4 - mov high 4792 4793 eor $res1b, $ctr_t0b, $ctr0b @ AES block 4k+4 - result 4794 b.gt .L256_enc_blocks_more_than_3 4795 4796 cmp $main_end_input_ptr, #32 4797 mov $ctr3b, $ctr2b 4798 movi $acc_l.8b, #0 4799 4800 movi $acc_h.8b, #0 4801 sub $rctr32w, $rctr32w, #1 4802 4803 mov $ctr2b, $ctr1b 4804 movi $acc_m.8b, #0 4805 b.gt .L256_enc_blocks_more_than_2 4806 4807 mov $ctr3b, $ctr1b 4808 sub $rctr32w, $rctr32w, #1 4809 cmp $main_end_input_ptr, #16 4810 4811 b.gt .L256_enc_blocks_more_than_1 4812 4813 sub $rctr32w, $rctr32w, #1 4814 b .L256_enc_blocks_less_than_1 4815 .L256_enc_blocks_more_than_3: @ blocks left > 3 4816 st1 { $res1b}, [$output_ptr], #16 @ AES final-3 block - store result 4817 4818 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-2 block - load input low & high 4819#ifdef __AARCH64EB__ 4820 rev $input_l0, $input_l0 4821 rev $input_h0, $input_h0 4822#endif 4823 rev64 $res0b, $res1b @ GHASH final-3 block 4824 4825 eor $input_l0, $input_l0, $rk14_l @ AES final-2 block - round 14 low 4826 eor $res0b, $res0b, $t0.16b @ feed in partial tag 4827 4828 eor $input_h0, $input_h0, $rk14_h @ AES final-2 block - round 14 high 4829 4830 mov $rk4d, $res0.d[1] @ GHASH final-3 block - mid 4831 fmov $res1d, $input_l0 @ AES final-2 block - mov low 4832 4833 fmov $res1.d[1], $input_h0 @ AES final-2 block - mov high 4834 4835 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid 4836 movi $t0.8b, #0 @ suppress further partial tag feed in 4837 4838 mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid 4839 4840 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low 4841 4842 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high 4843 4844 pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid 4845 eor $res1b, $res1b, $ctr1b @ AES final-2 block - result 4846 .L256_enc_blocks_more_than_2: @ blocks left > 2 4847 4848 st1 { $res1b}, [$output_ptr], #16 @ AES final-2 block - store result 4849 4850 
ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final-1 block - load input low & high 4851#ifdef __AARCH64EB__ 4852 rev $input_l0, $input_l0 4853 rev $input_h0, $input_h0 4854#endif 4855 rev64 $res0b, $res1b @ GHASH final-2 block 4856 4857 eor $input_l0, $input_l0, $rk14_l @ AES final-1 block - round 14 low 4858 eor $res0b, $res0b, $t0.16b @ feed in partial tag 4859 4860 fmov $res1d, $input_l0 @ AES final-1 block - mov low 4861 eor $input_h0, $input_h0, $rk14_h @ AES final-1 block - round 14 high 4862 4863 fmov $res1.d[1], $input_h0 @ AES final-1 block - mov high 4864 4865 movi $t0.8b, #0 @ suppress further partial tag feed in 4866 4867 pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high 4868 mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid 4869 4870 pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low 4871 4872 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid 4873 4874 eor $res1b, $res1b, $ctr2b @ AES final-1 block - result 4875 4876 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high 4877 4878 pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid 4879 4880 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low 4881 4882 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid 4883 .L256_enc_blocks_more_than_1: @ blocks left > 1 4884 4885 st1 { $res1b}, [$output_ptr], #16 @ AES final-1 block - store result 4886 4887 rev64 $res0b, $res1b @ GHASH final-1 block 4888 4889 ldp $input_l0, $input_h0, [$input_ptr], #16 @ AES final block - load input low & high 4890#ifdef __AARCH64EB__ 4891 rev $input_l0, $input_l0 4892 rev $input_h0, $input_h0 4893#endif 4894 eor $res0b, $res0b, $t0.16b @ feed in partial tag 4895 4896 movi $t0.8b, #0 @ suppress further partial tag feed in 4897 4898 eor $input_l0, $input_l0, $rk14_l @ AES final block - round 14 low 4899 mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid 4900 4901 pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high 4902 eor $input_h0, $input_h0, $rk14_h @ AES final block 
- round 14 high 4903 4904 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid 4905 4906 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high 4907 4908 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid 4909 fmov $res1d, $input_l0 @ AES final block - mov low 4910 4911 fmov $res1.d[1], $input_h0 @ AES final block - mov high 4912 4913 pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid 4914 4915 pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low 4916 4917 eor $res1b, $res1b, $ctr3b @ AES final block - result 4918 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid 4919 4920 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low 4921 .L256_enc_blocks_less_than_1: @ blocks left <= 1 4922 4923 and $bit_length, $bit_length, #127 @ bit_length %= 128 4924 4925 mvn $rk14_l, xzr @ rk14_l = 0xffffffffffffffff 4926 sub $bit_length, $bit_length, #128 @ bit_length -= 128 4927 4928 neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128]) 4929 ld1 { $rk0}, [$output_ptr] @ load existing bytes where the possibly partial last block is to be stored 4930 4931 mvn $rk14_h, xzr @ rk14_h = 0xffffffffffffffff 4932 and $bit_length, $bit_length, #127 @ bit_length %= 128 4933 4934 lsr $rk14_h, $rk14_h, $bit_length @ rk14_h is mask for top 64b of last block 4935 cmp $bit_length, #64 4936 4937 csel $input_l0, $rk14_l, $rk14_h, lt 4938 csel $input_h0, $rk14_h, xzr, lt 4939 4940 fmov $ctr0d, $input_l0 @ ctr0b is mask for last block 4941 4942 fmov $ctr0.d[1], $input_h0 4943 4944 and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits 4945 4946 rev64 $res0b, $res1b @ GHASH final block 4947 4948 eor $res0b, $res0b, $t0.16b @ feed in partial tag 4949 4950 bif $res1b, $rk0, $ctr0b @ insert existing bytes in top end of result before storing 4951 4952 pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high 4953 mov $t0d, $res0.d[1] @ GHASH final block - mid 4954#ifndef __AARCH64EB__ 4955 rev $ctr32w, 
$rctr32w 4956#else 4957 mov $ctr32w, $rctr32w 4958#endif 4959 4960 pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low 4961 4962 eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high 4963 eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid 4964 4965 pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid 4966 4967 eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low 4968 4969 eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid 4970 movi $mod_constant.8b, #0xc2 4971 4972 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up 4973 4974 shl $mod_constantd, $mod_constantd, #56 @ mod_constant 4975 4976 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up 4977 4978 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid 4979 4980 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment 4981 4982 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid 4983 4984 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid 4985 4986 pmull $acc_h.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low 4987 4988 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment 4989 4990 str $ctr32w, [$counter, #12] @ store the updated counter 4991 4992 st1 { $res1b}, [$output_ptr] @ store all 16B 4993 eor $acc_lb, $acc_lb, $acc_hb @ MODULO - fold into low 4994 4995 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low 4996 ext $acc_lb, $acc_lb, $acc_lb, #8 4997 rev64 $acc_lb, $acc_lb 4998 mov x0, $len 4999 st1 { $acc_l.16b }, [$current_tag] 5000 5001 ldp x21, x22, [sp, #16] 5002 ldp x23, x24, [sp, #32] 5003 ldp d8, d9, [sp, #48] 5004 ldp d10, d11, [sp, #64] 5005 ldp d12, d13, [sp, #80] 5006 ldp d14, d15, [sp, #96] 5007 ldp x19, x20, [sp], #112 5008 ret 5009 5010.L256_enc_ret: 5011 mov w0, #0x0 5012 ret 5013.size aes_gcm_enc_256_kernel,.-aes_gcm_enc_256_kernel 5014___ 5015 5016{ 5017my $t8="v4"; 5018my $t8d="d4"; 5019my $t9="v6"; 5020my $t9d="d6"; 
5021######################################################################################### 5022# size_t aes_gcm_dec_256_kernel(const unsigned char *in, 5023# size_t len, 5024# unsigned char *out, 5025# const void *key, 5026# unsigned char ivec[16], 5027# u64 *Xi); 5028# 5029$code.=<<___; 5030.global aes_gcm_dec_256_kernel 5031.type aes_gcm_dec_256_kernel,%function 5032.align 4 5033aes_gcm_dec_256_kernel: 5034 AARCH64_VALID_CALL_TARGET 5035 cbz x1, .L256_dec_ret 5036 stp x19, x20, [sp, #-112]! 5037 mov x16, x4 5038 mov x8, x5 5039 stp x21, x22, [sp, #16] 5040 stp x23, x24, [sp, #32] 5041 stp d8, d9, [sp, #48] 5042 stp d10, d11, [sp, #64] 5043 stp d12, d13, [sp, #80] 5044 stp d14, d15, [sp, #96] 5045 5046 lsr $main_end_input_ptr, $bit_length, #3 @ byte_len 5047 mov $len, $main_end_input_ptr 5048 ldp $ctr96_b64x, $ctr96_t32x, [$counter] @ ctr96_b64, ctr96_t32 5049#ifdef __AARCH64EB__ 5050 rev $ctr96_b64x, $ctr96_b64x 5051 rev $ctr96_t32x, $ctr96_t32x 5052#endif 5053 ldp $rk14_l, $rk14_h, [$cc, #224] @ load rk14 5054#ifdef __AARCH64EB__ 5055 ror $rk14_h, $rk14_h, #32 5056 ror $rk14_l, $rk14_l, #32 5057#endif 5058 ld1 {$rk0s}, [$cc], #16 @ load rk0 5059 sub $main_end_input_ptr, $main_end_input_ptr, #1 @ byte_len - 1 5060 5061 ld1 {$rk1s}, [$cc], #16 @ load rk1 5062 and $main_end_input_ptr, $main_end_input_ptr, #0xffffffffffffffc0 @ number of bytes to be processed in main loop (at least 1 byte must be handled by tail) 5063 5064 add $end_input_ptr, $input_ptr, $bit_length, lsr #3 @ end_input_ptr 5065 ld1 {$rk2s}, [$cc], #16 @ load rk2 5066 5067 lsr $rctr32x, $ctr96_t32x, #32 5068 ld1 {$rk3s}, [$cc], #16 @ load rk3 5069 orr $ctr96_t32w, $ctr96_t32w, $ctr96_t32w 5070 5071 ld1 {$rk4s}, [$cc], #16 @ load rk4 5072 add $main_end_input_ptr, $main_end_input_ptr, $input_ptr 5073 rev $rctr32w, $rctr32w @ rev_ctr32 5074 5075 add $rctr32w, $rctr32w, #1 @ increment rev_ctr32 5076 fmov $ctr3d, $ctr96_b64x @ CTR block 3 5077 5078 rev $ctr32w, $rctr32w @ CTR block 1 5079 add $rctr32w, 
$rctr32w, #1 @ CTR block 1 5080 fmov $ctr1d, $ctr96_b64x @ CTR block 1 5081 5082 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 1 5083 ld1 { $ctr0b}, [$counter] @ special case vector load initial counter so we can start first AES block as quickly as possible 5084 5085 fmov $ctr1.d[1], $ctr32x @ CTR block 1 5086 rev $ctr32w, $rctr32w @ CTR block 2 5087 add $rctr32w, $rctr32w, #1 @ CTR block 2 5088 5089 fmov $ctr2d, $ctr96_b64x @ CTR block 2 5090 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 2 5091 5092 fmov $ctr2.d[1], $ctr32x @ CTR block 2 5093 rev $ctr32w, $rctr32w @ CTR block 3 5094 5095 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 3 5096 ld1 {$rk5s}, [$cc], #16 @ load rk5 5097 5098 fmov $ctr3.d[1], $ctr32x @ CTR block 3 5099 add $rctr32w, $rctr32w, #1 @ CTR block 3 5100 5101 ld1 {$rk6s}, [$cc], #16 @ load rk6 5102 5103 ld1 {$rk7s}, [$cc], #16 @ load rk7 5104 5105 ld1 {$rk8s}, [$cc], #16 @ load rk8 5106 5107 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 0 5108 ldr $h3q, [$current_tag, #80] @ load h3l | h3h 5109#ifndef __AARCH64EB__ 5110 ext $h3b, $h3b, $h3b, #8 5111#endif 5112 5113 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 0 5114 ldr $h4q, [$current_tag, #112] @ load h4l | h4h 5115#ifndef __AARCH64EB__ 5116 ext $h4b, $h4b, $h4b, #8 5117#endif 5118 5119 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 0 5120 ldr $h2q, [$current_tag, #64] @ load h2l | h2h 5121#ifndef __AARCH64EB__ 5122 ext $h2b, $h2b, $h2b, #8 5123#endif 5124 5125 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 0 5126 ld1 {$rk9s}, [$cc], #16 @ load rk9 5127 5128 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 1 5129 5130 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 1 5131 ld1 { $acc_lb}, [$current_tag] 5132 ext $acc_lb, $acc_lb, $acc_lb, #8 5133 rev64 $acc_lb, $acc_lb 5134 5135 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 1 5136 ld1 {$rk10s}, [$cc], 
#16 @ load rk10 5137 5138 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 1 5139 ld1 {$rk11s}, [$cc], #16 @ load rk11 5140 5141 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 2 5142 ldr $h1q, [$current_tag, #32] @ load h1l | h1h 5143#ifndef __AARCH64EB__ 5144 ext $h1b, $h1b, $h1b, #8 5145#endif 5146 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 2 5147 ld1 {$rk12s}, [$cc], #16 @ load rk12 5148 5149 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 2 5150 5151 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 3 5152 5153 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 2 5154 5155 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 3 5156 5157 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 4 5158 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 4 blocks 5159 5160 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 3 5161 5162 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 3 5163 5164 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 4 5165 5166 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 4 5167 5168 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 4 5169 5170 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 5 5171 5172 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 5 5173 5174 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 5 5175 5176 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 5 5177 5178 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 6 5179 5180 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 6 5181 5182 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 6 5183 5184 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 6 5185 5186 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 7 5187 5188 aese $ctr1b, $rk7 \n aesmc 
$ctr1b, $ctr1b @ AES block 1 - round 7 5189 5190 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 7 5191 5192 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 8 5193 5194 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 7 5195 5196 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 8 5197 5198 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 8 5199 5200 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 9 5201 5202 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 8 5203 ld1 {$rk13s}, [$cc], #16 @ load rk13 5204 5205 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 9 5206 5207 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 10 5208 5209 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 9 5210 5211 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 10 5212 5213 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 9 5214 5215 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 10 5216 5217 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 11 5218 5219 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 10 5220 5221 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 11 5222 5223 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 11 5224 5225 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 11 5226 5227 trn1 $acc_h.2d, $h3.2d, $h4.2d @ h4h | h3h 5228 5229 trn2 $h34k.2d, $h3.2d, $h4.2d @ h4l | h3l 5230 5231 trn1 $t0.2d, $h1.2d, $h2.2d @ h2h | h1h 5232 trn2 $h12k.2d, $h1.2d, $h2.2d @ h2l | h1l 5233 5234 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 1 - round 12 5235 5236 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 0 - round 12 5237 5238 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 2 - round 12 5239 5240 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 3 - round 12 5241 eor $h34k.16b, 
$h34k.16b, $acc_h.16b @ h4k | h3k 5242 5243 aese $ctr1b, $rk13 @ AES block 1 - round 13 5244 5245 aese $ctr2b, $rk13 @ AES block 2 - round 13 5246 eor $h12k.16b, $h12k.16b, $t0.16b @ h2k | h1k 5247 5248 aese $ctr3b, $rk13 @ AES block 3 - round 13 5249 5250 aese $ctr0b, $rk13 @ AES block 0 - round 13 5251 b.ge .L256_dec_tail @ handle tail 5252 5253 ld1 {$res0b, $res1b}, [$input_ptr], #32 @ AES block 0,1 - load ciphertext 5254 5255 rev $ctr32w, $rctr32w @ CTR block 4 5256 5257 eor $ctr0b, $res0b, $ctr0b @ AES block 0 - result 5258 5259 eor $ctr1b, $res1b, $ctr1b @ AES block 1 - result 5260 rev64 $res1b, $res1b @ GHASH block 1 5261 ld1 {$res2b}, [$input_ptr], #16 @ AES block 2 - load ciphertext 5262 5263 mov $output_h0, $ctr0.d[1] @ AES block 0 - mov high 5264 5265 mov $output_l0, $ctr0.d[0] @ AES block 0 - mov low 5266 rev64 $res0b, $res0b @ GHASH block 0 5267 add $rctr32w, $rctr32w, #1 @ CTR block 4 5268 5269 fmov $ctr0d, $ctr96_b64x @ CTR block 4 5270 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4 5271 5272 fmov $ctr0.d[1], $ctr32x @ CTR block 4 5273 rev $ctr32w, $rctr32w @ CTR block 5 5274 add $rctr32w, $rctr32w, #1 @ CTR block 5 5275 5276 mov $output_l1, $ctr1.d[0] @ AES block 1 - mov low 5277 5278 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 5 5279 mov $output_h1, $ctr1.d[1] @ AES block 1 - mov high 5280 eor $output_h0, $output_h0, $rk14_h @ AES block 0 - round 14 high 5281#ifdef __AARCH64EB__ 5282 rev $output_h0, $output_h0 5283#endif 5284 eor $output_l0, $output_l0, $rk14_l @ AES block 0 - round 14 low 5285#ifdef __AARCH64EB__ 5286 rev $output_l0, $output_l0 5287#endif 5288 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 0 - store result 5289 fmov $ctr1d, $ctr96_b64x @ CTR block 5 5290 5291 ld1 {$res3b}, [$input_ptr], #16 @ AES block 3 - load ciphertext 5292 5293 fmov $ctr1.d[1], $ctr32x @ CTR block 5 5294 rev $ctr32w, $rctr32w @ CTR block 6 5295 add $rctr32w, $rctr32w, #1 @ CTR block 6 5296 5297 eor $output_l1, $output_l1, 
$rk14_l @ AES block 1 - round 14 low 5298#ifdef __AARCH64EB__ 5299 rev $output_l1, $output_l1 5300#endif 5301 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 6 5302 5303 eor $output_h1, $output_h1, $rk14_h @ AES block 1 - round 14 high 5304#ifdef __AARCH64EB__ 5305 rev $output_h1, $output_h1 5306#endif 5307 stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 1 - store result 5308 5309 eor $ctr2b, $res2b, $ctr2b @ AES block 2 - result 5310 cmp $input_ptr, $main_end_input_ptr @ check if we have <= 8 blocks 5311 b.ge .L256_dec_prepretail @ do prepretail 5312 5313 .L256_dec_main_loop: @ main loop start 5314 mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low 5315 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 5316 eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result 5317 5318 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0 5319 mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high 5320 5321 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0 5322 fmov $ctr2d, $ctr96_b64x @ CTR block 4k+6 5323 5324 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6 5325 eor $res0b, $res0b, $acc_lb @ PRE 1 5326 rev $ctr32w, $rctr32w @ CTR block 4k+7 5327 5328 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1 5329 mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high 5330 5331 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1 5332 mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low 5333 5334 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high 5335 mov $t0d, $res0.d[1] @ GHASH block 4k - mid 5336 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7 5337 5338 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2 5339 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7 5340 5341 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0 5342 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7 5343 5344 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2 5345 eor $t0.8b, 
$t0.8b, $res0.8b @ GHASH block 4k - mid 5346 5347 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3 5348 eor $output_h2, $output_h2, $rk14_h @ AES block 4k+2 - round 14 high 5349#ifdef __AARCH64EB__ 5350 rev $output_h2, $output_h2 5351#endif 5352 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1 5353 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid 5354 5355 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3 5356 rev64 $res2b, $res2b @ GHASH block 4k+2 5357 5358 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0 5359 eor $output_l2, $output_l2, $rk14_l @ AES block 4k+2 - round 14 low 5360#ifdef __AARCH64EB__ 5361 rev $output_l2, $output_l2 5362#endif 5363 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2 5364 stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result 5365 5366 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low 5367 5368 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high 5369 5370 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3 5371 rev64 $res3b, $res3b @ GHASH block 4k+3 5372 5373 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid 5374 eor $output_l3, $output_l3, $rk14_l @ AES block 4k+3 - round 14 low 5375#ifdef __AARCH64EB__ 5376 rev $output_l3, $output_l3 5377#endif 5378 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low 5379 eor $output_h3, $output_h3, $rk14_h @ AES block 4k+3 - round 14 high 5380#ifdef __AARCH64EB__ 5381 rev $output_h3, $output_h3 5382#endif 5383 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high 5384 5385 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4 5386 5387 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1 5388 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid 5389 5390 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4 5391 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low 5392 5393 aese $ctr2b, $rk5 \n 
aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5 5394 add $rctr32w, $rctr32w, #1 @ CTR block 4k+7 5395 5396 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2 5397 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid 5398 5399 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4 5400 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid 5401 5402 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low 5403 5404 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3 5405 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid 5406 5407 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5 5408 5409 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5 5410 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low 5411 5412 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid 5413 rev $ctr32w, $rctr32w @ CTR block 4k+8 5414 5415 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6 5416 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid 5417 5418 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6 5419 add $rctr32w, $rctr32w, #1 @ CTR block 4k+8 5420 5421 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4 5422 5423 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7 5424 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid 5425 5426 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7 5427 5428 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high 5429 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid 5430 5431 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5 5432 5433 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid 5434 5435 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8 5436 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high 5437 5438 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6 5439 5440 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - 
low 5441 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+8 5442 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - mid 5443 5444 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high 5445 5446 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9 5447 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid 5448 5449 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8 5450 5451 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6 5452 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high 5453 5454 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10 5455 5456 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid 5457 movi $mod_constant.8b, #0xc2 5458 5459 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7 5460 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low 5461 5462 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 11 5463 5464 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7 5465 shl $mod_constantd, $mod_constantd, #56 @ mod_constant 5466 5467 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8 5468 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid 5469 5470 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 12 5471 5472 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid 5473 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up 5474 5475 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9 5476 ld1 {$res0b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext 5477 5478 aese $ctr0b, $rk13 @ AES block 4k+4 - round 13 5479 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment 5480 5481 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10 5482 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up 5483 5484 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9 5485 ld1 {$res1b}, 
[$input_ptr], #16 @ AES block 4k+5 - load ciphertext 5486 5487 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8 5488 eor $ctr0b, $res0b, $ctr0b @ AES block 4k+4 - result 5489 5490 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11 5491 stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result 5492 5493 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10 5494 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid 5495 5496 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9 5497 ld1 {$res2b}, [$input_ptr], #16 @ AES block 4k+6 - load ciphertext 5498 5499 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12 5500 ld1 {$res3b}, [$input_ptr], #16 @ AES block 4k+7 - load ciphertext 5501 5502 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11 5503 mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high 5504 5505 aese $ctr3b, $rk10 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 10 5506 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid 5507 5508 aese $ctr1b, $rk13 @ AES block 4k+5 - round 13 5509 mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low 5510 5511 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12 5512 fmov $ctr0d, $ctr96_b64x @ CTR block 4k+8 5513 5514 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11 5515 fmov $ctr0.d[1], $ctr32x @ CTR block 4k+8 5516 5517 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low 5518 eor $ctr1b, $res1b, $ctr1b @ AES block 4k+5 - result 5519 rev $ctr32w, $rctr32w @ CTR block 4k+9 5520 5521 aese $ctr2b, $rk13 @ AES block 4k+6 - round 13 5522 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+9 5523 cmp $input_ptr, $main_end_input_ptr @ LOOP CONTROL 5524 5525 add $rctr32w, $rctr32w, #1 @ CTR block 4k+9 5526 5527 eor $output_l0, $output_l0, $rk14_l @ AES block 4k+4 - round 14 low 5528#ifdef __AARCH64EB__ 5529 rev 
$output_l0, $output_l0 5530#endif 5531 eor $output_h0, $output_h0, $rk14_h @ AES block 4k+4 - round 14 high 5532#ifdef __AARCH64EB__ 5533 rev $output_h0, $output_h0 5534#endif 5535 mov $output_h1, $ctr1.d[1] @ AES block 4k+5 - mov high 5536 eor $ctr2b, $res2b, $ctr2b @ AES block 4k+6 - result 5537 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low 5538 5539 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 12 5540 mov $output_l1, $ctr1.d[0] @ AES block 4k+5 - mov low 5541 5542 fmov $ctr1d, $ctr96_b64x @ CTR block 4k+9 5543 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment 5544 5545 fmov $ctr1.d[1], $ctr32x @ CTR block 4k+9 5546 rev $ctr32w, $rctr32w @ CTR block 4k+10 5547 add $rctr32w, $rctr32w, #1 @ CTR block 4k+10 5548 5549 aese $ctr3b, $rk13 @ AES block 4k+7 - round 13 5550 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+10 5551 5552 rev64 $res1b, $res1b @ GHASH block 4k+5 5553 eor $output_h1, $output_h1, $rk14_h @ AES block 4k+5 - round 14 high 5554#ifdef __AARCH64EB__ 5555 rev $output_h1, $output_h1 5556#endif 5557 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES block 4k+4 - store result 5558 5559 eor $output_l1, $output_l1, $rk14_l @ AES block 4k+5 - round 14 low 5560#ifdef __AARCH64EB__ 5561 rev $output_l1, $output_l1 5562#endif 5563 stp $output_l1, $output_h1, [$output_ptr], #16 @ AES block 4k+5 - store result 5564 5565 rev64 $res0b, $res0b @ GHASH block 4k+4 5566 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low 5567 b.lt .L256_dec_main_loop 5568 5569 5570 .L256_dec_prepretail: @ PREPRETAIL 5571 ext $acc_lb, $acc_lb, $acc_lb, #8 @ PRE 0 5572 mov $output_l2, $ctr2.d[0] @ AES block 4k+2 - mov low 5573 eor $ctr3b, $res3b, $ctr3b @ AES block 4k+3 - result 5574 5575 aese $ctr0b, $rk0 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 0 5576 mov $output_h2, $ctr2.d[1] @ AES block 4k+2 - mov high 5577 5578 aese $ctr1b, $rk0 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 0 5579 fmov $ctr2d, 
$ctr96_b64x @ CTR block 4k+6 5580 5581 fmov $ctr2.d[1], $ctr32x @ CTR block 4k+6 5582 rev $ctr32w, $rctr32w @ CTR block 4k+7 5583 eor $res0b, $res0b, $acc_lb @ PRE 1 5584 5585 rev64 $res2b, $res2b @ GHASH block 4k+2 5586 orr $ctr32x, $ctr96_t32x, $ctr32x, lsl #32 @ CTR block 4k+7 5587 mov $output_l3, $ctr3.d[0] @ AES block 4k+3 - mov low 5588 5589 aese $ctr1b, $rk1 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 1 5590 mov $output_h3, $ctr3.d[1] @ AES block 4k+3 - mov high 5591 5592 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH block 4k - low 5593 mov $t0d, $res0.d[1] @ GHASH block 4k - mid 5594 fmov $ctr3d, $ctr96_b64x @ CTR block 4k+7 5595 5596 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH block 4k - high 5597 fmov $ctr3.d[1], $ctr32x @ CTR block 4k+7 5598 5599 aese $ctr2b, $rk0 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 0 5600 mov $acc_md, $h34k.d[1] @ GHASH block 4k - mid 5601 5602 aese $ctr0b, $rk1 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 1 5603 eor $t0.8b, $t0.8b, $res0.8b @ GHASH block 4k - mid 5604 5605 pmull2 $t1.1q, $res1.2d, $h3.2d @ GHASH block 4k+1 - high 5606 5607 aese $ctr2b, $rk1 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 1 5608 rev64 $res3b, $res3b @ GHASH block 4k+3 5609 5610 aese $ctr3b, $rk0 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 0 5611 5612 pmull $acc_m.1q, $t0.1d, $acc_m.1d @ GHASH block 4k - mid 5613 eor $acc_hb, $acc_hb, $t1.16b @ GHASH block 4k+1 - high 5614 5615 pmull $t2.1q, $res1.1d, $h3.1d @ GHASH block 4k+1 - low 5616 5617 aese $ctr3b, $rk1 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 1 5618 mov $t3d, $res1.d[1] @ GHASH block 4k+1 - mid 5619 5620 aese $ctr0b, $rk2 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 2 5621 5622 aese $ctr1b, $rk2 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 2 5623 eor $acc_lb, $acc_lb, $t2.16b @ GHASH block 4k+1 - low 5624 5625 aese $ctr2b, $rk2 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 2 5626 5627 aese $ctr0b, $rk3 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 3 
5628 mov $t6d, $res2.d[1] @ GHASH block 4k+2 - mid 5629 5630 aese $ctr3b, $rk2 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 2 5631 eor $t3.8b, $t3.8b, $res1.8b @ GHASH block 4k+1 - mid 5632 5633 pmull $t5.1q, $res2.1d, $h2.1d @ GHASH block 4k+2 - low 5634 5635 aese $ctr0b, $rk4 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 4 5636 5637 aese $ctr3b, $rk3 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 3 5638 eor $t6.8b, $t6.8b, $res2.8b @ GHASH block 4k+2 - mid 5639 5640 pmull $t3.1q, $t3.1d, $h34k.1d @ GHASH block 4k+1 - mid 5641 5642 aese $ctr0b, $rk5 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 5 5643 eor $acc_lb, $acc_lb, $t5.16b @ GHASH block 4k+2 - low 5644 5645 aese $ctr3b, $rk4 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 4 5646 5647 pmull2 $t7.1q, $res3.2d, $h1.2d @ GHASH block 4k+3 - high 5648 eor $acc_mb, $acc_mb, $t3.16b @ GHASH block 4k+1 - mid 5649 5650 pmull2 $t4.1q, $res2.2d, $h2.2d @ GHASH block 4k+2 - high 5651 5652 aese $ctr3b, $rk5 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 5 5653 ins $t6.d[1], $t6.d[0] @ GHASH block 4k+2 - mid 5654 5655 aese $ctr2b, $rk3 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 3 5656 5657 aese $ctr1b, $rk3 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 3 5658 eor $acc_hb, $acc_hb, $t4.16b @ GHASH block 4k+2 - high 5659 5660 pmull $t8.1q, $res3.1d, $h1.1d @ GHASH block 4k+3 - low 5661 5662 aese $ctr2b, $rk4 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 4 5663 mov $t9d, $res3.d[1] @ GHASH block 4k+3 - mid 5664 5665 aese $ctr1b, $rk4 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 4 5666 5667 pmull2 $t6.1q, $t6.2d, $h12k.2d @ GHASH block 4k+2 - mid 5668 5669 aese $ctr2b, $rk5 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 5 5670 eor $t9.8b, $t9.8b, $res3.8b @ GHASH block 4k+3 - mid 5671 5672 aese $ctr1b, $rk5 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 5 5673 5674 aese $ctr3b, $rk6 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 6 5675 eor $acc_mb, $acc_mb, $t6.16b @ GHASH block 4k+2 - 
mid 5676 5677 aese $ctr2b, $rk6 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 6 5678 5679 aese $ctr0b, $rk6 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 6 5680 movi $mod_constant.8b, #0xc2 5681 5682 aese $ctr1b, $rk6 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 6 5683 eor $acc_lb, $acc_lb, $t8.16b @ GHASH block 4k+3 - low 5684 5685 pmull $t9.1q, $t9.1d, $h12k.1d @ GHASH block 4k+3 - mid 5686 5687 aese $ctr3b, $rk7 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 7 5688 eor $acc_hb, $acc_hb, $t7.16b @ GHASH block 4k+3 - high 5689 5690 aese $ctr1b, $rk7 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 7 5691 5692 aese $ctr0b, $rk7 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 7 5693 eor $acc_mb, $acc_mb, $t9.16b @ GHASH block 4k+3 - mid 5694 5695 aese $ctr3b, $rk8 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 8 5696 5697 aese $ctr2b, $rk7 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 7 5698 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up 5699 5700 aese $ctr1b, $rk8 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 8 5701 5702 aese $ctr0b, $rk8 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 8 5703 shl $mod_constantd, $mod_constantd, #56 @ mod_constant 5704 5705 aese $ctr2b, $rk8 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 8 5706 5707 aese $ctr1b, $rk9 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 9 5708 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - karatsuba tidy up 5709 5710 pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d @ MODULO - top 64b align with mid 5711 5712 aese $ctr2b, $rk9 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 9 5713 ext $acc_hb, $acc_hb, $acc_hb, #8 @ MODULO - other top alignment 5714 5715 aese $ctr3b, $rk9 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 9 5716 5717 aese $ctr0b, $rk9 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 9 5718 eor $acc_mb, $acc_mb, $mod_t.16b @ MODULO - fold into mid 5719 5720 aese $ctr2b, $rk10 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 10 5721 5722 aese $ctr3b, $rk10 \n aesmc 
$ctr3b, $ctr3b @ AES block 4k+7 - round 10 5723 5724 aese $ctr0b, $rk10 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 10 5725 eor $output_h2, $output_h2, $rk14_h @ AES block 4k+2 - round 14 high 5726#ifdef __AARCH64EB__ 5727 rev $output_h2, $output_h2 5728#endif 5729 aese $ctr1b, $rk10 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 10 5730 eor $output_l3, $output_l3, $rk14_l @ AES block 4k+3 - round 14 low 5731#ifdef __AARCH64EB__ 5732 rev $output_l3, $output_l3 5733#endif 5734 aese $ctr2b, $rk11 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 11 5735 eor $acc_mb, $acc_mb, $acc_hb @ MODULO - fold into mid 5736 5737 aese $ctr0b, $rk11 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 11 5738 add $rctr32w, $rctr32w, #1 @ CTR block 4k+7 5739 5740 aese $ctr1b, $rk11 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 11 5741 eor $output_l2, $output_l2, $rk14_l @ AES block 4k+2 - round 14 low 5742#ifdef __AARCH64EB__ 5743 rev $output_l2, $output_l2 5744#endif 5745 5746 aese $ctr2b, $rk12 \n aesmc $ctr2b, $ctr2b @ AES block 4k+6 - round 12 5747 5748 pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d @ MODULO - mid 64b align with low 5749 eor $output_h3, $output_h3, $rk14_h @ AES block 4k+3 - round 14 high 5750#ifdef __AARCH64EB__ 5751 rev $output_h3, $output_h3 5752#endif 5753 5754 aese $ctr3b, $rk11 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 11 5755 stp $output_l2, $output_h2, [$output_ptr], #16 @ AES block 4k+2 - store result 5756 5757 aese $ctr1b, $rk12 \n aesmc $ctr1b, $ctr1b @ AES block 4k+5 - round 12 5758 ext $acc_mb, $acc_mb, $acc_mb, #8 @ MODULO - other mid alignment 5759 5760 aese $ctr0b, $rk12 \n aesmc $ctr0b, $ctr0b @ AES block 4k+4 - round 12 5761 stp $output_l3, $output_h3, [$output_ptr], #16 @ AES block 4k+3 - store result 5762 5763 aese $ctr3b, $rk12 \n aesmc $ctr3b, $ctr3b @ AES block 4k+7 - round 12 5764 eor $acc_lb, $acc_lb, $mod_constant.16b @ MODULO - fold into low 5765 5766 aese $ctr1b, $rk13 @ AES block 4k+5 - round 13 5767 5768 aese 
$ctr0b, $rk13 @ AES block 4k+4 - round 13 5769 5770 aese $ctr3b, $rk13 @ AES block 4k+7 - round 13 5771 5772 aese $ctr2b, $rk13 @ AES block 4k+6 - round 13 5773 eor $acc_lb, $acc_lb, $acc_mb @ MODULO - fold into low 5774 .L256_dec_tail: @ TAIL 5775 5776 sub $main_end_input_ptr, $end_input_ptr, $input_ptr @ main_end_input_ptr is number of bytes left to process 5777 ld1 { $res1b}, [$input_ptr], #16 @ AES block 4k+4 - load ciphertext 5778 5779 eor $ctr0b, $res1b, $ctr0b @ AES block 4k+4 - result 5780 5781 mov $output_l0, $ctr0.d[0] @ AES block 4k+4 - mov low 5782 5783 mov $output_h0, $ctr0.d[1] @ AES block 4k+4 - mov high 5784 ext $t0.16b, $acc_lb, $acc_lb, #8 @ prepare final partial tag 5785 5786 cmp $main_end_input_ptr, #48 5787 5788 eor $output_l0, $output_l0, $rk14_l @ AES block 4k+4 - round 14 low 5789#ifdef __AARCH64EB__ 5790 rev $output_l0, $output_l0 5791#endif 5792 5793 eor $output_h0, $output_h0, $rk14_h @ AES block 4k+4 - round 14 high 5794#ifdef __AARCH64EB__ 5795 rev $output_h0, $output_h0 5796#endif 5797 b.gt .L256_dec_blocks_more_than_3 5798 5799 sub $rctr32w, $rctr32w, #1 5800 mov $ctr3b, $ctr2b 5801 movi $acc_m.8b, #0 5802 5803 movi $acc_l.8b, #0 5804 cmp $main_end_input_ptr, #32 5805 5806 movi $acc_h.8b, #0 5807 mov $ctr2b, $ctr1b 5808 b.gt .L256_dec_blocks_more_than_2 5809 5810 sub $rctr32w, $rctr32w, #1 5811 5812 mov $ctr3b, $ctr1b 5813 cmp $main_end_input_ptr, #16 5814 b.gt .L256_dec_blocks_more_than_1 5815 5816 sub $rctr32w, $rctr32w, #1 5817 b .L256_dec_blocks_less_than_1 5818 .L256_dec_blocks_more_than_3: @ blocks left > 3 5819 rev64 $res0b, $res1b @ GHASH final-3 block 5820 ld1 { $res1b}, [$input_ptr], #16 @ AES final-2 block - load ciphertext 5821 5822 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-3 block - store result 5823 5824 mov $acc_md, $h34k.d[1] @ GHASH final-3 block - mid 5825 5826 eor $res0b, $res0b, $t0.16b @ feed in partial tag 5827 5828 eor $ctr0b, $res1b, $ctr1b @ AES final-2 block - result 5829 5830 mov $rk4d, 
$res0.d[1] @ GHASH final-3 block - mid 5831 5832 mov $output_l0, $ctr0.d[0] @ AES final-2 block - mov low 5833 5834 mov $output_h0, $ctr0.d[1] @ AES final-2 block - mov high 5835 5836 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-3 block - mid 5837 5838 movi $t0.8b, #0 @ suppress further partial tag feed in 5839 5840 pmull2 $acc_h.1q, $res0.2d, $h4.2d @ GHASH final-3 block - high 5841 5842 pmull $acc_m.1q, $rk4v.1d, $acc_m.1d @ GHASH final-3 block - mid 5843 eor $output_l0, $output_l0, $rk14_l @ AES final-2 block - round 14 low 5844#ifdef __AARCH64EB__ 5845 rev $output_l0, $output_l0 5846#endif 5847 5848 pmull $acc_l.1q, $res0.1d, $h4.1d @ GHASH final-3 block - low 5849 eor $output_h0, $output_h0, $rk14_h @ AES final-2 block - round 14 high 5850#ifdef __AARCH64EB__ 5851 rev $output_h0, $output_h0 5852#endif 5853 .L256_dec_blocks_more_than_2: @ blocks left > 2 5854 5855 rev64 $res0b, $res1b @ GHASH final-2 block 5856 ld1 { $res1b}, [$input_ptr], #16 @ AES final-1 block - load ciphertext 5857 5858 eor $res0b, $res0b, $t0.16b @ feed in partial tag 5859 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-2 block - store result 5860 5861 eor $ctr0b, $res1b, $ctr2b @ AES final-1 block - result 5862 5863 mov $rk4d, $res0.d[1] @ GHASH final-2 block - mid 5864 5865 pmull $rk3q1, $res0.1d, $h3.1d @ GHASH final-2 block - low 5866 5867 pmull2 $rk2q1, $res0.2d, $h3.2d @ GHASH final-2 block - high 5868 5869 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-2 block - mid 5870 mov $output_l0, $ctr0.d[0] @ AES final-1 block - mov low 5871 5872 mov $output_h0, $ctr0.d[1] @ AES final-1 block - mov high 5873 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-2 block - low 5874 movi $t0.8b, #0 @ suppress further partial tag feed in 5875 5876 pmull $rk4v.1q, $rk4v.1d, $h34k.1d @ GHASH final-2 block - mid 5877 5878 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-2 block - high 5879 eor $output_l0, $output_l0, $rk14_l @ AES final-1 block - round 14 low 5880#ifdef __AARCH64EB__ 5881 rev $output_l0, 
$output_l0 5882#endif 5883 5884 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-2 block - mid 5885 eor $output_h0, $output_h0, $rk14_h @ AES final-1 block - round 14 high 5886#ifdef __AARCH64EB__ 5887 rev $output_h0, $output_h0 5888#endif 5889 .L256_dec_blocks_more_than_1: @ blocks left > 1 5890 5891 stp $output_l0, $output_h0, [$output_ptr], #16 @ AES final-1 block - store result 5892 rev64 $res0b, $res1b @ GHASH final-1 block 5893 5894 ld1 { $res1b}, [$input_ptr], #16 @ AES final block - load ciphertext 5895 5896 eor $res0b, $res0b, $t0.16b @ feed in partial tag 5897 movi $t0.8b, #0 @ suppress further partial tag feed in 5898 5899 mov $rk4d, $res0.d[1] @ GHASH final-1 block - mid 5900 5901 eor $ctr0b, $res1b, $ctr3b @ AES final block - result 5902 5903 pmull2 $rk2q1, $res0.2d, $h2.2d @ GHASH final-1 block - high 5904 5905 eor $rk4v.8b, $rk4v.8b, $res0.8b @ GHASH final-1 block - mid 5906 5907 pmull $rk3q1, $res0.1d, $h2.1d @ GHASH final-1 block - low 5908 mov $output_l0, $ctr0.d[0] @ AES final block - mov low 5909 5910 ins $rk4v.d[1], $rk4v.d[0] @ GHASH final-1 block - mid 5911 5912 mov $output_h0, $ctr0.d[1] @ AES final block - mov high 5913 5914 pmull2 $rk4v.1q, $rk4v.2d, $h12k.2d @ GHASH final-1 block - mid 5915 eor $output_l0, $output_l0, $rk14_l @ AES final block - round 14 low 5916#ifdef __AARCH64EB__ 5917 rev $output_l0, $output_l0 5918#endif 5919 eor $acc_lb, $acc_lb, $rk3 @ GHASH final-1 block - low 5920 5921 eor $acc_hb, $acc_hb, $rk2 @ GHASH final-1 block - high 5922 5923 eor $acc_mb, $acc_mb, $rk4v.16b @ GHASH final-1 block - mid 5924 eor $output_h0, $output_h0, $rk14_h @ AES final block - round 14 high 5925#ifdef __AARCH64EB__ 5926 rev $output_h0, $output_h0 5927#endif 5928 .L256_dec_blocks_less_than_1: @ blocks left <= 1 5929 5930 and $bit_length, $bit_length, #127 @ bit_length %= 128 5931 mvn $rk14_h, xzr @ rk14_h = 0xffffffffffffffff 5932 5933 sub $bit_length, $bit_length, #128 @ bit_length -= 128 5934 mvn $rk14_l, xzr @ rk14_l = 0xffffffffffffffff 
5935 5936 ldp $end_input_ptr, $main_end_input_ptr, [$output_ptr] @ load existing bytes we need to not overwrite 5937 neg $bit_length, $bit_length @ bit_length = 128 - #bits in input (in range [1,128]) 5938 5939 and $bit_length, $bit_length, #127 @ bit_length %= 128 5940 5941 lsr $rk14_h, $rk14_h, $bit_length @ rk14_h is mask for top 64b of last block 5942 cmp $bit_length, #64 5943 5944 csel $ctr32x, $rk14_l, $rk14_h, lt 5945 csel $ctr96_b64x, $rk14_h, xzr, lt 5946 5947 fmov $ctr0d, $ctr32x @ ctr0b is mask for last block 5948 and $output_l0, $output_l0, $ctr32x 5949 5950 mov $ctr0.d[1], $ctr96_b64x 5951 bic $end_input_ptr, $end_input_ptr, $ctr32x @ mask out low existing bytes 5952 5953#ifndef __AARCH64EB__ 5954 rev $ctr32w, $rctr32w 5955#else 5956 mov $ctr32w, $rctr32w 5957#endif 5958 5959 bic $main_end_input_ptr, $main_end_input_ptr, $ctr96_b64x @ mask out high existing bytes 5960 5961 orr $output_l0, $output_l0, $end_input_ptr 5962 5963 and $output_h0, $output_h0, $ctr96_b64x 5964 5965 orr $output_h0, $output_h0, $main_end_input_ptr 5966 5967 and $res1b, $res1b, $ctr0b @ possibly partial last block has zeroes in highest bits 5968 5969 rev64 $res0b, $res1b @ GHASH final block 5970 5971 eor $res0b, $res0b, $t0.16b @ feed in partial tag 5972 5973 pmull $rk3q1, $res0.1d, $h1.1d @ GHASH final block - low 5974 5975 mov $t0d, $res0.d[1] @ GHASH final block - mid 5976 5977 eor $t0.8b, $t0.8b, $res0.8b @ GHASH final block - mid 5978 5979 pmull2 $rk2q1, $res0.2d, $h1.2d @ GHASH final block - high 5980 5981 pmull $t0.1q, $t0.1d, $h12k.1d @ GHASH final block - mid 5982 5983 eor $acc_hb, $acc_hb, $rk2 @ GHASH final block - high 5984 5985 eor $acc_lb, $acc_lb, $rk3 @ GHASH final block - low 5986 5987 eor $acc_mb, $acc_mb, $t0.16b @ GHASH final block - mid 5988 movi $mod_constant.8b, #0xc2 5989 5990 eor $t9.16b, $acc_lb, $acc_hb @ MODULO - karatsuba tidy up 5991 5992 shl $mod_constantd, $mod_constantd, #56 @ mod_constant 5993 5994 eor $acc_mb, $acc_mb, $t9.16b @ MODULO - 
karatsuba tidy up

	pmull $mod_t.1q, $acc_h.1d, $mod_constant.1d            @ MODULO - top 64b align with mid

	ext $acc_hb, $acc_hb, $acc_hb, #8                       @ MODULO - other top alignment

	eor $acc_mb, $acc_mb, $mod_t.16b                        @ MODULO - fold into mid

	eor $acc_mb, $acc_mb, $acc_hb                           @ MODULO - fold into mid

	pmull $mod_constant.1q, $acc_m.1d, $mod_constant.1d     @ MODULO - mid 64b align with low

	ext $acc_mb, $acc_mb, $acc_mb, #8                       @ MODULO - other mid alignment

	eor $acc_lb, $acc_lb, $mod_constant.16b                 @ MODULO - fold into low

	stp $output_l0, $output_h0, [$output_ptr]

	str $ctr32w, [$counter, #12]                            @ store the updated counter

	eor $acc_lb, $acc_lb, $acc_mb                           @ MODULO - fold into low
	ext $acc_lb, $acc_lb, $acc_lb, #8
	rev64 $acc_lb, $acc_lb
	mov x0, $len
	st1 { $acc_l.16b }, [$current_tag]

	ldp x21, x22, [sp, #16]
	ldp x23, x24, [sp, #32]
	ldp d8, d9, [sp, #48]
	ldp d10, d11, [sp, #64]
	ldp d12, d13, [sp, #80]
	ldp d14, d15, [sp, #96]
	ldp x19, x20, [sp], #112
	ret

.L256_dec_ret:
	mov w0, #0x0
	ret
.size aes_gcm_dec_256_kernel,.-aes_gcm_dec_256_kernel
___
}
}

# Trailer appended after the kernels: an identification string, an
# alignment directive and an #endif.
# NOTE(review): the matching #if is emitted earlier in $code and is not
# visible in this part of the file - confirm the pairing there.
$code.=<<___;
.asciz "GHASH for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
.align 2
#endif
___

# Post-process the accumulated $code one line at a time and print it.
# A $flavour containing "64" selects the 64-bit path, which only rewrites
# the comment marker.  Any other flavour selects the 32-bit path, which
# rewrites register names, comment markers, addressing suffixes and a few
# instructions.
if ($flavour =~ /64/) {                 ######## 64-bit code
    # Turn an operand string of the form "qN#lo|hi, qM#lo|hi" into
    # "ins vA.d[i],vB.d[j]".  Register numbers below 8 are kept, numbers
    # from 8 upwards are shifted up by 8; "lo" selects d[0], "hi" d[1].
    # When the operands do not match, the (false) match result is returned.
    # No call to this helper is visible in this section of the file.
    sub unvmov {
        my $arg=shift;

        $arg =~ m/q([0-9]+)#(lo|hi),\s*q([0-9]+)#(lo|hi)/o &&
        sprintf "ins v%d.d[%d],v%d.d[%d]",$1<8?$1:$1+8,($2 eq "lo")?0:1,
                                          $3<8?$3:$3+8,($4 eq "lo")?0:1;
    }
    foreach(split("\n",$code)) {
        s/@\s/\/\//o;                   # old->new style commentary
        print $_,"\n";
    }
} else {                                ######## 32-bit code
    # Rewrite "qD,qS[i]" (i in 0..3) as "vdup.32 qD,dN[j]" with
    # N = 2*S + (i>>1) and j = i&1.  When the operands do not match, the
    # (false) match result is returned, so the s///e caller substitutes
    # an empty string.
    sub unvdup32 {
        my $arg=shift;

        $arg =~ m/q([0-9]+),\s*q([0-9]+)\[([0-3])\]/o &&
        sprintf "vdup.32 q%d,d%d[%d]",$1,2*$2+($3>>1),$3&1;
    }
    # Hand-encode a pmull/pmull2 on three q registers as a 32-bit word and
    # emit it through INST(), low byte first.  $mnemonic is the captured
    # "pmull" or "pmull2"; $arg is the operand string "qD,qN,qM".  The low
    # three bits and bit 3 of each register number are placed in separate
    # fields of the word.  Nothing is formatted when the operands do not
    # match.
    sub unvpmullp64 {
        my ($mnemonic,$arg)=@_;

        if ($arg =~ m/q([0-9]+),\s*q([0-9]+),\s*q([0-9]+)/o) {
            my $word = 0xf2a00e00|(($1&7)<<13)|(($1&8)<<19)
                                 |(($2&7)<<17)|(($2&8)<<4)
                                 |(($3&7)<<1) |(($3&8)<<2);
            # a "2" in the mnemonic (pmull2) sets two extra bits
            $word |= 0x00010001 if ($mnemonic =~ /2/);
            # since ARMv7 instructions are always encoded little-endian.
            # correct solution is to use .inst directive, but older
            # assemblers don't implement it:-(
            sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
                        $word&0xff,($word>>8)&0xff,
                        ($word>>16)&0xff,($word>>24)&0xff,
                        $mnemonic,$arg;
        }
    }

    foreach(split("\n",$code)) {
        s/\b[wx]([0-9]+)\b/r$1/go;              # new->old registers
        s/\bv([0-9])\.[12468]+[bsd]\b/q$1/go;   # new->old registers
        s/\/\/\s?/@ /o;                         # new->old style commentary

        # fix up remaining new-style suffixes
        s/\],#[0-9]+/]!/o;

        # The substitutions below are chained with "or", so only the first
        # one that matches is applied to a given line.
        s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2 $1,#0/o                    or
        s/vdup\.32\s+(.*)/unvdup32($1)/geo                             or
        s/v?(pmull2?)\.p64\s+(.*)/unvpmullp64($1,$2)/geo               or
        s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo      or
        s/^(\s+)b\./$1b/o                                              or
        s/^(\s+)ret/$1bx\tlr/o;

        # "mov.<cond>" becomes "mov<cond>", preceded by an "it <cond>" line.
        if (s/^(\s+)mov\.([a-z]+)/$1mov$2/) {
            print " it $2\n";
        }
        s/__AARCH64E([BL])__/__ARME$1__/go;
        print $_,"\n";
    }
}

close STDOUT or die "error closing STDOUT: $!"; # enforce flush