#!/usr/bin/env perl
# Copyright 2020-2025 The OpenSSL Project Authors. All Rights Reserved.
#
# Licensed under the Apache License 2.0 (the "License"). You may not use
# this file except in compliance with the License. You can obtain a copy
# in the file LICENSE in the source distribution or at
# https://www.openssl.org/source/license.html

use strict;

my $output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
my $flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
my $xlate;

$0 =~ m/(.*[\/\\])[^\/\\]+$/; my $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate ) or
die "can't locate arm-xlate.pl";

open OUT,"| \"$^X\" $xlate $flavour $output"
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

my $code = data();
print $code;

close STDOUT or die "error closing STDOUT: $!"; # enforce flush

sub data
{
    local $/;
    return <DATA>;
}

__END__
// Copyright 2021-2025 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the OpenSSL license (the "License"). You may not use
// this file except in compliance with the License. You can obtain a copy
// in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html
//
// ====================================================================
// Written by Ben Avison <bavison@riscosopen.org> for the OpenSSL
// project. Rights for redistribution and usage in source and binary
// forms are granted according to the OpenSSL license.
// ====================================================================
//
// This implementation is a translation of bsaes-armv7 for AArch64.
// No attempt has been made to carry across the build switches for
// kernel targets, since the Linux kernel crypto support has moved on
// from when it was based on OpenSSL.

// A lot of hand-scheduling has been performed. Consequently, this code
// doesn't factor out neatly into macros in the same way that the
// AArch32 version did; there is little to be gained by wrapping it up
// in Perl, so it is presented as pure assembly.
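
// Note on the bit-sliced layout used throughout (an informal sketch of
// the Käsper-Schwabe construction that bsaes derives from):
//  - Eight 16-byte blocks are processed at once in v0-v7. The runs of
//    ushr/shl/and/eor below transpose them so that each register ends
//    up holding one bit position of every state byte rather than whole
//    bytes.
//  - In that form the AES S-box becomes a boolean circuit (the long
//    eor/and/orr/bsl sequences at .Ldec_sbox and .Lenc_sbox), and
//    (Inv)ShiftRows becomes a tbl byte permutation.
//  - _bsaes_key_convert pre-expands the round keys into the same
//    representation, 128 bytes per inner round key.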
#include "crypto/arm_arch.h"

.text

.extern AES_cbc_encrypt
.extern AES_encrypt
.extern AES_decrypt

.type   _bsaes_decrypt8,%function
.align  4
// On entry:
//   x9 -> key (previously expanded using _bsaes_key_convert)
//   x10 = number of rounds
//   v0-v7 input data
// On exit:
//   x9-x11 corrupted
//   other general-purpose registers preserved
//   v0-v7 output data
//   v11-v15 preserved
//   other SIMD registers corrupted
_bsaes_decrypt8:
        ldr     q8, [x9], #16
        adrp    x11, .LM0ISR
        add     x11, x11, #:lo12:.LM0ISR
        movi    v9.16b, #0x55
        ldr     q10, [x11], #16
        movi    v16.16b, #0x33
        movi    v17.16b, #0x0f
        sub     x10, x10, #1
        eor     v0.16b, v0.16b, v8.16b
        eor     v1.16b, v1.16b, v8.16b
        eor     v2.16b, v2.16b, v8.16b
        eor     v4.16b, v4.16b, v8.16b
        eor     v3.16b, v3.16b, v8.16b
        eor     v5.16b, v5.16b, v8.16b
        tbl     v0.16b, {v0.16b}, v10.16b
        tbl     v1.16b, {v1.16b}, v10.16b
        tbl     v2.16b, {v2.16b}, v10.16b
        tbl     v4.16b, {v4.16b}, v10.16b
        eor     v6.16b, v6.16b, v8.16b
        eor     v7.16b, v7.16b, v8.16b
        tbl     v3.16b, {v3.16b}, v10.16b
        tbl     v5.16b, {v5.16b}, v10.16b
        tbl     v6.16b, {v6.16b}, v10.16b
        ushr    v8.2d, v0.2d, #1
        tbl     v7.16b, {v7.16b}, v10.16b
        ushr    v10.2d, v4.2d, #1
        ushr    v18.2d, v2.2d, #1
        eor     v8.16b, v8.16b, v1.16b
        ushr    v19.2d, v6.2d, #1
        eor     v10.16b, v10.16b, v5.16b
        eor     v18.16b, v18.16b, v3.16b
        and     v8.16b, v8.16b, v9.16b
        eor     v19.16b, v19.16b, v7.16b
        and     v10.16b, v10.16b, v9.16b
        and     v18.16b, v18.16b, v9.16b
        eor     v1.16b, v1.16b, v8.16b
        shl     v8.2d, v8.2d, #1
        and     v9.16b, v19.16b, v9.16b
        eor     v5.16b, v5.16b, v10.16b
        shl     v10.2d, v10.2d, #1
        eor     v3.16b, v3.16b, v18.16b
        shl     v18.2d, v18.2d, #1
        eor     v0.16b, v0.16b, v8.16b
        shl     v8.2d, v9.2d, #1
        eor     v7.16b, v7.16b, v9.16b
        eor     v4.16b, v4.16b, v10.16b
        eor     v2.16b, v2.16b, v18.16b
        ushr    v9.2d, v1.2d, #2
        eor     v6.16b, v6.16b, v8.16b
        ushr    v8.2d, v0.2d, #2
        ushr    v10.2d, v5.2d, #2
        ushr    v18.2d, v4.2d, #2
        eor     v9.16b, v9.16b, v3.16b
        eor     v8.16b, v8.16b, v2.16b
        eor     v10.16b, v10.16b, v7.16b
        eor     v18.16b, v18.16b, v6.16b
        and     v9.16b, v9.16b, v16.16b
        and     v8.16b, v8.16b, v16.16b
        and     v10.16b, v10.16b, v16.16b
        and     v16.16b, v18.16b, v16.16b
        eor     v3.16b, v3.16b, v9.16b
        shl     v9.2d, v9.2d, #2
        eor     v2.16b, v2.16b, v8.16b
        shl     v8.2d, v8.2d, #2
        eor     v7.16b, v7.16b, v10.16b
        shl     v10.2d, v10.2d, #2
        eor     v6.16b, v6.16b, v16.16b
        shl     v16.2d, v16.2d, #2
        eor     v1.16b, v1.16b, v9.16b
        eor     v0.16b, v0.16b, v8.16b
        eor     v5.16b, v5.16b, v10.16b
        eor     v4.16b, v4.16b, v16.16b
        ushr    v8.2d, v3.2d, #4
        ushr    v9.2d, v2.2d, #4
        ushr    v10.2d, v1.2d, #4
        ushr    v16.2d, v0.2d, #4
        eor     v8.16b, v8.16b, v7.16b
        eor     v9.16b, v9.16b, v6.16b
        eor     v10.16b, v10.16b, v5.16b
        eor     v16.16b, v16.16b, v4.16b
        and     v8.16b, v8.16b, v17.16b
        and     v9.16b, v9.16b, v17.16b
        and     v10.16b, v10.16b, v17.16b
        and     v16.16b, v16.16b, v17.16b
        eor     v7.16b, v7.16b, v8.16b
        shl     v8.2d, v8.2d, #4
        eor     v6.16b, v6.16b, v9.16b
        shl     v9.2d, v9.2d, #4
        eor     v5.16b, v5.16b, v10.16b
        shl     v10.2d, v10.2d, #4
        eor     v4.16b, v4.16b, v16.16b
        shl     v16.2d, v16.2d, #4
        eor     v3.16b, v3.16b, v8.16b
        eor     v2.16b, v2.16b, v9.16b
        eor     v1.16b, v1.16b, v10.16b
        eor     v0.16b, v0.16b, v16.16b
        b       .Ldec_sbox
.align  4
.Ldec_loop:
        ld1     {v16.16b, v17.16b, v18.16b, v19.16b}, [x9], #64
        ldp     q8, q9, [x9], #32
        eor     v0.16b, v16.16b, v0.16b
        ldr     q10, [x9], #16
        eor     v1.16b, v17.16b, v1.16b
        ldr     q16, [x9], #16
        eor     v2.16b, v18.16b, v2.16b
        eor     v3.16b, v19.16b, v3.16b
        eor     v4.16b, v8.16b, v4.16b
        eor     v5.16b, v9.16b, v5.16b
        eor     v6.16b, v10.16b, v6.16b
        eor     v7.16b, v16.16b, v7.16b
        tbl     v0.16b, {v0.16b}, v28.16b
        tbl     v1.16b, {v1.16b}, v28.16b
        tbl     v2.16b, {v2.16b}, v28.16b
        tbl     v3.16b, {v3.16b}, v28.16b
        tbl     v4.16b, {v4.16b}, v28.16b
        tbl     v5.16b, {v5.16b}, v28.16b
        tbl     v6.16b, {v6.16b}, v28.16b
        tbl     v7.16b, {v7.16b}, v28.16b
.Ldec_sbox:
        eor     v1.16b, v1.16b, v4.16b
        eor     v3.16b, v3.16b, v4.16b
        subs    x10, x10, #1
        eor     v4.16b, v4.16b, v7.16b
        eor     v2.16b, v2.16b, v7.16b
        eor     v1.16b, v1.16b, v6.16b
        eor     v6.16b, v6.16b, v4.16b
        eor     v2.16b, v2.16b, v5.16b
        eor     v0.16b, v0.16b, v1.16b
        eor     v7.16b, v7.16b, v6.16b
        eor     v8.16b, v6.16b, v2.16b
        and     v9.16b, v4.16b, v6.16b
        eor     v10.16b, v2.16b, v6.16b
        eor     v3.16b, v3.16b, v0.16b
        eor     v5.16b, v5.16b, v0.16b
        eor     v16.16b, v7.16b, v4.16b
        eor     v17.16b, v4.16b, v0.16b
        and     v18.16b, v0.16b, v2.16b
        eor     v19.16b, v7.16b, v4.16b
        eor     v1.16b, v1.16b, v3.16b
        eor     v20.16b, v3.16b, v0.16b
        eor     v21.16b, v5.16b, v2.16b
        eor     v22.16b, v3.16b, v7.16b
        and     v8.16b, v17.16b, v8.16b
        orr     v17.16b, v3.16b, v5.16b
        eor     v23.16b, v1.16b, v6.16b
        eor     v24.16b, v20.16b, v16.16b
        eor     v25.16b, v1.16b, v5.16b
        orr     v26.16b, v20.16b, v21.16b
        and     v20.16b, v20.16b, v21.16b
        and     v27.16b, v7.16b, v1.16b
        eor     v21.16b, v21.16b, v23.16b
        orr     v28.16b, v16.16b, v23.16b
        orr     v29.16b, v22.16b, v25.16b
        eor     v26.16b, v26.16b, v8.16b
        and     v16.16b, v16.16b, v23.16b
        and     v22.16b, v22.16b, v25.16b
        and     v21.16b, v24.16b, v21.16b
        eor     v8.16b, v28.16b, v8.16b
        eor     v23.16b, v5.16b, v2.16b
        eor     v24.16b, v1.16b, v6.16b
        eor     v16.16b, v16.16b, v22.16b
        eor     v22.16b, v3.16b, v0.16b
        eor     v25.16b, v29.16b, v21.16b
        eor     v21.16b, v26.16b, v21.16b
        eor     v8.16b, v8.16b, v20.16b
        eor     v26.16b, v23.16b, v24.16b
        eor     v16.16b, v16.16b, v20.16b
        eor     v28.16b, v22.16b, v19.16b
        eor     v20.16b, v25.16b, v20.16b
        eor     v9.16b, v21.16b, v9.16b
        eor     v8.16b, v8.16b, v18.16b
        eor     v18.16b, v5.16b, v1.16b
        eor     v21.16b, v16.16b, v17.16b
        eor     v16.16b, v16.16b, v17.16b
        eor     v17.16b, v20.16b, v27.16b
        eor     v20.16b, v3.16b, v7.16b
        eor     v25.16b, v9.16b, v8.16b
        eor     v27.16b, v0.16b, v4.16b
        and     v29.16b, v9.16b, v17.16b
        eor     v30.16b, v8.16b, v29.16b
        eor     v31.16b, v21.16b, v29.16b
        eor     v29.16b, v21.16b, v29.16b
        bsl     v30.16b, v17.16b, v21.16b
        bsl     v31.16b, v9.16b, v8.16b
        bsl     v16.16b, v30.16b, v29.16b
        bsl     v21.16b, v29.16b, v30.16b
        eor     v8.16b, v31.16b, v30.16b
        and     v1.16b, v1.16b, v31.16b
        and     v9.16b, v16.16b, v31.16b
        and     v6.16b, v6.16b, v30.16b
        eor     v16.16b, v17.16b, v21.16b
        and     v4.16b, v4.16b, v30.16b
        eor     v17.16b, v8.16b, v30.16b
        and     v21.16b, v24.16b, v8.16b
        eor     v9.16b, v9.16b, v25.16b
        and     v19.16b, v19.16b, v8.16b
        eor     v24.16b, v30.16b, v16.16b
        eor     v25.16b, v30.16b, v16.16b
        and     v7.16b, v7.16b, v17.16b
        and     v10.16b, v10.16b, v16.16b
        eor     v29.16b, v9.16b, v16.16b
        eor     v30.16b, v31.16b, v9.16b
        and     v0.16b, v24.16b, v0.16b
        and     v9.16b, v18.16b, v9.16b
        and     v2.16b, v25.16b, v2.16b
        eor     v10.16b, v10.16b, v6.16b
        eor     v18.16b, v29.16b, v16.16b
        and     v5.16b, v30.16b, v5.16b
        eor     v24.16b, v8.16b, v29.16b
        and     v25.16b, v26.16b, v29.16b
        and     v26.16b, v28.16b, v29.16b
        eor     v8.16b, v8.16b, v29.16b
        eor     v17.16b, v17.16b, v18.16b
        eor     v5.16b, v1.16b, v5.16b
        and     v23.16b, v24.16b, v23.16b
        eor     v21.16b, v21.16b, v25.16b
        eor     v19.16b, v19.16b, v26.16b
        eor     v0.16b, v4.16b, v0.16b
        and     v3.16b, v17.16b, v3.16b
        eor     v1.16b, v9.16b, v1.16b
        eor     v9.16b, v25.16b, v23.16b
        eor     v5.16b, v5.16b, v21.16b
        eor     v2.16b, v6.16b, v2.16b
        and     v6.16b, v8.16b, v22.16b
        eor     v3.16b, v7.16b, v3.16b
        and     v8.16b, v20.16b, v18.16b
        eor     v10.16b, v10.16b, v9.16b
        eor     v0.16b, v0.16b, v19.16b
        eor     v9.16b, v1.16b, v9.16b
        eor     v1.16b, v2.16b, v21.16b
        eor     v3.16b, v3.16b, v19.16b
        and     v16.16b, v27.16b, v16.16b
        eor     v17.16b, v26.16b, v6.16b
        eor     v6.16b, v8.16b, v7.16b
        eor     v7.16b, v1.16b, v9.16b
        eor     v1.16b, v5.16b, v3.16b
        eor     v2.16b, v10.16b, v3.16b
        eor     v4.16b, v16.16b, v4.16b
        eor     v8.16b, v6.16b, v17.16b
        eor     v5.16b, v9.16b, v3.16b
        eor     v9.16b, v0.16b, v1.16b
        eor     v6.16b, v7.16b, v1.16b
        eor     v0.16b, v4.16b, v17.16b
        eor     v4.16b, v8.16b, v7.16b
        eor     v7.16b, v9.16b, v2.16b
        eor     v8.16b, v3.16b, v0.16b
        eor     v7.16b, v7.16b, v5.16b
        eor     v3.16b, v4.16b, v7.16b
        eor     v4.16b, v7.16b, v0.16b
        eor     v7.16b, v8.16b, v3.16b
        bcc     .Ldec_done
        ext     v8.16b, v0.16b, v0.16b, #8
        ext     v9.16b, v1.16b, v1.16b, #8
        ldr     q28, [x11]              // load from .LISR in common case (x10 > 0)
        ext     v10.16b, v6.16b, v6.16b, #8
        ext     v16.16b, v3.16b, v3.16b, #8
        ext     v17.16b, v5.16b, v5.16b, #8
        ext     v18.16b, v4.16b, v4.16b, #8
        eor     v8.16b, v8.16b, v0.16b
        eor     v9.16b, v9.16b, v1.16b
        eor     v10.16b, v10.16b, v6.16b
        eor     v16.16b, v16.16b, v3.16b
        eor     v17.16b, v17.16b, v5.16b
        ext     v19.16b, v2.16b, v2.16b, #8
        ext     v20.16b, v7.16b, v7.16b, #8
        eor     v18.16b, v18.16b, v4.16b
        eor     v6.16b, v6.16b, v8.16b
        eor     v8.16b, v2.16b, v10.16b
        eor     v4.16b, v4.16b, v9.16b
        eor     v2.16b, v19.16b, v2.16b
        eor     v9.16b, v20.16b, v7.16b
        eor     v0.16b, v0.16b, v16.16b
        eor     v1.16b, v1.16b, v16.16b
        eor     v6.16b, v6.16b, v17.16b
        eor     v8.16b, v8.16b, v16.16b
        eor     v7.16b, v7.16b, v18.16b
        eor     v4.16b, v4.16b, v16.16b
        eor     v2.16b, v3.16b, v2.16b
        eor     v1.16b, v1.16b, v17.16b
        eor     v3.16b, v5.16b, v9.16b
        eor     v5.16b, v8.16b, v17.16b
        eor     v7.16b, v7.16b, v17.16b
        ext     v8.16b, v0.16b, v0.16b, #12
        ext     v9.16b, v6.16b, v6.16b, #12
        ext     v10.16b, v4.16b, v4.16b, #12
        ext     v16.16b, v1.16b, v1.16b, #12
        ext     v17.16b, v5.16b, v5.16b, #12
        ext     v18.16b, v7.16b, v7.16b, #12
        eor     v0.16b, v0.16b, v8.16b
        eor     v6.16b, v6.16b, v9.16b
        eor     v4.16b, v4.16b, v10.16b
        ext     v19.16b, v2.16b, v2.16b, #12
        ext     v20.16b, v3.16b, v3.16b, #12
        eor     v1.16b, v1.16b, v16.16b
        eor     v5.16b, v5.16b, v17.16b
        eor     v7.16b, v7.16b, v18.16b
        eor     v2.16b, v2.16b, v19.16b
        eor     v16.16b, v16.16b, v0.16b
        eor     v3.16b, v3.16b, v20.16b
        eor     v17.16b, v17.16b, v4.16b
        eor     v10.16b, v10.16b, v6.16b
        ext     v0.16b, v0.16b, v0.16b, #8
        eor     v9.16b, v9.16b, v1.16b
        ext     v1.16b, v1.16b, v1.16b, #8
        eor     v8.16b, v8.16b, v3.16b
        eor     v16.16b, v16.16b, v3.16b
        eor     v18.16b, v18.16b, v5.16b
        eor     v19.16b, v19.16b, v7.16b
        ext     v21.16b, v5.16b, v5.16b, #8
        ext     v5.16b, v7.16b, v7.16b, #8
        eor     v7.16b, v20.16b, v2.16b
        ext     v4.16b, v4.16b, v4.16b, #8
        ext     v20.16b, v3.16b, v3.16b, #8
        eor     v17.16b, v17.16b, v3.16b
        ext     v2.16b, v2.16b, v2.16b, #8
        eor     v3.16b, v10.16b, v3.16b
        ext     v10.16b, v6.16b, v6.16b, #8
        eor     v0.16b, v0.16b, v8.16b
        eor     v1.16b, v1.16b, v16.16b
        eor     v5.16b, v5.16b, v18.16b
        eor     v3.16b, v3.16b, v4.16b
        eor     v7.16b, v20.16b, v7.16b
        eor     v6.16b, v2.16b, v19.16b
        eor     v4.16b, v21.16b, v17.16b
        eor     v2.16b, v10.16b, v9.16b
        bne     .Ldec_loop
        ldr     q28, [x11, #16]!        // load from .LISRM0 on last round (x10 == 0)
        b       .Ldec_loop
.align  4
.Ldec_done:
        ushr    v8.2d, v0.2d, #1
        movi    v9.16b, #0x55
        ldr     q10, [x9]
        ushr    v16.2d, v2.2d, #1
        movi    v17.16b, #0x33
        ushr    v18.2d, v6.2d, #1
        movi    v19.16b, #0x0f
        eor     v8.16b, v8.16b, v1.16b
        ushr    v20.2d, v3.2d, #1
        eor     v16.16b, v16.16b, v7.16b
        eor     v18.16b, v18.16b, v4.16b
        and     v8.16b, v8.16b, v9.16b
        eor     v20.16b, v20.16b, v5.16b
        and     v16.16b, v16.16b, v9.16b
        and     v18.16b, v18.16b, v9.16b
        shl     v21.2d, v8.2d, #1
        eor     v1.16b, v1.16b, v8.16b
        and     v8.16b, v20.16b, v9.16b
        eor     v7.16b, v7.16b, v16.16b
        shl     v9.2d, v16.2d, #1
        eor     v4.16b, v4.16b, v18.16b
        shl     v16.2d, v18.2d, #1
        eor     v0.16b, v0.16b, v21.16b
        shl     v18.2d, v8.2d, #1
        eor     v5.16b, v5.16b, v8.16b
        eor     v2.16b, v2.16b, v9.16b
        eor     v6.16b, v6.16b, v16.16b
        ushr    v8.2d, v1.2d, #2
        eor     v3.16b, v3.16b, v18.16b
        ushr    v9.2d, v0.2d, #2
        ushr    v16.2d, v7.2d, #2
        ushr    v18.2d, v2.2d, #2
        eor     v8.16b, v8.16b, v4.16b
        eor     v9.16b, v9.16b, v6.16b
        eor     v16.16b, v16.16b, v5.16b
        eor     v18.16b, v18.16b, v3.16b
        and     v8.16b, v8.16b, v17.16b
        and     v9.16b, v9.16b, v17.16b
        and     v16.16b, v16.16b, v17.16b
        and     v17.16b, v18.16b, v17.16b
        eor     v4.16b, v4.16b, v8.16b
        shl     v8.2d, v8.2d, #2
        eor     v6.16b, v6.16b, v9.16b
        shl     v9.2d, v9.2d, #2
        eor     v5.16b, v5.16b, v16.16b
        shl     v16.2d, v16.2d, #2
        eor     v3.16b, v3.16b, v17.16b
        shl     v17.2d, v17.2d, #2
        eor     v1.16b, v1.16b, v8.16b
        eor     v0.16b, v0.16b, v9.16b
        eor     v7.16b, v7.16b, v16.16b
        eor     v2.16b, v2.16b, v17.16b
        ushr    v8.2d, v4.2d, #4
        ushr    v9.2d, v6.2d, #4
        ushr    v16.2d, v1.2d, #4
        ushr    v17.2d, v0.2d, #4
        eor     v8.16b, v8.16b, v5.16b
        eor     v9.16b, v9.16b, v3.16b
        eor     v16.16b, v16.16b, v7.16b
        eor     v17.16b, v17.16b, v2.16b
        and     v8.16b, v8.16b, v19.16b
        and     v9.16b, v9.16b, v19.16b
        and     v16.16b, v16.16b, v19.16b
        and     v17.16b, v17.16b, v19.16b
        eor     v5.16b, v5.16b, v8.16b
        shl     v8.2d, v8.2d, #4
        eor     v3.16b, v3.16b, v9.16b
        shl     v9.2d, v9.2d, #4
        eor     v7.16b, v7.16b, v16.16b
        shl     v16.2d, v16.2d, #4
        eor     v2.16b, v2.16b, v17.16b
        shl     v17.2d, v17.2d, #4
        eor     v4.16b, v4.16b, v8.16b
        eor     v6.16b, v6.16b, v9.16b
        eor     v7.16b, v7.16b, v10.16b
        eor     v1.16b, v1.16b, v16.16b
        eor     v2.16b, v2.16b, v10.16b
        eor     v0.16b, v0.16b, v17.16b
        eor     v4.16b, v4.16b, v10.16b
        eor     v6.16b, v6.16b, v10.16b
        eor     v3.16b, v3.16b, v10.16b
        eor     v5.16b, v5.16b, v10.16b
        eor     v1.16b, v1.16b, v10.16b
        eor     v0.16b, v0.16b, v10.16b
        ret
.size   _bsaes_decrypt8,.-_bsaes_decrypt8

.rodata
.type   _bsaes_consts,%object
.align  6
_bsaes_consts:
// InvShiftRows constants
// Used in _bsaes_decrypt8, which assumes contiguity
// .LM0ISR used with round 0 key
// .LISR   used with middle round keys
// .LISRM0 used with final round key
.LM0ISR:
.quad   0x0a0e0206070b0f03, 0x0004080c0d010509
.LISR:
.quad   0x0504070602010003, 0x0f0e0d0c080b0a09
.LISRM0:
.quad   0x01040b0e0205080f, 0x0306090c00070a0d

// ShiftRows constants
// Used in _bsaes_encrypt8, which assumes contiguity
// .LM0SR used with round 0 key
// .LSR   used with middle round keys
// .LSRM0 used with final round key
.LM0SR:
.quad   0x0a0e02060f03070b, 0x0004080c05090d01
.LSR:
.quad   0x0504070600030201, 0x0f0e0d0c0a09080b
.LSRM0:
.quad   0x0304090e00050a0f, 0x01060b0c0207080d

.LM0_bigendian:
.quad   0x02060a0e03070b0f, 0x0004080c0105090d
.LM0_littleendian:
.quad   0x0105090d0004080c, 0x03070b0f02060a0e

// Used in ossl_bsaes_ctr32_encrypt_blocks, prior to dropping into
// _bsaes_encrypt8_alt, for round 0 key in place of .LM0SR
.LREVM0SR:
.quad   0x090d01050c000408, 0x03070b0f060a0e02

.align  6
.size   _bsaes_consts,.-_bsaes_consts

.previous

.type   _bsaes_encrypt8,%function
.align  4
// On entry:
//   x9 -> key (previously expanded using _bsaes_key_convert)
//   x10 = number of rounds
//   v0-v7 input data
// On exit:
//   x9-x11 corrupted
//   other general-purpose registers preserved
//   v0-v7 output data
//   v11-v15 preserved
//   other SIMD registers corrupted
_bsaes_encrypt8:
        ldr     q8, [x9], #16
        adrp    x11, .LM0SR
        add     x11, x11, #:lo12:.LM0SR
        ldr     q9, [x11], #16
_bsaes_encrypt8_alt:
        eor     v0.16b, v0.16b, v8.16b
        eor     v1.16b, v1.16b, v8.16b
        sub     x10, x10, #1
        eor     v2.16b, v2.16b, v8.16b
        eor     v4.16b, v4.16b, v8.16b
        eor     v3.16b, v3.16b, v8.16b
        eor     v5.16b, v5.16b, v8.16b
        tbl     v0.16b, {v0.16b}, v9.16b
        tbl     v1.16b, {v1.16b}, v9.16b
        tbl     v2.16b, {v2.16b}, v9.16b
        tbl     v4.16b, {v4.16b}, v9.16b
        eor     v6.16b, v6.16b, v8.16b
        eor     v7.16b, v7.16b, v8.16b
        tbl     v3.16b, {v3.16b}, v9.16b
        tbl     v5.16b, {v5.16b}, v9.16b
        tbl     v6.16b, {v6.16b}, v9.16b
        ushr    v8.2d, v0.2d, #1
        movi    v10.16b, #0x55
        tbl     v7.16b, {v7.16b}, v9.16b
        ushr    v9.2d, v4.2d, #1
        movi    v16.16b, #0x33
        ushr    v17.2d, v2.2d, #1
        eor     v8.16b, v8.16b, v1.16b
        movi    v18.16b, #0x0f
        ushr    v19.2d, v6.2d, #1
        eor     v9.16b, v9.16b, v5.16b
        eor     v17.16b, v17.16b, v3.16b
        and     v8.16b, v8.16b, v10.16b
        eor     v19.16b, v19.16b, v7.16b
        and     v9.16b, v9.16b, v10.16b
        and     v17.16b, v17.16b, v10.16b
        eor     v1.16b, v1.16b, v8.16b
        shl     v8.2d, v8.2d, #1
        and     v10.16b, v19.16b, v10.16b
        eor     v5.16b, v5.16b, v9.16b
        shl     v9.2d, v9.2d, #1
        eor     v3.16b, v3.16b, v17.16b
        shl     v17.2d, v17.2d, #1
        eor     v0.16b, v0.16b, v8.16b
        shl     v8.2d, v10.2d, #1
        eor     v7.16b, v7.16b, v10.16b
        eor     v4.16b, v4.16b, v9.16b
        eor     v2.16b, v2.16b, v17.16b
        ushr    v9.2d, v1.2d, #2
        eor     v6.16b, v6.16b, v8.16b
        ushr    v8.2d, v0.2d, #2
        ushr    v10.2d, v5.2d, #2
        ushr    v17.2d, v4.2d, #2
        eor     v9.16b, v9.16b, v3.16b
        eor     v8.16b, v8.16b, v2.16b
        eor     v10.16b, v10.16b, v7.16b
        eor     v17.16b, v17.16b, v6.16b
        and     v9.16b, v9.16b, v16.16b
        and     v8.16b, v8.16b, v16.16b
        and     v10.16b, v10.16b, v16.16b
        and     v16.16b, v17.16b, v16.16b
        eor     v3.16b, v3.16b, v9.16b
        shl     v9.2d, v9.2d, #2
        eor     v2.16b, v2.16b, v8.16b
        shl     v8.2d, v8.2d, #2
        eor     v7.16b, v7.16b, v10.16b
        shl     v10.2d, v10.2d, #2
        eor     v6.16b, v6.16b, v16.16b
        shl     v16.2d, v16.2d, #2
        eor     v1.16b, v1.16b, v9.16b
        eor     v0.16b, v0.16b, v8.16b
        eor     v5.16b, v5.16b, v10.16b
        eor     v4.16b, v4.16b, v16.16b
        ushr    v8.2d, v3.2d, #4
        ushr    v9.2d, v2.2d, #4
        ushr    v10.2d, v1.2d, #4
        ushr    v16.2d, v0.2d, #4
        eor     v8.16b, v8.16b, v7.16b
        eor     v9.16b, v9.16b, v6.16b
        eor     v10.16b, v10.16b, v5.16b
        eor     v16.16b, v16.16b, v4.16b
        and     v8.16b, v8.16b, v18.16b
        and     v9.16b, v9.16b, v18.16b
        and     v10.16b, v10.16b, v18.16b
        and     v16.16b, v16.16b, v18.16b
        eor     v7.16b, v7.16b, v8.16b
        shl     v8.2d, v8.2d, #4
        eor     v6.16b, v6.16b, v9.16b
        shl     v9.2d, v9.2d, #4
        eor     v5.16b, v5.16b, v10.16b
        shl     v10.2d, v10.2d, #4
        eor     v4.16b, v4.16b, v16.16b
        shl     v16.2d, v16.2d, #4
        eor     v3.16b, v3.16b, v8.16b
        eor     v2.16b, v2.16b, v9.16b
        eor     v1.16b, v1.16b, v10.16b
        eor     v0.16b, v0.16b, v16.16b
        b       .Lenc_sbox
.align  4
.Lenc_loop:
        ld1     {v16.16b, v17.16b, v18.16b, v19.16b}, [x9], #64
        ldp     q8, q9, [x9], #32
        eor     v0.16b, v16.16b, v0.16b
        ldr     q10, [x9], #16
        eor     v1.16b, v17.16b, v1.16b
        ldr     q16, [x9], #16
        eor     v2.16b, v18.16b, v2.16b
        eor     v3.16b, v19.16b, v3.16b
        eor     v4.16b, v8.16b, v4.16b
        eor     v5.16b, v9.16b, v5.16b
        eor     v6.16b, v10.16b, v6.16b
        eor     v7.16b, v16.16b, v7.16b
        tbl     v0.16b, {v0.16b}, v28.16b
        tbl     v1.16b, {v1.16b}, v28.16b
        tbl     v2.16b, {v2.16b}, v28.16b
        tbl     v3.16b, {v3.16b}, v28.16b
        tbl     v4.16b, {v4.16b}, v28.16b
        tbl     v5.16b, {v5.16b}, v28.16b
        tbl     v6.16b, {v6.16b}, v28.16b
        tbl     v7.16b, {v7.16b}, v28.16b
.Lenc_sbox:
        eor     v5.16b, v5.16b, v6.16b
        eor     v3.16b, v3.16b, v0.16b
        subs    x10, x10, #1
        eor     v2.16b, v2.16b, v1.16b
        eor     v5.16b, v5.16b, v0.16b
        eor     v8.16b, v3.16b, v7.16b
        eor     v6.16b, v6.16b, v2.16b
        eor     v7.16b, v7.16b, v5.16b
        eor     v8.16b, v8.16b, v4.16b
        eor     v3.16b, v6.16b, v3.16b
        eor     v4.16b, v4.16b, v5.16b
        eor     v6.16b, v1.16b, v5.16b
        eor     v2.16b, v2.16b, v7.16b
        eor     v1.16b, v8.16b, v1.16b
        eor     v8.16b, v7.16b, v4.16b
        eor     v9.16b, v3.16b, v0.16b
        eor     v10.16b, v7.16b, v6.16b
        eor     v16.16b, v5.16b, v3.16b
        eor     v17.16b, v6.16b, v2.16b
        eor     v18.16b, v5.16b, v1.16b
        eor     v19.16b, v2.16b, v4.16b
        eor     v20.16b, v1.16b, v0.16b
        orr     v21.16b, v8.16b, v9.16b
        orr     v22.16b, v10.16b, v16.16b
        eor     v23.16b, v8.16b, v17.16b
        eor     v24.16b, v9.16b, v18.16b
        and     v19.16b, v19.16b, v20.16b
        orr     v20.16b, v17.16b, v18.16b
        and     v8.16b, v8.16b, v9.16b
        and     v9.16b, v17.16b, v18.16b
        and     v17.16b, v23.16b, v24.16b
        and     v10.16b, v10.16b, v16.16b
        eor     v16.16b, v21.16b, v19.16b
        eor     v18.16b, v20.16b, v19.16b
        and     v19.16b, v2.16b, v1.16b
        and     v20.16b, v6.16b, v5.16b
        eor     v21.16b, v22.16b, v17.16b
        eor     v9.16b, v9.16b, v10.16b
        eor     v10.16b, v16.16b, v17.16b
        eor     v16.16b, v18.16b, v8.16b
        and     v17.16b, v4.16b, v0.16b
        orr     v18.16b, v7.16b, v3.16b
        eor     v21.16b, v21.16b, v8.16b
        eor     v8.16b, v9.16b, v8.16b
        eor     v9.16b, v10.16b, v19.16b
        eor     v10.16b, v3.16b, v0.16b
        eor     v16.16b, v16.16b, v17.16b
        eor     v17.16b, v5.16b, v1.16b
        eor     v19.16b, v21.16b, v20.16b
        eor     v20.16b, v8.16b, v18.16b
        eor     v8.16b, v8.16b, v18.16b
        eor     v18.16b, v7.16b, v4.16b
        eor     v21.16b, v9.16b, v16.16b
        eor     v22.16b, v6.16b, v2.16b
        and     v23.16b, v9.16b, v19.16b
        eor     v24.16b, v10.16b, v17.16b
        eor     v25.16b, v0.16b, v1.16b
        eor     v26.16b, v7.16b, v6.16b
        eor     v27.16b, v18.16b, v22.16b
        eor     v28.16b, v3.16b, v5.16b
        eor     v29.16b, v16.16b, v23.16b
        eor     v30.16b, v20.16b, v23.16b
        eor     v23.16b, v20.16b, v23.16b
        eor     v31.16b, v4.16b, v2.16b
        bsl     v29.16b, v19.16b, v20.16b
        bsl     v30.16b, v9.16b, v16.16b
        bsl     v8.16b, v29.16b, v23.16b
        bsl     v20.16b, v23.16b, v29.16b
        eor     v9.16b, v30.16b, v29.16b
        and     v5.16b, v5.16b, v30.16b
        and     v8.16b, v8.16b, v30.16b
        and     v1.16b, v1.16b, v29.16b
        eor     v16.16b, v19.16b, v20.16b
        and     v2.16b, v2.16b, v29.16b
        eor     v19.16b, v9.16b, v29.16b
        and     v17.16b, v17.16b, v9.16b
        eor     v8.16b, v8.16b, v21.16b
        and     v20.16b, v22.16b, v9.16b
        eor     v21.16b, v29.16b, v16.16b
        eor     v22.16b, v29.16b, v16.16b
        and     v23.16b, v25.16b, v16.16b
        and     v6.16b, v6.16b, v19.16b
        eor     v25.16b, v8.16b, v16.16b
        eor     v29.16b, v30.16b, v8.16b
        and     v4.16b, v21.16b, v4.16b
        and     v8.16b, v28.16b, v8.16b
        and     v0.16b, v22.16b, v0.16b
        eor     v21.16b, v23.16b, v1.16b
        eor     v22.16b, v9.16b, v25.16b
        eor     v9.16b, v9.16b, v25.16b
        eor     v23.16b, v25.16b, v16.16b
        and     v3.16b, v29.16b, v3.16b
        and     v24.16b, v24.16b, v25.16b
        and     v25.16b, v27.16b, v25.16b
        and     v10.16b, v22.16b, v10.16b
        and     v9.16b, v9.16b, v18.16b
        eor     v18.16b, v19.16b, v23.16b
        and     v19.16b, v26.16b, v23.16b
        eor     v3.16b, v5.16b, v3.16b
        eor     v17.16b, v17.16b, v24.16b
        eor     v10.16b, v24.16b, v10.16b
        and     v16.16b, v31.16b, v16.16b
        eor     v20.16b, v20.16b, v25.16b
        eor     v9.16b, v25.16b, v9.16b
        eor     v4.16b, v2.16b, v4.16b
        and     v7.16b, v18.16b, v7.16b
        eor     v18.16b, v19.16b, v6.16b
        eor     v5.16b, v8.16b, v5.16b
        eor     v0.16b, v1.16b, v0.16b
        eor     v1.16b, v21.16b, v10.16b
        eor     v8.16b, v3.16b, v17.16b
        eor     v2.16b, v16.16b, v2.16b
        eor     v3.16b, v6.16b, v7.16b
        eor     v6.16b, v18.16b, v9.16b
        eor     v4.16b, v4.16b, v20.16b
        eor     v10.16b, v5.16b, v10.16b
        eor     v0.16b, v0.16b, v17.16b
        eor     v9.16b, v2.16b, v9.16b
        eor     v3.16b, v3.16b, v20.16b
        eor     v7.16b, v6.16b, v1.16b
        eor     v5.16b, v8.16b, v4.16b
        eor     v6.16b, v10.16b, v1.16b
        eor     v2.16b, v4.16b, v0.16b
        eor     v4.16b, v3.16b, v10.16b
        eor     v9.16b, v9.16b, v7.16b
        eor     v3.16b, v0.16b, v5.16b
        eor     v0.16b, v1.16b, v4.16b
        eor     v1.16b, v4.16b, v8.16b
        eor     v4.16b, v9.16b, v5.16b
        eor     v6.16b, v6.16b, v3.16b
        bcc     .Lenc_done
        ext     v8.16b, v0.16b, v0.16b, #12
        ext     v9.16b, v4.16b, v4.16b, #12
        ldr     q28, [x11]
        ext     v10.16b, v6.16b, v6.16b, #12
        ext     v16.16b, v1.16b, v1.16b, #12
        ext     v17.16b, v3.16b, v3.16b, #12
        ext     v18.16b, v7.16b, v7.16b, #12
        eor     v0.16b, v0.16b, v8.16b
        eor     v4.16b, v4.16b, v9.16b
        eor     v6.16b, v6.16b, v10.16b
        ext     v19.16b, v2.16b, v2.16b, #12
        ext     v20.16b, v5.16b, v5.16b, #12
        eor     v1.16b, v1.16b, v16.16b
        eor     v3.16b, v3.16b, v17.16b
        eor     v7.16b, v7.16b, v18.16b
        eor     v2.16b, v2.16b, v19.16b
        eor     v16.16b, v16.16b, v0.16b
        eor     v5.16b, v5.16b, v20.16b
        eor     v17.16b, v17.16b, v6.16b
        eor     v10.16b, v10.16b, v4.16b
        ext     v0.16b, v0.16b, v0.16b, #8
        eor     v9.16b, v9.16b, v1.16b
        ext     v1.16b, v1.16b, v1.16b, #8
        eor     v8.16b, v8.16b, v5.16b
        eor     v16.16b, v16.16b, v5.16b
        eor     v18.16b, v18.16b, v3.16b
        eor     v19.16b, v19.16b, v7.16b
        ext     v3.16b, v3.16b, v3.16b, #8
        ext     v7.16b, v7.16b, v7.16b, #8
        eor     v20.16b, v20.16b, v2.16b
        ext     v6.16b, v6.16b, v6.16b, #8
        ext     v21.16b, v5.16b, v5.16b, #8
        eor     v17.16b, v17.16b, v5.16b
        ext     v2.16b, v2.16b, v2.16b, #8
        eor     v10.16b, v10.16b, v5.16b
        ext     v22.16b, v4.16b, v4.16b, #8
        eor     v0.16b, v0.16b, v8.16b
        eor     v1.16b, v1.16b, v16.16b
        eor     v5.16b, v7.16b, v18.16b
        eor     v4.16b, v3.16b, v17.16b
        eor     v3.16b, v6.16b, v10.16b
        eor     v7.16b, v21.16b, v20.16b
        eor     v6.16b, v2.16b, v19.16b
        eor     v2.16b, v22.16b, v9.16b
        bne     .Lenc_loop
        ldr     q28, [x11, #16]!        // load from .LSRM0 on last round (x10 == 0)
        b       .Lenc_loop
.align  4
.Lenc_done:
        ushr    v8.2d, v0.2d, #1
        movi    v9.16b, #0x55
        ldr     q10, [x9]
        ushr    v16.2d, v3.2d, #1
        movi    v17.16b, #0x33
        ushr    v18.2d, v4.2d, #1
        movi    v19.16b, #0x0f
        eor     v8.16b, v8.16b, v1.16b
        ushr    v20.2d, v2.2d, #1
        eor     v16.16b, v16.16b, v7.16b
        eor     v18.16b, v18.16b, v6.16b
        and     v8.16b, v8.16b, v9.16b
        eor     v20.16b, v20.16b, v5.16b
        and     v16.16b, v16.16b, v9.16b
        and     v18.16b, v18.16b, v9.16b
        shl     v21.2d, v8.2d, #1
        eor     v1.16b, v1.16b, v8.16b
        and     v8.16b, v20.16b, v9.16b
        eor     v7.16b, v7.16b, v16.16b
        shl     v9.2d, v16.2d, #1
        eor     v6.16b, v6.16b, v18.16b
        shl     v16.2d, v18.2d, #1
        eor     v0.16b, v0.16b, v21.16b
        shl     v18.2d, v8.2d, #1
        eor     v5.16b, v5.16b, v8.16b
        eor     v3.16b, v3.16b, v9.16b
        eor     v4.16b, v4.16b, v16.16b
        ushr    v8.2d, v1.2d, #2
        eor     v2.16b, v2.16b, v18.16b
        ushr    v9.2d, v0.2d, #2
        ushr    v16.2d, v7.2d, #2
        ushr    v18.2d, v3.2d, #2
        eor     v8.16b, v8.16b, v6.16b
        eor     v9.16b, v9.16b, v4.16b
        eor     v16.16b, v16.16b, v5.16b
        eor     v18.16b, v18.16b, v2.16b
        and     v8.16b, v8.16b, v17.16b
        and     v9.16b, v9.16b, v17.16b
        and     v16.16b, v16.16b, v17.16b
        and     v17.16b, v18.16b, v17.16b
        eor     v6.16b, v6.16b, v8.16b
        shl     v8.2d, v8.2d, #2
        eor     v4.16b, v4.16b, v9.16b
        shl     v9.2d, v9.2d, #2
        eor     v5.16b, v5.16b, v16.16b
        shl     v16.2d, v16.2d, #2
        eor     v2.16b, v2.16b, v17.16b
        shl     v17.2d, v17.2d, #2
        eor     v1.16b, v1.16b, v8.16b
        eor     v0.16b, v0.16b, v9.16b
        eor     v7.16b, v7.16b, v16.16b
        eor     v3.16b, v3.16b, v17.16b
        ushr    v8.2d, v6.2d, #4
        ushr    v9.2d, v4.2d, #4
        ushr    v16.2d, v1.2d, #4
        ushr    v17.2d, v0.2d, #4
        eor     v8.16b, v8.16b, v5.16b
        eor     v9.16b, v9.16b, v2.16b
        eor     v16.16b, v16.16b, v7.16b
        eor     v17.16b, v17.16b, v3.16b
        and     v8.16b, v8.16b, v19.16b
        and     v9.16b, v9.16b, v19.16b
        and     v16.16b, v16.16b, v19.16b
        and     v17.16b, v17.16b, v19.16b
        eor     v5.16b, v5.16b, v8.16b
        shl     v8.2d, v8.2d, #4
        eor     v2.16b, v2.16b, v9.16b
        shl     v9.2d, v9.2d, #4
        eor     v7.16b, v7.16b, v16.16b
        shl     v16.2d, v16.2d, #4
        eor     v3.16b, v3.16b, v17.16b
        shl     v17.2d, v17.2d, #4
        eor     v6.16b, v6.16b, v8.16b
        eor     v4.16b, v4.16b, v9.16b
        eor     v7.16b, v7.16b, v10.16b
        eor     v1.16b, v1.16b, v16.16b
        eor     v3.16b, v3.16b, v10.16b
        eor     v0.16b, v0.16b, v17.16b
        eor     v6.16b, v6.16b, v10.16b
        eor     v4.16b, v4.16b, v10.16b
        eor     v2.16b, v2.16b, v10.16b
        eor     v5.16b, v5.16b, v10.16b
        eor     v1.16b, v1.16b, v10.16b
        eor     v0.16b, v0.16b, v10.16b
        ret
.size   _bsaes_encrypt8,.-_bsaes_encrypt8

.type   _bsaes_key_convert,%function
.align  4
// On entry:
//   x9 -> input key (big-endian)
//   x10 = number of rounds
//   x17 -> output key (native endianness)
// On exit:
//   x9, x10 corrupted
//   x11 -> .LM0_bigendian
//   x17 -> last quadword of output key
//   other general-purpose registers preserved
//   v2-v6 preserved
//   v7.16b[] = 0x63
//   v8-v14 preserved
//   v15 = last round key (converted to native endianness)
//   other SIMD registers corrupted
_bsaes_key_convert:
#ifdef __AARCH64EL__
        adrp    x11, .LM0_littleendian
        add     x11, x11, #:lo12:.LM0_littleendian
#else
        adrp    x11, .LM0_bigendian
        add     x11, x11, #:lo12:.LM0_bigendian
#endif
        ldr     q0, [x9], #16           // load round 0 key
        ldr     q1, [x11]               // .LM0
        ldr     q15, [x9], #16          // load round 1 key

        movi    v7.16b, #0x63           // compose .L63
        movi    v16.16b, #0x01          // bit masks
        movi    v17.16b, #0x02
        movi    v18.16b, #0x04
        movi    v19.16b, #0x08
        movi    v20.16b, #0x10
        movi    v21.16b, #0x20
        movi    v22.16b, #0x40
        movi    v23.16b, #0x80

#ifdef __AARCH64EL__
        rev32   v0.16b, v0.16b
#endif
        sub     x10, x10, #1
        str     q0, [x17], #16          // save round 0 key

.align  4
.Lkey_loop:
        tbl     v0.16b, {v15.16b}, v1.16b
        ldr     q15, [x9], #16          // load next round key

        eor     v0.16b, v0.16b, v7.16b
        cmtst   v24.16b, v0.16b, v16.16b
        cmtst   v25.16b, v0.16b, v17.16b
        cmtst   v26.16b, v0.16b, v18.16b
        cmtst   v27.16b, v0.16b, v19.16b
        cmtst   v28.16b, v0.16b, v20.16b
        cmtst   v29.16b, v0.16b, v21.16b
        cmtst   v30.16b, v0.16b, v22.16b
        cmtst   v31.16b, v0.16b, v23.16b
        sub     x10, x10, #1
        st1     {v24.16b-v27.16b}, [x17], #64   // write bit-sliced round key
        st1     {v28.16b-v31.16b}, [x17], #64
        cbnz    x10, .Lkey_loop

        // don't save last round key
#ifdef __AARCH64EL__
        rev32   v15.16b, v15.16b
        adrp    x11, .LM0_bigendian
        add     x11, x11, #:lo12:.LM0_bigendian
#endif
        ret
.size   _bsaes_key_convert,.-_bsaes_key_convert

.globl  ossl_bsaes_cbc_encrypt
.type   ossl_bsaes_cbc_encrypt,%function
.align  4
// On entry:
//   x0 -> input ciphertext
//   x1 -> output plaintext
//   x2 = size of ciphertext and plaintext in bytes (assumed a multiple of 16)
//   x3 -> key
//   x4 -> 128-bit initialisation vector (or preceding 128-bit block of
//         ciphertext if continuing after an earlier call)
//   w5 must be == 0
// On exit:
//   Output plaintext filled in
//   Initialisation vector overwritten with last quadword of ciphertext
//   No output registers, usual AAPCS64 register preservation
ossl_bsaes_cbc_encrypt:
        AARCH64_VALID_CALL_TARGET
        cmp     x2, #128
        bhs     .Lcbc_do_bsaes
        b       AES_cbc_encrypt
.Lcbc_do_bsaes:

        // it is up to the caller to make sure we are called with enc == 0

        stp     x29, x30, [sp, #-48]!
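        // Only d8-d10 and d15 of the callee-saved SIMD registers are
        // clobbered on this path, so just those are spilled (see the
        // matching restores at .Lcbc_dec_done).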
        stp     d8, d9, [sp, #16]
        stp     d10, d15, [sp, #32]
        lsr     x2, x2, #4              // len in 16 byte blocks

        ldr     w15, [x3, #240]         // get # of rounds
        mov     x14, sp

        // allocate the key schedule on the stack
        add     x17, sp, #96
        sub     x17, x17, x15, lsl #7   // 128 bytes per inner round key, less 96 bytes

        // populate the key schedule
        mov     x9, x3                  // pass key
        mov     x10, x15                // pass # of rounds
        mov     sp, x17                 // sp now points at the key schedule
        bl      _bsaes_key_convert
        ldr     q6, [sp]
        str     q15, [x17]              // save last round key
        eor     v6.16b, v6.16b, v7.16b  // fix up round 0 key (by XORing with 0x63)
        str     q6, [sp]

        ldr     q15, [x4]               // load IV
        b       .Lcbc_dec_loop

.align  4
.Lcbc_dec_loop:
        subs    x2, x2, #0x8
        bmi     .Lcbc_dec_loop_finish

        ldr     q0, [x0], #16           // load input
        mov     x9, sp                  // pass the key
        ldr     q1, [x0], #16
        mov     x10, x15
        ldr     q2, [x0], #16
        ldr     q3, [x0], #16
        ldr     q4, [x0], #16
        ldr     q5, [x0], #16
        ldr     q6, [x0], #16
        ldr     q7, [x0], #-7*16

        bl      _bsaes_decrypt8

        ldr     q16, [x0], #16          // reload input
        eor     v0.16b, v0.16b, v15.16b // ^= IV
        eor     v1.16b, v1.16b, v16.16b
        str     q0, [x1], #16           // write output
        ldr     q0, [x0], #16
        str     q1, [x1], #16
        ldr     q1, [x0], #16
        eor     v1.16b, v4.16b, v1.16b
        ldr     q4, [x0], #16
        eor     v2.16b, v2.16b, v4.16b
        eor     v0.16b, v6.16b, v0.16b
        ldr     q4, [x0], #16
        str     q0, [x1], #16
        str     q1, [x1], #16
        eor     v0.16b, v7.16b, v4.16b
        ldr     q1, [x0], #16
        str     q2, [x1], #16
        ldr     q2, [x0], #16
        ldr     q15, [x0], #16
        str     q0, [x1], #16
        eor     v0.16b, v5.16b, v2.16b
        eor     v1.16b, v3.16b, v1.16b
        str     q1, [x1], #16
        str     q0, [x1], #16

        b       .Lcbc_dec_loop

.Lcbc_dec_loop_finish:
        adds    x2, x2, #8
        beq     .Lcbc_dec_done

        ldr     q0, [x0], #16           // load input
        cmp     x2, #2
        blo     .Lcbc_dec_one
        ldr     q1, [x0], #16
        mov     x9, sp                  // pass the key
        mov     x10, x15
        beq     .Lcbc_dec_two
        ldr     q2, [x0], #16
        cmp     x2, #4
        blo     .Lcbc_dec_three
        ldr     q3, [x0], #16
        beq     .Lcbc_dec_four
        ldr     q4, [x0], #16
        cmp     x2, #6
        blo     .Lcbc_dec_five
        ldr     q5, [x0], #16
        beq     .Lcbc_dec_six
        ldr     q6, [x0], #-6*16

        bl      _bsaes_decrypt8

        ldr     q5, [x0], #16           // reload input
        eor     v0.16b, v0.16b, v15.16b // ^= IV
        ldr     q8, [x0], #16
        ldr     q9, [x0], #16
        ldr     q10, [x0], #16
        str     q0, [x1], #16           // write output
        ldr     q0, [x0], #16
        eor     v1.16b, v1.16b, v5.16b
        ldr     q5, [x0], #16
        eor     v6.16b, v6.16b, v8.16b
        ldr     q15, [x0]
        eor     v4.16b, v4.16b, v9.16b
        eor     v2.16b, v2.16b, v10.16b
        str     q1, [x1], #16
        eor     v0.16b, v7.16b, v0.16b
        str     q6, [x1], #16
        eor     v1.16b, v3.16b, v5.16b
        str     q4, [x1], #16
        str     q2, [x1], #16
        str     q0, [x1], #16
        str     q1, [x1]
        b       .Lcbc_dec_done
.align  4
.Lcbc_dec_six:
        sub     x0, x0, #0x60
        bl      _bsaes_decrypt8
        ldr     q3, [x0], #16           // reload input
        eor     v0.16b, v0.16b, v15.16b // ^= IV
        ldr     q5, [x0], #16
        ldr     q8, [x0], #16
        ldr     q9, [x0], #16
        str     q0, [x1], #16           // write output
        ldr     q0, [x0], #16
        eor     v1.16b, v1.16b, v3.16b
        ldr     q15, [x0]
        eor     v3.16b, v6.16b, v5.16b
        eor     v4.16b, v4.16b, v8.16b
        eor     v2.16b, v2.16b, v9.16b
        str     q1, [x1], #16
        eor     v0.16b, v7.16b, v0.16b
        str     q3, [x1], #16
        str     q4, [x1], #16
        str     q2, [x1], #16
        str     q0, [x1]
        b       .Lcbc_dec_done
.align  4
.Lcbc_dec_five:
        sub     x0, x0, #0x50
        bl      _bsaes_decrypt8
        ldr     q3, [x0], #16           // reload input
        eor     v0.16b, v0.16b, v15.16b // ^= IV
        ldr     q5, [x0], #16
        ldr     q7, [x0], #16
        ldr     q8, [x0], #16
        str     q0, [x1], #16           // write output
        ldr     q15, [x0]
        eor     v0.16b, v1.16b, v3.16b
        eor     v1.16b, v6.16b, v5.16b
        eor     v3.16b, v4.16b, v7.16b
        str     q0, [x1], #16
        eor     v0.16b, v2.16b, v8.16b
        str     q1, [x1], #16
        str     q3, [x1], #16
        str     q0, [x1]
        b       .Lcbc_dec_done
.align  4
.Lcbc_dec_four:
        sub     x0, x0, #0x40
        bl      _bsaes_decrypt8
        ldr     q2, [x0], #16           // reload input
        eor     v0.16b, v0.16b, v15.16b // ^= IV
        ldr     q3, [x0], #16
        ldr     q5, [x0], #16
        str     q0, [x1], #16           // write output
        ldr     q15, [x0]
        eor     v0.16b, v1.16b, v2.16b
        eor     v1.16b, v6.16b, v3.16b
        eor     v2.16b, v4.16b, v5.16b
        str     q0, [x1], #16
        str     q1, [x1], #16
        str     q2, [x1]
        b       .Lcbc_dec_done
.align  4
.Lcbc_dec_three:
        sub     x0, x0, #0x30
        bl      _bsaes_decrypt8
        ldr     q2, [x0], #16           // reload input
        eor     v0.16b, v0.16b, v15.16b // ^= IV
        ldr     q3, [x0], #16
        ldr     q15, [x0]
        str     q0, [x1], #16           // write output
        eor     v0.16b, v1.16b, v2.16b
        eor     v1.16b, v6.16b, v3.16b
        str     q0, [x1], #16
        str     q1, [x1]
        b       .Lcbc_dec_done
.align  4
.Lcbc_dec_two:
        sub     x0, x0, #0x20
        bl      _bsaes_decrypt8
        ldr     q2, [x0], #16           // reload input
        eor     v0.16b, v0.16b, v15.16b // ^= IV
        ldr     q15, [x0]
        str     q0, [x1], #16           // write output
        eor     v0.16b, v1.16b, v2.16b
        str     q0, [x1]
        b       .Lcbc_dec_done
.align  4
.Lcbc_dec_one:
        sub     x0, x0, #0x10
        stp     x1, x4, [sp, #-32]!
        str     x14, [sp, #16]
        mov     v8.16b, v15.16b
        mov     v15.16b, v0.16b
        mov     x2, x3
        bl      AES_decrypt
        ldr     x14, [sp, #16]
        ldp     x1, x4, [sp], #32
        ldr     q0, [x1]                // load result
        eor     v0.16b, v0.16b, v8.16b  // ^= IV
        str     q0, [x1]                // write output

.align  4
.Lcbc_dec_done:
        movi    v0.16b, #0
        movi    v1.16b, #0
.Lcbc_dec_bzero:                        // wipe key schedule [if any]
        stp     q0, q1, [sp], #32
        cmp     sp, x14
        bne     .Lcbc_dec_bzero
        str     q15, [x4]               // return IV
        ldp     d8, d9, [sp, #16]
        ldp     d10, d15, [sp, #32]
        ldp     x29, x30, [sp], #48
        ret
.size   ossl_bsaes_cbc_encrypt,.-ossl_bsaes_cbc_encrypt

.globl  ossl_bsaes_ctr32_encrypt_blocks
.type   ossl_bsaes_ctr32_encrypt_blocks,%function
.align  4
// On entry:
//   x0 -> input text (whole 16-byte blocks)
//   x1 -> output text (whole 16-byte blocks)
//   x2 = number of 16-byte blocks to encrypt/decrypt (> 0)
//   x3 -> key
//   x4 -> initial value of 128-bit counter (stored big-endian) which
//         increments, modulo 2^32, for each block
// On exit:
//   Output text filled in
//   No output registers, usual AAPCS64 register preservation
ossl_bsaes_ctr32_encrypt_blocks:
        AARCH64_VALID_CALL_TARGET
        cmp     x2, #8                  // use plain AES for
        blo     .Lctr_enc_short         // small sizes

        stp     x29, x30, [sp, #-80]!
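        // This path clobbers all of the callee-saved SIMD registers
        // (d8-d15), so the full set is spilled.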
        stp     d8, d9, [sp, #16]
        stp     d10, d11, [sp, #32]
        stp     d12, d13, [sp, #48]
        stp     d14, d15, [sp, #64]

        ldr     w15, [x3, #240]         // get # of rounds
        mov     x14, sp

        // allocate the key schedule on the stack
        add     x17, sp, #96
        sub     x17, x17, x15, lsl #7   // 128 bytes per inner round key, less 96 bytes

        // populate the key schedule
        mov     x9, x3                  // pass key
        mov     x10, x15                // pass # of rounds
        mov     sp, x17                 // sp now points at the key schedule
        bl      _bsaes_key_convert
        eor     v7.16b, v7.16b, v15.16b // fix up last round key
        str     q7, [x17]               // save last round key

        ldr     q0, [x4]                // load counter
        add     x13, x11, #.LREVM0SR-.LM0_bigendian
        ldr     q4, [sp]                // load round0 key

        movi    v8.4s, #1               // compose 1<<96
        movi    v9.16b, #0
        rev32   v15.16b, v0.16b
        rev32   v0.16b, v0.16b
        ext     v11.16b, v9.16b, v8.16b, #4
        rev32   v4.16b, v4.16b
        add     v12.4s, v11.4s, v11.4s  // compose 2<<96
        str     q4, [sp]                // save adjusted round0 key
        add     v13.4s, v11.4s, v12.4s  // compose 3<<96
        add     v14.4s, v12.4s, v12.4s  // compose 4<<96
        b       .Lctr_enc_loop

.align  4
.Lctr_enc_loop:
        // Intermix prologue from _bsaes_encrypt8 to use the opportunity
        // to flip byte order in 32-bit counter

        add     v1.4s, v15.4s, v11.4s   // +1
        add     x9, sp, #0x10           // pass next round key
        add     v2.4s, v15.4s, v12.4s   // +2
        ldr     q9, [x13]               // .LREVM0SR
        ldr     q8, [sp]                // load round0 key
        add     v3.4s, v15.4s, v13.4s   // +3
        mov     x10, x15                // pass rounds
        sub     x11, x13, #.LREVM0SR-.LSR // pass constants
        add     v6.4s, v2.4s, v14.4s
        add     v4.4s, v15.4s, v14.4s   // +4
        add     v7.4s, v3.4s, v14.4s
        add     v15.4s, v4.4s, v14.4s   // next counter
        add     v5.4s, v1.4s, v14.4s

        bl      _bsaes_encrypt8_alt

        subs    x2, x2, #8
        blo     .Lctr_enc_loop_done

        ldr     q16, [x0], #16
        ldr     q17, [x0], #16
        eor     v1.16b, v1.16b, v17.16b
        ldr     q17, [x0], #16
        eor     v0.16b, v0.16b, v16.16b
        eor     v4.16b, v4.16b, v17.16b
        str     q0, [x1], #16
        ldr     q16, [x0], #16
        str     q1, [x1], #16
        mov     v0.16b, v15.16b
        str     q4, [x1], #16
        ldr     q1, [x0], #16
        eor     v4.16b, v6.16b, v16.16b
        eor     v1.16b, v3.16b, v1.16b
        ldr     q3, [x0], #16
        eor     v3.16b, v7.16b, v3.16b
        ldr     q6, [x0], #16
        eor     v2.16b, v2.16b, v6.16b
        ldr     q6, [x0], #16
        eor     v5.16b, v5.16b, v6.16b
        str     q4, [x1], #16
        str     q1, [x1], #16
        str     q3, [x1], #16
        str     q2, [x1], #16
        str     q5, [x1], #16

        bne     .Lctr_enc_loop
        b       .Lctr_enc_done

.align  4
.Lctr_enc_loop_done:
        add     x2, x2, #8
        ldr     q16, [x0], #16          // load input
        eor     v0.16b, v0.16b, v16.16b
        str     q0, [x1], #16           // write output
        cmp     x2, #2
        blo     .Lctr_enc_done
        ldr     q17, [x0], #16
        eor     v1.16b, v1.16b, v17.16b
        str     q1, [x1], #16
        beq     .Lctr_enc_done
        ldr     q18, [x0], #16
        eor     v4.16b, v4.16b, v18.16b
        str     q4, [x1], #16
        cmp     x2, #4
        blo     .Lctr_enc_done
        ldr     q19, [x0], #16
        eor     v6.16b, v6.16b, v19.16b
        str     q6, [x1], #16
        beq     .Lctr_enc_done
        ldr     q20, [x0], #16
        eor     v3.16b, v3.16b, v20.16b
        str     q3, [x1], #16
        cmp     x2, #6
        blo     .Lctr_enc_done
        ldr     q21, [x0], #16
        eor     v7.16b, v7.16b, v21.16b
        str     q7, [x1], #16
        beq     .Lctr_enc_done
        ldr     q22, [x0]
        eor     v2.16b, v2.16b, v22.16b
        str     q2, [x1], #16

.Lctr_enc_done:
        movi    v0.16b, #0
        movi    v1.16b, #0
.Lctr_enc_bzero:                        // wipe key schedule [if any]
        stp     q0, q1, [sp], #32
        cmp     sp, x14
        bne     .Lctr_enc_bzero

        ldp     d8, d9, [sp, #16]
        ldp     d10, d11, [sp, #32]
        ldp     d12, d13, [sp, #48]
        ldp     d14, d15, [sp, #64]
        ldp     x29, x30, [sp], #80
        ret

.Lctr_enc_short:
        stp     x29, x30, [sp, #-96]!
        stp     x19, x20, [sp, #16]
        stp     x21, x22, [sp, #32]
        str     x23, [sp, #48]

        mov     x19, x0                 // copy arguments
        mov     x20, x1
        mov     x21, x2
        mov     x22, x3
        ldr     w23, [x4, #12]          // load counter LSW
        ldr     q1, [x4]                // load whole counter value
#ifdef __AARCH64EL__
        rev     w23, w23
#endif
        str     q1, [sp, #80]           // copy counter value

.Lctr_enc_short_loop:
        add     x0, sp, #80             // input counter value
        add     x1, sp, #64             // output on the stack
        mov     x2, x22                 // key

        bl      AES_encrypt

        ldr     q0, [x19], #16          // load input
        ldr     q1, [sp, #64]           // load encrypted counter
        add     x23, x23, #1
#ifdef __AARCH64EL__
        rev     w0, w23
        str     w0, [sp, #80+12]        // next counter value
#else
        str     w23, [sp, #80+12]       // next counter value
#endif
        eor     v0.16b, v0.16b, v1.16b
        str     q0, [x20], #16          // store output
        subs    x21, x21, #1
        bne     .Lctr_enc_short_loop

        movi    v0.16b, #0
        movi    v1.16b, #0
        stp     q0, q1, [sp, #64]

        ldr     x23, [sp, #48]
        ldp     x21, x22, [sp, #32]
        ldp     x19, x20, [sp, #16]
        ldp     x29, x30, [sp], #96
        ret
.size   ossl_bsaes_ctr32_encrypt_blocks,.-ossl_bsaes_ctr32_encrypt_blocks

.globl  ossl_bsaes_xts_encrypt
.type   ossl_bsaes_xts_encrypt,%function
.align  4
// On entry:
//   x0 -> input plaintext
//   x1 -> output ciphertext
//   x2 = length of text in bytes (must be at least 16)
//   x3 -> key1 (used to encrypt the XORed plaintext blocks)
//   x4 -> key2 (used to encrypt the initial vector to yield the initial tweak)
//   x5 -> 16-byte initial vector (typically, sector number)
// On exit:
//   Output ciphertext filled in
//   No output registers, usual AAPCS64 register preservation
ossl_bsaes_xts_encrypt:
        AARCH64_VALID_CALL_TARGET
        // Stack layout:
        // sp ->
        //         nrounds*128-96 bytes: key schedule
        // x19 ->
        //         16 bytes: frame record
        //         4*16 bytes: tweak storage across _bsaes_encrypt8
        //         6*8 bytes: storage for 5 callee-saved general-purpose registers
        //         8*8 bytes: storage for 8 callee-saved SIMD registers
        stp     x29, x30, [sp, #-192]!
        stp     x19, x20, [sp, #80]
        stp     x21, x22, [sp, #96]
        str     x23, [sp, #112]
        stp     d8, d9, [sp, #128]
        stp     d10, d11, [sp, #144]
        stp     d12, d13, [sp, #160]
        stp     d14, d15, [sp, #176]

        mov     x19, sp
        mov     x20, x0
        mov     x21, x1
        mov     x22, x2
        mov     x23, x3

        // generate initial tweak
        sub     sp, sp, #16
        mov     x0, x5                  // iv[]
        mov     x1, sp
        mov     x2, x4                  // key2
        bl      AES_encrypt
        ldr     q11, [sp], #16

        ldr     w1, [x23, #240]         // get # of rounds
        // allocate the key schedule on the stack
        add     x17, sp, #96
        sub     x17, x17, x1, lsl #7    // 128 bytes per inner round key, less 96 bytes

        // populate the key schedule
        mov     x9, x23                 // pass key
        mov     x10, x1                 // pass # of rounds
        mov     sp, x17
        bl      _bsaes_key_convert
        eor     v15.16b, v15.16b, v7.16b // fix up last round key
        str     q15, [x17]              // save last round key

        subs    x22, x22, #0x80
        blo     .Lxts_enc_short
        b       .Lxts_enc_loop

.align  4
.Lxts_enc_loop:
        ldr     q8, .Lxts_magic
        mov     x10, x1                 // pass rounds
        add     x2, x19, #16
        ldr     q0, [x20], #16
        sshr    v1.2d, v11.2d, #63
        mov     x9, sp                  // pass key schedule
        ldr     q6, .Lxts_magic+16
        add     v2.2d, v11.2d, v11.2d
        cmtst   v3.2d, v11.2d, v6.2d
        and     v1.16b, v1.16b, v8.16b
        ext     v1.16b, v1.16b, v1.16b, #8
        and     v3.16b, v3.16b, v8.16b
        ldr     q4, [x20], #16
        eor     v12.16b, v2.16b, v1.16b
        eor     v1.16b, v4.16b, v12.16b
        eor     v0.16b, v0.16b, v11.16b
        cmtst   v2.2d, v12.2d, v6.2d
        add     v4.2d, v12.2d, v12.2d
        add     x0, x19, #16
        ext     v3.16b, v3.16b, v3.16b, #8
        and     v2.16b, v2.16b, v8.16b
        eor     v13.16b, v4.16b, v3.16b
        ldr     q3, [x20], #16
        ext     v4.16b, v2.16b, v2.16b, #8
        eor     v2.16b, v3.16b, v13.16b
        ldr     q3, [x20], #16
        add     v5.2d, v13.2d, v13.2d
        cmtst   v7.2d, v13.2d, v6.2d
        and     v7.16b, v7.16b, v8.16b
        ldr     q9, [x20], #16
        ext     v7.16b, v7.16b, v7.16b, #8
        ldr     q10, [x20], #16
        eor     v14.16b, v5.16b, v4.16b
        ldr     q16, [x20], #16
        add     v4.2d, v14.2d, v14.2d
        eor     v3.16b, v3.16b, v14.16b
        eor     v15.16b, v4.16b, v7.16b
        add     v5.2d, v15.2d, v15.2d
        ldr     q7, [x20], #16
        cmtst   v4.2d, v14.2d, v6.2d
        and     v17.16b, v4.16b, v8.16b
        cmtst   v18.2d, v15.2d, v6.2d
        eor     v4.16b, v9.16b, v15.16b
        ext     v9.16b, v17.16b, v17.16b, #8
        eor     v9.16b, v5.16b, v9.16b
        add     v17.2d, v9.2d, v9.2d
        and     v18.16b, v18.16b, v8.16b
        eor     v5.16b, v10.16b, v9.16b
        str     q9, [x2], #16
        ext     v10.16b, v18.16b, v18.16b, #8
        cmtst   v9.2d, v9.2d, v6.2d
        and     v9.16b, v9.16b, v8.16b
        eor     v10.16b, v17.16b, v10.16b
        cmtst   v17.2d, v10.2d, v6.2d
        eor     v6.16b, v16.16b, v10.16b
        str     q10, [x2], #16
        ext     v9.16b, v9.16b, v9.16b, #8
        add     v10.2d, v10.2d, v10.2d
        eor     v9.16b, v10.16b, v9.16b
        str     q9, [x2], #16
        eor     v7.16b, v7.16b, v9.16b
        add     v9.2d, v9.2d, v9.2d
        and     v8.16b, v17.16b, v8.16b
        ext     v8.16b, v8.16b, v8.16b, #8
        eor     v8.16b, v9.16b, v8.16b
        str     q8, [x2]                // next round tweak

        bl      _bsaes_encrypt8

        ldr     q8, [x0], #16
        eor     v0.16b, v0.16b, v11.16b
        eor     v1.16b, v1.16b, v12.16b
        ldr     q9, [x0], #16
        eor     v4.16b, v4.16b, v13.16b
        eor     v6.16b, v6.16b, v14.16b
        ldr     q10, [x0], #16
        eor     v3.16b, v3.16b, v15.16b
        subs    x22, x22, #0x80
        str     q0, [x21], #16
        ldr     q11, [x0]               // next round tweak
        str     q1, [x21], #16
        eor     v0.16b, v7.16b, v8.16b
        eor     v1.16b, v2.16b, v9.16b
        str     q4, [x21], #16
        eor     v2.16b, v5.16b, v10.16b
        str     q6, [x21], #16
        str     q3, [x21], #16
        str     q0, [x21], #16
        str     q1, [x21], #16
        str     q2, [x21], #16
        bpl     .Lxts_enc_loop

.Lxts_enc_short:
        adds    x22, x22, #0x70
        bmi     .Lxts_enc_done

        ldr     q8, .Lxts_magic
        sshr    v1.2d, v11.2d, #63
        add     v2.2d, v11.2d, v11.2d
        ldr     q9, .Lxts_magic+16
        subs    x22, x22, #0x10
        ldr     q0, [x20], #16
        and     v1.16b, v1.16b, v8.16b
        cmtst   v3.2d, v11.2d, v9.2d
        ext     v1.16b, v1.16b, v1.16b, #8
        and     v3.16b, v3.16b, v8.16b
        eor     v12.16b, v2.16b, v1.16b
        ext     v1.16b, v3.16b, v3.16b, #8
        add     v2.2d, v12.2d, v12.2d
        cmtst   v3.2d, v12.2d, v9.2d
        eor     v13.16b, v2.16b, v1.16b
        and     v22.16b, v3.16b, v8.16b
        bmi     .Lxts_enc_1

        ext     v2.16b, v22.16b, v22.16b, #8
        add     v3.2d, v13.2d, v13.2d
        ldr     q1, [x20], #16
        cmtst   v4.2d, v13.2d, v9.2d
        subs    x22, x22, #0x10
        eor     v14.16b, v3.16b, v2.16b
        and     v23.16b, v4.16b, v8.16b
        bmi     .Lxts_enc_2

        ext     v3.16b, v23.16b, v23.16b, #8
        add     v4.2d, v14.2d, v14.2d
        ldr     q2, [x20], #16
        cmtst   v5.2d, v14.2d, v9.2d
        eor     v0.16b, v0.16b, v11.16b
        subs    x22, x22, #0x10
        eor     v15.16b, v4.16b, v3.16b
        and     v24.16b, v5.16b, v8.16b
        bmi     .Lxts_enc_3

        ext     v4.16b, v24.16b, v24.16b, #8
        add     v5.2d, v15.2d, v15.2d
        ldr     q3, [x20], #16
        cmtst   v6.2d, v15.2d, v9.2d
        eor     v1.16b, v1.16b, v12.16b
        subs    x22, x22, #0x10
        eor     v16.16b, v5.16b, v4.16b
        and     v25.16b, v6.16b, v8.16b
        bmi     .Lxts_enc_4

        ext     v5.16b, v25.16b, v25.16b, #8
        add     v6.2d, v16.2d, v16.2d
        add     x0, x19, #16
        cmtst   v7.2d, v16.2d, v9.2d
        ldr     q4, [x20], #16
        eor     v2.16b, v2.16b, v13.16b
        str     q16, [x0], #16
        subs    x22, x22, #0x10
        eor     v17.16b, v6.16b, v5.16b
        and     v26.16b, v7.16b, v8.16b
        bmi     .Lxts_enc_5

        ext     v7.16b, v26.16b, v26.16b, #8
        add     v18.2d, v17.2d, v17.2d
        ldr     q5, [x20], #16
        eor     v3.16b, v3.16b, v14.16b
        str     q17, [x0], #16
        subs    x22, x22, #0x10
        eor     v18.16b, v18.16b, v7.16b
        bmi     .Lxts_enc_6

        ldr     q6, [x20], #16
        eor     v4.16b, v4.16b, v15.16b
        eor     v5.16b, v5.16b, v16.16b
        str     q18, [x0]               // next round tweak
        mov     x9, sp                  // pass key schedule
        mov     x10, x1
        add     x0, x19, #16
        sub     x22, x22, #0x10
        eor     v6.16b, v6.16b, v17.16b

        bl      _bsaes_encrypt8

        ldr     q16, [x0], #16
        eor     v0.16b, v0.16b, v11.16b
        eor     v1.16b, v1.16b, v12.16b
        ldr     q17, [x0], #16
        eor     v4.16b, v4.16b, v13.16b
        eor     v6.16b, v6.16b, v14.16b
        eor     v3.16b, v3.16b, v15.16b
        ldr     q11, [x0]               // next round tweak
        str     q0, [x21], #16
        str     q1, [x21], #16
        eor     v0.16b, v7.16b, v16.16b
        eor     v1.16b, v2.16b, v17.16b
        str     q4, [x21], #16
        str     q6, [x21], #16
        str     q3, [x21], #16
        str     q0, [x21], #16
        str     q1, [x21], #16
        b       .Lxts_enc_done

.align  4
.Lxts_enc_6:
        eor     v4.16b, v4.16b, v15.16b
        eor     v5.16b, v5.16b, v16.16b
        mov     x9, sp                  // pass key schedule
        mov     x10, x1                 // pass rounds
        add     x0, x19, #16

        bl      _bsaes_encrypt8

        ldr     q16, [x0], #16
        eor     v0.16b, v0.16b, v11.16b
        eor     v1.16b, v1.16b, v12.16b
        eor     v4.16b, v4.16b, v13.16b
        eor     v6.16b, v6.16b, v14.16b
        ldr     q11, [x0]               // next round tweak
        eor     v3.16b, v3.16b, v15.16b
        str     q0, [x21], #16
        str     q1, [x21], #16
        eor     v0.16b, v7.16b, v16.16b
        str     q4, [x21], #16
        str     q6, [x21], #16
        str     q3, [x21], #16
        str     q0, [x21], #16
        b       .Lxts_enc_done

.align  4
.Lxts_enc_5:
        eor     v3.16b, v3.16b, v14.16b
        eor     v4.16b, v4.16b, v15.16b
        mov     x9, sp                  // pass key schedule
        mov     x10, x1                 // pass rounds
        add     x0, x19, #16

        bl      _bsaes_encrypt8

        eor     v0.16b, v0.16b, v11.16b
        eor     v1.16b, v1.16b, v12.16b
        ldr     q11, [x0]               // next round tweak
        eor     v4.16b, v4.16b, v13.16b
        eor     v6.16b, v6.16b, v14.16b
        eor     v3.16b, v3.16b, v15.16b
        str     q0, [x21], #16
        str     q1, [x21], #16
        str     q4, [x21], #16
        str     q6, [x21], #16
        str     q3, [x21], #16
        b       .Lxts_enc_done

.align  4
.Lxts_enc_4:
        eor     v2.16b, v2.16b, v13.16b
        eor     v3.16b, v3.16b, v14.16b
        mov     x9, sp                  // pass key schedule
        mov     x10, x1                 // pass rounds
        add     x0, x19, #16

        bl      _bsaes_encrypt8

        eor     v0.16b, v0.16b, v11.16b
        eor     v1.16b, v1.16b, v12.16b
        eor     v4.16b, v4.16b, v13.16b
        eor     v6.16b, v6.16b, v14.16b
        mov     v11.16b, v15.16b        // next round tweak
        str     q0, [x21], #16
        str     q1, [x21], #16
        str     q4, [x21], #16
        str     q6, [x21], #16
        b       .Lxts_enc_done

.align  4
.Lxts_enc_3:
        eor     v1.16b, v1.16b, v12.16b
        eor     v2.16b, v2.16b, v13.16b
        mov     x9, sp                  // pass key schedule
        mov     x10, x1                 // pass rounds
        add     x0, x19, #16

        bl      _bsaes_encrypt8

        eor     v0.16b, v0.16b, v11.16b
        eor     v1.16b, v1.16b, v12.16b
        eor     v4.16b, v4.16b, v13.16b
        mov     v11.16b, v14.16b        // next round tweak
        str     q0, [x21], #16
        str     q1, [x21], #16
        str     q4, [x21], #16
        b       .Lxts_enc_done

.align  4
.Lxts_enc_2:
        eor     v0.16b, v0.16b, v11.16b
        eor     v1.16b, v1.16b, v12.16b
        mov     x9, sp                  // pass key schedule
        mov     x10, x1                 // pass rounds
        add     x0, x19, #16

        bl      _bsaes_encrypt8

        eor     v0.16b, v0.16b, v11.16b
        eor     v1.16b, v1.16b, v12.16b
        mov     v11.16b, v13.16b        // next round tweak
        str     q0, [x21], #16
        str     q1, [x21], #16
        b       .Lxts_enc_done

.align  4
.Lxts_enc_1:
        eor     v0.16b, v0.16b, v11.16b
        sub     x0, sp, #16
        sub     x1, sp, #16
        mov     x2, x23
        mov     v13.d[0], v11.d[1]      // just in case AES_encrypt corrupts top half of callee-saved SIMD registers
        mov     v14.d[0], v12.d[1]
        str     q0, [sp, #-16]!

        bl      AES_encrypt

        ldr     q0, [sp], #16
        trn1    v13.2d, v11.2d, v13.2d
        trn1    v11.2d, v12.2d, v14.2d  // next round tweak
        eor     v0.16b, v0.16b, v13.16b
        str     q0, [x21], #16

.Lxts_enc_done:
        adds    x22, x22, #0x10
        beq     .Lxts_enc_ret

        sub     x6, x21, #0x10
        // Penultimate plaintext block produces final ciphertext part-block
        // plus remaining part of final plaintext block. Move ciphertext part
        // to final position and reuse penultimate ciphertext block buffer to
        // construct final plaintext block
.Lxts_enc_steal:
        ldrb    w0, [x20], #1
        ldrb    w1, [x21, #-0x10]
        strb    w0, [x21, #-0x10]
        strb    w1, [x21], #1

        subs    x22, x22, #1
        bhi     .Lxts_enc_steal

        // Finally encrypt the penultimate ciphertext block using the
        // last tweak
        ldr     q0, [x6]
        eor     v0.16b, v0.16b, v11.16b
        str     q0, [sp, #-16]!
        mov     x0, sp
        mov     x1, sp
        mov     x2, x23
        mov     x21, x6
        mov     v13.d[0], v11.d[1]      // just in case AES_encrypt corrupts top half of callee-saved SIMD registers

        bl      AES_encrypt

        trn1    v11.2d, v11.2d, v13.2d
        ldr     q0, [sp], #16
        eor     v0.16b, v0.16b, v11.16b
        str     q0, [x21]

.Lxts_enc_ret:

        movi    v0.16b, #0
        movi    v1.16b, #0
.Lxts_enc_bzero:                        // wipe key schedule
        stp     q0, q1, [sp], #32
        cmp     sp, x19
        bne     .Lxts_enc_bzero

        ldp     x19, x20, [sp, #80]
        ldp     x21, x22, [sp, #96]
        ldr     x23, [sp, #112]
        ldp     d8, d9, [sp, #128]
        ldp     d10, d11, [sp, #144]
        ldp     d12, d13, [sp, #160]
        ldp     d14, d15, [sp, #176]
        ldp     x29, x30, [sp], #192
        ret
.size   ossl_bsaes_xts_encrypt,.-ossl_bsaes_xts_encrypt

// The assembler doesn't seem capable of de-duplicating these when expressed
// using `ldr qd,=` syntax, so assign a symbolic address
.align  5
.Lxts_magic:
.quad   1, 0x87, 0x4000000000000000, 0x4000000000000000

.globl  ossl_bsaes_xts_decrypt
.type   ossl_bsaes_xts_decrypt,%function
.align  4
// On entry:
//   x0 -> input ciphertext
//   x1 -> output plaintext
//   x2 = length of text in bytes (must be at least 16)
//   x3 -> key1 (used to decrypt the XORed ciphertext blocks)
//   x4 -> key2 (used to encrypt the initial vector to yield the initial tweak)
//   x5 -> 16-byte initial vector (typically, sector number)
// On exit:
//   Output plaintext filled in
//   No output registers, usual AAPCS64 register preservation
ossl_bsaes_xts_decrypt:
        AARCH64_VALID_CALL_TARGET
        // Stack layout:
        // sp ->
        //         nrounds*128-96 bytes: key schedule
        // x19 ->
        //         16 bytes: frame record
        //         4*16 bytes: tweak storage across _bsaes_decrypt8
        //         6*8 bytes: storage for 5 callee-saved general-purpose registers
        //         8*8 bytes: storage for 8 callee-saved SIMD registers
        stp     x29, x30, [sp, #-192]!
	stp x19, x20, [sp, #80]
	stp x21, x22, [sp, #96]
	str x23, [sp, #112]
	stp d8, d9, [sp, #128]
	stp d10, d11, [sp, #144]
	stp d12, d13, [sp, #160]
	stp d14, d15, [sp, #176]

	mov x19, sp
	mov x20, x0
	mov x21, x1
	mov x22, x2
	mov x23, x3

	// generate initial tweak
	sub sp, sp, #16
	mov x0, x5 // iv[]
	mov x1, sp
	mov x2, x4 // key2
	bl AES_encrypt
	ldr q11, [sp], #16

	ldr w1, [x23, #240] // get # of rounds
	// allocate the key schedule on the stack
	add x17, sp, #96
	sub x17, x17, x1, lsl #7 // 128 bytes per inner round key, less 96 bytes

	// populate the key schedule
	mov x9, x23 // pass key
	mov x10, x1 // pass # of rounds
	mov sp, x17
	bl _bsaes_key_convert
	ldr q6, [sp]
	str q15, [x17] // save last round key
	eor v6.16b, v6.16b, v7.16b // fix up round 0 key (by XORing with 0x63)
	str q6, [sp]

	sub x30, x22, #0x10
	tst x22, #0xf // if not a multiple of 16
	csel x22, x30, x22, ne // subtract another 16 bytes
	subs x22, x22, #0x80

	blo .Lxts_dec_short
	b .Lxts_dec_loop

.align 4
.Lxts_dec_loop:
	ldr q8, .Lxts_magic
	mov x10, x1 // pass rounds
	add x2, x19, #16
	ldr q0, [x20], #16
	sshr v1.2d, v11.2d, #63
	mov x9, sp // pass key schedule
	ldr q6, .Lxts_magic+16
	add v2.2d, v11.2d, v11.2d
	cmtst v3.2d, v11.2d, v6.2d
	and v1.16b, v1.16b, v8.16b
	ext v1.16b, v1.16b, v1.16b, #8
	and v3.16b, v3.16b, v8.16b
	ldr q4, [x20], #16
	eor v12.16b, v2.16b, v1.16b
	eor v1.16b, v4.16b, v12.16b
	eor v0.16b, v0.16b, v11.16b
	cmtst v2.2d, v12.2d, v6.2d
	add v4.2d, v12.2d, v12.2d
	add x0, x19, #16
	ext v3.16b, v3.16b, v3.16b, #8
	and v2.16b, v2.16b, v8.16b
	eor v13.16b, v4.16b, v3.16b
	ldr q3, [x20], #16
	ext v4.16b, v2.16b, v2.16b, #8
	eor v2.16b, v3.16b, v13.16b
	ldr q3, [x20], #16
	add v5.2d, v13.2d, v13.2d
	cmtst v7.2d, v13.2d, v6.2d
	and v7.16b, v7.16b, v8.16b
	ldr q9, [x20], #16
	ext v7.16b, v7.16b, v7.16b, #8
	ldr q10, [x20], #16
	eor v14.16b, v5.16b, v4.16b
	ldr q16, [x20], #16
	add v4.2d, v14.2d, v14.2d
	eor v3.16b, v3.16b, v14.16b
	eor v15.16b, v4.16b, v7.16b
	add v5.2d, v15.2d, v15.2d
	ldr q7, [x20], #16
	cmtst v4.2d, v14.2d, v6.2d
	and v17.16b, v4.16b, v8.16b
	cmtst v18.2d, v15.2d, v6.2d
	eor v4.16b, v9.16b, v15.16b
	ext v9.16b, v17.16b, v17.16b, #8
	eor v9.16b, v5.16b, v9.16b
	add v17.2d, v9.2d, v9.2d
	and v18.16b, v18.16b, v8.16b
	eor v5.16b, v10.16b, v9.16b
	str q9, [x2], #16
	ext v10.16b, v18.16b, v18.16b, #8
	cmtst v9.2d, v9.2d, v6.2d
	and v9.16b, v9.16b, v8.16b
	eor v10.16b, v17.16b, v10.16b
	cmtst v17.2d, v10.2d, v6.2d
	eor v6.16b, v16.16b, v10.16b
	str q10, [x2], #16
	ext v9.16b, v9.16b, v9.16b, #8
	add v10.2d, v10.2d, v10.2d
	eor v9.16b, v10.16b, v9.16b
	str q9, [x2], #16
	eor v7.16b, v7.16b, v9.16b
	add v9.2d, v9.2d, v9.2d
	and v8.16b, v17.16b, v8.16b
	ext v8.16b, v8.16b, v8.16b, #8
	eor v8.16b, v9.16b, v8.16b
	str q8, [x2] // next round tweak

	bl _bsaes_decrypt8

	eor v6.16b, v6.16b, v13.16b
	eor v0.16b, v0.16b, v11.16b
	ldr q8, [x0], #16
	eor v7.16b, v7.16b, v8.16b
	str q0, [x21], #16
	eor v0.16b, v1.16b, v12.16b
	ldr q1, [x0], #16
	eor v1.16b, v3.16b, v1.16b
	subs x22, x22, #0x80
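	// The flags from this subs survive the unmasking and store sequence
	// below (eor/ldr/str leave them untouched) and are consumed by the
	// bpl that closes the loop.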
	eor v2.16b, v2.16b, v15.16b
	eor v3.16b, v4.16b, v14.16b
	ldr q4, [x0], #16
	str q0, [x21], #16
	ldr q11, [x0] // next round tweak
	eor v0.16b, v5.16b, v4.16b
	str q6, [x21], #16
	str q3, [x21], #16
	str q2, [x21], #16
	str q7, [x21], #16
	str q1, [x21], #16
	str q0, [x21], #16
	bpl .Lxts_dec_loop

.Lxts_dec_short:
	adds x22, x22, #0x70
	bmi .Lxts_dec_done

	ldr q8, .Lxts_magic
	sshr v1.2d, v11.2d, #63
	add v2.2d, v11.2d, v11.2d
	ldr q9, .Lxts_magic+16
	subs x22, x22, #0x10
	ldr q0, [x20], #16
	and v1.16b, v1.16b, v8.16b
	cmtst v3.2d, v11.2d, v9.2d
	ext v1.16b, v1.16b, v1.16b, #8
	and v3.16b, v3.16b, v8.16b
	eor v12.16b, v2.16b, v1.16b
	ext v1.16b, v3.16b, v3.16b, #8
	add v2.2d, v12.2d, v12.2d
	cmtst v3.2d, v12.2d, v9.2d
	eor v13.16b, v2.16b, v1.16b
	and v22.16b, v3.16b, v8.16b
	bmi .Lxts_dec_1

	ext v2.16b, v22.16b, v22.16b, #8
	add v3.2d, v13.2d, v13.2d
	ldr q1, [x20], #16
	cmtst v4.2d, v13.2d, v9.2d
	subs x22, x22, #0x10
	eor v14.16b, v3.16b, v2.16b
	and v23.16b, v4.16b, v8.16b
	bmi .Lxts_dec_2

	ext v3.16b, v23.16b, v23.16b, #8
	add v4.2d, v14.2d, v14.2d
	ldr q2, [x20], #16
	cmtst v5.2d, v14.2d, v9.2d
	eor v0.16b, v0.16b, v11.16b
	subs x22, x22, #0x10
	eor v15.16b, v4.16b, v3.16b
	and v24.16b, v5.16b, v8.16b
	bmi .Lxts_dec_3

	ext v4.16b, v24.16b, v24.16b, #8
	add v5.2d, v15.2d, v15.2d
	ldr q3, [x20], #16
	cmtst v6.2d, v15.2d, v9.2d
	eor v1.16b, v1.16b, v12.16b
	subs x22, x22, #0x10
	eor v16.16b, v5.16b, v4.16b
	and v25.16b, v6.16b, v8.16b
	bmi .Lxts_dec_4

	ext v5.16b, v25.16b, v25.16b, #8
	add v6.2d, v16.2d, v16.2d
	add x0, x19, #16
	cmtst v7.2d, v16.2d, v9.2d
	ldr q4, [x20], #16
	eor v2.16b, v2.16b, v13.16b
	str q16, [x0], #16
	subs x22, x22, #0x10
	eor v17.16b, v6.16b, v5.16b
	and v26.16b, v7.16b, v8.16b
	bmi .Lxts_dec_5

	ext v7.16b, v26.16b, v26.16b, #8
	add v18.2d, v17.2d, v17.2d
	ldr q5, [x20], #16
	eor v3.16b, v3.16b, v14.16b
	str q17, [x0], #16
	subs x22, x22, #0x10
	eor v18.16b, v18.16b, v7.16b
	bmi .Lxts_dec_6

	ldr q6, [x20], #16
	eor v4.16b, v4.16b, v15.16b
	eor v5.16b, v5.16b, v16.16b
	str q18, [x0] // next round tweak
	mov x9, sp // pass key schedule
	mov x10, x1 // pass rounds
	add x0, x19, #16
	sub x22, x22, #0x10
	eor v6.16b, v6.16b, v17.16b

	bl _bsaes_decrypt8

	ldr q16, [x0], #16
	eor v0.16b, v0.16b, v11.16b
	eor v1.16b, v1.16b, v12.16b
	ldr q17, [x0], #16
	eor v6.16b, v6.16b, v13.16b
	eor v4.16b, v4.16b, v14.16b
	eor v2.16b, v2.16b, v15.16b
	ldr q11, [x0] // next round tweak
	str q0, [x21], #16
	str q1, [x21], #16
	eor v0.16b, v7.16b, v16.16b
	eor v1.16b, v3.16b, v17.16b
	str q6, [x21], #16
	str q4, [x21], #16
	str q2, [x21], #16
	str q0, [x21], #16
	str q1, [x21], #16
	b .Lxts_dec_done

.align 4
.Lxts_dec_6:
	eor v4.16b, v4.16b, v15.16b
	eor v5.16b, v5.16b, v16.16b
	mov x9, sp // pass key schedule
	mov x10, x1 // pass rounds
	add x0, x19, #16

	bl _bsaes_decrypt8

	ldr q16, [x0], #16
	eor v0.16b, v0.16b, v11.16b
	eor v1.16b, v1.16b, v12.16b
	eor v6.16b, v6.16b, v13.16b
	eor v4.16b, v4.16b, v14.16b
	ldr q11, [x0] // next round tweak
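	// q16 was reloaded above because v16 does not survive
	// _bsaes_decrypt8; it holds the sixth tweak and unmasks v7 below.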
	eor v2.16b, v2.16b, v15.16b
	str q0, [x21], #16
	str q1, [x21], #16
	eor v0.16b, v7.16b, v16.16b
	str q6, [x21], #16
	str q4, [x21], #16
	str q2, [x21], #16
	str q0, [x21], #16
	b .Lxts_dec_done

.align 4
.Lxts_dec_5:
	eor v3.16b, v3.16b, v14.16b
	eor v4.16b, v4.16b, v15.16b
	mov x9, sp // pass key schedule
	mov x10, x1 // pass rounds
	add x0, x19, #16

	bl _bsaes_decrypt8

	eor v0.16b, v0.16b, v11.16b
	eor v1.16b, v1.16b, v12.16b
	ldr q11, [x0] // next round tweak
	eor v6.16b, v6.16b, v13.16b
	eor v4.16b, v4.16b, v14.16b
	eor v2.16b, v2.16b, v15.16b
	str q0, [x21], #16
	str q1, [x21], #16
	str q6, [x21], #16
	str q4, [x21], #16
	str q2, [x21], #16
	b .Lxts_dec_done

.align 4
.Lxts_dec_4:
	eor v2.16b, v2.16b, v13.16b
	eor v3.16b, v3.16b, v14.16b
	mov x9, sp // pass key schedule
	mov x10, x1 // pass rounds
	add x0, x19, #16

	bl _bsaes_decrypt8

	eor v0.16b, v0.16b, v11.16b
	eor v1.16b, v1.16b, v12.16b
	eor v6.16b, v6.16b, v13.16b
	eor v4.16b, v4.16b, v14.16b
	mov v11.16b, v15.16b // next round tweak
	str q0, [x21], #16
	str q1, [x21], #16
	str q6, [x21], #16
	str q4, [x21], #16
	b .Lxts_dec_done

.align 4
.Lxts_dec_3:
	eor v1.16b, v1.16b, v12.16b
	eor v2.16b, v2.16b, v13.16b
	mov x9, sp // pass key schedule
	mov x10, x1 // pass rounds
	add x0, x19, #16

	bl _bsaes_decrypt8

	eor v0.16b, v0.16b, v11.16b
	eor v1.16b, v1.16b, v12.16b
	eor v6.16b, v6.16b, v13.16b
	mov v11.16b, v14.16b // next round tweak
	str q0, [x21], #16
	str q1, [x21], #16
	str q6, [x21], #16
	b .Lxts_dec_done

.align 4
.Lxts_dec_2:
	eor v0.16b, v0.16b, v11.16b
	eor v1.16b, v1.16b, v12.16b
	mov x9, sp // pass key schedule
	mov x10, x1 // pass rounds
	add x0, x19, #16

	bl _bsaes_decrypt8

	eor v0.16b, v0.16b, v11.16b
	eor v1.16b, v1.16b, v12.16b
	mov v11.16b, v13.16b // next round tweak
	str q0, [x21], #16
	str q1, [x21], #16
	b .Lxts_dec_done

.align 4
.Lxts_dec_1:
	eor v0.16b, v0.16b, v11.16b
	sub x0, sp, #16
	sub x1, sp, #16
	mov x2, x23
	mov v13.d[0], v11.d[1] // just in case AES_decrypt corrupts top half of callee-saved SIMD registers
	mov v14.d[0], v12.d[1]
	str q0, [sp, #-16]!

	bl AES_decrypt

	ldr q0, [sp], #16
	trn1 v13.2d, v11.2d, v13.2d
	trn1 v11.2d, v12.2d, v14.2d // next round tweak
	eor v0.16b, v0.16b, v13.16b
	str q0, [x21], #16

.Lxts_dec_done:
	adds x22, x22, #0x10
	beq .Lxts_dec_ret

	// calculate one round of extra tweak for the stolen ciphertext
	ldr q8, .Lxts_magic
	sshr v6.2d, v11.2d, #63
	and v6.16b, v6.16b, v8.16b
	add v12.2d, v11.2d, v11.2d
	ext v6.16b, v6.16b, v6.16b, #8
	eor v12.16b, v12.16b, v6.16b

	// perform the final decryption with the last tweak value
	ldr q0, [x20], #16
	eor v0.16b, v0.16b, v12.16b
	str q0, [sp, #-16]!
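	// As in the encrypt path, AES_decrypt runs in place on the block just
	// pushed: x0 and x1 both point at it, and x2 passes key1.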
	mov x0, sp
	mov x1, sp
	mov x2, x23
	mov v13.d[0], v11.d[1] // just in case AES_decrypt corrupts top half of callee-saved SIMD registers
	mov v14.d[0], v12.d[1]

	bl AES_decrypt

	trn1 v12.2d, v12.2d, v14.2d
	trn1 v11.2d, v11.2d, v13.2d
	ldr q0, [sp], #16
	eor v0.16b, v0.16b, v12.16b
	str q0, [x21]

	mov x6, x21
	// Penultimate ciphertext block produces final plaintext part-block
	// plus remaining part of final ciphertext block. Move plaintext part
	// to final position and reuse penultimate plaintext block buffer to
	// construct final ciphertext block
.Lxts_dec_steal:
	ldrb w1, [x21]
	ldrb w0, [x20], #1
	strb w1, [x21, #0x10]
	strb w0, [x21], #1

	subs x22, x22, #1
	bhi .Lxts_dec_steal

	// Finally decrypt the penultimate plaintext block using the
	// penultimate tweak
	ldr q0, [x6]
	eor v0.16b, v0.16b, v11.16b
	str q0, [sp, #-16]!
	mov x0, sp
	mov x1, sp
	mov x2, x23
	mov x21, x6

	bl AES_decrypt

	trn1 v11.2d, v11.2d, v13.2d
	ldr q0, [sp], #16
	eor v0.16b, v0.16b, v11.16b
	str q0, [x21]

.Lxts_dec_ret:

	movi v0.16b, #0
	movi v1.16b, #0
.Lxts_dec_bzero: // wipe key schedule
	stp q0, q1, [sp], #32
	cmp sp, x19
	bne .Lxts_dec_bzero

	ldp x19, x20, [sp, #80]
	ldp x21, x22, [sp, #96]
	ldr x23, [sp, #112]
	ldp d8, d9, [sp, #128]
	ldp d10, d11, [sp, #144]
	ldp d12, d13, [sp, #160]
	ldp d14, d15, [sp, #176]
	ldp x29, x30, [sp], #192
	ret
.size ossl_bsaes_xts_decrypt,.-ossl_bsaes_xts_decrypt
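
// For reference, the tweak update driven by .Lxts_magic is multiplication
// by x in GF(2^128) modulo x^128 + x^7 + x^2 + x + 1. A minimal C sketch
// of one doubling over two little-endian 64-bit lanes, matching the vector
// code's lane layout (illustrative only, not part of the build; the names
// below are invented for this sketch):
//
//     #include <stdint.h>
//
//     typedef struct { uint64_t lo, hi; } tweak128;
//
//     static tweak128 xts_double(tweak128 t)
//     {
//         tweak128 r;
//         uint64_t reduce = 0 - (t.hi >> 63);   // all-ones if bit 127 set
//         r.hi = (t.hi << 1) | (t.lo >> 63);    // cross-lane carry ("1" mask)
//         r.lo = (t.lo << 1) ^ (reduce & 0x87); // polynomial reduction mask
//         return r;
//     }
//
// The sshr #63 / and / ext sequences above compute exactly these two masks
// and swap them between lanes; the cmtst against bit 62 lets the masks for
// the following doubling be formed before the current add completes.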