1/* Do not modify. This file is auto-generated from bsaes-armv8.pl. */ 2// Copyright 2021-2025 The OpenSSL Project Authors. All Rights Reserved. 3// 4// Licensed under the OpenSSL license (the "License"). You may not use 5// this file except in compliance with the License. You can obtain a copy 6// in the file LICENSE in the source distribution or at 7// https://www.openssl.org/source/license.html 8// 9// ==================================================================== 10// Written by Ben Avison <bavison@riscosopen.org> for the OpenSSL 11// project. Rights for redistribution and usage in source and binary 12// forms are granted according to the OpenSSL license. 13// ==================================================================== 14// 15// This implementation is a translation of bsaes-armv7 for AArch64. 16// No attempt has been made to carry across the build switches for 17// kernel targets, since the Linux kernel crypto support has moved on 18// from when it was based on OpenSSL. 19 20// A lot of hand-scheduling has been performed. Consequently, this code 21// doesn't factor out neatly into macros in the same way that the 22// AArch32 version did, and there is little to be gained by wrapping it 23// up in Perl, and it is presented as pure assembly. 24 25 26#include "crypto/arm_arch.h" 27 28.text 29 30 31 32 33 34.type _bsaes_decrypt8,%function 35.align 4 36// On entry: 37// x9 -> key (previously expanded using _bsaes_key_convert) 38// x10 = number of rounds 39// v0-v7 input data 40// On exit: 41// x9-x11 corrupted 42// other general-purpose registers preserved 43// v0-v7 output data 44// v11-v15 preserved 45// other SIMD registers corrupted 46_bsaes_decrypt8: 47 ldr q8, [x9], #16 48 adrp x11, .LM0ISR 49 add x11, x11, #:lo12:.LM0ISR 50 movi v9.16b, #0x55 51 ldr q10, [x11], #16 52 movi v16.16b, #0x33 53 movi v17.16b, #0x0f 54 sub x10, x10, #1 55 eor v0.16b, v0.16b, v8.16b 56 eor v1.16b, v1.16b, v8.16b 57 eor v2.16b, v2.16b, v8.16b 58 eor v4.16b, v4.16b, v8.16b 59 eor v3.16b, v3.16b, v8.16b 60 eor v5.16b, v5.16b, v8.16b 61 tbl v0.16b, {v0.16b}, v10.16b 62 tbl v1.16b, {v1.16b}, v10.16b 63 tbl v2.16b, {v2.16b}, v10.16b 64 tbl v4.16b, {v4.16b}, v10.16b 65 eor v6.16b, v6.16b, v8.16b 66 eor v7.16b, v7.16b, v8.16b 67 tbl v3.16b, {v3.16b}, v10.16b 68 tbl v5.16b, {v5.16b}, v10.16b 69 tbl v6.16b, {v6.16b}, v10.16b 70 ushr v8.2d, v0.2d, #1 71 tbl v7.16b, {v7.16b}, v10.16b 72 ushr v10.2d, v4.2d, #1 73 ushr v18.2d, v2.2d, #1 74 eor v8.16b, v8.16b, v1.16b 75 ushr v19.2d, v6.2d, #1 76 eor v10.16b, v10.16b, v5.16b 77 eor v18.16b, v18.16b, v3.16b 78 and v8.16b, v8.16b, v9.16b 79 eor v19.16b, v19.16b, v7.16b 80 and v10.16b, v10.16b, v9.16b 81 and v18.16b, v18.16b, v9.16b 82 eor v1.16b, v1.16b, v8.16b 83 shl v8.2d, v8.2d, #1 84 and v9.16b, v19.16b, v9.16b 85 eor v5.16b, v5.16b, v10.16b 86 shl v10.2d, v10.2d, #1 87 eor v3.16b, v3.16b, v18.16b 88 shl v18.2d, v18.2d, #1 89 eor v0.16b, v0.16b, v8.16b 90 shl v8.2d, v9.2d, #1 91 eor v7.16b, v7.16b, v9.16b 92 eor v4.16b, v4.16b, v10.16b 93 eor v2.16b, v2.16b, v18.16b 94 ushr v9.2d, v1.2d, #2 95 eor v6.16b, v6.16b, v8.16b 96 ushr v8.2d, v0.2d, #2 97 ushr v10.2d, v5.2d, #2 98 ushr v18.2d, v4.2d, #2 99 eor v9.16b, v9.16b, v3.16b 100 eor v8.16b, v8.16b, v2.16b 101 eor v10.16b, v10.16b, v7.16b 102 eor v18.16b, v18.16b, v6.16b 103 and v9.16b, v9.16b, v16.16b 104 and v8.16b, v8.16b, v16.16b 105 and v10.16b, v10.16b, v16.16b 106 and v16.16b, v18.16b, v16.16b 107 eor v3.16b, v3.16b, v9.16b 108 shl v9.2d, v9.2d, #2 109 eor v2.16b, v2.16b, v8.16b 110 shl v8.2d, v8.2d, #2 111 eor v7.16b, v7.16b, v10.16b 112 shl v10.2d, v10.2d, #2 113 eor v6.16b, v6.16b, v16.16b 114 shl v16.2d, v16.2d, #2 115 eor v1.16b, v1.16b, v9.16b 116 eor v0.16b, v0.16b, v8.16b 117 eor v5.16b, v5.16b, v10.16b 118 eor v4.16b, v4.16b, v16.16b 119 ushr v8.2d, v3.2d, #4 120 ushr v9.2d, v2.2d, #4 121 ushr v10.2d, v1.2d, #4 122 ushr v16.2d, v0.2d, #4 123 eor v8.16b, v8.16b, v7.16b 124 eor v9.16b, v9.16b, v6.16b 125 eor v10.16b, v10.16b, v5.16b 126 eor v16.16b, v16.16b, v4.16b 127 and v8.16b, v8.16b, v17.16b 128 and v9.16b, v9.16b, v17.16b 129 and v10.16b, v10.16b, v17.16b 130 and v16.16b, v16.16b, v17.16b 131 eor v7.16b, v7.16b, v8.16b 132 shl v8.2d, v8.2d, #4 133 eor v6.16b, v6.16b, v9.16b 134 shl v9.2d, v9.2d, #4 135 eor v5.16b, v5.16b, v10.16b 136 shl v10.2d, v10.2d, #4 137 eor v4.16b, v4.16b, v16.16b 138 shl v16.2d, v16.2d, #4 139 eor v3.16b, v3.16b, v8.16b 140 eor v2.16b, v2.16b, v9.16b 141 eor v1.16b, v1.16b, v10.16b 142 eor v0.16b, v0.16b, v16.16b 143 b .Ldec_sbox 144.align 4 145.Ldec_loop: 146 ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x9], #64 147 ldp q8, q9, [x9], #32 148 eor v0.16b, v16.16b, v0.16b 149 ldr q10, [x9], #16 150 eor v1.16b, v17.16b, v1.16b 151 ldr q16, [x9], #16 152 eor v2.16b, v18.16b, v2.16b 153 eor v3.16b, v19.16b, v3.16b 154 eor v4.16b, v8.16b, v4.16b 155 eor v5.16b, v9.16b, v5.16b 156 eor v6.16b, v10.16b, v6.16b 157 eor v7.16b, v16.16b, v7.16b 158 tbl v0.16b, {v0.16b}, v28.16b 159 tbl v1.16b, {v1.16b}, v28.16b 160 tbl v2.16b, {v2.16b}, v28.16b 161 tbl v3.16b, {v3.16b}, v28.16b 162 tbl v4.16b, {v4.16b}, v28.16b 163 tbl v5.16b, {v5.16b}, v28.16b 164 tbl v6.16b, {v6.16b}, v28.16b 165 tbl v7.16b, {v7.16b}, v28.16b 166.Ldec_sbox: 167 eor v1.16b, v1.16b, v4.16b 168 eor v3.16b, v3.16b, v4.16b 169 subs x10, x10, #1 170 eor v4.16b, v4.16b, v7.16b 171 eor v2.16b, v2.16b, v7.16b 172 eor v1.16b, v1.16b, v6.16b 173 eor v6.16b, v6.16b, v4.16b 174 eor v2.16b, v2.16b, v5.16b 175 eor v0.16b, v0.16b, v1.16b 176 eor v7.16b, v7.16b, v6.16b 177 eor v8.16b, v6.16b, v2.16b 178 and v9.16b, v4.16b, v6.16b 179 eor v10.16b, v2.16b, v6.16b 180 eor v3.16b, v3.16b, v0.16b 181 eor v5.16b, v5.16b, v0.16b 182 eor v16.16b, v7.16b, v4.16b 183 eor v17.16b, v4.16b, v0.16b 184 and v18.16b, v0.16b, v2.16b 185 eor v19.16b, v7.16b, v4.16b 186 eor v1.16b, v1.16b, v3.16b 187 eor v20.16b, v3.16b, v0.16b 188 eor v21.16b, v5.16b, v2.16b 189 eor v22.16b, v3.16b, v7.16b 190 and v8.16b, v17.16b, v8.16b 191 orr v17.16b, v3.16b, v5.16b 192 eor v23.16b, v1.16b, v6.16b 193 eor v24.16b, v20.16b, v16.16b 194 eor v25.16b, v1.16b, v5.16b 195 orr v26.16b, v20.16b, v21.16b 196 and v20.16b, v20.16b, v21.16b 197 and v27.16b, v7.16b, v1.16b 198 eor v21.16b, v21.16b, v23.16b 199 orr v28.16b, v16.16b, v23.16b 200 orr v29.16b, v22.16b, v25.16b 201 eor v26.16b, v26.16b, v8.16b 202 and v16.16b, v16.16b, v23.16b 203 and v22.16b, v22.16b, v25.16b 204 and v21.16b, v24.16b, v21.16b 205 eor v8.16b, v28.16b, v8.16b 206 eor v23.16b, v5.16b, v2.16b 207 eor v24.16b, v1.16b, v6.16b 208 eor v16.16b, v16.16b, v22.16b 209 eor v22.16b, v3.16b, v0.16b 210 eor v25.16b, v29.16b, v21.16b 211 eor v21.16b, v26.16b, v21.16b 212 eor v8.16b, v8.16b, v20.16b 213 eor v26.16b, v23.16b, v24.16b 214 eor v16.16b, v16.16b, v20.16b 215 eor v28.16b, v22.16b, v19.16b 216 eor v20.16b, v25.16b, v20.16b 217 eor v9.16b, v21.16b, v9.16b 218 eor v8.16b, v8.16b, v18.16b 219 eor v18.16b, v5.16b, v1.16b 220 eor v21.16b, v16.16b, v17.16b 221 eor v16.16b, v16.16b, v17.16b 222 eor v17.16b, v20.16b, v27.16b 223 eor v20.16b, v3.16b, v7.16b 224 eor v25.16b, v9.16b, v8.16b 225 eor v27.16b, v0.16b, v4.16b 226 and v29.16b, v9.16b, v17.16b 227 eor v30.16b, v8.16b, v29.16b 228 eor v31.16b, v21.16b, v29.16b 229 eor v29.16b, v21.16b, v29.16b 230 bsl v30.16b, v17.16b, v21.16b 231 bsl v31.16b, v9.16b, v8.16b 232 bsl v16.16b, v30.16b, v29.16b 233 bsl v21.16b, v29.16b, v30.16b 234 eor v8.16b, v31.16b, v30.16b 235 and v1.16b, v1.16b, v31.16b 236 and v9.16b, v16.16b, v31.16b 237 and v6.16b, v6.16b, v30.16b 238 eor v16.16b, v17.16b, v21.16b 239 and v4.16b, v4.16b, v30.16b 240 eor v17.16b, v8.16b, v30.16b 241 and v21.16b, v24.16b, v8.16b 242 eor v9.16b, v9.16b, v25.16b 243 and v19.16b, v19.16b, v8.16b 244 eor v24.16b, v30.16b, v16.16b 245 eor v25.16b, v30.16b, v16.16b 246 and v7.16b, v7.16b, v17.16b 247 and v10.16b, v10.16b, v16.16b 248 eor v29.16b, v9.16b, v16.16b 249 eor v30.16b, v31.16b, v9.16b 250 and v0.16b, v24.16b, v0.16b 251 and v9.16b, v18.16b, v9.16b 252 and v2.16b, v25.16b, v2.16b 253 eor v10.16b, v10.16b, v6.16b 254 eor v18.16b, v29.16b, v16.16b 255 and v5.16b, v30.16b, v5.16b 256 eor v24.16b, v8.16b, v29.16b 257 and v25.16b, v26.16b, v29.16b 258 and v26.16b, v28.16b, v29.16b 259 eor v8.16b, v8.16b, v29.16b 260 eor v17.16b, v17.16b, v18.16b 261 eor v5.16b, v1.16b, v5.16b 262 and v23.16b, v24.16b, v23.16b 263 eor v21.16b, v21.16b, v25.16b 264 eor v19.16b, v19.16b, v26.16b 265 eor v0.16b, v4.16b, v0.16b 266 and v3.16b, v17.16b, v3.16b 267 eor v1.16b, v9.16b, v1.16b 268 eor v9.16b, v25.16b, v23.16b 269 eor v5.16b, v5.16b, v21.16b 270 eor v2.16b, v6.16b, v2.16b 271 and v6.16b, v8.16b, v22.16b 272 eor v3.16b, v7.16b, v3.16b 273 and v8.16b, v20.16b, v18.16b 274 eor v10.16b, v10.16b, v9.16b 275 eor v0.16b, v0.16b, v19.16b 276 eor v9.16b, v1.16b, v9.16b 277 eor v1.16b, v2.16b, v21.16b 278 eor v3.16b, v3.16b, v19.16b 279 and v16.16b, v27.16b, v16.16b 280 eor v17.16b, v26.16b, v6.16b 281 eor v6.16b, v8.16b, v7.16b 282 eor v7.16b, v1.16b, v9.16b 283 eor v1.16b, v5.16b, v3.16b 284 eor v2.16b, v10.16b, v3.16b 285 eor v4.16b, v16.16b, v4.16b 286 eor v8.16b, v6.16b, v17.16b 287 eor v5.16b, v9.16b, v3.16b 288 eor v9.16b, v0.16b, v1.16b 289 eor v6.16b, v7.16b, v1.16b 290 eor v0.16b, v4.16b, v17.16b 291 eor v4.16b, v8.16b, v7.16b 292 eor v7.16b, v9.16b, v2.16b 293 eor v8.16b, v3.16b, v0.16b 294 eor v7.16b, v7.16b, v5.16b 295 eor v3.16b, v4.16b, v7.16b 296 eor v4.16b, v7.16b, v0.16b 297 eor v7.16b, v8.16b, v3.16b 298 bcc .Ldec_done 299 ext v8.16b, v0.16b, v0.16b, #8 300 ext v9.16b, v1.16b, v1.16b, #8 301 ldr q28, [x11] // load from .LISR in common case (x10 > 0) 302 ext v10.16b, v6.16b, v6.16b, #8 303 ext v16.16b, v3.16b, v3.16b, #8 304 ext v17.16b, v5.16b, v5.16b, #8 305 ext v18.16b, v4.16b, v4.16b, #8 306 eor v8.16b, v8.16b, v0.16b 307 eor v9.16b, v9.16b, v1.16b 308 eor v10.16b, v10.16b, v6.16b 309 eor v16.16b, v16.16b, v3.16b 310 eor v17.16b, v17.16b, v5.16b 311 ext v19.16b, v2.16b, v2.16b, #8 312 ext v20.16b, v7.16b, v7.16b, #8 313 eor v18.16b, v18.16b, v4.16b 314 eor v6.16b, v6.16b, v8.16b 315 eor v8.16b, v2.16b, v10.16b 316 eor v4.16b, v4.16b, v9.16b 317 eor v2.16b, v19.16b, v2.16b 318 eor v9.16b, v20.16b, v7.16b 319 eor v0.16b, v0.16b, v16.16b 320 eor v1.16b, v1.16b, v16.16b 321 eor v6.16b, v6.16b, v17.16b 322 eor v8.16b, v8.16b, v16.16b 323 eor v7.16b, v7.16b, v18.16b 324 eor v4.16b, v4.16b, v16.16b 325 eor v2.16b, v3.16b, v2.16b 326 eor v1.16b, v1.16b, v17.16b 327 eor v3.16b, v5.16b, v9.16b 328 eor v5.16b, v8.16b, v17.16b 329 eor v7.16b, v7.16b, v17.16b 330 ext v8.16b, v0.16b, v0.16b, #12 331 ext v9.16b, v6.16b, v6.16b, #12 332 ext v10.16b, v4.16b, v4.16b, #12 333 ext v16.16b, v1.16b, v1.16b, #12 334 ext v17.16b, v5.16b, v5.16b, #12 335 ext v18.16b, v7.16b, v7.16b, #12 336 eor v0.16b, v0.16b, v8.16b 337 eor v6.16b, v6.16b, v9.16b 338 eor v4.16b, v4.16b, v10.16b 339 ext v19.16b, v2.16b, v2.16b, #12 340 ext v20.16b, v3.16b, v3.16b, #12 341 eor v1.16b, v1.16b, v16.16b 342 eor v5.16b, v5.16b, v17.16b 343 eor v7.16b, v7.16b, v18.16b 344 eor v2.16b, v2.16b, v19.16b 345 eor v16.16b, v16.16b, v0.16b 346 eor v3.16b, v3.16b, v20.16b 347 eor v17.16b, v17.16b, v4.16b 348 eor v10.16b, v10.16b, v6.16b 349 ext v0.16b, v0.16b, v0.16b, #8 350 eor v9.16b, v9.16b, v1.16b 351 ext v1.16b, v1.16b, v1.16b, #8 352 eor v8.16b, v8.16b, v3.16b 353 eor v16.16b, v16.16b, v3.16b 354 eor v18.16b, v18.16b, v5.16b 355 eor v19.16b, v19.16b, v7.16b 356 ext v21.16b, v5.16b, v5.16b, #8 357 ext v5.16b, v7.16b, v7.16b, #8 358 eor v7.16b, v20.16b, v2.16b 359 ext v4.16b, v4.16b, v4.16b, #8 360 ext v20.16b, v3.16b, v3.16b, #8 361 eor v17.16b, v17.16b, v3.16b 362 ext v2.16b, v2.16b, v2.16b, #8 363 eor v3.16b, v10.16b, v3.16b 364 ext v10.16b, v6.16b, v6.16b, #8 365 eor v0.16b, v0.16b, v8.16b 366 eor v1.16b, v1.16b, v16.16b 367 eor v5.16b, v5.16b, v18.16b 368 eor v3.16b, v3.16b, v4.16b 369 eor v7.16b, v20.16b, v7.16b 370 eor v6.16b, v2.16b, v19.16b 371 eor v4.16b, v21.16b, v17.16b 372 eor v2.16b, v10.16b, v9.16b 373 bne .Ldec_loop 374 ldr q28, [x11, #16]! // load from .LISRM0 on last round (x10 == 0) 375 b .Ldec_loop 376.align 4 377.Ldec_done: 378 ushr v8.2d, v0.2d, #1 379 movi v9.16b, #0x55 380 ldr q10, [x9] 381 ushr v16.2d, v2.2d, #1 382 movi v17.16b, #0x33 383 ushr v18.2d, v6.2d, #1 384 movi v19.16b, #0x0f 385 eor v8.16b, v8.16b, v1.16b 386 ushr v20.2d, v3.2d, #1 387 eor v16.16b, v16.16b, v7.16b 388 eor v18.16b, v18.16b, v4.16b 389 and v8.16b, v8.16b, v9.16b 390 eor v20.16b, v20.16b, v5.16b 391 and v16.16b, v16.16b, v9.16b 392 and v18.16b, v18.16b, v9.16b 393 shl v21.2d, v8.2d, #1 394 eor v1.16b, v1.16b, v8.16b 395 and v8.16b, v20.16b, v9.16b 396 eor v7.16b, v7.16b, v16.16b 397 shl v9.2d, v16.2d, #1 398 eor v4.16b, v4.16b, v18.16b 399 shl v16.2d, v18.2d, #1 400 eor v0.16b, v0.16b, v21.16b 401 shl v18.2d, v8.2d, #1 402 eor v5.16b, v5.16b, v8.16b 403 eor v2.16b, v2.16b, v9.16b 404 eor v6.16b, v6.16b, v16.16b 405 ushr v8.2d, v1.2d, #2 406 eor v3.16b, v3.16b, v18.16b 407 ushr v9.2d, v0.2d, #2 408 ushr v16.2d, v7.2d, #2 409 ushr v18.2d, v2.2d, #2 410 eor v8.16b, v8.16b, v4.16b 411 eor v9.16b, v9.16b, v6.16b 412 eor v16.16b, v16.16b, v5.16b 413 eor v18.16b, v18.16b, v3.16b 414 and v8.16b, v8.16b, v17.16b 415 and v9.16b, v9.16b, v17.16b 416 and v16.16b, v16.16b, v17.16b 417 and v17.16b, v18.16b, v17.16b 418 eor v4.16b, v4.16b, v8.16b 419 shl v8.2d, v8.2d, #2 420 eor v6.16b, v6.16b, v9.16b 421 shl v9.2d, v9.2d, #2 422 eor v5.16b, v5.16b, v16.16b 423 shl v16.2d, v16.2d, #2 424 eor v3.16b, v3.16b, v17.16b 425 shl v17.2d, v17.2d, #2 426 eor v1.16b, v1.16b, v8.16b 427 eor v0.16b, v0.16b, v9.16b 428 eor v7.16b, v7.16b, v16.16b 429 eor v2.16b, v2.16b, v17.16b 430 ushr v8.2d, v4.2d, #4 431 ushr v9.2d, v6.2d, #4 432 ushr v16.2d, v1.2d, #4 433 ushr v17.2d, v0.2d, #4 434 eor v8.16b, v8.16b, v5.16b 435 eor v9.16b, v9.16b, v3.16b 436 eor v16.16b, v16.16b, v7.16b 437 eor v17.16b, v17.16b, v2.16b 438 and v8.16b, v8.16b, v19.16b 439 and v9.16b, v9.16b, v19.16b 440 and v16.16b, v16.16b, v19.16b 441 and v17.16b, v17.16b, v19.16b 442 eor v5.16b, v5.16b, v8.16b 443 shl v8.2d, v8.2d, #4 444 eor v3.16b, v3.16b, v9.16b 445 shl v9.2d, v9.2d, #4 446 eor v7.16b, v7.16b, v16.16b 447 shl v16.2d, v16.2d, #4 448 eor v2.16b, v2.16b, v17.16b 449 shl v17.2d, v17.2d, #4 450 eor v4.16b, v4.16b, v8.16b 451 eor v6.16b, v6.16b, v9.16b 452 eor v7.16b, v7.16b, v10.16b 453 eor v1.16b, v1.16b, v16.16b 454 eor v2.16b, v2.16b, v10.16b 455 eor v0.16b, v0.16b, v17.16b 456 eor v4.16b, v4.16b, v10.16b 457 eor v6.16b, v6.16b, v10.16b 458 eor v3.16b, v3.16b, v10.16b 459 eor v5.16b, v5.16b, v10.16b 460 eor v1.16b, v1.16b, v10.16b 461 eor v0.16b, v0.16b, v10.16b 462 ret 463.size _bsaes_decrypt8,.-_bsaes_decrypt8 464 465.section .rodata 466.type _bsaes_consts,%object 467.align 6 468_bsaes_consts: 469// InvShiftRows constants 470// Used in _bsaes_decrypt8, which assumes contiguity 471// .LM0ISR used with round 0 key 472// .LISR used with middle round keys 473// .LISRM0 used with final round key 474.LM0ISR: 475.quad 0x0a0e0206070b0f03, 0x0004080c0d010509 476.LISR: 477.quad 0x0504070602010003, 0x0f0e0d0c080b0a09 478.LISRM0: 479.quad 0x01040b0e0205080f, 0x0306090c00070a0d 480 481// ShiftRows constants 482// Used in _bsaes_encrypt8, which assumes contiguity 483// .LM0SR used with round 0 key 484// .LSR used with middle round keys 485// .LSRM0 used with final round key 486.LM0SR: 487.quad 0x0a0e02060f03070b, 0x0004080c05090d01 488.LSR: 489.quad 0x0504070600030201, 0x0f0e0d0c0a09080b 490.LSRM0: 491.quad 0x0304090e00050a0f, 0x01060b0c0207080d 492 493.LM0_bigendian: 494.quad 0x02060a0e03070b0f, 0x0004080c0105090d 495.LM0_littleendian: 496.quad 0x0105090d0004080c, 0x03070b0f02060a0e 497 498// Used in ossl_bsaes_ctr32_encrypt_blocks, prior to dropping into 499// _bsaes_encrypt8_alt, for round 0 key in place of .LM0SR 500.LREVM0SR: 501.quad 0x090d01050c000408, 0x03070b0f060a0e02 502 503.align 6 504.size _bsaes_consts,.-_bsaes_consts 505 506.previous 507 508.type _bsaes_encrypt8,%function 509.align 4 510// On entry: 511// x9 -> key (previously expanded using _bsaes_key_convert) 512// x10 = number of rounds 513// v0-v7 input data 514// On exit: 515// x9-x11 corrupted 516// other general-purpose registers preserved 517// v0-v7 output data 518// v11-v15 preserved 519// other SIMD registers corrupted 520_bsaes_encrypt8: 521 ldr q8, [x9], #16 522 adrp x11, .LM0SR 523 add x11, x11, #:lo12:.LM0SR 524 ldr q9, [x11], #16 525_bsaes_encrypt8_alt: 526 eor v0.16b, v0.16b, v8.16b 527 eor v1.16b, v1.16b, v8.16b 528 sub x10, x10, #1 529 eor v2.16b, v2.16b, v8.16b 530 eor v4.16b, v4.16b, v8.16b 531 eor v3.16b, v3.16b, v8.16b 532 eor v5.16b, v5.16b, v8.16b 533 tbl v0.16b, {v0.16b}, v9.16b 534 tbl v1.16b, {v1.16b}, v9.16b 535 tbl v2.16b, {v2.16b}, v9.16b 536 tbl v4.16b, {v4.16b}, v9.16b 537 eor v6.16b, v6.16b, v8.16b 538 eor v7.16b, v7.16b, v8.16b 539 tbl v3.16b, {v3.16b}, v9.16b 540 tbl v5.16b, {v5.16b}, v9.16b 541 tbl v6.16b, {v6.16b}, v9.16b 542 ushr v8.2d, v0.2d, #1 543 movi v10.16b, #0x55 544 tbl v7.16b, {v7.16b}, v9.16b 545 ushr v9.2d, v4.2d, #1 546 movi v16.16b, #0x33 547 ushr v17.2d, v2.2d, #1 548 eor v8.16b, v8.16b, v1.16b 549 movi v18.16b, #0x0f 550 ushr v19.2d, v6.2d, #1 551 eor v9.16b, v9.16b, v5.16b 552 eor v17.16b, v17.16b, v3.16b 553 and v8.16b, v8.16b, v10.16b 554 eor v19.16b, v19.16b, v7.16b 555 and v9.16b, v9.16b, v10.16b 556 and v17.16b, v17.16b, v10.16b 557 eor v1.16b, v1.16b, v8.16b 558 shl v8.2d, v8.2d, #1 559 and v10.16b, v19.16b, v10.16b 560 eor v5.16b, v5.16b, v9.16b 561 shl v9.2d, v9.2d, #1 562 eor v3.16b, v3.16b, v17.16b 563 shl v17.2d, v17.2d, #1 564 eor v0.16b, v0.16b, v8.16b 565 shl v8.2d, v10.2d, #1 566 eor v7.16b, v7.16b, v10.16b 567 eor v4.16b, v4.16b, v9.16b 568 eor v2.16b, v2.16b, v17.16b 569 ushr v9.2d, v1.2d, #2 570 eor v6.16b, v6.16b, v8.16b 571 ushr v8.2d, v0.2d, #2 572 ushr v10.2d, v5.2d, #2 573 ushr v17.2d, v4.2d, #2 574 eor v9.16b, v9.16b, v3.16b 575 eor v8.16b, v8.16b, v2.16b 576 eor v10.16b, v10.16b, v7.16b 577 eor v17.16b, v17.16b, v6.16b 578 and v9.16b, v9.16b, v16.16b 579 and v8.16b, v8.16b, v16.16b 580 and v10.16b, v10.16b, v16.16b 581 and v16.16b, v17.16b, v16.16b 582 eor v3.16b, v3.16b, v9.16b 583 shl v9.2d, v9.2d, #2 584 eor v2.16b, v2.16b, v8.16b 585 shl v8.2d, v8.2d, #2 586 eor v7.16b, v7.16b, v10.16b 587 shl v10.2d, v10.2d, #2 588 eor v6.16b, v6.16b, v16.16b 589 shl v16.2d, v16.2d, #2 590 eor v1.16b, v1.16b, v9.16b 591 eor v0.16b, v0.16b, v8.16b 592 eor v5.16b, v5.16b, v10.16b 593 eor v4.16b, v4.16b, v16.16b 594 ushr v8.2d, v3.2d, #4 595 ushr v9.2d, v2.2d, #4 596 ushr v10.2d, v1.2d, #4 597 ushr v16.2d, v0.2d, #4 598 eor v8.16b, v8.16b, v7.16b 599 eor v9.16b, v9.16b, v6.16b 600 eor v10.16b, v10.16b, v5.16b 601 eor v16.16b, v16.16b, v4.16b 602 and v8.16b, v8.16b, v18.16b 603 and v9.16b, v9.16b, v18.16b 604 and v10.16b, v10.16b, v18.16b 605 and v16.16b, v16.16b, v18.16b 606 eor v7.16b, v7.16b, v8.16b 607 shl v8.2d, v8.2d, #4 608 eor v6.16b, v6.16b, v9.16b 609 shl v9.2d, v9.2d, #4 610 eor v5.16b, v5.16b, v10.16b 611 shl v10.2d, v10.2d, #4 612 eor v4.16b, v4.16b, v16.16b 613 shl v16.2d, v16.2d, #4 614 eor v3.16b, v3.16b, v8.16b 615 eor v2.16b, v2.16b, v9.16b 616 eor v1.16b, v1.16b, v10.16b 617 eor v0.16b, v0.16b, v16.16b 618 b .Lenc_sbox 619.align 4 620.Lenc_loop: 621 ld1 {v16.16b, v17.16b, v18.16b, v19.16b}, [x9], #64 622 ldp q8, q9, [x9], #32 623 eor v0.16b, v16.16b, v0.16b 624 ldr q10, [x9], #16 625 eor v1.16b, v17.16b, v1.16b 626 ldr q16, [x9], #16 627 eor v2.16b, v18.16b, v2.16b 628 eor v3.16b, v19.16b, v3.16b 629 eor v4.16b, v8.16b, v4.16b 630 eor v5.16b, v9.16b, v5.16b 631 eor v6.16b, v10.16b, v6.16b 632 eor v7.16b, v16.16b, v7.16b 633 tbl v0.16b, {v0.16b}, v28.16b 634 tbl v1.16b, {v1.16b}, v28.16b 635 tbl v2.16b, {v2.16b}, v28.16b 636 tbl v3.16b, {v3.16b}, v28.16b 637 tbl v4.16b, {v4.16b}, v28.16b 638 tbl v5.16b, {v5.16b}, v28.16b 639 tbl v6.16b, {v6.16b}, v28.16b 640 tbl v7.16b, {v7.16b}, v28.16b 641.Lenc_sbox: 642 eor v5.16b, v5.16b, v6.16b 643 eor v3.16b, v3.16b, v0.16b 644 subs x10, x10, #1 645 eor v2.16b, v2.16b, v1.16b 646 eor v5.16b, v5.16b, v0.16b 647 eor v8.16b, v3.16b, v7.16b 648 eor v6.16b, v6.16b, v2.16b 649 eor v7.16b, v7.16b, v5.16b 650 eor v8.16b, v8.16b, v4.16b 651 eor v3.16b, v6.16b, v3.16b 652 eor v4.16b, v4.16b, v5.16b 653 eor v6.16b, v1.16b, v5.16b 654 eor v2.16b, v2.16b, v7.16b 655 eor v1.16b, v8.16b, v1.16b 656 eor v8.16b, v7.16b, v4.16b 657 eor v9.16b, v3.16b, v0.16b 658 eor v10.16b, v7.16b, v6.16b 659 eor v16.16b, v5.16b, v3.16b 660 eor v17.16b, v6.16b, v2.16b 661 eor v18.16b, v5.16b, v1.16b 662 eor v19.16b, v2.16b, v4.16b 663 eor v20.16b, v1.16b, v0.16b 664 orr v21.16b, v8.16b, v9.16b 665 orr v22.16b, v10.16b, v16.16b 666 eor v23.16b, v8.16b, v17.16b 667 eor v24.16b, v9.16b, v18.16b 668 and v19.16b, v19.16b, v20.16b 669 orr v20.16b, v17.16b, v18.16b 670 and v8.16b, v8.16b, v9.16b 671 and v9.16b, v17.16b, v18.16b 672 and v17.16b, v23.16b, v24.16b 673 and v10.16b, v10.16b, v16.16b 674 eor v16.16b, v21.16b, v19.16b 675 eor v18.16b, v20.16b, v19.16b 676 and v19.16b, v2.16b, v1.16b 677 and v20.16b, v6.16b, v5.16b 678 eor v21.16b, v22.16b, v17.16b 679 eor v9.16b, v9.16b, v10.16b 680 eor v10.16b, v16.16b, v17.16b 681 eor v16.16b, v18.16b, v8.16b 682 and v17.16b, v4.16b, v0.16b 683 orr v18.16b, v7.16b, v3.16b 684 eor v21.16b, v21.16b, v8.16b 685 eor v8.16b, v9.16b, v8.16b 686 eor v9.16b, v10.16b, v19.16b 687 eor v10.16b, v3.16b, v0.16b 688 eor v16.16b, v16.16b, v17.16b 689 eor v17.16b, v5.16b, v1.16b 690 eor v19.16b, v21.16b, v20.16b 691 eor v20.16b, v8.16b, v18.16b 692 eor v8.16b, v8.16b, v18.16b 693 eor v18.16b, v7.16b, v4.16b 694 eor v21.16b, v9.16b, v16.16b 695 eor v22.16b, v6.16b, v2.16b 696 and v23.16b, v9.16b, v19.16b 697 eor v24.16b, v10.16b, v17.16b 698 eor v25.16b, v0.16b, v1.16b 699 eor v26.16b, v7.16b, v6.16b 700 eor v27.16b, v18.16b, v22.16b 701 eor v28.16b, v3.16b, v5.16b 702 eor v29.16b, v16.16b, v23.16b 703 eor v30.16b, v20.16b, v23.16b 704 eor v23.16b, v20.16b, v23.16b 705 eor v31.16b, v4.16b, v2.16b 706 bsl v29.16b, v19.16b, v20.16b 707 bsl v30.16b, v9.16b, v16.16b 708 bsl v8.16b, v29.16b, v23.16b 709 bsl v20.16b, v23.16b, v29.16b 710 eor v9.16b, v30.16b, v29.16b 711 and v5.16b, v5.16b, v30.16b 712 and v8.16b, v8.16b, v30.16b 713 and v1.16b, v1.16b, v29.16b 714 eor v16.16b, v19.16b, v20.16b 715 and v2.16b, v2.16b, v29.16b 716 eor v19.16b, v9.16b, v29.16b 717 and v17.16b, v17.16b, v9.16b 718 eor v8.16b, v8.16b, v21.16b 719 and v20.16b, v22.16b, v9.16b 720 eor v21.16b, v29.16b, v16.16b 721 eor v22.16b, v29.16b, v16.16b 722 and v23.16b, v25.16b, v16.16b 723 and v6.16b, v6.16b, v19.16b 724 eor v25.16b, v8.16b, v16.16b 725 eor v29.16b, v30.16b, v8.16b 726 and v4.16b, v21.16b, v4.16b 727 and v8.16b, v28.16b, v8.16b 728 and v0.16b, v22.16b, v0.16b 729 eor v21.16b, v23.16b, v1.16b 730 eor v22.16b, v9.16b, v25.16b 731 eor v9.16b, v9.16b, v25.16b 732 eor v23.16b, v25.16b, v16.16b 733 and v3.16b, v29.16b, v3.16b 734 and v24.16b, v24.16b, v25.16b 735 and v25.16b, v27.16b, v25.16b 736 and v10.16b, v22.16b, v10.16b 737 and v9.16b, v9.16b, v18.16b 738 eor v18.16b, v19.16b, v23.16b 739 and v19.16b, v26.16b, v23.16b 740 eor v3.16b, v5.16b, v3.16b 741 eor v17.16b, v17.16b, v24.16b 742 eor v10.16b, v24.16b, v10.16b 743 and v16.16b, v31.16b, v16.16b 744 eor v20.16b, v20.16b, v25.16b 745 eor v9.16b, v25.16b, v9.16b 746 eor v4.16b, v2.16b, v4.16b 747 and v7.16b, v18.16b, v7.16b 748 eor v18.16b, v19.16b, v6.16b 749 eor v5.16b, v8.16b, v5.16b 750 eor v0.16b, v1.16b, v0.16b 751 eor v1.16b, v21.16b, v10.16b 752 eor v8.16b, v3.16b, v17.16b 753 eor v2.16b, v16.16b, v2.16b 754 eor v3.16b, v6.16b, v7.16b 755 eor v6.16b, v18.16b, v9.16b 756 eor v4.16b, v4.16b, v20.16b 757 eor v10.16b, v5.16b, v10.16b 758 eor v0.16b, v0.16b, v17.16b 759 eor v9.16b, v2.16b, v9.16b 760 eor v3.16b, v3.16b, v20.16b 761 eor v7.16b, v6.16b, v1.16b 762 eor v5.16b, v8.16b, v4.16b 763 eor v6.16b, v10.16b, v1.16b 764 eor v2.16b, v4.16b, v0.16b 765 eor v4.16b, v3.16b, v10.16b 766 eor v9.16b, v9.16b, v7.16b 767 eor v3.16b, v0.16b, v5.16b 768 eor v0.16b, v1.16b, v4.16b 769 eor v1.16b, v4.16b, v8.16b 770 eor v4.16b, v9.16b, v5.16b 771 eor v6.16b, v6.16b, v3.16b 772 bcc .Lenc_done 773 ext v8.16b, v0.16b, v0.16b, #12 774 ext v9.16b, v4.16b, v4.16b, #12 775 ldr q28, [x11] 776 ext v10.16b, v6.16b, v6.16b, #12 777 ext v16.16b, v1.16b, v1.16b, #12 778 ext v17.16b, v3.16b, v3.16b, #12 779 ext v18.16b, v7.16b, v7.16b, #12 780 eor v0.16b, v0.16b, v8.16b 781 eor v4.16b, v4.16b, v9.16b 782 eor v6.16b, v6.16b, v10.16b 783 ext v19.16b, v2.16b, v2.16b, #12 784 ext v20.16b, v5.16b, v5.16b, #12 785 eor v1.16b, v1.16b, v16.16b 786 eor v3.16b, v3.16b, v17.16b 787 eor v7.16b, v7.16b, v18.16b 788 eor v2.16b, v2.16b, v19.16b 789 eor v16.16b, v16.16b, v0.16b 790 eor v5.16b, v5.16b, v20.16b 791 eor v17.16b, v17.16b, v6.16b 792 eor v10.16b, v10.16b, v4.16b 793 ext v0.16b, v0.16b, v0.16b, #8 794 eor v9.16b, v9.16b, v1.16b 795 ext v1.16b, v1.16b, v1.16b, #8 796 eor v8.16b, v8.16b, v5.16b 797 eor v16.16b, v16.16b, v5.16b 798 eor v18.16b, v18.16b, v3.16b 799 eor v19.16b, v19.16b, v7.16b 800 ext v3.16b, v3.16b, v3.16b, #8 801 ext v7.16b, v7.16b, v7.16b, #8 802 eor v20.16b, v20.16b, v2.16b 803 ext v6.16b, v6.16b, v6.16b, #8 804 ext v21.16b, v5.16b, v5.16b, #8 805 eor v17.16b, v17.16b, v5.16b 806 ext v2.16b, v2.16b, v2.16b, #8 807 eor v10.16b, v10.16b, v5.16b 808 ext v22.16b, v4.16b, v4.16b, #8 809 eor v0.16b, v0.16b, v8.16b 810 eor v1.16b, v1.16b, v16.16b 811 eor v5.16b, v7.16b, v18.16b 812 eor v4.16b, v3.16b, v17.16b 813 eor v3.16b, v6.16b, v10.16b 814 eor v7.16b, v21.16b, v20.16b 815 eor v6.16b, v2.16b, v19.16b 816 eor v2.16b, v22.16b, v9.16b 817 bne .Lenc_loop 818 ldr q28, [x11, #16]! // load from .LSRM0 on last round (x10 == 0) 819 b .Lenc_loop 820.align 4 821.Lenc_done: 822 ushr v8.2d, v0.2d, #1 823 movi v9.16b, #0x55 824 ldr q10, [x9] 825 ushr v16.2d, v3.2d, #1 826 movi v17.16b, #0x33 827 ushr v18.2d, v4.2d, #1 828 movi v19.16b, #0x0f 829 eor v8.16b, v8.16b, v1.16b 830 ushr v20.2d, v2.2d, #1 831 eor v16.16b, v16.16b, v7.16b 832 eor v18.16b, v18.16b, v6.16b 833 and v8.16b, v8.16b, v9.16b 834 eor v20.16b, v20.16b, v5.16b 835 and v16.16b, v16.16b, v9.16b 836 and v18.16b, v18.16b, v9.16b 837 shl v21.2d, v8.2d, #1 838 eor v1.16b, v1.16b, v8.16b 839 and v8.16b, v20.16b, v9.16b 840 eor v7.16b, v7.16b, v16.16b 841 shl v9.2d, v16.2d, #1 842 eor v6.16b, v6.16b, v18.16b 843 shl v16.2d, v18.2d, #1 844 eor v0.16b, v0.16b, v21.16b 845 shl v18.2d, v8.2d, #1 846 eor v5.16b, v5.16b, v8.16b 847 eor v3.16b, v3.16b, v9.16b 848 eor v4.16b, v4.16b, v16.16b 849 ushr v8.2d, v1.2d, #2 850 eor v2.16b, v2.16b, v18.16b 851 ushr v9.2d, v0.2d, #2 852 ushr v16.2d, v7.2d, #2 853 ushr v18.2d, v3.2d, #2 854 eor v8.16b, v8.16b, v6.16b 855 eor v9.16b, v9.16b, v4.16b 856 eor v16.16b, v16.16b, v5.16b 857 eor v18.16b, v18.16b, v2.16b 858 and v8.16b, v8.16b, v17.16b 859 and v9.16b, v9.16b, v17.16b 860 and v16.16b, v16.16b, v17.16b 861 and v17.16b, v18.16b, v17.16b 862 eor v6.16b, v6.16b, v8.16b 863 shl v8.2d, v8.2d, #2 864 eor v4.16b, v4.16b, v9.16b 865 shl v9.2d, v9.2d, #2 866 eor v5.16b, v5.16b, v16.16b 867 shl v16.2d, v16.2d, #2 868 eor v2.16b, v2.16b, v17.16b 869 shl v17.2d, v17.2d, #2 870 eor v1.16b, v1.16b, v8.16b 871 eor v0.16b, v0.16b, v9.16b 872 eor v7.16b, v7.16b, v16.16b 873 eor v3.16b, v3.16b, v17.16b 874 ushr v8.2d, v6.2d, #4 875 ushr v9.2d, v4.2d, #4 876 ushr v16.2d, v1.2d, #4 877 ushr v17.2d, v0.2d, #4 878 eor v8.16b, v8.16b, v5.16b 879 eor v9.16b, v9.16b, v2.16b 880 eor v16.16b, v16.16b, v7.16b 881 eor v17.16b, v17.16b, v3.16b 882 and v8.16b, v8.16b, v19.16b 883 and v9.16b, v9.16b, v19.16b 884 and v16.16b, v16.16b, v19.16b 885 and v17.16b, v17.16b, v19.16b 886 eor v5.16b, v5.16b, v8.16b 887 shl v8.2d, v8.2d, #4 888 eor v2.16b, v2.16b, v9.16b 889 shl v9.2d, v9.2d, #4 890 eor v7.16b, v7.16b, v16.16b 891 shl v16.2d, v16.2d, #4 892 eor v3.16b, v3.16b, v17.16b 893 shl v17.2d, v17.2d, #4 894 eor v6.16b, v6.16b, v8.16b 895 eor v4.16b, v4.16b, v9.16b 896 eor v7.16b, v7.16b, v10.16b 897 eor v1.16b, v1.16b, v16.16b 898 eor v3.16b, v3.16b, v10.16b 899 eor v0.16b, v0.16b, v17.16b 900 eor v6.16b, v6.16b, v10.16b 901 eor v4.16b, v4.16b, v10.16b 902 eor v2.16b, v2.16b, v10.16b 903 eor v5.16b, v5.16b, v10.16b 904 eor v1.16b, v1.16b, v10.16b 905 eor v0.16b, v0.16b, v10.16b 906 ret 907.size _bsaes_encrypt8,.-_bsaes_encrypt8 908 909.type _bsaes_key_convert,%function 910.align 4 911// On entry: 912// x9 -> input key (big-endian) 913// x10 = number of rounds 914// x17 -> output key (native endianness) 915// On exit: 916// x9, x10 corrupted 917// x11 -> .LM0_bigendian 918// x17 -> last quadword of output key 919// other general-purpose registers preserved 920// v2-v6 preserved 921// v7.16b[] = 0x63 922// v8-v14 preserved 923// v15 = last round key (converted to native endianness) 924// other SIMD registers corrupted 925_bsaes_key_convert: 926#ifdef __AARCH64EL__ 927 adrp x11, .LM0_littleendian 928 add x11, x11, #:lo12:.LM0_littleendian 929#else 930 adrp x11, .LM0_bigendian 931 add x11, x11, #:lo12:.LM0_bigendian 932#endif 933 ldr q0, [x9], #16 // load round 0 key 934 ldr q1, [x11] // .LM0 935 ldr q15, [x9], #16 // load round 1 key 936 937 movi v7.16b, #0x63 // compose .L63 938 movi v16.16b, #0x01 // bit masks 939 movi v17.16b, #0x02 940 movi v18.16b, #0x04 941 movi v19.16b, #0x08 942 movi v20.16b, #0x10 943 movi v21.16b, #0x20 944 movi v22.16b, #0x40 945 movi v23.16b, #0x80 946 947#ifdef __AARCH64EL__ 948 rev32 v0.16b, v0.16b 949#endif 950 sub x10, x10, #1 951 str q0, [x17], #16 // save round 0 key 952 953.align 4 954.Lkey_loop: 955 tbl v0.16b, {v15.16b}, v1.16b 956 ldr q15, [x9], #16 // load next round key 957 958 eor v0.16b, v0.16b, v7.16b 959 cmtst v24.16b, v0.16b, v16.16b 960 cmtst v25.16b, v0.16b, v17.16b 961 cmtst v26.16b, v0.16b, v18.16b 962 cmtst v27.16b, v0.16b, v19.16b 963 cmtst v28.16b, v0.16b, v20.16b 964 cmtst v29.16b, v0.16b, v21.16b 965 cmtst v30.16b, v0.16b, v22.16b 966 cmtst v31.16b, v0.16b, v23.16b 967 sub x10, x10, #1 968 st1 {v24.16b,v25.16b,v26.16b,v27.16b}, [x17], #64 // write bit-sliced round key 969 st1 {v28.16b,v29.16b,v30.16b,v31.16b}, [x17], #64 970 cbnz x10, .Lkey_loop 971 972 // don't save last round key 973#ifdef __AARCH64EL__ 974 rev32 v15.16b, v15.16b 975 adrp x11, .LM0_bigendian 976 add x11, x11, #:lo12:.LM0_bigendian 977#endif 978 ret 979.size _bsaes_key_convert,.-_bsaes_key_convert 980 981.globl ossl_bsaes_cbc_encrypt 982.type ossl_bsaes_cbc_encrypt,%function 983.align 4 984// On entry: 985// x0 -> input ciphertext 986// x1 -> output plaintext 987// x2 = size of ciphertext and plaintext in bytes (assumed a multiple of 16) 988// x3 -> key 989// x4 -> 128-bit initialisation vector (or preceding 128-bit block of ciphertext if continuing after an earlier call) 990// w5 must be == 0 991// On exit: 992// Output plaintext filled in 993// Initialisation vector overwritten with last quadword of ciphertext 994// No output registers, usual AAPCS64 register preservation 995ossl_bsaes_cbc_encrypt: 996 AARCH64_VALID_CALL_TARGET 997 cmp x2, #128 998 bhs .Lcbc_do_bsaes 999 b AES_cbc_encrypt 1000.Lcbc_do_bsaes: 1001 1002 // it is up to the caller to make sure we are called with enc == 0 1003 1004 stp x29, x30, [sp, #-48]! 1005 stp d8, d9, [sp, #16] 1006 stp d10, d15, [sp, #32] 1007 lsr x2, x2, #4 // len in 16 byte blocks 1008 1009 ldr w15, [x3, #240] // get # of rounds 1010 mov x14, sp 1011 1012 // allocate the key schedule on the stack 1013 add x17, sp, #96 1014 sub x17, x17, x15, lsl #7 // 128 bytes per inner round key, less 96 bytes 1015 1016 // populate the key schedule 1017 mov x9, x3 // pass key 1018 mov x10, x15 // pass # of rounds 1019 mov sp, x17 // sp is sp 1020 bl _bsaes_key_convert 1021 ldr q6, [sp] 1022 str q15, [x17] // save last round key 1023 eor v6.16b, v6.16b, v7.16b // fix up round 0 key (by XORing with 0x63) 1024 str q6, [sp] 1025 1026 ldr q15, [x4] // load IV 1027 b .Lcbc_dec_loop 1028 1029.align 4 1030.Lcbc_dec_loop: 1031 subs x2, x2, #0x8 1032 bmi .Lcbc_dec_loop_finish 1033 1034 ldr q0, [x0], #16 // load input 1035 mov x9, sp // pass the key 1036 ldr q1, [x0], #16 1037 mov x10, x15 1038 ldr q2, [x0], #16 1039 ldr q3, [x0], #16 1040 ldr q4, [x0], #16 1041 ldr q5, [x0], #16 1042 ldr q6, [x0], #16 1043 ldr q7, [x0], #-7*16 1044 1045 bl _bsaes_decrypt8 1046 1047 ldr q16, [x0], #16 // reload input 1048 eor v0.16b, v0.16b, v15.16b // ^= IV 1049 eor v1.16b, v1.16b, v16.16b 1050 str q0, [x1], #16 // write output 1051 ldr q0, [x0], #16 1052 str q1, [x1], #16 1053 ldr q1, [x0], #16 1054 eor v1.16b, v4.16b, v1.16b 1055 ldr q4, [x0], #16 1056 eor v2.16b, v2.16b, v4.16b 1057 eor v0.16b, v6.16b, v0.16b 1058 ldr q4, [x0], #16 1059 str q0, [x1], #16 1060 str q1, [x1], #16 1061 eor v0.16b, v7.16b, v4.16b 1062 ldr q1, [x0], #16 1063 str q2, [x1], #16 1064 ldr q2, [x0], #16 1065 ldr q15, [x0], #16 1066 str q0, [x1], #16 1067 eor v0.16b, v5.16b, v2.16b 1068 eor v1.16b, v3.16b, v1.16b 1069 str q1, [x1], #16 1070 str q0, [x1], #16 1071 1072 b .Lcbc_dec_loop 1073 1074.Lcbc_dec_loop_finish: 1075 adds x2, x2, #8 1076 beq .Lcbc_dec_done 1077 1078 ldr q0, [x0], #16 // load input 1079 cmp x2, #2 1080 blo .Lcbc_dec_one 1081 ldr q1, [x0], #16 1082 mov x9, sp // pass the key 1083 mov x10, x15 1084 beq .Lcbc_dec_two 1085 ldr q2, [x0], #16 1086 cmp x2, #4 1087 blo .Lcbc_dec_three 1088 ldr q3, [x0], #16 1089 beq .Lcbc_dec_four 1090 ldr q4, [x0], #16 1091 cmp x2, #6 1092 blo .Lcbc_dec_five 1093 ldr q5, [x0], #16 1094 beq .Lcbc_dec_six 1095 ldr q6, [x0], #-6*16 1096 1097 bl _bsaes_decrypt8 1098 1099 ldr q5, [x0], #16 // reload input 1100 eor v0.16b, v0.16b, v15.16b // ^= IV 1101 ldr q8, [x0], #16 1102 ldr q9, [x0], #16 1103 ldr q10, [x0], #16 1104 str q0, [x1], #16 // write output 1105 ldr q0, [x0], #16 1106 eor v1.16b, v1.16b, v5.16b 1107 ldr q5, [x0], #16 1108 eor v6.16b, v6.16b, v8.16b 1109 ldr q15, [x0] 1110 eor v4.16b, v4.16b, v9.16b 1111 eor v2.16b, v2.16b, v10.16b 1112 str q1, [x1], #16 1113 eor v0.16b, v7.16b, v0.16b 1114 str q6, [x1], #16 1115 eor v1.16b, v3.16b, v5.16b 1116 str q4, [x1], #16 1117 str q2, [x1], #16 1118 str q0, [x1], #16 1119 str q1, [x1] 1120 b .Lcbc_dec_done 1121.align 4 1122.Lcbc_dec_six: 1123 sub x0, x0, #0x60 1124 bl _bsaes_decrypt8 1125 ldr q3, [x0], #16 // reload input 1126 eor v0.16b, v0.16b, v15.16b // ^= IV 1127 ldr q5, [x0], #16 1128 ldr q8, [x0], #16 1129 ldr q9, [x0], #16 1130 str q0, [x1], #16 // write output 1131 ldr q0, [x0], #16 1132 eor v1.16b, v1.16b, v3.16b 1133 ldr q15, [x0] 1134 eor v3.16b, v6.16b, v5.16b 1135 eor v4.16b, v4.16b, v8.16b 1136 eor v2.16b, v2.16b, v9.16b 1137 str q1, [x1], #16 1138 eor v0.16b, v7.16b, v0.16b 1139 str q3, [x1], #16 1140 str q4, [x1], #16 1141 str q2, [x1], #16 1142 str q0, [x1] 1143 b .Lcbc_dec_done 1144.align 4 1145.Lcbc_dec_five: 1146 sub x0, x0, #0x50 1147 bl _bsaes_decrypt8 1148 ldr q3, [x0], #16 // reload input 1149 eor v0.16b, v0.16b, v15.16b // ^= IV 1150 ldr q5, [x0], #16 1151 ldr q7, [x0], #16 1152 ldr q8, [x0], #16 1153 str q0, [x1], #16 // write output 1154 ldr q15, [x0] 1155 eor v0.16b, v1.16b, v3.16b 1156 eor v1.16b, v6.16b, v5.16b 1157 eor v3.16b, v4.16b, v7.16b 1158 str q0, [x1], #16 1159 eor v0.16b, v2.16b, v8.16b 1160 str q1, [x1], #16 1161 str q3, [x1], #16 1162 str q0, [x1] 1163 b .Lcbc_dec_done 1164.align 4 1165.Lcbc_dec_four: 1166 sub x0, x0, #0x40 1167 bl _bsaes_decrypt8 1168 ldr q2, [x0], #16 // reload input 1169 eor v0.16b, v0.16b, v15.16b // ^= IV 1170 ldr q3, [x0], #16 1171 ldr q5, [x0], #16 1172 str q0, [x1], #16 // write output 1173 ldr q15, [x0] 1174 eor v0.16b, v1.16b, v2.16b 1175 eor v1.16b, v6.16b, v3.16b 1176 eor v2.16b, v4.16b, v5.16b 1177 str q0, [x1], #16 1178 str q1, [x1], #16 1179 str q2, [x1] 1180 b .Lcbc_dec_done 1181.align 4 1182.Lcbc_dec_three: 1183 sub x0, x0, #0x30 1184 bl _bsaes_decrypt8 1185 ldr q2, [x0], #16 // reload input 1186 eor v0.16b, v0.16b, v15.16b // ^= IV 1187 ldr q3, [x0], #16 1188 ldr q15, [x0] 1189 str q0, [x1], #16 // write output 1190 eor v0.16b, v1.16b, v2.16b 1191 eor v1.16b, v6.16b, v3.16b 1192 str q0, [x1], #16 1193 str q1, [x1] 1194 b .Lcbc_dec_done 1195.align 4 1196.Lcbc_dec_two: 1197 sub x0, x0, #0x20 1198 bl _bsaes_decrypt8 1199 ldr q2, [x0], #16 // reload input 1200 eor v0.16b, v0.16b, v15.16b // ^= IV 1201 ldr q15, [x0] 1202 str q0, [x1], #16 // write output 1203 eor v0.16b, v1.16b, v2.16b 1204 str q0, [x1] 1205 b .Lcbc_dec_done 1206.align 4 1207.Lcbc_dec_one: 1208 sub x0, x0, #0x10 1209 stp x1, x4, [sp, #-32]! 1210 str x14, [sp, #16] 1211 mov v8.16b, v15.16b 1212 mov v15.16b, v0.16b 1213 mov x2, x3 1214 bl AES_decrypt 1215 ldr x14, [sp, #16] 1216 ldp x1, x4, [sp], #32 1217 ldr q0, [x1] // load result 1218 eor v0.16b, v0.16b, v8.16b // ^= IV 1219 str q0, [x1] // write output 1220 1221.align 4 1222.Lcbc_dec_done: 1223 movi v0.16b, #0 1224 movi v1.16b, #0 1225.Lcbc_dec_bzero: // wipe key schedule [if any] 1226 stp q0, q1, [sp], #32 1227 cmp sp, x14 1228 bne .Lcbc_dec_bzero 1229 str q15, [x4] // return IV 1230 ldp d8, d9, [sp, #16] 1231 ldp d10, d15, [sp, #32] 1232 ldp x29, x30, [sp], #48 1233 ret 1234.size ossl_bsaes_cbc_encrypt,.-ossl_bsaes_cbc_encrypt 1235 1236.globl ossl_bsaes_ctr32_encrypt_blocks 1237.type ossl_bsaes_ctr32_encrypt_blocks,%function 1238.align 4 1239// On entry: 1240// x0 -> input text (whole 16-byte blocks) 1241// x1 -> output text (whole 16-byte blocks) 1242// x2 = number of 16-byte blocks to encrypt/decrypt (> 0) 1243// x3 -> key 1244// x4 -> initial value of 128-bit counter (stored big-endian) which increments, modulo 2^32, for each block 1245// On exit: 1246// Output text filled in 1247// No output registers, usual AAPCS64 register preservation 1248ossl_bsaes_ctr32_encrypt_blocks: 1249 AARCH64_VALID_CALL_TARGET 1250 cmp x2, #8 // use plain AES for 1251 blo .Lctr_enc_short // small sizes 1252 1253 stp x29, x30, [sp, #-80]! 1254 stp d8, d9, [sp, #16] 1255 stp d10, d11, [sp, #32] 1256 stp d12, d13, [sp, #48] 1257 stp d14, d15, [sp, #64] 1258 1259 ldr w15, [x3, #240] // get # of rounds 1260 mov x14, sp 1261 1262 // allocate the key schedule on the stack 1263 add x17, sp, #96 1264 sub x17, x17, x15, lsl #7 // 128 bytes per inner round key, less 96 bytes 1265 1266 // populate the key schedule 1267 mov x9, x3 // pass key 1268 mov x10, x15 // pass # of rounds 1269 mov sp, x17 // sp is sp 1270 bl _bsaes_key_convert 1271 eor v7.16b, v7.16b, v15.16b // fix up last round key 1272 str q7, [x17] // save last round key 1273 1274 ldr q0, [x4] // load counter 1275 add x13, x11, #.LREVM0SR-.LM0_bigendian 1276 ldr q4, [sp] // load round0 key 1277 1278 movi v8.4s, #1 // compose 1<<96 1279 movi v9.16b, #0 1280 rev32 v15.16b, v0.16b 1281 rev32 v0.16b, v0.16b 1282 ext v11.16b, v9.16b, v8.16b, #4 1283 rev32 v4.16b, v4.16b 1284 add v12.4s, v11.4s, v11.4s // compose 2<<96 1285 str q4, [sp] // save adjusted round0 key 1286 add v13.4s, v11.4s, v12.4s // compose 3<<96 1287 add v14.4s, v12.4s, v12.4s // compose 4<<96 1288 b .Lctr_enc_loop 1289 1290.align 4 1291.Lctr_enc_loop: 1292 // Intermix prologue from _bsaes_encrypt8 to use the opportunity 1293 // to flip byte order in 32-bit counter 1294 1295 add v1.4s, v15.4s, v11.4s // +1 1296 add x9, sp, #0x10 // pass next round key 1297 add v2.4s, v15.4s, v12.4s // +2 1298 ldr q9, [x13] // .LREVM0SR 1299 ldr q8, [sp] // load round0 key 1300 add v3.4s, v15.4s, v13.4s // +3 1301 mov x10, x15 // pass rounds 1302 sub x11, x13, #.LREVM0SR-.LSR // pass constants 1303 add v6.4s, v2.4s, v14.4s 1304 add v4.4s, v15.4s, v14.4s // +4 1305 add v7.4s, v3.4s, v14.4s 1306 add v15.4s, v4.4s, v14.4s // next counter 1307 add v5.4s, v1.4s, v14.4s 1308 1309 bl _bsaes_encrypt8_alt 1310 1311 subs x2, x2, #8 1312 blo .Lctr_enc_loop_done 1313 1314 ldr q16, [x0], #16 1315 ldr q17, [x0], #16 1316 eor v1.16b, v1.16b, v17.16b 1317 ldr q17, [x0], #16 1318 eor v0.16b, v0.16b, v16.16b 1319 eor v4.16b, v4.16b, v17.16b 1320 str q0, [x1], #16 1321 ldr q16, [x0], #16 1322 str q1, [x1], #16 1323 mov v0.16b, v15.16b 1324 str q4, [x1], #16 1325 ldr q1, [x0], #16 1326 eor v4.16b, v6.16b, v16.16b 1327 eor v1.16b, v3.16b, v1.16b 1328 ldr q3, [x0], #16 1329 eor v3.16b, v7.16b, v3.16b 1330 ldr q6, [x0], #16 1331 eor v2.16b, v2.16b, v6.16b 1332 ldr q6, [x0], #16 1333 eor v5.16b, v5.16b, v6.16b 1334 str q4, [x1], #16 1335 str q1, [x1], #16 1336 str q3, [x1], #16 1337 str q2, [x1], #16 1338 str q5, [x1], #16 1339 1340 bne .Lctr_enc_loop 1341 b .Lctr_enc_done 1342 1343.align 4 1344.Lctr_enc_loop_done: 1345 add x2, x2, #8 1346 ldr q16, [x0], #16 // load input 1347 eor v0.16b, v0.16b, v16.16b 1348 str q0, [x1], #16 // write output 1349 cmp x2, #2 1350 blo .Lctr_enc_done 1351 ldr q17, [x0], #16 1352 eor v1.16b, v1.16b, v17.16b 1353 str q1, [x1], #16 1354 beq .Lctr_enc_done 1355 ldr q18, [x0], #16 1356 eor v4.16b, v4.16b, v18.16b 1357 str q4, [x1], #16 1358 cmp x2, #4 1359 blo .Lctr_enc_done 1360 ldr q19, [x0], #16 1361 eor v6.16b, v6.16b, v19.16b 1362 str q6, [x1], #16 1363 beq .Lctr_enc_done 1364 ldr q20, [x0], #16 1365 eor v3.16b, v3.16b, v20.16b 1366 str q3, [x1], #16 1367 cmp x2, #6 1368 blo .Lctr_enc_done 1369 ldr q21, [x0], #16 1370 eor v7.16b, v7.16b, v21.16b 1371 str q7, [x1], #16 1372 beq .Lctr_enc_done 1373 ldr q22, [x0] 1374 eor v2.16b, v2.16b, v22.16b 1375 str q2, [x1], #16 1376 1377.Lctr_enc_done: 1378 movi v0.16b, #0 1379 movi v1.16b, #0 1380.Lctr_enc_bzero: // wipe key schedule [if any] 1381 stp q0, q1, [sp], #32 1382 cmp sp, x14 1383 bne .Lctr_enc_bzero 1384 1385 ldp d8, d9, [sp, #16] 1386 ldp d10, d11, [sp, #32] 1387 ldp d12, d13, [sp, #48] 1388 ldp d14, d15, [sp, #64] 1389 ldp x29, x30, [sp], #80 1390 ret 1391 1392.Lctr_enc_short: 1393 stp x29, x30, [sp, #-96]! 1394 stp x19, x20, [sp, #16] 1395 stp x21, x22, [sp, #32] 1396 str x23, [sp, #48] 1397 1398 mov x19, x0 // copy arguments 1399 mov x20, x1 1400 mov x21, x2 1401 mov x22, x3 1402 ldr w23, [x4, #12] // load counter .LSW 1403 ldr q1, [x4] // load whole counter value 1404#ifdef __AARCH64EL__ 1405 rev w23, w23 1406#endif 1407 str q1, [sp, #80] // copy counter value 1408 1409.Lctr_enc_short_loop: 1410 add x0, sp, #80 // input counter value 1411 add x1, sp, #64 // output on the stack 1412 mov x2, x22 // key 1413 1414 bl AES_encrypt 1415 1416 ldr q0, [x19], #16 // load input 1417 ldr q1, [sp, #64] // load encrypted counter 1418 add x23, x23, #1 1419#ifdef __AARCH64EL__ 1420 rev w0, w23 1421 str w0, [sp, #80+12] // next counter value 1422#else 1423 str w23, [sp, #80+12] // next counter value 1424#endif 1425 eor v0.16b, v0.16b, v1.16b 1426 str q0, [x20], #16 // store output 1427 subs x21, x21, #1 1428 bne .Lctr_enc_short_loop 1429 1430 movi v0.16b, #0 1431 movi v1.16b, #0 1432 stp q0, q1, [sp, #64] 1433 1434 ldr x23, [sp, #48] 1435 ldp x21, x22, [sp, #32] 1436 ldp x19, x20, [sp, #16] 1437 ldp x29, x30, [sp], #96 1438 ret 1439.size ossl_bsaes_ctr32_encrypt_blocks,.-ossl_bsaes_ctr32_encrypt_blocks 1440 1441.globl ossl_bsaes_xts_encrypt 1442.type ossl_bsaes_xts_encrypt,%function 1443.align 4 1444// On entry: 1445// x0 -> input plaintext 1446// x1 -> output ciphertext 1447// x2 -> length of text in bytes (must be at least 16) 1448// x3 -> key1 (used to encrypt the XORed plaintext blocks) 1449// x4 -> key2 (used to encrypt the initial vector to yield the initial tweak) 1450// x5 -> 16-byte initial vector (typically, sector number) 1451// On exit: 1452// Output ciphertext filled in 1453// No output registers, usual AAPCS64 register preservation 1454ossl_bsaes_xts_encrypt: 1455 AARCH64_VALID_CALL_TARGET 1456 // Stack layout: 1457 // sp -> 1458 // nrounds*128-96 bytes: key schedule 1459 // x19 -> 1460 // 16 bytes: frame record 1461 // 4*16 bytes: tweak storage across _bsaes_encrypt8 1462 // 6*8 bytes: storage for 5 callee-saved general-purpose registers 1463 // 8*8 bytes: storage for 8 callee-saved SIMD registers 1464 stp x29, x30, [sp, #-192]! 1465 stp x19, x20, [sp, #80] 1466 stp x21, x22, [sp, #96] 1467 str x23, [sp, #112] 1468 stp d8, d9, [sp, #128] 1469 stp d10, d11, [sp, #144] 1470 stp d12, d13, [sp, #160] 1471 stp d14, d15, [sp, #176] 1472 1473 mov x19, sp 1474 mov x20, x0 1475 mov x21, x1 1476 mov x22, x2 1477 mov x23, x3 1478 1479 // generate initial tweak 1480 sub sp, sp, #16 1481 mov x0, x5 // iv[] 1482 mov x1, sp 1483 mov x2, x4 // key2 1484 bl AES_encrypt 1485 ldr q11, [sp], #16 1486 1487 ldr w1, [x23, #240] // get # of rounds 1488 // allocate the key schedule on the stack 1489 add x17, sp, #96 1490 sub x17, x17, x1, lsl #7 // 128 bytes per inner round key, less 96 bytes 1491 1492 // populate the key schedule 1493 mov x9, x23 // pass key 1494 mov x10, x1 // pass # of rounds 1495 mov sp, x17 1496 bl _bsaes_key_convert 1497 eor v15.16b, v15.16b, v7.16b // fix up last round key 1498 str q15, [x17] // save last round key 1499 1500 subs x22, x22, #0x80 1501 blo .Lxts_enc_short 1502 b .Lxts_enc_loop 1503 1504.align 4 1505.Lxts_enc_loop: 1506 ldr q8, .Lxts_magic 1507 mov x10, x1 // pass rounds 1508 add x2, x19, #16 1509 ldr q0, [x20], #16 1510 sshr v1.2d, v11.2d, #63 1511 mov x9, sp // pass key schedule 1512 ldr q6, .Lxts_magic+16 1513 add v2.2d, v11.2d, v11.2d 1514 cmtst v3.2d, v11.2d, v6.2d 1515 and v1.16b, v1.16b, v8.16b 1516 ext v1.16b, v1.16b, v1.16b, #8 1517 and v3.16b, v3.16b, v8.16b 1518 ldr q4, [x20], #16 1519 eor v12.16b, v2.16b, v1.16b 1520 eor v1.16b, v4.16b, v12.16b 1521 eor v0.16b, v0.16b, v11.16b 1522 cmtst v2.2d, v12.2d, v6.2d 1523 add v4.2d, v12.2d, v12.2d 1524 add x0, x19, #16 1525 ext v3.16b, v3.16b, v3.16b, #8 1526 and v2.16b, v2.16b, v8.16b 1527 eor v13.16b, v4.16b, v3.16b 1528 ldr q3, [x20], #16 1529 ext v4.16b, v2.16b, v2.16b, #8 1530 eor v2.16b, v3.16b, v13.16b 1531 ldr q3, [x20], #16 1532 add v5.2d, v13.2d, v13.2d 1533 cmtst v7.2d, v13.2d, v6.2d 1534 and v7.16b, v7.16b, v8.16b 1535 ldr q9, [x20], #16 1536 ext v7.16b, v7.16b, v7.16b, #8 1537 ldr q10, [x20], #16 1538 eor v14.16b, v5.16b, v4.16b 1539 ldr q16, [x20], #16 1540 add v4.2d, v14.2d, v14.2d 1541 eor v3.16b, v3.16b, v14.16b 1542 eor v15.16b, v4.16b, v7.16b 1543 add v5.2d, v15.2d, v15.2d 1544 ldr q7, [x20], #16 1545 cmtst v4.2d, v14.2d, v6.2d 1546 and v17.16b, v4.16b, v8.16b 1547 cmtst v18.2d, v15.2d, v6.2d 1548 eor v4.16b, v9.16b, v15.16b 1549 ext v9.16b, v17.16b, v17.16b, #8 1550 eor v9.16b, v5.16b, v9.16b 1551 add v17.2d, v9.2d, v9.2d 1552 and v18.16b, v18.16b, v8.16b 1553 eor v5.16b, v10.16b, v9.16b 1554 str q9, [x2], #16 1555 ext v10.16b, v18.16b, v18.16b, #8 1556 cmtst v9.2d, v9.2d, v6.2d 1557 and v9.16b, v9.16b, v8.16b 1558 eor v10.16b, v17.16b, v10.16b 1559 cmtst v17.2d, v10.2d, v6.2d 1560 eor v6.16b, v16.16b, v10.16b 1561 str q10, [x2], #16 1562 ext v9.16b, v9.16b, v9.16b, #8 1563 add v10.2d, v10.2d, v10.2d 1564 eor v9.16b, v10.16b, v9.16b 1565 str q9, [x2], #16 1566 eor v7.16b, v7.16b, v9.16b 1567 add v9.2d, v9.2d, v9.2d 1568 and v8.16b, v17.16b, v8.16b 1569 ext v8.16b, v8.16b, v8.16b, #8 1570 eor v8.16b, v9.16b, v8.16b 1571 str q8, [x2] // next round tweak 1572 1573 bl _bsaes_encrypt8 1574 1575 ldr q8, [x0], #16 1576 eor v0.16b, v0.16b, v11.16b 1577 eor v1.16b, v1.16b, v12.16b 1578 ldr q9, [x0], #16 1579 eor v4.16b, v4.16b, v13.16b 1580 eor v6.16b, v6.16b, v14.16b 1581 ldr q10, [x0], #16 1582 eor v3.16b, v3.16b, v15.16b 1583 subs x22, x22, #0x80 1584 str q0, [x21], #16 1585 ldr q11, [x0] // next round tweak 1586 str q1, [x21], #16 1587 eor v0.16b, v7.16b, v8.16b 1588 eor v1.16b, v2.16b, v9.16b 1589 str q4, [x21], #16 1590 eor v2.16b, v5.16b, v10.16b 1591 str q6, [x21], #16 1592 str q3, [x21], #16 1593 str q0, [x21], #16 1594 str q1, [x21], #16 1595 str q2, [x21], #16 1596 bpl .Lxts_enc_loop 1597 1598.Lxts_enc_short: 1599 adds x22, x22, #0x70 1600 bmi .Lxts_enc_done 1601 1602 ldr q8, .Lxts_magic 1603 sshr v1.2d, v11.2d, #63 1604 add v2.2d, v11.2d, v11.2d 1605 ldr q9, .Lxts_magic+16 1606 subs x22, x22, #0x10 1607 ldr q0, [x20], #16 1608 and v1.16b, v1.16b, v8.16b 1609 cmtst v3.2d, v11.2d, v9.2d 1610 ext v1.16b, v1.16b, v1.16b, #8 1611 and v3.16b, v3.16b, v8.16b 1612 eor v12.16b, v2.16b, v1.16b 1613 ext v1.16b, v3.16b, v3.16b, #8 1614 add v2.2d, v12.2d, v12.2d 1615 cmtst v3.2d, v12.2d, v9.2d 1616 eor v13.16b, v2.16b, v1.16b 1617 and v22.16b, v3.16b, v8.16b 1618 bmi .Lxts_enc_1 1619 1620 ext v2.16b, v22.16b, v22.16b, #8 1621 add v3.2d, v13.2d, v13.2d 1622 ldr q1, [x20], #16 1623 cmtst v4.2d, v13.2d, v9.2d 1624 subs x22, x22, #0x10 1625 eor v14.16b, v3.16b, v2.16b 1626 and v23.16b, v4.16b, v8.16b 1627 bmi .Lxts_enc_2 1628 1629 ext v3.16b, v23.16b, v23.16b, #8 1630 add v4.2d, v14.2d, v14.2d 1631 ldr q2, [x20], #16 1632 cmtst v5.2d, v14.2d, v9.2d 1633 eor v0.16b, v0.16b, v11.16b 1634 subs x22, x22, #0x10 1635 eor v15.16b, v4.16b, v3.16b 1636 and v24.16b, v5.16b, v8.16b 1637 bmi .Lxts_enc_3 1638 1639 ext v4.16b, v24.16b, v24.16b, #8 1640 add v5.2d, v15.2d, v15.2d 1641 ldr q3, [x20], #16 1642 cmtst v6.2d, v15.2d, v9.2d 1643 eor v1.16b, v1.16b, v12.16b 1644 subs x22, x22, #0x10 1645 eor v16.16b, v5.16b, v4.16b 1646 and v25.16b, v6.16b, v8.16b 1647 bmi .Lxts_enc_4 1648 1649 ext v5.16b, v25.16b, v25.16b, #8 1650 add v6.2d, v16.2d, v16.2d 1651 add x0, x19, #16 1652 cmtst v7.2d, v16.2d, v9.2d 1653 ldr q4, [x20], #16 1654 eor v2.16b, v2.16b, v13.16b 1655 str q16, [x0], #16 1656 subs x22, x22, #0x10 1657 eor v17.16b, v6.16b, v5.16b 1658 and v26.16b, v7.16b, v8.16b 1659 bmi .Lxts_enc_5 1660 1661 ext v7.16b, v26.16b, v26.16b, #8 1662 add v18.2d, v17.2d, v17.2d 1663 ldr q5, [x20], #16 1664 eor v3.16b, v3.16b, v14.16b 1665 str q17, [x0], #16 1666 subs x22, x22, #0x10 1667 eor v18.16b, v18.16b, v7.16b 1668 bmi .Lxts_enc_6 1669 1670 ldr q6, [x20], #16 1671 eor v4.16b, v4.16b, v15.16b 1672 eor v5.16b, v5.16b, v16.16b 1673 str q18, [x0] // next round tweak 1674 mov x9, sp // pass key schedule 1675 mov x10, x1 1676 add x0, x19, #16 1677 sub x22, x22, #0x10 1678 eor v6.16b, v6.16b, v17.16b 1679 1680 bl _bsaes_encrypt8 1681 1682 ldr q16, [x0], #16 1683 eor v0.16b, v0.16b, v11.16b 1684 eor v1.16b, v1.16b, v12.16b 1685 ldr q17, [x0], #16 1686 eor v4.16b, v4.16b, v13.16b 1687 eor v6.16b, v6.16b, v14.16b 1688 eor v3.16b, v3.16b, v15.16b 1689 ldr q11, [x0] // next round tweak 1690 str q0, [x21], #16 1691 str q1, [x21], #16 1692 eor v0.16b, v7.16b, v16.16b 1693 eor v1.16b, v2.16b, v17.16b 1694 str q4, [x21], #16 1695 str q6, [x21], #16 1696 str q3, [x21], #16 1697 str q0, [x21], #16 1698 str q1, [x21], #16 1699 b .Lxts_enc_done 1700 1701.align 4 1702.Lxts_enc_6: 1703 eor v4.16b, v4.16b, v15.16b 1704 eor v5.16b, v5.16b, v16.16b 1705 mov x9, sp // pass key schedule 1706 mov x10, x1 // pass rounds 1707 add x0, x19, #16 1708 1709 bl _bsaes_encrypt8 1710 1711 ldr q16, [x0], #16 1712 eor v0.16b, v0.16b, v11.16b 1713 eor v1.16b, v1.16b, v12.16b 1714 eor v4.16b, v4.16b, v13.16b 1715 eor v6.16b, v6.16b, v14.16b 1716 ldr q11, [x0] // next round tweak 1717 eor v3.16b, v3.16b, v15.16b 1718 str q0, [x21], #16 1719 str q1, [x21], #16 1720 eor v0.16b, v7.16b, v16.16b 1721 str q4, [x21], #16 1722 str q6, [x21], #16 1723 str q3, [x21], #16 1724 str q0, [x21], #16 1725 b .Lxts_enc_done 1726 1727.align 4 1728.Lxts_enc_5: 1729 eor v3.16b, v3.16b, v14.16b 1730 eor v4.16b, v4.16b, v15.16b 1731 mov x9, sp // pass key schedule 1732 mov x10, x1 // pass rounds 1733 add x0, x19, #16 1734 1735 bl _bsaes_encrypt8 1736 1737 eor v0.16b, v0.16b, v11.16b 1738 eor v1.16b, v1.16b, v12.16b 1739 ldr q11, [x0] // next round tweak 1740 eor v4.16b, v4.16b, v13.16b 1741 eor v6.16b, v6.16b, v14.16b 1742 eor v3.16b, v3.16b, v15.16b 1743 str q0, [x21], #16 1744 str q1, [x21], #16 1745 str q4, [x21], #16 1746 str q6, [x21], #16 1747 str q3, [x21], #16 1748 b .Lxts_enc_done 1749 1750.align 4 1751.Lxts_enc_4: 1752 eor v2.16b, v2.16b, v13.16b 1753 eor v3.16b, v3.16b, v14.16b 1754 mov x9, sp // pass key schedule 1755 mov x10, x1 // pass rounds 1756 add x0, x19, #16 1757 1758 bl _bsaes_encrypt8 1759 1760 eor v0.16b, v0.16b, v11.16b 1761 eor v1.16b, v1.16b, v12.16b 1762 eor v4.16b, v4.16b, v13.16b 1763 eor v6.16b, v6.16b, v14.16b 1764 mov v11.16b, v15.16b // next round tweak 1765 str q0, [x21], #16 1766 str q1, [x21], #16 1767 str q4, [x21], #16 1768 str q6, [x21], #16 1769 b .Lxts_enc_done 1770 1771.align 4 1772.Lxts_enc_3: 1773 eor v1.16b, v1.16b, v12.16b 1774 eor v2.16b, v2.16b, v13.16b 1775 mov x9, sp // pass key schedule 1776 mov x10, x1 // pass rounds 1777 add x0, x19, #16 1778 1779 bl _bsaes_encrypt8 1780 1781 eor v0.16b, v0.16b, v11.16b 1782 eor v1.16b, v1.16b, v12.16b 1783 eor v4.16b, v4.16b, v13.16b 1784 mov v11.16b, v14.16b // next round tweak 1785 str q0, [x21], #16 1786 str q1, [x21], #16 1787 str q4, [x21], #16 1788 b .Lxts_enc_done 1789 1790.align 4 1791.Lxts_enc_2: 1792 eor v0.16b, v0.16b, v11.16b 1793 eor v1.16b, v1.16b, v12.16b 1794 mov x9, sp // pass key schedule 1795 mov x10, x1 // pass rounds 1796 add x0, x19, #16 1797 1798 bl _bsaes_encrypt8 1799 1800 eor v0.16b, v0.16b, v11.16b 1801 eor v1.16b, v1.16b, v12.16b 1802 mov v11.16b, v13.16b // next round tweak 1803 str q0, [x21], #16 1804 str q1, [x21], #16 1805 b .Lxts_enc_done 1806 1807.align 4 1808.Lxts_enc_1: 1809 eor v0.16b, v0.16b, v11.16b 1810 sub x0, sp, #16 1811 sub x1, sp, #16 1812 mov x2, x23 1813 mov v13.d[0], v11.d[1] // just in case AES_encrypt corrupts top half of callee-saved SIMD registers 1814 mov v14.d[0], v12.d[1] 1815 str q0, [sp, #-16]! 1816 1817 bl AES_encrypt 1818 1819 ldr q0, [sp], #16 1820 trn1 v13.2d, v11.2d, v13.2d 1821 trn1 v11.2d, v12.2d, v14.2d // next round tweak 1822 eor v0.16b, v0.16b, v13.16b 1823 str q0, [x21], #16 1824 1825.Lxts_enc_done: 1826 adds x22, x22, #0x10 1827 beq .Lxts_enc_ret 1828 1829 sub x6, x21, #0x10 1830 // Penultimate plaintext block produces final ciphertext part-block 1831 // plus remaining part of final plaintext block. Move ciphertext part 1832 // to final position and reuse penultimate ciphertext block buffer to 1833 // construct final plaintext block 1834.Lxts_enc_steal: 1835 ldrb w0, [x20], #1 1836 ldrb w1, [x21, #-0x10] 1837 strb w0, [x21, #-0x10] 1838 strb w1, [x21], #1 1839 1840 subs x22, x22, #1 1841 bhi .Lxts_enc_steal 1842 1843 // Finally encrypt the penultimate ciphertext block using the 1844 // last tweak 1845 ldr q0, [x6] 1846 eor v0.16b, v0.16b, v11.16b 1847 str q0, [sp, #-16]! 1848 mov x0, sp 1849 mov x1, sp 1850 mov x2, x23 1851 mov x21, x6 1852 mov v13.d[0], v11.d[1] // just in case AES_encrypt corrupts top half of callee-saved SIMD registers 1853 1854 bl AES_encrypt 1855 1856 trn1 v11.2d, v11.2d, v13.2d 1857 ldr q0, [sp], #16 1858 eor v0.16b, v0.16b, v11.16b 1859 str q0, [x21] 1860 1861.Lxts_enc_ret: 1862 1863 movi v0.16b, #0 1864 movi v1.16b, #0 1865.Lxts_enc_bzero: // wipe key schedule 1866 stp q0, q1, [sp], #32 1867 cmp sp, x19 1868 bne .Lxts_enc_bzero 1869 1870 ldp x19, x20, [sp, #80] 1871 ldp x21, x22, [sp, #96] 1872 ldr x23, [sp, #112] 1873 ldp d8, d9, [sp, #128] 1874 ldp d10, d11, [sp, #144] 1875 ldp d12, d13, [sp, #160] 1876 ldp d14, d15, [sp, #176] 1877 ldp x29, x30, [sp], #192 1878 ret 1879.size ossl_bsaes_xts_encrypt,.-ossl_bsaes_xts_encrypt 1880 1881// The assembler doesn't seem capable of de-duplicating these when expressed 1882// using `ldr qd,=` syntax, so assign a symbolic address 1883.align 5 1884.Lxts_magic: 1885.quad 1, 0x87, 0x4000000000000000, 0x4000000000000000 1886 1887.globl ossl_bsaes_xts_decrypt 1888.type ossl_bsaes_xts_decrypt,%function 1889.align 4 1890// On entry: 1891// x0 -> input ciphertext 1892// x1 -> output plaintext 1893// x2 -> length of text in bytes (must be at least 16) 1894// x3 -> key1 (used to decrypt the XORed ciphertext blocks) 1895// x4 -> key2 (used to encrypt the initial vector to yield the initial tweak) 1896// x5 -> 16-byte initial vector (typically, sector number) 1897// On exit: 1898// Output plaintext filled in 1899// No output registers, usual AAPCS64 register preservation 1900ossl_bsaes_xts_decrypt: 1901 AARCH64_VALID_CALL_TARGET 1902 // Stack layout: 1903 // sp -> 1904 // nrounds*128-96 bytes: key schedule 1905 // x19 -> 1906 // 16 bytes: frame record 1907 // 4*16 bytes: tweak storage across _bsaes_decrypt8 1908 // 6*8 bytes: storage for 5 callee-saved general-purpose registers 1909 // 8*8 bytes: storage for 8 callee-saved SIMD registers 1910 stp x29, x30, [sp, #-192]! 1911 stp x19, x20, [sp, #80] 1912 stp x21, x22, [sp, #96] 1913 str x23, [sp, #112] 1914 stp d8, d9, [sp, #128] 1915 stp d10, d11, [sp, #144] 1916 stp d12, d13, [sp, #160] 1917 stp d14, d15, [sp, #176] 1918 1919 mov x19, sp 1920 mov x20, x0 1921 mov x21, x1 1922 mov x22, x2 1923 mov x23, x3 1924 1925 // generate initial tweak 1926 sub sp, sp, #16 1927 mov x0, x5 // iv[] 1928 mov x1, sp 1929 mov x2, x4 // key2 1930 bl AES_encrypt 1931 ldr q11, [sp], #16 1932 1933 ldr w1, [x23, #240] // get # of rounds 1934 // allocate the key schedule on the stack 1935 add x17, sp, #96 1936 sub x17, x17, x1, lsl #7 // 128 bytes per inner round key, less 96 bytes 1937 1938 // populate the key schedule 1939 mov x9, x23 // pass key 1940 mov x10, x1 // pass # of rounds 1941 mov sp, x17 1942 bl _bsaes_key_convert 1943 ldr q6, [sp] 1944 str q15, [x17] // save last round key 1945 eor v6.16b, v6.16b, v7.16b // fix up round 0 key (by XORing with 0x63) 1946 str q6, [sp] 1947 1948 sub x30, x22, #0x10 1949 tst x22, #0xf // if not multiple of 16 1950 csel x22, x30, x22, ne // subtract another 16 bytes 1951 subs x22, x22, #0x80 1952 1953 blo .Lxts_dec_short 1954 b .Lxts_dec_loop 1955 1956.align 4 1957.Lxts_dec_loop: 1958 ldr q8, .Lxts_magic 1959 mov x10, x1 // pass rounds 1960 add x2, x19, #16 1961 ldr q0, [x20], #16 1962 sshr v1.2d, v11.2d, #63 1963 mov x9, sp // pass key schedule 1964 ldr q6, .Lxts_magic+16 1965 add v2.2d, v11.2d, v11.2d 1966 cmtst v3.2d, v11.2d, v6.2d 1967 and v1.16b, v1.16b, v8.16b 1968 ext v1.16b, v1.16b, v1.16b, #8 1969 and v3.16b, v3.16b, v8.16b 1970 ldr q4, [x20], #16 1971 eor v12.16b, v2.16b, v1.16b 1972 eor v1.16b, v4.16b, v12.16b 1973 eor v0.16b, v0.16b, v11.16b 1974 cmtst v2.2d, v12.2d, v6.2d 1975 add v4.2d, v12.2d, v12.2d 1976 add x0, x19, #16 1977 ext v3.16b, v3.16b, v3.16b, #8 1978 and v2.16b, v2.16b, v8.16b 1979 eor v13.16b, v4.16b, v3.16b 1980 ldr q3, [x20], #16 1981 ext v4.16b, v2.16b, v2.16b, #8 1982 eor v2.16b, v3.16b, v13.16b 1983 ldr q3, [x20], #16 1984 add v5.2d, v13.2d, v13.2d 1985 cmtst v7.2d, v13.2d, v6.2d 1986 and v7.16b, v7.16b, v8.16b 1987 ldr q9, [x20], #16 1988 ext v7.16b, v7.16b, v7.16b, #8 1989 ldr q10, [x20], #16 1990 eor v14.16b, v5.16b, v4.16b 1991 ldr q16, [x20], #16 1992 add v4.2d, v14.2d, v14.2d 1993 eor v3.16b, v3.16b, v14.16b 1994 eor v15.16b, v4.16b, v7.16b 1995 add v5.2d, v15.2d, v15.2d 1996 ldr q7, [x20], #16 1997 cmtst v4.2d, v14.2d, v6.2d 1998 and v17.16b, v4.16b, v8.16b 1999 cmtst v18.2d, v15.2d, v6.2d 2000 eor v4.16b, v9.16b, v15.16b 2001 ext v9.16b, v17.16b, v17.16b, #8 2002 eor v9.16b, v5.16b, v9.16b 2003 add v17.2d, v9.2d, v9.2d 2004 and v18.16b, v18.16b, v8.16b 2005 eor v5.16b, v10.16b, v9.16b 2006 str q9, [x2], #16 2007 ext v10.16b, v18.16b, v18.16b, #8 2008 cmtst v9.2d, v9.2d, v6.2d 2009 and v9.16b, v9.16b, v8.16b 2010 eor v10.16b, v17.16b, v10.16b 2011 cmtst v17.2d, v10.2d, v6.2d 2012 eor v6.16b, v16.16b, v10.16b 2013 str q10, [x2], #16 2014 ext v9.16b, v9.16b, v9.16b, #8 2015 add v10.2d, v10.2d, v10.2d 2016 eor v9.16b, v10.16b, v9.16b 2017 str q9, [x2], #16 2018 eor v7.16b, v7.16b, v9.16b 2019 add v9.2d, v9.2d, v9.2d 2020 and v8.16b, v17.16b, v8.16b 2021 ext v8.16b, v8.16b, v8.16b, #8 2022 eor v8.16b, v9.16b, v8.16b 2023 str q8, [x2] // next round tweak 2024 2025 bl _bsaes_decrypt8 2026 2027 eor v6.16b, v6.16b, v13.16b 2028 eor v0.16b, v0.16b, v11.16b 2029 ldr q8, [x0], #16 2030 eor v7.16b, v7.16b, v8.16b 2031 str q0, [x21], #16 2032 eor v0.16b, v1.16b, v12.16b 2033 ldr q1, [x0], #16 2034 eor v1.16b, v3.16b, v1.16b 2035 subs x22, x22, #0x80 2036 eor v2.16b, v2.16b, v15.16b 2037 eor v3.16b, v4.16b, v14.16b 2038 ldr q4, [x0], #16 2039 str q0, [x21], #16 2040 ldr q11, [x0] // next round tweak 2041 eor v0.16b, v5.16b, v4.16b 2042 str q6, [x21], #16 2043 str q3, [x21], #16 2044 str q2, [x21], #16 2045 str q7, [x21], #16 2046 str q1, [x21], #16 2047 str q0, [x21], #16 2048 bpl .Lxts_dec_loop 2049 2050.Lxts_dec_short: 2051 adds x22, x22, #0x70 2052 bmi .Lxts_dec_done 2053 2054 ldr q8, .Lxts_magic 2055 sshr v1.2d, v11.2d, #63 2056 add v2.2d, v11.2d, v11.2d 2057 ldr q9, .Lxts_magic+16 2058 subs x22, x22, #0x10 2059 ldr q0, [x20], #16 2060 and v1.16b, v1.16b, v8.16b 2061 cmtst v3.2d, v11.2d, v9.2d 2062 ext v1.16b, v1.16b, v1.16b, #8 2063 and v3.16b, v3.16b, v8.16b 2064 eor v12.16b, v2.16b, v1.16b 2065 ext v1.16b, v3.16b, v3.16b, #8 2066 add v2.2d, v12.2d, v12.2d 2067 cmtst v3.2d, v12.2d, v9.2d 2068 eor v13.16b, v2.16b, v1.16b 2069 and v22.16b, v3.16b, v8.16b 2070 bmi .Lxts_dec_1 2071 2072 ext v2.16b, v22.16b, v22.16b, #8 2073 add v3.2d, v13.2d, v13.2d 2074 ldr q1, [x20], #16 2075 cmtst v4.2d, v13.2d, v9.2d 2076 subs x22, x22, #0x10 2077 eor v14.16b, v3.16b, v2.16b 2078 and v23.16b, v4.16b, v8.16b 2079 bmi .Lxts_dec_2 2080 2081 ext v3.16b, v23.16b, v23.16b, #8 2082 add v4.2d, v14.2d, v14.2d 2083 ldr q2, [x20], #16 2084 cmtst v5.2d, v14.2d, v9.2d 2085 eor v0.16b, v0.16b, v11.16b 2086 subs x22, x22, #0x10 2087 eor v15.16b, v4.16b, v3.16b 2088 and v24.16b, v5.16b, v8.16b 2089 bmi .Lxts_dec_3 2090 2091 ext v4.16b, v24.16b, v24.16b, #8 2092 add v5.2d, v15.2d, v15.2d 2093 ldr q3, [x20], #16 2094 cmtst v6.2d, v15.2d, v9.2d 2095 eor v1.16b, v1.16b, v12.16b 2096 subs x22, x22, #0x10 2097 eor v16.16b, v5.16b, v4.16b 2098 and v25.16b, v6.16b, v8.16b 2099 bmi .Lxts_dec_4 2100 2101 ext v5.16b, v25.16b, v25.16b, #8 2102 add v6.2d, v16.2d, v16.2d 2103 add x0, x19, #16 2104 cmtst v7.2d, v16.2d, v9.2d 2105 ldr q4, [x20], #16 2106 eor v2.16b, v2.16b, v13.16b 2107 str q16, [x0], #16 2108 subs x22, x22, #0x10 2109 eor v17.16b, v6.16b, v5.16b 2110 and v26.16b, v7.16b, v8.16b 2111 bmi .Lxts_dec_5 2112 2113 ext v7.16b, v26.16b, v26.16b, #8 2114 add v18.2d, v17.2d, v17.2d 2115 ldr q5, [x20], #16 2116 eor v3.16b, v3.16b, v14.16b 2117 str q17, [x0], #16 2118 subs x22, x22, #0x10 2119 eor v18.16b, v18.16b, v7.16b 2120 bmi .Lxts_dec_6 2121 2122 ldr q6, [x20], #16 2123 eor v4.16b, v4.16b, v15.16b 2124 eor v5.16b, v5.16b, v16.16b 2125 str q18, [x0] // next round tweak 2126 mov x9, sp // pass key schedule 2127 mov x10, x1 2128 add x0, x19, #16 2129 sub x22, x22, #0x10 2130 eor v6.16b, v6.16b, v17.16b 2131 2132 bl _bsaes_decrypt8 2133 2134 ldr q16, [x0], #16 2135 eor v0.16b, v0.16b, v11.16b 2136 eor v1.16b, v1.16b, v12.16b 2137 ldr q17, [x0], #16 2138 eor v6.16b, v6.16b, v13.16b 2139 eor v4.16b, v4.16b, v14.16b 2140 eor v2.16b, v2.16b, v15.16b 2141 ldr q11, [x0] // next round tweak 2142 str q0, [x21], #16 2143 str q1, [x21], #16 2144 eor v0.16b, v7.16b, v16.16b 2145 eor v1.16b, v3.16b, v17.16b 2146 str q6, [x21], #16 2147 str q4, [x21], #16 2148 str q2, [x21], #16 2149 str q0, [x21], #16 2150 str q1, [x21], #16 2151 b .Lxts_dec_done 2152 2153.align 4 2154.Lxts_dec_6: 2155 eor v4.16b, v4.16b, v15.16b 2156 eor v5.16b, v5.16b, v16.16b 2157 mov x9, sp // pass key schedule 2158 mov x10, x1 // pass rounds 2159 add x0, x19, #16 2160 2161 bl _bsaes_decrypt8 2162 2163 ldr q16, [x0], #16 2164 eor v0.16b, v0.16b, v11.16b 2165 eor v1.16b, v1.16b, v12.16b 2166 eor v6.16b, v6.16b, v13.16b 2167 eor v4.16b, v4.16b, v14.16b 2168 ldr q11, [x0] // next round tweak 2169 eor v2.16b, v2.16b, v15.16b 2170 str q0, [x21], #16 2171 str q1, [x21], #16 2172 eor v0.16b, v7.16b, v16.16b 2173 str q6, [x21], #16 2174 str q4, [x21], #16 2175 str q2, [x21], #16 2176 str q0, [x21], #16 2177 b .Lxts_dec_done 2178 2179.align 4 2180.Lxts_dec_5: 2181 eor v3.16b, v3.16b, v14.16b 2182 eor v4.16b, v4.16b, v15.16b 2183 mov x9, sp // pass key schedule 2184 mov x10, x1 // pass rounds 2185 add x0, x19, #16 2186 2187 bl _bsaes_decrypt8 2188 2189 eor v0.16b, v0.16b, v11.16b 2190 eor v1.16b, v1.16b, v12.16b 2191 ldr q11, [x0] // next round tweak 2192 eor v6.16b, v6.16b, v13.16b 2193 eor v4.16b, v4.16b, v14.16b 2194 eor v2.16b, v2.16b, v15.16b 2195 str q0, [x21], #16 2196 str q1, [x21], #16 2197 str q6, [x21], #16 2198 str q4, [x21], #16 2199 str q2, [x21], #16 2200 b .Lxts_dec_done 2201 2202.align 4 2203.Lxts_dec_4: 2204 eor v2.16b, v2.16b, v13.16b 2205 eor v3.16b, v3.16b, v14.16b 2206 mov x9, sp // pass key schedule 2207 mov x10, x1 // pass rounds 2208 add x0, x19, #16 2209 2210 bl _bsaes_decrypt8 2211 2212 eor v0.16b, v0.16b, v11.16b 2213 eor v1.16b, v1.16b, v12.16b 2214 eor v6.16b, v6.16b, v13.16b 2215 eor v4.16b, v4.16b, v14.16b 2216 mov v11.16b, v15.16b // next round tweak 2217 str q0, [x21], #16 2218 str q1, [x21], #16 2219 str q6, [x21], #16 2220 str q4, [x21], #16 2221 b .Lxts_dec_done 2222 2223.align 4 2224.Lxts_dec_3: 2225 eor v1.16b, v1.16b, v12.16b 2226 eor v2.16b, v2.16b, v13.16b 2227 mov x9, sp // pass key schedule 2228 mov x10, x1 // pass rounds 2229 add x0, x19, #16 2230 2231 bl _bsaes_decrypt8 2232 2233 eor v0.16b, v0.16b, v11.16b 2234 eor v1.16b, v1.16b, v12.16b 2235 eor v6.16b, v6.16b, v13.16b 2236 mov v11.16b, v14.16b // next round tweak 2237 str q0, [x21], #16 2238 str q1, [x21], #16 2239 str q6, [x21], #16 2240 b .Lxts_dec_done 2241 2242.align 4 2243.Lxts_dec_2: 2244 eor v0.16b, v0.16b, v11.16b 2245 eor v1.16b, v1.16b, v12.16b 2246 mov x9, sp // pass key schedule 2247 mov x10, x1 // pass rounds 2248 add x0, x19, #16 2249 2250 bl _bsaes_decrypt8 2251 2252 eor v0.16b, v0.16b, v11.16b 2253 eor v1.16b, v1.16b, v12.16b 2254 mov v11.16b, v13.16b // next round tweak 2255 str q0, [x21], #16 2256 str q1, [x21], #16 2257 b .Lxts_dec_done 2258 2259.align 4 2260.Lxts_dec_1: 2261 eor v0.16b, v0.16b, v11.16b 2262 sub x0, sp, #16 2263 sub x1, sp, #16 2264 mov x2, x23 2265 mov v13.d[0], v11.d[1] // just in case AES_decrypt corrupts top half of callee-saved SIMD registers 2266 mov v14.d[0], v12.d[1] 2267 str q0, [sp, #-16]! 2268 2269 bl AES_decrypt 2270 2271 ldr q0, [sp], #16 2272 trn1 v13.2d, v11.2d, v13.2d 2273 trn1 v11.2d, v12.2d, v14.2d // next round tweak 2274 eor v0.16b, v0.16b, v13.16b 2275 str q0, [x21], #16 2276 2277.Lxts_dec_done: 2278 adds x22, x22, #0x10 2279 beq .Lxts_dec_ret 2280 2281 // calculate one round of extra tweak for the stolen ciphertext 2282 ldr q8, .Lxts_magic 2283 sshr v6.2d, v11.2d, #63 2284 and v6.16b, v6.16b, v8.16b 2285 add v12.2d, v11.2d, v11.2d 2286 ext v6.16b, v6.16b, v6.16b, #8 2287 eor v12.16b, v12.16b, v6.16b 2288 2289 // perform the final decryption with the last tweak value 2290 ldr q0, [x20], #16 2291 eor v0.16b, v0.16b, v12.16b 2292 str q0, [sp, #-16]! 2293 mov x0, sp 2294 mov x1, sp 2295 mov x2, x23 2296 mov v13.d[0], v11.d[1] // just in case AES_decrypt corrupts top half of callee-saved SIMD registers 2297 mov v14.d[0], v12.d[1] 2298 2299 bl AES_decrypt 2300 2301 trn1 v12.2d, v12.2d, v14.2d 2302 trn1 v11.2d, v11.2d, v13.2d 2303 ldr q0, [sp], #16 2304 eor v0.16b, v0.16b, v12.16b 2305 str q0, [x21] 2306 2307 mov x6, x21 2308 // Penultimate ciphertext block produces final plaintext part-block 2309 // plus remaining part of final ciphertext block. Move plaintext part 2310 // to final position and reuse penultimate plaintext block buffer to 2311 // construct final ciphertext block 2312.Lxts_dec_steal: 2313 ldrb w1, [x21] 2314 ldrb w0, [x20], #1 2315 strb w1, [x21, #0x10] 2316 strb w0, [x21], #1 2317 2318 subs x22, x22, #1 2319 bhi .Lxts_dec_steal 2320 2321 // Finally decrypt the penultimate plaintext block using the 2322 // penultimate tweak 2323 ldr q0, [x6] 2324 eor v0.16b, v0.16b, v11.16b 2325 str q0, [sp, #-16]! 2326 mov x0, sp 2327 mov x1, sp 2328 mov x2, x23 2329 mov x21, x6 2330 2331 bl AES_decrypt 2332 2333 trn1 v11.2d, v11.2d, v13.2d 2334 ldr q0, [sp], #16 2335 eor v0.16b, v0.16b, v11.16b 2336 str q0, [x21] 2337 2338.Lxts_dec_ret: 2339 2340 movi v0.16b, #0 2341 movi v1.16b, #0 2342.Lxts_dec_bzero: // wipe key schedule 2343 stp q0, q1, [sp], #32 2344 cmp sp, x19 2345 bne .Lxts_dec_bzero 2346 2347 ldp x19, x20, [sp, #80] 2348 ldp x21, x22, [sp, #96] 2349 ldr x23, [sp, #112] 2350 ldp d8, d9, [sp, #128] 2351 ldp d10, d11, [sp, #144] 2352 ldp d12, d13, [sp, #160] 2353 ldp d14, d15, [sp, #176] 2354 ldp x29, x30, [sp], #192 2355 ret 2356.size ossl_bsaes_xts_decrypt,.-ossl_bsaes_xts_decrypt 2357