1/* Do not modify. This file is auto-generated from aesv8-armx.pl. */ 2#include "arm_arch.h" 3 4#if __ARM_MAX_ARCH__>=7 5.arch armv8-a+crypto 6.text 7.align 5 8.Lrcon: 9.long 0x01,0x01,0x01,0x01 10.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d // rotate-n-splat 11.long 0x1b,0x1b,0x1b,0x1b 12 13.globl aes_v8_set_encrypt_key 14.type aes_v8_set_encrypt_key,%function 15.align 5 16aes_v8_set_encrypt_key: 17.Lenc_key: 18 AARCH64_VALID_CALL_TARGET 19 // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. 20 stp x29,x30,[sp,#-16]! 21 add x29,sp,#0 22 mov x3,#-1 23 cmp x0,#0 24 b.eq .Lenc_key_abort 25 cmp x2,#0 26 b.eq .Lenc_key_abort 27 mov x3,#-2 28 cmp w1,#128 29 b.lt .Lenc_key_abort 30 cmp w1,#256 31 b.gt .Lenc_key_abort 32 tst w1,#0x3f 33 b.ne .Lenc_key_abort 34 35 adr x3,.Lrcon 36 cmp w1,#192 37 38 eor v0.16b,v0.16b,v0.16b 39 ld1 {v3.16b},[x0],#16 40 mov w1,#8 // reuse w1 41 ld1 {v1.4s,v2.4s},[x3],#32 42 43 b.lt .Loop128 44 b.eq .L192 45 b .L256 46 47.align 4 48.Loop128: 49 tbl v6.16b,{v3.16b},v2.16b 50 ext v5.16b,v0.16b,v3.16b,#12 51 st1 {v3.4s},[x2],#16 52 aese v6.16b,v0.16b 53 subs w1,w1,#1 54 55 eor v3.16b,v3.16b,v5.16b 56 ext v5.16b,v0.16b,v5.16b,#12 57 eor v3.16b,v3.16b,v5.16b 58 ext v5.16b,v0.16b,v5.16b,#12 59 eor v6.16b,v6.16b,v1.16b 60 eor v3.16b,v3.16b,v5.16b 61 shl v1.16b,v1.16b,#1 62 eor v3.16b,v3.16b,v6.16b 63 b.ne .Loop128 64 65 ld1 {v1.4s},[x3] 66 67 tbl v6.16b,{v3.16b},v2.16b 68 ext v5.16b,v0.16b,v3.16b,#12 69 st1 {v3.4s},[x2],#16 70 aese v6.16b,v0.16b 71 72 eor v3.16b,v3.16b,v5.16b 73 ext v5.16b,v0.16b,v5.16b,#12 74 eor v3.16b,v3.16b,v5.16b 75 ext v5.16b,v0.16b,v5.16b,#12 76 eor v6.16b,v6.16b,v1.16b 77 eor v3.16b,v3.16b,v5.16b 78 shl v1.16b,v1.16b,#1 79 eor v3.16b,v3.16b,v6.16b 80 81 tbl v6.16b,{v3.16b},v2.16b 82 ext v5.16b,v0.16b,v3.16b,#12 83 st1 {v3.4s},[x2],#16 84 aese v6.16b,v0.16b 85 86 eor v3.16b,v3.16b,v5.16b 87 ext v5.16b,v0.16b,v5.16b,#12 88 eor v3.16b,v3.16b,v5.16b 89 ext v5.16b,v0.16b,v5.16b,#12 90 eor 
v6.16b,v6.16b,v1.16b 91 eor v3.16b,v3.16b,v5.16b 92 eor v3.16b,v3.16b,v6.16b 93 st1 {v3.4s},[x2] 94 add x2,x2,#0x50 95 96 mov w12,#10 97 b .Ldone 98 99.align 4 100.L192: 101 ld1 {v4.8b},[x0],#8 102 movi v6.16b,#8 // borrow v6.16b 103 st1 {v3.4s},[x2],#16 104 sub v2.16b,v2.16b,v6.16b // adjust the mask 105 106.Loop192: 107 tbl v6.16b,{v4.16b},v2.16b 108 ext v5.16b,v0.16b,v3.16b,#12 109#ifdef __AARCH64EB__ 110 st1 {v4.4s},[x2],#16 111 sub x2,x2,#8 112#else 113 st1 {v4.8b},[x2],#8 114#endif 115 aese v6.16b,v0.16b 116 subs w1,w1,#1 117 118 eor v3.16b,v3.16b,v5.16b 119 ext v5.16b,v0.16b,v5.16b,#12 120 eor v3.16b,v3.16b,v5.16b 121 ext v5.16b,v0.16b,v5.16b,#12 122 eor v3.16b,v3.16b,v5.16b 123 124 dup v5.4s,v3.s[3] 125 eor v5.16b,v5.16b,v4.16b 126 eor v6.16b,v6.16b,v1.16b 127 ext v4.16b,v0.16b,v4.16b,#12 128 shl v1.16b,v1.16b,#1 129 eor v4.16b,v4.16b,v5.16b 130 eor v3.16b,v3.16b,v6.16b 131 eor v4.16b,v4.16b,v6.16b 132 st1 {v3.4s},[x2],#16 133 b.ne .Loop192 134 135 mov w12,#12 136 add x2,x2,#0x20 137 b .Ldone 138 139.align 4 140.L256: 141 ld1 {v4.16b},[x0] 142 mov w1,#7 143 mov w12,#14 144 st1 {v3.4s},[x2],#16 145 146.Loop256: 147 tbl v6.16b,{v4.16b},v2.16b 148 ext v5.16b,v0.16b,v3.16b,#12 149 st1 {v4.4s},[x2],#16 150 aese v6.16b,v0.16b 151 subs w1,w1,#1 152 153 eor v3.16b,v3.16b,v5.16b 154 ext v5.16b,v0.16b,v5.16b,#12 155 eor v3.16b,v3.16b,v5.16b 156 ext v5.16b,v0.16b,v5.16b,#12 157 eor v6.16b,v6.16b,v1.16b 158 eor v3.16b,v3.16b,v5.16b 159 shl v1.16b,v1.16b,#1 160 eor v3.16b,v3.16b,v6.16b 161 st1 {v3.4s},[x2],#16 162 b.eq .Ldone 163 164 dup v6.4s,v3.s[3] // just splat 165 ext v5.16b,v0.16b,v4.16b,#12 166 aese v6.16b,v0.16b 167 168 eor v4.16b,v4.16b,v5.16b 169 ext v5.16b,v0.16b,v5.16b,#12 170 eor v4.16b,v4.16b,v5.16b 171 ext v5.16b,v0.16b,v5.16b,#12 172 eor v4.16b,v4.16b,v5.16b 173 174 eor v4.16b,v4.16b,v6.16b 175 b .Loop256 176 177.Ldone: 178 str w12,[x2] 179 mov x3,#0 180 181.Lenc_key_abort: 182 mov x0,x3 // return value 183 ldr x29,[sp],#16 184 ret 185.size 
aes_v8_set_encrypt_key,.-aes_v8_set_encrypt_key 186 187.globl aes_v8_set_decrypt_key 188.type aes_v8_set_decrypt_key,%function 189.align 5 190aes_v8_set_decrypt_key: 191 AARCH64_SIGN_LINK_REGISTER 192 stp x29,x30,[sp,#-16]! 193 add x29,sp,#0 194 bl .Lenc_key 195 196 cmp x0,#0 197 b.ne .Ldec_key_abort 198 199 sub x2,x2,#240 // restore original x2 200 mov x4,#-16 201 add x0,x2,x12,lsl#4 // end of key schedule 202 203 ld1 {v0.4s},[x2] 204 ld1 {v1.4s},[x0] 205 st1 {v0.4s},[x0],x4 206 st1 {v1.4s},[x2],#16 207 208.Loop_imc: 209 ld1 {v0.4s},[x2] 210 ld1 {v1.4s},[x0] 211 aesimc v0.16b,v0.16b 212 aesimc v1.16b,v1.16b 213 st1 {v0.4s},[x0],x4 214 st1 {v1.4s},[x2],#16 215 cmp x0,x2 216 b.hi .Loop_imc 217 218 ld1 {v0.4s},[x2] 219 aesimc v0.16b,v0.16b 220 st1 {v0.4s},[x0] 221 222 eor x0,x0,x0 // return value 223.Ldec_key_abort: 224 ldp x29,x30,[sp],#16 225 AARCH64_VALIDATE_LINK_REGISTER 226 ret 227.size aes_v8_set_decrypt_key,.-aes_v8_set_decrypt_key 228.globl aes_v8_encrypt 229.type aes_v8_encrypt,%function 230.align 5 231aes_v8_encrypt: 232 AARCH64_VALID_CALL_TARGET 233 ldr w3,[x2,#240] 234 ld1 {v0.4s},[x2],#16 235 ld1 {v2.16b},[x0] 236 sub w3,w3,#2 237 ld1 {v1.4s},[x2],#16 238 239.Loop_enc: 240 aese v2.16b,v0.16b 241 aesmc v2.16b,v2.16b 242 ld1 {v0.4s},[x2],#16 243 subs w3,w3,#2 244 aese v2.16b,v1.16b 245 aesmc v2.16b,v2.16b 246 ld1 {v1.4s},[x2],#16 247 b.gt .Loop_enc 248 249 aese v2.16b,v0.16b 250 aesmc v2.16b,v2.16b 251 ld1 {v0.4s},[x2] 252 aese v2.16b,v1.16b 253 eor v2.16b,v2.16b,v0.16b 254 255 st1 {v2.16b},[x1] 256 ret 257.size aes_v8_encrypt,.-aes_v8_encrypt 258.globl aes_v8_decrypt 259.type aes_v8_decrypt,%function 260.align 5 261aes_v8_decrypt: 262 AARCH64_VALID_CALL_TARGET 263 ldr w3,[x2,#240] 264 ld1 {v0.4s},[x2],#16 265 ld1 {v2.16b},[x0] 266 sub w3,w3,#2 267 ld1 {v1.4s},[x2],#16 268 269.Loop_dec: 270 aesd v2.16b,v0.16b 271 aesimc v2.16b,v2.16b 272 ld1 {v0.4s},[x2],#16 273 subs w3,w3,#2 274 aesd v2.16b,v1.16b 275 aesimc v2.16b,v2.16b 276 ld1 {v1.4s},[x2],#16 277 b.gt 
.Loop_dec 278 279 aesd v2.16b,v0.16b 280 aesimc v2.16b,v2.16b 281 ld1 {v0.4s},[x2] 282 aesd v2.16b,v1.16b 283 eor v2.16b,v2.16b,v0.16b 284 285 st1 {v2.16b},[x1] 286 ret 287.size aes_v8_decrypt,.-aes_v8_decrypt 288.globl aes_v8_ecb_encrypt 289.type aes_v8_ecb_encrypt,%function 290.align 5 291aes_v8_ecb_encrypt: 292 AARCH64_VALID_CALL_TARGET 293 subs x2,x2,#16 294 // Original input data size bigger than 16, jump to big size processing. 295 b.ne .Lecb_big_size 296 ld1 {v0.16b},[x0] 297 cmp w4,#0 // en- or decrypting? 298 ldr w5,[x3,#240] 299 ld1 {v5.4s,v6.4s},[x3],#32 // load key schedule... 300 301 b.eq .Lecb_small_dec 302 aese v0.16b,v5.16b 303 aesmc v0.16b,v0.16b 304 ld1 {v16.4s,v17.4s},[x3],#32 // load key schedule... 305 aese v0.16b,v6.16b 306 aesmc v0.16b,v0.16b 307 subs w5,w5,#10 // if rounds==10, jump to aes-128-ecb processing 308 b.eq .Lecb_128_enc 309.Lecb_round_loop: 310 aese v0.16b,v16.16b 311 aesmc v0.16b,v0.16b 312 ld1 {v16.4s},[x3],#16 // load key schedule... 313 aese v0.16b,v17.16b 314 aesmc v0.16b,v0.16b 315 ld1 {v17.4s},[x3],#16 // load key schedule... 316 subs w5,w5,#2 // bias 317 b.gt .Lecb_round_loop 318.Lecb_128_enc: 319 ld1 {v18.4s,v19.4s},[x3],#32 // load key schedule... 320 aese v0.16b,v16.16b 321 aesmc v0.16b,v0.16b 322 aese v0.16b,v17.16b 323 aesmc v0.16b,v0.16b 324 ld1 {v20.4s,v21.4s},[x3],#32 // load key schedule... 325 aese v0.16b,v18.16b 326 aesmc v0.16b,v0.16b 327 aese v0.16b,v19.16b 328 aesmc v0.16b,v0.16b 329 ld1 {v22.4s,v23.4s},[x3],#32 // load key schedule... 330 aese v0.16b,v20.16b 331 aesmc v0.16b,v0.16b 332 aese v0.16b,v21.16b 333 aesmc v0.16b,v0.16b 334 ld1 {v7.4s},[x3] 335 aese v0.16b,v22.16b 336 aesmc v0.16b,v0.16b 337 aese v0.16b,v23.16b 338 eor v0.16b,v0.16b,v7.16b 339 st1 {v0.16b},[x1] 340 b .Lecb_Final_abort 341.Lecb_small_dec: 342 aesd v0.16b,v5.16b 343 aesimc v0.16b,v0.16b 344 ld1 {v16.4s,v17.4s},[x3],#32 // load key schedule... 
345 aesd v0.16b,v6.16b 346 aesimc v0.16b,v0.16b 347 subs w5,w5,#10 // bias 348 b.eq .Lecb_128_dec 349.Lecb_dec_round_loop: 350 aesd v0.16b,v16.16b 351 aesimc v0.16b,v0.16b 352 ld1 {v16.4s},[x3],#16 // load key schedule... 353 aesd v0.16b,v17.16b 354 aesimc v0.16b,v0.16b 355 ld1 {v17.4s},[x3],#16 // load key schedule... 356 subs w5,w5,#2 // bias 357 b.gt .Lecb_dec_round_loop 358.Lecb_128_dec: 359 ld1 {v18.4s,v19.4s},[x3],#32 // load key schedule... 360 aesd v0.16b,v16.16b 361 aesimc v0.16b,v0.16b 362 aesd v0.16b,v17.16b 363 aesimc v0.16b,v0.16b 364 ld1 {v20.4s,v21.4s},[x3],#32 // load key schedule... 365 aesd v0.16b,v18.16b 366 aesimc v0.16b,v0.16b 367 aesd v0.16b,v19.16b 368 aesimc v0.16b,v0.16b 369 ld1 {v22.4s,v23.4s},[x3],#32 // load key schedule... 370 aesd v0.16b,v20.16b 371 aesimc v0.16b,v0.16b 372 aesd v0.16b,v21.16b 373 aesimc v0.16b,v0.16b 374 ld1 {v7.4s},[x3] 375 aesd v0.16b,v22.16b 376 aesimc v0.16b,v0.16b 377 aesd v0.16b,v23.16b 378 eor v0.16b,v0.16b,v7.16b 379 st1 {v0.16b},[x1] 380 b .Lecb_Final_abort 381.Lecb_big_size: 382 stp x29,x30,[sp,#-16]! 383 add x29,sp,#0 384 mov x8,#16 385 b.lo .Lecb_done 386 csel x8,xzr,x8,eq 387 388 cmp w4,#0 // en- or decrypting? 389 ldr w5,[x3,#240] 390 and x2,x2,#-16 391 ld1 {v0.16b},[x0],x8 392 393 ld1 {v16.4s,v17.4s},[x3] // load key schedule... 
394 sub w5,w5,#6 395 add x7,x3,x5,lsl#4 // pointer to last 7 round keys 396 sub w5,w5,#2 397 ld1 {v18.4s,v19.4s},[x7],#32 398 ld1 {v20.4s,v21.4s},[x7],#32 399 ld1 {v22.4s,v23.4s},[x7],#32 400 ld1 {v7.4s},[x7] 401 402 add x7,x3,#32 403 mov w6,w5 404 b.eq .Lecb_dec 405 406 ld1 {v1.16b},[x0],#16 407 subs x2,x2,#32 // bias 408 add w6,w5,#2 409 orr v3.16b,v1.16b,v1.16b 410 orr v24.16b,v1.16b,v1.16b 411 orr v1.16b,v0.16b,v0.16b 412 b.lo .Lecb_enc_tail 413 414 orr v1.16b,v3.16b,v3.16b 415 ld1 {v24.16b},[x0],#16 416 cmp x2,#32 417 b.lo .Loop3x_ecb_enc 418 419 ld1 {v25.16b},[x0],#16 420 ld1 {v26.16b},[x0],#16 421 sub x2,x2,#32 // bias 422 mov w6,w5 423 424.Loop5x_ecb_enc: 425 aese v0.16b,v16.16b 426 aesmc v0.16b,v0.16b 427 aese v1.16b,v16.16b 428 aesmc v1.16b,v1.16b 429 aese v24.16b,v16.16b 430 aesmc v24.16b,v24.16b 431 aese v25.16b,v16.16b 432 aesmc v25.16b,v25.16b 433 aese v26.16b,v16.16b 434 aesmc v26.16b,v26.16b 435 ld1 {v16.4s},[x7],#16 436 subs w6,w6,#2 437 aese v0.16b,v17.16b 438 aesmc v0.16b,v0.16b 439 aese v1.16b,v17.16b 440 aesmc v1.16b,v1.16b 441 aese v24.16b,v17.16b 442 aesmc v24.16b,v24.16b 443 aese v25.16b,v17.16b 444 aesmc v25.16b,v25.16b 445 aese v26.16b,v17.16b 446 aesmc v26.16b,v26.16b 447 ld1 {v17.4s},[x7],#16 448 b.gt .Loop5x_ecb_enc 449 450 aese v0.16b,v16.16b 451 aesmc v0.16b,v0.16b 452 aese v1.16b,v16.16b 453 aesmc v1.16b,v1.16b 454 aese v24.16b,v16.16b 455 aesmc v24.16b,v24.16b 456 aese v25.16b,v16.16b 457 aesmc v25.16b,v25.16b 458 aese v26.16b,v16.16b 459 aesmc v26.16b,v26.16b 460 cmp x2,#0x40 // because .Lecb_enc_tail4x 461 sub x2,x2,#0x50 462 463 aese v0.16b,v17.16b 464 aesmc v0.16b,v0.16b 465 aese v1.16b,v17.16b 466 aesmc v1.16b,v1.16b 467 aese v24.16b,v17.16b 468 aesmc v24.16b,v24.16b 469 aese v25.16b,v17.16b 470 aesmc v25.16b,v25.16b 471 aese v26.16b,v17.16b 472 aesmc v26.16b,v26.16b 473 csel x6,xzr,x2,gt // borrow x6, w6, "gt" is not typo 474 mov x7,x3 475 476 aese v0.16b,v18.16b 477 aesmc v0.16b,v0.16b 478 aese v1.16b,v18.16b 479 aesmc 
v1.16b,v1.16b 480 aese v24.16b,v18.16b 481 aesmc v24.16b,v24.16b 482 aese v25.16b,v18.16b 483 aesmc v25.16b,v25.16b 484 aese v26.16b,v18.16b 485 aesmc v26.16b,v26.16b 486 add x0,x0,x6 // x0 is adjusted in such way that 487 // at exit from the loop v1.16b-v26.16b 488 // are loaded with last "words" 489 add x6,x2,#0x60 // because .Lecb_enc_tail4x 490 491 aese v0.16b,v19.16b 492 aesmc v0.16b,v0.16b 493 aese v1.16b,v19.16b 494 aesmc v1.16b,v1.16b 495 aese v24.16b,v19.16b 496 aesmc v24.16b,v24.16b 497 aese v25.16b,v19.16b 498 aesmc v25.16b,v25.16b 499 aese v26.16b,v19.16b 500 aesmc v26.16b,v26.16b 501 502 aese v0.16b,v20.16b 503 aesmc v0.16b,v0.16b 504 aese v1.16b,v20.16b 505 aesmc v1.16b,v1.16b 506 aese v24.16b,v20.16b 507 aesmc v24.16b,v24.16b 508 aese v25.16b,v20.16b 509 aesmc v25.16b,v25.16b 510 aese v26.16b,v20.16b 511 aesmc v26.16b,v26.16b 512 513 aese v0.16b,v21.16b 514 aesmc v0.16b,v0.16b 515 aese v1.16b,v21.16b 516 aesmc v1.16b,v1.16b 517 aese v24.16b,v21.16b 518 aesmc v24.16b,v24.16b 519 aese v25.16b,v21.16b 520 aesmc v25.16b,v25.16b 521 aese v26.16b,v21.16b 522 aesmc v26.16b,v26.16b 523 524 aese v0.16b,v22.16b 525 aesmc v0.16b,v0.16b 526 aese v1.16b,v22.16b 527 aesmc v1.16b,v1.16b 528 aese v24.16b,v22.16b 529 aesmc v24.16b,v24.16b 530 aese v25.16b,v22.16b 531 aesmc v25.16b,v25.16b 532 aese v26.16b,v22.16b 533 aesmc v26.16b,v26.16b 534 535 aese v0.16b,v23.16b 536 ld1 {v2.16b},[x0],#16 537 aese v1.16b,v23.16b 538 ld1 {v3.16b},[x0],#16 539 aese v24.16b,v23.16b 540 ld1 {v27.16b},[x0],#16 541 aese v25.16b,v23.16b 542 ld1 {v28.16b},[x0],#16 543 aese v26.16b,v23.16b 544 ld1 {v29.16b},[x0],#16 545 cbz x6,.Lecb_enc_tail4x 546 ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] 547 eor v4.16b,v7.16b,v0.16b 548 orr v0.16b,v2.16b,v2.16b 549 eor v5.16b,v7.16b,v1.16b 550 orr v1.16b,v3.16b,v3.16b 551 eor v17.16b,v7.16b,v24.16b 552 orr v24.16b,v27.16b,v27.16b 553 eor v30.16b,v7.16b,v25.16b 554 orr v25.16b,v28.16b,v28.16b 555 eor v31.16b,v7.16b,v26.16b 556 st1 {v4.16b},[x1],#16 
557 orr v26.16b,v29.16b,v29.16b 558 st1 {v5.16b},[x1],#16 559 mov w6,w5 560 st1 {v17.16b},[x1],#16 561 ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] 562 st1 {v30.16b},[x1],#16 563 st1 {v31.16b},[x1],#16 564 b.hs .Loop5x_ecb_enc 565 566 add x2,x2,#0x50 567 cbz x2,.Lecb_done 568 569 add w6,w5,#2 570 subs x2,x2,#0x30 571 orr v0.16b,v27.16b,v27.16b 572 orr v1.16b,v28.16b,v28.16b 573 orr v24.16b,v29.16b,v29.16b 574 b.lo .Lecb_enc_tail 575 576 b .Loop3x_ecb_enc 577 578.align 4 579.Lecb_enc_tail4x: 580 eor v5.16b,v7.16b,v1.16b 581 eor v17.16b,v7.16b,v24.16b 582 eor v30.16b,v7.16b,v25.16b 583 eor v31.16b,v7.16b,v26.16b 584 st1 {v5.16b},[x1],#16 585 st1 {v17.16b},[x1],#16 586 st1 {v30.16b},[x1],#16 587 st1 {v31.16b},[x1],#16 588 589 b .Lecb_done 590.align 4 591.Loop3x_ecb_enc: 592 aese v0.16b,v16.16b 593 aesmc v0.16b,v0.16b 594 aese v1.16b,v16.16b 595 aesmc v1.16b,v1.16b 596 aese v24.16b,v16.16b 597 aesmc v24.16b,v24.16b 598 ld1 {v16.4s},[x7],#16 599 subs w6,w6,#2 600 aese v0.16b,v17.16b 601 aesmc v0.16b,v0.16b 602 aese v1.16b,v17.16b 603 aesmc v1.16b,v1.16b 604 aese v24.16b,v17.16b 605 aesmc v24.16b,v24.16b 606 ld1 {v17.4s},[x7],#16 607 b.gt .Loop3x_ecb_enc 608 609 aese v0.16b,v16.16b 610 aesmc v0.16b,v0.16b 611 aese v1.16b,v16.16b 612 aesmc v1.16b,v1.16b 613 aese v24.16b,v16.16b 614 aesmc v24.16b,v24.16b 615 subs x2,x2,#0x30 616 csel x6,x2,x6,lo // x6, w6, is zero at this point 617 aese v0.16b,v17.16b 618 aesmc v0.16b,v0.16b 619 aese v1.16b,v17.16b 620 aesmc v1.16b,v1.16b 621 aese v24.16b,v17.16b 622 aesmc v24.16b,v24.16b 623 add x0,x0,x6 // x0 is adjusted in such way that 624 // at exit from the loop v1.16b-v24.16b 625 // are loaded with last "words" 626 mov x7,x3 627 aese v0.16b,v20.16b 628 aesmc v0.16b,v0.16b 629 aese v1.16b,v20.16b 630 aesmc v1.16b,v1.16b 631 aese v24.16b,v20.16b 632 aesmc v24.16b,v24.16b 633 ld1 {v2.16b},[x0],#16 634 aese v0.16b,v21.16b 635 aesmc v0.16b,v0.16b 636 aese v1.16b,v21.16b 637 aesmc v1.16b,v1.16b 638 aese v24.16b,v21.16b 639 aesmc 
v24.16b,v24.16b 640 ld1 {v3.16b},[x0],#16 641 aese v0.16b,v22.16b 642 aesmc v0.16b,v0.16b 643 aese v1.16b,v22.16b 644 aesmc v1.16b,v1.16b 645 aese v24.16b,v22.16b 646 aesmc v24.16b,v24.16b 647 ld1 {v27.16b},[x0],#16 648 aese v0.16b,v23.16b 649 aese v1.16b,v23.16b 650 aese v24.16b,v23.16b 651 ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] 652 add w6,w5,#2 653 eor v4.16b,v7.16b,v0.16b 654 eor v5.16b,v7.16b,v1.16b 655 eor v24.16b,v24.16b,v7.16b 656 ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] 657 st1 {v4.16b},[x1],#16 658 orr v0.16b,v2.16b,v2.16b 659 st1 {v5.16b},[x1],#16 660 orr v1.16b,v3.16b,v3.16b 661 st1 {v24.16b},[x1],#16 662 orr v24.16b,v27.16b,v27.16b 663 b.hs .Loop3x_ecb_enc 664 665 cmn x2,#0x30 666 b.eq .Lecb_done 667 nop 668 669.Lecb_enc_tail: 670 aese v1.16b,v16.16b 671 aesmc v1.16b,v1.16b 672 aese v24.16b,v16.16b 673 aesmc v24.16b,v24.16b 674 ld1 {v16.4s},[x7],#16 675 subs w6,w6,#2 676 aese v1.16b,v17.16b 677 aesmc v1.16b,v1.16b 678 aese v24.16b,v17.16b 679 aesmc v24.16b,v24.16b 680 ld1 {v17.4s},[x7],#16 681 b.gt .Lecb_enc_tail 682 683 aese v1.16b,v16.16b 684 aesmc v1.16b,v1.16b 685 aese v24.16b,v16.16b 686 aesmc v24.16b,v24.16b 687 aese v1.16b,v17.16b 688 aesmc v1.16b,v1.16b 689 aese v24.16b,v17.16b 690 aesmc v24.16b,v24.16b 691 aese v1.16b,v20.16b 692 aesmc v1.16b,v1.16b 693 aese v24.16b,v20.16b 694 aesmc v24.16b,v24.16b 695 cmn x2,#0x20 696 aese v1.16b,v21.16b 697 aesmc v1.16b,v1.16b 698 aese v24.16b,v21.16b 699 aesmc v24.16b,v24.16b 700 aese v1.16b,v22.16b 701 aesmc v1.16b,v1.16b 702 aese v24.16b,v22.16b 703 aesmc v24.16b,v24.16b 704 aese v1.16b,v23.16b 705 aese v24.16b,v23.16b 706 b.eq .Lecb_enc_one 707 eor v5.16b,v7.16b,v1.16b 708 eor v17.16b,v7.16b,v24.16b 709 st1 {v5.16b},[x1],#16 710 st1 {v17.16b},[x1],#16 711 b .Lecb_done 712 713.Lecb_enc_one: 714 eor v5.16b,v7.16b,v24.16b 715 st1 {v5.16b},[x1],#16 716 b .Lecb_done 717.align 5 718.Lecb_dec: 719 ld1 {v1.16b},[x0],#16 720 subs x2,x2,#32 // bias 721 add w6,w5,#2 722 orr v3.16b,v1.16b,v1.16b 723 
orr v24.16b,v1.16b,v1.16b 724 orr v1.16b,v0.16b,v0.16b 725 b.lo .Lecb_dec_tail 726 727 orr v1.16b,v3.16b,v3.16b 728 ld1 {v24.16b},[x0],#16 729 cmp x2,#32 730 b.lo .Loop3x_ecb_dec 731 732 ld1 {v25.16b},[x0],#16 733 ld1 {v26.16b},[x0],#16 734 sub x2,x2,#32 // bias 735 mov w6,w5 736 737.Loop5x_ecb_dec: 738 aesd v0.16b,v16.16b 739 aesimc v0.16b,v0.16b 740 aesd v1.16b,v16.16b 741 aesimc v1.16b,v1.16b 742 aesd v24.16b,v16.16b 743 aesimc v24.16b,v24.16b 744 aesd v25.16b,v16.16b 745 aesimc v25.16b,v25.16b 746 aesd v26.16b,v16.16b 747 aesimc v26.16b,v26.16b 748 ld1 {v16.4s},[x7],#16 749 subs w6,w6,#2 750 aesd v0.16b,v17.16b 751 aesimc v0.16b,v0.16b 752 aesd v1.16b,v17.16b 753 aesimc v1.16b,v1.16b 754 aesd v24.16b,v17.16b 755 aesimc v24.16b,v24.16b 756 aesd v25.16b,v17.16b 757 aesimc v25.16b,v25.16b 758 aesd v26.16b,v17.16b 759 aesimc v26.16b,v26.16b 760 ld1 {v17.4s},[x7],#16 761 b.gt .Loop5x_ecb_dec 762 763 aesd v0.16b,v16.16b 764 aesimc v0.16b,v0.16b 765 aesd v1.16b,v16.16b 766 aesimc v1.16b,v1.16b 767 aesd v24.16b,v16.16b 768 aesimc v24.16b,v24.16b 769 aesd v25.16b,v16.16b 770 aesimc v25.16b,v25.16b 771 aesd v26.16b,v16.16b 772 aesimc v26.16b,v26.16b 773 cmp x2,#0x40 // because .Lecb_tail4x 774 sub x2,x2,#0x50 775 776 aesd v0.16b,v17.16b 777 aesimc v0.16b,v0.16b 778 aesd v1.16b,v17.16b 779 aesimc v1.16b,v1.16b 780 aesd v24.16b,v17.16b 781 aesimc v24.16b,v24.16b 782 aesd v25.16b,v17.16b 783 aesimc v25.16b,v25.16b 784 aesd v26.16b,v17.16b 785 aesimc v26.16b,v26.16b 786 csel x6,xzr,x2,gt // borrow x6, w6, "gt" is not typo 787 mov x7,x3 788 789 aesd v0.16b,v18.16b 790 aesimc v0.16b,v0.16b 791 aesd v1.16b,v18.16b 792 aesimc v1.16b,v1.16b 793 aesd v24.16b,v18.16b 794 aesimc v24.16b,v24.16b 795 aesd v25.16b,v18.16b 796 aesimc v25.16b,v25.16b 797 aesd v26.16b,v18.16b 798 aesimc v26.16b,v26.16b 799 add x0,x0,x6 // x0 is adjusted in such way that 800 // at exit from the loop v1.16b-v26.16b 801 // are loaded with last "words" 802 add x6,x2,#0x60 // because .Lecb_tail4x 803 804 aesd 
v0.16b,v19.16b 805 aesimc v0.16b,v0.16b 806 aesd v1.16b,v19.16b 807 aesimc v1.16b,v1.16b 808 aesd v24.16b,v19.16b 809 aesimc v24.16b,v24.16b 810 aesd v25.16b,v19.16b 811 aesimc v25.16b,v25.16b 812 aesd v26.16b,v19.16b 813 aesimc v26.16b,v26.16b 814 815 aesd v0.16b,v20.16b 816 aesimc v0.16b,v0.16b 817 aesd v1.16b,v20.16b 818 aesimc v1.16b,v1.16b 819 aesd v24.16b,v20.16b 820 aesimc v24.16b,v24.16b 821 aesd v25.16b,v20.16b 822 aesimc v25.16b,v25.16b 823 aesd v26.16b,v20.16b 824 aesimc v26.16b,v26.16b 825 826 aesd v0.16b,v21.16b 827 aesimc v0.16b,v0.16b 828 aesd v1.16b,v21.16b 829 aesimc v1.16b,v1.16b 830 aesd v24.16b,v21.16b 831 aesimc v24.16b,v24.16b 832 aesd v25.16b,v21.16b 833 aesimc v25.16b,v25.16b 834 aesd v26.16b,v21.16b 835 aesimc v26.16b,v26.16b 836 837 aesd v0.16b,v22.16b 838 aesimc v0.16b,v0.16b 839 aesd v1.16b,v22.16b 840 aesimc v1.16b,v1.16b 841 aesd v24.16b,v22.16b 842 aesimc v24.16b,v24.16b 843 aesd v25.16b,v22.16b 844 aesimc v25.16b,v25.16b 845 aesd v26.16b,v22.16b 846 aesimc v26.16b,v26.16b 847 848 aesd v0.16b,v23.16b 849 ld1 {v2.16b},[x0],#16 850 aesd v1.16b,v23.16b 851 ld1 {v3.16b},[x0],#16 852 aesd v24.16b,v23.16b 853 ld1 {v27.16b},[x0],#16 854 aesd v25.16b,v23.16b 855 ld1 {v28.16b},[x0],#16 856 aesd v26.16b,v23.16b 857 ld1 {v29.16b},[x0],#16 858 cbz x6,.Lecb_tail4x 859 ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] 860 eor v4.16b,v7.16b,v0.16b 861 orr v0.16b,v2.16b,v2.16b 862 eor v5.16b,v7.16b,v1.16b 863 orr v1.16b,v3.16b,v3.16b 864 eor v17.16b,v7.16b,v24.16b 865 orr v24.16b,v27.16b,v27.16b 866 eor v30.16b,v7.16b,v25.16b 867 orr v25.16b,v28.16b,v28.16b 868 eor v31.16b,v7.16b,v26.16b 869 st1 {v4.16b},[x1],#16 870 orr v26.16b,v29.16b,v29.16b 871 st1 {v5.16b},[x1],#16 872 mov w6,w5 873 st1 {v17.16b},[x1],#16 874 ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] 875 st1 {v30.16b},[x1],#16 876 st1 {v31.16b},[x1],#16 877 b.hs .Loop5x_ecb_dec 878 879 add x2,x2,#0x50 880 cbz x2,.Lecb_done 881 882 add w6,w5,#2 883 subs x2,x2,#0x30 884 orr 
v0.16b,v27.16b,v27.16b 885 orr v1.16b,v28.16b,v28.16b 886 orr v24.16b,v29.16b,v29.16b 887 b.lo .Lecb_dec_tail 888 889 b .Loop3x_ecb_dec 890 891.align 4 892.Lecb_tail4x: 893 eor v5.16b,v7.16b,v1.16b 894 eor v17.16b,v7.16b,v24.16b 895 eor v30.16b,v7.16b,v25.16b 896 eor v31.16b,v7.16b,v26.16b 897 st1 {v5.16b},[x1],#16 898 st1 {v17.16b},[x1],#16 899 st1 {v30.16b},[x1],#16 900 st1 {v31.16b},[x1],#16 901 902 b .Lecb_done 903.align 4 904.Loop3x_ecb_dec: 905 aesd v0.16b,v16.16b 906 aesimc v0.16b,v0.16b 907 aesd v1.16b,v16.16b 908 aesimc v1.16b,v1.16b 909 aesd v24.16b,v16.16b 910 aesimc v24.16b,v24.16b 911 ld1 {v16.4s},[x7],#16 912 subs w6,w6,#2 913 aesd v0.16b,v17.16b 914 aesimc v0.16b,v0.16b 915 aesd v1.16b,v17.16b 916 aesimc v1.16b,v1.16b 917 aesd v24.16b,v17.16b 918 aesimc v24.16b,v24.16b 919 ld1 {v17.4s},[x7],#16 920 b.gt .Loop3x_ecb_dec 921 922 aesd v0.16b,v16.16b 923 aesimc v0.16b,v0.16b 924 aesd v1.16b,v16.16b 925 aesimc v1.16b,v1.16b 926 aesd v24.16b,v16.16b 927 aesimc v24.16b,v24.16b 928 subs x2,x2,#0x30 929 csel x6,x2,x6,lo // x6, w6, is zero at this point 930 aesd v0.16b,v17.16b 931 aesimc v0.16b,v0.16b 932 aesd v1.16b,v17.16b 933 aesimc v1.16b,v1.16b 934 aesd v24.16b,v17.16b 935 aesimc v24.16b,v24.16b 936 add x0,x0,x6 // x0 is adjusted in such way that 937 // at exit from the loop v1.16b-v24.16b 938 // are loaded with last "words" 939 mov x7,x3 940 aesd v0.16b,v20.16b 941 aesimc v0.16b,v0.16b 942 aesd v1.16b,v20.16b 943 aesimc v1.16b,v1.16b 944 aesd v24.16b,v20.16b 945 aesimc v24.16b,v24.16b 946 ld1 {v2.16b},[x0],#16 947 aesd v0.16b,v21.16b 948 aesimc v0.16b,v0.16b 949 aesd v1.16b,v21.16b 950 aesimc v1.16b,v1.16b 951 aesd v24.16b,v21.16b 952 aesimc v24.16b,v24.16b 953 ld1 {v3.16b},[x0],#16 954 aesd v0.16b,v22.16b 955 aesimc v0.16b,v0.16b 956 aesd v1.16b,v22.16b 957 aesimc v1.16b,v1.16b 958 aesd v24.16b,v22.16b 959 aesimc v24.16b,v24.16b 960 ld1 {v27.16b},[x0],#16 961 aesd v0.16b,v23.16b 962 aesd v1.16b,v23.16b 963 aesd v24.16b,v23.16b 964 ld1 {v16.4s},[x7],#16 
// re-pre-load rndkey[0] 965 add w6,w5,#2 966 eor v4.16b,v7.16b,v0.16b 967 eor v5.16b,v7.16b,v1.16b 968 eor v24.16b,v24.16b,v7.16b 969 ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] 970 st1 {v4.16b},[x1],#16 971 orr v0.16b,v2.16b,v2.16b 972 st1 {v5.16b},[x1],#16 973 orr v1.16b,v3.16b,v3.16b 974 st1 {v24.16b},[x1],#16 975 orr v24.16b,v27.16b,v27.16b 976 b.hs .Loop3x_ecb_dec 977 978 cmn x2,#0x30 979 b.eq .Lecb_done 980 nop 981 982.Lecb_dec_tail: 983 aesd v1.16b,v16.16b 984 aesimc v1.16b,v1.16b 985 aesd v24.16b,v16.16b 986 aesimc v24.16b,v24.16b 987 ld1 {v16.4s},[x7],#16 988 subs w6,w6,#2 989 aesd v1.16b,v17.16b 990 aesimc v1.16b,v1.16b 991 aesd v24.16b,v17.16b 992 aesimc v24.16b,v24.16b 993 ld1 {v17.4s},[x7],#16 994 b.gt .Lecb_dec_tail 995 996 aesd v1.16b,v16.16b 997 aesimc v1.16b,v1.16b 998 aesd v24.16b,v16.16b 999 aesimc v24.16b,v24.16b 1000 aesd v1.16b,v17.16b 1001 aesimc v1.16b,v1.16b 1002 aesd v24.16b,v17.16b 1003 aesimc v24.16b,v24.16b 1004 aesd v1.16b,v20.16b 1005 aesimc v1.16b,v1.16b 1006 aesd v24.16b,v20.16b 1007 aesimc v24.16b,v24.16b 1008 cmn x2,#0x20 1009 aesd v1.16b,v21.16b 1010 aesimc v1.16b,v1.16b 1011 aesd v24.16b,v21.16b 1012 aesimc v24.16b,v24.16b 1013 aesd v1.16b,v22.16b 1014 aesimc v1.16b,v1.16b 1015 aesd v24.16b,v22.16b 1016 aesimc v24.16b,v24.16b 1017 aesd v1.16b,v23.16b 1018 aesd v24.16b,v23.16b 1019 b.eq .Lecb_dec_one 1020 eor v5.16b,v7.16b,v1.16b 1021 eor v17.16b,v7.16b,v24.16b 1022 st1 {v5.16b},[x1],#16 1023 st1 {v17.16b},[x1],#16 1024 b .Lecb_done 1025 1026.Lecb_dec_one: 1027 eor v5.16b,v7.16b,v24.16b 1028 st1 {v5.16b},[x1],#16 1029 1030.Lecb_done: 1031 ldr x29,[sp],#16 1032.Lecb_Final_abort: 1033 ret 1034.size aes_v8_ecb_encrypt,.-aes_v8_ecb_encrypt 1035.globl aes_v8_cbc_encrypt 1036.type aes_v8_cbc_encrypt,%function 1037.align 5 1038aes_v8_cbc_encrypt: 1039 AARCH64_VALID_CALL_TARGET 1040 // Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later. 1041 stp x29,x30,[sp,#-16]! 
1042 add x29,sp,#0 1043 subs x2,x2,#16 1044 mov x8,#16 1045 b.lo .Lcbc_abort 1046 csel x8,xzr,x8,eq 1047 1048 cmp w5,#0 // en- or decrypting? 1049 ldr w5,[x3,#240] 1050 and x2,x2,#-16 1051 ld1 {v6.16b},[x4] 1052 ld1 {v0.16b},[x0],x8 1053 1054 ld1 {v16.4s,v17.4s},[x3] // load key schedule... 1055 sub w5,w5,#6 1056 add x7,x3,x5,lsl#4 // pointer to last 7 round keys 1057 sub w5,w5,#2 1058 ld1 {v18.4s,v19.4s},[x7],#32 1059 ld1 {v20.4s,v21.4s},[x7],#32 1060 ld1 {v22.4s,v23.4s},[x7],#32 1061 ld1 {v7.4s},[x7] 1062 1063 add x7,x3,#32 1064 mov w6,w5 1065 b.eq .Lcbc_dec 1066 1067 cmp w5,#2 1068 eor v0.16b,v0.16b,v6.16b 1069 eor v5.16b,v16.16b,v7.16b 1070 b.eq .Lcbc_enc128 1071 1072 ld1 {v2.4s,v3.4s},[x7] 1073 add x7,x3,#16 1074 add x6,x3,#16*4 1075 add x12,x3,#16*5 1076 aese v0.16b,v16.16b 1077 aesmc v0.16b,v0.16b 1078 add x14,x3,#16*6 1079 add x3,x3,#16*7 1080 b .Lenter_cbc_enc 1081 1082.align 4 1083.Loop_cbc_enc: 1084 aese v0.16b,v16.16b 1085 aesmc v0.16b,v0.16b 1086 st1 {v6.16b},[x1],#16 1087.Lenter_cbc_enc: 1088 aese v0.16b,v17.16b 1089 aesmc v0.16b,v0.16b 1090 aese v0.16b,v2.16b 1091 aesmc v0.16b,v0.16b 1092 ld1 {v16.4s},[x6] 1093 cmp w5,#4 1094 aese v0.16b,v3.16b 1095 aesmc v0.16b,v0.16b 1096 ld1 {v17.4s},[x12] 1097 b.eq .Lcbc_enc192 1098 1099 aese v0.16b,v16.16b 1100 aesmc v0.16b,v0.16b 1101 ld1 {v16.4s},[x14] 1102 aese v0.16b,v17.16b 1103 aesmc v0.16b,v0.16b 1104 ld1 {v17.4s},[x3] 1105 nop 1106 1107.Lcbc_enc192: 1108 aese v0.16b,v16.16b 1109 aesmc v0.16b,v0.16b 1110 subs x2,x2,#16 1111 aese v0.16b,v17.16b 1112 aesmc v0.16b,v0.16b 1113 csel x8,xzr,x8,eq 1114 aese v0.16b,v18.16b 1115 aesmc v0.16b,v0.16b 1116 aese v0.16b,v19.16b 1117 aesmc v0.16b,v0.16b 1118 ld1 {v16.16b},[x0],x8 1119 aese v0.16b,v20.16b 1120 aesmc v0.16b,v0.16b 1121 eor v16.16b,v16.16b,v5.16b 1122 aese v0.16b,v21.16b 1123 aesmc v0.16b,v0.16b 1124 ld1 {v17.4s},[x7] // re-pre-load rndkey[1] 1125 aese v0.16b,v22.16b 1126 aesmc v0.16b,v0.16b 1127 aese v0.16b,v23.16b 1128 eor v6.16b,v0.16b,v7.16b 1129 b.hs 
.Loop_cbc_enc 1130 1131 st1 {v6.16b},[x1],#16 1132 b .Lcbc_done 1133 1134.align 5 1135.Lcbc_enc128: 1136 ld1 {v2.4s,v3.4s},[x7] 1137 aese v0.16b,v16.16b 1138 aesmc v0.16b,v0.16b 1139 b .Lenter_cbc_enc128 1140.Loop_cbc_enc128: 1141 aese v0.16b,v16.16b 1142 aesmc v0.16b,v0.16b 1143 st1 {v6.16b},[x1],#16 1144.Lenter_cbc_enc128: 1145 aese v0.16b,v17.16b 1146 aesmc v0.16b,v0.16b 1147 subs x2,x2,#16 1148 aese v0.16b,v2.16b 1149 aesmc v0.16b,v0.16b 1150 csel x8,xzr,x8,eq 1151 aese v0.16b,v3.16b 1152 aesmc v0.16b,v0.16b 1153 aese v0.16b,v18.16b 1154 aesmc v0.16b,v0.16b 1155 aese v0.16b,v19.16b 1156 aesmc v0.16b,v0.16b 1157 ld1 {v16.16b},[x0],x8 1158 aese v0.16b,v20.16b 1159 aesmc v0.16b,v0.16b 1160 aese v0.16b,v21.16b 1161 aesmc v0.16b,v0.16b 1162 aese v0.16b,v22.16b 1163 aesmc v0.16b,v0.16b 1164 eor v16.16b,v16.16b,v5.16b 1165 aese v0.16b,v23.16b 1166 eor v6.16b,v0.16b,v7.16b 1167 b.hs .Loop_cbc_enc128 1168 1169 st1 {v6.16b},[x1],#16 1170 b .Lcbc_done 1171.align 5 1172.Lcbc_dec: 1173 ld1 {v24.16b},[x0],#16 1174 subs x2,x2,#32 // bias 1175 add w6,w5,#2 1176 orr v3.16b,v0.16b,v0.16b 1177 orr v1.16b,v0.16b,v0.16b 1178 orr v27.16b,v24.16b,v24.16b 1179 b.lo .Lcbc_dec_tail 1180 1181 orr v1.16b,v24.16b,v24.16b 1182 ld1 {v24.16b},[x0],#16 1183 orr v2.16b,v0.16b,v0.16b 1184 orr v3.16b,v1.16b,v1.16b 1185 orr v27.16b,v24.16b,v24.16b 1186 cmp x2,#32 1187 b.lo .Loop3x_cbc_dec 1188 1189 ld1 {v25.16b},[x0],#16 1190 ld1 {v26.16b},[x0],#16 1191 sub x2,x2,#32 // bias 1192 mov w6,w5 1193 orr v28.16b,v25.16b,v25.16b 1194 orr v29.16b,v26.16b,v26.16b 1195 1196.Loop5x_cbc_dec: 1197 aesd v0.16b,v16.16b 1198 aesimc v0.16b,v0.16b 1199 aesd v1.16b,v16.16b 1200 aesimc v1.16b,v1.16b 1201 aesd v24.16b,v16.16b 1202 aesimc v24.16b,v24.16b 1203 aesd v25.16b,v16.16b 1204 aesimc v25.16b,v25.16b 1205 aesd v26.16b,v16.16b 1206 aesimc v26.16b,v26.16b 1207 ld1 {v16.4s},[x7],#16 1208 subs w6,w6,#2 1209 aesd v0.16b,v17.16b 1210 aesimc v0.16b,v0.16b 1211 aesd v1.16b,v17.16b 1212 aesimc v1.16b,v1.16b 1213 aesd 
v24.16b,v17.16b 1214 aesimc v24.16b,v24.16b 1215 aesd v25.16b,v17.16b 1216 aesimc v25.16b,v25.16b 1217 aesd v26.16b,v17.16b 1218 aesimc v26.16b,v26.16b 1219 ld1 {v17.4s},[x7],#16 1220 b.gt .Loop5x_cbc_dec 1221 1222 aesd v0.16b,v16.16b 1223 aesimc v0.16b,v0.16b 1224 aesd v1.16b,v16.16b 1225 aesimc v1.16b,v1.16b 1226 aesd v24.16b,v16.16b 1227 aesimc v24.16b,v24.16b 1228 aesd v25.16b,v16.16b 1229 aesimc v25.16b,v25.16b 1230 aesd v26.16b,v16.16b 1231 aesimc v26.16b,v26.16b 1232 cmp x2,#0x40 // because .Lcbc_tail4x 1233 sub x2,x2,#0x50 1234 1235 aesd v0.16b,v17.16b 1236 aesimc v0.16b,v0.16b 1237 aesd v1.16b,v17.16b 1238 aesimc v1.16b,v1.16b 1239 aesd v24.16b,v17.16b 1240 aesimc v24.16b,v24.16b 1241 aesd v25.16b,v17.16b 1242 aesimc v25.16b,v25.16b 1243 aesd v26.16b,v17.16b 1244 aesimc v26.16b,v26.16b 1245 csel x6,xzr,x2,gt // borrow x6, w6, "gt" is not typo 1246 mov x7,x3 1247 1248 aesd v0.16b,v18.16b 1249 aesimc v0.16b,v0.16b 1250 aesd v1.16b,v18.16b 1251 aesimc v1.16b,v1.16b 1252 aesd v24.16b,v18.16b 1253 aesimc v24.16b,v24.16b 1254 aesd v25.16b,v18.16b 1255 aesimc v25.16b,v25.16b 1256 aesd v26.16b,v18.16b 1257 aesimc v26.16b,v26.16b 1258 add x0,x0,x6 // x0 is adjusted in such way that 1259 // at exit from the loop v1.16b-v26.16b 1260 // are loaded with last "words" 1261 add x6,x2,#0x60 // because .Lcbc_tail4x 1262 1263 aesd v0.16b,v19.16b 1264 aesimc v0.16b,v0.16b 1265 aesd v1.16b,v19.16b 1266 aesimc v1.16b,v1.16b 1267 aesd v24.16b,v19.16b 1268 aesimc v24.16b,v24.16b 1269 aesd v25.16b,v19.16b 1270 aesimc v25.16b,v25.16b 1271 aesd v26.16b,v19.16b 1272 aesimc v26.16b,v26.16b 1273 1274 aesd v0.16b,v20.16b 1275 aesimc v0.16b,v0.16b 1276 aesd v1.16b,v20.16b 1277 aesimc v1.16b,v1.16b 1278 aesd v24.16b,v20.16b 1279 aesimc v24.16b,v24.16b 1280 aesd v25.16b,v20.16b 1281 aesimc v25.16b,v25.16b 1282 aesd v26.16b,v20.16b 1283 aesimc v26.16b,v26.16b 1284 1285 aesd v0.16b,v21.16b 1286 aesimc v0.16b,v0.16b 1287 aesd v1.16b,v21.16b 1288 aesimc v1.16b,v1.16b 1289 aesd v24.16b,v21.16b 
1290 aesimc v24.16b,v24.16b 1291 aesd v25.16b,v21.16b 1292 aesimc v25.16b,v25.16b 1293 aesd v26.16b,v21.16b 1294 aesimc v26.16b,v26.16b 1295 1296 aesd v0.16b,v22.16b 1297 aesimc v0.16b,v0.16b 1298 aesd v1.16b,v22.16b 1299 aesimc v1.16b,v1.16b 1300 aesd v24.16b,v22.16b 1301 aesimc v24.16b,v24.16b 1302 aesd v25.16b,v22.16b 1303 aesimc v25.16b,v25.16b 1304 aesd v26.16b,v22.16b 1305 aesimc v26.16b,v26.16b 1306 1307 eor v4.16b,v6.16b,v7.16b 1308 aesd v0.16b,v23.16b 1309 eor v5.16b,v2.16b,v7.16b 1310 ld1 {v2.16b},[x0],#16 1311 aesd v1.16b,v23.16b 1312 eor v17.16b,v3.16b,v7.16b 1313 ld1 {v3.16b},[x0],#16 1314 aesd v24.16b,v23.16b 1315 eor v30.16b,v27.16b,v7.16b 1316 ld1 {v27.16b},[x0],#16 1317 aesd v25.16b,v23.16b 1318 eor v31.16b,v28.16b,v7.16b 1319 ld1 {v28.16b},[x0],#16 1320 aesd v26.16b,v23.16b 1321 orr v6.16b,v29.16b,v29.16b 1322 ld1 {v29.16b},[x0],#16 1323 cbz x6,.Lcbc_tail4x 1324 ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] 1325 eor v4.16b,v4.16b,v0.16b 1326 orr v0.16b,v2.16b,v2.16b 1327 eor v5.16b,v5.16b,v1.16b 1328 orr v1.16b,v3.16b,v3.16b 1329 eor v17.16b,v17.16b,v24.16b 1330 orr v24.16b,v27.16b,v27.16b 1331 eor v30.16b,v30.16b,v25.16b 1332 orr v25.16b,v28.16b,v28.16b 1333 eor v31.16b,v31.16b,v26.16b 1334 st1 {v4.16b},[x1],#16 1335 orr v26.16b,v29.16b,v29.16b 1336 st1 {v5.16b},[x1],#16 1337 mov w6,w5 1338 st1 {v17.16b},[x1],#16 1339 ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] 1340 st1 {v30.16b},[x1],#16 1341 st1 {v31.16b},[x1],#16 1342 b.hs .Loop5x_cbc_dec 1343 1344 add x2,x2,#0x50 1345 cbz x2,.Lcbc_done 1346 1347 add w6,w5,#2 1348 subs x2,x2,#0x30 1349 orr v0.16b,v27.16b,v27.16b 1350 orr v2.16b,v27.16b,v27.16b 1351 orr v1.16b,v28.16b,v28.16b 1352 orr v3.16b,v28.16b,v28.16b 1353 orr v24.16b,v29.16b,v29.16b 1354 orr v27.16b,v29.16b,v29.16b 1355 b.lo .Lcbc_dec_tail 1356 1357 b .Loop3x_cbc_dec 1358 1359.align 4 1360.Lcbc_tail4x: 1361 eor v5.16b,v4.16b,v1.16b 1362 eor v17.16b,v17.16b,v24.16b 1363 eor v30.16b,v30.16b,v25.16b 1364 eor v31.16b,v31.16b,v26.16b 1365 
st1 {v5.16b},[x1],#16 1366 st1 {v17.16b},[x1],#16 1367 st1 {v30.16b},[x1],#16 1368 st1 {v31.16b},[x1],#16 1369 1370 b .Lcbc_done 1371.align 4 1372.Loop3x_cbc_dec: 1373 aesd v0.16b,v16.16b 1374 aesimc v0.16b,v0.16b 1375 aesd v1.16b,v16.16b 1376 aesimc v1.16b,v1.16b 1377 aesd v24.16b,v16.16b 1378 aesimc v24.16b,v24.16b 1379 ld1 {v16.4s},[x7],#16 1380 subs w6,w6,#2 1381 aesd v0.16b,v17.16b 1382 aesimc v0.16b,v0.16b 1383 aesd v1.16b,v17.16b 1384 aesimc v1.16b,v1.16b 1385 aesd v24.16b,v17.16b 1386 aesimc v24.16b,v24.16b 1387 ld1 {v17.4s},[x7],#16 1388 b.gt .Loop3x_cbc_dec 1389 1390 aesd v0.16b,v16.16b 1391 aesimc v0.16b,v0.16b 1392 aesd v1.16b,v16.16b 1393 aesimc v1.16b,v1.16b 1394 aesd v24.16b,v16.16b 1395 aesimc v24.16b,v24.16b 1396 eor v4.16b,v6.16b,v7.16b 1397 subs x2,x2,#0x30 1398 eor v5.16b,v2.16b,v7.16b 1399 csel x6,x2,x6,lo // x6, w6, is zero at this point 1400 aesd v0.16b,v17.16b 1401 aesimc v0.16b,v0.16b 1402 aesd v1.16b,v17.16b 1403 aesimc v1.16b,v1.16b 1404 aesd v24.16b,v17.16b 1405 aesimc v24.16b,v24.16b 1406 eor v17.16b,v3.16b,v7.16b 1407 add x0,x0,x6 // x0 is adjusted in such way that 1408 // at exit from the loop v1.16b-v24.16b 1409 // are loaded with last "words" 1410 orr v6.16b,v27.16b,v27.16b 1411 mov x7,x3 1412 aesd v0.16b,v20.16b 1413 aesimc v0.16b,v0.16b 1414 aesd v1.16b,v20.16b 1415 aesimc v1.16b,v1.16b 1416 aesd v24.16b,v20.16b 1417 aesimc v24.16b,v24.16b 1418 ld1 {v2.16b},[x0],#16 1419 aesd v0.16b,v21.16b 1420 aesimc v0.16b,v0.16b 1421 aesd v1.16b,v21.16b 1422 aesimc v1.16b,v1.16b 1423 aesd v24.16b,v21.16b 1424 aesimc v24.16b,v24.16b 1425 ld1 {v3.16b},[x0],#16 1426 aesd v0.16b,v22.16b 1427 aesimc v0.16b,v0.16b 1428 aesd v1.16b,v22.16b 1429 aesimc v1.16b,v1.16b 1430 aesd v24.16b,v22.16b 1431 aesimc v24.16b,v24.16b 1432 ld1 {v27.16b},[x0],#16 1433 aesd v0.16b,v23.16b 1434 aesd v1.16b,v23.16b 1435 aesd v24.16b,v23.16b 1436 ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] 1437 add w6,w5,#2 1438 eor v4.16b,v4.16b,v0.16b 1439 eor v5.16b,v5.16b,v1.16b 
// Closing part of aes_v8_cbc_encrypt (the function starts earlier in the file).
// Layout restored to one statement per line; the instruction sequence is unchanged.
	eor	v24.16b,v24.16b,v17.16b
	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
	st1	{v4.16b},[x1],#16
	orr	v0.16b,v2.16b,v2.16b
	st1	{v5.16b},[x1],#16
	orr	v1.16b,v3.16b,v3.16b
	st1	{v24.16b},[x1],#16
	orr	v24.16b,v27.16b,v27.16b
	b.hs	.Loop3x_cbc_dec

	cmn	x2,#0x30
	b.eq	.Lcbc_done
	nop

.Lcbc_dec_tail:
	aesd	v1.16b,v16.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v16.16b
	aesimc	v24.16b,v24.16b
	ld1	{v16.4s},[x7],#16
	subs	w6,w6,#2
	aesd	v1.16b,v17.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v17.16b
	aesimc	v24.16b,v24.16b
	ld1	{v17.4s},[x7],#16
	b.gt	.Lcbc_dec_tail

	aesd	v1.16b,v16.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v16.16b
	aesimc	v24.16b,v24.16b
	aesd	v1.16b,v17.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v17.16b
	aesimc	v24.16b,v24.16b
	aesd	v1.16b,v20.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v20.16b
	aesimc	v24.16b,v24.16b
	cmn	x2,#0x20
	aesd	v1.16b,v21.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v21.16b
	aesimc	v24.16b,v24.16b
	eor	v5.16b,v6.16b,v7.16b
	aesd	v1.16b,v22.16b
	aesimc	v1.16b,v1.16b
	aesd	v24.16b,v22.16b
	aesimc	v24.16b,v24.16b
	eor	v17.16b,v3.16b,v7.16b
	aesd	v1.16b,v23.16b
	aesd	v24.16b,v23.16b
	b.eq	.Lcbc_dec_one
	eor	v5.16b,v5.16b,v1.16b
	eor	v17.16b,v17.16b,v24.16b
	orr	v6.16b,v27.16b,v27.16b
	st1	{v5.16b},[x1],#16
	st1	{v17.16b},[x1],#16
	b	.Lcbc_done

.Lcbc_dec_one:
	eor	v5.16b,v5.16b,v24.16b
	orr	v6.16b,v27.16b,v27.16b
	st1	{v5.16b},[x1],#16

.Lcbc_done:
	st1	{v6.16b},[x4]
.Lcbc_abort:
	ldr	x29,[sp],#16
	ret
.size	aes_v8_cbc_encrypt,.-aes_v8_cbc_encrypt

// ---------------------------------------------------------------------
// aes_v8_ctr32_encrypt_blocks
//
// AES in counter mode where only the last 32-bit word of the counter
// block is incremented (all counter arithmetic below is done in w
// registers, so it wraps modulo 2^32).
//
// Register roles, as used by the code below:
//   x0   input pointer, advanced by 16 per block loaded
//   x1   output pointer, advanced by 16 per block stored
//   x2   number of 16-byte blocks still to process (biased inside loops)
//   x3   key schedule; the round count is read from [x3,#240]
//   x4   16-byte counter block; the word at offset 12 is the counter
//   w5   round count minus 6, reloaded into w6 for every pass
//   w6   per-pass round counter, stepped down by 2
//   x7   walking pointer into the key schedule
//   w8   counter value in host byte order
//   x12  post-increment for the first load in .Lctr32_tail (0 or 16);
//        w12 is also reused as a scratch counter in the 3x/5x paths
//   v0,v1,v18,v24,v25  counter blocks being encrypted
//   v6   copy of the initial counter block, used as the template for
//        every new counter block (lane 3 is overwritten per block)
//   v16,v17            rolling pair of round keys
//   v20-v23,v7         the last five round keys
//
// No return value is produced. x29/x30 are pushed on entry; only x29 is
// reloaded on exit.
// ---------------------------------------------------------------------
.globl	aes_v8_ctr32_encrypt_blocks
.type	aes_v8_ctr32_encrypt_blocks,%function
.align	5
aes_v8_ctr32_encrypt_blocks:
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	stp	x29,x30,[sp,#-16]!
	add	x29,sp,#0
	ldr	w5,[x3,#240]

	ldr	w8, [x4, #12]
#ifdef __AARCH64EB__
	ld1	{v0.16b},[x4]
#else
	ld1	{v0.4s},[x4]
#endif
	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
	sub	w5,w5,#4
	mov	x12,#16
	cmp	x2,#2				// flags consumed by csel and b.ls below
	add	x7,x3,x5,lsl#4	// pointer to last 5 round keys
	sub	w5,w5,#2
	ld1	{v20.4s,v21.4s},[x7],#32
	ld1	{v22.4s,v23.4s},[x7],#32
	ld1	{v7.4s},[x7]
	add	x7,x3,#32
	mov	w6,w5
	csel	x12,xzr,x12,lo		// single block: second tail load re-reads the same block
#ifndef __AARCH64EB__
	rev	w8, w8
#endif
	orr	v1.16b,v0.16b,v0.16b
	add	w10, w8, #1
	orr	v18.16b,v0.16b,v0.16b
	add	w8, w8, #2
	orr	v6.16b,v0.16b,v0.16b
	rev	w10, w10
	mov	v1.s[3],w10
	b.ls	.Lctr32_tail			// one or two blocks only
	rev	w12, w8
	sub	x2,x2,#3		// bias
	mov	v18.s[3],w12
	cmp	x2,#32
	b.lo	.Loop3x_ctr32

	add	w13,w8,#1
	add	w14,w8,#2
	orr	v24.16b,v0.16b,v0.16b
	rev	w13,w13
	orr	v25.16b,v0.16b,v0.16b
	rev	w14,w14
	mov	v24.s[3],w13
	sub	x2,x2,#2		// bias
	mov	v25.s[3],w14
	add	w8,w8,#2
	b	.Loop5x_ctr32

	// Five blocks per pass: v0,v1,v18,v24,v25.
.align	4
.Loop5x_ctr32:
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	aese	v18.16b,v16.16b
	aesmc	v18.16b,v18.16b
	aese	v24.16b,v16.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v16.16b
	aesmc	v25.16b,v25.16b
	ld1	{v16.4s},[x7],#16
	subs	w6,w6,#2
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	aese	v18.16b,v17.16b
	aesmc	v18.16b,v18.16b
	aese	v24.16b,v17.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v17.16b
	aesmc	v25.16b,v25.16b
	ld1	{v17.4s},[x7],#16
	b.gt	.Loop5x_ctr32

	mov	x7,x3
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	aese	v18.16b,v16.16b
	aesmc	v18.16b,v18.16b
	aese	v24.16b,v16.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v16.16b
	aesmc	v25.16b,v25.16b
	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]

	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	aese	v18.16b,v17.16b
	aesmc	v18.16b,v18.16b
	aese	v24.16b,v17.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v17.16b
	aesmc	v25.16b,v25.16b
	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]

	// Next five counter values are prepared in w9,w10,w12,w13,w14
	// while the current blocks finish their last rounds.
	aese	v0.16b,v20.16b
	aesmc	v0.16b,v0.16b
	add	w9,w8,#1
	add	w10,w8,#2
	aese	v1.16b,v20.16b
	aesmc	v1.16b,v1.16b
	add	w12,w8,#3
	add	w13,w8,#4
	aese	v18.16b,v20.16b
	aesmc	v18.16b,v18.16b
	add	w14,w8,#5
	rev	w9,w9
	aese	v24.16b,v20.16b
	aesmc	v24.16b,v24.16b
	rev	w10,w10
	rev	w12,w12
	aese	v25.16b,v20.16b
	aesmc	v25.16b,v25.16b
	rev	w13,w13
	rev	w14,w14

	aese	v0.16b,v21.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v21.16b
	aesmc	v1.16b,v1.16b
	aese	v18.16b,v21.16b
	aesmc	v18.16b,v18.16b
	aese	v24.16b,v21.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v21.16b
	aesmc	v25.16b,v25.16b

	aese	v0.16b,v22.16b
	aesmc	v0.16b,v0.16b
	ld1	{v2.16b},[x0],#16
	aese	v1.16b,v22.16b
	aesmc	v1.16b,v1.16b
	ld1	{v3.16b},[x0],#16
	aese	v18.16b,v22.16b
	aesmc	v18.16b,v18.16b
	ld1	{v19.16b},[x0],#16
	aese	v24.16b,v22.16b
	aesmc	v24.16b,v24.16b
	ld1	{v26.16b},[x0],#16
	aese	v25.16b,v22.16b
	aesmc	v25.16b,v25.16b
	ld1	{v27.16b},[x0],#16

	// The last round key (v7) is folded into the input blocks so one
	// eor per block finishes both the cipher and the XOR with the data.
	aese	v0.16b,v23.16b
	eor	v2.16b,v2.16b,v7.16b
	aese	v1.16b,v23.16b
	eor	v3.16b,v3.16b,v7.16b
	aese	v18.16b,v23.16b
	eor	v19.16b,v19.16b,v7.16b
	aese	v24.16b,v23.16b
	eor	v26.16b,v26.16b,v7.16b
	aese	v25.16b,v23.16b
	eor	v27.16b,v27.16b,v7.16b

	eor	v2.16b,v2.16b,v0.16b
	orr	v0.16b,v6.16b,v6.16b
	eor	v3.16b,v3.16b,v1.16b
	orr	v1.16b,v6.16b,v6.16b
	eor	v19.16b,v19.16b,v18.16b
	orr	v18.16b,v6.16b,v6.16b
	eor	v26.16b,v26.16b,v24.16b
	orr	v24.16b,v6.16b,v6.16b
	eor	v27.16b,v27.16b,v25.16b
	orr	v25.16b,v6.16b,v6.16b

	st1	{v2.16b},[x1],#16
	mov	v0.s[3],w9
	st1	{v3.16b},[x1],#16
	mov	v1.s[3],w10
	st1	{v19.16b},[x1],#16
	mov	v18.s[3],w12
	st1	{v26.16b},[x1],#16
	mov	v24.s[3],w13
	st1	{v27.16b},[x1],#16
	mov	v25.s[3],w14

	mov	w6,w5
	cbz	x2,.Lctr32_done

	add	w8,w8,#5
	subs	x2,x2,#5
	b.hs	.Loop5x_ctr32

	// Fewer than five blocks left: undo the bias and the counter step.
	add	x2,x2,#5
	sub	w8,w8,#5

	cmp	x2,#2
	mov	x12,#16
	csel	x12,xzr,x12,lo
	b.ls	.Lctr32_tail

	sub	x2,x2,#3		// bias
	add	w8,w8,#3
	b	.Loop3x_ctr32

	// Three blocks per pass: v0,v1,v18.
.align	4
.Loop3x_ctr32:
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	aese	v18.16b,v16.16b
	aesmc	v18.16b,v18.16b
	ld1	{v16.4s},[x7],#16
	subs	w6,w6,#2
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	aese	v18.16b,v17.16b
	aesmc	v18.16b,v18.16b
	ld1	{v17.4s},[x7],#16
	b.gt	.Loop3x_ctr32

	// The in-flight state moves to v4,v5,v17 so that v0,v1,v18 can be
	// refilled from the template (v6) for the next pass.
	aese	v0.16b,v16.16b
	aesmc	v4.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v5.16b,v1.16b
	ld1	{v2.16b},[x0],#16
	orr	v0.16b,v6.16b,v6.16b
	aese	v18.16b,v16.16b
	aesmc	v18.16b,v18.16b
	ld1	{v3.16b},[x0],#16
	orr	v1.16b,v6.16b,v6.16b
	aese	v4.16b,v17.16b
	aesmc	v4.16b,v4.16b
	aese	v5.16b,v17.16b
	aesmc	v5.16b,v5.16b
	ld1	{v19.16b},[x0],#16
	mov	x7,x3
	aese	v18.16b,v17.16b
	aesmc	v17.16b,v18.16b
	orr	v18.16b,v6.16b,v6.16b
	add	w9,w8,#1
	aese	v4.16b,v20.16b
	aesmc	v4.16b,v4.16b
	aese	v5.16b,v20.16b
	aesmc	v5.16b,v5.16b
	eor	v2.16b,v2.16b,v7.16b
	add	w10,w8,#2
	aese	v17.16b,v20.16b
	aesmc	v17.16b,v17.16b
	eor	v3.16b,v3.16b,v7.16b
	add	w8,w8,#3
	aese	v4.16b,v21.16b
	aesmc	v4.16b,v4.16b
	aese	v5.16b,v21.16b
	aesmc	v5.16b,v5.16b
	eor	v19.16b,v19.16b,v7.16b
	rev	w9,w9
	aese	v17.16b,v21.16b
	aesmc	v17.16b,v17.16b
	mov	v0.s[3], w9
	rev	w10,w10
	aese	v4.16b,v22.16b
	aesmc	v4.16b,v4.16b
	aese	v5.16b,v22.16b
	aesmc	v5.16b,v5.16b
	mov	v1.s[3], w10
	rev	w12,w8
	aese	v17.16b,v22.16b
	aesmc	v17.16b,v17.16b
	mov	v18.s[3], w12
	subs	x2,x2,#3
	aese	v4.16b,v23.16b
	aese	v5.16b,v23.16b
	aese	v17.16b,v23.16b

	eor	v2.16b,v2.16b,v4.16b
	ld1	{v16.4s},[x7],#16	// re-pre-load rndkey[0]
	st1	{v2.16b},[x1],#16
	eor	v3.16b,v3.16b,v5.16b
	mov	w6,w5
	st1	{v3.16b},[x1],#16
	eor	v19.16b,v19.16b,v17.16b
	ld1	{v17.4s},[x7],#16	// re-pre-load rndkey[1]
	st1	{v19.16b},[x1],#16
	b.hs	.Loop3x_ctr32

	adds	x2,x2,#3		// undo bias; zero means nothing is left
	b.eq	.Lctr32_done
	cmp	x2,#1
	mov	x12,#16
	csel	x12,xzr,x12,eq		// one block left: do not advance x0 after the first load

	// One or two remaining blocks. Two blocks are always computed; the
	// second is stored only when x2 != 1.
.Lctr32_tail:
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	ld1	{v16.4s},[x7],#16
	subs	w6,w6,#2
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	ld1	{v17.4s},[x7],#16
	b.gt	.Lctr32_tail

	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	ld1	{v2.16b},[x0],x12
	aese	v0.16b,v20.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v20.16b
	aesmc	v1.16b,v1.16b
	ld1	{v3.16b},[x0]
	aese	v0.16b,v21.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v21.16b
	aesmc	v1.16b,v1.16b
	eor	v2.16b,v2.16b,v7.16b
	aese	v0.16b,v22.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v22.16b
	aesmc	v1.16b,v1.16b
	eor	v3.16b,v3.16b,v7.16b
	aese	v0.16b,v23.16b
	aese	v1.16b,v23.16b

	cmp	x2,#1
	eor	v2.16b,v2.16b,v0.16b
	eor	v3.16b,v3.16b,v1.16b
	st1	{v2.16b},[x1],#16
	b.eq	.Lctr32_done
	st1	{v3.16b},[x1]

.Lctr32_done:
	ldr	x29,[sp],#16
	ret
.size	aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks

// ---------------------------------------------------------------------
// aes_v8_xts_encrypt
//
// Register roles, as used by the code below:
//   x0   input pointer
//   x1   output pointer
//   x2   length in bytes
//   x3   key schedule used on the data (rounds at [x3,#240])
//   x4   key schedule used to encrypt the IV ("key2", rounds at [x4,#240])
//   x5   pointer to the 16-byte IV
//   v6   encrypted IV (the first tweak)
//
// A length of exactly 16 is handled inline right here; every other
// length branches to .Lxts_enc_big_size.
// ---------------------------------------------------------------------
.globl	aes_v8_xts_encrypt
.type	aes_v8_xts_encrypt,%function
.align	5
aes_v8_xts_encrypt:
	AARCH64_VALID_CALL_TARGET
	cmp	x2,#16
	// Original input data size bigger than 16, jump to big size processing.
	b.ne	.Lxts_enc_big_size
	// Encrypt the iv with key2, as the first XEX iv.
	ldr	w6,[x4,#240]
	ld1	{v0.4s},[x4],#16
	ld1	{v6.16b},[x5]
	sub	w6,w6,#2
	ld1	{v1.4s},[x4],#16

.Loop_enc_iv_enc:
	aese	v6.16b,v0.16b
	aesmc	v6.16b,v6.16b
	ld1	{v0.4s},[x4],#16
	subs	w6,w6,#2
	aese	v6.16b,v1.16b
	aesmc	v6.16b,v6.16b
	ld1	{v1.4s},[x4],#16
	b.gt	.Loop_enc_iv_enc

	aese	v6.16b,v0.16b
	aesmc	v6.16b,v6.16b
	ld1	{v0.4s},[x4]
	aese	v6.16b,v1.16b
	eor	v6.16b,v6.16b,v0.16b

	// Single block: XOR with the tweak, encrypt, XOR with the tweak again.
	ld1	{v0.16b},[x0]
	eor	v0.16b,v6.16b,v0.16b

	ldr	w6,[x3,#240]
	ld1	{v28.4s,v29.4s},[x3],#32		// load key schedule...

	aese	v0.16b,v28.16b
	aesmc	v0.16b,v0.16b
	ld1	{v16.4s,v17.4s},[x3],#32		// load key schedule...
	aese	v0.16b,v29.16b
	aesmc	v0.16b,v0.16b
	subs	w6,w6,#10		// if rounds==10, jump to aes-128-xts processing
	b.eq	.Lxts_128_enc
.Lxts_enc_round_loop:
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	ld1	{v16.4s},[x3],#16		// load key schedule...
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	ld1	{v17.4s},[x3],#16		// load key schedule...
	subs	w6,w6,#2		// bias
	b.gt	.Lxts_enc_round_loop
.Lxts_128_enc:
	ld1	{v18.4s,v19.4s},[x3],#32		// load key schedule...
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	ld1	{v20.4s,v21.4s},[x3],#32		// load key schedule...
// aes_v8_xts_encrypt, continued: remaining rounds of the single-block path.
// Layout restored to one statement per line; the instruction sequence is unchanged.
	aese	v0.16b,v18.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v19.16b
	aesmc	v0.16b,v0.16b
	ld1	{v22.4s,v23.4s},[x3],#32		// load key schedule...
	aese	v0.16b,v20.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v21.16b
	aesmc	v0.16b,v0.16b
	ld1	{v7.4s},[x3]
	aese	v0.16b,v22.16b
	aesmc	v0.16b,v0.16b
	aese	v0.16b,v23.16b
	eor	v0.16b,v0.16b,v7.16b
	eor	v0.16b,v0.16b,v6.16b
	st1	{v0.16b},[x1]
	b	.Lxts_enc_final_abort		// no frame was pushed on this path

	// Path for every length other than 16.
	// A 64-byte frame saves x19-x22 and d8-d11, which are used below
	// (w19 = 0x87 constant, x20 = tail input pointer, x21 = tail byte
	// count, x22 = scratch, v8-v11 = tweaks).
.align	4
.Lxts_enc_big_size:
	stp	x19,x20,[sp,#-64]!
	stp	x21,x22,[sp,#48]
	stp	d8,d9,[sp,#32]
	stp	d10,d11,[sp,#16]

	// x21 (tailcnt) holds length % 16; x2 becomes the whole-block byte
	// count minus 16.
	and	x21,x2,#0xf
	and	x2,x2,#-16
	subs	x2,x2,#16
	mov	x8,#16
	b.lo	.Lxts_abort		// less than one whole block: nothing is written
	csel	x8,xzr,x8,eq		// exactly one whole block: first load does not advance x0

	// Firstly, encrypt the iv with key2, as the first iv of XEX.
	ldr	w6,[x4,#240]
	ld1	{v0.4s},[x4],#16
	ld1	{v6.16b},[x5]
	sub	w6,w6,#2
	ld1	{v1.4s},[x4],#16

.Loop_iv_enc:
	aese	v6.16b,v0.16b
	aesmc	v6.16b,v6.16b
	ld1	{v0.4s},[x4],#16
	subs	w6,w6,#2
	aese	v6.16b,v1.16b
	aesmc	v6.16b,v6.16b
	ld1	{v1.4s},[x4],#16
	b.gt	.Loop_iv_enc

	aese	v6.16b,v0.16b
	aesmc	v6.16b,v6.16b
	ld1	{v0.4s},[x4]
	aese	v6.16b,v1.16b
	eor	v6.16b,v6.16b,v0.16b

	// The iv for second block
	// x9- iv(low), x10 - iv(high)
	// the five ivs stored into, v6.16b,v8.16b,v9.16b,v10.16b,v11.16b
	//
	// Each extr/extr/and/eor group below shifts the 128-bit value
	// x10:x9 left by one bit and XORs 0x87 (w19) into the low word when
	// the bit shifted out of the top was set.
	fmov	x9,d6
	fmov	x10,v6.d[1]
	mov	w19,#0x87
	extr	x22,x10,x10,#32
	extr	x10,x10,x9,#63
	and	w11,w19,w22,asr#31
	eor	x9,x11,x9,lsl#1
	fmov	d8,x9
	fmov	v8.d[1],x10

	ldr	w5,[x3,#240]		// next starting point
	ld1	{v0.16b},[x0],x8

	ld1	{v16.4s,v17.4s},[x3]		// load key schedule...
	sub	w5,w5,#6
	add	x7,x3,x5,lsl#4		// pointer to last 7 round keys
	sub	w5,w5,#2
	ld1	{v18.4s,v19.4s},[x7],#32
	ld1	{v20.4s,v21.4s},[x7],#32
	ld1	{v22.4s,v23.4s},[x7],#32
	ld1	{v7.4s},[x7]

	add	x7,x3,#32
	mov	w6,w5

	// Encryption
.Lxts_enc:
	ld1	{v24.16b},[x0],#16
	subs	x2,x2,#32			// bias
	add	w6,w5,#2
	orr	v3.16b,v0.16b,v0.16b
	orr	v1.16b,v0.16b,v0.16b
	orr	v28.16b,v0.16b,v0.16b
	orr	v27.16b,v24.16b,v24.16b
	orr	v29.16b,v24.16b,v24.16b
	b.lo	.Lxts_inner_enc_tail
	eor	v0.16b,v0.16b,v6.16b			// before encryption, xor with iv
	eor	v24.16b,v24.16b,v8.16b

	// The iv for third block
	extr	x22,x10,x10,#32
	extr	x10,x10,x9,#63
	and	w11,w19,w22,asr#31
	eor	x9,x11,x9,lsl#1
	fmov	d9,x9
	fmov	v9.d[1],x10


	orr	v1.16b,v24.16b,v24.16b
	ld1	{v24.16b},[x0],#16
	orr	v2.16b,v0.16b,v0.16b
	orr	v3.16b,v1.16b,v1.16b
	eor	v27.16b,v24.16b,v9.16b		// the third block
	eor	v24.16b,v24.16b,v9.16b
	cmp	x2,#32
	b.lo	.Lxts_outer_enc_tail

	// The iv for fourth block
	extr	x22,x10,x10,#32
	extr	x10,x10,x9,#63
	and	w11,w19,w22,asr#31
	eor	x9,x11,x9,lsl#1
	fmov	d10,x9
	fmov	v10.d[1],x10

	ld1	{v25.16b},[x0],#16
	// The iv for fifth block
	extr	x22,x10,x10,#32
	extr	x10,x10,x9,#63
	and	w11,w19,w22,asr#31
	eor	x9,x11,x9,lsl#1
	fmov	d11,x9
	fmov	v11.d[1],x10

	ld1	{v26.16b},[x0],#16
	eor	v25.16b,v25.16b,v10.16b		// the fourth block
	eor	v26.16b,v26.16b,v11.16b
	sub	x2,x2,#32			// bias
	mov	w6,w5
	b	.Loop5x_xts_enc

	// Five blocks per pass: v0,v1,v24,v25,v26 with tweaks v6,v8,v9,v10,v11.
.align	4
.Loop5x_xts_enc:
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v16.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v16.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v16.16b
	aesmc	v26.16b,v26.16b
	ld1	{v16.4s},[x7],#16
	subs	w6,w6,#2
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v17.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v17.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v17.16b
	aesmc	v26.16b,v26.16b
	ld1	{v17.4s},[x7],#16
	b.gt	.Loop5x_xts_enc

	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v16.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v16.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v16.16b
	aesmc	v26.16b,v26.16b
	subs	x2,x2,#0x50			// because .Lxts_enc_tail4x

	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v17.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v17.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v17.16b
	aesmc	v26.16b,v26.16b
	csel	x6,xzr,x2,gt		// borrow x6, w6, "gt" is not typo
	mov	x7,x3

	aese	v0.16b,v18.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v18.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v18.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v18.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v18.16b
	aesmc	v26.16b,v26.16b
	add	x0,x0,x6		// x0 is adjusted in such way that
						// at exit from the loop v1.16b-v26.16b
						// are loaded with last "words"
	add	x6,x2,#0x60		// because .Lxts_enc_tail4x

	aese	v0.16b,v19.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v19.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v19.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v19.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v19.16b
	aesmc	v26.16b,v26.16b

	aese	v0.16b,v20.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v20.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v20.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v20.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v20.16b
	aesmc	v26.16b,v26.16b

	aese	v0.16b,v21.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v21.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v21.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v21.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v21.16b
	aesmc	v26.16b,v26.16b

	aese	v0.16b,v22.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v22.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v22.16b
	aesmc	v24.16b,v24.16b
	aese	v25.16b,v22.16b
	aesmc	v25.16b,v25.16b
	aese	v26.16b,v22.16b
	aesmc	v26.16b,v26.16b

	// v4,v5,v17,v30,v31 = last round key XOR the tweak of each block,
	// captured before the tweaks are advanced for the next pass.
	eor	v4.16b,v7.16b,v6.16b
	aese	v0.16b,v23.16b
	// The iv for first block of one iteration
	extr	x22,x10,x10,#32
	extr	x10,x10,x9,#63
	and	w11,w19,w22,asr#31
	eor	x9,x11,x9,lsl#1
	fmov	d6,x9
	fmov	v6.d[1],x10
	eor	v5.16b,v7.16b,v8.16b
	ld1	{v2.16b},[x0],#16
	aese	v1.16b,v23.16b
	// The iv for second block
	extr	x22,x10,x10,#32
	extr	x10,x10,x9,#63
	and	w11,w19,w22,asr#31
	eor	x9,x11,x9,lsl#1
	fmov	d8,x9
	fmov	v8.d[1],x10
	eor	v17.16b,v7.16b,v9.16b
	ld1	{v3.16b},[x0],#16
	aese	v24.16b,v23.16b
	// The iv for third block
	extr	x22,x10,x10,#32
	extr	x10,x10,x9,#63
	and	w11,w19,w22,asr#31
	eor	x9,x11,x9,lsl#1
	fmov	d9,x9
	fmov	v9.d[1],x10
	eor	v30.16b,v7.16b,v10.16b
	ld1	{v27.16b},[x0],#16
	aese	v25.16b,v23.16b
	// The iv for fourth block
	extr	x22,x10,x10,#32
	extr	x10,x10,x9,#63
	and	w11,w19,w22,asr#31
	eor	x9,x11,x9,lsl#1
	fmov	d10,x9
	fmov	v10.d[1],x10
	eor	v31.16b,v7.16b,v11.16b
	ld1	{v28.16b},[x0],#16
	aese	v26.16b,v23.16b

	// The iv for fifth block
	extr	x22,x10,x10,#32
	extr	x10,x10,x9,#63
	and	w11,w19,w22,asr #31
	eor	x9,x11,x9,lsl #1
	fmov	d11,x9
	fmov	v11.d[1],x10

	ld1	{v29.16b},[x0],#16
	cbz	x6,.Lxts_enc_tail4x
	ld1	{v16.4s},[x7],#16		// re-pre-load rndkey[0]
	eor	v4.16b,v4.16b,v0.16b
	eor	v0.16b,v2.16b,v6.16b
	eor	v5.16b,v5.16b,v1.16b
	eor	v1.16b,v3.16b,v8.16b
	eor	v17.16b,v17.16b,v24.16b
	eor	v24.16b,v27.16b,v9.16b
	eor	v30.16b,v30.16b,v25.16b
	eor	v25.16b,v28.16b,v10.16b
	eor	v31.16b,v31.16b,v26.16b
	st1	{v4.16b},[x1],#16
	eor	v26.16b,v29.16b,v11.16b
	st1	{v5.16b},[x1],#16
	mov	w6,w5
	st1	{v17.16b},[x1],#16
	ld1	{v17.4s},[x7],#16		// re-pre-load rndkey[1]
	st1	{v30.16b},[x1],#16
	st1	{v31.16b},[x1],#16
	b.hs	.Loop5x_xts_enc


	// If four blocks are left, reuse the five-block loop for them.
	cmn	x2,#0x10
	b.ne	.Loop5x_enc_after
	orr	v11.16b,v10.16b,v10.16b
	orr	v10.16b,v9.16b,v9.16b
	orr	v9.16b,v8.16b,v8.16b
	orr	v8.16b,v6.16b,v6.16b
	fmov	x9,d11
	fmov	x10,v11.d[1]
	eor	v0.16b,v6.16b,v2.16b
	eor	v1.16b,v8.16b,v3.16b
	eor	v24.16b,v27.16b,v9.16b
	eor	v25.16b,v28.16b,v10.16b
	eor	v26.16b,v29.16b,v11.16b
	b.eq	.Loop5x_xts_enc		// flags still come from the cmn above

.Loop5x_enc_after:
	add	x2,x2,#0x50
	cbz	x2,.Lxts_enc_done

	add	w6,w5,#2
	subs	x2,x2,#0x30
	b.lo	.Lxts_inner_enc_tail

	eor	v0.16b,v6.16b,v27.16b
	eor	v1.16b,v8.16b,v28.16b
	eor	v24.16b,v29.16b,v9.16b
	b	.Lxts_outer_enc_tail

.align	4
.Lxts_enc_tail4x:
	add	x0,x0,#16
	eor	v5.16b,v1.16b,v5.16b
	st1	{v5.16b},[x1],#16
	eor	v17.16b,v24.16b,v17.16b
	st1	{v17.16b},[x1],#16
	eor	v30.16b,v25.16b,v30.16b
	eor	v31.16b,v26.16b,v31.16b
	st1	{v30.16b,v31.16b},[x1],#32

	b	.Lxts_enc_done
	// Three blocks: v0,v1,v24.
.align	4
.Lxts_outer_enc_tail:
	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v16.16b
	aesmc	v24.16b,v24.16b
	ld1	{v16.4s},[x7],#16
	subs	w6,w6,#2
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v17.16b
	aesmc	v24.16b,v24.16b
	ld1	{v17.4s},[x7],#16
	b.gt	.Lxts_outer_enc_tail

	aese	v0.16b,v16.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v16.16b
	aesmc	v24.16b,v24.16b
	eor	v4.16b,v6.16b,v7.16b
	subs	x2,x2,#0x30
	// The iv for first block
	fmov	x9,d9
	fmov	x10,v9.d[1]
	//mov	w19,#0x87
	extr	x22,x10,x10,#32
	extr	x10,x10,x9,#63
	and	w11,w19,w22,asr#31
	eor	x9,x11,x9,lsl#1
	fmov	d6,x9
	fmov	v6.d[1],x10
	eor	v5.16b,v8.16b,v7.16b
	csel	x6,x2,x6,lo		// x6, w6, is zero at this point
	aese	v0.16b,v17.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v17.16b
	aesmc	v24.16b,v24.16b
	eor	v17.16b,v9.16b,v7.16b

	add	x6,x6,#0x20
	add	x0,x0,x6
	mov	x7,x3

	aese	v0.16b,v20.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v20.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v20.16b
	aesmc	v24.16b,v24.16b
	aese	v0.16b,v21.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v21.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v21.16b
	aesmc	v24.16b,v24.16b
	aese	v0.16b,v22.16b
	aesmc	v0.16b,v0.16b
	aese	v1.16b,v22.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v22.16b
	aesmc	v24.16b,v24.16b
	aese	v0.16b,v23.16b
	aese	v1.16b,v23.16b
	aese	v24.16b,v23.16b
	ld1	{v27.16b},[x0],#16
	add	w6,w5,#2
	ld1	{v16.4s},[x7],#16		// re-pre-load rndkey[0]
	eor	v4.16b,v4.16b,v0.16b
	eor	v5.16b,v5.16b,v1.16b
	eor	v24.16b,v24.16b,v17.16b
	ld1	{v17.4s},[x7],#16		// re-pre-load rndkey[1]
	st1	{v4.16b},[x1],#16
	st1	{v5.16b},[x1],#16
	st1	{v24.16b},[x1],#16
	cmn	x2,#0x30
	b.eq	.Lxts_enc_done
.Lxts_encxor_one:
	orr	v28.16b,v3.16b,v3.16b
	orr	v29.16b,v27.16b,v27.16b
	nop

	// One or two blocks. Two lanes (v1,v24) are always computed; the
	// cmn x2,#0x20 further down decides whether one or two are stored.
.Lxts_inner_enc_tail:
	cmn	x2,#0x10
	eor	v1.16b,v28.16b,v6.16b
	eor	v24.16b,v29.16b,v8.16b
	b.eq	.Lxts_enc_tail_loop
	eor	v24.16b,v29.16b,v6.16b
.Lxts_enc_tail_loop:
	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v16.16b
	aesmc	v24.16b,v24.16b
	ld1	{v16.4s},[x7],#16
	subs	w6,w6,#2
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v17.16b
	aesmc	v24.16b,v24.16b
	ld1	{v17.4s},[x7],#16
	b.gt	.Lxts_enc_tail_loop

	aese	v1.16b,v16.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v16.16b
	aesmc	v24.16b,v24.16b
	aese	v1.16b,v17.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v17.16b
	aesmc	v24.16b,v24.16b
	aese	v1.16b,v20.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v20.16b
	aesmc	v24.16b,v24.16b
	cmn	x2,#0x20
	aese	v1.16b,v21.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v21.16b
	aesmc	v24.16b,v24.16b
	eor	v5.16b,v6.16b,v7.16b
	aese	v1.16b,v22.16b
	aesmc	v1.16b,v1.16b
	aese	v24.16b,v22.16b
	aesmc	v24.16b,v24.16b
	eor	v17.16b,v8.16b,v7.16b
	aese	v1.16b,v23.16b
	aese	v24.16b,v23.16b
	b.eq	.Lxts_enc_one
	eor	v5.16b,v5.16b,v1.16b
	st1	{v5.16b},[x1],#16
	eor	v17.16b,v17.16b,v24.16b
	orr	v6.16b,v8.16b,v8.16b
	st1	{v17.16b},[x1],#16
	// Advance the tweak once more; v6 is what the stealing step uses.
	fmov	x9,d8
	fmov	x10,v8.d[1]
	mov	w19,#0x87
	extr	x22,x10,x10,#32
	extr	x10,x10,x9,#63
	and	w11,w19,w22,asr #31
	eor	x9,x11,x9,lsl #1
	fmov	d6,x9
	fmov	v6.d[1],x10
	b	.Lxts_enc_done

.Lxts_enc_one:
	eor	v5.16b,v5.16b,v24.16b
	orr	v6.16b,v6.16b,v6.16b
	st1	{v5.16b},[x1],#16
	fmov	x9,d6
	fmov	x10,v6.d[1]
	mov	w19,#0x87
	extr	x22,x10,x10,#32
	extr	x10,x10,x9,#63
	and	w11,w19,w22,asr #31
	eor	x9,x11,x9,lsl #1
	fmov	d6,x9
	fmov	v6.d[1],x10
	b	.Lxts_enc_done
.align	5
.Lxts_enc_done:
	// Process the tail block with cipher stealing.
	tst	x21,#0xf
	b.eq	.Lxts_abort		// length was a multiple of 16: done

	mov	x20,x0			// x20 = input tail bytes
	mov	x13,x1			// x13 = where the short final block is written
	sub	x1,x1,#16		// x1  = last whole block already written
	// For i = tail-1 .. 0: move byte i of the last whole output block
	// to the short final block and replace it with input tail byte i.
	// NOTE(review): this label lacks the ".L" prefix used by every
	// other local label here; left as is because references outside
	// this view cannot be ruled out.
.composite_enc_loop:
	subs	x21,x21,#1
	ldrb	w15,[x1,x21]
	ldrb	w14,[x20,x21]
	strb	w15,[x13,x21]
	strb	w14,[x1,x21]
	b.gt	.composite_enc_loop
.Lxts_enc_load_done:
	ld1	{v26.16b},[x1]
	eor	v26.16b,v26.16b,v6.16b

	// Encrypt the composite block to get the second-to-last ciphertext block
	ldr	w6,[x3,#240]		// load key schedule...
	ld1	{v0.4s},[x3],#16
	sub	w6,w6,#2
	ld1	{v1.4s},[x3],#16		// load key schedule...
.Loop_final_enc:
	aese	v26.16b,v0.16b
	aesmc	v26.16b,v26.16b
	ld1	{v0.4s},[x3],#16
	subs	w6,w6,#2
	aese	v26.16b,v1.16b
	aesmc	v26.16b,v26.16b
	ld1	{v1.4s},[x3],#16
	b.gt	.Loop_final_enc

	aese	v26.16b,v0.16b
	aesmc	v26.16b,v26.16b
	ld1	{v0.4s},[x3]
	aese	v26.16b,v1.16b
	eor	v26.16b,v26.16b,v0.16b
	eor	v26.16b,v26.16b,v6.16b
	st1	{v26.16b},[x1]

	// Restore the registers saved at .Lxts_enc_big_size and pop the frame.
.Lxts_abort:
	ldp	x21,x22,[sp,#48]
	ldp	d8,d9,[sp,#32]
	ldp	d10,d11,[sp,#16]
	ldp	x19,x20,[sp],#64
.Lxts_enc_final_abort:
	ret
.size	aes_v8_xts_encrypt,.-aes_v8_xts_encrypt
.globl	aes_v8_xts_decrypt
.type	aes_v8_xts_decrypt,%function
.align	5
aes_v8_xts_decrypt:
	AARCH64_VALID_CALL_TARGET
	cmp	x2,#16
	// Original input data size bigger than 16, jump to big size processing.
	b.ne	.Lxts_dec_big_size
	// Encrypt the iv with key2, as the first XEX iv.
2514 ldr w6,[x4,#240] 2515 ld1 {v0.4s},[x4],#16 2516 ld1 {v6.16b},[x5] 2517 sub w6,w6,#2 2518 ld1 {v1.4s},[x4],#16 2519 2520.Loop_dec_small_iv_enc: 2521 aese v6.16b,v0.16b 2522 aesmc v6.16b,v6.16b 2523 ld1 {v0.4s},[x4],#16 2524 subs w6,w6,#2 2525 aese v6.16b,v1.16b 2526 aesmc v6.16b,v6.16b 2527 ld1 {v1.4s},[x4],#16 2528 b.gt .Loop_dec_small_iv_enc 2529 2530 aese v6.16b,v0.16b 2531 aesmc v6.16b,v6.16b 2532 ld1 {v0.4s},[x4] 2533 aese v6.16b,v1.16b 2534 eor v6.16b,v6.16b,v0.16b 2535 2536 ld1 {v0.16b},[x0] 2537 eor v0.16b,v6.16b,v0.16b 2538 2539 ldr w6,[x3,#240] 2540 ld1 {v28.4s,v29.4s},[x3],#32 // load key schedule... 2541 2542 aesd v0.16b,v28.16b 2543 aesimc v0.16b,v0.16b 2544 ld1 {v16.4s,v17.4s},[x3],#32 // load key schedule... 2545 aesd v0.16b,v29.16b 2546 aesimc v0.16b,v0.16b 2547 subs w6,w6,#10 // bias 2548 b.eq .Lxts_128_dec 2549.Lxts_dec_round_loop: 2550 aesd v0.16b,v16.16b 2551 aesimc v0.16b,v0.16b 2552 ld1 {v16.4s},[x3],#16 // load key schedule... 2553 aesd v0.16b,v17.16b 2554 aesimc v0.16b,v0.16b 2555 ld1 {v17.4s},[x3],#16 // load key schedule... 2556 subs w6,w6,#2 // bias 2557 b.gt .Lxts_dec_round_loop 2558.Lxts_128_dec: 2559 ld1 {v18.4s,v19.4s},[x3],#32 // load key schedule... 2560 aesd v0.16b,v16.16b 2561 aesimc v0.16b,v0.16b 2562 aesd v0.16b,v17.16b 2563 aesimc v0.16b,v0.16b 2564 ld1 {v20.4s,v21.4s},[x3],#32 // load key schedule... 2565 aesd v0.16b,v18.16b 2566 aesimc v0.16b,v0.16b 2567 aesd v0.16b,v19.16b 2568 aesimc v0.16b,v0.16b 2569 ld1 {v22.4s,v23.4s},[x3],#32 // load key schedule... 2570 aesd v0.16b,v20.16b 2571 aesimc v0.16b,v0.16b 2572 aesd v0.16b,v21.16b 2573 aesimc v0.16b,v0.16b 2574 ld1 {v7.4s},[x3] 2575 aesd v0.16b,v22.16b 2576 aesimc v0.16b,v0.16b 2577 aesd v0.16b,v23.16b 2578 eor v0.16b,v0.16b,v7.16b 2579 eor v0.16b,v6.16b,v0.16b 2580 st1 {v0.16b},[x1] 2581 b .Lxts_dec_final_abort 2582.Lxts_dec_big_size: 2583 stp x19,x20,[sp,#-64]! 
2584 stp x21,x22,[sp,#48] 2585 stp d8,d9,[sp,#32] 2586 stp d10,d11,[sp,#16] 2587 2588 and x21,x2,#0xf 2589 and x2,x2,#-16 2590 subs x2,x2,#16 2591 mov x8,#16 2592 b.lo .Lxts_dec_abort 2593 2594 // Encrypt the iv with key2, as the first XEX iv 2595 ldr w6,[x4,#240] 2596 ld1 {v0.4s},[x4],#16 2597 ld1 {v6.16b},[x5] 2598 sub w6,w6,#2 2599 ld1 {v1.4s},[x4],#16 2600 2601.Loop_dec_iv_enc: 2602 aese v6.16b,v0.16b 2603 aesmc v6.16b,v6.16b 2604 ld1 {v0.4s},[x4],#16 2605 subs w6,w6,#2 2606 aese v6.16b,v1.16b 2607 aesmc v6.16b,v6.16b 2608 ld1 {v1.4s},[x4],#16 2609 b.gt .Loop_dec_iv_enc 2610 2611 aese v6.16b,v0.16b 2612 aesmc v6.16b,v6.16b 2613 ld1 {v0.4s},[x4] 2614 aese v6.16b,v1.16b 2615 eor v6.16b,v6.16b,v0.16b 2616 2617 // The iv for second block 2618 // x9- iv(low), x10 - iv(high) 2619 // the five ivs stored into, v6.16b,v8.16b,v9.16b,v10.16b,v11.16b 2620 fmov x9,d6 2621 fmov x10,v6.d[1] 2622 mov w19,#0x87 2623 extr x22,x10,x10,#32 2624 extr x10,x10,x9,#63 2625 and w11,w19,w22,asr #31 2626 eor x9,x11,x9,lsl #1 2627 fmov d8,x9 2628 fmov v8.d[1],x10 2629 2630 ldr w5,[x3,#240] // load rounds number 2631 2632 // The iv for third block 2633 extr x22,x10,x10,#32 2634 extr x10,x10,x9,#63 2635 and w11,w19,w22,asr #31 2636 eor x9,x11,x9,lsl #1 2637 fmov d9,x9 2638 fmov v9.d[1],x10 2639 2640 ld1 {v16.4s,v17.4s},[x3] // load key schedule... 2641 sub w5,w5,#6 2642 add x7,x3,x5,lsl#4 // pointer to last 7 round keys 2643 sub w5,w5,#2 2644 ld1 {v18.4s,v19.4s},[x7],#32 // load key schedule... 
2645 ld1 {v20.4s,v21.4s},[x7],#32 2646 ld1 {v22.4s,v23.4s},[x7],#32 2647 ld1 {v7.4s},[x7] 2648 2649 // The iv for fourth block 2650 extr x22,x10,x10,#32 2651 extr x10,x10,x9,#63 2652 and w11,w19,w22,asr #31 2653 eor x9,x11,x9,lsl #1 2654 fmov d10,x9 2655 fmov v10.d[1],x10 2656 2657 add x7,x3,#32 2658 mov w6,w5 2659 b .Lxts_dec 2660 2661 // Decryption 2662.align 5 2663.Lxts_dec: 2664 tst x21,#0xf 2665 b.eq .Lxts_dec_begin 2666 subs x2,x2,#16 2667 csel x8,xzr,x8,eq 2668 ld1 {v0.16b},[x0],#16 2669 b.lo .Lxts_done 2670 sub x0,x0,#16 2671.Lxts_dec_begin: 2672 ld1 {v0.16b},[x0],x8 2673 subs x2,x2,#32 // bias 2674 add w6,w5,#2 2675 orr v3.16b,v0.16b,v0.16b 2676 orr v1.16b,v0.16b,v0.16b 2677 orr v28.16b,v0.16b,v0.16b 2678 ld1 {v24.16b},[x0],#16 2679 orr v27.16b,v24.16b,v24.16b 2680 orr v29.16b,v24.16b,v24.16b 2681 b.lo .Lxts_inner_dec_tail 2682 eor v0.16b,v0.16b,v6.16b // before decryt, xor with iv 2683 eor v24.16b,v24.16b,v8.16b 2684 2685 orr v1.16b,v24.16b,v24.16b 2686 ld1 {v24.16b},[x0],#16 2687 orr v2.16b,v0.16b,v0.16b 2688 orr v3.16b,v1.16b,v1.16b 2689 eor v27.16b,v24.16b,v9.16b // third block xox with third iv 2690 eor v24.16b,v24.16b,v9.16b 2691 cmp x2,#32 2692 b.lo .Lxts_outer_dec_tail 2693 2694 ld1 {v25.16b},[x0],#16 2695 2696 // The iv for fifth block 2697 extr x22,x10,x10,#32 2698 extr x10,x10,x9,#63 2699 and w11,w19,w22,asr #31 2700 eor x9,x11,x9,lsl #1 2701 fmov d11,x9 2702 fmov v11.d[1],x10 2703 2704 ld1 {v26.16b},[x0],#16 2705 eor v25.16b,v25.16b,v10.16b // the fourth block 2706 eor v26.16b,v26.16b,v11.16b 2707 sub x2,x2,#32 // bias 2708 mov w6,w5 2709 b .Loop5x_xts_dec 2710 2711.align 4 2712.Loop5x_xts_dec: 2713 aesd v0.16b,v16.16b 2714 aesimc v0.16b,v0.16b 2715 aesd v1.16b,v16.16b 2716 aesimc v1.16b,v1.16b 2717 aesd v24.16b,v16.16b 2718 aesimc v24.16b,v24.16b 2719 aesd v25.16b,v16.16b 2720 aesimc v25.16b,v25.16b 2721 aesd v26.16b,v16.16b 2722 aesimc v26.16b,v26.16b 2723 ld1 {v16.4s},[x7],#16 // load key schedule... 
2724 subs w6,w6,#2 2725 aesd v0.16b,v17.16b 2726 aesimc v0.16b,v0.16b 2727 aesd v1.16b,v17.16b 2728 aesimc v1.16b,v1.16b 2729 aesd v24.16b,v17.16b 2730 aesimc v24.16b,v24.16b 2731 aesd v25.16b,v17.16b 2732 aesimc v25.16b,v25.16b 2733 aesd v26.16b,v17.16b 2734 aesimc v26.16b,v26.16b 2735 ld1 {v17.4s},[x7],#16 // load key schedule... 2736 b.gt .Loop5x_xts_dec 2737 2738 aesd v0.16b,v16.16b 2739 aesimc v0.16b,v0.16b 2740 aesd v1.16b,v16.16b 2741 aesimc v1.16b,v1.16b 2742 aesd v24.16b,v16.16b 2743 aesimc v24.16b,v24.16b 2744 aesd v25.16b,v16.16b 2745 aesimc v25.16b,v25.16b 2746 aesd v26.16b,v16.16b 2747 aesimc v26.16b,v26.16b 2748 subs x2,x2,#0x50 // because .Lxts_dec_tail4x 2749 2750 aesd v0.16b,v17.16b 2751 aesimc v0.16b,v0.16b 2752 aesd v1.16b,v17.16b 2753 aesimc v1.16b,v1.16b 2754 aesd v24.16b,v17.16b 2755 aesimc v24.16b,v24.16b 2756 aesd v25.16b,v17.16b 2757 aesimc v25.16b,v25.16b 2758 aesd v26.16b,v17.16b 2759 aesimc v26.16b,v26.16b 2760 csel x6,xzr,x2,gt // borrow x6, w6, "gt" is not typo 2761 mov x7,x3 2762 2763 aesd v0.16b,v18.16b 2764 aesimc v0.16b,v0.16b 2765 aesd v1.16b,v18.16b 2766 aesimc v1.16b,v1.16b 2767 aesd v24.16b,v18.16b 2768 aesimc v24.16b,v24.16b 2769 aesd v25.16b,v18.16b 2770 aesimc v25.16b,v25.16b 2771 aesd v26.16b,v18.16b 2772 aesimc v26.16b,v26.16b 2773 add x0,x0,x6 // x0 is adjusted in such way that 2774 // at exit from the loop v1.16b-v26.16b 2775 // are loaded with last "words" 2776 add x6,x2,#0x60 // because .Lxts_dec_tail4x 2777 2778 aesd v0.16b,v19.16b 2779 aesimc v0.16b,v0.16b 2780 aesd v1.16b,v19.16b 2781 aesimc v1.16b,v1.16b 2782 aesd v24.16b,v19.16b 2783 aesimc v24.16b,v24.16b 2784 aesd v25.16b,v19.16b 2785 aesimc v25.16b,v25.16b 2786 aesd v26.16b,v19.16b 2787 aesimc v26.16b,v26.16b 2788 2789 aesd v0.16b,v20.16b 2790 aesimc v0.16b,v0.16b 2791 aesd v1.16b,v20.16b 2792 aesimc v1.16b,v1.16b 2793 aesd v24.16b,v20.16b 2794 aesimc v24.16b,v24.16b 2795 aesd v25.16b,v20.16b 2796 aesimc v25.16b,v25.16b 2797 aesd v26.16b,v20.16b 2798 aesimc 
v26.16b,v26.16b 2799 2800 aesd v0.16b,v21.16b 2801 aesimc v0.16b,v0.16b 2802 aesd v1.16b,v21.16b 2803 aesimc v1.16b,v1.16b 2804 aesd v24.16b,v21.16b 2805 aesimc v24.16b,v24.16b 2806 aesd v25.16b,v21.16b 2807 aesimc v25.16b,v25.16b 2808 aesd v26.16b,v21.16b 2809 aesimc v26.16b,v26.16b 2810 2811 aesd v0.16b,v22.16b 2812 aesimc v0.16b,v0.16b 2813 aesd v1.16b,v22.16b 2814 aesimc v1.16b,v1.16b 2815 aesd v24.16b,v22.16b 2816 aesimc v24.16b,v24.16b 2817 aesd v25.16b,v22.16b 2818 aesimc v25.16b,v25.16b 2819 aesd v26.16b,v22.16b 2820 aesimc v26.16b,v26.16b 2821 2822 eor v4.16b,v7.16b,v6.16b 2823 aesd v0.16b,v23.16b 2824 // The iv for first block of next iteration. 2825 extr x22,x10,x10,#32 2826 extr x10,x10,x9,#63 2827 and w11,w19,w22,asr #31 2828 eor x9,x11,x9,lsl #1 2829 fmov d6,x9 2830 fmov v6.d[1],x10 2831 eor v5.16b,v7.16b,v8.16b 2832 ld1 {v2.16b},[x0],#16 2833 aesd v1.16b,v23.16b 2834 // The iv for second block 2835 extr x22,x10,x10,#32 2836 extr x10,x10,x9,#63 2837 and w11,w19,w22,asr #31 2838 eor x9,x11,x9,lsl #1 2839 fmov d8,x9 2840 fmov v8.d[1],x10 2841 eor v17.16b,v7.16b,v9.16b 2842 ld1 {v3.16b},[x0],#16 2843 aesd v24.16b,v23.16b 2844 // The iv for third block 2845 extr x22,x10,x10,#32 2846 extr x10,x10,x9,#63 2847 and w11,w19,w22,asr #31 2848 eor x9,x11,x9,lsl #1 2849 fmov d9,x9 2850 fmov v9.d[1],x10 2851 eor v30.16b,v7.16b,v10.16b 2852 ld1 {v27.16b},[x0],#16 2853 aesd v25.16b,v23.16b 2854 // The iv for fourth block 2855 extr x22,x10,x10,#32 2856 extr x10,x10,x9,#63 2857 and w11,w19,w22,asr #31 2858 eor x9,x11,x9,lsl #1 2859 fmov d10,x9 2860 fmov v10.d[1],x10 2861 eor v31.16b,v7.16b,v11.16b 2862 ld1 {v28.16b},[x0],#16 2863 aesd v26.16b,v23.16b 2864 2865 // The iv for fifth block 2866 extr x22,x10,x10,#32 2867 extr x10,x10,x9,#63 2868 and w11,w19,w22,asr #31 2869 eor x9,x11,x9,lsl #1 2870 fmov d11,x9 2871 fmov v11.d[1],x10 2872 2873 ld1 {v29.16b},[x0],#16 2874 cbz x6,.Lxts_dec_tail4x 2875 ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] 2876 eor 
v4.16b,v4.16b,v0.16b 2877 eor v0.16b,v2.16b,v6.16b 2878 eor v5.16b,v5.16b,v1.16b 2879 eor v1.16b,v3.16b,v8.16b 2880 eor v17.16b,v17.16b,v24.16b 2881 eor v24.16b,v27.16b,v9.16b 2882 eor v30.16b,v30.16b,v25.16b 2883 eor v25.16b,v28.16b,v10.16b 2884 eor v31.16b,v31.16b,v26.16b 2885 st1 {v4.16b},[x1],#16 2886 eor v26.16b,v29.16b,v11.16b 2887 st1 {v5.16b},[x1],#16 2888 mov w6,w5 2889 st1 {v17.16b},[x1],#16 2890 ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] 2891 st1 {v30.16b},[x1],#16 2892 st1 {v31.16b},[x1],#16 2893 b.hs .Loop5x_xts_dec 2894 2895 cmn x2,#0x10 2896 b.ne .Loop5x_dec_after 2897 // If x2(x2) equal to -0x10, the left blocks is 4. 2898 // After specially processing, utilize the five blocks processing again. 2899 // It will use the following IVs: v6.16b,v6.16b,v8.16b,v9.16b,v10.16b. 2900 orr v11.16b,v10.16b,v10.16b 2901 orr v10.16b,v9.16b,v9.16b 2902 orr v9.16b,v8.16b,v8.16b 2903 orr v8.16b,v6.16b,v6.16b 2904 fmov x9,d11 2905 fmov x10,v11.d[1] 2906 eor v0.16b,v6.16b,v2.16b 2907 eor v1.16b,v8.16b,v3.16b 2908 eor v24.16b,v27.16b,v9.16b 2909 eor v25.16b,v28.16b,v10.16b 2910 eor v26.16b,v29.16b,v11.16b 2911 b.eq .Loop5x_xts_dec 2912 2913.Loop5x_dec_after: 2914 add x2,x2,#0x50 2915 cbz x2,.Lxts_done 2916 2917 add w6,w5,#2 2918 subs x2,x2,#0x30 2919 b.lo .Lxts_inner_dec_tail 2920 2921 eor v0.16b,v6.16b,v27.16b 2922 eor v1.16b,v8.16b,v28.16b 2923 eor v24.16b,v29.16b,v9.16b 2924 b .Lxts_outer_dec_tail 2925 2926.align 4 2927.Lxts_dec_tail4x: 2928 add x0,x0,#16 2929 tst x21,#0xf 2930 eor v5.16b,v1.16b,v4.16b 2931 st1 {v5.16b},[x1],#16 2932 eor v17.16b,v24.16b,v17.16b 2933 st1 {v17.16b},[x1],#16 2934 eor v30.16b,v25.16b,v30.16b 2935 eor v31.16b,v26.16b,v31.16b 2936 st1 {v30.16b,v31.16b},[x1],#32 2937 2938 b.eq .Lxts_dec_abort 2939 ld1 {v0.16b},[x0],#16 2940 b .Lxts_done 2941.align 4 2942.Lxts_outer_dec_tail: 2943 aesd v0.16b,v16.16b 2944 aesimc v0.16b,v0.16b 2945 aesd v1.16b,v16.16b 2946 aesimc v1.16b,v1.16b 2947 aesd v24.16b,v16.16b 2948 aesimc v24.16b,v24.16b 2949 ld1 
{v16.4s},[x7],#16 2950 subs w6,w6,#2 2951 aesd v0.16b,v17.16b 2952 aesimc v0.16b,v0.16b 2953 aesd v1.16b,v17.16b 2954 aesimc v1.16b,v1.16b 2955 aesd v24.16b,v17.16b 2956 aesimc v24.16b,v24.16b 2957 ld1 {v17.4s},[x7],#16 2958 b.gt .Lxts_outer_dec_tail 2959 2960 aesd v0.16b,v16.16b 2961 aesimc v0.16b,v0.16b 2962 aesd v1.16b,v16.16b 2963 aesimc v1.16b,v1.16b 2964 aesd v24.16b,v16.16b 2965 aesimc v24.16b,v24.16b 2966 eor v4.16b,v6.16b,v7.16b 2967 subs x2,x2,#0x30 2968 // The iv for first block 2969 fmov x9,d9 2970 fmov x10,v9.d[1] 2971 mov w19,#0x87 2972 extr x22,x10,x10,#32 2973 extr x10,x10,x9,#63 2974 and w11,w19,w22,asr #31 2975 eor x9,x11,x9,lsl #1 2976 fmov d6,x9 2977 fmov v6.d[1],x10 2978 eor v5.16b,v8.16b,v7.16b 2979 csel x6,x2,x6,lo // x6, w6, is zero at this point 2980 aesd v0.16b,v17.16b 2981 aesimc v0.16b,v0.16b 2982 aesd v1.16b,v17.16b 2983 aesimc v1.16b,v1.16b 2984 aesd v24.16b,v17.16b 2985 aesimc v24.16b,v24.16b 2986 eor v17.16b,v9.16b,v7.16b 2987 // The iv for second block 2988 extr x22,x10,x10,#32 2989 extr x10,x10,x9,#63 2990 and w11,w19,w22,asr #31 2991 eor x9,x11,x9,lsl #1 2992 fmov d8,x9 2993 fmov v8.d[1],x10 2994 2995 add x6,x6,#0x20 2996 add x0,x0,x6 // x0 is adjusted to the last data 2997 2998 mov x7,x3 2999 3000 // The iv for third block 3001 extr x22,x10,x10,#32 3002 extr x10,x10,x9,#63 3003 and w11,w19,w22,asr #31 3004 eor x9,x11,x9,lsl #1 3005 fmov d9,x9 3006 fmov v9.d[1],x10 3007 3008 aesd v0.16b,v20.16b 3009 aesimc v0.16b,v0.16b 3010 aesd v1.16b,v20.16b 3011 aesimc v1.16b,v1.16b 3012 aesd v24.16b,v20.16b 3013 aesimc v24.16b,v24.16b 3014 aesd v0.16b,v21.16b 3015 aesimc v0.16b,v0.16b 3016 aesd v1.16b,v21.16b 3017 aesimc v1.16b,v1.16b 3018 aesd v24.16b,v21.16b 3019 aesimc v24.16b,v24.16b 3020 aesd v0.16b,v22.16b 3021 aesimc v0.16b,v0.16b 3022 aesd v1.16b,v22.16b 3023 aesimc v1.16b,v1.16b 3024 aesd v24.16b,v22.16b 3025 aesimc v24.16b,v24.16b 3026 ld1 {v27.16b},[x0],#16 3027 aesd v0.16b,v23.16b 3028 aesd v1.16b,v23.16b 3029 aesd v24.16b,v23.16b 
3030 ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0] 3031 add w6,w5,#2 3032 eor v4.16b,v4.16b,v0.16b 3033 eor v5.16b,v5.16b,v1.16b 3034 eor v24.16b,v24.16b,v17.16b 3035 ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1] 3036 st1 {v4.16b},[x1],#16 3037 st1 {v5.16b},[x1],#16 3038 st1 {v24.16b},[x1],#16 3039 3040 cmn x2,#0x30 3041 add x2,x2,#0x30 3042 b.eq .Lxts_done 3043 sub x2,x2,#0x30 3044 orr v28.16b,v3.16b,v3.16b 3045 orr v29.16b,v27.16b,v27.16b 3046 nop 3047 3048.Lxts_inner_dec_tail: 3049 // x2 == -0x10 means two blocks left. 3050 cmn x2,#0x10 3051 eor v1.16b,v28.16b,v6.16b 3052 eor v24.16b,v29.16b,v8.16b 3053 b.eq .Lxts_dec_tail_loop 3054 eor v24.16b,v29.16b,v6.16b 3055.Lxts_dec_tail_loop: 3056 aesd v1.16b,v16.16b 3057 aesimc v1.16b,v1.16b 3058 aesd v24.16b,v16.16b 3059 aesimc v24.16b,v24.16b 3060 ld1 {v16.4s},[x7],#16 3061 subs w6,w6,#2 3062 aesd v1.16b,v17.16b 3063 aesimc v1.16b,v1.16b 3064 aesd v24.16b,v17.16b 3065 aesimc v24.16b,v24.16b 3066 ld1 {v17.4s},[x7],#16 3067 b.gt .Lxts_dec_tail_loop 3068 3069 aesd v1.16b,v16.16b 3070 aesimc v1.16b,v1.16b 3071 aesd v24.16b,v16.16b 3072 aesimc v24.16b,v24.16b 3073 aesd v1.16b,v17.16b 3074 aesimc v1.16b,v1.16b 3075 aesd v24.16b,v17.16b 3076 aesimc v24.16b,v24.16b 3077 aesd v1.16b,v20.16b 3078 aesimc v1.16b,v1.16b 3079 aesd v24.16b,v20.16b 3080 aesimc v24.16b,v24.16b 3081 cmn x2,#0x20 3082 aesd v1.16b,v21.16b 3083 aesimc v1.16b,v1.16b 3084 aesd v24.16b,v21.16b 3085 aesimc v24.16b,v24.16b 3086 eor v5.16b,v6.16b,v7.16b 3087 aesd v1.16b,v22.16b 3088 aesimc v1.16b,v1.16b 3089 aesd v24.16b,v22.16b 3090 aesimc v24.16b,v24.16b 3091 eor v17.16b,v8.16b,v7.16b 3092 aesd v1.16b,v23.16b 3093 aesd v24.16b,v23.16b 3094 b.eq .Lxts_dec_one 3095 eor v5.16b,v5.16b,v1.16b 3096 eor v17.16b,v17.16b,v24.16b 3097 orr v6.16b,v9.16b,v9.16b 3098 orr v8.16b,v10.16b,v10.16b 3099 st1 {v5.16b},[x1],#16 3100 st1 {v17.16b},[x1],#16 3101 add x2,x2,#16 3102 b .Lxts_done 3103 3104.Lxts_dec_one: 3105 eor v5.16b,v5.16b,v24.16b 3106 orr v6.16b,v8.16b,v8.16b 
3107 orr v8.16b,v9.16b,v9.16b 3108 st1 {v5.16b},[x1],#16 3109 add x2,x2,#32 3110 3111.Lxts_done: 3112 tst x21,#0xf 3113 b.eq .Lxts_dec_abort 3114 // Processing the last two blocks with cipher stealing. 3115 mov x7,x3 3116 cbnz x2,.Lxts_dec_1st_done 3117 ld1 {v0.16b},[x0],#16 3118 3119 // Decrypt the last secod block to get the last plain text block 3120.Lxts_dec_1st_done: 3121 eor v26.16b,v0.16b,v8.16b 3122 ldr w6,[x3,#240] 3123 ld1 {v0.4s},[x3],#16 3124 sub w6,w6,#2 3125 ld1 {v1.4s},[x3],#16 3126.Loop_final_2nd_dec: 3127 aesd v26.16b,v0.16b 3128 aesimc v26.16b,v26.16b 3129 ld1 {v0.4s},[x3],#16 // load key schedule... 3130 subs w6,w6,#2 3131 aesd v26.16b,v1.16b 3132 aesimc v26.16b,v26.16b 3133 ld1 {v1.4s},[x3],#16 // load key schedule... 3134 b.gt .Loop_final_2nd_dec 3135 3136 aesd v26.16b,v0.16b 3137 aesimc v26.16b,v26.16b 3138 ld1 {v0.4s},[x3] 3139 aesd v26.16b,v1.16b 3140 eor v26.16b,v26.16b,v0.16b 3141 eor v26.16b,v26.16b,v8.16b 3142 st1 {v26.16b},[x1] 3143 3144 mov x20,x0 3145 add x13,x1,#16 3146 3147 // Composite the tailcnt "16 byte not aligned block" into the last second plain blocks 3148 // to get the last encrypted block. 3149.composite_dec_loop: 3150 subs x21,x21,#1 3151 ldrb w15,[x1,x21] 3152 ldrb w14,[x20,x21] 3153 strb w15,[x13,x21] 3154 strb w14,[x1,x21] 3155 b.gt .composite_dec_loop 3156.Lxts_dec_load_done: 3157 ld1 {v26.16b},[x1] 3158 eor v26.16b,v26.16b,v6.16b 3159 3160 // Decrypt the composite block to get the last second plain text block 3161 ldr w6,[x7,#240] 3162 ld1 {v0.4s},[x7],#16 3163 sub w6,w6,#2 3164 ld1 {v1.4s},[x7],#16 3165.Loop_final_dec: 3166 aesd v26.16b,v0.16b 3167 aesimc v26.16b,v26.16b 3168 ld1 {v0.4s},[x7],#16 // load key schedule... 3169 subs w6,w6,#2 3170 aesd v26.16b,v1.16b 3171 aesimc v26.16b,v26.16b 3172 ld1 {v1.4s},[x7],#16 // load key schedule... 
3173 b.gt .Loop_final_dec 3174 3175 aesd v26.16b,v0.16b 3176 aesimc v26.16b,v26.16b 3177 ld1 {v0.4s},[x7] 3178 aesd v26.16b,v1.16b 3179 eor v26.16b,v26.16b,v0.16b 3180 eor v26.16b,v26.16b,v6.16b 3181 st1 {v26.16b},[x1] 3182 3183.Lxts_dec_abort: 3184 ldp x21,x22,[sp,#48] 3185 ldp d8,d9,[sp,#32] 3186 ldp d10,d11,[sp,#16] 3187 ldp x19,x20,[sp],#64 3188 3189.Lxts_dec_final_abort: 3190 ret 3191.size aes_v8_xts_decrypt,.-aes_v8_xts_decrypt 3192#endif 3193