/* Do not modify. This file is auto-generated from aesv8-armx.pl. */
#include "arm_arch.h"

#if __ARM_MAX_ARCH__>=7
.arch	armv8-a+crypto
.text
.section .rodata
.align	5
.Lrcon:
.long	0x01,0x01,0x01,0x01
.long	0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d	// rotate-n-splat
.long	0x1b,0x1b,0x1b,0x1b
.previous
.globl	aes_v8_set_encrypt_key
.type	aes_v8_set_encrypt_key,%function
.align	5
aes_v8_set_encrypt_key:
.Lenc_key:
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	stp x29,x30,[sp,#-16]!
	add x29,sp,#0
	mov x3,#-1
	cmp x0,#0
	b.eq .Lenc_key_abort
	cmp x2,#0
	b.eq .Lenc_key_abort
	mov x3,#-2
	cmp w1,#128
	b.lt .Lenc_key_abort
	cmp w1,#256
	b.gt .Lenc_key_abort
	tst w1,#0x3f
	b.ne .Lenc_key_abort

	adrp x3,.Lrcon
	add x3,x3,#:lo12:.Lrcon
	cmp w1,#192

	eor v0.16b,v0.16b,v0.16b
	ld1 {v3.16b},[x0],#16
	mov w1,#8 // reuse w1
	ld1 {v1.4s,v2.4s},[x3],#32

	b.lt .Loop128
	b.eq .L192
	b .L256

.align	4
.Loop128:
	tbl v6.16b,{v3.16b},v2.16b
	ext v5.16b,v0.16b,v3.16b,#12
	st1 {v3.4s},[x2],#16
	aese v6.16b,v0.16b
	subs w1,w1,#1

	eor v3.16b,v3.16b,v5.16b
	ext v5.16b,v0.16b,v5.16b,#12
	eor v3.16b,v3.16b,v5.16b
	ext v5.16b,v0.16b,v5.16b,#12
	eor v6.16b,v6.16b,v1.16b
	eor v3.16b,v3.16b,v5.16b
	shl v1.16b,v1.16b,#1
	eor v3.16b,v3.16b,v6.16b
	b.ne .Loop128

	ld1 {v1.4s},[x3]

	tbl v6.16b,{v3.16b},v2.16b
	ext v5.16b,v0.16b,v3.16b,#12
	st1 {v3.4s},[x2],#16
	aese v6.16b,v0.16b

	eor v3.16b,v3.16b,v5.16b
	ext v5.16b,v0.16b,v5.16b,#12
	eor v3.16b,v3.16b,v5.16b
	ext v5.16b,v0.16b,v5.16b,#12
	eor v6.16b,v6.16b,v1.16b
	eor v3.16b,v3.16b,v5.16b
	shl v1.16b,v1.16b,#1
	eor v3.16b,v3.16b,v6.16b

	tbl v6.16b,{v3.16b},v2.16b
	ext v5.16b,v0.16b,v3.16b,#12
	st1 {v3.4s},[x2],#16
	aese v6.16b,v0.16b

	eor v3.16b,v3.16b,v5.16b
	ext v5.16b,v0.16b,v5.16b,#12
	eor v3.16b,v3.16b,v5.16b
	ext v5.16b,v0.16b,v5.16b,#12
	eor v6.16b,v6.16b,v1.16b
	eor v3.16b,v3.16b,v5.16b
	eor v3.16b,v3.16b,v6.16b
	st1 {v3.4s},[x2]
	add x2,x2,#0x50

	mov w12,#10
	b .Ldone

.align	4
.L192:
	ld1 {v4.8b},[x0],#8
	movi v6.16b,#8 // borrow v6.16b
	st1 {v3.4s},[x2],#16
	sub v2.16b,v2.16b,v6.16b // adjust the mask

.Loop192:
	tbl v6.16b,{v4.16b},v2.16b
	ext v5.16b,v0.16b,v3.16b,#12
#ifdef __AARCH64EB__
	st1 {v4.4s},[x2],#16
	sub x2,x2,#8
#else
	st1 {v4.8b},[x2],#8
#endif
	aese v6.16b,v0.16b
	subs w1,w1,#1

	eor v3.16b,v3.16b,v5.16b
	ext v5.16b,v0.16b,v5.16b,#12
	eor v3.16b,v3.16b,v5.16b
	ext v5.16b,v0.16b,v5.16b,#12
	eor v3.16b,v3.16b,v5.16b

	dup v5.4s,v3.s[3]
	eor v5.16b,v5.16b,v4.16b
	eor v6.16b,v6.16b,v1.16b
	ext v4.16b,v0.16b,v4.16b,#12
	shl v1.16b,v1.16b,#1
	eor v4.16b,v4.16b,v5.16b
	eor v3.16b,v3.16b,v6.16b
	eor v4.16b,v4.16b,v6.16b
	st1 {v3.4s},[x2],#16
	b.ne .Loop192

	mov w12,#12
	add x2,x2,#0x20
	b .Ldone

.align	4
.L256:
	ld1 {v4.16b},[x0]
	mov w1,#7
	mov w12,#14
	st1 {v3.4s},[x2],#16

.Loop256:
	tbl v6.16b,{v4.16b},v2.16b
	ext v5.16b,v0.16b,v3.16b,#12
	st1 {v4.4s},[x2],#16
	aese v6.16b,v0.16b
	subs w1,w1,#1

	eor v3.16b,v3.16b,v5.16b
	ext v5.16b,v0.16b,v5.16b,#12
	eor v3.16b,v3.16b,v5.16b
	ext v5.16b,v0.16b,v5.16b,#12
	eor v6.16b,v6.16b,v1.16b
	eor v3.16b,v3.16b,v5.16b
	shl v1.16b,v1.16b,#1
	eor v3.16b,v3.16b,v6.16b
	st1 {v3.4s},[x2],#16
	b.eq .Ldone

	dup v6.4s,v3.s[3] // just splat
	ext v5.16b,v0.16b,v4.16b,#12
	aese v6.16b,v0.16b

	eor v4.16b,v4.16b,v5.16b
	ext v5.16b,v0.16b,v5.16b,#12
	eor v4.16b,v4.16b,v5.16b
	ext v5.16b,v0.16b,v5.16b,#12
	eor v4.16b,v4.16b,v5.16b

	eor v4.16b,v4.16b,v6.16b
	b .Loop256

.Ldone:
	str w12,[x2]
	mov x3,#0

.Lenc_key_abort:
	mov x0,x3 // return value
	ldr x29,[sp],#16
	ret
.size aes_v8_set_encrypt_key,.-aes_v8_set_encrypt_key

.globl	aes_v8_set_decrypt_key
.type	aes_v8_set_decrypt_key,%function
.align	5
aes_v8_set_decrypt_key:
	AARCH64_SIGN_LINK_REGISTER
	stp x29,x30,[sp,#-16]!
	add x29,sp,#0
	bl .Lenc_key

	cmp x0,#0
	b.ne .Ldec_key_abort

	sub x2,x2,#240 // restore original x2
	mov x4,#-16
	add x0,x2,x12,lsl#4 // end of key schedule

	ld1 {v0.4s},[x2]
	ld1 {v1.4s},[x0]
	st1 {v0.4s},[x0],x4
	st1 {v1.4s},[x2],#16

.Loop_imc:
	ld1 {v0.4s},[x2]
	ld1 {v1.4s},[x0]
	aesimc v0.16b,v0.16b
	aesimc v1.16b,v1.16b
	st1 {v0.4s},[x0],x4
	st1 {v1.4s},[x2],#16
	cmp x0,x2
	b.hi .Loop_imc

	ld1 {v0.4s},[x2]
	aesimc v0.16b,v0.16b
	st1 {v0.4s},[x0]

	eor x0,x0,x0 // return value
.Ldec_key_abort:
	ldp x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size aes_v8_set_decrypt_key,.-aes_v8_set_decrypt_key
.globl	aes_v8_encrypt
.type	aes_v8_encrypt,%function
.align	5
aes_v8_encrypt:
	AARCH64_VALID_CALL_TARGET
	ldr w3,[x2,#240]
	ld1 {v0.4s},[x2],#16
	ld1 {v2.16b},[x0]
	sub w3,w3,#2
	ld1 {v1.4s},[x2],#16

.Loop_enc:
	aese v2.16b,v0.16b
	aesmc v2.16b,v2.16b
	ld1 {v0.4s},[x2],#16
	subs w3,w3,#2
	aese v2.16b,v1.16b
	aesmc v2.16b,v2.16b
	ld1 {v1.4s},[x2],#16
	b.gt .Loop_enc

	aese v2.16b,v0.16b
	aesmc v2.16b,v2.16b
	ld1 {v0.4s},[x2]
	aese v2.16b,v1.16b
	eor v2.16b,v2.16b,v0.16b

	st1 {v2.16b},[x1]
	ret
.size aes_v8_encrypt,.-aes_v8_encrypt
.globl	aes_v8_decrypt
.type	aes_v8_decrypt,%function
.align	5
aes_v8_decrypt:
	AARCH64_VALID_CALL_TARGET
	ldr w3,[x2,#240]
	ld1 {v0.4s},[x2],#16
	ld1 {v2.16b},[x0]
	sub w3,w3,#2
	ld1 {v1.4s},[x2],#16

.Loop_dec:
	aesd v2.16b,v0.16b
	aesimc v2.16b,v2.16b
	ld1 {v0.4s},[x2],#16
	subs w3,w3,#2
	aesd v2.16b,v1.16b
	aesimc v2.16b,v2.16b
	ld1 {v1.4s},[x2],#16
	b.gt .Loop_dec

	aesd v2.16b,v0.16b
	aesimc v2.16b,v2.16b
	ld1 {v0.4s},[x2]
	aesd v2.16b,v1.16b
	eor v2.16b,v2.16b,v0.16b

	st1 {v2.16b},[x1]
	ret
.size aes_v8_decrypt,.-aes_v8_decrypt
.globl	aes_v8_ecb_encrypt
.type	aes_v8_ecb_encrypt,%function
.align	5
aes_v8_ecb_encrypt:
	AARCH64_VALID_CALL_TARGET
	subs x2,x2,#16
	// Original input data size bigger than 16, jump to big size processing.
	b.ne .Lecb_big_size
	ld1 {v0.16b},[x0]
	cmp w4,#0 // en- or decrypting?
	ldr w5,[x3,#240]
	ld1 {v5.4s,v6.4s},[x3],#32 // load key schedule...

	b.eq .Lecb_small_dec
	aese v0.16b,v5.16b
	aesmc v0.16b,v0.16b
	ld1 {v16.4s,v17.4s},[x3],#32 // load key schedule...
	aese v0.16b,v6.16b
	aesmc v0.16b,v0.16b
	subs w5,w5,#10 // if rounds==10, jump to aes-128-ecb processing
	b.eq .Lecb_128_enc
.Lecb_round_loop:
	aese v0.16b,v16.16b
	aesmc v0.16b,v0.16b
	ld1 {v16.4s},[x3],#16 // load key schedule...
	aese v0.16b,v17.16b
	aesmc v0.16b,v0.16b
	ld1 {v17.4s},[x3],#16 // load key schedule...
	subs w5,w5,#2 // bias
	b.gt .Lecb_round_loop
.Lecb_128_enc:
	ld1 {v18.4s,v19.4s},[x3],#32 // load key schedule...
	aese v0.16b,v16.16b
	aesmc v0.16b,v0.16b
	aese v0.16b,v17.16b
	aesmc v0.16b,v0.16b
	ld1 {v20.4s,v21.4s},[x3],#32 // load key schedule...
	aese v0.16b,v18.16b
	aesmc v0.16b,v0.16b
	aese v0.16b,v19.16b
	aesmc v0.16b,v0.16b
	ld1 {v22.4s,v23.4s},[x3],#32 // load key schedule...
	aese v0.16b,v20.16b
	aesmc v0.16b,v0.16b
	aese v0.16b,v21.16b
	aesmc v0.16b,v0.16b
	ld1 {v7.4s},[x3]
	aese v0.16b,v22.16b
	aesmc v0.16b,v0.16b
	aese v0.16b,v23.16b
	eor v0.16b,v0.16b,v7.16b
	st1 {v0.16b},[x1]
	b .Lecb_Final_abort
.Lecb_small_dec:
	aesd v0.16b,v5.16b
	aesimc v0.16b,v0.16b
	ld1 {v16.4s,v17.4s},[x3],#32 // load key schedule...
	aesd v0.16b,v6.16b
	aesimc v0.16b,v0.16b
	subs w5,w5,#10 // bias
	b.eq .Lecb_128_dec
.Lecb_dec_round_loop:
	aesd v0.16b,v16.16b
	aesimc v0.16b,v0.16b
	ld1 {v16.4s},[x3],#16 // load key schedule...
	aesd v0.16b,v17.16b
	aesimc v0.16b,v0.16b
	ld1 {v17.4s},[x3],#16 // load key schedule...
	subs w5,w5,#2 // bias
	b.gt .Lecb_dec_round_loop
.Lecb_128_dec:
	ld1 {v18.4s,v19.4s},[x3],#32 // load key schedule...
	aesd v0.16b,v16.16b
	aesimc v0.16b,v0.16b
	aesd v0.16b,v17.16b
	aesimc v0.16b,v0.16b
	ld1 {v20.4s,v21.4s},[x3],#32 // load key schedule...
	aesd v0.16b,v18.16b
	aesimc v0.16b,v0.16b
	aesd v0.16b,v19.16b
	aesimc v0.16b,v0.16b
	ld1 {v22.4s,v23.4s},[x3],#32 // load key schedule...
	aesd v0.16b,v20.16b
	aesimc v0.16b,v0.16b
	aesd v0.16b,v21.16b
	aesimc v0.16b,v0.16b
	ld1 {v7.4s},[x3]
	aesd v0.16b,v22.16b
	aesimc v0.16b,v0.16b
	aesd v0.16b,v23.16b
	eor v0.16b,v0.16b,v7.16b
	st1 {v0.16b},[x1]
	b .Lecb_Final_abort
.Lecb_big_size:
	stp x29,x30,[sp,#-16]!
	add x29,sp,#0
	mov x8,#16
	b.lo .Lecb_done
	csel x8,xzr,x8,eq

	cmp w4,#0 // en- or decrypting?
	ldr w5,[x3,#240]
	and x2,x2,#-16
	ld1 {v0.16b},[x0],x8

	ld1 {v16.4s,v17.4s},[x3] // load key schedule...
	sub w5,w5,#6
	add x7,x3,x5,lsl#4 // pointer to last 7 round keys
	sub w5,w5,#2
	ld1 {v18.4s,v19.4s},[x7],#32
	ld1 {v20.4s,v21.4s},[x7],#32
	ld1 {v22.4s,v23.4s},[x7],#32
	ld1 {v7.4s},[x7]

	add x7,x3,#32
	mov w6,w5
	b.eq .Lecb_dec

	ld1 {v1.16b},[x0],#16
	subs x2,x2,#32 // bias
	add w6,w5,#2
	orr v3.16b,v1.16b,v1.16b
	orr v24.16b,v1.16b,v1.16b
	orr v1.16b,v0.16b,v0.16b
	b.lo .Lecb_enc_tail

	orr v1.16b,v3.16b,v3.16b
	ld1 {v24.16b},[x0],#16
	cmp x2,#32
	b.lo .Loop3x_ecb_enc

	ld1 {v25.16b},[x0],#16
	ld1 {v26.16b},[x0],#16
	sub x2,x2,#32 // bias
	mov w6,w5

.Loop5x_ecb_enc:
	aese v0.16b,v16.16b
	aesmc v0.16b,v0.16b
	aese v1.16b,v16.16b
	aesmc v1.16b,v1.16b
	aese v24.16b,v16.16b
	aesmc v24.16b,v24.16b
	aese v25.16b,v16.16b
	aesmc v25.16b,v25.16b
	aese v26.16b,v16.16b
	aesmc v26.16b,v26.16b
	ld1 {v16.4s},[x7],#16
	subs w6,w6,#2
	aese v0.16b,v17.16b
	aesmc v0.16b,v0.16b
	aese v1.16b,v17.16b
	aesmc v1.16b,v1.16b
	aese v24.16b,v17.16b
	aesmc v24.16b,v24.16b
	aese v25.16b,v17.16b
	aesmc v25.16b,v25.16b
	aese v26.16b,v17.16b
	aesmc v26.16b,v26.16b
	ld1 {v17.4s},[x7],#16
	b.gt .Loop5x_ecb_enc

	aese v0.16b,v16.16b
	aesmc v0.16b,v0.16b
	aese v1.16b,v16.16b
	aesmc v1.16b,v1.16b
	aese v24.16b,v16.16b
	aesmc v24.16b,v24.16b
	aese v25.16b,v16.16b
	aesmc v25.16b,v25.16b
	aese v26.16b,v16.16b
	aesmc v26.16b,v26.16b
	cmp x2,#0x40 // because .Lecb_enc_tail4x
	sub x2,x2,#0x50

	aese v0.16b,v17.16b
	aesmc v0.16b,v0.16b
	aese v1.16b,v17.16b
	aesmc v1.16b,v1.16b
	aese v24.16b,v17.16b
	aesmc v24.16b,v24.16b
	aese v25.16b,v17.16b
	aesmc v25.16b,v25.16b
	aese v26.16b,v17.16b
	aesmc v26.16b,v26.16b
	csel x6,xzr,x2,gt // borrow x6, w6, "gt" is not typo
	mov x7,x3

	aese v0.16b,v18.16b
	aesmc v0.16b,v0.16b
	aese v1.16b,v18.16b
	aesmc v1.16b,v1.16b
	aese v24.16b,v18.16b
	aesmc v24.16b,v24.16b
	aese v25.16b,v18.16b
	aesmc v25.16b,v25.16b
	aese v26.16b,v18.16b
	aesmc v26.16b,v26.16b
	add x0,x0,x6 // x0 is adjusted in such way that
	// at exit from the loop v1.16b-v26.16b
	// are loaded with last "words"
	add x6,x2,#0x60 // because .Lecb_enc_tail4x

	aese v0.16b,v19.16b
	aesmc v0.16b,v0.16b
	aese v1.16b,v19.16b
	aesmc v1.16b,v1.16b
	aese v24.16b,v19.16b
	aesmc v24.16b,v24.16b
	aese v25.16b,v19.16b
	aesmc v25.16b,v25.16b
	aese v26.16b,v19.16b
	aesmc v26.16b,v26.16b

	aese v0.16b,v20.16b
	aesmc v0.16b,v0.16b
	aese v1.16b,v20.16b
	aesmc v1.16b,v1.16b
	aese v24.16b,v20.16b
	aesmc v24.16b,v24.16b
	aese v25.16b,v20.16b
	aesmc v25.16b,v25.16b
	aese v26.16b,v20.16b
	aesmc v26.16b,v26.16b

	aese v0.16b,v21.16b
	aesmc v0.16b,v0.16b
	aese v1.16b,v21.16b
	aesmc v1.16b,v1.16b
	aese v24.16b,v21.16b
	aesmc v24.16b,v24.16b
	aese v25.16b,v21.16b
	aesmc v25.16b,v25.16b
	aese v26.16b,v21.16b
	aesmc v26.16b,v26.16b

	aese v0.16b,v22.16b
	aesmc v0.16b,v0.16b
	aese v1.16b,v22.16b
	aesmc v1.16b,v1.16b
	aese v24.16b,v22.16b
	aesmc v24.16b,v24.16b
	aese v25.16b,v22.16b
	aesmc v25.16b,v25.16b
	aese v26.16b,v22.16b
	aesmc v26.16b,v26.16b

	aese v0.16b,v23.16b
	ld1 {v2.16b},[x0],#16
	aese v1.16b,v23.16b
	ld1 {v3.16b},[x0],#16
	aese v24.16b,v23.16b
	ld1 {v27.16b},[x0],#16
	aese v25.16b,v23.16b
	ld1 {v28.16b},[x0],#16
	aese v26.16b,v23.16b
	ld1 {v29.16b},[x0],#16
	cbz x6,.Lecb_enc_tail4x
	ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
	eor v4.16b,v7.16b,v0.16b
	orr v0.16b,v2.16b,v2.16b
	eor v5.16b,v7.16b,v1.16b
	orr v1.16b,v3.16b,v3.16b
	eor v17.16b,v7.16b,v24.16b
	orr v24.16b,v27.16b,v27.16b
	eor v30.16b,v7.16b,v25.16b
	orr v25.16b,v28.16b,v28.16b
	eor v31.16b,v7.16b,v26.16b
	st1 {v4.16b},[x1],#16
	orr v26.16b,v29.16b,v29.16b
	st1 {v5.16b},[x1],#16
	mov w6,w5
	st1 {v17.16b},[x1],#16
	ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
	st1 {v30.16b},[x1],#16
	st1 {v31.16b},[x1],#16
	b.hs .Loop5x_ecb_enc

	add x2,x2,#0x50
	cbz x2,.Lecb_done

	add w6,w5,#2
	subs x2,x2,#0x30
	orr v0.16b,v27.16b,v27.16b
	orr v1.16b,v28.16b,v28.16b
	orr v24.16b,v29.16b,v29.16b
	b.lo .Lecb_enc_tail

	b .Loop3x_ecb_enc

.align	4
.Lecb_enc_tail4x:
	eor v5.16b,v7.16b,v1.16b
	eor v17.16b,v7.16b,v24.16b
	eor v30.16b,v7.16b,v25.16b
	eor v31.16b,v7.16b,v26.16b
	st1 {v5.16b},[x1],#16
	st1 {v17.16b},[x1],#16
	st1 {v30.16b},[x1],#16
	st1 {v31.16b},[x1],#16

	b .Lecb_done
.align	4
.Loop3x_ecb_enc:
	aese v0.16b,v16.16b
	aesmc v0.16b,v0.16b
	aese v1.16b,v16.16b
	aesmc v1.16b,v1.16b
	aese v24.16b,v16.16b
	aesmc v24.16b,v24.16b
	ld1 {v16.4s},[x7],#16
	subs w6,w6,#2
	aese v0.16b,v17.16b
	aesmc v0.16b,v0.16b
	aese v1.16b,v17.16b
	aesmc v1.16b,v1.16b
	aese v24.16b,v17.16b
	aesmc v24.16b,v24.16b
	ld1 {v17.4s},[x7],#16
	b.gt .Loop3x_ecb_enc

	aese v0.16b,v16.16b
	aesmc v0.16b,v0.16b
	aese v1.16b,v16.16b
	aesmc v1.16b,v1.16b
	aese v24.16b,v16.16b
	aesmc v24.16b,v24.16b
	subs x2,x2,#0x30
	csel x6,x2,x6,lo // x6, w6, is zero at this point
	aese v0.16b,v17.16b
	aesmc v0.16b,v0.16b
	aese v1.16b,v17.16b
	aesmc v1.16b,v1.16b
	aese v24.16b,v17.16b
	aesmc v24.16b,v24.16b
	add x0,x0,x6 // x0 is adjusted in such way that
	// at exit from the loop v1.16b-v24.16b
	// are loaded with last "words"
	mov x7,x3
	aese v0.16b,v20.16b
	aesmc v0.16b,v0.16b
	aese v1.16b,v20.16b
	aesmc v1.16b,v1.16b
	aese v24.16b,v20.16b
	aesmc v24.16b,v24.16b
	ld1 {v2.16b},[x0],#16
	aese v0.16b,v21.16b
	aesmc v0.16b,v0.16b
	aese v1.16b,v21.16b
	aesmc v1.16b,v1.16b
	aese v24.16b,v21.16b
	aesmc v24.16b,v24.16b
	ld1 {v3.16b},[x0],#16
	aese v0.16b,v22.16b
	aesmc v0.16b,v0.16b
	aese v1.16b,v22.16b
	aesmc v1.16b,v1.16b
	aese v24.16b,v22.16b
	aesmc v24.16b,v24.16b
	ld1 {v27.16b},[x0],#16
	aese v0.16b,v23.16b
	aese v1.16b,v23.16b
	aese v24.16b,v23.16b
	ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
	add w6,w5,#2
	eor v4.16b,v7.16b,v0.16b
	eor v5.16b,v7.16b,v1.16b
	eor v24.16b,v24.16b,v7.16b
	ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
	st1 {v4.16b},[x1],#16
	orr v0.16b,v2.16b,v2.16b
	st1 {v5.16b},[x1],#16
	orr v1.16b,v3.16b,v3.16b
	st1 {v24.16b},[x1],#16
	orr v24.16b,v27.16b,v27.16b
	b.hs .Loop3x_ecb_enc

	cmn x2,#0x30
	b.eq .Lecb_done
	nop

.Lecb_enc_tail:
	aese v1.16b,v16.16b
	aesmc v1.16b,v1.16b
	aese v24.16b,v16.16b
	aesmc v24.16b,v24.16b
	ld1 {v16.4s},[x7],#16
	subs w6,w6,#2
	aese v1.16b,v17.16b
	aesmc v1.16b,v1.16b
	aese v24.16b,v17.16b
	aesmc v24.16b,v24.16b
	ld1 {v17.4s},[x7],#16
	b.gt .Lecb_enc_tail

	aese v1.16b,v16.16b
	aesmc v1.16b,v1.16b
	aese v24.16b,v16.16b
	aesmc v24.16b,v24.16b
	aese v1.16b,v17.16b
	aesmc v1.16b,v1.16b
	aese v24.16b,v17.16b
	aesmc v24.16b,v24.16b
	aese v1.16b,v20.16b
	aesmc v1.16b,v1.16b
	aese v24.16b,v20.16b
	aesmc v24.16b,v24.16b
	cmn x2,#0x20
	aese v1.16b,v21.16b
	aesmc v1.16b,v1.16b
	aese v24.16b,v21.16b
	aesmc v24.16b,v24.16b
	aese v1.16b,v22.16b
	aesmc v1.16b,v1.16b
	aese v24.16b,v22.16b
	aesmc v24.16b,v24.16b
	aese v1.16b,v23.16b
	aese v24.16b,v23.16b
	b.eq .Lecb_enc_one
	eor v5.16b,v7.16b,v1.16b
	eor v17.16b,v7.16b,v24.16b
	st1 {v5.16b},[x1],#16
	st1 {v17.16b},[x1],#16
	b .Lecb_done

.Lecb_enc_one:
	eor v5.16b,v7.16b,v24.16b
	st1 {v5.16b},[x1],#16
	b .Lecb_done
.align	5
.Lecb_dec:
	ld1 {v1.16b},[x0],#16
	subs x2,x2,#32 // bias
	add w6,w5,#2
	orr v3.16b,v1.16b,v1.16b
	orr v24.16b,v1.16b,v1.16b
	orr v1.16b,v0.16b,v0.16b
	b.lo .Lecb_dec_tail

	orr v1.16b,v3.16b,v3.16b
	ld1 {v24.16b},[x0],#16
	cmp x2,#32
	b.lo .Loop3x_ecb_dec

	ld1 {v25.16b},[x0],#16
	ld1 {v26.16b},[x0],#16
	sub x2,x2,#32 // bias
	mov w6,w5

.Loop5x_ecb_dec:
	aesd v0.16b,v16.16b
	aesimc v0.16b,v0.16b
	aesd v1.16b,v16.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v16.16b
	aesimc v24.16b,v24.16b
	aesd v25.16b,v16.16b
	aesimc v25.16b,v25.16b
	aesd v26.16b,v16.16b
	aesimc v26.16b,v26.16b
	ld1 {v16.4s},[x7],#16
	subs w6,w6,#2
	aesd v0.16b,v17.16b
	aesimc v0.16b,v0.16b
	aesd v1.16b,v17.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v17.16b
	aesimc v24.16b,v24.16b
	aesd v25.16b,v17.16b
	aesimc v25.16b,v25.16b
	aesd v26.16b,v17.16b
	aesimc v26.16b,v26.16b
	ld1 {v17.4s},[x7],#16
	b.gt .Loop5x_ecb_dec

	aesd v0.16b,v16.16b
	aesimc v0.16b,v0.16b
	aesd v1.16b,v16.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v16.16b
	aesimc v24.16b,v24.16b
	aesd v25.16b,v16.16b
	aesimc v25.16b,v25.16b
	aesd v26.16b,v16.16b
	aesimc v26.16b,v26.16b
	cmp x2,#0x40 // because .Lecb_tail4x
	sub x2,x2,#0x50

	aesd v0.16b,v17.16b
	aesimc v0.16b,v0.16b
	aesd v1.16b,v17.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v17.16b
	aesimc v24.16b,v24.16b
	aesd v25.16b,v17.16b
	aesimc v25.16b,v25.16b
	aesd v26.16b,v17.16b
	aesimc v26.16b,v26.16b
	csel x6,xzr,x2,gt // borrow x6, w6, "gt" is not typo
	mov x7,x3

	aesd v0.16b,v18.16b
	aesimc v0.16b,v0.16b
	aesd v1.16b,v18.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v18.16b
	aesimc v24.16b,v24.16b
	aesd v25.16b,v18.16b
	aesimc v25.16b,v25.16b
	aesd v26.16b,v18.16b
	aesimc v26.16b,v26.16b
	add x0,x0,x6 // x0 is adjusted in such way that
	// at exit from the loop v1.16b-v26.16b
	// are loaded with last "words"
	add x6,x2,#0x60 // because .Lecb_tail4x

	aesd v0.16b,v19.16b
	aesimc v0.16b,v0.16b
	aesd v1.16b,v19.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v19.16b
	aesimc v24.16b,v24.16b
	aesd v25.16b,v19.16b
	aesimc v25.16b,v25.16b
	aesd v26.16b,v19.16b
	aesimc v26.16b,v26.16b

	aesd v0.16b,v20.16b
	aesimc v0.16b,v0.16b
	aesd v1.16b,v20.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v20.16b
	aesimc v24.16b,v24.16b
	aesd v25.16b,v20.16b
	aesimc v25.16b,v25.16b
	aesd v26.16b,v20.16b
	aesimc v26.16b,v26.16b

	aesd v0.16b,v21.16b
	aesimc v0.16b,v0.16b
	aesd v1.16b,v21.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v21.16b
	aesimc v24.16b,v24.16b
	aesd v25.16b,v21.16b
	aesimc v25.16b,v25.16b
	aesd v26.16b,v21.16b
	aesimc v26.16b,v26.16b

	aesd v0.16b,v22.16b
	aesimc v0.16b,v0.16b
	aesd v1.16b,v22.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v22.16b
	aesimc v24.16b,v24.16b
	aesd v25.16b,v22.16b
	aesimc v25.16b,v25.16b
	aesd v26.16b,v22.16b
	aesimc v26.16b,v26.16b

	aesd v0.16b,v23.16b
	ld1 {v2.16b},[x0],#16
	aesd v1.16b,v23.16b
	ld1 {v3.16b},[x0],#16
	aesd v24.16b,v23.16b
	ld1 {v27.16b},[x0],#16
	aesd v25.16b,v23.16b
	ld1 {v28.16b},[x0],#16
	aesd v26.16b,v23.16b
	ld1 {v29.16b},[x0],#16
	cbz x6,.Lecb_tail4x
	ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
	eor v4.16b,v7.16b,v0.16b
	orr v0.16b,v2.16b,v2.16b
	eor v5.16b,v7.16b,v1.16b
	orr v1.16b,v3.16b,v3.16b
	eor v17.16b,v7.16b,v24.16b
	orr v24.16b,v27.16b,v27.16b
	eor v30.16b,v7.16b,v25.16b
	orr v25.16b,v28.16b,v28.16b
	eor v31.16b,v7.16b,v26.16b
	st1 {v4.16b},[x1],#16
	orr v26.16b,v29.16b,v29.16b
	st1 {v5.16b},[x1],#16
	mov w6,w5
	st1 {v17.16b},[x1],#16
	ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
	st1 {v30.16b},[x1],#16
	st1 {v31.16b},[x1],#16
	b.hs .Loop5x_ecb_dec

	add x2,x2,#0x50
	cbz x2,.Lecb_done

	add w6,w5,#2
	subs x2,x2,#0x30
	orr v0.16b,v27.16b,v27.16b
	orr v1.16b,v28.16b,v28.16b
	orr v24.16b,v29.16b,v29.16b
	b.lo .Lecb_dec_tail

	b .Loop3x_ecb_dec

.align	4
.Lecb_tail4x:
	eor v5.16b,v7.16b,v1.16b
	eor v17.16b,v7.16b,v24.16b
	eor v30.16b,v7.16b,v25.16b
	eor v31.16b,v7.16b,v26.16b
	st1 {v5.16b},[x1],#16
	st1 {v17.16b},[x1],#16
	st1 {v30.16b},[x1],#16
	st1 {v31.16b},[x1],#16

	b .Lecb_done
.align	4
.Loop3x_ecb_dec:
	aesd v0.16b,v16.16b
	aesimc v0.16b,v0.16b
	aesd v1.16b,v16.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v16.16b
	aesimc v24.16b,v24.16b
	ld1 {v16.4s},[x7],#16
	subs w6,w6,#2
	aesd v0.16b,v17.16b
	aesimc v0.16b,v0.16b
	aesd v1.16b,v17.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v17.16b
	aesimc v24.16b,v24.16b
	ld1 {v17.4s},[x7],#16
	b.gt .Loop3x_ecb_dec

	aesd v0.16b,v16.16b
	aesimc v0.16b,v0.16b
	aesd v1.16b,v16.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v16.16b
	aesimc v24.16b,v24.16b
	subs x2,x2,#0x30
	csel x6,x2,x6,lo // x6, w6, is zero at this point
	aesd v0.16b,v17.16b
	aesimc v0.16b,v0.16b
	aesd v1.16b,v17.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v17.16b
	aesimc v24.16b,v24.16b
	add x0,x0,x6 // x0 is adjusted in such way that
	// at exit from the loop v1.16b-v24.16b
	// are loaded with last "words"
	mov x7,x3
	aesd v0.16b,v20.16b
	aesimc v0.16b,v0.16b
	aesd v1.16b,v20.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v20.16b
	aesimc v24.16b,v24.16b
	ld1 {v2.16b},[x0],#16
	aesd v0.16b,v21.16b
	aesimc v0.16b,v0.16b
	aesd v1.16b,v21.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v21.16b
	aesimc v24.16b,v24.16b
	ld1 {v3.16b},[x0],#16
	aesd v0.16b,v22.16b
	aesimc v0.16b,v0.16b
	aesd v1.16b,v22.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v22.16b
	aesimc v24.16b,v24.16b
	ld1 {v27.16b},[x0],#16
	aesd v0.16b,v23.16b
	aesd v1.16b,v23.16b
	aesd v24.16b,v23.16b
	ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
	add w6,w5,#2
	eor v4.16b,v7.16b,v0.16b
	eor v5.16b,v7.16b,v1.16b
	eor v24.16b,v24.16b,v7.16b
	ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
	st1 {v4.16b},[x1],#16
	orr v0.16b,v2.16b,v2.16b
	st1 {v5.16b},[x1],#16
	orr v1.16b,v3.16b,v3.16b
	st1 {v24.16b},[x1],#16
	orr v24.16b,v27.16b,v27.16b
	b.hs .Loop3x_ecb_dec

	cmn x2,#0x30
	b.eq .Lecb_done
	nop

.Lecb_dec_tail:
	aesd v1.16b,v16.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v16.16b
	aesimc v24.16b,v24.16b
	ld1 {v16.4s},[x7],#16
	subs w6,w6,#2
	aesd v1.16b,v17.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v17.16b
	aesimc v24.16b,v24.16b
	ld1 {v17.4s},[x7],#16
	b.gt .Lecb_dec_tail

	aesd v1.16b,v16.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v16.16b
	aesimc v24.16b,v24.16b
	aesd v1.16b,v17.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v17.16b
	aesimc v24.16b,v24.16b
	aesd v1.16b,v20.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v20.16b
	aesimc v24.16b,v24.16b
	cmn x2,#0x20
	aesd v1.16b,v21.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v21.16b
	aesimc v24.16b,v24.16b
	aesd v1.16b,v22.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v22.16b
	aesimc v24.16b,v24.16b
	aesd v1.16b,v23.16b
	aesd v24.16b,v23.16b
	b.eq .Lecb_dec_one
	eor v5.16b,v7.16b,v1.16b
	eor v17.16b,v7.16b,v24.16b
	st1 {v5.16b},[x1],#16
	st1 {v17.16b},[x1],#16
	b .Lecb_done

.Lecb_dec_one:
	eor v5.16b,v7.16b,v24.16b
	st1 {v5.16b},[x1],#16

.Lecb_done:
	ldr x29,[sp],#16
.Lecb_Final_abort:
	ret
.size aes_v8_ecb_encrypt,.-aes_v8_ecb_encrypt
.globl	aes_v8_cbc_encrypt
.type	aes_v8_cbc_encrypt,%function
.align	5
aes_v8_cbc_encrypt:
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	stp x29,x30,[sp,#-16]!
	add x29,sp,#0
	subs x2,x2,#16
	mov x8,#16
	b.lo .Lcbc_abort
	csel x8,xzr,x8,eq

	cmp w5,#0 // en- or decrypting?
	ldr w5,[x3,#240]
	and x2,x2,#-16
	ld1 {v6.16b},[x4]
	ld1 {v0.16b},[x0],x8

	ld1 {v16.4s,v17.4s},[x3] // load key schedule...
	sub w5,w5,#6
	add x7,x3,x5,lsl#4 // pointer to last 7 round keys
	sub w5,w5,#2
	ld1 {v18.4s,v19.4s},[x7],#32
	ld1 {v20.4s,v21.4s},[x7],#32
	ld1 {v22.4s,v23.4s},[x7],#32
	ld1 {v7.4s},[x7]

	add x7,x3,#32
	mov w6,w5
	b.eq .Lcbc_dec

	cmp w5,#2
	eor v0.16b,v0.16b,v6.16b
	eor v5.16b,v16.16b,v7.16b
	b.eq .Lcbc_enc128

	ld1 {v2.4s,v3.4s},[x7]
	add x7,x3,#16
	add x6,x3,#16*4
	add x12,x3,#16*5
	aese v0.16b,v16.16b
	aesmc v0.16b,v0.16b
	add x14,x3,#16*6
	add x3,x3,#16*7
	b .Lenter_cbc_enc

.align	4
.Loop_cbc_enc:
	aese v0.16b,v16.16b
	aesmc v0.16b,v0.16b
	st1 {v6.16b},[x1],#16
.Lenter_cbc_enc:
	aese v0.16b,v17.16b
	aesmc v0.16b,v0.16b
	aese v0.16b,v2.16b
	aesmc v0.16b,v0.16b
	ld1 {v16.4s},[x6]
	cmp w5,#4
	aese v0.16b,v3.16b
	aesmc v0.16b,v0.16b
	ld1 {v17.4s},[x12]
	b.eq .Lcbc_enc192

	aese v0.16b,v16.16b
	aesmc v0.16b,v0.16b
	ld1 {v16.4s},[x14]
	aese v0.16b,v17.16b
	aesmc v0.16b,v0.16b
	ld1 {v17.4s},[x3]
	nop

.Lcbc_enc192:
	aese v0.16b,v16.16b
	aesmc v0.16b,v0.16b
	subs x2,x2,#16
	aese v0.16b,v17.16b
	aesmc v0.16b,v0.16b
	csel x8,xzr,x8,eq
	aese v0.16b,v18.16b
	aesmc v0.16b,v0.16b
	aese v0.16b,v19.16b
	aesmc v0.16b,v0.16b
	ld1 {v16.16b},[x0],x8
	aese v0.16b,v20.16b
	aesmc v0.16b,v0.16b
	eor v16.16b,v16.16b,v5.16b
	aese v0.16b,v21.16b
	aesmc v0.16b,v0.16b
	ld1 {v17.4s},[x7] // re-pre-load rndkey[1]
	aese v0.16b,v22.16b
	aesmc v0.16b,v0.16b
	aese v0.16b,v23.16b
	eor v6.16b,v0.16b,v7.16b
	b.hs .Loop_cbc_enc

	st1 {v6.16b},[x1],#16
	b .Lcbc_done

.align	5
.Lcbc_enc128:
	ld1 {v2.4s,v3.4s},[x7]
	aese v0.16b,v16.16b
	aesmc v0.16b,v0.16b
	b .Lenter_cbc_enc128
.Loop_cbc_enc128:
	aese v0.16b,v16.16b
	aesmc v0.16b,v0.16b
	st1 {v6.16b},[x1],#16
.Lenter_cbc_enc128:
	aese v0.16b,v17.16b
	aesmc v0.16b,v0.16b
	subs x2,x2,#16
	aese v0.16b,v2.16b
	aesmc v0.16b,v0.16b
	csel x8,xzr,x8,eq
	aese v0.16b,v3.16b
	aesmc v0.16b,v0.16b
	aese v0.16b,v18.16b
	aesmc v0.16b,v0.16b
	aese v0.16b,v19.16b
	aesmc v0.16b,v0.16b
	ld1 {v16.16b},[x0],x8
	aese v0.16b,v20.16b
	aesmc v0.16b,v0.16b
	aese v0.16b,v21.16b
	aesmc v0.16b,v0.16b
	aese v0.16b,v22.16b
	aesmc v0.16b,v0.16b
	eor v16.16b,v16.16b,v5.16b
	aese v0.16b,v23.16b
	eor v6.16b,v0.16b,v7.16b
	b.hs .Loop_cbc_enc128

	st1 {v6.16b},[x1],#16
	b .Lcbc_done
.align	5
.Lcbc_dec:
	ld1 {v24.16b},[x0],#16
	subs x2,x2,#32 // bias
	add w6,w5,#2
	orr v3.16b,v0.16b,v0.16b
	orr v1.16b,v0.16b,v0.16b
	orr v27.16b,v24.16b,v24.16b
	b.lo .Lcbc_dec_tail

	orr v1.16b,v24.16b,v24.16b
	ld1 {v24.16b},[x0],#16
	orr v2.16b,v0.16b,v0.16b
	orr v3.16b,v1.16b,v1.16b
	orr v27.16b,v24.16b,v24.16b
	cmp x2,#32
	b.lo .Loop3x_cbc_dec

	ld1 {v25.16b},[x0],#16
	ld1 {v26.16b},[x0],#16
	sub x2,x2,#32 // bias
	mov w6,w5
	orr v28.16b,v25.16b,v25.16b
	orr v29.16b,v26.16b,v26.16b

.Loop5x_cbc_dec:
	aesd v0.16b,v16.16b
	aesimc v0.16b,v0.16b
	aesd v1.16b,v16.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v16.16b
	aesimc v24.16b,v24.16b
	aesd v25.16b,v16.16b
	aesimc v25.16b,v25.16b
	aesd v26.16b,v16.16b
	aesimc v26.16b,v26.16b
	ld1 {v16.4s},[x7],#16
	subs w6,w6,#2
	aesd v0.16b,v17.16b
	aesimc v0.16b,v0.16b
	aesd v1.16b,v17.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v17.16b
	aesimc v24.16b,v24.16b
	aesd v25.16b,v17.16b
	aesimc v25.16b,v25.16b
	aesd v26.16b,v17.16b
	aesimc v26.16b,v26.16b
	ld1 {v17.4s},[x7],#16
	b.gt .Loop5x_cbc_dec

	aesd v0.16b,v16.16b
	aesimc v0.16b,v0.16b
	aesd v1.16b,v16.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v16.16b
	aesimc v24.16b,v24.16b
	aesd v25.16b,v16.16b
	aesimc v25.16b,v25.16b
	aesd v26.16b,v16.16b
	aesimc v26.16b,v26.16b
	cmp x2,#0x40 // because .Lcbc_tail4x
	sub x2,x2,#0x50

	aesd v0.16b,v17.16b
	aesimc v0.16b,v0.16b
	aesd v1.16b,v17.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v17.16b
	aesimc v24.16b,v24.16b
	aesd v25.16b,v17.16b
	aesimc v25.16b,v25.16b
	aesd v26.16b,v17.16b
	aesimc v26.16b,v26.16b
	csel x6,xzr,x2,gt // borrow x6, w6, "gt" is not typo
	mov x7,x3

	aesd v0.16b,v18.16b
	aesimc v0.16b,v0.16b
	aesd v1.16b,v18.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v18.16b
	aesimc v24.16b,v24.16b
	aesd v25.16b,v18.16b
	aesimc v25.16b,v25.16b
	aesd v26.16b,v18.16b
	aesimc v26.16b,v26.16b
	add x0,x0,x6 // x0 is adjusted in such way that
	// at exit from the loop v1.16b-v26.16b
	// are loaded with last "words"
	add x6,x2,#0x60 // because .Lcbc_tail4x

	aesd v0.16b,v19.16b
	aesimc v0.16b,v0.16b
	aesd v1.16b,v19.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v19.16b
	aesimc v24.16b,v24.16b
	aesd v25.16b,v19.16b
	aesimc v25.16b,v25.16b
	aesd v26.16b,v19.16b
	aesimc v26.16b,v26.16b

	aesd v0.16b,v20.16b
	aesimc v0.16b,v0.16b
	aesd v1.16b,v20.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v20.16b
	aesimc v24.16b,v24.16b
	aesd v25.16b,v20.16b
	aesimc v25.16b,v25.16b
	aesd v26.16b,v20.16b
	aesimc v26.16b,v26.16b

	aesd v0.16b,v21.16b
	aesimc v0.16b,v0.16b
	aesd v1.16b,v21.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v21.16b
	aesimc v24.16b,v24.16b
	aesd v25.16b,v21.16b
	aesimc v25.16b,v25.16b
	aesd v26.16b,v21.16b
	aesimc v26.16b,v26.16b

	aesd v0.16b,v22.16b
	aesimc v0.16b,v0.16b
	aesd v1.16b,v22.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v22.16b
	aesimc v24.16b,v24.16b
	aesd v25.16b,v22.16b
	aesimc v25.16b,v25.16b
	aesd v26.16b,v22.16b
	aesimc v26.16b,v26.16b

	eor v4.16b,v6.16b,v7.16b
	aesd v0.16b,v23.16b
	eor v5.16b,v2.16b,v7.16b
	ld1 {v2.16b},[x0],#16
	aesd v1.16b,v23.16b
	eor v17.16b,v3.16b,v7.16b
	ld1 {v3.16b},[x0],#16
	aesd v24.16b,v23.16b
	eor v30.16b,v27.16b,v7.16b
	ld1 {v27.16b},[x0],#16
	aesd v25.16b,v23.16b
	eor v31.16b,v28.16b,v7.16b
	ld1 {v28.16b},[x0],#16
	aesd v26.16b,v23.16b
	orr v6.16b,v29.16b,v29.16b
	ld1 {v29.16b},[x0],#16
	cbz x6,.Lcbc_tail4x
	ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
	eor v4.16b,v4.16b,v0.16b
	orr v0.16b,v2.16b,v2.16b
	eor v5.16b,v5.16b,v1.16b
	orr v1.16b,v3.16b,v3.16b
	eor v17.16b,v17.16b,v24.16b
	orr v24.16b,v27.16b,v27.16b
	eor v30.16b,v30.16b,v25.16b
	orr v25.16b,v28.16b,v28.16b
	eor v31.16b,v31.16b,v26.16b
	st1 {v4.16b},[x1],#16
	orr v26.16b,v29.16b,v29.16b
	st1 {v5.16b},[x1],#16
	mov w6,w5
	st1 {v17.16b},[x1],#16
	ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
	st1 {v30.16b},[x1],#16
	st1 {v31.16b},[x1],#16
	b.hs .Loop5x_cbc_dec

	add x2,x2,#0x50
	cbz x2,.Lcbc_done

	add w6,w5,#2
	subs x2,x2,#0x30
	orr v0.16b,v27.16b,v27.16b
	orr v2.16b,v27.16b,v27.16b
	orr v1.16b,v28.16b,v28.16b
	orr v3.16b,v28.16b,v28.16b
	orr v24.16b,v29.16b,v29.16b
	orr v27.16b,v29.16b,v29.16b
	b.lo .Lcbc_dec_tail

	b .Loop3x_cbc_dec

.align	4
.Lcbc_tail4x:
	eor v5.16b,v4.16b,v1.16b
	eor v17.16b,v17.16b,v24.16b
	eor v30.16b,v30.16b,v25.16b
	eor v31.16b,v31.16b,v26.16b
	st1 {v5.16b},[x1],#16
	st1 {v17.16b},[x1],#16
	st1 {v30.16b},[x1],#16
	st1 {v31.16b},[x1],#16

	b .Lcbc_done
.align	4
.Loop3x_cbc_dec:
	aesd v0.16b,v16.16b
	aesimc v0.16b,v0.16b
	aesd v1.16b,v16.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v16.16b
	aesimc v24.16b,v24.16b
	ld1 {v16.4s},[x7],#16
	subs w6,w6,#2
	aesd v0.16b,v17.16b
	aesimc v0.16b,v0.16b
	aesd v1.16b,v17.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v17.16b
	aesimc v24.16b,v24.16b
	ld1 {v17.4s},[x7],#16
	b.gt .Loop3x_cbc_dec

	aesd v0.16b,v16.16b
	aesimc v0.16b,v0.16b
	aesd v1.16b,v16.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v16.16b
	aesimc v24.16b,v24.16b
	eor v4.16b,v6.16b,v7.16b
	subs x2,x2,#0x30
	eor v5.16b,v2.16b,v7.16b
	csel x6,x2,x6,lo // x6, w6, is zero at this point
	aesd v0.16b,v17.16b
	aesimc v0.16b,v0.16b
	aesd v1.16b,v17.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v17.16b
	aesimc v24.16b,v24.16b
	eor v17.16b,v3.16b,v7.16b
	add x0,x0,x6 // x0 is adjusted in such way that
	// at exit from the loop v1.16b-v24.16b
	// are loaded with last "words"
	orr v6.16b,v27.16b,v27.16b
	mov x7,x3
	aesd v0.16b,v20.16b
	aesimc v0.16b,v0.16b
	aesd v1.16b,v20.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v20.16b
	aesimc v24.16b,v24.16b
	ld1 {v2.16b},[x0],#16
	aesd v0.16b,v21.16b
	aesimc v0.16b,v0.16b
	aesd v1.16b,v21.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v21.16b
	aesimc v24.16b,v24.16b
	ld1 {v3.16b},[x0],#16
	aesd v0.16b,v22.16b
	aesimc v0.16b,v0.16b
	aesd v1.16b,v22.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v22.16b
	aesimc v24.16b,v24.16b
	ld1 {v27.16b},[x0],#16
	aesd v0.16b,v23.16b
	aesd v1.16b,v23.16b
	aesd v24.16b,v23.16b
	ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]
	add w6,w5,#2
	eor v4.16b,v4.16b,v0.16b
	eor v5.16b,v5.16b,v1.16b
	eor v24.16b,v24.16b,v17.16b
	ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]
	st1 {v4.16b},[x1],#16
	orr v0.16b,v2.16b,v2.16b
	st1 {v5.16b},[x1],#16
	orr v1.16b,v3.16b,v3.16b
	st1 {v24.16b},[x1],#16
	orr v24.16b,v27.16b,v27.16b
	b.hs .Loop3x_cbc_dec

	cmn x2,#0x30
	b.eq .Lcbc_done
	nop

.Lcbc_dec_tail:
	aesd v1.16b,v16.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v16.16b
	aesimc v24.16b,v24.16b
	ld1 {v16.4s},[x7],#16
	subs w6,w6,#2
	aesd v1.16b,v17.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v17.16b
	aesimc v24.16b,v24.16b
	ld1 {v17.4s},[x7],#16
	b.gt .Lcbc_dec_tail

	aesd v1.16b,v16.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v16.16b
	aesimc v24.16b,v24.16b
	aesd v1.16b,v17.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v17.16b
	aesimc v24.16b,v24.16b
	aesd v1.16b,v20.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v20.16b
	aesimc v24.16b,v24.16b
	cmn x2,#0x20
	aesd v1.16b,v21.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v21.16b
	aesimc v24.16b,v24.16b
	eor v5.16b,v6.16b,v7.16b
	aesd v1.16b,v22.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v22.16b
	aesimc v24.16b,v24.16b
	eor v17.16b,v3.16b,v7.16b
	aesd v1.16b,v23.16b
	aesd v24.16b,v23.16b
	b.eq .Lcbc_dec_one
	eor v5.16b,v5.16b,v1.16b
	eor v17.16b,v17.16b,v24.16b
	orr v6.16b,v27.16b,v27.16b
	st1 {v5.16b},[x1],#16
	st1 {v17.16b},[x1],#16
	b .Lcbc_done

.Lcbc_dec_one:
	eor v5.16b,v5.16b,v24.16b
	orr v6.16b,v27.16b,v27.16b
	st1 {v5.16b},[x1],#16

.Lcbc_done:
	st1 {v6.16b},[x4]
.Lcbc_abort:
	ldr x29,[sp],#16
	ret
.size aes_v8_cbc_encrypt,.-aes_v8_cbc_encrypt
.globl	aes_v8_ctr32_encrypt_blocks_unroll12_eor3
.type	aes_v8_ctr32_encrypt_blocks_unroll12_eor3,%function
.align	5
aes_v8_ctr32_encrypt_blocks_unroll12_eor3:
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	stp x29,x30,[sp,#-80]!
	stp d8,d9,[sp, #16]
	stp d10,d11,[sp, #32]
	stp d12,d13,[sp, #48]
	stp d14,d15,[sp, #64]
	add x29,sp,#0

	ldr w5,[x3,#240]

	ldr w8, [x4, #12]
#ifdef __AARCH64EB__
	ld1 {v24.16b},[x4]
#else
	ld1 {v24.4s},[x4]
#endif
	ld1 {v2.4s,v3.4s},[x3] // load key schedule...
	sub w5,w5,#4
	cmp x2,#2
	add x7,x3,x5,lsl#4 // pointer to last round key
	sub w5,w5,#2
	add x7, x7, #64
	ld1 {v1.4s},[x7]
	add x7,x3,#32
	mov w6,w5
#ifndef __AARCH64EB__
	rev w8, w8
#endif

	orr v25.16b,v24.16b,v24.16b
	add w10, w8, #1
	orr v26.16b,v24.16b,v24.16b
	add w8, w8, #2
	orr v0.16b,v24.16b,v24.16b
	rev w10, w10
	mov v25.s[3],w10
	b.ls .Lctr32_tail_unroll
	cmp x2,#6
	rev w12, w8
	sub x2,x2,#3 // bias
	mov v26.s[3],w12
	b.lo .Loop3x_ctr32_unroll
	cmp x2,#9
	orr v27.16b,v24.16b,v24.16b
	add w11, w8, #1
	orr v28.16b,v24.16b,v24.16b
	add w13, w8, #2
	rev w11, w11
	orr v29.16b,v24.16b,v24.16b
	add w8, w8, #3
	rev w13, w13
	mov v27.s[3],w11
	rev w14, w8
	mov v28.s[3],w13
	mov v29.s[3],w14
	sub x2,x2,#3
	b.lo .Loop6x_ctr32_unroll

	// push regs to stack when 12 data chunks are interleaved
	stp x19,x20,[sp,#-16]!
	stp x21,x22,[sp,#-16]!
	stp x23,x24,[sp,#-16]!
	stp d8,d9,[sp,#-32]!
	stp d10,d11,[sp,#-32]!

	add w15,w8,#1
	add w19,w8,#2
	add w20,w8,#3
	add w21,w8,#4
	add w22,w8,#5
	add w8,w8,#6
	orr v30.16b,v24.16b,v24.16b
	rev w15,w15
	orr v31.16b,v24.16b,v24.16b
	rev w19,w19
	orr v8.16b,v24.16b,v24.16b
	rev w20,w20
	orr v9.16b,v24.16b,v24.16b
	rev w21,w21
	orr v10.16b,v24.16b,v24.16b
	rev w22,w22
	orr v11.16b,v24.16b,v24.16b
	rev w23,w8

	sub x2,x2,#6 // bias
	mov v30.s[3],w15
	mov v31.s[3],w19
	mov v8.s[3],w20
	mov v9.s[3],w21
	mov v10.s[3],w22
	mov v11.s[3],w23
	b .Loop12x_ctr32_unroll

.align	4
.Loop12x_ctr32_unroll:
	aese v24.16b,v2.16b
	aesmc v24.16b,v24.16b
	aese v25.16b,v2.16b
	aesmc v25.16b,v25.16b
	aese v26.16b,v2.16b
	aesmc v26.16b,v26.16b
	aese v27.16b,v2.16b
	aesmc v27.16b,v27.16b
	aese v28.16b,v2.16b
	aesmc v28.16b,v28.16b
	aese v29.16b,v2.16b
	aesmc v29.16b,v29.16b
	aese v30.16b,v2.16b
	aesmc v30.16b,v30.16b
	aese v31.16b,v2.16b
	aesmc v31.16b,v31.16b
	aese v8.16b,v2.16b
	aesmc v8.16b,v8.16b
	aese v9.16b,v2.16b
	aesmc v9.16b,v9.16b
	aese v10.16b,v2.16b
	aesmc v10.16b,v10.16b
	aese v11.16b,v2.16b
	aesmc v11.16b,v11.16b
	ld1 {v2.4s},[x7],#16
	subs w6,w6,#2
	aese v24.16b,v3.16b
	aesmc v24.16b,v24.16b
	aese v25.16b,v3.16b
	aesmc v25.16b,v25.16b
	aese v26.16b,v3.16b
	aesmc v26.16b,v26.16b
	aese v27.16b,v3.16b
	aesmc v27.16b,v27.16b
	aese v28.16b,v3.16b
	aesmc v28.16b,v28.16b
	aese v29.16b,v3.16b
	aesmc v29.16b,v29.16b
	aese v30.16b,v3.16b
	aesmc v30.16b,v30.16b
	aese v31.16b,v3.16b
	aesmc v31.16b,v31.16b
	aese v8.16b,v3.16b
	aesmc v8.16b,v8.16b
	aese v9.16b,v3.16b
	aesmc v9.16b,v9.16b
	aese v10.16b,v3.16b
	aesmc v10.16b,v10.16b
	aese v11.16b,v3.16b
	aesmc v11.16b,v11.16b
	ld1 {v3.4s},[x7],#16
	b.gt .Loop12x_ctr32_unroll

	aese v24.16b,v2.16b
	aesmc v24.16b,v24.16b
	aese v25.16b,v2.16b
	aesmc v25.16b,v25.16b
	aese v26.16b,v2.16b
	aesmc v26.16b,v26.16b
	aese v27.16b,v2.16b
	aesmc v27.16b,v27.16b
	aese v28.16b,v2.16b
	aesmc v28.16b,v28.16b
	aese v29.16b,v2.16b
	aesmc v29.16b,v29.16b
	aese v30.16b,v2.16b
	aesmc v30.16b,v30.16b
	aese v31.16b,v2.16b
	aesmc v31.16b,v31.16b
	aese v8.16b,v2.16b
	aesmc v8.16b,v8.16b
	aese v9.16b,v2.16b
	aesmc v9.16b,v9.16b
	aese v10.16b,v2.16b
	aesmc v10.16b,v10.16b
	aese v11.16b,v2.16b
	aesmc v11.16b,v11.16b
	ld1 {v2.4s},[x7],#16

	aese v24.16b,v3.16b
	aesmc v24.16b,v24.16b
	aese v25.16b,v3.16b
	aesmc v25.16b,v25.16b
	aese v26.16b,v3.16b
	aesmc v26.16b,v26.16b
	aese v27.16b,v3.16b
	aesmc v27.16b,v27.16b
	aese v28.16b,v3.16b
	aesmc v28.16b,v28.16b
	aese v29.16b,v3.16b
	aesmc v29.16b,v29.16b
	aese v30.16b,v3.16b
	aesmc v30.16b,v30.16b
	aese v31.16b,v3.16b
	aesmc v31.16b,v31.16b
	aese v8.16b,v3.16b
	aesmc v8.16b,v8.16b
	aese v9.16b,v3.16b
	aesmc v9.16b,v9.16b
	aese v10.16b,v3.16b
	aesmc v10.16b,v10.16b
	aese v11.16b,v3.16b
	aesmc v11.16b,v11.16b
	ld1 {v3.4s},[x7],#16

	aese v24.16b,v2.16b
	aesmc v24.16b,v24.16b
	add w9,w8,#1
	add w10,w8,#2
	aese v25.16b,v2.16b
	aesmc v25.16b,v25.16b
	add w12,w8,#3
	add w11,w8,#4
	aese v26.16b,v2.16b
	aesmc v26.16b,v26.16b
	add w13,w8,#5
	add w14,w8,#6
	rev w9,w9
	aese v27.16b,v2.16b
	aesmc v27.16b,v27.16b
	add w15,w8,#7
	add w19,w8,#8
	rev w10,w10
	rev w12,w12
	aese v28.16b,v2.16b
	aesmc v28.16b,v28.16b
	add w20,w8,#9
	add w21,w8,#10
	rev w11,w11
	rev w13,w13
	aese v29.16b,v2.16b
	aesmc v29.16b,v29.16b
	add w22,w8,#11
	add w23,w8,#12
	rev w14,w14
	rev w15,w15
	aese v30.16b,v2.16b
	aesmc v30.16b,v30.16b
	rev w19,w19
	rev w20,w20
	aese v31.16b,v2.16b
	aesmc v31.16b,v31.16b
	rev w21,w21
	rev w22,w22
	aese v8.16b,v2.16b
	aesmc v8.16b,v8.16b
	rev w23,w23
	aese v9.16b,v2.16b
	aesmc v9.16b,v9.16b
	aese v10.16b,v2.16b
	aesmc v10.16b,v10.16b
	aese v11.16b,v2.16b
	aesmc v11.16b,v11.16b
	ld1 {v2.4s},[x7],#16

	aese v24.16b,v3.16b
	aesmc v24.16b,v24.16b
	aese v25.16b,v3.16b
	aesmc v25.16b,v25.16b
	aese v26.16b,v3.16b
	aesmc v26.16b,v26.16b
	aese v27.16b,v3.16b
	aesmc v27.16b,v27.16b
	ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
	aese v28.16b,v3.16b
	aesmc v28.16b,v28.16b
	aese v29.16b,v3.16b
	aesmc v29.16b,v29.16b
	aese v30.16b,v3.16b
	aesmc v30.16b,v30.16b
	aese v31.16b,v3.16b
	aesmc v31.16b,v31.16b
	ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x0],#64
	aese v8.16b,v3.16b
	aesmc v8.16b,v8.16b
	aese v9.16b,v3.16b
	aesmc v9.16b,v9.16b
	aese v10.16b,v3.16b
	aesmc v10.16b,v10.16b
	aese v11.16b,v3.16b
	aesmc v11.16b,v11.16b
	ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x0],#64
	ld1 {v3.4s},[x7],#16

	mov x7, x3
	aese v24.16b,v2.16b
	aesmc v24.16b,v24.16b
	aese v25.16b,v2.16b
	aesmc v25.16b,v25.16b
	aese v26.16b,v2.16b
	aesmc v26.16b,v26.16b
	aese v27.16b,v2.16b
	aesmc v27.16b,v27.16b
	aese v28.16b,v2.16b
	aesmc v28.16b,v28.16b
	aese v29.16b,v2.16b
	aesmc v29.16b,v29.16b
	aese v30.16b,v2.16b
	aesmc v30.16b,v30.16b
	aese v31.16b,v2.16b
	aesmc v31.16b,v31.16b
	aese v8.16b,v2.16b
	aesmc v8.16b,v8.16b
	aese v9.16b,v2.16b
	aesmc v9.16b,v9.16b
	aese v10.16b,v2.16b
	aesmc v10.16b,v10.16b
	aese v11.16b,v2.16b
	aesmc v11.16b,v11.16b
	ld1 {v2.4s},[x7],#16 // re-pre-load rndkey[0]

	aese v24.16b,v3.16b
.inst 0xce016084 //eor3 v4.16b,v4.16b,v1.16b,v24.16b
	orr v24.16b,v0.16b,v0.16b
	aese v25.16b,v3.16b
.inst 0xce0164a5 //eor3 v5.16b,v5.16b,v1.16b,v25.16b
	orr v25.16b,v0.16b,v0.16b
	aese v26.16b,v3.16b
.inst 0xce0168c6 //eor3 v6.16b,v6.16b,v1.16b,v26.16b
	orr v26.16b,v0.16b,v0.16b
	aese v27.16b,v3.16b
.inst 0xce016ce7 //eor3 v7.16b,v7.16b,v1.16b,v27.16b
	orr v27.16b,v0.16b,v0.16b
	aese v28.16b,v3.16b
.inst 0xce017210 //eor3 v16.16b,v16.16b,v1.16b,v28.16b
	orr v28.16b,v0.16b,v0.16b
	aese v29.16b,v3.16b
.inst 0xce017631 //eor3 v17.16b,v17.16b,v1.16b,v29.16b
	orr v29.16b,v0.16b,v0.16b
	aese v30.16b,v3.16b
.inst 0xce017a52 //eor3 v18.16b,v18.16b,v1.16b,v30.16b
	orr v30.16b,v0.16b,v0.16b
	aese v31.16b,v3.16b
.inst 0xce017e73 //eor3 v19.16b,v19.16b,v1.16b,v31.16b
	orr v31.16b,v0.16b,v0.16b
	aese v8.16b,v3.16b
.inst 0xce012294 //eor3 v20.16b,v20.16b,v1.16b,v8.16b
	orr v8.16b,v0.16b,v0.16b
	aese v9.16b,v3.16b
.inst 0xce0126b5 //eor3 v21.16b,v21.16b,v1.16b,v9.16b
	orr v9.16b,v0.16b,v0.16b
	aese v10.16b,v3.16b
.inst 0xce012ad6 //eor3 v22.16b,v22.16b,v1.16b,v10.16b
	orr v10.16b,v0.16b,v0.16b
	aese v11.16b,v3.16b
.inst 0xce012ef7 //eor3 v23.16b,v23.16b,v1.16b,v11.16b
	orr v11.16b,v0.16b,v0.16b
	ld1 {v3.4s},[x7],#16 // re-pre-load rndkey[1]

	mov v24.s[3],w9
	mov v25.s[3],w10
	mov v26.s[3],w12
	mov v27.s[3],w11
	st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
	mov v28.s[3],w13
	mov v29.s[3],w14
	mov v30.s[3],w15
	mov v31.s[3],w19
	st1 {v16.16b,v17.16b,v18.16b,v19.16b},[x1],#64
	mov v8.s[3],w20
	mov v9.s[3],w21
	mov v10.s[3],w22
	mov v11.s[3],w23
	st1 {v20.16b,v21.16b,v22.16b,v23.16b},[x1],#64

	mov w6,w5

	add w8,w8,#12
	subs x2,x2,#12
	b.hs .Loop12x_ctr32_unroll

	// pop regs from stack when 12 data chunks are interleaved
	ldp d10,d11,[sp],#32
	ldp d8,d9,[sp],#32
	ldp x23,x24,[sp],#16
	ldp x21,x22,[sp],#16
	ldp x19,x20,[sp],#16

	add x2,x2,#12
	cbz x2,.Lctr32_done_unroll
	sub w8,w8,#12

	cmp x2,#2
	b.ls .Lctr32_tail_unroll

	cmp x2,#6
	sub x2,x2,#3 // bias
	add w8,w8,#3
	b.lo .Loop3x_ctr32_unroll

	sub x2,x2,#3
	add w8,w8,#3
	b.lo .Loop6x_ctr32_unroll

.align	4
.Loop6x_ctr32_unroll:
	aese v24.16b,v2.16b
	aesmc v24.16b,v24.16b
	aese v25.16b,v2.16b
	aesmc v25.16b,v25.16b
	aese v26.16b,v2.16b
	aesmc v26.16b,v26.16b
	aese v27.16b,v2.16b
	aesmc v27.16b,v27.16b
	aese v28.16b,v2.16b
	aesmc v28.16b,v28.16b
	aese v29.16b,v2.16b
	aesmc v29.16b,v29.16b
	ld1 {v2.4s},[x7],#16
	subs w6,w6,#2
	aese v24.16b,v3.16b
	aesmc v24.16b,v24.16b
	aese v25.16b,v3.16b
	aesmc v25.16b,v25.16b
	aese v26.16b,v3.16b
	aesmc v26.16b,v26.16b
	aese v27.16b,v3.16b
	aesmc v27.16b,v27.16b
	aese v28.16b,v3.16b
	aesmc v28.16b,v28.16b
	aese v29.16b,v3.16b
	aesmc v29.16b,v29.16b
	ld1 {v3.4s},[x7],#16
	b.gt .Loop6x_ctr32_unroll

	aese v24.16b,v2.16b
	aesmc v24.16b,v24.16b
	aese v25.16b,v2.16b
	aesmc v25.16b,v25.16b
	aese v26.16b,v2.16b
	aesmc v26.16b,v26.16b
	aese v27.16b,v2.16b
	aesmc v27.16b,v27.16b
	aese v28.16b,v2.16b
	aesmc v28.16b,v28.16b
	aese v29.16b,v2.16b
	aesmc v29.16b,v29.16b
	ld1 {v2.4s},[x7],#16

	aese v24.16b,v3.16b
	aesmc v24.16b,v24.16b
	aese v25.16b,v3.16b
	aesmc v25.16b,v25.16b
	aese v26.16b,v3.16b
	aesmc v26.16b,v26.16b
	aese v27.16b,v3.16b
	aesmc v27.16b,v27.16b
	aese v28.16b,v3.16b
	aesmc v28.16b,v28.16b
	aese v29.16b,v3.16b
	aesmc v29.16b,v29.16b
	ld1 {v3.4s},[x7],#16

	aese v24.16b,v2.16b
	aesmc v24.16b,v24.16b
	add w9,w8,#1
	add w10,w8,#2
	aese v25.16b,v2.16b
	aesmc v25.16b,v25.16b
	add w12,w8,#3
	add w11,w8,#4
	aese v26.16b,v2.16b
	aesmc v26.16b,v26.16b
	add w13,w8,#5
	add w14,w8,#6
	rev w9,w9
	aese v27.16b,v2.16b
	aesmc v27.16b,v27.16b
	rev w10,w10
	rev w12,w12
	aese v28.16b,v2.16b
	aesmc v28.16b,v28.16b
	rev w11,w11
	rev w13,w13
	aese v29.16b,v2.16b
	aesmc v29.16b,v29.16b
	rev w14,w14
	ld1 {v2.4s},[x7],#16

	aese v24.16b,v3.16b
	aesmc v24.16b,v24.16b
	aese v25.16b,v3.16b
	aesmc v25.16b,v25.16b
	ld1 {v4.16b,v5.16b,v6.16b,v7.16b},[x0],#64
	aese v26.16b,v3.16b
	aesmc v26.16b,v26.16b
	aese v27.16b,v3.16b
	aesmc v27.16b,v27.16b
	ld1 {v16.16b,v17.16b},[x0],#32
	aese v28.16b,v3.16b
	aesmc v28.16b,v28.16b
	aese v29.16b,v3.16b
	aesmc v29.16b,v29.16b
	ld1 {v3.4s},[x7],#16

	mov x7, x3
	aese v24.16b,v2.16b
	aesmc v24.16b,v24.16b
	aese v25.16b,v2.16b
	aesmc v25.16b,v25.16b
	aese v26.16b,v2.16b
	aesmc v26.16b,v26.16b
	aese v27.16b,v2.16b
	aesmc v27.16b,v27.16b
	aese v28.16b,v2.16b
	aesmc v28.16b,v28.16b
	aese v29.16b,v2.16b
	aesmc v29.16b,v29.16b
	ld1 {v2.4s},[x7],#16 // re-pre-load rndkey[0]

	aese v24.16b,v3.16b
.inst 0xce016084 //eor3 v4.16b,v4.16b,v1.16b,v24.16b
	aese v25.16b,v3.16b
.inst 0xce0164a5 //eor3 v5.16b,v5.16b,v1.16b,v25.16b
	aese v26.16b,v3.16b
.inst 0xce0168c6 //eor3 v6.16b,v6.16b,v1.16b,v26.16b
	aese v27.16b,v3.16b
.inst 0xce016ce7 //eor3 v7.16b,v7.16b,v1.16b,v27.16b
	aese v28.16b,v3.16b
.inst 0xce017210 //eor3 v16.16b,v16.16b,v1.16b,v28.16b
	aese v29.16b,v3.16b
.inst 0xce017631 //eor3 v17.16b,v17.16b,v1.16b,v29.16b
	ld1 {v3.4s},[x7],#16 // re-pre-load rndkey[1]

	orr v24.16b,v0.16b,v0.16b
	orr v25.16b,v0.16b,v0.16b
	orr v26.16b,v0.16b,v0.16b
	orr v27.16b,v0.16b,v0.16b
	orr v28.16b,v0.16b,v0.16b
	orr v29.16b,v0.16b,v0.16b

	mov v24.s[3],w9
	mov v25.s[3],w10
	st1 {v4.16b,v5.16b,v6.16b,v7.16b},[x1],#64
	mov v26.s[3],w12
	mov v27.s[3],w11
	st1 {v16.16b,v17.16b},[x1],#32
	mov v28.s[3],w13
	mov v29.s[3],w14

	cbz x2,.Lctr32_done_unroll
	mov w6,w5

	cmp x2,#2
	b.ls .Lctr32_tail_unroll

	sub x2,x2,#3 // bias
	add w8,w8,#3
	b .Loop3x_ctr32_unroll

.align	4
.Loop3x_ctr32_unroll:
	aese v24.16b,v2.16b
	aesmc v24.16b,v24.16b
	aese v25.16b,v2.16b
	aesmc v25.16b,v25.16b
	aese v26.16b,v2.16b
	aesmc v26.16b,v26.16b
	ld1 {v2.4s},[x7],#16
	subs w6,w6,#2
	aese v24.16b,v3.16b
	aesmc v24.16b,v24.16b
	aese v25.16b,v3.16b
	aesmc v25.16b,v25.16b
	aese v26.16b,v3.16b
	aesmc v26.16b,v26.16b
	ld1 {v3.4s},[x7],#16
	b.gt .Loop3x_ctr32_unroll

	aese v24.16b,v2.16b
	aesmc v9.16b,v24.16b
	aese v25.16b,v2.16b
	aesmc v10.16b,v25.16b
	ld1 {v4.16b,v5.16b,v6.16b},[x0],#48
	orr v24.16b,v0.16b,v0.16b
	aese v26.16b,v2.16b
	aesmc v26.16b,v26.16b
	ld1 {v2.4s},[x7],#16
	orr v25.16b,v0.16b,v0.16b
	aese v9.16b,v3.16b
	aesmc v9.16b,v9.16b
	aese v10.16b,v3.16b
	aesmc v10.16b,v10.16b
	aese v26.16b,v3.16b
	aesmc v11.16b,v26.16b
	ld1 {v3.4s},[x7],#16
	orr v26.16b,v0.16b,v0.16b
	add w9,w8,#1
	aese v9.16b,v2.16b
	aesmc v9.16b,v9.16b
	aese v10.16b,v2.16b
	aesmc v10.16b,v10.16b
	add w10,w8,#2
	aese v11.16b,v2.16b
	aesmc v11.16b,v11.16b
	ld1 {v2.4s},[x7],#16
	add w8,w8,#3
	aese v9.16b,v3.16b
	aesmc v9.16b,v9.16b
	aese v10.16b,v3.16b
	aesmc v10.16b,v10.16b

	rev w9,w9
	aese v11.16b,v3.16b
	aesmc v11.16b,v11.16b
	ld1 {v3.4s},[x7],#16
	mov v24.s[3], w9
	mov x7,x3
	rev w10,w10
	aese v9.16b,v2.16b
	aesmc v9.16b,v9.16b

	aese v10.16b,v2.16b
	aesmc v10.16b,v10.16b
	mov v25.s[3], w10
	rev w12,w8
	aese v11.16b,v2.16b
	aesmc v11.16b,v11.16b
	mov v26.s[3], w12

	aese v9.16b,v3.16b
	aese v10.16b,v3.16b
	aese v11.16b,v3.16b

.inst 0xce012484 //eor3 v4.16b,v4.16b,v1.16b,v9.16b
	ld1 {v2.4s},[x7],#16 // re-pre-load rndkey[0]
.inst 0xce0128a5 //eor3 v5.16b,v5.16b,v1.16b,v10.16b
	mov w6,w5
.inst 0xce012cc6 //eor3 v6.16b,v6.16b,v1.16b,v11.16b
	ld1 {v3.4s},[x7],#16 // re-pre-load rndkey[1]
	st1 {v4.16b,v5.16b,v6.16b},[x1],#48

	cbz x2,.Lctr32_done_unroll

.Lctr32_tail_unroll:
	cmp x2,#1
	b.eq .Lctr32_tail_1_unroll

.Lctr32_tail_2_unroll:
	aese v24.16b,v2.16b
	aesmc v24.16b,v24.16b
	aese v25.16b,v2.16b
	aesmc v25.16b,v25.16b
	ld1 {v2.4s},[x7],#16
	subs w6,w6,#2
	aese v24.16b,v3.16b
	aesmc v24.16b,v24.16b
	aese v25.16b,v3.16b
	aesmc v25.16b,v25.16b
	ld1 {v3.4s},[x7],#16
	b.gt .Lctr32_tail_2_unroll

	aese v24.16b,v2.16b
	aesmc v24.16b,v24.16b
	aese v25.16b,v2.16b
	aesmc v25.16b,v25.16b
	ld1 {v2.4s},[x7],#16
	aese v24.16b,v3.16b
	aesmc v24.16b,v24.16b
	aese v25.16b,v3.16b
	aesmc v25.16b,v25.16b
	ld1 {v3.4s},[x7],#16
	ld1 {v4.16b,v5.16b},[x0],#32
	aese v24.16b,v2.16b
	aesmc v24.16b,v24.16b
	aese v25.16b,v2.16b
	aesmc v25.16b,v25.16b
	ld1 {v2.4s},[x7],#16
	aese v24.16b,v3.16b
	aesmc v24.16b,v24.16b
	aese v25.16b,v3.16b
	aesmc v25.16b,v25.16b
	ld1 {v3.4s},[x7],#16
	aese v24.16b,v2.16b
	aesmc v24.16b,v24.16b
	aese v25.16b,v2.16b
	aesmc v25.16b,v25.16b
	aese v24.16b,v3.16b
	aese v25.16b,v3.16b

.inst 0xce016084 //eor3 v4.16b,v4.16b,v1.16b,v24.16b
.inst 0xce0164a5 //eor3 v5.16b,v5.16b,v1.16b,v25.16b
	st1 {v4.16b,v5.16b},[x1],#32
	b .Lctr32_done_unroll

.Lctr32_tail_1_unroll:
	aese v24.16b,v2.16b
	aesmc v24.16b,v24.16b
	ld1 {v2.4s},[x7],#16
	subs w6,w6,#2
	aese v24.16b,v3.16b
	aesmc v24.16b,v24.16b
	ld1 {v3.4s},[x7],#16
	b.gt .Lctr32_tail_1_unroll

	aese v24.16b,v2.16b
	aesmc v24.16b,v24.16b
	ld1 {v2.4s},[x7],#16
	aese v24.16b,v3.16b
	aesmc v24.16b,v24.16b
	ld1 {v3.4s},[x7],#16
	ld1 {v4.16b},[x0]
	aese v24.16b,v2.16b
	aesmc v24.16b,v24.16b
	ld1 {v2.4s},[x7],#16
	aese v24.16b,v3.16b
	aesmc v24.16b,v24.16b
	ld1 {v3.4s},[x7],#16
	aese v24.16b,v2.16b
	aesmc v24.16b,v24.16b
	aese v24.16b,v3.16b

.inst 0xce016084 //eor3 v4.16b,v4.16b,v1.16b,v24.16b
	st1 {v4.16b},[x1],#16

.Lctr32_done_unroll:
	ldp d8,d9,[sp, #16]
	ldp d10,d11,[sp, #32]
	ldp d12,d13,[sp, #48]
	ldp d14,d15,[sp, #64]
	ldr x29,[sp],#80
	ret
.size aes_v8_ctr32_encrypt_blocks_unroll12_eor3,.-aes_v8_ctr32_encrypt_blocks_unroll12_eor3
.globl	aes_v8_ctr32_encrypt_blocks
.type	aes_v8_ctr32_encrypt_blocks,%function
.align	5
aes_v8_ctr32_encrypt_blocks:
	AARCH64_VALID_CALL_TARGET
	// Armv8.3-A PAuth: even though x30 is pushed to stack it is not popped later.
	stp x29,x30,[sp,#-16]!
	add x29,sp,#0
	ldr w5,[x3,#240]

	ldr w8, [x4, #12]
#ifdef __AARCH64EB__
	ld1 {v0.16b},[x4]
#else
	ld1 {v0.4s},[x4]
#endif
	ld1 {v16.4s,v17.4s},[x3] // load key schedule...
	sub w5,w5,#4
	mov x12,#16
	cmp x2,#2
	add x7,x3,x5,lsl#4 // pointer to last 5 round keys
	sub w5,w5,#2
	ld1 {v20.4s,v21.4s},[x7],#32
	ld1 {v22.4s,v23.4s},[x7],#32
	ld1 {v7.4s},[x7]
	add x7,x3,#32
	mov w6,w5
	csel x12,xzr,x12,lo
#ifndef __AARCH64EB__
	rev w8, w8
#endif
	orr v1.16b,v0.16b,v0.16b
	add w10, w8, #1
	orr v18.16b,v0.16b,v0.16b
	add w8, w8, #2
	orr v6.16b,v0.16b,v0.16b
	rev w10, w10
	mov v1.s[3],w10
	b.ls .Lctr32_tail
	rev w12, w8
	sub x2,x2,#3 // bias
	mov v18.s[3],w12
	cmp x2,#32
	b.lo .Loop3x_ctr32

	add w13,w8,#1
	add w14,w8,#2
	orr v24.16b,v0.16b,v0.16b
	rev w13,w13
	orr v25.16b,v0.16b,v0.16b
	rev w14,w14
	mov v24.s[3],w13
	sub x2,x2,#2 // bias
	mov v25.s[3],w14
	add w8,w8,#2
	b .Loop5x_ctr32

.align	4
.Loop5x_ctr32:
	aese v0.16b,v16.16b
	aesmc v0.16b,v0.16b
	aese v1.16b,v16.16b
	aesmc v1.16b,v1.16b
	aese v18.16b,v16.16b
	aesmc v18.16b,v18.16b
	aese v24.16b,v16.16b
	aesmc v24.16b,v24.16b
	aese v25.16b,v16.16b
	aesmc v25.16b,v25.16b
	ld1 {v16.4s},[x7],#16
	subs w6,w6,#2
	aese v0.16b,v17.16b
	aesmc v0.16b,v0.16b
	aese v1.16b,v17.16b
	aesmc v1.16b,v1.16b
	aese v18.16b,v17.16b
	aesmc v18.16b,v18.16b
	aese v24.16b,v17.16b
	aesmc v24.16b,v24.16b
	aese v25.16b,v17.16b
	aesmc v25.16b,v25.16b
	ld1 {v17.4s},[x7],#16
	b.gt .Loop5x_ctr32

	mov x7,x3
	aese v0.16b,v16.16b
	aesmc v0.16b,v0.16b
	aese v1.16b,v16.16b
	aesmc v1.16b,v1.16b
	aese v18.16b,v16.16b
	aesmc v18.16b,v18.16b
	aese v24.16b,v16.16b
	aesmc v24.16b,v24.16b
	aese v25.16b,v16.16b
	aesmc v25.16b,v25.16b
	ld1 {v16.4s},[x7],#16 // re-pre-load rndkey[0]

	aese v0.16b,v17.16b
	aesmc v0.16b,v0.16b
	aese v1.16b,v17.16b
	aesmc v1.16b,v1.16b
	aese v18.16b,v17.16b
	aesmc v18.16b,v18.16b
	aese v24.16b,v17.16b
	aesmc v24.16b,v24.16b
	aese v25.16b,v17.16b
	aesmc v25.16b,v25.16b
	ld1 {v17.4s},[x7],#16 // re-pre-load rndkey[1]

	aese v0.16b,v20.16b
	aesmc v0.16b,v0.16b
	add w9,w8,#1
	add w10,w8,#2
	aese v1.16b,v20.16b
	aesmc v1.16b,v1.16b
	add w12,w8,#3
	add w13,w8,#4
	aese v18.16b,v20.16b
	aesmc v18.16b,v18.16b
	add w14,w8,#5
	rev w9,w9
	aese v24.16b,v20.16b
	aesmc v24.16b,v24.16b
	rev w10,w10
	rev w12,w12
	aese v25.16b,v20.16b
	aesmc v25.16b,v25.16b
	rev w13,w13
	rev w14,w14

	aese v0.16b,v21.16b
	aesmc v0.16b,v0.16b
	aese v1.16b,v21.16b
	aesmc v1.16b,v1.16b
	aese v18.16b,v21.16b
	aesmc v18.16b,v18.16b
	aese v24.16b,v21.16b
	aesmc v24.16b,v24.16b
	aese v25.16b,v21.16b
	aesmc v25.16b,v25.16b

	aese v0.16b,v22.16b
	aesmc v0.16b,v0.16b
	ld1 {v2.16b},[x0],#16
	aese v1.16b,v22.16b
	aesmc v1.16b,v1.16b
	ld1 {v3.16b},[x0],#16
	aese v18.16b,v22.16b
	aesmc v18.16b,v18.16b
	ld1 {v19.16b},[x0],#16
	aese v24.16b,v22.16b
	aesmc v24.16b,v24.16b
	ld1 {v26.16b},[x0],#16
	aese v25.16b,v22.16b
	aesmc v25.16b,v25.16b
	ld1 {v27.16b},[x0],#16

	aese v0.16b,v23.16b
	eor v2.16b,v2.16b,v7.16b
	aese v1.16b,v23.16b
	eor v3.16b,v3.16b,v7.16b
	aese v18.16b,v23.16b
	eor v19.16b,v19.16b,v7.16b
	aese v24.16b,v23.16b
	eor v26.16b,v26.16b,v7.16b
	aese v25.16b,v23.16b

	eor v2.16b,v2.16b,v0.16b
	orr v0.16b,v6.16b,v6.16b
	eor v3.16b,v3.16b,v1.16b
	orr v1.16b,v6.16b,v6.16b
	eor v19.16b,v19.16b,v18.16b
	orr v18.16b,v6.16b,v6.16b
	eor v26.16b,v26.16b,v24.16b
	orr v24.16b,v6.16b,v6.16b
	eor v27.16b,v27.16b,v25.16b
	orr v25.16b,v6.16b,v6.16b

	st1 {v2.16b},[x1],#16
	mov v0.s[3],w9
	st1 {v3.16b},[x1],#16
	mov v1.s[3],w10
	st1 {v19.16b},[x1],#16
	mov v18.s[3],w12
	st1 {v26.16b},[x1],#16
	mov v24.s[3],w13
	st1 {v27.16b},[x1],#16
	mov v25.s[3],w14

	mov w6,w5
	cbz x2,.Lctr32_done

	add w8,w8,#5
	subs x2,x2,#5
	b.hs .Loop5x_ctr32

	add x2,x2,#5
	sub w8,w8,#5

	cmp x2,#2
	mov x12,#16
	csel x12,xzr,x12,lo
	b.ls .Lctr32_tail

	sub x2,x2,#3	// bias
	add w8,w8,#3
	b .Loop3x_ctr32

.align 4
.Loop3x_ctr32:
	aese v0.16b,v16.16b
	aesmc v0.16b,v0.16b
	aese v1.16b,v16.16b
	aesmc v1.16b,v1.16b
	aese v18.16b,v16.16b
	aesmc v18.16b,v18.16b
	ld1 {v16.4s},[x7],#16
	subs w6,w6,#2
	aese v0.16b,v17.16b
	aesmc v0.16b,v0.16b
	aese v1.16b,v17.16b
	aesmc v1.16b,v1.16b
	aese v18.16b,v17.16b
	aesmc v18.16b,v18.16b
	ld1 {v17.4s},[x7],#16
	b.gt .Loop3x_ctr32

	aese v0.16b,v16.16b
	aesmc v4.16b,v0.16b
	aese v1.16b,v16.16b
	aesmc v5.16b,v1.16b
	ld1 {v2.16b},[x0],#16
	orr v0.16b,v6.16b,v6.16b
	aese v18.16b,v16.16b
	aesmc v18.16b,v18.16b
	ld1 {v3.16b},[x0],#16
	orr v1.16b,v6.16b,v6.16b
	aese v4.16b,v17.16b
	aesmc v4.16b,v4.16b
	aese v5.16b,v17.16b
	aesmc v5.16b,v5.16b
	ld1 {v19.16b},[x0],#16
	mov x7,x3
	aese v18.16b,v17.16b
	aesmc v17.16b,v18.16b
	orr v18.16b,v6.16b,v6.16b
	add w9,w8,#1
	aese v4.16b,v20.16b
	aesmc v4.16b,v4.16b
	aese v5.16b,v20.16b
	aesmc v5.16b,v5.16b
	eor v2.16b,v2.16b,v7.16b
	add w10,w8,#2
	aese v17.16b,v20.16b
	aesmc v17.16b,v17.16b
	eor v3.16b,v3.16b,v7.16b
	add w8,w8,#3
	aese v4.16b,v21.16b
	aesmc v4.16b,v4.16b
	aese v5.16b,v21.16b
	aesmc v5.16b,v5.16b
	eor v19.16b,v19.16b,v7.16b
	rev w9,w9
	aese v17.16b,v21.16b
	aesmc v17.16b,v17.16b
	mov v0.s[3], w9
	rev w10,w10
	aese v4.16b,v22.16b
	aesmc v4.16b,v4.16b
	aese v5.16b,v22.16b
	aesmc v5.16b,v5.16b
	mov v1.s[3], w10
	rev w12,w8
	aese v17.16b,v22.16b
	aesmc v17.16b,v17.16b
	mov v18.s[3], w12
	subs x2,x2,#3
	aese v4.16b,v23.16b
	aese v5.16b,v23.16b
	aese v17.16b,v23.16b

	eor v2.16b,v2.16b,v4.16b
	ld1 {v16.4s},[x7],#16	// re-pre-load rndkey[0]
	st1 {v2.16b},[x1],#16
	eor v3.16b,v3.16b,v5.16b
	mov w6,w5
	st1 {v3.16b},[x1],#16
	eor v19.16b,v19.16b,v17.16b
	ld1 {v17.4s},[x7],#16	// re-pre-load rndkey[1]
	st1 {v19.16b},[x1],#16
	b.hs .Loop3x_ctr32

	adds x2,x2,#3
	b.eq .Lctr32_done
	cmp x2,#1
	mov x12,#16
	csel x12,xzr,x12,eq

.Lctr32_tail:
	aese v0.16b,v16.16b
	aesmc v0.16b,v0.16b
	aese v1.16b,v16.16b
	aesmc v1.16b,v1.16b
	ld1 {v16.4s},[x7],#16
	subs w6,w6,#2
	aese v0.16b,v17.16b
	aesmc v0.16b,v0.16b
	aese v1.16b,v17.16b
	aesmc v1.16b,v1.16b
	ld1 {v17.4s},[x7],#16
	b.gt .Lctr32_tail

	aese v0.16b,v16.16b
	aesmc v0.16b,v0.16b
	aese v1.16b,v16.16b
	aesmc v1.16b,v1.16b
	aese v0.16b,v17.16b
	aesmc v0.16b,v0.16b
	aese v1.16b,v17.16b
	aesmc v1.16b,v1.16b
	ld1 {v2.16b},[x0],x12
	aese v0.16b,v20.16b
	aesmc v0.16b,v0.16b
	aese v1.16b,v20.16b
	aesmc v1.16b,v1.16b
	ld1 {v3.16b},[x0]
	aese v0.16b,v21.16b
	aesmc v0.16b,v0.16b
	aese v1.16b,v21.16b
	aesmc v1.16b,v1.16b
	eor v2.16b,v2.16b,v7.16b
	aese v0.16b,v22.16b
	aesmc v0.16b,v0.16b
	aese v1.16b,v22.16b
	aesmc v1.16b,v1.16b
	eor v3.16b,v3.16b,v7.16b
	aese v0.16b,v23.16b
	aese v1.16b,v23.16b

	cmp x2,#1
	eor v2.16b,v2.16b,v0.16b
	eor v3.16b,v3.16b,v1.16b
	st1 {v2.16b},[x1],#16
	b.eq .Lctr32_done
	st1 {v3.16b},[x1]

.Lctr32_done:
	ldr x29,[sp],#16
	ret
.size aes_v8_ctr32_encrypt_blocks,.-aes_v8_ctr32_encrypt_blocks
.globl aes_v8_xts_encrypt
.type aes_v8_xts_encrypt,%function
.align 5
aes_v8_xts_encrypt:
	AARCH64_VALID_CALL_TARGET
	cmp x2,#16
	// If the original input data size is bigger than 16, jump to big-size processing.
	b.ne .Lxts_enc_big_size
	// Encrypt the iv with key2, as the first XEX iv.
	ldr w6,[x4,#240]
	ld1 {v0.4s},[x4],#16
	ld1 {v6.16b},[x5]
	sub w6,w6,#2
	ld1 {v1.4s},[x4],#16

.Loop_enc_iv_enc:
	aese v6.16b,v0.16b
	aesmc v6.16b,v6.16b
	ld1 {v0.4s},[x4],#16
	subs w6,w6,#2
	aese v6.16b,v1.16b
	aesmc v6.16b,v6.16b
	ld1 {v1.4s},[x4],#16
	b.gt .Loop_enc_iv_enc

	aese v6.16b,v0.16b
	aesmc v6.16b,v6.16b
	ld1 {v0.4s},[x4]
	aese v6.16b,v1.16b
	eor v6.16b,v6.16b,v0.16b

	ld1 {v0.16b},[x0]
	eor v0.16b,v6.16b,v0.16b

	ldr w6,[x3,#240]
	ld1 {v28.4s,v29.4s},[x3],#32	// load key schedule...

	aese v0.16b,v28.16b
	aesmc v0.16b,v0.16b
	ld1 {v16.4s,v17.4s},[x3],#32	// load key schedule...
	aese v0.16b,v29.16b
	aesmc v0.16b,v0.16b
	subs w6,w6,#10	// if rounds==10, jump to aes-128-xts processing
	b.eq .Lxts_128_enc
.Lxts_enc_round_loop:
	aese v0.16b,v16.16b
	aesmc v0.16b,v0.16b
	ld1 {v16.4s},[x3],#16	// load key schedule...
	aese v0.16b,v17.16b
	aesmc v0.16b,v0.16b
	ld1 {v17.4s},[x3],#16	// load key schedule...
	subs w6,w6,#2	// bias
	b.gt .Lxts_enc_round_loop
.Lxts_128_enc:
	ld1 {v18.4s,v19.4s},[x3],#32	// load key schedule...
	aese v0.16b,v16.16b
	aesmc v0.16b,v0.16b
	aese v0.16b,v17.16b
	aesmc v0.16b,v0.16b
	ld1 {v20.4s,v21.4s},[x3],#32	// load key schedule...
	aese v0.16b,v18.16b
	aesmc v0.16b,v0.16b
	aese v0.16b,v19.16b
	aesmc v0.16b,v0.16b
	ld1 {v22.4s,v23.4s},[x3],#32	// load key schedule...
	aese v0.16b,v20.16b
	aesmc v0.16b,v0.16b
	aese v0.16b,v21.16b
	aesmc v0.16b,v0.16b
	ld1 {v7.4s},[x3]
	aese v0.16b,v22.16b
	aesmc v0.16b,v0.16b
	aese v0.16b,v23.16b
	eor v0.16b,v0.16b,v7.16b
	eor v0.16b,v0.16b,v6.16b
	st1 {v0.16b},[x1]
	b .Lxts_enc_final_abort

.align 4
.Lxts_enc_big_size:
	stp x19,x20,[sp,#-64]!
	stp x21,x22,[sp,#48]
	stp d8,d9,[sp,#32]
	stp d10,d11,[sp,#16]

	// tailcnt stores the tail value of length%16.
	and x21,x2,#0xf
	and x2,x2,#-16
	subs x2,x2,#16
	mov x8,#16
	b.lo .Lxts_abort
	csel x8,xzr,x8,eq

	// First, encrypt the iv with key2, as the first XEX iv.
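	// Note: the code below follows the usual XTS/XEX construction.  The first
	// tweak is AES-Enc(key2, iv); each further tweak is the previous one
	// multiplied by x (alpha) in GF(2^128) with reduction polynomial
	// x^128 + x^7 + x^2 + x + 1.  The extr/and/eor/fmov sequences that follow
	// implement this doubling on the {x9,x10} = {low,high} halves of the
	// tweak, with w19 = 0x87 supplying the reduction constant.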
	ldr w6,[x4,#240]
	ld1 {v0.4s},[x4],#16
	ld1 {v6.16b},[x5]
	sub w6,w6,#2
	ld1 {v1.4s},[x4],#16

.Loop_iv_enc:
	aese v6.16b,v0.16b
	aesmc v6.16b,v6.16b
	ld1 {v0.4s},[x4],#16
	subs w6,w6,#2
	aese v6.16b,v1.16b
	aesmc v6.16b,v6.16b
	ld1 {v1.4s},[x4],#16
	b.gt .Loop_iv_enc

	aese v6.16b,v0.16b
	aesmc v6.16b,v6.16b
	ld1 {v0.4s},[x4]
	aese v6.16b,v1.16b
	eor v6.16b,v6.16b,v0.16b

	// The iv for the second block
	// x9 - iv(low), x10 - iv(high)
	// the five ivs are stored in v6.16b,v8.16b,v9.16b,v10.16b,v11.16b
	fmov x9,d6
	fmov x10,v6.d[1]
	mov w19,#0x87
	extr x22,x10,x10,#32
	extr x10,x10,x9,#63
	and w11,w19,w22,asr#31
	eor x9,x11,x9,lsl#1
	fmov d8,x9
	fmov v8.d[1],x10

	ldr w5,[x3,#240]	// load rounds number
	ld1 {v0.16b},[x0],x8

	ld1 {v16.4s,v17.4s},[x3]	// load key schedule...
	sub w5,w5,#6
	add x7,x3,x5,lsl#4	// pointer to last 7 round keys
	sub w5,w5,#2
	ld1 {v18.4s,v19.4s},[x7],#32
	ld1 {v20.4s,v21.4s},[x7],#32
	ld1 {v22.4s,v23.4s},[x7],#32
	ld1 {v7.4s},[x7]

	add x7,x3,#32
	mov w6,w5

	// Encryption
.Lxts_enc:
	ld1 {v24.16b},[x0],#16
	subs x2,x2,#32	// bias
	add w6,w5,#2
	orr v3.16b,v0.16b,v0.16b
	orr v1.16b,v0.16b,v0.16b
	orr v28.16b,v0.16b,v0.16b
	orr v27.16b,v24.16b,v24.16b
	orr v29.16b,v24.16b,v24.16b
	b.lo .Lxts_inner_enc_tail
	eor v0.16b,v0.16b,v6.16b	// before encryption, xor with iv
	eor v24.16b,v24.16b,v8.16b

	// The iv for the third block
	extr x22,x10,x10,#32
	extr x10,x10,x9,#63
	and w11,w19,w22,asr#31
	eor x9,x11,x9,lsl#1
	fmov d9,x9
	fmov v9.d[1],x10


	orr v1.16b,v24.16b,v24.16b
	ld1 {v24.16b},[x0],#16
	orr v2.16b,v0.16b,v0.16b
	orr v3.16b,v1.16b,v1.16b
	eor v27.16b,v24.16b,v9.16b	// the third block
	eor v24.16b,v24.16b,v9.16b
	cmp x2,#32
	b.lo .Lxts_outer_enc_tail

	// The iv for the fourth block
	extr x22,x10,x10,#32
	extr x10,x10,x9,#63
	and w11,w19,w22,asr#31
	eor x9,x11,x9,lsl#1
	fmov d10,x9
	fmov v10.d[1],x10

	ld1 {v25.16b},[x0],#16
	// The iv for the fifth block
	extr x22,x10,x10,#32
	extr x10,x10,x9,#63
	and w11,w19,w22,asr#31
	eor x9,x11,x9,lsl#1
	fmov d11,x9
	fmov v11.d[1],x10

	ld1 {v26.16b},[x0],#16
	eor v25.16b,v25.16b,v10.16b	// the fourth block
	eor v26.16b,v26.16b,v11.16b
	sub x2,x2,#32	// bias
	mov w6,w5
	b .Loop5x_xts_enc

.align 4
.Loop5x_xts_enc:
	aese v0.16b,v16.16b
	aesmc v0.16b,v0.16b
	aese v1.16b,v16.16b
	aesmc v1.16b,v1.16b
	aese v24.16b,v16.16b
	aesmc v24.16b,v24.16b
	aese v25.16b,v16.16b
	aesmc v25.16b,v25.16b
	aese v26.16b,v16.16b
	aesmc v26.16b,v26.16b
	ld1 {v16.4s},[x7],#16
	subs w6,w6,#2
	aese v0.16b,v17.16b
	aesmc v0.16b,v0.16b
	aese v1.16b,v17.16b
	aesmc v1.16b,v1.16b
	aese v24.16b,v17.16b
	aesmc v24.16b,v24.16b
	aese v25.16b,v17.16b
	aesmc v25.16b,v25.16b
	aese v26.16b,v17.16b
	aesmc v26.16b,v26.16b
	ld1 {v17.4s},[x7],#16
	b.gt .Loop5x_xts_enc

	aese v0.16b,v16.16b
	aesmc v0.16b,v0.16b
	aese v1.16b,v16.16b
	aesmc v1.16b,v1.16b
	aese v24.16b,v16.16b
	aesmc v24.16b,v24.16b
	aese v25.16b,v16.16b
	aesmc v25.16b,v25.16b
	aese v26.16b,v16.16b
	aesmc v26.16b,v26.16b
	subs x2,x2,#0x50	// because .Lxts_enc_tail4x

	aese v0.16b,v17.16b
	aesmc v0.16b,v0.16b
	aese v1.16b,v17.16b
	aesmc v1.16b,v1.16b
	aese v24.16b,v17.16b
	aesmc v24.16b,v24.16b
	aese v25.16b,v17.16b
	aesmc v25.16b,v25.16b
	aese v26.16b,v17.16b
	aesmc v26.16b,v26.16b
	csel x6,xzr,x2,gt	// borrow x6 (w6); "gt" is not a typo
	mov x7,x3

	aese v0.16b,v18.16b
	aesmc v0.16b,v0.16b
	aese v1.16b,v18.16b
	aesmc v1.16b,v1.16b
	aese v24.16b,v18.16b
	aesmc v24.16b,v24.16b
	aese v25.16b,v18.16b
	aesmc v25.16b,v25.16b
	aese v26.16b,v18.16b
	aesmc v26.16b,v26.16b
	add x0,x0,x6	// x0 is adjusted in such a way that
			// at exit from the loop v1.16b-v26.16b
			// are loaded with the last "words"
	add x6,x2,#0x60	// because .Lxts_enc_tail4x

	aese v0.16b,v19.16b
	aesmc v0.16b,v0.16b
	aese v1.16b,v19.16b
	aesmc v1.16b,v1.16b
	aese v24.16b,v19.16b
	aesmc v24.16b,v24.16b
	aese v25.16b,v19.16b
	aesmc v25.16b,v25.16b
	aese v26.16b,v19.16b
	aesmc v26.16b,v26.16b

	aese v0.16b,v20.16b
	aesmc v0.16b,v0.16b
	aese v1.16b,v20.16b
	aesmc v1.16b,v1.16b
	aese v24.16b,v20.16b
	aesmc v24.16b,v24.16b
	aese v25.16b,v20.16b
	aesmc v25.16b,v25.16b
	aese v26.16b,v20.16b
	aesmc v26.16b,v26.16b

	aese v0.16b,v21.16b
	aesmc v0.16b,v0.16b
	aese v1.16b,v21.16b
	aesmc v1.16b,v1.16b
	aese v24.16b,v21.16b
	aesmc v24.16b,v24.16b
	aese v25.16b,v21.16b
	aesmc v25.16b,v25.16b
	aese v26.16b,v21.16b
	aesmc v26.16b,v26.16b

	aese v0.16b,v22.16b
	aesmc v0.16b,v0.16b
	aese v1.16b,v22.16b
	aesmc v1.16b,v1.16b
	aese v24.16b,v22.16b
	aesmc v24.16b,v24.16b
	aese v25.16b,v22.16b
	aesmc v25.16b,v25.16b
	aese v26.16b,v22.16b
	aesmc v26.16b,v26.16b

	eor v4.16b,v7.16b,v6.16b
	aese v0.16b,v23.16b
	// The iv for the first block of the next iteration
	extr x22,x10,x10,#32
	extr x10,x10,x9,#63
	and w11,w19,w22,asr#31
	eor x9,x11,x9,lsl#1
	fmov d6,x9
	fmov v6.d[1],x10
	eor v5.16b,v7.16b,v8.16b
	ld1 {v2.16b},[x0],#16
	aese v1.16b,v23.16b
	// The iv for the second block
	extr x22,x10,x10,#32
	extr x10,x10,x9,#63
	and w11,w19,w22,asr#31
	eor x9,x11,x9,lsl#1
	fmov d8,x9
	fmov v8.d[1],x10
	eor v17.16b,v7.16b,v9.16b
	ld1 {v3.16b},[x0],#16
	aese v24.16b,v23.16b
	// The iv for the third block
	extr x22,x10,x10,#32
	extr x10,x10,x9,#63
	and w11,w19,w22,asr#31
	eor x9,x11,x9,lsl#1
	fmov d9,x9
	fmov v9.d[1],x10
	eor v30.16b,v7.16b,v10.16b
	ld1 {v27.16b},[x0],#16
	aese v25.16b,v23.16b
	// The iv for the fourth block
	extr x22,x10,x10,#32
	extr x10,x10,x9,#63
	and w11,w19,w22,asr#31
	eor x9,x11,x9,lsl#1
	fmov d10,x9
	fmov v10.d[1],x10
	eor v31.16b,v7.16b,v11.16b
	ld1 {v28.16b},[x0],#16
	aese v26.16b,v23.16b

	// The iv for the fifth block
	extr x22,x10,x10,#32
	extr x10,x10,x9,#63
	and w11,w19,w22,asr #31
	eor x9,x11,x9,lsl #1
	fmov d11,x9
	fmov v11.d[1],x10

	ld1 {v29.16b},[x0],#16
	cbz x6,.Lxts_enc_tail4x
	ld1 {v16.4s},[x7],#16	// re-pre-load rndkey[0]
	eor v4.16b,v4.16b,v0.16b
	eor v0.16b,v2.16b,v6.16b
	eor v5.16b,v5.16b,v1.16b
	eor v1.16b,v3.16b,v8.16b
	eor v17.16b,v17.16b,v24.16b
	eor v24.16b,v27.16b,v9.16b
	eor v30.16b,v30.16b,v25.16b
	eor v25.16b,v28.16b,v10.16b
	eor v31.16b,v31.16b,v26.16b
	st1 {v4.16b},[x1],#16
	eor v26.16b,v29.16b,v11.16b
	st1 {v5.16b},[x1],#16
	mov w6,w5
	st1 {v17.16b},[x1],#16
	ld1 {v17.4s},[x7],#16	// re-pre-load rndkey[1]
	st1 {v30.16b},[x1],#16
	st1 {v31.16b},[x1],#16
	b.hs .Loop5x_xts_enc


	// If 4 blocks are left, borrow the five-block processing.
	cmn x2,#0x10
	b.ne .Loop5x_enc_after
	orr v11.16b,v10.16b,v10.16b
	orr v10.16b,v9.16b,v9.16b
	orr v9.16b,v8.16b,v8.16b
	orr v8.16b,v6.16b,v6.16b
	fmov x9,d11
	fmov x10,v11.d[1]
	eor v0.16b,v6.16b,v2.16b
	eor v1.16b,v8.16b,v3.16b
	eor v24.16b,v27.16b,v9.16b
	eor v25.16b,v28.16b,v10.16b
	eor v26.16b,v29.16b,v11.16b
	b.eq .Loop5x_xts_enc

.Loop5x_enc_after:
	add x2,x2,#0x50
	cbz x2,.Lxts_enc_done

	add w6,w5,#2
	subs x2,x2,#0x30
	b.lo .Lxts_inner_enc_tail

	eor v0.16b,v6.16b,v27.16b
	eor v1.16b,v8.16b,v28.16b
	eor v24.16b,v29.16b,v9.16b
	b .Lxts_outer_enc_tail

.align 4
.Lxts_enc_tail4x:
	add x0,x0,#16
	eor v5.16b,v1.16b,v5.16b
	st1 {v5.16b},[x1],#16
	eor v17.16b,v24.16b,v17.16b
	st1 {v17.16b},[x1],#16
	eor v30.16b,v25.16b,v30.16b
	eor v31.16b,v26.16b,v31.16b
	st1 {v30.16b,v31.16b},[x1],#32

	b .Lxts_enc_done
.align 4
.Lxts_outer_enc_tail:
	aese v0.16b,v16.16b
	aesmc v0.16b,v0.16b
	aese v1.16b,v16.16b
	aesmc v1.16b,v1.16b
	aese v24.16b,v16.16b
	aesmc v24.16b,v24.16b
	ld1 {v16.4s},[x7],#16
	subs w6,w6,#2
	aese v0.16b,v17.16b
	aesmc v0.16b,v0.16b
	aese v1.16b,v17.16b
	aesmc v1.16b,v1.16b
	aese v24.16b,v17.16b
	aesmc v24.16b,v24.16b
	ld1 {v17.4s},[x7],#16
	b.gt .Lxts_outer_enc_tail

	aese v0.16b,v16.16b
	aesmc v0.16b,v0.16b
	aese v1.16b,v16.16b
	aesmc v1.16b,v1.16b
	aese v24.16b,v16.16b
	aesmc v24.16b,v24.16b
	eor v4.16b,v6.16b,v7.16b
	subs x2,x2,#0x30
	// The iv for the first block
	fmov x9,d9
	fmov x10,v9.d[1]
	//mov w19,#0x87
	extr x22,x10,x10,#32
	extr x10,x10,x9,#63
	and w11,w19,w22,asr#31
	eor x9,x11,x9,lsl#1
	fmov d6,x9
	fmov v6.d[1],x10
	eor v5.16b,v8.16b,v7.16b
	csel x6,x2,x6,lo	// x6 (w6) is zero at this point
	aese v0.16b,v17.16b
	aesmc v0.16b,v0.16b
	aese v1.16b,v17.16b
	aesmc v1.16b,v1.16b
	aese v24.16b,v17.16b
	aesmc v24.16b,v24.16b
	eor v17.16b,v9.16b,v7.16b

	add x6,x6,#0x20
	add x0,x0,x6
	mov x7,x3

	aese v0.16b,v20.16b
	aesmc v0.16b,v0.16b
	aese v1.16b,v20.16b
	aesmc v1.16b,v1.16b
	aese v24.16b,v20.16b
	aesmc v24.16b,v24.16b
	aese v0.16b,v21.16b
	aesmc v0.16b,v0.16b
	aese v1.16b,v21.16b
	aesmc v1.16b,v1.16b
	aese v24.16b,v21.16b
	aesmc v24.16b,v24.16b
	aese v0.16b,v22.16b
	aesmc v0.16b,v0.16b
	aese v1.16b,v22.16b
	aesmc v1.16b,v1.16b
	aese v24.16b,v22.16b
	aesmc v24.16b,v24.16b
	aese v0.16b,v23.16b
	aese v1.16b,v23.16b
	aese v24.16b,v23.16b
	ld1 {v27.16b},[x0],#16
	add w6,w5,#2
	ld1 {v16.4s},[x7],#16	// re-pre-load rndkey[0]
	eor v4.16b,v4.16b,v0.16b
	eor v5.16b,v5.16b,v1.16b
	eor v24.16b,v24.16b,v17.16b
	ld1 {v17.4s},[x7],#16	// re-pre-load rndkey[1]
	st1 {v4.16b},[x1],#16
	st1 {v5.16b},[x1],#16
	st1 {v24.16b},[x1],#16
	cmn x2,#0x30
	b.eq .Lxts_enc_done
.Lxts_encxor_one:
	orr v28.16b,v3.16b,v3.16b
	orr v29.16b,v27.16b,v27.16b
	nop

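	// Note: by this point x2 has been biased, so in the inner tail below
	// x2 == -0x10 means two full blocks remain and x2 == -0x20 means a
	// single block remains (mirroring the comment on the decrypt-side
	// .Lxts_inner_dec_tail).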
.Lxts_inner_enc_tail:
	cmn x2,#0x10
	eor v1.16b,v28.16b,v6.16b
	eor v24.16b,v29.16b,v8.16b
	b.eq .Lxts_enc_tail_loop
	eor v24.16b,v29.16b,v6.16b
.Lxts_enc_tail_loop:
	aese v1.16b,v16.16b
	aesmc v1.16b,v1.16b
	aese v24.16b,v16.16b
	aesmc v24.16b,v24.16b
	ld1 {v16.4s},[x7],#16
	subs w6,w6,#2
	aese v1.16b,v17.16b
	aesmc v1.16b,v1.16b
	aese v24.16b,v17.16b
	aesmc v24.16b,v24.16b
	ld1 {v17.4s},[x7],#16
	b.gt .Lxts_enc_tail_loop

	aese v1.16b,v16.16b
	aesmc v1.16b,v1.16b
	aese v24.16b,v16.16b
	aesmc v24.16b,v24.16b
	aese v1.16b,v17.16b
	aesmc v1.16b,v1.16b
	aese v24.16b,v17.16b
	aesmc v24.16b,v24.16b
	aese v1.16b,v20.16b
	aesmc v1.16b,v1.16b
	aese v24.16b,v20.16b
	aesmc v24.16b,v24.16b
	cmn x2,#0x20
	aese v1.16b,v21.16b
	aesmc v1.16b,v1.16b
	aese v24.16b,v21.16b
	aesmc v24.16b,v24.16b
	eor v5.16b,v6.16b,v7.16b
	aese v1.16b,v22.16b
	aesmc v1.16b,v1.16b
	aese v24.16b,v22.16b
	aesmc v24.16b,v24.16b
	eor v17.16b,v8.16b,v7.16b
	aese v1.16b,v23.16b
	aese v24.16b,v23.16b
	b.eq .Lxts_enc_one
	eor v5.16b,v5.16b,v1.16b
	st1 {v5.16b},[x1],#16
	eor v17.16b,v17.16b,v24.16b
	orr v6.16b,v8.16b,v8.16b
	st1 {v17.16b},[x1],#16
	fmov x9,d8
	fmov x10,v8.d[1]
	mov w19,#0x87
	extr x22,x10,x10,#32
	extr x10,x10,x9,#63
	and w11,w19,w22,asr #31
	eor x9,x11,x9,lsl #1
	fmov d6,x9
	fmov v6.d[1],x10
	b .Lxts_enc_done

.Lxts_enc_one:
	eor v5.16b,v5.16b,v24.16b
	orr v6.16b,v6.16b,v6.16b
	st1 {v5.16b},[x1],#16
	fmov x9,d6
	fmov x10,v6.d[1]
	mov w19,#0x87
	extr x22,x10,x10,#32
	extr x10,x10,x9,#63
	and w11,w19,w22,asr #31
	eor x9,x11,x9,lsl #1
	fmov d6,x9
	fmov v6.d[1],x10
	b .Lxts_enc_done
.align 5
.Lxts_enc_done:
	// Process the tail block with cipher stealing.
	tst x21,#0xf
	b.eq .Lxts_abort

	mov x20,x0
	mov x13,x1
	sub x1,x1,#16
.composite_enc_loop:
	subs x21,x21,#1
	ldrb w15,[x1,x21]
	ldrb w14,[x20,x21]
	strb w15,[x13,x21]
	strb w14,[x1,x21]
	b.gt .composite_enc_loop
.Lxts_enc_load_done:
	ld1 {v26.16b},[x1]
	eor v26.16b,v26.16b,v6.16b

	// Encrypt the composite block to get the second-to-last encrypted text block
	ldr w6,[x3,#240]	// load key schedule...
	ld1 {v0.4s},[x3],#16
	sub w6,w6,#2
	ld1 {v1.4s},[x3],#16	// load key schedule...
.Loop_final_enc:
	aese v26.16b,v0.16b
	aesmc v26.16b,v26.16b
	ld1 {v0.4s},[x3],#16
	subs w6,w6,#2
	aese v26.16b,v1.16b
	aesmc v26.16b,v26.16b
	ld1 {v1.4s},[x3],#16
	b.gt .Loop_final_enc

	aese v26.16b,v0.16b
	aesmc v26.16b,v26.16b
	ld1 {v0.4s},[x3]
	aese v26.16b,v1.16b
	eor v26.16b,v26.16b,v0.16b
	eor v26.16b,v26.16b,v6.16b
	st1 {v26.16b},[x1]

.Lxts_abort:
	ldp x21,x22,[sp,#48]
	ldp d8,d9,[sp,#32]
	ldp d10,d11,[sp,#16]
	ldp x19,x20,[sp],#64
.Lxts_enc_final_abort:
	ret
.size aes_v8_xts_encrypt,.-aes_v8_xts_encrypt
.globl aes_v8_xts_decrypt
.type aes_v8_xts_decrypt,%function
.align 5
aes_v8_xts_decrypt:
	AARCH64_VALID_CALL_TARGET
	cmp x2,#16
	// If the original input data size is bigger than 16, jump to big-size processing.
	b.ne .Lxts_dec_big_size
	// Encrypt the iv with key2, as the first XEX iv.
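	// Note: even on the decrypt path the tweak is produced with AES
	// encryption (aese/aesmc) under key2, as XTS requires; only the data
	// blocks themselves are decrypted with aesd/aesimc.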
	ldr w6,[x4,#240]
	ld1 {v0.4s},[x4],#16
	ld1 {v6.16b},[x5]
	sub w6,w6,#2
	ld1 {v1.4s},[x4],#16

.Loop_dec_small_iv_enc:
	aese v6.16b,v0.16b
	aesmc v6.16b,v6.16b
	ld1 {v0.4s},[x4],#16
	subs w6,w6,#2
	aese v6.16b,v1.16b
	aesmc v6.16b,v6.16b
	ld1 {v1.4s},[x4],#16
	b.gt .Loop_dec_small_iv_enc

	aese v6.16b,v0.16b
	aesmc v6.16b,v6.16b
	ld1 {v0.4s},[x4]
	aese v6.16b,v1.16b
	eor v6.16b,v6.16b,v0.16b

	ld1 {v0.16b},[x0]
	eor v0.16b,v6.16b,v0.16b

	ldr w6,[x3,#240]
	ld1 {v28.4s,v29.4s},[x3],#32	// load key schedule...

	aesd v0.16b,v28.16b
	aesimc v0.16b,v0.16b
	ld1 {v16.4s,v17.4s},[x3],#32	// load key schedule...
	aesd v0.16b,v29.16b
	aesimc v0.16b,v0.16b
	subs w6,w6,#10	// bias
	b.eq .Lxts_128_dec
.Lxts_dec_round_loop:
	aesd v0.16b,v16.16b
	aesimc v0.16b,v0.16b
	ld1 {v16.4s},[x3],#16	// load key schedule...
	aesd v0.16b,v17.16b
	aesimc v0.16b,v0.16b
	ld1 {v17.4s},[x3],#16	// load key schedule...
	subs w6,w6,#2	// bias
	b.gt .Lxts_dec_round_loop
.Lxts_128_dec:
	ld1 {v18.4s,v19.4s},[x3],#32	// load key schedule...
	aesd v0.16b,v16.16b
	aesimc v0.16b,v0.16b
	aesd v0.16b,v17.16b
	aesimc v0.16b,v0.16b
	ld1 {v20.4s,v21.4s},[x3],#32	// load key schedule...
	aesd v0.16b,v18.16b
	aesimc v0.16b,v0.16b
	aesd v0.16b,v19.16b
	aesimc v0.16b,v0.16b
	ld1 {v22.4s,v23.4s},[x3],#32	// load key schedule...
	aesd v0.16b,v20.16b
	aesimc v0.16b,v0.16b
	aesd v0.16b,v21.16b
	aesimc v0.16b,v0.16b
	ld1 {v7.4s},[x3]
	aesd v0.16b,v22.16b
	aesimc v0.16b,v0.16b
	aesd v0.16b,v23.16b
	eor v0.16b,v0.16b,v7.16b
	eor v0.16b,v6.16b,v0.16b
	st1 {v0.16b},[x1]
	b .Lxts_dec_final_abort
.Lxts_dec_big_size:
	stp x19,x20,[sp,#-64]!
	stp x21,x22,[sp,#48]
	stp d8,d9,[sp,#32]
	stp d10,d11,[sp,#16]

	and x21,x2,#0xf
	and x2,x2,#-16
	subs x2,x2,#16
	mov x8,#16
	b.lo .Lxts_dec_abort

	// Encrypt the iv with key2, as the first XEX iv
	ldr w6,[x4,#240]
	ld1 {v0.4s},[x4],#16
	ld1 {v6.16b},[x5]
	sub w6,w6,#2
	ld1 {v1.4s},[x4],#16

.Loop_dec_iv_enc:
	aese v6.16b,v0.16b
	aesmc v6.16b,v6.16b
	ld1 {v0.4s},[x4],#16
	subs w6,w6,#2
	aese v6.16b,v1.16b
	aesmc v6.16b,v6.16b
	ld1 {v1.4s},[x4],#16
	b.gt .Loop_dec_iv_enc

	aese v6.16b,v0.16b
	aesmc v6.16b,v6.16b
	ld1 {v0.4s},[x4]
	aese v6.16b,v1.16b
	eor v6.16b,v6.16b,v0.16b

	// The iv for the second block
	// x9 - iv(low), x10 - iv(high)
	// the five ivs are stored in v6.16b,v8.16b,v9.16b,v10.16b,v11.16b
	fmov x9,d6
	fmov x10,v6.d[1]
	mov w19,#0x87
	extr x22,x10,x10,#32
	extr x10,x10,x9,#63
	and w11,w19,w22,asr #31
	eor x9,x11,x9,lsl #1
	fmov d8,x9
	fmov v8.d[1],x10

	ldr w5,[x3,#240]	// load rounds number

	// The iv for the third block
	extr x22,x10,x10,#32
	extr x10,x10,x9,#63
	and w11,w19,w22,asr #31
	eor x9,x11,x9,lsl #1
	fmov d9,x9
	fmov v9.d[1],x10

	ld1 {v16.4s,v17.4s},[x3]	// load key schedule...
	sub w5,w5,#6
	add x7,x3,x5,lsl#4	// pointer to last 7 round keys
	sub w5,w5,#2
	ld1 {v18.4s,v19.4s},[x7],#32	// load key schedule...
	ld1 {v20.4s,v21.4s},[x7],#32
	ld1 {v22.4s,v23.4s},[x7],#32
	ld1 {v7.4s},[x7]

	// The iv for the fourth block
	extr x22,x10,x10,#32
	extr x10,x10,x9,#63
	and w11,w19,w22,asr #31
	eor x9,x11,x9,lsl #1
	fmov d10,x9
	fmov v10.d[1],x10

	add x7,x3,#32
	mov w6,w5
	b .Lxts_dec

	// Decryption
.align 5
.Lxts_dec:
	tst x21,#0xf
	b.eq .Lxts_dec_begin
	subs x2,x2,#16
	csel x8,xzr,x8,eq
	ld1 {v0.16b},[x0],#16
	b.lo .Lxts_done
	sub x0,x0,#16
.Lxts_dec_begin:
	ld1 {v0.16b},[x0],x8
	subs x2,x2,#32	// bias
	add w6,w5,#2
	orr v3.16b,v0.16b,v0.16b
	orr v1.16b,v0.16b,v0.16b
	orr v28.16b,v0.16b,v0.16b
	ld1 {v24.16b},[x0],#16
	orr v27.16b,v24.16b,v24.16b
	orr v29.16b,v24.16b,v24.16b
	b.lo .Lxts_inner_dec_tail
	eor v0.16b,v0.16b,v6.16b	// before decrypt, xor with iv
	eor v24.16b,v24.16b,v8.16b

	orr v1.16b,v24.16b,v24.16b
	ld1 {v24.16b},[x0],#16
	orr v2.16b,v0.16b,v0.16b
	orr v3.16b,v1.16b,v1.16b
	eor v27.16b,v24.16b,v9.16b	// third block xor with third iv
	eor v24.16b,v24.16b,v9.16b
	cmp x2,#32
	b.lo .Lxts_outer_dec_tail

	ld1 {v25.16b},[x0],#16

	// The iv for the fifth block
	extr x22,x10,x10,#32
	extr x10,x10,x9,#63
	and w11,w19,w22,asr #31
	eor x9,x11,x9,lsl #1
	fmov d11,x9
	fmov v11.d[1],x10

	ld1 {v26.16b},[x0],#16
	eor v25.16b,v25.16b,v10.16b	// the fourth block
	eor v26.16b,v26.16b,v11.16b
	sub x2,x2,#32	// bias
	mov w6,w5
	b .Loop5x_xts_dec

.align 4
.Loop5x_xts_dec:
	aesd v0.16b,v16.16b
	aesimc v0.16b,v0.16b
	aesd v1.16b,v16.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v16.16b
	aesimc v24.16b,v24.16b
	aesd v25.16b,v16.16b
	aesimc v25.16b,v25.16b
	aesd v26.16b,v16.16b
	aesimc v26.16b,v26.16b
	ld1 {v16.4s},[x7],#16	// load key schedule...
	subs w6,w6,#2
	aesd v0.16b,v17.16b
	aesimc v0.16b,v0.16b
	aesd v1.16b,v17.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v17.16b
	aesimc v24.16b,v24.16b
	aesd v25.16b,v17.16b
	aesimc v25.16b,v25.16b
	aesd v26.16b,v17.16b
	aesimc v26.16b,v26.16b
	ld1 {v17.4s},[x7],#16	// load key schedule...
	b.gt .Loop5x_xts_dec

	aesd v0.16b,v16.16b
	aesimc v0.16b,v0.16b
	aesd v1.16b,v16.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v16.16b
	aesimc v24.16b,v24.16b
	aesd v25.16b,v16.16b
	aesimc v25.16b,v25.16b
	aesd v26.16b,v16.16b
	aesimc v26.16b,v26.16b
	subs x2,x2,#0x50	// because .Lxts_dec_tail4x

	aesd v0.16b,v17.16b
	aesimc v0.16b,v0.16b
	aesd v1.16b,v17.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v17.16b
	aesimc v24.16b,v24.16b
	aesd v25.16b,v17.16b
	aesimc v25.16b,v25.16b
	aesd v26.16b,v17.16b
	aesimc v26.16b,v26.16b
	csel x6,xzr,x2,gt	// borrow x6 (w6); "gt" is not a typo
	mov x7,x3

	aesd v0.16b,v18.16b
	aesimc v0.16b,v0.16b
	aesd v1.16b,v18.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v18.16b
	aesimc v24.16b,v24.16b
	aesd v25.16b,v18.16b
	aesimc v25.16b,v25.16b
	aesd v26.16b,v18.16b
	aesimc v26.16b,v26.16b
	add x0,x0,x6	// x0 is adjusted in such a way that
			// at exit from the loop v1.16b-v26.16b
			// are loaded with the last "words"
	add x6,x2,#0x60	// because .Lxts_dec_tail4x

	aesd v0.16b,v19.16b
	aesimc v0.16b,v0.16b
	aesd v1.16b,v19.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v19.16b
	aesimc v24.16b,v24.16b
	aesd v25.16b,v19.16b
	aesimc v25.16b,v25.16b
	aesd v26.16b,v19.16b
	aesimc v26.16b,v26.16b

	aesd v0.16b,v20.16b
	aesimc v0.16b,v0.16b
	aesd v1.16b,v20.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v20.16b
	aesimc v24.16b,v24.16b
	aesd v25.16b,v20.16b
	aesimc v25.16b,v25.16b
	aesd v26.16b,v20.16b
	aesimc v26.16b,v26.16b

	aesd v0.16b,v21.16b
	aesimc v0.16b,v0.16b
	aesd v1.16b,v21.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v21.16b
	aesimc v24.16b,v24.16b
	aesd v25.16b,v21.16b
	aesimc v25.16b,v25.16b
	aesd v26.16b,v21.16b
	aesimc v26.16b,v26.16b

	aesd v0.16b,v22.16b
	aesimc v0.16b,v0.16b
	aesd v1.16b,v22.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v22.16b
	aesimc v24.16b,v24.16b
	aesd v25.16b,v22.16b
	aesimc v25.16b,v25.16b
	aesd v26.16b,v22.16b
	aesimc v26.16b,v26.16b

	eor v4.16b,v7.16b,v6.16b
	aesd v0.16b,v23.16b
	// The iv for the first block of the next iteration.
	extr x22,x10,x10,#32
	extr x10,x10,x9,#63
	and w11,w19,w22,asr #31
	eor x9,x11,x9,lsl #1
	fmov d6,x9
	fmov v6.d[1],x10
	eor v5.16b,v7.16b,v8.16b
	ld1 {v2.16b},[x0],#16
	aesd v1.16b,v23.16b
	// The iv for the second block
	extr x22,x10,x10,#32
	extr x10,x10,x9,#63
	and w11,w19,w22,asr #31
	eor x9,x11,x9,lsl #1
	fmov d8,x9
	fmov v8.d[1],x10
	eor v17.16b,v7.16b,v9.16b
	ld1 {v3.16b},[x0],#16
	aesd v24.16b,v23.16b
	// The iv for the third block
	extr x22,x10,x10,#32
	extr x10,x10,x9,#63
	and w11,w19,w22,asr #31
	eor x9,x11,x9,lsl #1
	fmov d9,x9
	fmov v9.d[1],x10
	eor v30.16b,v7.16b,v10.16b
	ld1 {v27.16b},[x0],#16
	aesd v25.16b,v23.16b
	// The iv for the fourth block
	extr x22,x10,x10,#32
	extr x10,x10,x9,#63
	and w11,w19,w22,asr #31
	eor x9,x11,x9,lsl #1
	fmov d10,x9
	fmov v10.d[1],x10
	eor v31.16b,v7.16b,v11.16b
	ld1 {v28.16b},[x0],#16
	aesd v26.16b,v23.16b

	// The iv for the fifth block
	extr x22,x10,x10,#32
	extr x10,x10,x9,#63
	and w11,w19,w22,asr #31
	eor x9,x11,x9,lsl #1
	fmov d11,x9
	fmov v11.d[1],x10

	ld1 {v29.16b},[x0],#16
	cbz x6,.Lxts_dec_tail4x
	ld1 {v16.4s},[x7],#16	// re-pre-load rndkey[0]
	eor v4.16b,v4.16b,v0.16b
	eor v0.16b,v2.16b,v6.16b
	eor v5.16b,v5.16b,v1.16b
	eor v1.16b,v3.16b,v8.16b
	eor v17.16b,v17.16b,v24.16b
	eor v24.16b,v27.16b,v9.16b
	eor v30.16b,v30.16b,v25.16b
	eor v25.16b,v28.16b,v10.16b
	eor v31.16b,v31.16b,v26.16b
	st1 {v4.16b},[x1],#16
	eor v26.16b,v29.16b,v11.16b
	st1 {v5.16b},[x1],#16
	mov w6,w5
	st1 {v17.16b},[x1],#16
	ld1 {v17.4s},[x7],#16	// re-pre-load rndkey[1]
	st1 {v30.16b},[x1],#16
	st1 {v31.16b},[x1],#16
	b.hs .Loop5x_xts_dec

	cmn x2,#0x10
	b.ne .Loop5x_dec_after
	// If x2 equals -0x10, 4 blocks are left.
	// After special processing, the five-block processing is used again.
	// It will use the following IVs: v6.16b,v6.16b,v8.16b,v9.16b,v10.16b.
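	// Note: the tweaks are shifted down one register (v8<-v6, v9<-v8,
	// v10<-v9, v11<-v10) so the four remaining blocks line up with tweaks
	// that were already computed; the extra fifth lane produced by this
	// re-entry appears to be discarded, since .Lxts_dec_tail4x stores only
	// four outputs.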
	orr v11.16b,v10.16b,v10.16b
	orr v10.16b,v9.16b,v9.16b
	orr v9.16b,v8.16b,v8.16b
	orr v8.16b,v6.16b,v6.16b
	fmov x9,d11
	fmov x10,v11.d[1]
	eor v0.16b,v6.16b,v2.16b
	eor v1.16b,v8.16b,v3.16b
	eor v24.16b,v27.16b,v9.16b
	eor v25.16b,v28.16b,v10.16b
	eor v26.16b,v29.16b,v11.16b
	b.eq .Loop5x_xts_dec

.Loop5x_dec_after:
	add x2,x2,#0x50
	cbz x2,.Lxts_done

	add w6,w5,#2
	subs x2,x2,#0x30
	b.lo .Lxts_inner_dec_tail

	eor v0.16b,v6.16b,v27.16b
	eor v1.16b,v8.16b,v28.16b
	eor v24.16b,v29.16b,v9.16b
	b .Lxts_outer_dec_tail

.align 4
.Lxts_dec_tail4x:
	add x0,x0,#16
	tst x21,#0xf
	eor v5.16b,v1.16b,v4.16b
	st1 {v5.16b},[x1],#16
	eor v17.16b,v24.16b,v17.16b
	st1 {v17.16b},[x1],#16
	eor v30.16b,v25.16b,v30.16b
	eor v31.16b,v26.16b,v31.16b
	st1 {v30.16b,v31.16b},[x1],#32

	b.eq .Lxts_dec_abort
	ld1 {v0.16b},[x0],#16
	b .Lxts_done
.align 4
.Lxts_outer_dec_tail:
	aesd v0.16b,v16.16b
	aesimc v0.16b,v0.16b
	aesd v1.16b,v16.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v16.16b
	aesimc v24.16b,v24.16b
	ld1 {v16.4s},[x7],#16
	subs w6,w6,#2
	aesd v0.16b,v17.16b
	aesimc v0.16b,v0.16b
	aesd v1.16b,v17.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v17.16b
	aesimc v24.16b,v24.16b
	ld1 {v17.4s},[x7],#16
	b.gt .Lxts_outer_dec_tail

	aesd v0.16b,v16.16b
	aesimc v0.16b,v0.16b
	aesd v1.16b,v16.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v16.16b
	aesimc v24.16b,v24.16b
	eor v4.16b,v6.16b,v7.16b
	subs x2,x2,#0x30
	// The iv for the first block
	fmov x9,d9
	fmov x10,v9.d[1]
	mov w19,#0x87
	extr x22,x10,x10,#32
	extr x10,x10,x9,#63
	and w11,w19,w22,asr #31
	eor x9,x11,x9,lsl #1
	fmov d6,x9
	fmov v6.d[1],x10
	eor v5.16b,v8.16b,v7.16b
	csel x6,x2,x6,lo	// x6 (w6) is zero at this point
	aesd v0.16b,v17.16b
	aesimc v0.16b,v0.16b
	aesd v1.16b,v17.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v17.16b
	aesimc v24.16b,v24.16b
	eor v17.16b,v9.16b,v7.16b
	// The iv for the second block
	extr x22,x10,x10,#32
	extr x10,x10,x9,#63
	and w11,w19,w22,asr #31
	eor x9,x11,x9,lsl #1
	fmov d8,x9
	fmov v8.d[1],x10

	add x6,x6,#0x20
	add x0,x0,x6	// x0 is adjusted to point at the last data

	mov x7,x3

	// The iv for the third block
	extr x22,x10,x10,#32
	extr x10,x10,x9,#63
	and w11,w19,w22,asr #31
	eor x9,x11,x9,lsl #1
	fmov d9,x9
	fmov v9.d[1],x10

	aesd v0.16b,v20.16b
	aesimc v0.16b,v0.16b
	aesd v1.16b,v20.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v20.16b
	aesimc v24.16b,v24.16b
	aesd v0.16b,v21.16b
	aesimc v0.16b,v0.16b
	aesd v1.16b,v21.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v21.16b
	aesimc v24.16b,v24.16b
	aesd v0.16b,v22.16b
	aesimc v0.16b,v0.16b
	aesd v1.16b,v22.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v22.16b
	aesimc v24.16b,v24.16b
	ld1 {v27.16b},[x0],#16
	aesd v0.16b,v23.16b
	aesd v1.16b,v23.16b
	aesd v24.16b,v23.16b
	ld1 {v16.4s},[x7],#16	// re-pre-load rndkey[0]
	add w6,w5,#2
	eor v4.16b,v4.16b,v0.16b
	eor v5.16b,v5.16b,v1.16b
	eor v24.16b,v24.16b,v17.16b
	ld1 {v17.4s},[x7],#16	// re-pre-load rndkey[1]
	st1 {v4.16b},[x1],#16
	st1 {v5.16b},[x1],#16
	st1 {v24.16b},[x1],#16

	cmn x2,#0x30
	add x2,x2,#0x30
	b.eq .Lxts_done
	sub x2,x2,#0x30
	orr v28.16b,v3.16b,v3.16b
	orr v29.16b,v27.16b,v27.16b
	nop

.Lxts_inner_dec_tail:
	// x2 == -0x10 means two blocks left.
	cmn x2,#0x10
	eor v1.16b,v28.16b,v6.16b
	eor v24.16b,v29.16b,v8.16b
	b.eq .Lxts_dec_tail_loop
	eor v24.16b,v29.16b,v6.16b
.Lxts_dec_tail_loop:
	aesd v1.16b,v16.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v16.16b
	aesimc v24.16b,v24.16b
	ld1 {v16.4s},[x7],#16
	subs w6,w6,#2
	aesd v1.16b,v17.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v17.16b
	aesimc v24.16b,v24.16b
	ld1 {v17.4s},[x7],#16
	b.gt .Lxts_dec_tail_loop

	aesd v1.16b,v16.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v16.16b
	aesimc v24.16b,v24.16b
	aesd v1.16b,v17.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v17.16b
	aesimc v24.16b,v24.16b
	aesd v1.16b,v20.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v20.16b
	aesimc v24.16b,v24.16b
	cmn x2,#0x20
	aesd v1.16b,v21.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v21.16b
	aesimc v24.16b,v24.16b
	eor v5.16b,v6.16b,v7.16b
	aesd v1.16b,v22.16b
	aesimc v1.16b,v1.16b
	aesd v24.16b,v22.16b
	aesimc v24.16b,v24.16b
	eor v17.16b,v8.16b,v7.16b
	aesd v1.16b,v23.16b
	aesd v24.16b,v23.16b
	b.eq .Lxts_dec_one
	eor v5.16b,v5.16b,v1.16b
	eor v17.16b,v17.16b,v24.16b
	orr v6.16b,v9.16b,v9.16b
	orr v8.16b,v10.16b,v10.16b
	st1 {v5.16b},[x1],#16
	st1 {v17.16b},[x1],#16
	add x2,x2,#16
	b .Lxts_done

.Lxts_dec_one:
	eor v5.16b,v5.16b,v24.16b
	orr v6.16b,v8.16b,v8.16b
	orr v8.16b,v9.16b,v9.16b
	st1 {v5.16b},[x1],#16
	add x2,x2,#32

.Lxts_done:
	tst x21,#0xf
	b.eq .Lxts_dec_abort
	// Process the last two blocks with cipher stealing.
	mov x7,x3
	cbnz x2,.Lxts_dec_1st_done
	ld1 {v0.16b},[x0],#16

	// Decrypt the second-to-last block to get the last plain text block
.Lxts_dec_1st_done:
	eor v26.16b,v0.16b,v8.16b
	ldr w6,[x3,#240]
	ld1 {v0.4s},[x3],#16
	sub w6,w6,#2
	ld1 {v1.4s},[x3],#16
.Loop_final_2nd_dec:
	aesd v26.16b,v0.16b
	aesimc v26.16b,v26.16b
	ld1 {v0.4s},[x3],#16	// load key schedule...
	subs w6,w6,#2
	aesd v26.16b,v1.16b
	aesimc v26.16b,v26.16b
	ld1 {v1.4s},[x3],#16	// load key schedule...
	b.gt .Loop_final_2nd_dec

	aesd v26.16b,v0.16b
	aesimc v26.16b,v26.16b
	ld1 {v0.4s},[x3]
	aesd v26.16b,v1.16b
	eor v26.16b,v26.16b,v0.16b
	eor v26.16b,v26.16b,v8.16b
	st1 {v26.16b},[x1]

	mov x20,x0
	add x13,x1,#16

	// Splice the tailcnt bytes (the 16-byte-unaligned tail) into the second-to-last
	// plain block to reconstruct the last encrypted block.
.composite_dec_loop:
	subs x21,x21,#1
	ldrb w15,[x1,x21]
	ldrb w14,[x20,x21]
	strb w15,[x13,x21]
	strb w14,[x1,x21]
	b.gt .composite_dec_loop
.Lxts_dec_load_done:
	ld1 {v26.16b},[x1]
	eor v26.16b,v26.16b,v6.16b

	// Decrypt the composite block to get the second-to-last plain text block
	ldr w6,[x7,#240]
	ld1 {v0.4s},[x7],#16
	sub w6,w6,#2
	ld1 {v1.4s},[x7],#16
.Loop_final_dec:
	aesd v26.16b,v0.16b
	aesimc v26.16b,v26.16b
	ld1 {v0.4s},[x7],#16	// load key schedule...
	subs w6,w6,#2
	aesd v26.16b,v1.16b
	aesimc v26.16b,v26.16b
	ld1 {v1.4s},[x7],#16	// load key schedule...
	b.gt .Loop_final_dec

	aesd v26.16b,v0.16b
	aesimc v26.16b,v26.16b
	ld1 {v0.4s},[x7]
	aesd v26.16b,v1.16b
	eor v26.16b,v26.16b,v0.16b
	eor v26.16b,v26.16b,v6.16b
	st1 {v26.16b},[x1]

.Lxts_dec_abort:
	ldp x21,x22,[sp,#48]
	ldp d8,d9,[sp,#32]
	ldp d10,d11,[sp,#16]
	ldp x19,x20,[sp],#64

.Lxts_dec_final_abort:
	ret
.size aes_v8_xts_decrypt,.-aes_v8_xts_decrypt
#endif