/*
 * linux/arch/arm64/crypto/aes-modes.S - chaining mode wrappers for AES
 *
 * Copyright (C) 2013 Linaro Ltd <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

/* included by aes-ce.S and aes-neon.S */

	.text
	.align		4

/*
 * There are several ways to instantiate this code:
 * - no interleave, all inline
 * - 2-way interleave, 2x calls out of line (-DINTERLEAVE=2)
 * - 2-way interleave, all inline (-DINTERLEAVE=2 -DINTERLEAVE_INLINE)
 * - 4-way interleave, 4x calls out of line (-DINTERLEAVE=4)
 * - 4-way interleave, all inline (-DINTERLEAVE=4 -DINTERLEAVE_INLINE)
 *
 * Macros imported by this code:
 * - enc_prepare	- setup NEON registers for encryption
 * - dec_prepare	- setup NEON registers for decryption
 * - enc_switch_key	- change to new key after having prepared for encryption
 * - encrypt_block	- encrypt a single block
 * - decrypt_block	- decrypt a single block
 * - encrypt_block2x	- encrypt 2 blocks in parallel (if INTERLEAVE == 2)
 * - decrypt_block2x	- decrypt 2 blocks in parallel (if INTERLEAVE == 2)
 * - encrypt_block4x	- encrypt 4 blocks in parallel (if INTERLEAVE == 4)
 * - decrypt_block4x	- decrypt 4 blocks in parallel (if INTERLEAVE == 4)
 */

#if defined(INTERLEAVE) && !defined(INTERLEAVE_INLINE)
/*
 * Out-of-line variant: the interleaved block routines are reached via bl,
 * so a frame record must be pushed/popped around bodies that use them.
 * NOTE: ';' is a statement separator (not a comment) in AArch64 gas, so
 * FRAME_PUSH expands to two instructions.
 */
#define FRAME_PUSH	stp x29, x30, [sp,#-16]! ; mov x29, sp
#define FRAME_POP	ldp x29, x30, [sp],#16

#if INTERLEAVE == 2

/* out-of-line 2-way helpers, called via bl from the do_* macros below */
aes_encrypt_block2x:
	encrypt_block2x	v0, v1, w3, x2, x6, w7
	ret
ENDPROC(aes_encrypt_block2x)

aes_decrypt_block2x:
	decrypt_block2x	v0, v1, w3, x2, x6, w7
	ret
ENDPROC(aes_decrypt_block2x)

#elif INTERLEAVE == 4

/* out-of-line 4-way helpers, called via bl from the do_* macros below */
aes_encrypt_block4x:
	encrypt_block4x	v0, v1, v2, v3, w3, x2, x6, w7
	ret
ENDPROC(aes_encrypt_block4x)

aes_decrypt_block4x:
	decrypt_block4x	v0, v1, v2, v3, w3, x2, x6, w7
	ret
ENDPROC(aes_decrypt_block4x)

#else
#error INTERLEAVE should equal 2 or 4
#endif

	.macro		do_encrypt_block2x
	bl		aes_encrypt_block2x
	.endm

	.macro		do_decrypt_block2x
	bl		aes_decrypt_block2x
	.endm

	.macro		do_encrypt_block4x
	bl		aes_encrypt_block4x
	.endm

	.macro		do_decrypt_block4x
	bl		aes_decrypt_block4x
	.endm

#else
/* inline variant: no calls, hence no frame record needed */
#define FRAME_PUSH
#define FRAME_POP

	.macro		do_encrypt_block2x
	encrypt_block2x	v0, v1, w3, x2, x6, w7
	.endm

	.macro		do_decrypt_block2x
	decrypt_block2x	v0, v1, w3, x2, x6, w7
	.endm

	.macro		do_encrypt_block4x
	encrypt_block4x	v0, v1, v2, v3, w3, x2, x6, w7
	.endm

	.macro		do_decrypt_block4x
	decrypt_block4x	v0, v1, v2, v3, w3, x2, x6, w7
	.endm

#endif

	/*
	 * aes_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks, int first)
	 * aes_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks, int first)
	 *
	 * x0: out, x1: in, x2: round keys, w3: # rounds, w4: # blocks,
	 * w5: 'first' flag - key schedule is (re)loaded only when set.
	 */

AES_ENTRY(aes_ecb_encrypt)
	FRAME_PUSH
	cbz		w5, .LecbencloopNx	/* key already loaded? */

	enc_prepare	w3, x2, x5

.LecbencloopNx:
#if INTERLEAVE >= 2
	subs		w4, w4, #INTERLEAVE
	bmi		.Lecbenc1x		/* < INTERLEAVE blocks left */
#if INTERLEAVE == 2
	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 pt blocks */
	do_encrypt_block2x
	st1		{v0.16b-v1.16b}, [x0], #32
#else
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
	do_encrypt_block4x
	st1		{v0.16b-v3.16b}, [x0], #64
#endif
	b		.LecbencloopNx
.Lecbenc1x:
	adds		w4, w4, #INTERLEAVE	/* undo bias; 0 => done */
	beq		.Lecbencout
#endif
.Lecbencloop:					/* 1 block at a time */
	ld1		{v0.16b}, [x1], #16	/* get next pt block */
	encrypt_block	v0, w3, x2, x5, w6
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	bne		.Lecbencloop
.Lecbencout:
	FRAME_POP
	ret
AES_ENDPROC(aes_ecb_encrypt)


AES_ENTRY(aes_ecb_decrypt)
	FRAME_PUSH
	cbz		w5, .LecbdecloopNx	/* key already loaded? */

	dec_prepare	w3, x2, x5

.LecbdecloopNx:
#if INTERLEAVE >= 2
	subs		w4, w4, #INTERLEAVE
	bmi		.Lecbdec1x		/* < INTERLEAVE blocks left */
#if INTERLEAVE == 2
	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 ct blocks */
	do_decrypt_block2x
	st1		{v0.16b-v1.16b}, [x0], #32
#else
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
	do_decrypt_block4x
	st1		{v0.16b-v3.16b}, [x0], #64
#endif
	b		.LecbdecloopNx
.Lecbdec1x:
	adds		w4, w4, #INTERLEAVE	/* undo bias; 0 => done */
	beq		.Lecbdecout
#endif
.Lecbdecloop:					/* 1 block at a time */
	ld1		{v0.16b}, [x1], #16	/* get next ct block */
	decrypt_block	v0, w3, x2, x5, w6
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	bne		.Lecbdecloop
.Lecbdecout:
	FRAME_POP
	ret
AES_ENDPROC(aes_ecb_decrypt)


	/*
	 * aes_cbc_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks, u8 iv[], int first)
	 * aes_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks, u8 iv[], int first)
	 *
	 * x5: IV buffer (read on first call only; v0/v7 carry it afterwards)
	 */

AES_ENTRY(aes_cbc_encrypt)
	/* inherently serial (each block chains on the previous ciphertext),
	 * so no interleaved path and no frame record needed */
	cbz		w6, .Lcbcencloop

	ld1		{v0.16b}, [x5]			/* get iv */
	enc_prepare	w3, x2, x5

.Lcbcencloop:
	ld1		{v1.16b}, [x1], #16		/* get next pt block */
	eor		v0.16b, v0.16b, v1.16b		/* ..and xor with iv */
	encrypt_block	v0, w3, x2, x5, w6
	st1		{v0.16b}, [x0], #16		/* ct doubles as next iv */
	subs		w4, w4, #1
	bne		.Lcbcencloop
	ret
AES_ENDPROC(aes_cbc_encrypt)


AES_ENTRY(aes_cbc_decrypt)
	/* v7 holds the iv/previous ct block throughout */
	FRAME_PUSH
	cbz		w6, .LcbcdecloopNx

	ld1		{v7.16b}, [x5]			/* get iv */
	dec_prepare	w3, x2, x5

.LcbcdecloopNx:
#if INTERLEAVE >= 2
	subs		w4, w4, #INTERLEAVE
	bmi		.Lcbcdec1x
#if INTERLEAVE == 2
	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 ct blocks */
	mov		v2.16b, v0.16b		/* save ct: needed as iv */
	mov		v3.16b, v1.16b		/* for the following blocks */
	do_decrypt_block2x
	eor		v0.16b, v0.16b, v7.16b	/* xor with prev iv */
	eor		v1.16b, v1.16b, v2.16b	/* xor with ct of block 0 */
	mov		v7.16b, v3.16b		/* last ct is next iv */
	st1		{v0.16b-v1.16b}, [x0], #32
#else
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
	mov		v4.16b, v0.16b		/* save ct of blocks 0-2: */
	mov		v5.16b, v1.16b		/* they serve as ivs for */
	mov		v6.16b, v2.16b		/* blocks 1-3 */
	do_decrypt_block4x
	sub		x1, x1, #16		/* no spare reg for ct 3, */
	eor		v0.16b, v0.16b, v7.16b
	eor		v1.16b, v1.16b, v4.16b
	ld1		{v7.16b}, [x1], #16	/* ..so reload it as next iv */
	eor		v2.16b, v2.16b, v5.16b
	eor		v3.16b, v3.16b, v6.16b
	st1		{v0.16b-v3.16b}, [x0], #64
#endif
	b		.LcbcdecloopNx
.Lcbcdec1x:
	adds		w4, w4, #INTERLEAVE
	beq		.Lcbcdecout
#endif
.Lcbcdecloop:
	ld1		{v1.16b}, [x1], #16		/* get next ct block */
	mov		v0.16b, v1.16b			/* ...and copy to v0 */
	decrypt_block	v0, w3, x2, x5, w6
	eor		v0.16b, v0.16b, v7.16b		/* xor with iv => pt */
	mov		v7.16b, v1.16b			/* ct is next iv */
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	bne		.Lcbcdecloop
.Lcbcdecout:
	FRAME_POP
	ret
AES_ENDPROC(aes_cbc_decrypt)


	/*
	 * aes_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		   int blocks, u8 ctr[], int first)
	 *
	 * v4 holds the 128-bit big-endian counter block; x5 caches its low
	 * 64 bits byte-swapped to host order so it can be incremented with
	 * plain adds.  'blocks' == -1 on entry means a single half block.
	 */

AES_ENTRY(aes_ctr_encrypt)
	FRAME_PUSH
	cbnz		w6, .Lctrfirst		/* 1st time around? */
	umov		x5, v4.d[1]		/* keep swabbed ctr in reg */
	rev		x5, x5
#if INTERLEAVE >= 2
	cmn		w5, w4			/* 32 bit overflow? */
	bcs		.Lctrinc		/* slow path handles carry */
	add		x5, x5, #1		/* increment BE ctr */
	b		.LctrincNx
#else
	b		.Lctrinc
#endif
.Lctrfirst:
	enc_prepare	w3, x2, x6
	ld1		{v4.16b}, [x5]
	umov		x5, v4.d[1]		/* keep swabbed ctr in reg */
	rev		x5, x5
#if INTERLEAVE >= 2
	cmn		w5, w4			/* 32 bit overflow? */
	bcs		.Lctrloop		/* slow path handles carry */
.LctrloopNx:
	subs		w4, w4, #INTERLEAVE
	bmi		.Lctr1x
#if INTERLEAVE == 2
	mov		v0.8b, v4.8b		/* copy upper ctr halves, */
	mov		v1.8b, v4.8b		/* insert incremented lowers */
	rev		x7, x5
	add		x5, x5, #1
	ins		v0.d[1], x7
	rev		x7, x5
	add		x5, x5, #1
	ins		v1.d[1], x7
	ld1		{v2.16b-v3.16b}, [x1], #32	/* get 2 input blocks */
	do_encrypt_block2x
	eor		v0.16b, v0.16b, v2.16b	/* keystream ^ input */
	eor		v1.16b, v1.16b, v3.16b
	st1		{v0.16b-v1.16b}, [x0], #32
#else
	ldr		q8, =0x30000000200000001	/* addends 1,2,3[,0] */
	dup		v7.4s, w5		/* build ctr+1..ctr+3 lanes */
	mov		v0.16b, v4.16b
	add		v7.4s, v7.4s, v8.4s
	mov		v1.16b, v4.16b
	rev32		v8.16b, v7.16b		/* back to big-endian */
	mov		v2.16b, v4.16b
	mov		v3.16b, v4.16b
	mov		v1.s[3], v8.s[0]
	mov		v2.s[3], v8.s[1]
	mov		v3.s[3], v8.s[2]
	ld1		{v5.16b-v7.16b}, [x1], #48	/* get 3 input blocks */
	do_encrypt_block4x
	eor		v0.16b, v5.16b, v0.16b	/* keystream ^ input */
	ld1		{v5.16b}, [x1], #16	/* get 1 input block */
	eor		v1.16b, v6.16b, v1.16b
	eor		v2.16b, v7.16b, v2.16b
	eor		v3.16b, v5.16b, v3.16b
	st1		{v0.16b-v3.16b}, [x0], #64
	add		x5, x5, #INTERLEAVE
#endif
	cbz		w4, .LctroutNx
.LctrincNx:
	rev		x7, x5			/* sync v4 with x5 */
	ins		v4.d[1], x7
	b		.LctrloopNx
.LctroutNx:
	sub		x5, x5, #1		/* store last-used ctr in v4 */
	rev		x7, x5
	ins		v4.d[1], x7
	b		.Lctrout
.Lctr1x:
	adds		w4, w4, #INTERLEAVE
	beq		.Lctrout
#endif
.Lctrloop:					/* 1 block at a time */
	mov		v0.16b, v4.16b
	encrypt_block	v0, w3, x2, x6, w7
	subs		w4, w4, #1
	bmi		.Lctrhalfblock		/* blocks < 0 means 1/2 block */
	ld1		{v3.16b}, [x1], #16
	eor		v3.16b, v0.16b, v3.16b
	st1		{v3.16b}, [x0], #16
	beq		.Lctrout
.Lctrinc:
	adds		x5, x5, #1		/* increment BE ctr */
	rev		x7, x5
	ins		v4.d[1], x7
	bcc		.Lctrloop		/* no overflow? */
	umov		x7, v4.d[0]		/* load upper word of ctr */
	rev		x7, x7			/* ... to handle the carry */
	add		x7, x7, #1
	rev		x7, x7
	ins		v4.d[0], x7
	b		.Lctrloop
.Lctrhalfblock:
	ld1		{v3.8b}, [x1]		/* only 8 bytes in/out */
	eor		v3.8b, v0.8b, v3.8b
	st1		{v3.8b}, [x0]
.Lctrout:
	FRAME_POP
	ret
AES_ENDPROC(aes_ctr_encrypt)
	.ltorg					/* literal pool for 'ldr q8, =' */


	/*
	 * aes_xts_encrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
	 *		   int blocks, u8 const rk2[], u8 iv[], int first)
	 * aes_xts_decrypt(u8 out[], u8 const in[], u8 const rk1[], int rounds,
	 *		   int blocks, u8 const rk2[], u8 iv[], int first)
	 *
	 * rk1 encrypts/decrypts the data, rk2 encrypts the iv into the
	 * first tweak.
	 */

	/*
	 * Advance the tweak: multiply \in by x in GF(2^128), i.e. shift
	 * left by one bit and, if the top bit was set, xor in 0x87 (the
	 * low byte of the reduction polynomial) - see .Lxts_mul_x below.
	 * \tmp is clobbered; \const must hold .Lxts_mul_x.
	 */
	.macro		next_tweak, out, in, const, tmp
	sshr		\tmp\().2d, \in\().2d, #63	/* broadcast top bits */
	and		\tmp\().16b, \tmp\().16b, \const\().16b
	add		\out\().2d, \in\().2d, \in\().2d	/* shift left */
	ext		\tmp\().16b, \tmp\().16b, \tmp\().16b, #8
	eor		\out\().16b, \out\().16b, \tmp\().16b
	.endm

.Lxts_mul_x:
CPU_LE(	.quad		1, 0x87 )
CPU_BE(	.quad		0x87, 1 )

AES_ENTRY(aes_xts_encrypt)
	/* v4 holds the current tweak, v7 the .Lxts_mul_x constant */
	FRAME_PUSH
	cbz		w7, .LxtsencloopNx

	ld1		{v4.16b}, [x6]
	enc_prepare	w3, x5, x6
	encrypt_block	v4, w3, x5, x6, w7		/* first tweak */
	enc_switch_key	w3, x2, x6
	ldr		q7, .Lxts_mul_x
	b		.LxtsencNx

.LxtsencloopNx:
	ldr		q7, .Lxts_mul_x		/* reload: v7 reused below */
	next_tweak	v4, v4, v7, v8
.LxtsencNx:
#if INTERLEAVE >= 2
	subs		w4, w4, #INTERLEAVE
	bmi		.Lxtsenc1x
#if INTERLEAVE == 2
	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 pt blocks */
	next_tweak	v5, v4, v7, v8
	eor		v0.16b, v0.16b, v4.16b	/* pre-whiten with tweaks */
	eor		v1.16b, v1.16b, v5.16b
	do_encrypt_block2x
	eor		v0.16b, v0.16b, v4.16b	/* post-whiten */
	eor		v1.16b, v1.16b, v5.16b
	st1		{v0.16b-v1.16b}, [x0], #32
	cbz		w4, .LxtsencoutNx
	next_tweak	v4, v5, v7, v8
	b		.LxtsencNx
.LxtsencoutNx:
	mov		v4.16b, v5.16b
	b		.Lxtsencout
#else
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 pt blocks */
	next_tweak	v5, v4, v7, v8
	eor		v0.16b, v0.16b, v4.16b	/* pre-whiten with tweaks */
	next_tweak	v6, v5, v7, v8
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	next_tweak	v7, v6, v7, v8		/* clobbers the constant */
	eor		v3.16b, v3.16b, v7.16b
	do_encrypt_block4x
	eor		v3.16b, v3.16b, v7.16b	/* post-whiten */
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	st1		{v0.16b-v3.16b}, [x0], #64
	mov		v4.16b, v7.16b		/* carry tweak forward */
	cbz		w4, .Lxtsencout
	b		.LxtsencloopNx
#endif
.Lxtsenc1x:
	adds		w4, w4, #INTERLEAVE
	beq		.Lxtsencout
#endif
.Lxtsencloop:					/* 1 block at a time */
	ld1		{v1.16b}, [x1], #16
	eor		v0.16b, v1.16b, v4.16b
	encrypt_block	v0, w3, x2, x6, w7
	eor		v0.16b, v0.16b, v4.16b
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	beq		.Lxtsencout
	next_tweak	v4, v4, v7, v8
	b		.Lxtsencloop
.Lxtsencout:
	FRAME_POP
	ret
AES_ENDPROC(aes_xts_encrypt)


AES_ENTRY(aes_xts_decrypt)
	/* same structure as aes_xts_encrypt; the tweak is still derived
	 * by ENcrypting the iv, only the data blocks are decrypted */
	FRAME_PUSH
	cbz		w7, .LxtsdecloopNx

	ld1		{v4.16b}, [x6]
	enc_prepare	w3, x5, x6
	encrypt_block	v4, w3, x5, x6, w7		/* first tweak */
	dec_prepare	w3, x2, x6
	ldr		q7, .Lxts_mul_x
	b		.LxtsdecNx

.LxtsdecloopNx:
	ldr		q7, .Lxts_mul_x		/* reload: v7 reused below */
	next_tweak	v4, v4, v7, v8
.LxtsdecNx:
#if INTERLEAVE >= 2
	subs		w4, w4, #INTERLEAVE
	bmi		.Lxtsdec1x
#if INTERLEAVE == 2
	ld1		{v0.16b-v1.16b}, [x1], #32	/* get 2 ct blocks */
	next_tweak	v5, v4, v7, v8
	eor		v0.16b, v0.16b, v4.16b	/* pre-whiten with tweaks */
	eor		v1.16b, v1.16b, v5.16b
	do_decrypt_block2x
	eor		v0.16b, v0.16b, v4.16b	/* post-whiten */
	eor		v1.16b, v1.16b, v5.16b
	st1		{v0.16b-v1.16b}, [x0], #32
	cbz		w4, .LxtsdecoutNx
	next_tweak	v4, v5, v7, v8
	b		.LxtsdecNx
.LxtsdecoutNx:
	mov		v4.16b, v5.16b
	b		.Lxtsdecout
#else
	ld1		{v0.16b-v3.16b}, [x1], #64	/* get 4 ct blocks */
	next_tweak	v5, v4, v7, v8
	eor		v0.16b, v0.16b, v4.16b	/* pre-whiten with tweaks */
	next_tweak	v6, v5, v7, v8
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	next_tweak	v7, v6, v7, v8		/* clobbers the constant */
	eor		v3.16b, v3.16b, v7.16b
	do_decrypt_block4x
	eor		v3.16b, v3.16b, v7.16b	/* post-whiten */
	eor		v0.16b, v0.16b, v4.16b
	eor		v1.16b, v1.16b, v5.16b
	eor		v2.16b, v2.16b, v6.16b
	st1		{v0.16b-v3.16b}, [x0], #64
	mov		v4.16b, v7.16b		/* carry tweak forward */
	cbz		w4, .Lxtsdecout
	b		.LxtsdecloopNx
#endif
.Lxtsdec1x:
	adds		w4, w4, #INTERLEAVE
	beq		.Lxtsdecout
#endif
.Lxtsdecloop:					/* 1 block at a time */
	ld1		{v1.16b}, [x1], #16
	eor		v0.16b, v1.16b, v4.16b
	decrypt_block	v0, w3, x2, x6, w7
	eor		v0.16b, v0.16b, v4.16b
	st1		{v0.16b}, [x0], #16
	subs		w4, w4, #1
	beq		.Lxtsdecout
	next_tweak	v4, v4, v7, v8
	b		.Lxtsdecloop
.Lxtsdecout:
	FRAME_POP
	ret
AES_ENDPROC(aes_xts_decrypt)