/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated AES-GCM implementation with ARMv8 Crypto Extensions.
 *
 * Copyright (C) 2015 - 2017 Linaro Ltd.
 * Copyright (C) 2023 Google LLC. <ardb@google.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	.arch		armv8-a
	.fpu		crypto-neon-fp-armv8

	SHASH		.req	q0
	T1		.req	q1
	XL		.req	q2
	XM		.req	q3
	XH		.req	q4
	IN1		.req	q4

	SHASH_L		.req	d0
	SHASH_H		.req	d1
	T1_L		.req	d2
	T1_H		.req	d3
	XL_L		.req	d4
	XL_H		.req	d5
	XM_L		.req	d6
	XM_H		.req	d7
	XH_L		.req	d8

	XH2		.req	q9

	MASK		.req	d28

	SHASH2_p64	.req	d31

	HH		.req	q10
	HH3		.req	q11
	HH4		.req	q12
	HH34		.req	q13

	HH_L		.req	d20
	HH_H		.req	d21
	HH3_L		.req	d22
	HH3_H		.req	d23
	HH4_L		.req	d24
	HH4_H		.req	d25
	HH34_L		.req	d26
	HH34_H		.req	d27
	SHASH2_H	.req	d29

	XL2		.req	q5
	XM2		.req	q6
	T2		.req	q7
	T3		.req	q8

	XL2_L		.req	d10
	XL2_H		.req	d11
	XM2_L		.req	d12
	XM2_H		.req	d13
	T3_L		.req	d16
	T3_H		.req	d17

	.text

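	/*
	 * GHASH multiplies in GF(2^128), using the bit-reflected block
	 * representation defined by the GCM spec. MASK is set to
	 * 0xe1 << 57 in each 64-bit lane (i.e., 0xc200000000000000),
	 * which is the reduction polynomial
	 * g(x) = x^128 + x^7 + x^2 + x + 1 expressed in that reflected
	 * form. __pmull_reduce_p64 folds the middle limbs in XM into
	 * XH:XL and brings the double-width product back to 128 bits
	 * using two carryless multiplies by MASK.
	 *
	 * Each 128x128 bit multiplication uses Karatsuba decomposition:
	 * with a = a1*x^64 + a0 and b = b1*x^64 + b0 (all additions
	 * being XOR),
	 *
	 *   a x b = a1*b1*x^128
	 *         + ((a1 + a0)*(b1 + b0) + a1*b1 + a0*b0)*x^64
	 *         + a0*b0
	 *
	 * which needs only three vmull.p64 instructions per block. The
	 * aggregated path in ghash_update handles four blocks per
	 * iteration, folding them using the precomputed key powers H^4,
	 * H^3, H^2 and H (HH4, HH3, HH and SHASH) so that a single
	 * reduction suffices for every four blocks.
	 */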
	.macro		__pmull_reduce_p64
	vmull.p64	T1, XL_L, MASK

	veor		XH_L, XH_L, XM_H
	vext.8		T1, T1, T1, #8
	veor		XL_H, XL_H, XM_L
	veor		T1, T1, XL

	vmull.p64	XL, T1_H, MASK
	.endm

	.macro		ghash_update, enc, aggregate=1, head=1
	vld1.64		{XL}, [r1]

	.if		\head
	/* do the head block first, if supplied */
	ldr		ip, [sp]
	teq		ip, #0
	beq		0f
	vld1.64		{T1}, [ip]
	teq		r0, #0
	b		3f
	.endif

0:	.if		\aggregate
	tst		r0, #3			// skip until #blocks is a
	bne		2f			// round multiple of 4

	vld1.8		{XL2-XM2}, [r2]!
1:	vld1.8		{T2-T3}, [r2]!

	.ifnb		\enc
	\enc\()_4x	XL2, XM2, T2, T3

	add		ip, r3, #16
	vld1.64		{HH}, [ip, :128]!
	vld1.64		{HH3-HH4}, [ip, :128]

	veor		SHASH2_p64, SHASH_L, SHASH_H
	veor		SHASH2_H, HH_L, HH_H
	veor		HH34_L, HH3_L, HH3_H
	veor		HH34_H, HH4_L, HH4_H

	vmov.i8		MASK, #0xe1
	vshl.u64	MASK, MASK, #57
	.endif

	vrev64.8	XL2, XL2
	vrev64.8	XM2, XM2

	subs		r0, r0, #4

	vext.8		T1, XL2, XL2, #8
	veor		XL2_H, XL2_H, XL_L
	veor		XL, XL, T1

	vrev64.8	T1, T3
	vrev64.8	T3, T2

	vmull.p64	XH, HH4_H, XL_H		// a1 * b1
	veor		XL2_H, XL2_H, XL_H
	vmull.p64	XL, HH4_L, XL_L		// a0 * b0
	vmull.p64	XM, HH34_H, XL2_H	// (a1 + a0)(b1 + b0)

	vmull.p64	XH2, HH3_H, XM2_L	// a1 * b1
	veor		XM2_L, XM2_L, XM2_H
	vmull.p64	XL2, HH3_L, XM2_H	// a0 * b0
	vmull.p64	XM2, HH34_L, XM2_L	// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	vmull.p64	XH2, HH_H, T3_L		// a1 * b1
	veor		T3_L, T3_L, T3_H
	vmull.p64	XL2, HH_L, T3_H		// a0 * b0
	vmull.p64	XM2, SHASH2_H, T3_L	// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	vmull.p64	XH2, SHASH_H, T1_L	// a1 * b1
	veor		T1_L, T1_L, T1_H
	vmull.p64	XL2, SHASH_L, T1_H	// a0 * b0
	vmull.p64	XM2, SHASH2_p64, T1_L	// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	beq		4f

	vld1.8		{XL2-XM2}, [r2]!

	veor		T1, XL, XH
	veor		XM, XM, T1

	__pmull_reduce_p64

	veor		T1, T1, XH
	veor		XL, XL, T1

	b		1b
	.endif

2:	vld1.8		{T1}, [r2]!

	.ifnb		\enc
	\enc\()_1x	T1
	veor		SHASH2_p64, SHASH_L, SHASH_H
	vmov.i8		MASK, #0xe1
	vshl.u64	MASK, MASK, #57
	.endif

	subs		r0, r0, #1

3:	/* multiply XL by SHASH in GF(2^128) */
	vrev64.8	T1, T1

	vext.8		IN1, T1, T1, #8
	veor		T1_L, T1_L, XL_H
	veor		XL, XL, IN1

	vmull.p64	XH, XL_H, SHASH_H	@ a1 * b1
	veor		T1, T1, XL
	vmull.p64	XL, XL_L, SHASH_L	@ a0 * b0
	vmull.p64	XM, T1_L, SHASH2_p64	@ (a1+a0)(b1+b0)

4:	veor		T1, XL, XH
	veor		XM, XM, T1

	__pmull_reduce_p64

	veor		T1, T1, XH
	veor		XL, XL, T1

	bne		0b
	.endm

	/*
	 * void pmull_ghash_update_p64(int blocks, u64 dg[], const char *src,
	 *			       u64 const h[4][2], const char *head)
	 */
ENTRY(pmull_ghash_update_p64)
	vld1.64		{SHASH}, [r3]!
	vld1.64		{HH}, [r3]!
	vld1.64		{HH3-HH4}, [r3]

	veor		SHASH2_p64, SHASH_L, SHASH_H
	veor		SHASH2_H, HH_L, HH_H
	veor		HH34_L, HH3_L, HH3_H
	veor		HH34_H, HH4_L, HH4_H

	vmov.i8		MASK, #0xe1
	vshl.u64	MASK, MASK, #57

	ghash_update
	vst1.64		{XL}, [r1]

	bx		lr
ENDPROC(pmull_ghash_update_p64)

	e0		.req	q9
	e1		.req	q10
	e2		.req	q11
	e3		.req	q12
	e0l		.req	d18
	e0h		.req	d19
	e2l		.req	d22
	e2h		.req	d23
	e3l		.req	d24
	e3h		.req	d25
	ctr		.req	q13
	ctr0		.req	d26
	ctr1		.req	d27

	ek0		.req	q14
	ek1		.req	q15

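	/*
	 * CTR mode key stream generation using the Crypto Extensions.
	 * 'round' performs one AES round on each of the given q
	 * registers: aese.8 covers AddRoundKey/SubBytes/ShiftRows and
	 * aesmc.8 covers MixColumns. aes_encrypt runs the full cipher,
	 * streaming the key schedule through ek0/ek1 in ping-pong
	 * fashion; both leading round pairs are skipped for AES-128
	 * (10 rounds) and one of them for AES-192 (12 rounds), and the
	 * final round uses aese.8 without MixColumns, followed by a
	 * veor with the last round key.
	 *
	 * The pmull_aes_encrypt* helpers construct the counter blocks
	 * as the 12-byte IV followed by the 32-bit block counter taken
	 * from r7 in big endian order, and encrypt them with the AES
	 * round keys, which are stored at offset #64 in struct gcm_key,
	 * right after the four GHASH key powers.
	 */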
	.macro		round, rk:req, regs:vararg
	.irp		r, \regs
	aese.8		\r, \rk
	aesmc.8		\r, \r
	.endr
	.endm

	.macro		aes_encrypt, rkp, rounds, regs:vararg
	vld1.8		{ek0-ek1}, [\rkp, :128]!
	cmp		\rounds, #12
	blt		.L\@			// AES-128

	round		ek0, \regs
	vld1.8		{ek0}, [\rkp, :128]!
	round		ek1, \regs
	vld1.8		{ek1}, [\rkp, :128]!

	beq		.L\@			// AES-192

	round		ek0, \regs
	vld1.8		{ek0}, [\rkp, :128]!
	round		ek1, \regs
	vld1.8		{ek1}, [\rkp, :128]!

.L\@:	.rept		4
	round		ek0, \regs
	vld1.8		{ek0}, [\rkp, :128]!
	round		ek1, \regs
	vld1.8		{ek1}, [\rkp, :128]!
	.endr

	round		ek0, \regs
	vld1.8		{ek0}, [\rkp, :128]

	.irp		r, \regs
	aese.8		\r, ek1
	.endr
	.irp		r, \regs
	veor		\r, \r, ek0
	.endr
	.endm

pmull_aes_encrypt:
	add		ip, r5, #4
	vld1.8		{ctr0}, [r5]		// load 12 byte IV
	vld1.8		{ctr1}, [ip]
	rev		r8, r7
	vext.8		ctr1, ctr1, ctr1, #4
	add		r7, r7, #1
	vmov.32		ctr1[1], r8
	vmov		e0, ctr

	add		ip, r3, #64
	aes_encrypt	ip, r6, e0
	bx		lr
ENDPROC(pmull_aes_encrypt)

pmull_aes_encrypt_4x:
	add		ip, r5, #4
	vld1.8		{ctr0}, [r5]
	vld1.8		{ctr1}, [ip]
	rev		r8, r7
	vext.8		ctr1, ctr1, ctr1, #4
	add		r7, r7, #1
	vmov.32		ctr1[1], r8
	rev		ip, r7
	vmov		e0, ctr
	add		r7, r7, #1
	vmov.32		ctr1[1], ip
	rev		r8, r7
	vmov		e1, ctr
	add		r7, r7, #1
	vmov.32		ctr1[1], r8
	rev		ip, r7
	vmov		e2, ctr
	add		r7, r7, #1
	vmov.32		ctr1[1], ip
	vmov		e3, ctr

	add		ip, r3, #64
	aes_encrypt	ip, r6, e0, e1, e2, e3
	bx		lr
ENDPROC(pmull_aes_encrypt_4x)

pmull_aes_encrypt_final:
	add		ip, r5, #4
	vld1.8		{ctr0}, [r5]
	vld1.8		{ctr1}, [ip]
	rev		r8, r7
	vext.8		ctr1, ctr1, ctr1, #4
	mov		r7, #1 << 24		// BE #1 for the tag
	vmov.32		ctr1[1], r8
	vmov		e0, ctr
	vmov.32		ctr1[1], r7
	vmov		e1, ctr

	add		ip, r3, #64
	aes_encrypt	ip, r6, e0, e1
	bx		lr
ENDPROC(pmull_aes_encrypt_final)

	.macro		enc_1x, in0
	bl		pmull_aes_encrypt
	veor		\in0, \in0, e0
	vst1.8		{\in0}, [r4]!
	.endm

	.macro		dec_1x, in0
	bl		pmull_aes_encrypt
	veor		e0, e0, \in0
	vst1.8		{e0}, [r4]!
	.endm

	.macro		enc_4x, in0, in1, in2, in3
	bl		pmull_aes_encrypt_4x

	veor		\in0, \in0, e0
	veor		\in1, \in1, e1
	veor		\in2, \in2, e2
	veor		\in3, \in3, e3

	vst1.8		{\in0-\in1}, [r4]!
	vst1.8		{\in2-\in3}, [r4]!
	.endm

	.macro		dec_4x, in0, in1, in2, in3
	bl		pmull_aes_encrypt_4x

	veor		e0, e0, \in0
	veor		e1, e1, \in1
	veor		e2, e2, \in2
	veor		e3, e3, \in3

	vst1.8		{e0-e1}, [r4]!
	vst1.8		{e2-e3}, [r4]!
	.endm

	/*
	 * void pmull_gcm_encrypt(int blocks, u64 dg[], const char *src,
	 *			  struct gcm_key const *k, char *dst,
	 *			  char *iv, int rounds, u32 counter)
	 */
ENTRY(pmull_gcm_encrypt)
	push		{r4-r8, lr}
	ldrd		r4, r5, [sp, #24]
	ldrd		r6, r7, [sp, #32]

	vld1.64		{SHASH}, [r3]

	ghash_update	enc, head=0
	vst1.64		{XL}, [r1]

	pop		{r4-r8, pc}
ENDPROC(pmull_gcm_encrypt)

	/*
	 * void pmull_gcm_decrypt(int blocks, u64 dg[], const char *src,
	 *			  struct gcm_key const *k, char *dst,
	 *			  char *iv, int rounds, u32 counter)
	 */
ENTRY(pmull_gcm_decrypt)
	push		{r4-r8, lr}
	ldrd		r4, r5, [sp, #24]
	ldrd		r6, r7, [sp, #32]

	vld1.64		{SHASH}, [r3]

	ghash_update	dec, head=0
	vst1.64		{XL}, [r1]

	pop		{r4-r8, pc}
ENDPROC(pmull_gcm_decrypt)

	/*
	 * void pmull_gcm_enc_final(int bytes, u64 dg[], char *tag,
	 *			    struct gcm_key const *k, char *head,
	 *			    char *iv, int rounds, u32 counter)
	 */
ENTRY(pmull_gcm_enc_final)
	push		{r4-r8, lr}
	ldrd		r4, r5, [sp, #24]
	ldrd		r6, r7, [sp, #32]

	bl		pmull_aes_encrypt_final

	cmp		r0, #0
	beq		.Lenc_final

	mov_l		ip, .Lpermute
	sub		r4, r4, #16
	add		r8, ip, r0
	add		ip, ip, #32
	add		r4, r4, r0
	sub		ip, ip, r0

	vld1.8		{e3}, [r8]		// permute vector for key stream
	vld1.8		{e2}, [ip]		// permute vector for ghash input

	vtbl.8		e3l, {e0}, e3l
	vtbl.8		e3h, {e0}, e3h

	vld1.8		{e0}, [r4]		// encrypt tail block
	veor		e0, e0, e3
	vst1.8		{e0}, [r4]

	vtbl.8		T1_L, {e0}, e2l
	vtbl.8		T1_H, {e0}, e2h

	vld1.64		{XL}, [r1]
.Lenc_final:
	vld1.64		{SHASH}, [r3, :128]
	vmov.i8		MASK, #0xe1
	veor		SHASH2_p64, SHASH_L, SHASH_H
	vshl.u64	MASK, MASK, #57
	mov		r0, #1
	bne		3f			// process head block first
	ghash_update	aggregate=0, head=0

	vrev64.8	XL, XL
	vext.8		XL, XL, XL, #8
	veor		XL, XL, e1

	sub		r2, r2, #16		// rewind src pointer
	vst1.8		{XL}, [r2]		// store tag

	pop		{r4-r8, pc}
ENDPROC(pmull_gcm_enc_final)

	/*
	 * int pmull_gcm_dec_final(int bytes, u64 dg[], char *tag,
	 *			   struct gcm_key const *k, char *head,
	 *			   char *iv, int rounds, u32 counter,
	 *			   const char *otag, int authsize)
	 */
ENTRY(pmull_gcm_dec_final)
	push		{r4-r8, lr}
	ldrd		r4, r5, [sp, #24]
	ldrd		r6, r7, [sp, #32]

	bl		pmull_aes_encrypt_final

	cmp		r0, #0
	beq		.Ldec_final

	mov_l		ip, .Lpermute
	sub		r4, r4, #16
	add		r8, ip, r0
	add		ip, ip, #32
	add		r4, r4, r0
	sub		ip, ip, r0

	vld1.8		{e3}, [r8]		// permute vector for key stream
	vld1.8		{e2}, [ip]		// permute vector for ghash input

	vtbl.8		e3l, {e0}, e3l
	vtbl.8		e3h, {e0}, e3h

	vld1.8		{e0}, [r4]

	vtbl.8		T1_L, {e0}, e2l
	vtbl.8		T1_H, {e0}, e2h

	veor		e0, e0, e3
	vst1.8		{e0}, [r4]

	vld1.64		{XL}, [r1]
.Ldec_final:
	vld1.64		{SHASH}, [r3]
	vmov.i8		MASK, #0xe1
	veor		SHASH2_p64, SHASH_L, SHASH_H
	vshl.u64	MASK, MASK, #57
	mov		r0, #1
	bne		3f			// process head block first
	ghash_update	aggregate=0, head=0

	vrev64.8	XL, XL
	vext.8		XL, XL, XL, #8
	veor		XL, XL, e1

	mov_l		ip, .Lpermute
	ldrd		r2, r3, [sp, #40]	// otag and authsize
	vld1.8		{T1}, [r2]
	add		ip, ip, r3
	vceq.i8		T1, T1, XL		// compare tags
	vmvn		T1, T1			// 0 for eq, -1 for ne

	vld1.8		{e0}, [ip]
	vtbl.8		XL_L, {T1}, e0l		// keep authsize bytes only
	vtbl.8		XL_H, {T1}, e0h

	vpmin.s8	XL_L, XL_L, XL_H	// take the minimum s8 across the vector
	vpmin.s8	XL_L, XL_L, XL_L
	vmov.32		r0, XL_L[0]		// fail if != 0x0

	pop		{r4-r8, pc}
ENDPROC(pmull_gcm_dec_final)

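	/*
	 * Permute table for handling a partial final block. The 16
	 * identity byte indexes below are surrounded on either side by
	 * 16 bytes of 0xff, for which vtbl.8 returns zero. Loading a
	 * permute vector at an offset derived from the number of
	 * remaining bytes therefore produces a vtbl operand that shifts
	 * the key stream or the ciphertext into position while clearing
	 * the unused bytes.
	 */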
	.section	".rodata", "a", %progbits
	.align		5
.Lpermute:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff