/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Accelerated GHASH implementation with NEON/ARMv8 vmull.p8/64 instructions.
 *
 * Copyright (C) 2015 - 2017 Linaro Ltd.
 * Copyright (C) 2023 Google LLC. <ardb@google.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	.arch		armv8-a
	.fpu		crypto-neon-fp-armv8

	SHASH		.req	q0
	T1		.req	q1
	XL		.req	q2
	XM		.req	q3
	XH		.req	q4
	IN1		.req	q4

	SHASH_L		.req	d0
	SHASH_H		.req	d1
	T1_L		.req	d2
	T1_H		.req	d3
	XL_L		.req	d4
	XL_H		.req	d5
	XM_L		.req	d6
	XM_H		.req	d7
	XH_L		.req	d8

	t0l		.req	d10
	t0h		.req	d11
	t1l		.req	d12
	t1h		.req	d13
	t2l		.req	d14
	t2h		.req	d15
	t3l		.req	d16
	t3h		.req	d17
	t4l		.req	d18
	t4h		.req	d19

	t0q		.req	q5
	t1q		.req	q6
	t2q		.req	q7
	t3q		.req	q8
	t4q		.req	q9
	XH2		.req	q9

	s1l		.req	d20
	s1h		.req	d21
	s2l		.req	d22
	s2h		.req	d23
	s3l		.req	d24
	s3h		.req	d25
	s4l		.req	d26
	s4h		.req	d27

	MASK		.req	d28
	SHASH2_p8	.req	d28

	k16		.req	d29
	k32		.req	d30
	k48		.req	d31
	SHASH2_p64	.req	d31

	HH		.req	q10
	HH3		.req	q11
	HH4		.req	q12
	HH34		.req	q13

	HH_L		.req	d20
	HH_H		.req	d21
	HH3_L		.req	d22
	HH3_H		.req	d23
	HH4_L		.req	d24
	HH4_H		.req	d25
	HH34_L		.req	d26
	HH34_H		.req	d27
	SHASH2_H	.req	d29

	XL2		.req	q5
	XM2		.req	q6
	T2		.req	q7
	T3		.req	q8

	XL2_L		.req	d10
	XL2_H		.req	d11
	XM2_L		.req	d12
	XM2_H		.req	d13
	T3_L		.req	d16
	T3_H		.req	d17

	.text

	.macro		__pmull_p64, rd, rn, rm, b1, b2, b3, b4
	vmull.p64	\rd, \rn, \rm
	.endm

	/*
	 * This implementation of 64x64 -> 128 bit polynomial multiplication
	 * using vmull.p8 instructions (8x8 -> 16) is taken from the paper
	 * "Fast Software Polynomial Multiplication on ARM Processors Using
	 * the NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and
	 * Ricardo Dahab (https://hal.inria.fr/hal-01506572)
	 *
	 * It has been slightly tweaked for in-order performance, and to allow
	 * 'rq' to overlap with 'ad' or 'bd'.
	 */
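	/*
	 * For reference: the __pmull_p8 macro below computes the plain
	 * carryless (polynomial) product of two 64-bit inputs, i.e. the
	 * same 128-bit result as a single vmull.p64.  In rough C
	 * (illustration only, this helper is not part of the kernel
	 * sources):
	 *
	 *	static void clmul64(u64 a, u64 b, u64 *lo, u64 *hi)
	 *	{
	 *		u64 l = 0, h = 0;
	 *		int i;
	 *
	 *		for (i = 0; i < 64; i++)
	 *			if ((b >> i) & 1) {
	 *				l ^= a << i;
	 *				if (i)
	 *					h ^= a >> (64 - i);
	 *			}
	 *		*lo = l;
	 *		*hi = h;
	 *	}
	 *
	 * The macro assembles the same result from eight vmull.p8
	 * (8x8 -> 16 bit) multiplies of byte-rotated copies of the inputs,
	 * with the partial products masked (k16/k32/k48) and byte-shifted
	 * into position before being XORed together.
	 */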
	.macro		__pmull_p8, rq, ad, bd, b1=t4l, b2=t3l, b3=t4l, b4=t3l
	vext.8		t0l, \ad, \ad, #1	@ A1
	.ifc		\b1, t4l
	vext.8		t4l, \bd, \bd, #1	@ B1
	.endif
	vmull.p8	t0q, t0l, \bd		@ F = A1*B
	vext.8		t1l, \ad, \ad, #2	@ A2
	vmull.p8	t4q, \ad, \b1		@ E = A*B1
	.ifc		\b2, t3l
	vext.8		t3l, \bd, \bd, #2	@ B2
	.endif
	vmull.p8	t1q, t1l, \bd		@ H = A2*B
	vext.8		t2l, \ad, \ad, #3	@ A3
	vmull.p8	t3q, \ad, \b2		@ G = A*B2
	veor		t0q, t0q, t4q		@ L = E + F
	.ifc		\b3, t4l
	vext.8		t4l, \bd, \bd, #3	@ B3
	.endif
	vmull.p8	t2q, t2l, \bd		@ J = A3*B
	veor		t0l, t0l, t0h		@ t0 = (L) (P0 + P1) << 8
	veor		t1q, t1q, t3q		@ M = G + H
	.ifc		\b4, t3l
	vext.8		t3l, \bd, \bd, #4	@ B4
	.endif
	vmull.p8	t4q, \ad, \b3		@ I = A*B3
	veor		t1l, t1l, t1h		@ t1 = (M) (P2 + P3) << 16
	vmull.p8	t3q, \ad, \b4		@ K = A*B4
	vand		t0h, t0h, k48
	vand		t1h, t1h, k32
	veor		t2q, t2q, t4q		@ N = I + J
	veor		t0l, t0l, t0h
	veor		t1l, t1l, t1h
	veor		t2l, t2l, t2h		@ t2 = (N) (P4 + P5) << 24
	vand		t2h, t2h, k16
	veor		t3l, t3l, t3h		@ t3 = (K) (P6 + P7) << 32
	vmov.i64	t3h, #0
	vext.8		t0q, t0q, t0q, #15
	veor		t2l, t2l, t2h
	vext.8		t1q, t1q, t1q, #14
	vmull.p8	\rq, \ad, \bd		@ D = A*B
	vext.8		t2q, t2q, t2q, #13
	vext.8		t3q, t3q, t3q, #12
	veor		t0q, t0q, t1q
	veor		t2q, t2q, t3q
	veor		\rq, \rq, t0q
	veor		\rq, \rq, t2q
	.endm

	//
	// PMULL (64x64->128) based reduction for CPUs that can do
	// it in a single instruction.
	//
	.macro		__pmull_reduce_p64
	vmull.p64	T1, XL_L, MASK

	veor		XH_L, XH_L, XM_H
	vext.8		T1, T1, T1, #8
	veor		XL_H, XL_H, XM_L
	veor		T1, T1, XL

	vmull.p64	XL, T1_H, MASK
	.endm

	//
	// Alternative reduction for CPUs that lack support for the
	// 64x64->128 PMULL instruction
	//
	.macro		__pmull_reduce_p8
	veor		XL_H, XL_H, XM_L
	veor		XH_L, XH_L, XM_H

	vshl.i64	T1, XL, #57
	vshl.i64	T2, XL, #62
	veor		T1, T1, T2
	vshl.i64	T2, XL, #63
	veor		T1, T1, T2
	veor		XL_H, XL_H, T1_L
	veor		XH_L, XH_L, T1_H

	vshr.u64	T1, XL, #1
	veor		XH, XH, XL
	veor		XL, XL, T1
	vshr.u64	T1, T1, #6
	vshr.u64	XL, XL, #1
	.endm

	.macro		ghash_update, pn, enc, aggregate=1, head=1
	vld1.64		{XL}, [r1]

	.if		\head
	/* do the head block first, if supplied */
	ldr		ip, [sp]
	teq		ip, #0
	beq		0f
	vld1.64		{T1}, [ip]
	teq		r0, #0
	b		3f
	.endif

0:	.ifc		\pn, p64
	.if		\aggregate
	tst		r0, #3			// skip until #blocks is a
	bne		2f			// round multiple of 4

	vld1.8		{XL2-XM2}, [r2]!
1:	vld1.8		{T2-T3}, [r2]!

	.ifnb		\enc
	\enc\()_4x	XL2, XM2, T2, T3
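
	// (Re)load H^2..H^4 and recompute the Karatsuba middle factors and
	// the reduction mask on every pass: the AES code invoked via
	// enc_4x/dec_4x above clobbers q9-q15, which is where HH, HH3, HH4,
	// HH34, MASK and SHASH2_p64 live.  (The GCM entry points themselves
	// only load SHASH up front.)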
	add		ip, r3, #16
	vld1.64		{HH}, [ip, :128]!
	vld1.64		{HH3-HH4}, [ip, :128]

	veor		SHASH2_p64, SHASH_L, SHASH_H
	veor		SHASH2_H, HH_L, HH_H
	veor		HH34_L, HH3_L, HH3_H
	veor		HH34_H, HH4_L, HH4_H

	vmov.i8		MASK, #0xe1
	vshl.u64	MASK, MASK, #57
	.endif

	vrev64.8	XL2, XL2
	vrev64.8	XM2, XM2

	subs		r0, r0, #4

	vext.8		T1, XL2, XL2, #8
	veor		XL2_H, XL2_H, XL_L
	veor		XL, XL, T1

	vrev64.8	T1, T3
	vrev64.8	T3, T2

	vmull.p64	XH, HH4_H, XL_H			// a1 * b1
	veor		XL2_H, XL2_H, XL_H
	vmull.p64	XL, HH4_L, XL_L			// a0 * b0
	vmull.p64	XM, HH34_H, XL2_H		// (a1 + a0)(b1 + b0)

	vmull.p64	XH2, HH3_H, XM2_L		// a1 * b1
	veor		XM2_L, XM2_L, XM2_H
	vmull.p64	XL2, HH3_L, XM2_H		// a0 * b0
	vmull.p64	XM2, HH34_L, XM2_L		// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	vmull.p64	XH2, HH_H, T3_L			// a1 * b1
	veor		T3_L, T3_L, T3_H
	vmull.p64	XL2, HH_L, T3_H			// a0 * b0
	vmull.p64	XM2, SHASH2_H, T3_L		// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	vmull.p64	XH2, SHASH_H, T1_L		// a1 * b1
	veor		T1_L, T1_L, T1_H
	vmull.p64	XL2, SHASH_L, T1_H		// a0 * b0
	vmull.p64	XM2, SHASH2_p64, T1_L		// (a1 + a0)(b1 + b0)

	veor		XH, XH, XH2
	veor		XL, XL, XL2
	veor		XM, XM, XM2

	beq		4f

	vld1.8		{XL2-XM2}, [r2]!

	veor		T1, XL, XH
	veor		XM, XM, T1

	__pmull_reduce_p64

	veor		T1, T1, XH
	veor		XL, XL, T1

	b		1b
	.endif
	.endif

2:	vld1.8		{T1}, [r2]!

	.ifnb		\enc
	\enc\()_1x	T1
	veor		SHASH2_p64, SHASH_L, SHASH_H
	vmov.i8		MASK, #0xe1
	vshl.u64	MASK, MASK, #57
	.endif

	subs		r0, r0, #1

3:	/* multiply XL by SHASH in GF(2^128) */
	vrev64.8	T1, T1

	vext.8		IN1, T1, T1, #8
	veor		T1_L, T1_L, XL_H
	veor		XL, XL, IN1

	__pmull_\pn	XH, XL_H, SHASH_H, s1h, s2h, s3h, s4h	@ a1 * b1
	veor		T1, T1, XL
	__pmull_\pn	XL, XL_L, SHASH_L, s1l, s2l, s3l, s4l	@ a0 * b0
	__pmull_\pn	XM, T1_L, SHASH2_\pn			@ (a1+a0)(b1+b0)

4:	veor		T1, XL, XH
	veor		XM, XM, T1

	__pmull_reduce_\pn

	veor		T1, T1, XH
	veor		XL, XL, T1

	bne		0b
	.endm
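
	/*
	 * Each 16-byte block processed by ghash_update is folded into the
	 * digest as dg = (dg ^ block) * H in GF(2^128), reduced modulo the
	 * GHASH polynomial x^128 + x^7 + x^2 + x + 1 (whose low terms are
	 * what the 0xe1 byte in the reduction constants above encodes).
	 * As a plain C reference, the GF(2^128) multiplication defined by
	 * NIST SP 800-38D looks roughly as follows (illustration only, not
	 * part of the kernel sources; kernel-style u8 types assumed):
	 *
	 *	static void gf128_mul(u8 z[16], const u8 x[16], const u8 y[16])
	 *	{
	 *		u8 v[16], r[16] = {};
	 *		int i, j, lsb;
	 *
	 *		memcpy(v, y, 16);
	 *		for (i = 0; i < 128; i++) {
	 *			if (x[i / 8] & (0x80 >> (i % 8)))	// bit i of x set?
	 *				for (j = 0; j < 16; j++)
	 *					r[j] ^= v[j];		// r ^= v
	 *
	 *			lsb = v[15] & 1;			// v = v * x mod poly
	 *			for (j = 15; j > 0; j--)
	 *				v[j] = (v[j] >> 1) | (v[j - 1] << 7);
	 *			v[0] >>= 1;
	 *			if (lsb)
	 *				v[0] ^= 0xe1;
	 *		}
	 *		memcpy(z, r, 16);
	 *	}
	 *
	 * The code above computes the same result much faster: three 64x64
	 * polynomial multiplies per block (Karatsuba) via vmull.p64 or the
	 * vmull.p8 fallback, with the final reduction amortized over four
	 * blocks whenever the remaining block count is a multiple of four.
	 */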

	/*
	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
	 *			   struct ghash_key const *k, const char *head)
	 */
ENTRY(pmull_ghash_update_p64)
	vld1.64		{SHASH}, [r3]!
	vld1.64		{HH}, [r3]!
	vld1.64		{HH3-HH4}, [r3]

	veor		SHASH2_p64, SHASH_L, SHASH_H
	veor		SHASH2_H, HH_L, HH_H
	veor		HH34_L, HH3_L, HH3_H
	veor		HH34_H, HH4_L, HH4_H

	vmov.i8		MASK, #0xe1
	vshl.u64	MASK, MASK, #57

	ghash_update	p64
	vst1.64		{XL}, [r1]

	bx		lr
ENDPROC(pmull_ghash_update_p64)

ENTRY(pmull_ghash_update_p8)
	vld1.64		{SHASH}, [r3]
	veor		SHASH2_p8, SHASH_L, SHASH_H

	vext.8		s1l, SHASH_L, SHASH_L, #1
	vext.8		s2l, SHASH_L, SHASH_L, #2
	vext.8		s3l, SHASH_L, SHASH_L, #3
	vext.8		s4l, SHASH_L, SHASH_L, #4
	vext.8		s1h, SHASH_H, SHASH_H, #1
	vext.8		s2h, SHASH_H, SHASH_H, #2
	vext.8		s3h, SHASH_H, SHASH_H, #3
	vext.8		s4h, SHASH_H, SHASH_H, #4

	vmov.i64	k16, #0xffff
	vmov.i64	k32, #0xffffffff
	vmov.i64	k48, #0xffffffffffff

	ghash_update	p8
	vst1.64		{XL}, [r1]

	bx		lr
ENDPROC(pmull_ghash_update_p8)

	e0		.req	q9
	e1		.req	q10
	e2		.req	q11
	e3		.req	q12
	e0l		.req	d18
	e0h		.req	d19
	e2l		.req	d22
	e2h		.req	d23
	e3l		.req	d24
	e3h		.req	d25
	ctr		.req	q13
	ctr0		.req	d26
	ctr1		.req	d27

	ek0		.req	q14
	ek1		.req	q15

	.macro		round, rk:req, regs:vararg
	.irp		r, \regs
	aese.8		\r, \rk
	aesmc.8		\r, \r
	.endr
	.endm

	.macro		aes_encrypt, rkp, rounds, regs:vararg
	vld1.8		{ek0-ek1}, [\rkp, :128]!
	cmp		\rounds, #12
	blt		.L\@			// AES-128

	round		ek0, \regs
	vld1.8		{ek0}, [\rkp, :128]!
	round		ek1, \regs
	vld1.8		{ek1}, [\rkp, :128]!

	beq		.L\@			// AES-192

	round		ek0, \regs
	vld1.8		{ek0}, [\rkp, :128]!
	round		ek1, \regs
	vld1.8		{ek1}, [\rkp, :128]!

.L\@:	.rept		4
	round		ek0, \regs
	vld1.8		{ek0}, [\rkp, :128]!
	round		ek1, \regs
	vld1.8		{ek1}, [\rkp, :128]!
	.endr

	round		ek0, \regs
	vld1.8		{ek0}, [\rkp, :128]

	.irp		r, \regs
	aese.8		\r, ek1
	.endr
	.irp		r, \regs
	veor		\r, \r, ek0
	.endr
	.endm

pmull_aes_encrypt:
	add		ip, r5, #4
	vld1.8		{ctr0}, [r5]		// load 12 byte IV
	vld1.8		{ctr1}, [ip]
	rev		r8, r7
	vext.8		ctr1, ctr1, ctr1, #4
	add		r7, r7, #1
	vmov.32		ctr1[1], r8
	vmov		e0, ctr

	add		ip, r3, #64
	aes_encrypt	ip, r6, e0
	bx		lr
ENDPROC(pmull_aes_encrypt)

pmull_aes_encrypt_4x:
	add		ip, r5, #4
	vld1.8		{ctr0}, [r5]
	vld1.8		{ctr1}, [ip]
	rev		r8, r7
	vext.8		ctr1, ctr1, ctr1, #4
	add		r7, r7, #1
	vmov.32		ctr1[1], r8
	rev		ip, r7
	vmov		e0, ctr
	add		r7, r7, #1
	vmov.32		ctr1[1], ip
	rev		r8, r7
	vmov		e1, ctr
	add		r7, r7, #1
	vmov.32		ctr1[1], r8
	rev		ip, r7
	vmov		e2, ctr
	add		r7, r7, #1
	vmov.32		ctr1[1], ip
	vmov		e3, ctr

	add		ip, r3, #64
	aes_encrypt	ip, r6, e0, e1, e2, e3
	bx		lr
ENDPROC(pmull_aes_encrypt_4x)

pmull_aes_encrypt_final:
	add		ip, r5, #4
	vld1.8		{ctr0}, [r5]
	vld1.8		{ctr1}, [ip]
	rev		r8, r7
	vext.8		ctr1, ctr1, ctr1, #4
	mov		r7, #1 << 24		// BE #1 for the tag
	vmov.32		ctr1[1], r8
	vmov		e0, ctr
	vmov.32		ctr1[1], r7
	vmov		e1, ctr

	add		ip, r3, #64
	aes_encrypt	ip, r6, e0, e1
	bx		lr
ENDPROC(pmull_aes_encrypt_final)
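
	/*
	 * The enc_1x/dec_1x and enc_4x/dec_4x helpers below are emitted into
	 * ghash_update via its \enc argument, and are what turns the plain
	 * GHASH loop into GCM: the pmull_aes_encrypt* routines above produce
	 * the CTR key stream, which is XORed with the input block(s) and
	 * stored to the destination (r4).  For encryption the freshly
	 * produced ciphertext is then hashed; for decryption the incoming
	 * ciphertext is hashed as-is.
	 */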

	.macro		enc_1x, in0
	bl		pmull_aes_encrypt
	veor		\in0, \in0, e0
	vst1.8		{\in0}, [r4]!
	.endm

	.macro		dec_1x, in0
	bl		pmull_aes_encrypt
	veor		e0, e0, \in0
	vst1.8		{e0}, [r4]!
	.endm

	.macro		enc_4x, in0, in1, in2, in3
	bl		pmull_aes_encrypt_4x

	veor		\in0, \in0, e0
	veor		\in1, \in1, e1
	veor		\in2, \in2, e2
	veor		\in3, \in3, e3

	vst1.8		{\in0-\in1}, [r4]!
	vst1.8		{\in2-\in3}, [r4]!
	.endm

	.macro		dec_4x, in0, in1, in2, in3
	bl		pmull_aes_encrypt_4x

	veor		e0, e0, \in0
	veor		e1, e1, \in1
	veor		e2, e2, \in2
	veor		e3, e3, \in3

	vst1.8		{e0-e1}, [r4]!
	vst1.8		{e2-e3}, [r4]!
	.endm

	/*
	 * void pmull_gcm_encrypt(int blocks, u64 dg[], const char *src,
	 *			  struct gcm_key const *k, char *dst,
	 *			  char *iv, int rounds, u32 counter)
	 */
ENTRY(pmull_gcm_encrypt)
	push		{r4-r8, lr}
	ldrd		r4, r5, [sp, #24]
	ldrd		r6, r7, [sp, #32]

	vld1.64		{SHASH}, [r3]

	ghash_update	p64, enc, head=0
	vst1.64		{XL}, [r1]

	pop		{r4-r8, pc}
ENDPROC(pmull_gcm_encrypt)

	/*
	 * void pmull_gcm_decrypt(int blocks, u64 dg[], const char *src,
	 *			  struct gcm_key const *k, char *dst,
	 *			  char *iv, int rounds, u32 counter)
	 */
ENTRY(pmull_gcm_decrypt)
	push		{r4-r8, lr}
	ldrd		r4, r5, [sp, #24]
	ldrd		r6, r7, [sp, #32]

	vld1.64		{SHASH}, [r3]

	ghash_update	p64, dec, head=0
	vst1.64		{XL}, [r1]

	pop		{r4-r8, pc}
ENDPROC(pmull_gcm_decrypt)

	/*
	 * void pmull_gcm_enc_final(int bytes, u64 dg[], char *tag,
	 *			    struct gcm_key const *k, char *head,
	 *			    char *iv, int rounds, u32 counter)
	 */
ENTRY(pmull_gcm_enc_final)
	push		{r4-r8, lr}
	ldrd		r4, r5, [sp, #24]
	ldrd		r6, r7, [sp, #32]

	bl		pmull_aes_encrypt_final

	cmp		r0, #0
	beq		.Lenc_final

	mov_l		ip, .Lpermute
	sub		r4, r4, #16
	add		r8, ip, r0
	add		ip, ip, #32
	add		r4, r4, r0
	sub		ip, ip, r0

	vld1.8		{e3}, [r8]		// permute vector for key stream
	vld1.8		{e2}, [ip]		// permute vector for ghash input

	vtbl.8		e3l, {e0}, e3l
	vtbl.8		e3h, {e0}, e3h

	vld1.8		{e0}, [r4]		// encrypt tail block
	veor		e0, e0, e3
	vst1.8		{e0}, [r4]

	vtbl.8		T1_L, {e0}, e2l
	vtbl.8		T1_H, {e0}, e2h

	vld1.64		{XL}, [r1]
.Lenc_final:
	vld1.64		{SHASH}, [r3, :128]
	vmov.i8		MASK, #0xe1
	veor		SHASH2_p64, SHASH_L, SHASH_H
	vshl.u64	MASK, MASK, #57
	mov		r0, #1
	bne		3f			// process head block first
	ghash_update	p64, aggregate=0, head=0

	vrev64.8	XL, XL
	vext.8		XL, XL, XL, #8
	veor		XL, XL, e1

	sub		r2, r2, #16		// rewind src pointer
	vst1.8		{XL}, [r2]		// store tag

	pop		{r4-r8, pc}
ENDPROC(pmull_gcm_enc_final)
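
	/*
	 * Like pmull_gcm_enc_final above, pmull_gcm_dec_final handles a
	 * trailing partial block (r0 = number of leftover bytes) by loading
	 * two permute vectors from .Lpermute: one aligns the CTR key stream
	 * with the tail bytes, the other zero-pads the tail to a full
	 * 16-byte GHASH input (the 0xff table entries make vtbl.8 return
	 * zero in the padded lanes).  It then verifies the tag without
	 * data-dependent branches: vceq/vmvn leave 0x00 in matching byte
	 * lanes and 0xff in mismatching ones, vtbl discards all but the
	 * first authsize lanes, and the two vpmin steps fold the result so
	 * that r0 is returned as zero only if every checked byte matched.
	 */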

	/*
	 * int pmull_gcm_dec_final(int bytes, u64 dg[], char *tag,
	 *			   struct gcm_key const *k, char *head,
	 *			   char *iv, int rounds, u32 counter,
	 *			   const char *otag, int authsize)
	 */
ENTRY(pmull_gcm_dec_final)
	push		{r4-r8, lr}
	ldrd		r4, r5, [sp, #24]
	ldrd		r6, r7, [sp, #32]

	bl		pmull_aes_encrypt_final

	cmp		r0, #0
	beq		.Ldec_final

	mov_l		ip, .Lpermute
	sub		r4, r4, #16
	add		r8, ip, r0
	add		ip, ip, #32
	add		r4, r4, r0
	sub		ip, ip, r0

	vld1.8		{e3}, [r8]		// permute vector for key stream
	vld1.8		{e2}, [ip]		// permute vector for ghash input

	vtbl.8		e3l, {e0}, e3l
	vtbl.8		e3h, {e0}, e3h

	vld1.8		{e0}, [r4]

	vtbl.8		T1_L, {e0}, e2l
	vtbl.8		T1_H, {e0}, e2h

	veor		e0, e0, e3
	vst1.8		{e0}, [r4]

	vld1.64		{XL}, [r1]
.Ldec_final:
	vld1.64		{SHASH}, [r3]
	vmov.i8		MASK, #0xe1
	veor		SHASH2_p64, SHASH_L, SHASH_H
	vshl.u64	MASK, MASK, #57
	mov		r0, #1
	bne		3f			// process head block first
	ghash_update	p64, aggregate=0, head=0

	vrev64.8	XL, XL
	vext.8		XL, XL, XL, #8
	veor		XL, XL, e1

	mov_l		ip, .Lpermute
	ldrd		r2, r3, [sp, #40]	// otag and authsize
	vld1.8		{T1}, [r2]
	add		ip, ip, r3
	vceq.i8		T1, T1, XL		// compare tags
	vmvn		T1, T1			// 0 for eq, -1 for ne

	vld1.8		{e0}, [ip]
	vtbl.8		XL_L, {T1}, e0l		// keep authsize bytes only
	vtbl.8		XL_H, {T1}, e0h

	vpmin.s8	XL_L, XL_L, XL_H	// take the minimum s8 across the vector
	vpmin.s8	XL_L, XL_L, XL_L
	vmov.32		r0, XL_L[0]		// fail if != 0x0

	pop		{r4-r8, pc}
ENDPROC(pmull_gcm_dec_final)

	.section	".rodata", "a", %progbits
	.align		5
.Lpermute:
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07
	.byte		0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte		0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff