/*
 * Accelerated GHASH implementation with ARMv8 PMULL instructions.
 *
 * Copyright (C) 2014 - 2017 Linaro Ltd. <ard.biesheuvel@linaro.org>
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published
 * by the Free Software Foundation.
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	SHASH		.req	v0
	SHASH2		.req	v1
	T1		.req	v2
	T2		.req	v3
	MASK		.req	v4
	XL		.req	v5
	XM		.req	v6
	XH		.req	v7
	IN1		.req	v7

	k00_16		.req	v8
	k32_48		.req	v9

	t3		.req	v10
	t4		.req	v11
	t5		.req	v12
	t6		.req	v13
	t7		.req	v14
	t8		.req	v15
	t9		.req	v16

	perm1		.req	v17
	perm2		.req	v18
	perm3		.req	v19

	sh1		.req	v20
	sh2		.req	v21
	sh3		.req	v22
	sh4		.req	v23

	ss1		.req	v24
	ss2		.req	v25
	ss3		.req	v26
	ss4		.req	v27

	.text
	.arch		armv8-a+crypto

	.macro		__pmull_p64, rd, rn, rm
	pmull		\rd\().1q, \rn\().1d, \rm\().1d
	.endm

	.macro		__pmull2_p64, rd, rn, rm
	pmull2		\rd\().1q, \rn\().2d, \rm\().2d
	.endm

	.macro		__pmull_p8, rq, ad, bd
	ext		t3.8b, \ad\().8b, \ad\().8b, #1		// A1
	ext		t5.8b, \ad\().8b, \ad\().8b, #2		// A2
	ext		t7.8b, \ad\().8b, \ad\().8b, #3		// A3

	__pmull_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull2_p8, rq, ad, bd
	tbl		t3.16b, {\ad\().16b}, perm1.16b		// A1
	tbl		t5.16b, {\ad\().16b}, perm2.16b		// A2
	tbl		t7.16b, {\ad\().16b}, perm3.16b		// A3

	__pmull2_p8_\bd	\rq, \ad
	.endm

	.macro		__pmull_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH.8b, 8b,, sh1, sh2, sh3, sh4
	.endm

	.macro		__pmull_p8_SHASH2, rq, ad
	__pmull_p8_tail	\rq, \ad\().8b, SHASH2.8b, 8b,, ss1, ss2, ss3, ss4
	.endm

	.macro		__pmull2_p8_SHASH, rq, ad
	__pmull_p8_tail	\rq, \ad\().16b, SHASH.16b, 16b, 2, sh1, sh2, sh3, sh4
	.endm

	.macro		__pmull_p8_tail, rq, ad, bd, nb, t, b1, b2, b3, b4
	pmull\t		t3.8h, t3.\nb, \bd			// F = A1*B
	pmull\t		t4.8h, \ad, \b1\().\nb			// E = A*B1
	pmull\t		t5.8h, t5.\nb, \bd			// H = A2*B
	pmull\t		t6.8h, \ad, \b2\().\nb			// G = A*B2
	pmull\t		t7.8h, t7.\nb, \bd			// J = A3*B
	pmull\t		t8.8h, \ad, \b3\().\nb			// I = A*B3
	pmull\t		t9.8h, \ad, \b4\().\nb			// K = A*B4
	pmull\t		\rq\().8h, \ad, \bd			// D = A*B

	eor		t3.16b, t3.16b, t4.16b			// L = E + F
	eor		t5.16b, t5.16b, t6.16b			// M = G + H
	eor		t7.16b, t7.16b, t8.16b			// N = I + J

	uzp1		t4.2d, t3.2d, t5.2d
	uzp2		t3.2d, t3.2d, t5.2d
	uzp1		t6.2d, t7.2d, t9.2d
	uzp2		t7.2d, t7.2d, t9.2d

	// t3 = (L) (P0 + P1) << 8
	// t5 = (M) (P2 + P3) << 16
	eor		t4.16b, t4.16b, t3.16b
	and		t3.16b, t3.16b, k32_48.16b

	// t7 = (N) (P4 + P5) << 24
	// t9 = (K) (P6 + P7) << 32
	eor		t6.16b, t6.16b, t7.16b
	and		t7.16b, t7.16b, k00_16.16b

	eor		t4.16b, t4.16b, t3.16b
	eor		t6.16b, t6.16b, t7.16b

	zip2		t5.2d, t4.2d, t3.2d
	zip1		t3.2d, t4.2d, t3.2d
	zip2		t9.2d, t6.2d, t7.2d
	zip1		t7.2d, t6.2d, t7.2d

	ext		t3.16b, t3.16b, t3.16b, #15
	ext		t5.16b, t5.16b, t5.16b, #14
	ext		t7.16b, t7.16b, t7.16b, #13
	ext		t9.16b, t9.16b, t9.16b, #12

	eor		t3.16b, t3.16b, t5.16b
	eor		t7.16b, t7.16b, t9.16b
	eor		\rq\().16b, \rq\().16b, t3.16b
	eor		\rq\().16b, \rq\().16b, t7.16b
	.endm

	.macro		__pmull_pre_p64
	movi		MASK.16b, #0xe1
	shl		MASK.2d, MASK.2d, #57
	.endm

	.macro		__pmull_pre_p8
	// k00_16 := 0x0000000000000000_000000000000ffff
	// k32_48 := 0x00000000ffffffff_0000ffffffffffff
	movi		k32_48.2d, #0xffffffff
	mov		k32_48.h[2], k32_48.h[0]
	ushr		k00_16.2d, k32_48.2d, #32

	// prepare the permutation vectors
	mov_q		x5, 0x080f0e0d0c0b0a09
	movi		T1.8b, #8
	dup		perm1.2d, x5
	eor		perm1.16b, perm1.16b, T1.16b
	ushr		perm2.2d, perm1.2d, #8
	ushr		perm3.2d, perm1.2d, #16
	ushr		T1.2d, perm1.2d, #24
	sli		perm2.2d, perm1.2d, #56
	sli		perm3.2d, perm1.2d, #48
	sli		T1.2d, perm1.2d, #40

	// precompute loop invariants
	tbl		sh1.16b, {SHASH.16b}, perm1.16b
	tbl		sh2.16b, {SHASH.16b}, perm2.16b
	tbl		sh3.16b, {SHASH.16b}, perm3.16b
	tbl		sh4.16b, {SHASH.16b}, T1.16b
	ext		ss1.8b, SHASH2.8b, SHASH2.8b, #1
	ext		ss2.8b, SHASH2.8b, SHASH2.8b, #2
	ext		ss3.8b, SHASH2.8b, SHASH2.8b, #3
	ext		ss4.8b, SHASH2.8b, SHASH2.8b, #4
	.endm

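	//
	// Two notes on the macros in this file (added commentary, not part
	// of the original source):
	//
	// - The __pmull_p8 helpers above emulate a 64x64 -> 128 bit
	//   polynomial multiply using only the 8x8 -> 16 bit form of PMULL:
	//   partial products against byte-rotated copies of the operands
	//   (A1-A3, B1-B4) are realigned with EXT and folded into the final
	//   result. The "2" variants operate on the upper halves of the
	//   inputs, mirroring PMULL2.
	//
	// - GHASH multiplies in GF(2^128) modulo x^128 + x^7 + x^2 + x + 1.
	//   In the bit-reflected representation used here, reducing the
	//   256-bit Karatsuba product amounts to multiplying by the low
	//   coefficients of that polynomial, which is why __pmull_pre_p64
	//   loads each 64-bit lane of MASK with 0xe1 << 57 ==
	//   0xc200000000000000: one PMULL per half then folds the result
	//   back into 128 bits. __pmull_reduce_p8 below computes the same
	//   folding with plain shifts (#57, #62 and #63 being 64 - 7,
	//   64 - 2 and 64 - 1).
	//
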
	//
	// PMULL (64x64->128) based reduction for CPUs that can do
	// it in a single instruction.
	//
	.macro		__pmull_reduce_p64
	pmull		T2.1q, XL.1d, MASK.1d
	eor		XM.16b, XM.16b, T1.16b

	mov		XH.d[0], XM.d[1]
	mov		XM.d[1], XL.d[0]

	eor		XL.16b, XM.16b, T2.16b
	ext		T2.16b, XL.16b, XL.16b, #8
	pmull		XL.1q, XL.1d, MASK.1d
	.endm

	//
	// Alternative reduction for CPUs that lack support for the
	// 64x64->128 PMULL instruction
	//
	.macro		__pmull_reduce_p8
	eor		XM.16b, XM.16b, T1.16b

	mov		XL.d[1], XM.d[0]
	mov		XH.d[0], XM.d[1]

	shl		T1.2d, XL.2d, #57
	shl		T2.2d, XL.2d, #62
	eor		T2.16b, T2.16b, T1.16b
	shl		T1.2d, XL.2d, #63
	eor		T2.16b, T2.16b, T1.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		T2.16b, T2.16b, T1.16b

	mov		XL.d[1], T2.d[0]
	mov		XH.d[0], T2.d[1]

	ushr		T2.2d, XL.2d, #1
	eor		XH.16b, XH.16b, XL.16b
	eor		XL.16b, XL.16b, T2.16b
	ushr		T2.2d, T2.2d, #6
	ushr		XL.2d, XL.2d, #1
	.endm

	.macro		__pmull_ghash, pn
	frame_push	5

	mov		x19, x0
	mov		x20, x1
	mov		x21, x2
	mov		x22, x3
	mov		x23, x4

0:	ld1		{SHASH.2d}, [x22]
	ld1		{XL.2d}, [x20]
	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
	eor		SHASH2.16b, SHASH2.16b, SHASH.16b

	__pmull_pre_\pn

	/* do the head block first, if supplied */
	cbz		x23, 1f
	ld1		{T1.2d}, [x23]
	mov		x23, xzr
	b		2f

1:	ld1		{T1.2d}, [x21], #16
	sub		w19, w19, #1

2:	/* multiply XL by SHASH in GF(2^128) */
CPU_LE(	rev64		T1.16b, T1.16b	)

	ext		T2.16b, XL.16b, XL.16b, #8
	ext		IN1.16b, T1.16b, T1.16b, #8
	eor		T1.16b, T1.16b, T2.16b
	eor		XL.16b, XL.16b, IN1.16b

	__pmull2_\pn	XH, XL, SHASH			// a1 * b1
	eor		T1.16b, T1.16b, XL.16b
	__pmull_\pn	XL, XL, SHASH			// a0 * b0
	__pmull_\pn	XM, T1, SHASH2			// (a1 + a0)(b1 + b0)

	eor		T2.16b, XL.16b, XH.16b
	ext		T1.16b, XL.16b, XH.16b, #8
	eor		XM.16b, XM.16b, T2.16b

	__pmull_reduce_\pn

	eor		T2.16b, T2.16b, XH.16b
	eor		XL.16b, XL.16b, T2.16b

	cbz		w19, 3f

	if_will_cond_yield_neon
	st1		{XL.2d}, [x20]
	do_cond_yield_neon
	b		0b
	endif_yield_neon

	b		1b

3:	st1		{XL.2d}, [x20]
	frame_pop
	ret
	.endm

	/*
	 * void pmull_ghash_update(int blocks, u64 dg[], const char *src,
	 *			   struct ghash_key const *k, const char *head)
	 */
ENTRY(pmull_ghash_update_p64)
	__pmull_ghash	p64
ENDPROC(pmull_ghash_update_p64)

ENTRY(pmull_ghash_update_p8)
	__pmull_ghash	p8
ENDPROC(pmull_ghash_update_p8)

	KS		.req	v8
	CTR		.req	v9
	INP		.req	v10

	//
	// Load the AES round keys into v17-v31. The schedule is right
	// aligned so that it always ends with the last round key pair in
	// v30/v31, and starts at v21, v19 or v17 for 128, 192 or 256 bit
	// keys, respectively.
	//
	.macro		load_round_keys, rounds, rk
	cmp		\rounds, #12
	blo		2222f		/* 128 bits */
	beq		1111f		/* 192 bits */
	ld1		{v17.4s-v18.4s}, [\rk], #32
1111:	ld1		{v19.4s-v20.4s}, [\rk], #32
2222:	ld1		{v21.4s-v24.4s}, [\rk], #64
	ld1		{v25.4s-v28.4s}, [\rk], #64
	ld1		{v29.4s-v31.4s}, [\rk]
	.endm

	.macro		enc_round, state, key
	aese		\state\().16b, \key\().16b
	aesmc		\state\().16b, \state\().16b
	.endm

	.macro		enc_block, state, rounds
	cmp		\rounds, #12
	b.lo		2222f		/* 128 bits */
	b.eq		1111f		/* 192 bits */
	enc_round	\state, v17
	enc_round	\state, v18
1111:	enc_round	\state, v19
	enc_round	\state, v20
2222:	.irp		key, v21, v22, v23, v24, v25, v26, v27, v28, v29
	enc_round	\state, \key
	.endr
	aese		\state\().16b, v30.16b		// final round has no MixColumns
	eor		\state\().16b, \state\().16b, v31.16b	// add last round key
	.endm

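	//
	// Note on the combined GCM path below (added commentary):
	// pmull_gcm_do_crypt fuses CTR mode encryption with the GHASH
	// update, threading one enc_round (AESE + AESMC) between every
	// couple of PMULL/EOR steps. The AES rounds and the carryless
	// multiplies touch disjoint registers, so the keystream block can
	// be computed in parallel with folding a data block into the
	// GHASH state instead of after it.
	//
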
	.macro		pmull_gcm_do_crypt, enc
	frame_push	10

	mov		x19, x0
	mov		x20, x1
	mov		x21, x2
	mov		x22, x3
	mov		x23, x4
	mov		x24, x5
	mov		x25, x6
	mov		x26, x7
	.if		\enc == 1
	ldr		x27, [sp, #96]			// first stacked arg
	.endif

	ldr		x28, [x24, #8]			// load lower counter
CPU_LE(	rev		x28, x28	)

0:	mov		x0, x25
	load_round_keys	w26, x0
	ld1		{SHASH.2d}, [x23]
	ld1		{XL.2d}, [x20]

	movi		MASK.16b, #0xe1
	ext		SHASH2.16b, SHASH.16b, SHASH.16b, #8
	shl		MASK.2d, MASK.2d, #57
	eor		SHASH2.16b, SHASH2.16b, SHASH.16b

	.if		\enc == 1
	ld1		{KS.16b}, [x27]
	.endif

1:	ld1		{CTR.8b}, [x24]			// load upper counter
	ld1		{INP.16b}, [x22], #16
	rev		x9, x28
	add		x28, x28, #1
	sub		w19, w19, #1
	ins		CTR.d[1], x9			// set lower counter

	.if		\enc == 1
	eor		INP.16b, INP.16b, KS.16b	// encrypt input
	st1		{INP.16b}, [x21], #16
	.endif

	rev64		T1.16b, INP.16b

	cmp		w26, #12
	b.ge		4f				// AES-192/256?

2:	enc_round	CTR, v21

	ext		T2.16b, XL.16b, XL.16b, #8
	ext		IN1.16b, T1.16b, T1.16b, #8

	enc_round	CTR, v22

	eor		T1.16b, T1.16b, T2.16b
	eor		XL.16b, XL.16b, IN1.16b

	enc_round	CTR, v23

	pmull2		XH.1q, SHASH.2d, XL.2d		// a1 * b1
	eor		T1.16b, T1.16b, XL.16b

	enc_round	CTR, v24

	pmull		XL.1q, SHASH.1d, XL.1d		// a0 * b0
	pmull		XM.1q, SHASH2.1d, T1.1d		// (a1 + a0)(b1 + b0)

	enc_round	CTR, v25

	ext		T1.16b, XL.16b, XH.16b, #8
	eor		T2.16b, XL.16b, XH.16b
	eor		XM.16b, XM.16b, T1.16b

	enc_round	CTR, v26

	eor		XM.16b, XM.16b, T2.16b
	pmull		T2.1q, XL.1d, MASK.1d

	enc_round	CTR, v27

	mov		XH.d[0], XM.d[1]
	mov		XM.d[1], XL.d[0]

	enc_round	CTR, v28

	eor		XL.16b, XM.16b, T2.16b

	enc_round	CTR, v29

	ext		T2.16b, XL.16b, XL.16b, #8

	aese		CTR.16b, v30.16b

	pmull		XL.1q, XL.1d, MASK.1d
	eor		T2.16b, T2.16b, XH.16b

	eor		KS.16b, CTR.16b, v31.16b	// add last round key -> keystream block

	eor		XL.16b, XL.16b, T2.16b

	.if		\enc == 0
	eor		INP.16b, INP.16b, KS.16b
	st1		{INP.16b}, [x21], #16
	.endif

	cbz		w19, 3f

	if_will_cond_yield_neon
	st1		{XL.2d}, [x20]
	.if		\enc == 1
	st1		{KS.16b}, [x27]
	.endif
	do_cond_yield_neon
	b		0b
	endif_yield_neon

	b		1b

3:	st1		{XL.2d}, [x20]
	.if		\enc == 1
	st1		{KS.16b}, [x27]
	.endif

CPU_LE(	rev		x28, x28	)
	str		x28, [x24, #8]			// store lower counter

	frame_pop
	ret

4:	b.eq		5f				// AES-192?
	enc_round	CTR, v17			// AES-256: two extra rounds
	enc_round	CTR, v18
5:	enc_round	CTR, v19			// AES-192/256: two more rounds
	enc_round	CTR, v20
	b		2b				// rejoin the interleaved loop
	.endm

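	//
	// Calling convention note (added commentary): both entry points
	// below expand the macro above; encryption passes \enc == 1, which
	// makes it pick up the ks[] pointer as the ninth argument from the
	// stack (x0-x7 hold the first eight per AAPCS64) and keep the
	// pending keystream block there across calls.
	//
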
	/*
	 * void pmull_gcm_encrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
	 *			  struct ghash_key const *k, u8 ctr[],
	 *			  int rounds, u8 ks[])
	 */
ENTRY(pmull_gcm_encrypt)
	pmull_gcm_do_crypt	1
ENDPROC(pmull_gcm_encrypt)

	/*
	 * void pmull_gcm_decrypt(int blocks, u64 dg[], u8 dst[], const u8 src[],
	 *			  struct ghash_key const *k, u8 ctr[],
	 *			  int rounds)
	 */
ENTRY(pmull_gcm_decrypt)
	pmull_gcm_do_crypt	0
ENDPROC(pmull_gcm_decrypt)

	/*
	 * void pmull_gcm_encrypt_block(u8 dst[], u8 src[], u8 rk[], int rounds)
	 *
	 * If rk is NULL, the round keys that an earlier call already loaded
	 * into v17-v31 are reused instead of being reloaded.
	 */
ENTRY(pmull_gcm_encrypt_block)
	cbz		x2, 0f
	load_round_keys	w3, x2
0:	ld1		{v0.16b}, [x1]
	enc_block	v0, w3
	st1		{v0.16b}, [x0]
	ret
ENDPROC(pmull_gcm_encrypt_block)