/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4-GCM AEAD Algorithm using ARMv8 Crypto Extensions
 * as specified in rfc8998
 * https://datatracker.ietf.org/doc/html/rfc8998
 *
 * Copyright (C) 2016 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/assembler.h>
#include "sm4-ce-asm.h"

.arch	armv8-a+crypto

.irp b, 0, 1, 2, 3, 24, 25, 26, 27, 28, 29, 30, 31
	.set .Lv\b\().4s, \b
.endr

.macro sm4e, vd, vn
	.inst 0xcec08400 | (.L\vn << 5) | .L\vd
.endm

/* Register macros */

/* Used for both encryption and decryption */
#define RHASH	v21
#define RRCONST	v22
#define RZERO	v23

/* Helper macros. */

/*
 * input: m0, m1
 * output: r0:r1 (low 128-bits in r0, high in r1)
 */
#define PMUL_128x128(r0, r1, m0, m1, T0, T1) \
	ext	T0.16b, m1.16b, m1.16b, #8; \
	pmull	r0.1q, m0.1d, m1.1d; \
	pmull	T1.1q, m0.1d, T0.1d; \
	pmull2	T0.1q, m0.2d, T0.2d; \
	pmull2	r1.1q, m0.2d, m1.2d; \
	eor	T0.16b, T0.16b, T1.16b; \
	ext	T1.16b, RZERO.16b, T0.16b, #8; \
	ext	T0.16b, T0.16b, RZERO.16b, #8; \
	eor	r0.16b, r0.16b, T1.16b; \
	eor	r1.16b, r1.16b, T0.16b;

#define PMUL_128x128_4x(r0, r1, m0, m1, T0, T1, \
			r2, r3, m2, m3, T2, T3, \
			r4, r5, m4, m5, T4, T5, \
			r6, r7, m6, m7, T6, T7) \
	ext	T0.16b, m1.16b, m1.16b, #8; \
	ext	T2.16b, m3.16b, m3.16b, #8; \
	ext	T4.16b, m5.16b, m5.16b, #8; \
	ext	T6.16b, m7.16b, m7.16b, #8; \
	pmull	r0.1q, m0.1d, m1.1d; \
	pmull	r2.1q, m2.1d, m3.1d; \
	pmull	r4.1q, m4.1d, m5.1d; \
	pmull	r6.1q, m6.1d, m7.1d; \
	pmull	T1.1q, m0.1d, T0.1d; \
	pmull	T3.1q, m2.1d, T2.1d; \
	pmull	T5.1q, m4.1d, T4.1d; \
	pmull	T7.1q, m6.1d, T6.1d; \
	pmull2	T0.1q, m0.2d, T0.2d; \
	pmull2	T2.1q, m2.2d, T2.2d; \
	pmull2	T4.1q, m4.2d, T4.2d; \
	pmull2	T6.1q, m6.2d, T6.2d; \
	pmull2	r1.1q, m0.2d, m1.2d; \
	pmull2	r3.1q, m2.2d, m3.2d; \
	pmull2	r5.1q, m4.2d, m5.2d; \
	pmull2	r7.1q, m6.2d, m7.2d; \
	eor	T0.16b, T0.16b, T1.16b; \
	eor	T2.16b, T2.16b, T3.16b; \
	eor	T4.16b, T4.16b, T5.16b; \
	eor	T6.16b, T6.16b, T7.16b; \
	ext	T1.16b, RZERO.16b, T0.16b, #8; \
	ext	T3.16b, RZERO.16b, T2.16b, #8; \
	ext	T5.16b, RZERO.16b, T4.16b, #8; \
	ext	T7.16b, RZERO.16b, T6.16b, #8; \
	ext	T0.16b, T0.16b, RZERO.16b, #8; \
	ext	T2.16b, T2.16b, RZERO.16b, #8; \
	ext	T4.16b, T4.16b, RZERO.16b, #8; \
	ext	T6.16b, T6.16b, RZERO.16b, #8; \
	eor	r0.16b, r0.16b, T1.16b; \
	eor	r2.16b, r2.16b, T3.16b; \
	eor	r4.16b, r4.16b, T5.16b; \
	eor	r6.16b, r6.16b, T7.16b; \
	eor	r1.16b, r1.16b, T0.16b; \
	eor	r3.16b, r3.16b, T2.16b; \
	eor	r5.16b, r5.16b, T4.16b; \
	eor	r7.16b, r7.16b, T6.16b;

/*
 * input: r0:r1 (low 128-bits in r0, high in r1)
 * output: a
 */
#define REDUCTION(a, r0, r1, rconst, T0, T1) \
	pmull2	T0.1q, r1.2d, rconst.2d; \
	ext	T1.16b, T0.16b, RZERO.16b, #8; \
	ext	T0.16b, RZERO.16b, T0.16b, #8; \
	eor	r1.16b, r1.16b, T1.16b; \
	eor	r0.16b, r0.16b, T0.16b; \
	pmull	T0.1q, r1.1d, rconst.1d; \
	eor	a.16b, r0.16b, T0.16b;

#define SM4_CRYPT_PMUL_128x128_BLK(b0, r0, r1, m0, m1, T0, T1) \
	rev32	b0.16b, b0.16b; \
	ext	T0.16b, m1.16b, m1.16b, #8; \
	sm4e	b0.4s, v24.4s; \
	pmull	r0.1q, m0.1d, m1.1d; \
	sm4e	b0.4s, v25.4s; \
	pmull	T1.1q, m0.1d, T0.1d; \
	sm4e	b0.4s, v26.4s; \
	pmull2	T0.1q, m0.2d, T0.2d; \
	sm4e	b0.4s, v27.4s; \
	pmull2	r1.1q, m0.2d, m1.2d; \
	sm4e	b0.4s, v28.4s; \
	eor	T0.16b, T0.16b, T1.16b; \
	sm4e	b0.4s, v29.4s; \
	ext	T1.16b, RZERO.16b, T0.16b, #8; \
	sm4e	b0.4s, v30.4s; \
	ext	T0.16b, T0.16b, RZERO.16b, #8; \
	sm4e	b0.4s, v31.4s; \
	eor	r0.16b, r0.16b, T1.16b; \
	rev64	b0.4s, b0.4s; \
	eor	r1.16b, r1.16b, T0.16b; \
	ext	b0.16b, b0.16b, b0.16b, #8; \
	rev32	b0.16b, b0.16b;

#define SM4_CRYPT_PMUL_128x128_BLK3(b0, b1, b2, \
				    r0, r1, m0, m1, T0, T1, \
				    r2, r3, m2, m3, T2, T3, \
				    r4, r5, m4, m5, T4, T5) \
	rev32	b0.16b, b0.16b; \
	rev32	b1.16b, b1.16b; \
	rev32	b2.16b, b2.16b; \
	ext	T0.16b, m1.16b, m1.16b, #8; \
	ext	T2.16b, m3.16b, m3.16b, #8; \
	ext	T4.16b, m5.16b, m5.16b, #8; \
	sm4e	b0.4s, v24.4s; \
	sm4e	b1.4s, v24.4s; \
	sm4e	b2.4s, v24.4s; \
	pmull	r0.1q, m0.1d, m1.1d; \
	pmull	r2.1q, m2.1d, m3.1d; \
	pmull	r4.1q, m4.1d, m5.1d; \
	sm4e	b0.4s, v25.4s; \
	sm4e	b1.4s, v25.4s; \
	sm4e	b2.4s, v25.4s; \
	pmull	T1.1q, m0.1d, T0.1d; \
	pmull	T3.1q, m2.1d, T2.1d; \
	pmull	T5.1q, m4.1d, T4.1d; \
	sm4e	b0.4s, v26.4s; \
	sm4e	b1.4s, v26.4s; \
	sm4e	b2.4s, v26.4s; \
	pmull2	T0.1q, m0.2d, T0.2d; \
	pmull2	T2.1q, m2.2d, T2.2d; \
	pmull2	T4.1q, m4.2d, T4.2d; \
	sm4e	b0.4s, v27.4s; \
	sm4e	b1.4s, v27.4s; \
	sm4e	b2.4s, v27.4s; \
	pmull2	r1.1q, m0.2d, m1.2d; \
	pmull2	r3.1q, m2.2d, m3.2d; \
	pmull2	r5.1q, m4.2d, m5.2d; \
	sm4e	b0.4s, v28.4s; \
	sm4e	b1.4s, v28.4s; \
	sm4e	b2.4s, v28.4s; \
	eor	T0.16b, T0.16b, T1.16b; \
	eor	T2.16b, T2.16b, T3.16b; \
	eor	T4.16b, T4.16b, T5.16b; \
	sm4e	b0.4s, v29.4s; \
	sm4e	b1.4s, v29.4s; \
	sm4e	b2.4s, v29.4s; \
	ext	T1.16b, RZERO.16b, T0.16b, #8; \
	ext	T3.16b, RZERO.16b, T2.16b, #8; \
	ext	T5.16b, RZERO.16b, T4.16b, #8; \
	sm4e	b0.4s, v30.4s; \
	sm4e	b1.4s, v30.4s; \
	sm4e	b2.4s, v30.4s; \
	ext	T0.16b, T0.16b, RZERO.16b, #8; \
	ext	T2.16b, T2.16b, RZERO.16b, #8; \
	ext	T4.16b, T4.16b, RZERO.16b, #8; \
	sm4e	b0.4s, v31.4s; \
	sm4e	b1.4s, v31.4s; \
	sm4e	b2.4s, v31.4s; \
	eor	r0.16b, r0.16b, T1.16b; \
	eor	r2.16b, r2.16b, T3.16b; \
	eor	r4.16b, r4.16b, T5.16b; \
	rev64	b0.4s, b0.4s; \
	rev64	b1.4s, b1.4s; \
	rev64	b2.4s, b2.4s; \
	eor	r1.16b, r1.16b, T0.16b; \
	eor	r3.16b, r3.16b, T2.16b; \
	eor	r5.16b, r5.16b, T4.16b; \
	ext	b0.16b, b0.16b, b0.16b, #8; \
	ext	b1.16b, b1.16b, b1.16b, #8; \
	ext	b2.16b, b2.16b, b2.16b, #8; \
	eor	r0.16b, r0.16b, r2.16b; \
	eor	r1.16b, r1.16b, r3.16b; \
	rev32	b0.16b, b0.16b; \
	rev32	b1.16b, b1.16b; \
	rev32	b2.16b, b2.16b; \
	eor	r0.16b, r0.16b, r4.16b; \
	eor	r1.16b, r1.16b, r5.16b;

#define inc32_le128(vctr) \
	mov	vctr.d[1], x9; \
	add	w6, w9, #1; \
	mov	vctr.d[0], x8; \
	bfi	x9, x6, #0, #32; \
	rev64	vctr.16b, vctr.16b;

#define GTAG_HASH_LENGTHS(vctr0, vlen) \
	ld1	{vlen.16b}, [x7]; \
	/* construct CTR0 */ \
	/* the lower 32-bits of initial IV is always be32(1) */ \
	mov	x6, #0x1; \
	bfi	x9, x6, #0, #32; \
	mov	vctr0.d[0], x8; \
	mov	vctr0.d[1], x9; \
	rbit	vlen.16b, vlen.16b; \
	rev64	vctr0.16b, vctr0.16b; \
	/* authtag = GCTR(CTR0, GHASH) */ \
	eor	RHASH.16b, RHASH.16b, vlen.16b; \
	SM4_CRYPT_PMUL_128x128_BLK(vctr0, RR0, RR1, RHASH, RH1, \
				   RTMP0, RTMP1); \
	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3); \
	rbit	RHASH.16b, RHASH.16b; \
	eor	RHASH.16b, RHASH.16b, vctr0.16b;

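/*
 * GHASH notes (editor's summary of the helpers above):
 *
 * GHASH multiplies in GF(2^128) modulo the GCM polynomial
 * x^128 + x^7 + x^2 + x + 1.  All GHASH inputs and outputs are passed
 * through 'rbit' so the carry-less 'pmull'/'pmull2' products line up
 * with GCM's bit ordering; with that reflection the reduction only
 * needs the low part of the polynomial, x^7 + x^2 + x + 1, i.e. the
 * 0x87 constant loaded from .Lghash_rconst into RRCONST.
 *
 * PMUL_128x128 computes the full 256-bit carry-less product (low half
 * in r0, high half in r1): one pmull/pmull2 pair for the outer
 * products, another pair for the cross products, with the ext/eor
 * sequence splitting the middle term across r0/r1.  REDUCTION folds
 * the high 128 bits back into the low 128 bits with two
 * pmull-by-RRCONST steps, one per 64-bit limb.  The _4x and
 * _BLK/_BLK3 variants run the same computation for several
 * independent multiplications, or with sm4e rounds interleaved, to
 * hide instruction latencies.
 */
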
/* Register macros for encrypt and ghash */

/* can be the same as input v0-v3 */
#define RR1	v0
#define RR3	v1
#define RR5	v2
#define RR7	v3

#define RR0	v4
#define RR2	v5
#define RR4	v6
#define RR6	v7

#define RTMP0	v8
#define RTMP1	v9
#define RTMP2	v10
#define RTMP3	v11
#define RTMP4	v12
#define RTMP5	v13
#define RTMP6	v14
#define RTMP7	v15

#define RH1	v16
#define RH2	v17
#define RH3	v18
#define RH4	v19

.align 3
SYM_FUNC_START(sm4_ce_pmull_ghash_setup)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: ghash table
	 */
	SM4_PREPARE(x0)

	adr_l	x2, .Lghash_rconst
	ld1r	{RRCONST.2d}, [x2]

	eor	RZERO.16b, RZERO.16b, RZERO.16b

	/* H = E(K, 0^128) */
	rev32	v0.16b, RZERO.16b
	SM4_CRYPT_BLK_BE(v0)

	/* H ^ 1 */
	rbit	RH1.16b, v0.16b

	/* H ^ 2 */
	PMUL_128x128(RR0, RR1, RH1, RH1, RTMP0, RTMP1)
	REDUCTION(RH2, RR0, RR1, RRCONST, RTMP2, RTMP3)

	/* H ^ 3 */
	PMUL_128x128(RR0, RR1, RH2, RH1, RTMP0, RTMP1)
	REDUCTION(RH3, RR0, RR1, RRCONST, RTMP2, RTMP3)

	/* H ^ 4 */
	PMUL_128x128(RR0, RR1, RH2, RH2, RTMP0, RTMP1)
	REDUCTION(RH4, RR0, RR1, RRCONST, RTMP2, RTMP3)

	st1	{RH1.16b-RH4.16b}, [x1]

	ret
SYM_FUNC_END(sm4_ce_pmull_ghash_setup)

.align 3
SYM_FUNC_START(pmull_ghash_update)
	/* input:
	 *   x0: ghash table
	 *   x1: ghash result
	 *   x2: src
	 *   w3: nblocks
	 */
	ld1	{RH1.16b-RH4.16b}, [x0]

	ld1	{RHASH.16b}, [x1]
	rbit	RHASH.16b, RHASH.16b

	adr_l	x4, .Lghash_rconst
	ld1r	{RRCONST.2d}, [x4]

	eor	RZERO.16b, RZERO.16b, RZERO.16b

.Lghash_loop_4x:
	cmp	w3, #4
	blt	.Lghash_loop_1x

	sub	w3, w3, #4

	ld1	{v0.16b-v3.16b}, [x2], #64

	rbit	v0.16b, v0.16b
	rbit	v1.16b, v1.16b
	rbit	v2.16b, v2.16b
	rbit	v3.16b, v3.16b

	/*
	 * (in0 ^ HASH) * H^4 => rr0:rr1
	 * (in1)        * H^3 => rr2:rr3
	 * (in2)        * H^2 => rr4:rr5
	 * (in3)        * H^1 => rr6:rr7
	 */
	eor	RHASH.16b, RHASH.16b, v0.16b

	PMUL_128x128_4x(RR0, RR1, RHASH, RH4, RTMP0, RTMP1,
			RR2, RR3, v1, RH3, RTMP2, RTMP3,
			RR4, RR5, v2, RH2, RTMP4, RTMP5,
			RR6, RR7, v3, RH1, RTMP6, RTMP7)

	eor	RR0.16b, RR0.16b, RR2.16b
	eor	RR1.16b, RR1.16b, RR3.16b
	eor	RR0.16b, RR0.16b, RR4.16b
	eor	RR1.16b, RR1.16b, RR5.16b
	eor	RR0.16b, RR0.16b, RR6.16b
	eor	RR1.16b, RR1.16b, RR7.16b

	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP0, RTMP1)

	cbz	w3, .Lghash_end
	b	.Lghash_loop_4x

.Lghash_loop_1x:
	sub	w3, w3, #1

	ld1	{v0.16b}, [x2], #16
	rbit	v0.16b, v0.16b
	eor	RHASH.16b, RHASH.16b, v0.16b

	PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

	cbnz	w3, .Lghash_loop_1x

.Lghash_end:
	rbit	RHASH.16b, RHASH.16b
	st1	{RHASH.2d}, [x1]

	ret
SYM_FUNC_END(pmull_ghash_update)

.align 3
SYM_TYPED_FUNC_START(sm4_ce_pmull_gcm_enc)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: ctr (big endian, 128 bit)
	 *   w4: nbytes
	 *   x5: ghash result
	 *   x6: ghash table
	 *   x7: lengths (only for last block)
	 */
	SM4_PREPARE(x0)

	ldp	x8, x9, [x3]
	rev	x8, x8
	rev	x9, x9

	ld1	{RH1.16b-RH4.16b}, [x6]

	ld1	{RHASH.16b}, [x5]
	rbit	RHASH.16b, RHASH.16b

	adr_l	x6, .Lghash_rconst
	ld1r	{RRCONST.2d}, [x6]

	eor	RZERO.16b, RZERO.16b, RZERO.16b

	cbz	w4, .Lgcm_enc_hash_len

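	/*
	 * Bulk path: generate four CTR keystream blocks with SM4, XOR
	 * them with the input, then fold the four ciphertext blocks into
	 * GHASH in one pass using H^4..H^1, so only a single REDUCTION
	 * is needed per four blocks.
	 */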
.Lgcm_enc_loop_4x:
	cmp	w4, #(4 * 16)
	blt	.Lgcm_enc_loop_1x

	sub	w4, w4, #(4 * 16)

	/* construct CTRs */
	inc32_le128(v0)			/* +0 */
	inc32_le128(v1)			/* +1 */
	inc32_le128(v2)			/* +2 */
	inc32_le128(v3)			/* +3 */

	ld1	{RTMP0.16b-RTMP3.16b}, [x2], #64

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor	v0.16b, v0.16b, RTMP0.16b
	eor	v1.16b, v1.16b, RTMP1.16b
	eor	v2.16b, v2.16b, RTMP2.16b
	eor	v3.16b, v3.16b, RTMP3.16b
	st1	{v0.16b-v3.16b}, [x1], #64

	/* ghash update */

	rbit	v0.16b, v0.16b
	rbit	v1.16b, v1.16b
	rbit	v2.16b, v2.16b
	rbit	v3.16b, v3.16b

	/*
	 * (in0 ^ HASH) * H^4 => rr0:rr1
	 * (in1)        * H^3 => rr2:rr3
	 * (in2)        * H^2 => rr4:rr5
	 * (in3)        * H^1 => rr6:rr7
	 */
	eor	RHASH.16b, RHASH.16b, v0.16b

	PMUL_128x128_4x(RR0, RR1, RHASH, RH4, RTMP0, RTMP1,
			RR2, RR3, v1, RH3, RTMP2, RTMP3,
			RR4, RR5, v2, RH2, RTMP4, RTMP5,
			RR6, RR7, v3, RH1, RTMP6, RTMP7)

	eor	RR0.16b, RR0.16b, RR2.16b
	eor	RR1.16b, RR1.16b, RR3.16b
	eor	RR0.16b, RR0.16b, RR4.16b
	eor	RR1.16b, RR1.16b, RR5.16b
	eor	RR0.16b, RR0.16b, RR6.16b
	eor	RR1.16b, RR1.16b, RR7.16b

	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP0, RTMP1)

	cbz	w4, .Lgcm_enc_hash_len
	b	.Lgcm_enc_loop_4x

.Lgcm_enc_loop_1x:
	cmp	w4, #16
	blt	.Lgcm_enc_tail

	sub	w4, w4, #16

	/* construct CTRs */
	inc32_le128(v0)

	ld1	{RTMP0.16b}, [x2], #16

	SM4_CRYPT_BLK(v0)

	eor	v0.16b, v0.16b, RTMP0.16b
	st1	{v0.16b}, [x1], #16

	/* ghash update */
	rbit	v0.16b, v0.16b
	eor	RHASH.16b, RHASH.16b, v0.16b
	PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

	cbz	w4, .Lgcm_enc_hash_len
	b	.Lgcm_enc_loop_1x

.Lgcm_enc_tail:
	/* construct CTRs */
	inc32_le128(v0)
	SM4_CRYPT_BLK(v0)

	/* load permute table */
	adr_l	x0, .Lcts_permute_table
	add	x0, x0, #32
	sub	x0, x0, w4, uxtw
	ld1	{v3.16b}, [x0]

.Lgcm_enc_tail_loop:
	/* do encrypt */
	ldrb	w0, [x2], #1		/* get 1 byte from input */
	umov	w6, v0.b[0]		/* get top crypted byte */
	eor	w6, w6, w0		/* w6 = CTR ^ input */
	strb	w6, [x1], #1		/* store out byte */

	/* shift right out one byte */
	ext	v0.16b, v0.16b, v0.16b, #1
	/* the last ciphertext is placed in high bytes */
	ins	v0.b[15], w6

	subs	w4, w4, #1
	bne	.Lgcm_enc_tail_loop

	/* padding last block with zeros */
	tbl	v0.16b, {v0.16b}, v3.16b

	/* ghash update */
	rbit	v0.16b, v0.16b
	eor	RHASH.16b, RHASH.16b, v0.16b
	PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

.Lgcm_enc_hash_len:
	cbz	x7, .Lgcm_enc_end

	GTAG_HASH_LENGTHS(v1, v3)

	b	.Lgcm_enc_ret

.Lgcm_enc_end:
	/* store new CTR */
	rev	x8, x8
	rev	x9, x9
	stp	x8, x9, [x3]

	rbit	RHASH.16b, RHASH.16b

.Lgcm_enc_ret:
	/* store new MAC */
	st1	{RHASH.2d}, [x5]

	ret
SYM_FUNC_END(sm4_ce_pmull_gcm_enc)

#undef RR1
#undef RR3
#undef RR5
#undef RR7
#undef RR0
#undef RR2
#undef RR4
#undef RR6
#undef RTMP0
#undef RTMP1
#undef RTMP2
#undef RTMP3
#undef RTMP4
#undef RTMP5
#undef RTMP6
#undef RTMP7
#undef RH1
#undef RH2
#undef RH3
#undef RH4


/* Register macros for decrypt */

/* v0-v2 for building CTRs, v3-v5 for saving inputs */

#define RR1	v6
#define RR3	v7
#define RR5	v8

#define RR0	v9
#define RR2	v10
#define RR4	v11

#define RTMP0	v12
#define RTMP1	v13
#define RTMP2	v14
#define RTMP3	v15
#define RTMP4	v16
#define RTMP5	v17

#define RH1	v18
#define RH2	v19
#define RH3	v20

.align 3
SYM_TYPED_FUNC_START(sm4_ce_pmull_gcm_dec)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: ctr (big endian, 128 bit)
	 *   w4: nbytes
	 *   x5: ghash result
	 *   x6: ghash table
	 *   x7: lengths (only for last block)
	 */
	SM4_PREPARE(x0)

	ldp	x8, x9, [x3]
	rev	x8, x8
	rev	x9, x9

	ld1	{RH1.16b-RH3.16b}, [x6]

	ld1	{RHASH.16b}, [x5]
	rbit	RHASH.16b, RHASH.16b

	adr_l	x6, .Lghash_rconst
	ld1r	{RRCONST.2d}, [x6]

	eor	RZERO.16b, RZERO.16b, RZERO.16b

	cbz	w4, .Lgcm_dec_hash_len

.Lgcm_dec_loop_3x:
	cmp	w4, #(3 * 16)
	blt	.Lgcm_dec_loop_1x

	sub	w4, w4, #(3 * 16)

	ld1	{v3.16b-v5.16b}, [x2], #(3 * 16)

	/* construct CTRs */
	inc32_le128(v0)			/* +0 */
	rbit	v6.16b, v3.16b
	inc32_le128(v1)			/* +1 */
	rbit	v7.16b, v4.16b
	inc32_le128(v2)			/* +2 */
	rbit	v8.16b, v5.16b

	eor	RHASH.16b, RHASH.16b, v6.16b

	/* decrypt & ghash update */
	SM4_CRYPT_PMUL_128x128_BLK3(v0, v1, v2,
				    RR0, RR1, RHASH, RH3, RTMP0, RTMP1,
				    RR2, RR3, v7, RH2, RTMP2, RTMP3,
				    RR4, RR5, v8, RH1, RTMP4, RTMP5)

	eor	v0.16b, v0.16b, v3.16b
	eor	v1.16b, v1.16b, v4.16b
	eor	v2.16b, v2.16b, v5.16b

	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP0, RTMP1)

	st1	{v0.16b-v2.16b}, [x1], #(3 * 16)

	cbz	w4, .Lgcm_dec_hash_len
	b	.Lgcm_dec_loop_3x

.Lgcm_dec_loop_1x:
	cmp	w4, #16
	blt	.Lgcm_dec_tail

	sub	w4, w4, #16

	ld1	{v3.16b}, [x2], #16

	/* construct CTRs */
	inc32_le128(v0)
	rbit	v6.16b, v3.16b

	eor	RHASH.16b, RHASH.16b, v6.16b

	SM4_CRYPT_PMUL_128x128_BLK(v0, RR0, RR1, RHASH, RH1, RTMP0, RTMP1)

	eor	v0.16b, v0.16b, v3.16b

	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

	st1	{v0.16b}, [x1], #16

	cbz	w4, .Lgcm_dec_hash_len
	b	.Lgcm_dec_loop_1x

.Lgcm_dec_tail:
	/* construct CTRs */
	inc32_le128(v0)
	SM4_CRYPT_BLK(v0)

	/* load permute table */
	adr_l	x0, .Lcts_permute_table
	add	x0, x0, #32
	sub	x0, x0, w4, uxtw
	ld1	{v3.16b}, [x0]

.Lgcm_dec_tail_loop:
	/* do decrypt */
	ldrb	w0, [x2], #1		/* get 1 byte from input */
	umov	w6, v0.b[0]		/* get top crypted byte */
	eor	w6, w6, w0		/* w6 = CTR ^ input */
	strb	w6, [x1], #1		/* store out byte */

	/* shift right out one byte */
	ext	v0.16b, v0.16b, v0.16b, #1
	/* the last ciphertext is placed in high bytes */
	ins	v0.b[15], w0

	subs	w4, w4, #1
	bne	.Lgcm_dec_tail_loop

	/* padding last block with zeros */
	tbl	v0.16b, {v0.16b}, v3.16b

	/* ghash update */
	rbit	v0.16b, v0.16b
	eor	RHASH.16b, RHASH.16b, v0.16b
	PMUL_128x128(RR0, RR1, RHASH, RH1, RTMP0, RTMP1)
	REDUCTION(RHASH, RR0, RR1, RRCONST, RTMP2, RTMP3)

.Lgcm_dec_hash_len:
	cbz	x7, .Lgcm_dec_end

	GTAG_HASH_LENGTHS(v1, v3)

	b	.Lgcm_dec_ret

.Lgcm_dec_end:
	/* store new CTR */
	rev	x8, x8
	rev	x9, x9
	stp	x8, x9, [x3]

	rbit	RHASH.16b, RHASH.16b

.Lgcm_dec_ret:
	/* store new MAC */
	st1	{RHASH.2d}, [x5]

	ret
SYM_FUNC_END(sm4_ce_pmull_gcm_dec)

	.section	".rodata", "a"
	.align 4
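/*
 * Table used to zero-pad the final partial block before it is fed to
 * GHASH: the tail loops rotate each processed byte into the top of v0,
 * then 'tbl' with the 16 index bytes loaded from
 * .Lcts_permute_table + 32 - remainder moves those bytes down to the
 * start of the block while out-of-range 0xff indices produce zeroes.
 */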
.Lcts_permute_table:
	.byte	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte	 0x0,  0x1,  0x2,  0x3,  0x4,  0x5,  0x6,  0x7
	.byte	 0x8,  0x9,  0xa,  0xb,  0xc,  0xd,  0xe,  0xf
	.byte	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff

.Lghash_rconst:
	.quad	0x87