/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4 Cipher Algorithm for ARMv8 with Crypto Extensions
 * as specified in
 * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
 *
 * Copyright (C) 2022, Alibaba Group.
 * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>
#include "sm4-ce-asm.h"

.arch	armv8-a+crypto

/* Map the vector register names used below to their encodings for .inst */
.irp b, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, \
	20, 24, 25, 26, 27, 28, 29, 30, 31
	.set .Lv\b\().4s, \b
.endr

/* SM4E: four SM4 rounds on the data in vd, using the four round keys in vn */
.macro sm4e, vd, vn
	.inst 0xcec08400 | (.L\vn << 5) | .L\vd
.endm

/* SM4EKEY: derive the next four round keys from vn, with constants in vm */
.macro sm4ekey, vd, vn, vm
	.inst 0xce60c800 | (.L\vm << 16) | (.L\vn << 5) | .L\vd
.endm

/* Register macros */

#define RTMP0	v16
#define RTMP1	v17
#define RTMP2	v18
#define RTMP3	v19

#define RIV	v20
#define RMAC	v20
#define RMASK	v21


.align 3
SYM_FUNC_START(sm4_ce_expand_key)
	/* input:
	 * x0: 128-bit key
	 * x1: rkey_enc
	 * x2: rkey_dec
	 * x3: fk array
	 * x4: ck array
	 */
	ld1		{v0.16b}, [x0];
	rev32		v0.16b, v0.16b;
	ld1		{v1.16b}, [x3];
	/* load ck */
	ld1		{v24.16b-v27.16b}, [x4], #64;
	ld1		{v28.16b-v31.16b}, [x4];

	/* input ^ fk */
	eor		v0.16b, v0.16b, v1.16b;

	sm4ekey		v0.4s, v0.4s, v24.4s;
	sm4ekey		v1.4s, v0.4s, v25.4s;
	sm4ekey		v2.4s, v1.4s, v26.4s;
	sm4ekey		v3.4s, v2.4s, v27.4s;
	sm4ekey		v4.4s, v3.4s, v28.4s;
	sm4ekey		v5.4s, v4.4s, v29.4s;
	sm4ekey		v6.4s, v5.4s, v30.4s;
	sm4ekey		v7.4s, v6.4s, v31.4s;

	adr_l		x5, .Lbswap128_mask
	ld1		{v24.16b}, [x5]

	st1		{v0.16b-v3.16b}, [x1], #64;
	st1		{v4.16b-v7.16b}, [x1];

	/* decryption round keys are the encryption round keys in reverse order */
	tbl		v16.16b, {v7.16b}, v24.16b
	tbl		v17.16b, {v6.16b}, v24.16b
	tbl		v18.16b, {v5.16b}, v24.16b
	tbl		v19.16b, {v4.16b}, v24.16b
	tbl		v20.16b, {v3.16b}, v24.16b
	tbl		v21.16b, {v2.16b}, v24.16b
	tbl		v22.16b, {v1.16b}, v24.16b
	tbl		v23.16b, {v0.16b}, v24.16b

	st1		{v16.16b-v19.16b}, [x2], #64
	st1		{v20.16b-v23.16b}, [x2]

	ret;
SYM_FUNC_END(sm4_ce_expand_key)

.align 3
SYM_FUNC_START(sm4_ce_crypt_block)
	/* input:
	 * x0: round key array, CTX
	 * x1: dst
	 * x2: src
	 */
	SM4_PREPARE(x0)

	ld1		{v0.16b}, [x2];
	SM4_CRYPT_BLK(v0);
	st1		{v0.16b}, [x1];

	ret;
SYM_FUNC_END(sm4_ce_crypt_block)

.align 3
SYM_FUNC_START(sm4_ce_crypt)
	/* input:
	 * x0: round key array, CTX
	 * x1: dst
	 * x2: src
	 * w3: nblocks
	 */
	SM4_PREPARE(x0)

.Lcrypt_loop_blk:
	sub		w3, w3, #8;
	tbnz		w3, #31, .Lcrypt_tail8;

	ld1		{v0.16b-v3.16b}, [x2], #64;
	ld1		{v4.16b-v7.16b}, [x2], #64;

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7);

	st1		{v0.16b-v3.16b}, [x1], #64;
	st1		{v4.16b-v7.16b}, [x1], #64;

	cbz		w3, .Lcrypt_end;
	b		.Lcrypt_loop_blk;

.Lcrypt_tail8:
	add		w3, w3, #8;
	cmp		w3, #4;
	blt		.Lcrypt_tail4;

	sub		w3, w3, #4;

	ld1		{v0.16b-v3.16b}, [x2], #64;
	SM4_CRYPT_BLK4(v0, v1, v2, v3);
	st1		{v0.16b-v3.16b}, [x1], #64;

	cbz		w3, .Lcrypt_end;

.Lcrypt_tail4:
	sub		w3, w3, #1;

	ld1		{v0.16b}, [x2], #16;
	SM4_CRYPT_BLK(v0);
	st1		{v0.16b}, [x1], #16;

	cbnz		w3, .Lcrypt_tail4;

.Lcrypt_end:
	ret;
SYM_FUNC_END(sm4_ce_crypt)

.align 3
SYM_FUNC_START(sm4_ce_cbc_enc)
	/* input:
	 * x0: round key array, CTX
	 * x1: dst
	 * x2: src
	 * x3: iv (big endian, 128 bit)
	 * w4: nblocks
	 */
	SM4_PREPARE(x0)

	ld1		{RIV.16b}, [x3]

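	/*
	 * CBC chaining: each plaintext block is XORed with the previous
	 * ciphertext block, so the 4x loop below can only batch the loads
	 * and stores; the SM4_CRYPT_BLK calls stay strictly serial.
	 */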
.Lcbc_enc_loop_4x:
	cmp		w4, #4
	blt		.Lcbc_enc_loop_1x

	sub		w4, w4, #4

	ld1		{v0.16b-v3.16b}, [x2], #64

	eor		v0.16b, v0.16b, RIV.16b
	SM4_CRYPT_BLK(v0)
	eor		v1.16b, v1.16b, v0.16b
	SM4_CRYPT_BLK(v1)
	eor		v2.16b, v2.16b, v1.16b
	SM4_CRYPT_BLK(v2)
	eor		v3.16b, v3.16b, v2.16b
	SM4_CRYPT_BLK(v3)

	st1		{v0.16b-v3.16b}, [x1], #64
	mov		RIV.16b, v3.16b

	cbz		w4, .Lcbc_enc_end
	b		.Lcbc_enc_loop_4x

.Lcbc_enc_loop_1x:
	sub		w4, w4, #1

	ld1		{v0.16b}, [x2], #16

	eor		RIV.16b, RIV.16b, v0.16b
	SM4_CRYPT_BLK(RIV)

	st1		{RIV.16b}, [x1], #16

	cbnz		w4, .Lcbc_enc_loop_1x

.Lcbc_enc_end:
	/* store new IV */
	st1		{RIV.16b}, [x3]

	ret
SYM_FUNC_END(sm4_ce_cbc_enc)

.align 3
SYM_FUNC_START(sm4_ce_cbc_dec)
	/* input:
	 * x0: round key array, CTX
	 * x1: dst
	 * x2: src
	 * x3: iv (big endian, 128 bit)
	 * w4: nblocks
	 */
	SM4_PREPARE(x0)

	ld1		{RIV.16b}, [x3]

.Lcbc_dec_loop_8x:
	sub		w4, w4, #8
	tbnz		w4, #31, .Lcbc_dec_4x

	ld1		{v0.16b-v3.16b}, [x2], #64
	ld1		{v4.16b-v7.16b}, [x2], #64

	rev32		v8.16b, v0.16b
	rev32		v9.16b, v1.16b
	rev32		v10.16b, v2.16b
	rev32		v11.16b, v3.16b
	rev32		v12.16b, v4.16b
	rev32		v13.16b, v5.16b
	rev32		v14.16b, v6.16b
	rev32		v15.16b, v7.16b

	SM4_CRYPT_BLK8_BE(v8, v9, v10, v11, v12, v13, v14, v15)

	eor		v8.16b, v8.16b, RIV.16b
	eor		v9.16b, v9.16b, v0.16b
	eor		v10.16b, v10.16b, v1.16b
	eor		v11.16b, v11.16b, v2.16b
	eor		v12.16b, v12.16b, v3.16b
	eor		v13.16b, v13.16b, v4.16b
	eor		v14.16b, v14.16b, v5.16b
	eor		v15.16b, v15.16b, v6.16b

	st1		{v8.16b-v11.16b}, [x1], #64
	st1		{v12.16b-v15.16b}, [x1], #64

	mov		RIV.16b, v7.16b

	cbz		w4, .Lcbc_dec_end
	b		.Lcbc_dec_loop_8x

.Lcbc_dec_4x:
	add		w4, w4, #8
	cmp		w4, #4
	blt		.Lcbc_dec_loop_1x

	sub		w4, w4, #4

	ld1		{v0.16b-v3.16b}, [x2], #64

	rev32		v8.16b, v0.16b
	rev32		v9.16b, v1.16b
	rev32		v10.16b, v2.16b
	rev32		v11.16b, v3.16b

	SM4_CRYPT_BLK4_BE(v8, v9, v10, v11)

	eor		v8.16b, v8.16b, RIV.16b
	eor		v9.16b, v9.16b, v0.16b
	eor		v10.16b, v10.16b, v1.16b
	eor		v11.16b, v11.16b, v2.16b

	st1		{v8.16b-v11.16b}, [x1], #64

	mov		RIV.16b, v3.16b

	cbz		w4, .Lcbc_dec_end

.Lcbc_dec_loop_1x:
	sub		w4, w4, #1

	ld1		{v0.16b}, [x2], #16

	rev32		v8.16b, v0.16b

	SM4_CRYPT_BLK_BE(v8)

	eor		v8.16b, v8.16b, RIV.16b
	st1		{v8.16b}, [x1], #16

	mov		RIV.16b, v0.16b

	cbnz		w4, .Lcbc_dec_loop_1x

.Lcbc_dec_end:
	/* store new IV */
	st1		{RIV.16b}, [x3]

	ret
SYM_FUNC_END(sm4_ce_cbc_dec)

.align 3
SYM_FUNC_START(sm4_ce_cbc_cts_enc)
	/* input:
	 * x0: round key array, CTX
	 * x1: dst
	 * x2: src
	 * x3: iv (big endian, 128 bit)
	 * w4: nbytes
	 */
	SM4_PREPARE(x0)

	sub		w5, w4, #16
	uxtw		x5, w5

	ld1		{RIV.16b}, [x3]

	ld1		{v0.16b}, [x2]
	eor		RIV.16b, RIV.16b, v0.16b
	SM4_CRYPT_BLK(RIV)

	/* load permute table */
	adr_l		x6, .Lcts_permute_table
	add		x7, x6, #32
	add		x6, x6, x5
	sub		x7, x7, x5
	ld1		{v3.16b}, [x6]
	ld1		{v4.16b}, [x7]

	/* overlapping loads */
	add		x2, x2, x5
	ld1		{v1.16b}, [x2]

	/* create Cn from En-1 */
	tbl		v0.16b, {RIV.16b}, v3.16b
	/* padding Pn with zeros */
	tbl		v1.16b, {v1.16b}, v4.16b

	eor		v1.16b, v1.16b, RIV.16b
	SM4_CRYPT_BLK(v1)

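	/* v1 now holds C(n-1); v0 holds the shortened final block Cn */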
	/* overlapping stores */
	add		x5, x1, x5
	st1		{v0.16b}, [x5]
	st1		{v1.16b}, [x1]

	ret
SYM_FUNC_END(sm4_ce_cbc_cts_enc)

.align 3
SYM_FUNC_START(sm4_ce_cbc_cts_dec)
	/* input:
	 * x0: round key array, CTX
	 * x1: dst
	 * x2: src
	 * x3: iv (big endian, 128 bit)
	 * w4: nbytes
	 */
	SM4_PREPARE(x0)

	sub		w5, w4, #16
	uxtw		x5, w5

	ld1		{RIV.16b}, [x3]

	/* load permute table */
	adr_l		x6, .Lcts_permute_table
	add		x7, x6, #32
	add		x6, x6, x5
	sub		x7, x7, x5
	ld1		{v3.16b}, [x6]
	ld1		{v4.16b}, [x7]

	/* overlapping loads */
	ld1		{v0.16b}, [x2], x5
	ld1		{v1.16b}, [x2]

	SM4_CRYPT_BLK(v0)
	/* select the first Ln bytes of Xn to create Pn */
	tbl		v2.16b, {v0.16b}, v3.16b
	eor		v2.16b, v2.16b, v1.16b

	/* overwrite the first Ln bytes with Cn to create En-1 */
	tbx		v0.16b, {v1.16b}, v4.16b
	SM4_CRYPT_BLK(v0)
	eor		v0.16b, v0.16b, RIV.16b

	/* overlapping stores */
	add		x5, x1, x5
	st1		{v2.16b}, [x5]
	st1		{v0.16b}, [x1]

	ret
SYM_FUNC_END(sm4_ce_cbc_cts_dec)

.align 3
SYM_FUNC_START(sm4_ce_ctr_enc)
	/* input:
	 * x0: round key array, CTX
	 * x1: dst
	 * x2: src
	 * x3: ctr (big endian, 128 bit)
	 * w4: nblocks
	 */
	SM4_PREPARE(x0)

	ldp		x7, x8, [x3]
	rev		x7, x7
	rev		x8, x8

.Lctr_loop_8x:
	sub		w4, w4, #8
	tbnz		w4, #31, .Lctr_4x

/*
 * Build the next big-endian counter block in vctr from the 128-bit
 * counter held in x7 (high half) and x8 (low half), then increment
 * the counter with carry.
 */
#define inc_le128(vctr)					\
		mov		vctr.d[1], x8;		\
		mov		vctr.d[0], x7;		\
		adds		x8, x8, #1;		\
		rev64		vctr.16b, vctr.16b;	\
		adc		x7, x7, xzr;

	/* construct CTRs */
	inc_le128(v0)			/* +0 */
	inc_le128(v1)			/* +1 */
	inc_le128(v2)			/* +2 */
	inc_le128(v3)			/* +3 */
	inc_le128(v4)			/* +4 */
	inc_le128(v5)			/* +5 */
	inc_le128(v6)			/* +6 */
	inc_le128(v7)			/* +7 */

	ld1		{v8.16b-v11.16b}, [x2], #64
	ld1		{v12.16b-v15.16b}, [x2], #64

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v5.16b, v5.16b, v13.16b
	eor		v6.16b, v6.16b, v14.16b
	eor		v7.16b, v7.16b, v15.16b

	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	cbz		w4, .Lctr_end
	b		.Lctr_loop_8x

.Lctr_4x:
	add		w4, w4, #8
	cmp		w4, #4
	blt		.Lctr_loop_1x

	sub		w4, w4, #4

	/* construct CTRs */
	inc_le128(v0)			/* +0 */
	inc_le128(v1)			/* +1 */
	inc_le128(v2)			/* +2 */
	inc_le128(v3)			/* +3 */

	ld1		{v8.16b-v11.16b}, [x2], #64

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b

	st1		{v0.16b-v3.16b}, [x1], #64

	cbz		w4, .Lctr_end

.Lctr_loop_1x:
	sub		w4, w4, #1

	/* construct CTRs */
	inc_le128(v0)

	ld1		{v8.16b}, [x2], #16

	SM4_CRYPT_BLK(v0)

	eor		v0.16b, v0.16b, v8.16b
	st1		{v0.16b}, [x1], #16

	cbnz		w4, .Lctr_loop_1x

.Lctr_end:
	/* store new CTR */
	rev		x7, x7
	rev		x8, x8
	stp		x7, x8, [x3]

	ret
SYM_FUNC_END(sm4_ce_ctr_enc)


/*
 * Advance the XTS tweak: vt = vin * x in GF(2^128), using the
 * reduction constants {0x1, 0x87} prepared in RMASK.
 */
#define tweak_next(vt, vin, RTMP)				\
		sshr		RTMP.2d, vin.2d, #63;		\
		and		RTMP.16b, RTMP.16b, RMASK.16b;	\
		add		vt.2d, vin.2d, vin.2d;		\
		ext		RTMP.16b, RTMP.16b, RTMP.16b, #8; \
		eor		vt.16b, vt.16b, RTMP.16b;

.align 3
SYM_FUNC_START(sm4_ce_xts_enc)
	/* input:
	 * x0: round key array, CTX
	 * x1: dst
	 * x2: src
	 * x3: tweak (big endian, 128 bit)
	 * w4: nbytes
	 * x5: round key array for IV
	 */
	ld1		{v8.16b}, [x3]

	cbz		x5, .Lxts_enc_nofirst

	SM4_PREPARE(x5)

	/* Generate first tweak */
	SM4_CRYPT_BLK(v8)

.Lxts_enc_nofirst:
	SM4_PREPARE(x0)

	/* w4 = full blocks, keeping one back for CTS when there is a
	 * partial tail; x5 = number of tail bytes */
	ands		w5, w4, #15
	lsr		w4, w4, #4
	sub		w6, w4, #1
	csel		w4, w4, w6, eq
	uxtw		x5, w5

	/* RMASK = { 0x1, 0x0, 0x87, 0x0 }, the constants used by tweak_next */
	movi		RMASK.2s, #0x1
	movi		RTMP0.2s, #0x87
	uzp1		RMASK.4s, RMASK.4s, RTMP0.4s

	cbz		w4, .Lxts_enc_cts

.Lxts_enc_loop_8x:
	sub		w4, w4, #8
	tbnz		w4, #31, .Lxts_enc_4x

	tweak_next( v9,  v8, RTMP0)
	tweak_next(v10,  v9, RTMP1)
	tweak_next(v11, v10, RTMP2)
	tweak_next(v12, v11, RTMP3)
	tweak_next(v13, v12, RTMP0)
	tweak_next(v14, v13, RTMP1)
	tweak_next(v15, v14, RTMP2)

	ld1		{v0.16b-v3.16b}, [x2], #64
	ld1		{v4.16b-v7.16b}, [x2], #64
	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v5.16b, v5.16b, v13.16b
	eor		v6.16b, v6.16b, v14.16b
	eor		v7.16b, v7.16b, v15.16b

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v5.16b, v5.16b, v13.16b
	eor		v6.16b, v6.16b, v14.16b
	eor		v7.16b, v7.16b, v15.16b
	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	tweak_next(v8, v15, RTMP3)

	cbz		w4, .Lxts_enc_cts
	b		.Lxts_enc_loop_8x

.Lxts_enc_4x:
	add		w4, w4, #8
	cmp		w4, #4
	blt		.Lxts_enc_loop_1x

	sub		w4, w4, #4

	tweak_next( v9,  v8, RTMP0)
	tweak_next(v10,  v9, RTMP1)
	tweak_next(v11, v10, RTMP2)

	ld1		{v0.16b-v3.16b}, [x2], #64
	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	st1		{v0.16b-v3.16b}, [x1], #64

	tweak_next(v8, v11, RTMP3)

	cbz		w4, .Lxts_enc_cts

.Lxts_enc_loop_1x:
	sub		w4, w4, #1

	ld1		{v0.16b}, [x2], #16
	eor		v0.16b, v0.16b, v8.16b

	SM4_CRYPT_BLK(v0)

	eor		v0.16b, v0.16b, v8.16b
	st1		{v0.16b}, [x1], #16

	tweak_next(v8, v8, RTMP0)

	cbnz		w4, .Lxts_enc_loop_1x

.Lxts_enc_cts:
	cbz		x5, .Lxts_enc_end

	/* cipher text stealing */

	tweak_next(v9, v8, RTMP0)
	ld1		{v0.16b}, [x2]
	eor		v0.16b, v0.16b, v8.16b
	SM4_CRYPT_BLK(v0)
	eor		v0.16b, v0.16b, v8.16b

	/* load permute table */
	adr_l		x6, .Lcts_permute_table
	add		x7, x6, #32
	add		x6, x6, x5
	sub		x7, x7, x5
	ld1		{v3.16b}, [x6]
	ld1		{v4.16b}, [x7]

	/* overlapping loads */
	add		x2, x2, x5
	ld1		{v1.16b}, [x2]

	/* create Cn from En-1 */
	tbl		v2.16b, {v0.16b}, v3.16b
	/* padding Pn with En-1 at the end */
	tbx		v0.16b, {v1.16b}, v4.16b

	eor		v0.16b, v0.16b, v9.16b
	SM4_CRYPT_BLK(v0)
	eor		v0.16b, v0.16b, v9.16b


	/* overlapping stores */
	add		x5, x1, x5
	st1		{v2.16b}, [x5]
	st1		{v0.16b}, [x1]

	b		.Lxts_enc_ret

.Lxts_enc_end:
	/* store new tweak */
	st1		{v8.16b}, [x3]

.Lxts_enc_ret:
	ret
SYM_FUNC_END(sm4_ce_xts_enc)

.align 3
SYM_FUNC_START(sm4_ce_xts_dec)
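	/*
	 * Mirrors sm4_ce_xts_enc; only the ciphertext-stealing tail differs,
	 * applying the tweaks in the opposite order (v9 for the penultimate
	 * block, v8 for the stolen final block).
	 */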
	/* input:
	 * x0: round key array, CTX
	 * x1: dst
	 * x2: src
	 * x3: tweak (big endian, 128 bit)
	 * w4: nbytes
	 * x5: round key array for IV
	 */
	ld1		{v8.16b}, [x3]

	cbz		x5, .Lxts_dec_nofirst

	SM4_PREPARE(x5)

	/* Generate first tweak */
	SM4_CRYPT_BLK(v8)

.Lxts_dec_nofirst:
	SM4_PREPARE(x0)

	ands		w5, w4, #15
	lsr		w4, w4, #4
	sub		w6, w4, #1
	csel		w4, w4, w6, eq
	uxtw		x5, w5

	movi		RMASK.2s, #0x1
	movi		RTMP0.2s, #0x87
	uzp1		RMASK.4s, RMASK.4s, RTMP0.4s

	cbz		w4, .Lxts_dec_cts

.Lxts_dec_loop_8x:
	sub		w4, w4, #8
	tbnz		w4, #31, .Lxts_dec_4x

	tweak_next( v9,  v8, RTMP0)
	tweak_next(v10,  v9, RTMP1)
	tweak_next(v11, v10, RTMP2)
	tweak_next(v12, v11, RTMP3)
	tweak_next(v13, v12, RTMP0)
	tweak_next(v14, v13, RTMP1)
	tweak_next(v15, v14, RTMP2)

	ld1		{v0.16b-v3.16b}, [x2], #64
	ld1		{v4.16b-v7.16b}, [x2], #64
	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v5.16b, v5.16b, v13.16b
	eor		v6.16b, v6.16b, v14.16b
	eor		v7.16b, v7.16b, v15.16b

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v5.16b, v5.16b, v13.16b
	eor		v6.16b, v6.16b, v14.16b
	eor		v7.16b, v7.16b, v15.16b
	st1		{v0.16b-v3.16b}, [x1], #64
	st1		{v4.16b-v7.16b}, [x1], #64

	tweak_next(v8, v15, RTMP3)

	cbz		w4, .Lxts_dec_cts
	b		.Lxts_dec_loop_8x

.Lxts_dec_4x:
	add		w4, w4, #8
	cmp		w4, #4
	blt		.Lxts_dec_loop_1x

	sub		w4, w4, #4

	tweak_next( v9,  v8, RTMP0)
	tweak_next(v10,  v9, RTMP1)
	tweak_next(v11, v10, RTMP2)

	ld1		{v0.16b-v3.16b}, [x2], #64
	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor		v0.16b, v0.16b, v8.16b
	eor		v1.16b, v1.16b, v9.16b
	eor		v2.16b, v2.16b, v10.16b
	eor		v3.16b, v3.16b, v11.16b
	st1		{v0.16b-v3.16b}, [x1], #64

	tweak_next(v8, v11, RTMP3)

	cbz		w4, .Lxts_dec_cts

.Lxts_dec_loop_1x:
	sub		w4, w4, #1

	ld1		{v0.16b}, [x2], #16
	eor		v0.16b, v0.16b, v8.16b

	SM4_CRYPT_BLK(v0)

	eor		v0.16b, v0.16b, v8.16b
	st1		{v0.16b}, [x1], #16

	tweak_next(v8, v8, RTMP0)

	cbnz		w4, .Lxts_dec_loop_1x

.Lxts_dec_cts:
	cbz		x5, .Lxts_dec_end

	/* cipher text stealing */

	tweak_next(v9, v8, RTMP0)
	ld1		{v0.16b}, [x2]
	eor		v0.16b, v0.16b, v9.16b
	SM4_CRYPT_BLK(v0)
	eor		v0.16b, v0.16b, v9.16b

	/* load permute table */
	adr_l		x6, .Lcts_permute_table
	add		x7, x6, #32
	add		x6, x6, x5
	sub		x7, x7, x5
	ld1		{v3.16b}, [x6]
	ld1		{v4.16b}, [x7]

	/* overlapping loads */
	add		x2, x2, x5
	ld1		{v1.16b}, [x2]

	/* create Cn from En-1 */
	tbl		v2.16b, {v0.16b}, v3.16b
	/* padding Pn with En-1 at the end */
	tbx		v0.16b, {v1.16b}, v4.16b

	eor		v0.16b, v0.16b, v8.16b
	SM4_CRYPT_BLK(v0)
	eor		v0.16b, v0.16b, v8.16b


	/* overlapping stores */
	add		x5, x1, x5
	st1		{v2.16b}, [x5]
	st1		{v0.16b}, [x1]

	b		.Lxts_dec_ret

.Lxts_dec_end:
	/* store new tweak */
	st1		{v8.16b}, [x3]

.Lxts_dec_ret:
	ret
SYM_FUNC_END(sm4_ce_xts_dec)

.align 3
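/*
 * CBC-MAC style digest update: optionally encrypt the digest first
 * (enc_before), absorb full blocks with XOR + encrypt, and when
 * enc_after is zero leave the last block XORed in but not encrypted.
 */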
SYM_FUNC_START(sm4_ce_mac_update)
	/* input:
	 * x0: round key array, CTX
	 * x1: digest
	 * x2: src
	 * w3: nblocks
	 * w4: enc_before
	 * w5: enc_after
	 */
	SM4_PREPARE(x0)

	ld1		{RMAC.16b}, [x1]

	cbz		w4, .Lmac_update

	SM4_CRYPT_BLK(RMAC)

.Lmac_update:
	cbz		w3, .Lmac_ret

	sub		w6, w3, #1
	cmp		w5, wzr
	csel		w3, w3, w6, ne

	cbz		w3, .Lmac_end

.Lmac_loop_4x:
	cmp		w3, #4
	blt		.Lmac_loop_1x

	sub		w3, w3, #4

	ld1		{v0.16b-v3.16b}, [x2], #64

	eor		RMAC.16b, RMAC.16b, v0.16b
	SM4_CRYPT_BLK(RMAC)
	eor		RMAC.16b, RMAC.16b, v1.16b
	SM4_CRYPT_BLK(RMAC)
	eor		RMAC.16b, RMAC.16b, v2.16b
	SM4_CRYPT_BLK(RMAC)
	eor		RMAC.16b, RMAC.16b, v3.16b
	SM4_CRYPT_BLK(RMAC)

	cbz		w3, .Lmac_end
	b		.Lmac_loop_4x

.Lmac_loop_1x:
	sub		w3, w3, #1

	ld1		{v0.16b}, [x2], #16

	eor		RMAC.16b, RMAC.16b, v0.16b
	SM4_CRYPT_BLK(RMAC)

	cbnz		w3, .Lmac_loop_1x


.Lmac_end:
	cbnz		w5, .Lmac_ret

	ld1		{v0.16b}, [x2], #16
	eor		RMAC.16b, RMAC.16b, v0.16b

.Lmac_ret:
	st1		{RMAC.16b}, [x1]
	ret
SYM_FUNC_END(sm4_ce_mac_update)


	.section	".rodata", "a"
	.align 4
.Lbswap128_mask:
	.byte	0x0c, 0x0d, 0x0e, 0x0f, 0x08, 0x09, 0x0a, 0x0b
	.byte	0x04, 0x05, 0x06, 0x07, 0x00, 0x01, 0x02, 0x03

.Lcts_permute_table:
	.byte	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte	0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7
	.byte	0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf
	.byte	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff
	.byte	0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff