/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * SM4 Cipher Algorithm for ARMv8 NEON
 * as specified in
 * https://tools.ietf.org/id/draft-ribose-cfrg-sm4-10.html
 *
 * Copyright (C) 2022, Alibaba Group.
 * Copyright (C) 2022 Tianjia Zhang <tianjia.zhang@linux.alibaba.com>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

/* Register macros */

#define RTMP0	v8
#define RTMP1	v9
#define RTMP2	v10
#define RTMP3	v11

#define RTMP4	v12
#define RTMP5	v13
#define RTMP6	v14
#define RTMP7	v15

#define RX0	v12
#define RX1	v13
#define RKEY	v14
#define RIV	v15

/* Helper macros. */

#define SM4_PREPARE()	\
	adr_l x5, crypto_sm4_sbox;	\
	ld1 {v16.16b-v19.16b}, [x5], #64;	\
	ld1 {v20.16b-v23.16b}, [x5], #64;	\
	ld1 {v24.16b-v27.16b}, [x5], #64;	\
	ld1 {v28.16b-v31.16b}, [x5];

#define transpose_4x4(s0, s1, s2, s3)	\
	zip1 RTMP0.4s, s0.4s, s1.4s;	\
	zip1 RTMP1.4s, s2.4s, s3.4s;	\
	zip2 RTMP2.4s, s0.4s, s1.4s;	\
	zip2 RTMP3.4s, s2.4s, s3.4s;	\
	zip1 s0.2d, RTMP0.2d, RTMP1.2d;	\
	zip2 s1.2d, RTMP0.2d, RTMP1.2d;	\
	zip1 s2.2d, RTMP2.2d, RTMP3.2d;	\
	zip2 s3.2d, RTMP2.2d, RTMP3.2d;

#define transpose_4x4_2x(s0, s1, s2, s3, s4, s5, s6, s7)	\
	zip1 RTMP0.4s, s0.4s, s1.4s;	\
	zip1 RTMP1.4s, s2.4s, s3.4s;	\
	zip2 RTMP2.4s, s0.4s, s1.4s;	\
	zip2 RTMP3.4s, s2.4s, s3.4s;	\
	zip1 RTMP4.4s, s4.4s, s5.4s;	\
	zip1 RTMP5.4s, s6.4s, s7.4s;	\
	zip2 RTMP6.4s, s4.4s, s5.4s;	\
	zip2 RTMP7.4s, s6.4s, s7.4s;	\
	zip1 s0.2d, RTMP0.2d, RTMP1.2d;	\
	zip2 s1.2d, RTMP0.2d, RTMP1.2d;	\
	zip1 s2.2d, RTMP2.2d, RTMP3.2d;	\
	zip2 s3.2d, RTMP2.2d, RTMP3.2d;	\
	zip1 s4.2d, RTMP4.2d, RTMP5.2d;	\
	zip2 s5.2d, RTMP4.2d, RTMP5.2d;	\
	zip1 s6.2d, RTMP6.2d, RTMP7.2d;	\
	zip2 s7.2d, RTMP6.2d, RTMP7.2d;

#define rotate_clockwise_4x4(s0, s1, s2, s3)	\
	zip1 RTMP0.4s, s1.4s, s0.4s;	\
	zip2 RTMP1.4s, s1.4s, s0.4s;	\
	zip1 RTMP2.4s, s3.4s, s2.4s;	\
	zip2 RTMP3.4s, s3.4s, s2.4s;	\
	zip1 s0.2d, RTMP2.2d, RTMP0.2d;	\
	zip2 s1.2d, RTMP2.2d, RTMP0.2d;	\
	zip1 s2.2d, RTMP3.2d, RTMP1.2d;	\
	zip2 s3.2d, RTMP3.2d, RTMP1.2d;

#define rotate_clockwise_4x4_2x(s0, s1, s2, s3, s4, s5, s6, s7)	\
	zip1 RTMP0.4s, s1.4s, s0.4s;	\
	zip1 RTMP2.4s, s3.4s, s2.4s;	\
	zip2 RTMP1.4s, s1.4s, s0.4s;	\
	zip2 RTMP3.4s, s3.4s, s2.4s;	\
	zip1 RTMP4.4s, s5.4s, s4.4s;	\
	zip1 RTMP6.4s, s7.4s, s6.4s;	\
	zip2 RTMP5.4s, s5.4s, s4.4s;	\
	zip2 RTMP7.4s, s7.4s, s6.4s;	\
	zip1 s0.2d, RTMP2.2d, RTMP0.2d;	\
	zip2 s1.2d, RTMP2.2d, RTMP0.2d;	\
	zip1 s2.2d, RTMP3.2d, RTMP1.2d;	\
	zip2 s3.2d, RTMP3.2d, RTMP1.2d;	\
	zip1 s4.2d, RTMP6.2d, RTMP4.2d;	\
	zip2 s5.2d, RTMP6.2d, RTMP4.2d;	\
	zip1 s6.2d, RTMP7.2d, RTMP5.2d;	\
	zip2 s7.2d, RTMP7.2d, RTMP5.2d;

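/*
 * Reference for the round macros below, as C-like pseudocode (a sketch
 * only): one SM4 round updates the first state word as
 *
 *	x = s1 ^ s2 ^ s3 ^ rk;
 *	x = sbox(x);		(applied byte-wise via tbl/tbx)
 *	s0 ^= x ^ rol32(x, 2) ^ rol32(x, 10) ^ rol32(x, 18) ^ rol32(x, 24);
 *
 * The macros evaluate x ^ rol32(x, 8) ^ rol32(x, 16) first and then fold
 * in its rotation by 2 together with rol32(x, 24), which expands to the
 * same terms.
 */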
#define ROUND4(round, s0, s1, s2, s3)	\
	dup RX0.4s, RKEY.s[round];	\
	/* rk ^ s1 ^ s2 ^ s3 */	\
	eor RTMP1.16b, s2.16b, s3.16b;	\
	eor RX0.16b, RX0.16b, s1.16b;	\
	eor RX0.16b, RX0.16b, RTMP1.16b;	\
	\
	/* sbox, non-linear part */	\
	movi RTMP3.16b, #64;	/* sizeof(sbox) / 4 */	\
	tbl RTMP0.16b, {v16.16b-v19.16b}, RX0.16b;	\
	sub RX0.16b, RX0.16b, RTMP3.16b;	\
	tbx RTMP0.16b, {v20.16b-v23.16b}, RX0.16b;	\
	sub RX0.16b, RX0.16b, RTMP3.16b;	\
	tbx RTMP0.16b, {v24.16b-v27.16b}, RX0.16b;	\
	sub RX0.16b, RX0.16b, RTMP3.16b;	\
	tbx RTMP0.16b, {v28.16b-v31.16b}, RX0.16b;	\
	\
	/* linear part */	\
	shl RTMP1.4s, RTMP0.4s, #8;	\
	shl RTMP2.4s, RTMP0.4s, #16;	\
	shl RTMP3.4s, RTMP0.4s, #24;	\
	sri RTMP1.4s, RTMP0.4s, #(32-8);	\
	sri RTMP2.4s, RTMP0.4s, #(32-16);	\
	sri RTMP3.4s, RTMP0.4s, #(32-24);	\
	/* RTMP1 = x ^ rol32(x, 8) ^ rol32(x, 16) */	\
	eor RTMP1.16b, RTMP1.16b, RTMP0.16b;	\
	eor RTMP1.16b, RTMP1.16b, RTMP2.16b;	\
	/* RTMP3 = x ^ rol32(x, 24) ^ rol32(RTMP1, 2) */	\
	eor RTMP3.16b, RTMP3.16b, RTMP0.16b;	\
	shl RTMP2.4s, RTMP1.4s, #2;	\
	sri RTMP2.4s, RTMP1.4s, #(32-2);	\
	eor RTMP3.16b, RTMP3.16b, RTMP2.16b;	\
	/* s0 ^= RTMP3 */	\
	eor s0.16b, s0.16b, RTMP3.16b;

#define SM4_CRYPT_BLK4_BE(b0, b1, b2, b3)	\
	mov x6, 8;	\
4:	\
	ld1 {RKEY.4s}, [x0], #16;	\
	subs x6, x6, #1;	\
	\
	ROUND4(0, b0, b1, b2, b3);	\
	ROUND4(1, b1, b2, b3, b0);	\
	ROUND4(2, b2, b3, b0, b1);	\
	ROUND4(3, b3, b0, b1, b2);	\
	\
	bne 4b;	\
	\
	rev32 b0.16b, b0.16b;	\
	rev32 b1.16b, b1.16b;	\
	rev32 b2.16b, b2.16b;	\
	rev32 b3.16b, b3.16b;	\
	\
	rotate_clockwise_4x4(b0, b1, b2, b3);	\
	\
	/* repoint to rkey */	\
	sub x0, x0, #128;

#define SM4_CRYPT_BLK4(b0, b1, b2, b3)	\
	rev32 b0.16b, b0.16b;	\
	rev32 b1.16b, b1.16b;	\
	rev32 b2.16b, b2.16b;	\
	rev32 b3.16b, b3.16b;	\
	SM4_CRYPT_BLK4_BE(b0, b1, b2, b3);

#define ROUND8(round, s0, s1, s2, s3, t0, t1, t2, t3)	\
	/* rk ^ s1 ^ s2 ^ s3 */	\
	dup RX0.4s, RKEY.s[round];	\
	eor RTMP0.16b, s2.16b, s3.16b;	\
	mov RX1.16b, RX0.16b;	\
	eor RTMP1.16b, t2.16b, t3.16b;	\
	eor RX0.16b, RX0.16b, s1.16b;	\
	eor RX1.16b, RX1.16b, t1.16b;	\
	eor RX0.16b, RX0.16b, RTMP0.16b;	\
	eor RX1.16b, RX1.16b, RTMP1.16b;	\
	\
	/* sbox, non-linear part */	\
	movi RTMP3.16b, #64;	/* sizeof(sbox) / 4 */	\
	tbl RTMP0.16b, {v16.16b-v19.16b}, RX0.16b;	\
	tbl RTMP1.16b, {v16.16b-v19.16b}, RX1.16b;	\
	sub RX0.16b, RX0.16b, RTMP3.16b;	\
	sub RX1.16b, RX1.16b, RTMP3.16b;	\
	tbx RTMP0.16b, {v20.16b-v23.16b}, RX0.16b;	\
	tbx RTMP1.16b, {v20.16b-v23.16b}, RX1.16b;	\
	sub RX0.16b, RX0.16b, RTMP3.16b;	\
	sub RX1.16b, RX1.16b, RTMP3.16b;	\
	tbx RTMP0.16b, {v24.16b-v27.16b}, RX0.16b;	\
	tbx RTMP1.16b, {v24.16b-v27.16b}, RX1.16b;	\
	sub RX0.16b, RX0.16b, RTMP3.16b;	\
	sub RX1.16b, RX1.16b, RTMP3.16b;	\
	tbx RTMP0.16b, {v28.16b-v31.16b}, RX0.16b;	\
	tbx RTMP1.16b, {v28.16b-v31.16b}, RX1.16b;	\
	\
	/* linear part */	\
	shl RX0.4s, RTMP0.4s, #8;	\
	shl RX1.4s, RTMP1.4s, #8;	\
	shl RTMP2.4s, RTMP0.4s, #16;	\
	shl RTMP3.4s, RTMP1.4s, #16;	\
	sri RX0.4s, RTMP0.4s, #(32 - 8);	\
	sri RX1.4s, RTMP1.4s, #(32 - 8);	\
	sri RTMP2.4s, RTMP0.4s, #(32 - 16);	\
	sri RTMP3.4s, RTMP1.4s, #(32 - 16);	\
	/* RX = x ^ rol32(x, 8) ^ rol32(x, 16) */	\
	eor RX0.16b, RX0.16b, RTMP0.16b;	\
	eor RX1.16b, RX1.16b, RTMP1.16b;	\
	eor RX0.16b, RX0.16b, RTMP2.16b;	\
	eor RX1.16b, RX1.16b, RTMP3.16b;	\
	/* RTMP0/1 ^= x ^ rol32(x, 24) ^ rol32(RX, 2) */	\
	shl RTMP2.4s, RTMP0.4s, #24;	\
	shl RTMP3.4s, RTMP1.4s, #24;	\
	sri RTMP2.4s, RTMP0.4s, #(32 - 24);	\
	sri RTMP3.4s, RTMP1.4s, #(32 - 24);	\
	eor RTMP0.16b, RTMP0.16b, RTMP2.16b;	\
	eor RTMP1.16b, RTMP1.16b, RTMP3.16b;	\
	shl RTMP2.4s, RX0.4s, #2;	\
	shl RTMP3.4s, RX1.4s, #2;	\
	sri RTMP2.4s, RX0.4s, #(32 - 2);	\
	sri RTMP3.4s, RX1.4s, #(32 - 2);	\
	eor RTMP0.16b, RTMP0.16b, RTMP2.16b;	\
	eor RTMP1.16b, RTMP1.16b, RTMP3.16b;	\
	/* s0/t0 ^= RTMP0/1 */	\
	eor s0.16b, s0.16b, RTMP0.16b;	\
	eor t0.16b, t0.16b, RTMP1.16b;

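/*
 * The 8-block macros below interleave two independent groups of four
 * blocks (via RX0/RX1) against the same broadcast round key, giving the
 * NEON pipeline independent work to overlap.  SM4_CRYPT_BLK8_norotate
 * leaves the state in the word-sliced order produced by ld4/transpose;
 * SM4_CRYPT_BLK8 additionally applies rotate_clockwise_4x4_2x, whose
 * temporaries (RTMP4-RTMP7) overlap RIV.  CBC decryption therefore uses
 * the _norotate variant and rotates each half with rotate_clockwise_4x4
 * to keep the IV register intact.
 */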
#define SM4_CRYPT_BLK8_norotate(b0, b1, b2, b3, b4, b5, b6, b7)	\
	rev32 b0.16b, b0.16b;	\
	rev32 b1.16b, b1.16b;	\
	rev32 b2.16b, b2.16b;	\
	rev32 b3.16b, b3.16b;	\
	rev32 b4.16b, b4.16b;	\
	rev32 b5.16b, b5.16b;	\
	rev32 b6.16b, b6.16b;	\
	rev32 b7.16b, b7.16b;	\
	\
	mov x6, 8;	\
8:	\
	ld1 {RKEY.4s}, [x0], #16;	\
	subs x6, x6, #1;	\
	\
	ROUND8(0, b0, b1, b2, b3, b4, b5, b6, b7);	\
	ROUND8(1, b1, b2, b3, b0, b5, b6, b7, b4);	\
	ROUND8(2, b2, b3, b0, b1, b6, b7, b4, b5);	\
	ROUND8(3, b3, b0, b1, b2, b7, b4, b5, b6);	\
	\
	bne 8b;	\
	\
	rev32 b0.16b, b0.16b;	\
	rev32 b1.16b, b1.16b;	\
	rev32 b2.16b, b2.16b;	\
	rev32 b3.16b, b3.16b;	\
	rev32 b4.16b, b4.16b;	\
	rev32 b5.16b, b5.16b;	\
	rev32 b6.16b, b6.16b;	\
	rev32 b7.16b, b7.16b;	\
	\
	/* repoint to rkey */	\
	sub x0, x0, #128;

#define SM4_CRYPT_BLK8(b0, b1, b2, b3, b4, b5, b6, b7)	\
	SM4_CRYPT_BLK8_norotate(b0, b1, b2, b3, b4, b5, b6, b7);	\
	rotate_clockwise_4x4_2x(b0, b1, b2, b3, b4, b5, b6, b7);	\


.align 3
SYM_FUNC_START(sm4_neon_crypt)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   w3: nblocks
	 */
	SM4_PREPARE()

.Lcrypt_loop_8x:
	sub w3, w3, #8
	tbnz w3, #31, .Lcrypt_4x

	ld4 {v0.4s-v3.4s}, [x2], #64
	ld4 {v4.4s-v7.4s}, [x2], #64

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	st1 {v0.16b-v3.16b}, [x1], #64
	st1 {v4.16b-v7.16b}, [x1], #64

	cbz w3, .Lcrypt_end
	b .Lcrypt_loop_8x

.Lcrypt_4x:
	add w3, w3, #8
	cmp w3, #4
	blt .Lcrypt_tail

	sub w3, w3, #4

	ld4 {v0.4s-v3.4s}, [x2], #64

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	st1 {v0.16b-v3.16b}, [x1], #64

	cbz w3, .Lcrypt_end

.Lcrypt_tail:
	cmp w3, #2
	ld1 {v0.16b}, [x2], #16
	blt .Lcrypt_tail_load_done
	ld1 {v1.16b}, [x2], #16
	beq .Lcrypt_tail_load_done
	ld1 {v2.16b}, [x2], #16

.Lcrypt_tail_load_done:
	transpose_4x4(v0, v1, v2, v3)

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	cmp w3, #2
	st1 {v0.16b}, [x1], #16
	blt .Lcrypt_end
	st1 {v1.16b}, [x1], #16
	beq .Lcrypt_end
	st1 {v2.16b}, [x1], #16

.Lcrypt_end:
	ret
SYM_FUNC_END(sm4_neon_crypt)

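/*
 * CBC decryption: each output block is the decryption of C[i] XORed with
 * the previous ciphertext block (the caller's IV for the first block).
 * RIV carries that previous ciphertext block between iterations; in the
 * eight-block path the ciphertext is reloaded from [x2] for the chaining
 * XOR, and the last ciphertext block is stored back through x3 as the
 * new IV.
 */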
.align 3
SYM_FUNC_START(sm4_neon_cbc_dec)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: iv (big endian, 128 bit)
	 *   w4: nblocks
	 */
	SM4_PREPARE()

	ld1 {RIV.16b}, [x3]

.Lcbc_dec_loop_8x:
	sub w4, w4, #8
	tbnz w4, #31, .Lcbc_dec_4x

	ld4 {v0.4s-v3.4s}, [x2], #64
	ld4 {v4.4s-v7.4s}, [x2]

	SM4_CRYPT_BLK8_norotate(v0, v1, v2, v3, v4, v5, v6, v7)

	/* Avoid overwriting the RIV register */
	rotate_clockwise_4x4(v0, v1, v2, v3)
	rotate_clockwise_4x4(v4, v5, v6, v7)

	sub x2, x2, #64

	eor v0.16b, v0.16b, RIV.16b

	ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64
	ld1 {RTMP4.16b-RTMP7.16b}, [x2], #64

	eor v1.16b, v1.16b, RTMP0.16b
	eor v2.16b, v2.16b, RTMP1.16b
	eor v3.16b, v3.16b, RTMP2.16b
	eor v4.16b, v4.16b, RTMP3.16b
	eor v5.16b, v5.16b, RTMP4.16b
	eor v6.16b, v6.16b, RTMP5.16b
	eor v7.16b, v7.16b, RTMP6.16b

	mov RIV.16b, RTMP7.16b

	st1 {v0.16b-v3.16b}, [x1], #64
	st1 {v4.16b-v7.16b}, [x1], #64

	cbz w4, .Lcbc_dec_end
	b .Lcbc_dec_loop_8x

.Lcbc_dec_4x:
	add w4, w4, #8
	cmp w4, #4
	blt .Lcbc_dec_tail

	sub w4, w4, #4

	ld1 {v0.16b-v3.16b}, [x2], #64

	rev32 v4.16b, v0.16b
	rev32 v5.16b, v1.16b
	rev32 v6.16b, v2.16b
	rev32 v7.16b, v3.16b

	transpose_4x4(v4, v5, v6, v7)

	SM4_CRYPT_BLK4_BE(v4, v5, v6, v7)

	eor v4.16b, v4.16b, RIV.16b
	eor v5.16b, v5.16b, v0.16b
	eor v6.16b, v6.16b, v1.16b
	eor v7.16b, v7.16b, v2.16b

	mov RIV.16b, v3.16b

	st1 {v4.16b-v7.16b}, [x1], #64

	cbz w4, .Lcbc_dec_end

.Lcbc_dec_tail:
	cmp w4, #2
	ld1 {v0.16b}, [x2], #16
	blt .Lcbc_dec_tail_load_done
	ld1 {v1.16b}, [x2], #16
	beq .Lcbc_dec_tail_load_done
	ld1 {v2.16b}, [x2], #16

.Lcbc_dec_tail_load_done:
	rev32 v4.16b, v0.16b
	rev32 v5.16b, v1.16b
	rev32 v6.16b, v2.16b

	transpose_4x4(v4, v5, v6, v7)

	SM4_CRYPT_BLK4_BE(v4, v5, v6, v7)

	cmp w4, #2
	eor v4.16b, v4.16b, RIV.16b
	mov RIV.16b, v0.16b
	st1 {v4.16b}, [x1], #16
	blt .Lcbc_dec_end

	eor v5.16b, v5.16b, v0.16b
	mov RIV.16b, v1.16b
	st1 {v5.16b}, [x1], #16
	beq .Lcbc_dec_end

	eor v6.16b, v6.16b, v1.16b
	mov RIV.16b, v2.16b
	st1 {v6.16b}, [x1], #16

.Lcbc_dec_end:
	/* store new IV */
	st1 {RIV.16b}, [x3]

	ret
SYM_FUNC_END(sm4_neon_cbc_dec)

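/*
 * CTR mode: the 128-bit big-endian counter at [x3] is kept in x7 (high
 * 64 bits) and x8 (low 64 bits) in CPU byte order, incremented per block
 * with a 64-bit add plus carry, and converted back to a big-endian vector
 * by inc_le128 below.  Each counter block is encrypted and XORed with the
 * source data, and the advanced counter is written back to [x3] on exit.
 */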
.align 3
SYM_FUNC_START(sm4_neon_ctr_crypt)
	/* input:
	 *   x0: round key array, CTX
	 *   x1: dst
	 *   x2: src
	 *   x3: ctr (big endian, 128 bit)
	 *   w4: nblocks
	 */
	SM4_PREPARE()

	ldp x7, x8, [x3]
	rev x7, x7
	rev x8, x8

.Lctr_crypt_loop_8x:
	sub w4, w4, #8
	tbnz w4, #31, .Lctr_crypt_4x

#define inc_le128(vctr)	\
	mov vctr.d[1], x8;	\
	mov vctr.d[0], x7;	\
	adds x8, x8, #1;	\
	rev64 vctr.16b, vctr.16b;	\
	adc x7, x7, xzr;

	/* construct CTRs */
	inc_le128(v0)			/* +0 */
	inc_le128(v1)			/* +1 */
	inc_le128(v2)			/* +2 */
	inc_le128(v3)			/* +3 */
	inc_le128(v4)			/* +4 */
	inc_le128(v5)			/* +5 */
	inc_le128(v6)			/* +6 */
	inc_le128(v7)			/* +7 */

	transpose_4x4_2x(v0, v1, v2, v3, v4, v5, v6, v7)

	SM4_CRYPT_BLK8(v0, v1, v2, v3, v4, v5, v6, v7)

	ld1 {RTMP0.16b-RTMP3.16b}, [x2], #64
	ld1 {RTMP4.16b-RTMP7.16b}, [x2], #64

	eor v0.16b, v0.16b, RTMP0.16b
	eor v1.16b, v1.16b, RTMP1.16b
	eor v2.16b, v2.16b, RTMP2.16b
	eor v3.16b, v3.16b, RTMP3.16b
	eor v4.16b, v4.16b, RTMP4.16b
	eor v5.16b, v5.16b, RTMP5.16b
	eor v6.16b, v6.16b, RTMP6.16b
	eor v7.16b, v7.16b, RTMP7.16b

	st1 {v0.16b-v3.16b}, [x1], #64
	st1 {v4.16b-v7.16b}, [x1], #64

	cbz w4, .Lctr_crypt_end
	b .Lctr_crypt_loop_8x

.Lctr_crypt_4x:
	add w4, w4, #8
	cmp w4, #4
	blt .Lctr_crypt_tail

	sub w4, w4, #4

	/* construct CTRs */
	inc_le128(v0)			/* +0 */
	inc_le128(v1)			/* +1 */
	inc_le128(v2)			/* +2 */
	inc_le128(v3)			/* +3 */

	ld1 {v4.16b-v7.16b}, [x2], #64

	transpose_4x4(v0, v1, v2, v3)

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	eor v0.16b, v0.16b, v4.16b
	eor v1.16b, v1.16b, v5.16b
	eor v2.16b, v2.16b, v6.16b
	eor v3.16b, v3.16b, v7.16b

	st1 {v0.16b-v3.16b}, [x1], #64

	cbz w4, .Lctr_crypt_end

.Lctr_crypt_tail:
	/* inc_le128 will change the sign bit */
	ld1 {v4.16b}, [x2], #16
	inc_le128(v0)
	cmp w4, #2
	blt .Lctr_crypt_tail_load_done

	ld1 {v5.16b}, [x2], #16
	inc_le128(v1)
	cmp w4, #2
	beq .Lctr_crypt_tail_load_done

	ld1 {v6.16b}, [x2], #16
	inc_le128(v2)

.Lctr_crypt_tail_load_done:
	transpose_4x4(v0, v1, v2, v3)

	SM4_CRYPT_BLK4(v0, v1, v2, v3)

	cmp w4, #2

	eor v0.16b, v0.16b, v4.16b
	st1 {v0.16b}, [x1], #16
	blt .Lctr_crypt_end

	eor v1.16b, v1.16b, v5.16b
	st1 {v1.16b}, [x1], #16
	beq .Lctr_crypt_end

	eor v2.16b, v2.16b, v6.16b
	st1 {v2.16b}, [x1], #16

.Lctr_crypt_end:
	/* store new CTR */
	rev x7, x7
	rev x8, x8
	stp x7, x8, [x3]

	ret
SYM_FUNC_END(sm4_neon_ctr_crypt)