1/* Do not modify. This file is auto-generated from vpsm4_ex-armv8.pl. */ 2// Copyright 2022-2025 The OpenSSL Project Authors. All Rights Reserved. 3// 4// Licensed under the Apache License 2.0 (the "License"). You may not use 5// this file except in compliance with the License. You can obtain a copy 6// in the file LICENSE in the source distribution or at 7// https://www.openssl.org/source/license.html 8 9// 10// This module implements SM4 with ASIMD and AESE on AARCH64 11// 12// Dec 2022 13// 14 15// $output is the last argument if it looks like a file (it has an extension) 16// $flavour is the first argument if it doesn't look like a file 17#include "arm_arch.h" 18.arch armv8-a+crypto 19.text 20 21.type _vpsm4_ex_consts,%object 22.align 7 23_vpsm4_ex_consts: 24.Lck: 25.long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269 26.long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9 27.long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249 28.long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9 29.long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229 30.long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299 31.long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209 32.long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279 33.Lfk: 34.quad 0x56aa3350a3b1bac6,0xb27022dc677d9197 35.Lshuffles: 36.quad 0x0B0A090807060504,0x030201000F0E0D0C 37.Lxts_magic: 38.quad 0x0101010101010187,0x0101010101010101 39.Lsbox_magic: 40.quad 0x0b0e0104070a0d00,0x0306090c0f020508 41.quad 0x62185a2042387a00,0x22581a6002783a40 42.quad 0x15df62a89e54e923,0xc10bb67c4a803df7 43.quad 0xb9aa6b78c1d21300,0x1407c6d56c7fbead 44.quad 0x6404462679195b3b,0xe383c1a1fe9edcbc 45.quad 0x0f0f0f0f0f0f0f0f,0x0f0f0f0f0f0f0f0f 46 47.size _vpsm4_ex_consts,.-_vpsm4_ex_consts 48.type _vpsm4_ex_set_key,%function 49.align 4 50_vpsm4_ex_set_key: 51 AARCH64_VALID_CALL_TARGET 52 ld1 {v5.4s},[x0] 53 adrp x9, .Lsbox_magic 54 ldr q26, [x9, #:lo12:.Lsbox_magic] 55 ldr q27, [x9, #:lo12:.Lsbox_magic+16] 56 ldr q28, [x9, #:lo12:.Lsbox_magic+32] 57 ldr q29, [x9, #:lo12:.Lsbox_magic+48] 58 ldr q30, [x9, #:lo12:.Lsbox_magic+64] 59 ldr q31, [x9, #:lo12:.Lsbox_magic+80] 60#ifndef __AARCH64EB__ 61 rev32 v5.16b,v5.16b 62#endif 63 adrp x5,.Lshuffles 64 add x5,x5,#:lo12:.Lshuffles 65 ld1 {v7.2d},[x5] 66 adrp x5,.Lfk 67 add x5,x5,#:lo12:.Lfk 68 ld1 {v6.2d},[x5] 69 eor v5.16b,v5.16b,v6.16b 70 mov x6,#32 71 adrp x5,.Lck 72 add x5,x5,#:lo12:.Lck 73 movi v0.16b,#64 74 cbnz w2,1f 75 add x1,x1,124 761: 77 mov w7,v5.s[1] 78 ldr w8,[x5],#4 79 eor w8,w8,w7 80 mov w7,v5.s[2] 81 eor w8,w8,w7 82 mov w7,v5.s[3] 83 eor w8,w8,w7 84 // optimize sbox using AESE instruction 85 mov v4.s[0],w8 86 tbl v0.16b, {v4.16b}, v26.16b 87 ushr v2.16b, v0.16b, 4 88 and v0.16b, v0.16b, v31.16b 89 tbl v0.16b, {v28.16b}, v0.16b 90 tbl v2.16b, {v27.16b}, v2.16b 91 eor v0.16b, v0.16b, v2.16b 92 eor v1.16b, v1.16b, v1.16b 93 aese v0.16b,v1.16b 94 ushr v2.16b, v0.16b, 4 95 and v0.16b, v0.16b, v31.16b 96 tbl v0.16b, {v30.16b}, v0.16b 97 tbl v2.16b, {v29.16b}, v2.16b 98 eor v0.16b, v0.16b, v2.16b 99 mov w7,v0.s[0] 100 eor w8,w7,w7,ror #19 101 eor w8,w8,w7,ror #9 102 mov w7,v5.s[0] 103 eor w8,w8,w7 104 mov v5.s[0],w8 105 cbz w2,2f 106 str w8,[x1],#4 107 b 3f 1082: 109 str w8,[x1],#-4 1103: 111 tbl v5.16b,{v5.16b},v7.16b 112 subs x6,x6,#1 113 b.ne 1b 114 ret 115.size _vpsm4_ex_set_key,.-_vpsm4_ex_set_key 116.type _vpsm4_ex_enc_4blks,%function 117.align 4 118_vpsm4_ex_enc_4blks: 119 AARCH64_VALID_CALL_TARGET 120 mov x10,x3 121 mov w11,#8 12210: 123 ldp w7,w8,[x10],8 124 dup v12.4s,w7 125 dup v13.4s,w8 126 127 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 128 eor v14.16b,v6.16b,v7.16b 129 eor v12.16b,v5.16b,v12.16b 130 eor v12.16b,v14.16b,v12.16b 131 // optimize sbox using AESE instruction 132 tbl v0.16b, {v12.16b}, v26.16b 133 ushr v24.16b, v0.16b, 4 134 and v0.16b, v0.16b, v31.16b 135 tbl v0.16b, {v28.16b}, v0.16b 136 tbl v24.16b, {v27.16b}, v24.16b 137 eor v0.16b, v0.16b, v24.16b 138 eor v1.16b, v1.16b, v1.16b 139 aese v0.16b,v1.16b 140 ushr v24.16b, v0.16b, 4 141 and v0.16b, v0.16b, v31.16b 142 tbl v0.16b, {v30.16b}, v0.16b 143 tbl v24.16b, {v29.16b}, v24.16b 144 eor v0.16b, v0.16b, v24.16b 145 mov v12.16b,v0.16b 146 147 // linear transformation 148 ushr v0.4s,v12.4s,32-2 149 ushr v1.4s,v12.4s,32-10 150 ushr v2.4s,v12.4s,32-18 151 ushr v3.4s,v12.4s,32-24 152 sli v0.4s,v12.4s,2 153 sli v1.4s,v12.4s,10 154 sli v2.4s,v12.4s,18 155 sli v3.4s,v12.4s,24 156 eor v24.16b,v0.16b,v12.16b 157 eor v24.16b,v24.16b,v1.16b 158 eor v12.16b,v2.16b,v3.16b 159 eor v12.16b,v12.16b,v24.16b 160 eor v4.16b,v4.16b,v12.16b 161 162 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 163 eor v14.16b,v14.16b,v4.16b 164 eor v13.16b,v14.16b,v13.16b 165 // optimize sbox using AESE instruction 166 tbl v0.16b, {v13.16b}, v26.16b 167 ushr v24.16b, v0.16b, 4 168 and v0.16b, v0.16b, v31.16b 169 tbl v0.16b, {v28.16b}, v0.16b 170 tbl v24.16b, {v27.16b}, v24.16b 171 eor v0.16b, v0.16b, v24.16b 172 eor v1.16b, v1.16b, v1.16b 173 aese v0.16b,v1.16b 174 ushr v24.16b, v0.16b, 4 175 and v0.16b, v0.16b, v31.16b 176 tbl v0.16b, {v30.16b}, v0.16b 177 tbl v24.16b, {v29.16b}, v24.16b 178 eor v0.16b, v0.16b, v24.16b 179 mov v13.16b,v0.16b 180 181 // linear transformation 182 ushr v0.4s,v13.4s,32-2 183 ushr v1.4s,v13.4s,32-10 184 ushr v2.4s,v13.4s,32-18 185 ushr v3.4s,v13.4s,32-24 186 sli v0.4s,v13.4s,2 187 sli v1.4s,v13.4s,10 188 sli v2.4s,v13.4s,18 189 sli v3.4s,v13.4s,24 190 eor v24.16b,v0.16b,v13.16b 191 eor v24.16b,v24.16b,v1.16b 192 eor v13.16b,v2.16b,v3.16b 193 eor v13.16b,v13.16b,v24.16b 194 ldp w7,w8,[x10],8 195 eor v5.16b,v5.16b,v13.16b 196 197 dup v12.4s,w7 198 dup v13.4s,w8 199 200 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 201 eor v14.16b,v4.16b,v5.16b 202 eor v12.16b,v7.16b,v12.16b 203 eor v12.16b,v14.16b,v12.16b 204 // optimize sbox using AESE instruction 205 tbl v0.16b, {v12.16b}, v26.16b 206 ushr v24.16b, v0.16b, 4 207 and v0.16b, v0.16b, v31.16b 208 tbl v0.16b, {v28.16b}, v0.16b 209 tbl v24.16b, {v27.16b}, v24.16b 210 eor v0.16b, v0.16b, v24.16b 211 eor v1.16b, v1.16b, v1.16b 212 aese v0.16b,v1.16b 213 ushr v24.16b, v0.16b, 4 214 and v0.16b, v0.16b, v31.16b 215 tbl v0.16b, {v30.16b}, v0.16b 216 tbl v24.16b, {v29.16b}, v24.16b 217 eor v0.16b, v0.16b, v24.16b 218 mov v12.16b,v0.16b 219 220 // linear transformation 221 ushr v0.4s,v12.4s,32-2 222 ushr v1.4s,v12.4s,32-10 223 ushr v2.4s,v12.4s,32-18 224 ushr v3.4s,v12.4s,32-24 225 sli v0.4s,v12.4s,2 226 sli v1.4s,v12.4s,10 227 sli v2.4s,v12.4s,18 228 sli v3.4s,v12.4s,24 229 eor v24.16b,v0.16b,v12.16b 230 eor v24.16b,v24.16b,v1.16b 231 eor v12.16b,v2.16b,v3.16b 232 eor v12.16b,v12.16b,v24.16b 233 eor v6.16b,v6.16b,v12.16b 234 235 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 236 eor v14.16b,v14.16b,v6.16b 237 eor v13.16b,v14.16b,v13.16b 238 // optimize sbox using AESE instruction 239 tbl v0.16b, {v13.16b}, v26.16b 240 ushr v24.16b, v0.16b, 4 241 and v0.16b, v0.16b, v31.16b 242 tbl v0.16b, {v28.16b}, v0.16b 243 tbl v24.16b, {v27.16b}, v24.16b 244 eor v0.16b, v0.16b, v24.16b 245 eor v1.16b, v1.16b, v1.16b 246 aese v0.16b,v1.16b 247 ushr v24.16b, v0.16b, 4 248 and v0.16b, v0.16b, v31.16b 249 tbl v0.16b, {v30.16b}, v0.16b 250 tbl v24.16b, {v29.16b}, v24.16b 251 eor v0.16b, v0.16b, v24.16b 252 mov v13.16b,v0.16b 253 254 // linear transformation 255 ushr v0.4s,v13.4s,32-2 256 ushr v1.4s,v13.4s,32-10 257 ushr v2.4s,v13.4s,32-18 258 ushr v3.4s,v13.4s,32-24 259 sli v0.4s,v13.4s,2 260 sli v1.4s,v13.4s,10 261 sli v2.4s,v13.4s,18 262 sli v3.4s,v13.4s,24 263 eor v24.16b,v0.16b,v13.16b 264 eor v24.16b,v24.16b,v1.16b 265 eor v13.16b,v2.16b,v3.16b 266 eor v13.16b,v13.16b,v24.16b 267 eor v7.16b,v7.16b,v13.16b 268 subs w11,w11,#1 269 b.ne 10b 270#ifndef __AARCH64EB__ 271 rev32 v3.16b,v4.16b 272#else 273 mov v3.16b,v4.16b 274#endif 275#ifndef __AARCH64EB__ 276 rev32 v2.16b,v5.16b 277#else 278 mov v2.16b,v5.16b 279#endif 280#ifndef __AARCH64EB__ 281 rev32 v1.16b,v6.16b 282#else 283 mov v1.16b,v6.16b 284#endif 285#ifndef __AARCH64EB__ 286 rev32 v0.16b,v7.16b 287#else 288 mov v0.16b,v7.16b 289#endif 290 ret 291.size _vpsm4_ex_enc_4blks,.-_vpsm4_ex_enc_4blks 292.type _vpsm4_ex_enc_8blks,%function 293.align 4 294_vpsm4_ex_enc_8blks: 295 AARCH64_VALID_CALL_TARGET 296 mov x10,x3 297 mov w11,#8 29810: 299 ldp w7,w8,[x10],8 300 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 301 dup v12.4s,w7 302 eor v14.16b,v6.16b,v7.16b 303 eor v15.16b,v10.16b,v11.16b 304 eor v0.16b,v5.16b,v12.16b 305 eor v1.16b,v9.16b,v12.16b 306 eor v12.16b,v14.16b,v0.16b 307 eor v13.16b,v15.16b,v1.16b 308 // optimize sbox using AESE instruction 309 tbl v0.16b, {v12.16b}, v26.16b 310 tbl v1.16b, {v13.16b}, v26.16b 311 ushr v24.16b, v0.16b, 4 312 and v0.16b, v0.16b, v31.16b 313 tbl v0.16b, {v28.16b}, v0.16b 314 tbl v24.16b, {v27.16b}, v24.16b 315 eor v0.16b, v0.16b, v24.16b 316 ushr v24.16b, v1.16b, 4 317 and v1.16b, v1.16b, v31.16b 318 tbl v1.16b, {v28.16b}, v1.16b 319 tbl v24.16b, {v27.16b}, v24.16b 320 eor v1.16b, v1.16b, v24.16b 321 eor v25.16b, v25.16b, v25.16b 322 aese v0.16b,v25.16b 323 aese v1.16b,v25.16b 324 ushr v24.16b, v0.16b, 4 325 and v0.16b, v0.16b, v31.16b 326 tbl v0.16b, {v30.16b}, v0.16b 327 tbl v24.16b, {v29.16b}, v24.16b 328 eor v0.16b, v0.16b, v24.16b 329 ushr v24.16b, v1.16b, 4 330 and v1.16b, v1.16b, v31.16b 331 tbl v1.16b, {v30.16b}, v1.16b 332 tbl v24.16b, {v29.16b}, v24.16b 333 eor v1.16b, v1.16b, v24.16b 334 mov v12.16b,v0.16b 335 mov v13.16b,v1.16b 336 337 // linear transformation 338 ushr v0.4s,v12.4s,32-2 339 ushr v25.4s,v13.4s,32-2 340 ushr v1.4s,v12.4s,32-10 341 ushr v2.4s,v12.4s,32-18 342 ushr v3.4s,v12.4s,32-24 343 sli v0.4s,v12.4s,2 344 sli v25.4s,v13.4s,2 345 sli v1.4s,v12.4s,10 346 sli v2.4s,v12.4s,18 347 sli v3.4s,v12.4s,24 348 eor v24.16b,v0.16b,v12.16b 349 eor v24.16b,v24.16b,v1.16b 350 eor v12.16b,v2.16b,v3.16b 351 eor v12.16b,v12.16b,v24.16b 352 ushr v1.4s,v13.4s,32-10 353 ushr v2.4s,v13.4s,32-18 354 ushr v3.4s,v13.4s,32-24 355 sli v1.4s,v13.4s,10 356 sli v2.4s,v13.4s,18 357 sli v3.4s,v13.4s,24 358 eor v24.16b,v25.16b,v13.16b 359 eor v24.16b,v24.16b,v1.16b 360 eor v13.16b,v2.16b,v3.16b 361 eor v13.16b,v13.16b,v24.16b 362 eor v4.16b,v4.16b,v12.16b 363 eor v8.16b,v8.16b,v13.16b 364 365 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 366 dup v13.4s,w8 367 eor v14.16b,v14.16b,v4.16b 368 eor v15.16b,v15.16b,v8.16b 369 eor v12.16b,v14.16b,v13.16b 370 eor v13.16b,v15.16b,v13.16b 371 // optimize sbox using AESE instruction 372 tbl v0.16b, {v12.16b}, v26.16b 373 tbl v1.16b, {v13.16b}, v26.16b 374 ushr v24.16b, v0.16b, 4 375 and v0.16b, v0.16b, v31.16b 376 tbl v0.16b, {v28.16b}, v0.16b 377 tbl v24.16b, {v27.16b}, v24.16b 378 eor v0.16b, v0.16b, v24.16b 379 ushr v24.16b, v1.16b, 4 380 and v1.16b, v1.16b, v31.16b 381 tbl v1.16b, {v28.16b}, v1.16b 382 tbl v24.16b, {v27.16b}, v24.16b 383 eor v1.16b, v1.16b, v24.16b 384 eor v25.16b, v25.16b, v25.16b 385 aese v0.16b,v25.16b 386 aese v1.16b,v25.16b 387 ushr v24.16b, v0.16b, 4 388 and v0.16b, v0.16b, v31.16b 389 tbl v0.16b, {v30.16b}, v0.16b 390 tbl v24.16b, {v29.16b}, v24.16b 391 eor v0.16b, v0.16b, v24.16b 392 ushr v24.16b, v1.16b, 4 393 and v1.16b, v1.16b, v31.16b 394 tbl v1.16b, {v30.16b}, v1.16b 395 tbl v24.16b, {v29.16b}, v24.16b 396 eor v1.16b, v1.16b, v24.16b 397 mov v12.16b,v0.16b 398 mov v13.16b,v1.16b 399 400 // linear transformation 401 ushr v0.4s,v12.4s,32-2 402 ushr v25.4s,v13.4s,32-2 403 ushr v1.4s,v12.4s,32-10 404 ushr v2.4s,v12.4s,32-18 405 ushr v3.4s,v12.4s,32-24 406 sli v0.4s,v12.4s,2 407 sli v25.4s,v13.4s,2 408 sli v1.4s,v12.4s,10 409 sli v2.4s,v12.4s,18 410 sli v3.4s,v12.4s,24 411 eor v24.16b,v0.16b,v12.16b 412 eor v24.16b,v24.16b,v1.16b 413 eor v12.16b,v2.16b,v3.16b 414 eor v12.16b,v12.16b,v24.16b 415 ushr v1.4s,v13.4s,32-10 416 ushr v2.4s,v13.4s,32-18 417 ushr v3.4s,v13.4s,32-24 418 sli v1.4s,v13.4s,10 419 sli v2.4s,v13.4s,18 420 sli v3.4s,v13.4s,24 421 eor v24.16b,v25.16b,v13.16b 422 eor v24.16b,v24.16b,v1.16b 423 eor v13.16b,v2.16b,v3.16b 424 eor v13.16b,v13.16b,v24.16b 425 ldp w7,w8,[x10],8 426 eor v5.16b,v5.16b,v12.16b 427 eor v9.16b,v9.16b,v13.16b 428 429 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 430 dup v12.4s,w7 431 eor v14.16b,v4.16b,v5.16b 432 eor v15.16b,v8.16b,v9.16b 433 eor v0.16b,v7.16b,v12.16b 434 eor v1.16b,v11.16b,v12.16b 435 eor v12.16b,v14.16b,v0.16b 436 eor v13.16b,v15.16b,v1.16b 437 // optimize sbox using AESE instruction 438 tbl v0.16b, {v12.16b}, v26.16b 439 tbl v1.16b, {v13.16b}, v26.16b 440 ushr v24.16b, v0.16b, 4 441 and v0.16b, v0.16b, v31.16b 442 tbl v0.16b, {v28.16b}, v0.16b 443 tbl v24.16b, {v27.16b}, v24.16b 444 eor v0.16b, v0.16b, v24.16b 445 ushr v24.16b, v1.16b, 4 446 and v1.16b, v1.16b, v31.16b 447 tbl v1.16b, {v28.16b}, v1.16b 448 tbl v24.16b, {v27.16b}, v24.16b 449 eor v1.16b, v1.16b, v24.16b 450 eor v25.16b, v25.16b, v25.16b 451 aese v0.16b,v25.16b 452 aese v1.16b,v25.16b 453 ushr v24.16b, v0.16b, 4 454 and v0.16b, v0.16b, v31.16b 455 tbl v0.16b, {v30.16b}, v0.16b 456 tbl v24.16b, {v29.16b}, v24.16b 457 eor v0.16b, v0.16b, v24.16b 458 ushr v24.16b, v1.16b, 4 459 and v1.16b, v1.16b, v31.16b 460 tbl v1.16b, {v30.16b}, v1.16b 461 tbl v24.16b, {v29.16b}, v24.16b 462 eor v1.16b, v1.16b, v24.16b 463 mov v12.16b,v0.16b 464 mov v13.16b,v1.16b 465 466 // linear transformation 467 ushr v0.4s,v12.4s,32-2 468 ushr v25.4s,v13.4s,32-2 469 ushr v1.4s,v12.4s,32-10 470 ushr v2.4s,v12.4s,32-18 471 ushr v3.4s,v12.4s,32-24 472 sli v0.4s,v12.4s,2 473 sli v25.4s,v13.4s,2 474 sli v1.4s,v12.4s,10 475 sli v2.4s,v12.4s,18 476 sli v3.4s,v12.4s,24 477 eor v24.16b,v0.16b,v12.16b 478 eor v24.16b,v24.16b,v1.16b 479 eor v12.16b,v2.16b,v3.16b 480 eor v12.16b,v12.16b,v24.16b 481 ushr v1.4s,v13.4s,32-10 482 ushr v2.4s,v13.4s,32-18 483 ushr v3.4s,v13.4s,32-24 484 sli v1.4s,v13.4s,10 485 sli v2.4s,v13.4s,18 486 sli v3.4s,v13.4s,24 487 eor v24.16b,v25.16b,v13.16b 488 eor v24.16b,v24.16b,v1.16b 489 eor v13.16b,v2.16b,v3.16b 490 eor v13.16b,v13.16b,v24.16b 491 eor v6.16b,v6.16b,v12.16b 492 eor v10.16b,v10.16b,v13.16b 493 494 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 495 dup v13.4s,w8 496 eor v14.16b,v14.16b,v6.16b 497 eor v15.16b,v15.16b,v10.16b 498 eor v12.16b,v14.16b,v13.16b 499 eor v13.16b,v15.16b,v13.16b 500 // optimize sbox using AESE instruction 501 tbl v0.16b, {v12.16b}, v26.16b 502 tbl v1.16b, {v13.16b}, v26.16b 503 ushr v24.16b, v0.16b, 4 504 and v0.16b, v0.16b, v31.16b 505 tbl v0.16b, {v28.16b}, v0.16b 506 tbl v24.16b, {v27.16b}, v24.16b 507 eor v0.16b, v0.16b, v24.16b 508 ushr v24.16b, v1.16b, 4 509 and v1.16b, v1.16b, v31.16b 510 tbl v1.16b, {v28.16b}, v1.16b 511 tbl v24.16b, {v27.16b}, v24.16b 512 eor v1.16b, v1.16b, v24.16b 513 eor v25.16b, v25.16b, v25.16b 514 aese v0.16b,v25.16b 515 aese v1.16b,v25.16b 516 ushr v24.16b, v0.16b, 4 517 and v0.16b, v0.16b, v31.16b 518 tbl v0.16b, {v30.16b}, v0.16b 519 tbl v24.16b, {v29.16b}, v24.16b 520 eor v0.16b, v0.16b, v24.16b 521 ushr v24.16b, v1.16b, 4 522 and v1.16b, v1.16b, v31.16b 523 tbl v1.16b, {v30.16b}, v1.16b 524 tbl v24.16b, {v29.16b}, v24.16b 525 eor v1.16b, v1.16b, v24.16b 526 mov v12.16b,v0.16b 527 mov v13.16b,v1.16b 528 529 // linear transformation 530 ushr v0.4s,v12.4s,32-2 531 ushr v25.4s,v13.4s,32-2 532 ushr v1.4s,v12.4s,32-10 533 ushr v2.4s,v12.4s,32-18 534 ushr v3.4s,v12.4s,32-24 535 sli v0.4s,v12.4s,2 536 sli v25.4s,v13.4s,2 537 sli v1.4s,v12.4s,10 538 sli v2.4s,v12.4s,18 539 sli v3.4s,v12.4s,24 540 eor v24.16b,v0.16b,v12.16b 541 eor v24.16b,v24.16b,v1.16b 542 eor v12.16b,v2.16b,v3.16b 543 eor v12.16b,v12.16b,v24.16b 544 ushr v1.4s,v13.4s,32-10 545 ushr v2.4s,v13.4s,32-18 546 ushr v3.4s,v13.4s,32-24 547 sli v1.4s,v13.4s,10 548 sli v2.4s,v13.4s,18 549 sli v3.4s,v13.4s,24 550 eor v24.16b,v25.16b,v13.16b 551 eor v24.16b,v24.16b,v1.16b 552 eor v13.16b,v2.16b,v3.16b 553 eor v13.16b,v13.16b,v24.16b 554 eor v7.16b,v7.16b,v12.16b 555 eor v11.16b,v11.16b,v13.16b 556 subs w11,w11,#1 557 b.ne 10b 558#ifndef __AARCH64EB__ 559 rev32 v3.16b,v4.16b 560#else 561 mov v3.16b,v4.16b 562#endif 563#ifndef __AARCH64EB__ 564 rev32 v2.16b,v5.16b 565#else 566 mov v2.16b,v5.16b 567#endif 568#ifndef __AARCH64EB__ 569 rev32 v1.16b,v6.16b 570#else 571 mov v1.16b,v6.16b 572#endif 573#ifndef __AARCH64EB__ 574 rev32 v0.16b,v7.16b 575#else 576 mov v0.16b,v7.16b 577#endif 578#ifndef __AARCH64EB__ 579 rev32 v7.16b,v8.16b 580#else 581 mov v7.16b,v8.16b 582#endif 583#ifndef __AARCH64EB__ 584 rev32 v6.16b,v9.16b 585#else 586 mov v6.16b,v9.16b 587#endif 588#ifndef __AARCH64EB__ 589 rev32 v5.16b,v10.16b 590#else 591 mov v5.16b,v10.16b 592#endif 593#ifndef __AARCH64EB__ 594 rev32 v4.16b,v11.16b 595#else 596 mov v4.16b,v11.16b 597#endif 598 ret 599.size _vpsm4_ex_enc_8blks,.-_vpsm4_ex_enc_8blks 600.globl vpsm4_ex_set_encrypt_key 601.type vpsm4_ex_set_encrypt_key,%function 602.align 5 603vpsm4_ex_set_encrypt_key: 604 AARCH64_SIGN_LINK_REGISTER 605 stp x29,x30,[sp,#-16]! 606 mov w2,1 607 bl _vpsm4_ex_set_key 608 ldp x29,x30,[sp],#16 609 AARCH64_VALIDATE_LINK_REGISTER 610 ret 611.size vpsm4_ex_set_encrypt_key,.-vpsm4_ex_set_encrypt_key 612.globl vpsm4_ex_set_decrypt_key 613.type vpsm4_ex_set_decrypt_key,%function 614.align 5 615vpsm4_ex_set_decrypt_key: 616 AARCH64_SIGN_LINK_REGISTER 617 stp x29,x30,[sp,#-16]! 618 mov w2,0 619 bl _vpsm4_ex_set_key 620 ldp x29,x30,[sp],#16 621 AARCH64_VALIDATE_LINK_REGISTER 622 ret 623.size vpsm4_ex_set_decrypt_key,.-vpsm4_ex_set_decrypt_key 624.globl vpsm4_ex_encrypt 625.type vpsm4_ex_encrypt,%function 626.align 5 627vpsm4_ex_encrypt: 628 AARCH64_VALID_CALL_TARGET 629 ld1 {v4.4s},[x0] 630 adrp x9, .Lsbox_magic 631 ldr q26, [x9, #:lo12:.Lsbox_magic] 632 ldr q27, [x9, #:lo12:.Lsbox_magic+16] 633 ldr q28, [x9, #:lo12:.Lsbox_magic+32] 634 ldr q29, [x9, #:lo12:.Lsbox_magic+48] 635 ldr q30, [x9, #:lo12:.Lsbox_magic+64] 636 ldr q31, [x9, #:lo12:.Lsbox_magic+80] 637#ifndef __AARCH64EB__ 638 rev32 v4.16b,v4.16b 639#endif 640 mov x3,x2 641 mov x10,x3 642 mov w11,#8 643 mov w12,v4.s[0] 644 mov w13,v4.s[1] 645 mov w14,v4.s[2] 646 mov w15,v4.s[3] 64710: 648 ldp w7,w8,[x10],8 649 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 650 eor w6,w14,w15 651 eor w9,w7,w13 652 eor w6,w6,w9 653 mov v3.s[0],w6 654 // optimize sbox using AESE instruction 655 tbl v0.16b, {v3.16b}, v26.16b 656 ushr v2.16b, v0.16b, 4 657 and v0.16b, v0.16b, v31.16b 658 tbl v0.16b, {v28.16b}, v0.16b 659 tbl v2.16b, {v27.16b}, v2.16b 660 eor v0.16b, v0.16b, v2.16b 661 eor v1.16b, v1.16b, v1.16b 662 aese v0.16b,v1.16b 663 ushr v2.16b, v0.16b, 4 664 and v0.16b, v0.16b, v31.16b 665 tbl v0.16b, {v30.16b}, v0.16b 666 tbl v2.16b, {v29.16b}, v2.16b 667 eor v0.16b, v0.16b, v2.16b 668 669 mov w7,v0.s[0] 670 eor w6,w7,w7,ror #32-2 671 eor w6,w6,w7,ror #32-10 672 eor w6,w6,w7,ror #32-18 673 eor w6,w6,w7,ror #32-24 674 eor w12,w12,w6 675 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 676 eor w6,w14,w15 677 eor w9,w12,w8 678 eor w6,w6,w9 679 mov v3.s[0],w6 680 // optimize sbox using AESE instruction 681 tbl v0.16b, {v3.16b}, v26.16b 682 ushr v2.16b, v0.16b, 4 683 and v0.16b, v0.16b, v31.16b 684 tbl v0.16b, {v28.16b}, v0.16b 685 tbl v2.16b, {v27.16b}, v2.16b 686 eor v0.16b, v0.16b, v2.16b 687 eor v1.16b, v1.16b, v1.16b 688 aese v0.16b,v1.16b 689 ushr v2.16b, v0.16b, 4 690 and v0.16b, v0.16b, v31.16b 691 tbl v0.16b, {v30.16b}, v0.16b 692 tbl v2.16b, {v29.16b}, v2.16b 693 eor v0.16b, v0.16b, v2.16b 694 695 mov w7,v0.s[0] 696 eor w6,w7,w7,ror #32-2 697 eor w6,w6,w7,ror #32-10 698 eor w6,w6,w7,ror #32-18 699 eor w6,w6,w7,ror #32-24 700 ldp w7,w8,[x10],8 701 eor w13,w13,w6 702 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 703 eor w6,w12,w13 704 eor w9,w7,w15 705 eor w6,w6,w9 706 mov v3.s[0],w6 707 // optimize sbox using AESE instruction 708 tbl v0.16b, {v3.16b}, v26.16b 709 ushr v2.16b, v0.16b, 4 710 and v0.16b, v0.16b, v31.16b 711 tbl v0.16b, {v28.16b}, v0.16b 712 tbl v2.16b, {v27.16b}, v2.16b 713 eor v0.16b, v0.16b, v2.16b 714 eor v1.16b, v1.16b, v1.16b 715 aese v0.16b,v1.16b 716 ushr v2.16b, v0.16b, 4 717 and v0.16b, v0.16b, v31.16b 718 tbl v0.16b, {v30.16b}, v0.16b 719 tbl v2.16b, {v29.16b}, v2.16b 720 eor v0.16b, v0.16b, v2.16b 721 722 mov w7,v0.s[0] 723 eor w6,w7,w7,ror #32-2 724 eor w6,w6,w7,ror #32-10 725 eor w6,w6,w7,ror #32-18 726 eor w6,w6,w7,ror #32-24 727 eor w14,w14,w6 728 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 729 eor w6,w12,w13 730 eor w9,w14,w8 731 eor w6,w6,w9 732 mov v3.s[0],w6 733 // optimize sbox using AESE instruction 734 tbl v0.16b, {v3.16b}, v26.16b 735 ushr v2.16b, v0.16b, 4 736 and v0.16b, v0.16b, v31.16b 737 tbl v0.16b, {v28.16b}, v0.16b 738 tbl v2.16b, {v27.16b}, v2.16b 739 eor v0.16b, v0.16b, v2.16b 740 eor v1.16b, v1.16b, v1.16b 741 aese v0.16b,v1.16b 742 ushr v2.16b, v0.16b, 4 743 and v0.16b, v0.16b, v31.16b 744 tbl v0.16b, {v30.16b}, v0.16b 745 tbl v2.16b, {v29.16b}, v2.16b 746 eor v0.16b, v0.16b, v2.16b 747 748 mov w7,v0.s[0] 749 eor w6,w7,w7,ror #32-2 750 eor w6,w6,w7,ror #32-10 751 eor w6,w6,w7,ror #32-18 752 eor w6,w6,w7,ror #32-24 753 eor w15,w15,w6 754 subs w11,w11,#1 755 b.ne 10b 756 mov v4.s[0],w15 757 mov v4.s[1],w14 758 mov v4.s[2],w13 759 mov v4.s[3],w12 760#ifndef __AARCH64EB__ 761 rev32 v4.16b,v4.16b 762#endif 763 st1 {v4.4s},[x1] 764 ret 765.size vpsm4_ex_encrypt,.-vpsm4_ex_encrypt 766.globl vpsm4_ex_decrypt 767.type vpsm4_ex_decrypt,%function 768.align 5 769vpsm4_ex_decrypt: 770 AARCH64_VALID_CALL_TARGET 771 ld1 {v4.4s},[x0] 772 adrp x9, .Lsbox_magic 773 ldr q26, [x9, #:lo12:.Lsbox_magic] 774 ldr q27, [x9, #:lo12:.Lsbox_magic+16] 775 ldr q28, [x9, #:lo12:.Lsbox_magic+32] 776 ldr q29, [x9, #:lo12:.Lsbox_magic+48] 777 ldr q30, [x9, #:lo12:.Lsbox_magic+64] 778 ldr q31, [x9, #:lo12:.Lsbox_magic+80] 779#ifndef __AARCH64EB__ 780 rev32 v4.16b,v4.16b 781#endif 782 mov x3,x2 783 mov x10,x3 784 mov w11,#8 785 mov w12,v4.s[0] 786 mov w13,v4.s[1] 787 mov w14,v4.s[2] 788 mov w15,v4.s[3] 78910: 790 ldp w7,w8,[x10],8 791 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 792 eor w6,w14,w15 793 eor w9,w7,w13 794 eor w6,w6,w9 795 mov v3.s[0],w6 796 // optimize sbox using AESE instruction 797 tbl v0.16b, {v3.16b}, v26.16b 798 ushr v2.16b, v0.16b, 4 799 and v0.16b, v0.16b, v31.16b 800 tbl v0.16b, {v28.16b}, v0.16b 801 tbl v2.16b, {v27.16b}, v2.16b 802 eor v0.16b, v0.16b, v2.16b 803 eor v1.16b, v1.16b, v1.16b 804 aese v0.16b,v1.16b 805 ushr v2.16b, v0.16b, 4 806 and v0.16b, v0.16b, v31.16b 807 tbl v0.16b, {v30.16b}, v0.16b 808 tbl v2.16b, {v29.16b}, v2.16b 809 eor v0.16b, v0.16b, v2.16b 810 811 mov w7,v0.s[0] 812 eor w6,w7,w7,ror #32-2 813 eor w6,w6,w7,ror #32-10 814 eor w6,w6,w7,ror #32-18 815 eor w6,w6,w7,ror #32-24 816 eor w12,w12,w6 817 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 818 eor w6,w14,w15 819 eor w9,w12,w8 820 eor w6,w6,w9 821 mov v3.s[0],w6 822 // optimize sbox using AESE instruction 823 tbl v0.16b, {v3.16b}, v26.16b 824 ushr v2.16b, v0.16b, 4 825 and v0.16b, v0.16b, v31.16b 826 tbl v0.16b, {v28.16b}, v0.16b 827 tbl v2.16b, {v27.16b}, v2.16b 828 eor v0.16b, v0.16b, v2.16b 829 eor v1.16b, v1.16b, v1.16b 830 aese v0.16b,v1.16b 831 ushr v2.16b, v0.16b, 4 832 and v0.16b, v0.16b, v31.16b 833 tbl v0.16b, {v30.16b}, v0.16b 834 tbl v2.16b, {v29.16b}, v2.16b 835 eor v0.16b, v0.16b, v2.16b 836 837 mov w7,v0.s[0] 838 eor w6,w7,w7,ror #32-2 839 eor w6,w6,w7,ror #32-10 840 eor w6,w6,w7,ror #32-18 841 eor w6,w6,w7,ror #32-24 842 ldp w7,w8,[x10],8 843 eor w13,w13,w6 844 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 845 eor w6,w12,w13 846 eor w9,w7,w15 847 eor w6,w6,w9 848 mov v3.s[0],w6 849 // optimize sbox using AESE instruction 850 tbl v0.16b, {v3.16b}, v26.16b 851 ushr v2.16b, v0.16b, 4 852 and v0.16b, v0.16b, v31.16b 853 tbl v0.16b, {v28.16b}, v0.16b 854 tbl v2.16b, {v27.16b}, v2.16b 855 eor v0.16b, v0.16b, v2.16b 856 eor v1.16b, v1.16b, v1.16b 857 aese v0.16b,v1.16b 858 ushr v2.16b, v0.16b, 4 859 and v0.16b, v0.16b, v31.16b 860 tbl v0.16b, {v30.16b}, v0.16b 861 tbl v2.16b, {v29.16b}, v2.16b 862 eor v0.16b, v0.16b, v2.16b 863 864 mov w7,v0.s[0] 865 eor w6,w7,w7,ror #32-2 866 eor w6,w6,w7,ror #32-10 867 eor w6,w6,w7,ror #32-18 868 eor w6,w6,w7,ror #32-24 869 eor w14,w14,w6 870 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 871 eor w6,w12,w13 872 eor w9,w14,w8 873 eor w6,w6,w9 874 mov v3.s[0],w6 875 // optimize sbox using AESE instruction 876 tbl v0.16b, {v3.16b}, v26.16b 877 ushr v2.16b, v0.16b, 4 878 and v0.16b, v0.16b, v31.16b 879 tbl v0.16b, {v28.16b}, v0.16b 880 tbl v2.16b, {v27.16b}, v2.16b 881 eor v0.16b, v0.16b, v2.16b 882 eor v1.16b, v1.16b, v1.16b 883 aese v0.16b,v1.16b 884 ushr v2.16b, v0.16b, 4 885 and v0.16b, v0.16b, v31.16b 886 tbl v0.16b, {v30.16b}, v0.16b 887 tbl v2.16b, {v29.16b}, v2.16b 888 eor v0.16b, v0.16b, v2.16b 889 890 mov w7,v0.s[0] 891 eor w6,w7,w7,ror #32-2 892 eor w6,w6,w7,ror #32-10 893 eor w6,w6,w7,ror #32-18 894 eor w6,w6,w7,ror #32-24 895 eor w15,w15,w6 896 subs w11,w11,#1 897 b.ne 10b 898 mov v4.s[0],w15 899 mov v4.s[1],w14 900 mov v4.s[2],w13 901 mov v4.s[3],w12 902#ifndef __AARCH64EB__ 903 rev32 v4.16b,v4.16b 904#endif 905 st1 {v4.4s},[x1] 906 ret 907.size vpsm4_ex_decrypt,.-vpsm4_ex_decrypt 908.globl vpsm4_ex_ecb_encrypt 909.type vpsm4_ex_ecb_encrypt,%function 910.align 5 911vpsm4_ex_ecb_encrypt: 912 AARCH64_SIGN_LINK_REGISTER 913 // convert length into blocks 914 lsr x2,x2,4 915 stp d8,d9,[sp,#-80]! 916 stp d10,d11,[sp,#16] 917 stp d12,d13,[sp,#32] 918 stp d14,d15,[sp,#48] 919 stp x29,x30,[sp,#64] 920 adrp x9, .Lsbox_magic 921 ldr q26, [x9, #:lo12:.Lsbox_magic] 922 ldr q27, [x9, #:lo12:.Lsbox_magic+16] 923 ldr q28, [x9, #:lo12:.Lsbox_magic+32] 924 ldr q29, [x9, #:lo12:.Lsbox_magic+48] 925 ldr q30, [x9, #:lo12:.Lsbox_magic+64] 926 ldr q31, [x9, #:lo12:.Lsbox_magic+80] 927.Lecb_8_blocks_process: 928 cmp w2,#8 929 b.lt .Lecb_4_blocks_process 930 ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 931 ld4 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64 932#ifndef __AARCH64EB__ 933 rev32 v4.16b,v4.16b 934#endif 935#ifndef __AARCH64EB__ 936 rev32 v5.16b,v5.16b 937#endif 938#ifndef __AARCH64EB__ 939 rev32 v6.16b,v6.16b 940#endif 941#ifndef __AARCH64EB__ 942 rev32 v7.16b,v7.16b 943#endif 944#ifndef __AARCH64EB__ 945 rev32 v8.16b,v8.16b 946#endif 947#ifndef __AARCH64EB__ 948 rev32 v9.16b,v9.16b 949#endif 950#ifndef __AARCH64EB__ 951 rev32 v10.16b,v10.16b 952#endif 953#ifndef __AARCH64EB__ 954 rev32 v11.16b,v11.16b 955#endif 956 bl _vpsm4_ex_enc_8blks 957 st4 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 958 st4 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64 959 subs w2,w2,#8 960 b.gt .Lecb_8_blocks_process 961 b 100f 962.Lecb_4_blocks_process: 963 cmp w2,#4 964 b.lt 1f 965 ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 966#ifndef __AARCH64EB__ 967 rev32 v4.16b,v4.16b 968#endif 969#ifndef __AARCH64EB__ 970 rev32 v5.16b,v5.16b 971#endif 972#ifndef __AARCH64EB__ 973 rev32 v6.16b,v6.16b 974#endif 975#ifndef __AARCH64EB__ 976 rev32 v7.16b,v7.16b 977#endif 978 bl _vpsm4_ex_enc_4blks 979 st4 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 980 sub w2,w2,#4 9811: 982 // process last block 983 cmp w2,#1 984 b.lt 100f 985 b.gt 1f 986 ld1 {v4.4s},[x0] 987#ifndef __AARCH64EB__ 988 rev32 v4.16b,v4.16b 989#endif 990 mov x10,x3 991 mov w11,#8 992 mov w12,v4.s[0] 993 mov w13,v4.s[1] 994 mov w14,v4.s[2] 995 mov w15,v4.s[3] 99610: 997 ldp w7,w8,[x10],8 998 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 999 eor w6,w14,w15 1000 eor w9,w7,w13 1001 eor w6,w6,w9 1002 mov v3.s[0],w6 1003 // optimize sbox using AESE instruction 1004 tbl v0.16b, {v3.16b}, v26.16b 1005 ushr v2.16b, v0.16b, 4 1006 and v0.16b, v0.16b, v31.16b 1007 tbl v0.16b, {v28.16b}, v0.16b 1008 tbl v2.16b, {v27.16b}, v2.16b 1009 eor v0.16b, v0.16b, v2.16b 1010 eor v1.16b, v1.16b, v1.16b 1011 aese v0.16b,v1.16b 1012 ushr v2.16b, v0.16b, 4 1013 and v0.16b, v0.16b, v31.16b 1014 tbl v0.16b, {v30.16b}, v0.16b 1015 tbl v2.16b, {v29.16b}, v2.16b 1016 eor v0.16b, v0.16b, v2.16b 1017 1018 mov w7,v0.s[0] 1019 eor w6,w7,w7,ror #32-2 1020 eor w6,w6,w7,ror #32-10 1021 eor w6,w6,w7,ror #32-18 1022 eor w6,w6,w7,ror #32-24 1023 eor w12,w12,w6 1024 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 1025 eor w6,w14,w15 1026 eor w9,w12,w8 1027 eor w6,w6,w9 1028 mov v3.s[0],w6 1029 // optimize sbox using AESE instruction 1030 tbl v0.16b, {v3.16b}, v26.16b 1031 ushr v2.16b, v0.16b, 4 1032 and v0.16b, v0.16b, v31.16b 1033 tbl v0.16b, {v28.16b}, v0.16b 1034 tbl v2.16b, {v27.16b}, v2.16b 1035 eor v0.16b, v0.16b, v2.16b 1036 eor v1.16b, v1.16b, v1.16b 1037 aese v0.16b,v1.16b 1038 ushr v2.16b, v0.16b, 4 1039 and v0.16b, v0.16b, v31.16b 1040 tbl v0.16b, {v30.16b}, v0.16b 1041 tbl v2.16b, {v29.16b}, v2.16b 1042 eor v0.16b, v0.16b, v2.16b 1043 1044 mov w7,v0.s[0] 1045 eor w6,w7,w7,ror #32-2 1046 eor w6,w6,w7,ror #32-10 1047 eor w6,w6,w7,ror #32-18 1048 eor w6,w6,w7,ror #32-24 1049 ldp w7,w8,[x10],8 1050 eor w13,w13,w6 1051 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 1052 eor w6,w12,w13 1053 eor w9,w7,w15 1054 eor w6,w6,w9 1055 mov v3.s[0],w6 1056 // optimize sbox using AESE instruction 1057 tbl v0.16b, {v3.16b}, v26.16b 1058 ushr v2.16b, v0.16b, 4 1059 and v0.16b, v0.16b, v31.16b 1060 tbl v0.16b, {v28.16b}, v0.16b 1061 tbl v2.16b, {v27.16b}, v2.16b 1062 eor v0.16b, v0.16b, v2.16b 1063 eor v1.16b, v1.16b, v1.16b 1064 aese v0.16b,v1.16b 1065 ushr v2.16b, v0.16b, 4 1066 and v0.16b, v0.16b, v31.16b 1067 tbl v0.16b, {v30.16b}, v0.16b 1068 tbl v2.16b, {v29.16b}, v2.16b 1069 eor v0.16b, v0.16b, v2.16b 1070 1071 mov w7,v0.s[0] 1072 eor w6,w7,w7,ror #32-2 1073 eor w6,w6,w7,ror #32-10 1074 eor w6,w6,w7,ror #32-18 1075 eor w6,w6,w7,ror #32-24 1076 eor w14,w14,w6 1077 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 1078 eor w6,w12,w13 1079 eor w9,w14,w8 1080 eor w6,w6,w9 1081 mov v3.s[0],w6 1082 // optimize sbox using AESE instruction 1083 tbl v0.16b, {v3.16b}, v26.16b 1084 ushr v2.16b, v0.16b, 4 1085 and v0.16b, v0.16b, v31.16b 1086 tbl v0.16b, {v28.16b}, v0.16b 1087 tbl v2.16b, {v27.16b}, v2.16b 1088 eor v0.16b, v0.16b, v2.16b 1089 eor v1.16b, v1.16b, v1.16b 1090 aese v0.16b,v1.16b 1091 ushr v2.16b, v0.16b, 4 1092 and v0.16b, v0.16b, v31.16b 1093 tbl v0.16b, {v30.16b}, v0.16b 1094 tbl v2.16b, {v29.16b}, v2.16b 1095 eor v0.16b, v0.16b, v2.16b 1096 1097 mov w7,v0.s[0] 1098 eor w6,w7,w7,ror #32-2 1099 eor w6,w6,w7,ror #32-10 1100 eor w6,w6,w7,ror #32-18 1101 eor w6,w6,w7,ror #32-24 1102 eor w15,w15,w6 1103 subs w11,w11,#1 1104 b.ne 10b 1105 mov v4.s[0],w15 1106 mov v4.s[1],w14 1107 mov v4.s[2],w13 1108 mov v4.s[3],w12 1109#ifndef __AARCH64EB__ 1110 rev32 v4.16b,v4.16b 1111#endif 1112 st1 {v4.4s},[x1] 1113 b 100f 11141: // process last 2 blocks 1115 ld4 {v4.s,v5.s,v6.s,v7.s}[0],[x0],#16 1116 ld4 {v4.s,v5.s,v6.s,v7.s}[1],[x0],#16 1117 cmp w2,#2 1118 b.gt 1f 1119#ifndef __AARCH64EB__ 1120 rev32 v4.16b,v4.16b 1121#endif 1122#ifndef __AARCH64EB__ 1123 rev32 v5.16b,v5.16b 1124#endif 1125#ifndef __AARCH64EB__ 1126 rev32 v6.16b,v6.16b 1127#endif 1128#ifndef __AARCH64EB__ 1129 rev32 v7.16b,v7.16b 1130#endif 1131 bl _vpsm4_ex_enc_4blks 1132 st4 {v0.s,v1.s,v2.s,v3.s}[0],[x1],#16 1133 st4 {v0.s,v1.s,v2.s,v3.s}[1],[x1] 1134 b 100f 11351: // process last 3 blocks 1136 ld4 {v4.s,v5.s,v6.s,v7.s}[2],[x0],#16 1137#ifndef __AARCH64EB__ 1138 rev32 v4.16b,v4.16b 1139#endif 1140#ifndef __AARCH64EB__ 1141 rev32 v5.16b,v5.16b 1142#endif 1143#ifndef __AARCH64EB__ 1144 rev32 v6.16b,v6.16b 1145#endif 1146#ifndef __AARCH64EB__ 1147 rev32 v7.16b,v7.16b 1148#endif 1149 bl _vpsm4_ex_enc_4blks 1150 st4 {v0.s,v1.s,v2.s,v3.s}[0],[x1],#16 1151 st4 {v0.s,v1.s,v2.s,v3.s}[1],[x1],#16 1152 st4 {v0.s,v1.s,v2.s,v3.s}[2],[x1] 1153100: 1154 ldp d10,d11,[sp,#16] 1155 ldp d12,d13,[sp,#32] 1156 ldp d14,d15,[sp,#48] 1157 ldp x29,x30,[sp,#64] 1158 ldp d8,d9,[sp],#80 1159 AARCH64_VALIDATE_LINK_REGISTER 1160 ret 1161.size vpsm4_ex_ecb_encrypt,.-vpsm4_ex_ecb_encrypt 1162.globl vpsm4_ex_cbc_encrypt 1163.type vpsm4_ex_cbc_encrypt,%function 1164.align 5 1165vpsm4_ex_cbc_encrypt: 1166 AARCH64_VALID_CALL_TARGET 1167 lsr x2,x2,4 1168 adrp x9, .Lsbox_magic 1169 ldr q26, [x9, #:lo12:.Lsbox_magic] 1170 ldr q27, [x9, #:lo12:.Lsbox_magic+16] 1171 ldr q28, [x9, #:lo12:.Lsbox_magic+32] 1172 ldr q29, [x9, #:lo12:.Lsbox_magic+48] 1173 ldr q30, [x9, #:lo12:.Lsbox_magic+64] 1174 ldr q31, [x9, #:lo12:.Lsbox_magic+80] 1175 cbz w5,.Ldec 1176 ld1 {v3.4s},[x4] 1177.Lcbc_4_blocks_enc: 1178 cmp w2,#4 1179 b.lt 1f 1180 ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 1181 eor v4.16b,v4.16b,v3.16b 1182#ifndef __AARCH64EB__ 1183 rev32 v5.16b,v5.16b 1184#endif 1185#ifndef __AARCH64EB__ 1186 rev32 v4.16b,v4.16b 1187#endif 1188#ifndef __AARCH64EB__ 1189 rev32 v6.16b,v6.16b 1190#endif 1191#ifndef __AARCH64EB__ 1192 rev32 v7.16b,v7.16b 1193#endif 1194 mov x10,x3 1195 mov w11,#8 1196 mov w12,v4.s[0] 1197 mov w13,v4.s[1] 1198 mov w14,v4.s[2] 1199 mov w15,v4.s[3] 120010: 1201 ldp w7,w8,[x10],8 1202 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 1203 eor w6,w14,w15 1204 eor w9,w7,w13 1205 eor w6,w6,w9 1206 mov v3.s[0],w6 1207 // optimize sbox using AESE instruction 1208 tbl v0.16b, {v3.16b}, v26.16b 1209 ushr v2.16b, v0.16b, 4 1210 and v0.16b, v0.16b, v31.16b 1211 tbl v0.16b, {v28.16b}, v0.16b 1212 tbl v2.16b, {v27.16b}, v2.16b 1213 eor v0.16b, v0.16b, v2.16b 1214 eor v1.16b, v1.16b, v1.16b 1215 aese v0.16b,v1.16b 1216 ushr v2.16b, v0.16b, 4 1217 and v0.16b, v0.16b, v31.16b 1218 tbl v0.16b, {v30.16b}, v0.16b 1219 tbl v2.16b, {v29.16b}, v2.16b 1220 eor v0.16b, v0.16b, v2.16b 1221 1222 mov w7,v0.s[0] 1223 eor w6,w7,w7,ror #32-2 1224 eor w6,w6,w7,ror #32-10 1225 eor w6,w6,w7,ror #32-18 1226 eor w6,w6,w7,ror #32-24 1227 eor w12,w12,w6 1228 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 1229 eor w6,w14,w15 1230 eor w9,w12,w8 1231 eor w6,w6,w9 1232 mov v3.s[0],w6 1233 // optimize sbox using AESE instruction 1234 tbl v0.16b, {v3.16b}, v26.16b 1235 ushr v2.16b, v0.16b, 4 1236 and v0.16b, v0.16b, v31.16b 1237 tbl v0.16b, {v28.16b}, v0.16b 1238 tbl v2.16b, {v27.16b}, v2.16b 1239 eor v0.16b, v0.16b, v2.16b 1240 eor v1.16b, v1.16b, v1.16b 1241 aese v0.16b,v1.16b 1242 ushr v2.16b, v0.16b, 4 1243 and v0.16b, v0.16b, v31.16b 1244 tbl v0.16b, {v30.16b}, v0.16b 1245 tbl v2.16b, {v29.16b}, v2.16b 1246 eor v0.16b, v0.16b, v2.16b 1247 1248 mov w7,v0.s[0] 1249 eor w6,w7,w7,ror #32-2 1250 eor w6,w6,w7,ror #32-10 1251 eor w6,w6,w7,ror #32-18 1252 eor w6,w6,w7,ror #32-24 1253 ldp w7,w8,[x10],8 1254 eor w13,w13,w6 1255 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 1256 eor w6,w12,w13 1257 eor w9,w7,w15 1258 eor w6,w6,w9 1259 mov v3.s[0],w6 1260 // optimize sbox using AESE instruction 1261 tbl v0.16b, {v3.16b}, v26.16b 1262 ushr v2.16b, v0.16b, 4 1263 and v0.16b, v0.16b, v31.16b 1264 tbl v0.16b, {v28.16b}, v0.16b 1265 tbl v2.16b, {v27.16b}, v2.16b 1266 eor v0.16b, v0.16b, v2.16b 1267 eor v1.16b, v1.16b, v1.16b 1268 aese v0.16b,v1.16b 1269 ushr v2.16b, v0.16b, 4 1270 and v0.16b, v0.16b, v31.16b 1271 tbl v0.16b, {v30.16b}, v0.16b 1272 tbl v2.16b, {v29.16b}, v2.16b 1273 eor v0.16b, v0.16b, v2.16b 1274 1275 mov w7,v0.s[0] 1276 eor w6,w7,w7,ror #32-2 1277 eor w6,w6,w7,ror #32-10 1278 eor w6,w6,w7,ror #32-18 1279 eor w6,w6,w7,ror #32-24 1280 eor w14,w14,w6 1281 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 1282 eor w6,w12,w13 1283 eor w9,w14,w8 1284 eor w6,w6,w9 1285 mov v3.s[0],w6 1286 // optimize sbox using AESE instruction 1287 tbl v0.16b, {v3.16b}, v26.16b 1288 ushr v2.16b, v0.16b, 4 1289 and v0.16b, v0.16b, v31.16b 1290 tbl v0.16b, {v28.16b}, v0.16b 1291 tbl v2.16b, {v27.16b}, v2.16b 1292 eor v0.16b, v0.16b, v2.16b 1293 eor v1.16b, v1.16b, v1.16b 1294 aese v0.16b,v1.16b 1295 ushr v2.16b, v0.16b, 4 1296 and v0.16b, v0.16b, v31.16b 1297 tbl v0.16b, {v30.16b}, v0.16b 1298 tbl v2.16b, {v29.16b}, v2.16b 1299 eor v0.16b, v0.16b, v2.16b 1300 1301 mov w7,v0.s[0] 1302 eor w6,w7,w7,ror #32-2 1303 eor w6,w6,w7,ror #32-10 1304 eor w6,w6,w7,ror #32-18 1305 eor w6,w6,w7,ror #32-24 1306 eor w15,w15,w6 1307 subs w11,w11,#1 1308 b.ne 10b 1309 mov v4.s[0],w15 1310 mov v4.s[1],w14 1311 mov v4.s[2],w13 1312 mov v4.s[3],w12 1313 eor v5.16b,v5.16b,v4.16b 1314 mov x10,x3 1315 mov w11,#8 1316 mov w12,v5.s[0] 1317 mov w13,v5.s[1] 1318 mov w14,v5.s[2] 1319 mov w15,v5.s[3] 132010: 1321 ldp w7,w8,[x10],8 1322 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 1323 eor w6,w14,w15 1324 eor w9,w7,w13 1325 eor w6,w6,w9 1326 mov v3.s[0],w6 1327 // optimize sbox using AESE instruction 1328 tbl v0.16b, {v3.16b}, v26.16b 1329 ushr v2.16b, v0.16b, 4 1330 and v0.16b, v0.16b, v31.16b 1331 tbl v0.16b, {v28.16b}, v0.16b 1332 tbl v2.16b, {v27.16b}, v2.16b 1333 eor v0.16b, v0.16b, v2.16b 1334 eor v1.16b, v1.16b, v1.16b 1335 aese v0.16b,v1.16b 1336 ushr v2.16b, v0.16b, 4 1337 and v0.16b, v0.16b, v31.16b 1338 tbl v0.16b, {v30.16b}, v0.16b 1339 tbl v2.16b, {v29.16b}, v2.16b 1340 eor v0.16b, v0.16b, v2.16b 1341 1342 mov w7,v0.s[0] 1343 eor w6,w7,w7,ror #32-2 1344 eor w6,w6,w7,ror #32-10 1345 eor w6,w6,w7,ror #32-18 1346 eor w6,w6,w7,ror #32-24 1347 eor w12,w12,w6 1348 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 1349 eor w6,w14,w15 1350 eor w9,w12,w8 1351 eor w6,w6,w9 1352 mov v3.s[0],w6 1353 // optimize sbox using AESE instruction 1354 tbl v0.16b, {v3.16b}, v26.16b 1355 ushr v2.16b, v0.16b, 4 1356 and v0.16b, v0.16b, v31.16b 1357 tbl v0.16b, {v28.16b}, v0.16b 1358 tbl v2.16b, {v27.16b}, v2.16b 1359 eor v0.16b, v0.16b, v2.16b 1360 eor v1.16b, v1.16b, v1.16b 1361 aese v0.16b,v1.16b 1362 ushr v2.16b, v0.16b, 4 1363 and v0.16b, v0.16b, v31.16b 1364 tbl v0.16b, {v30.16b}, v0.16b 1365 tbl v2.16b, {v29.16b}, v2.16b 1366 eor v0.16b, v0.16b, v2.16b 1367 1368 mov w7,v0.s[0] 1369 eor w6,w7,w7,ror #32-2 1370 eor w6,w6,w7,ror #32-10 1371 eor w6,w6,w7,ror #32-18 1372 eor w6,w6,w7,ror #32-24 1373 ldp w7,w8,[x10],8 1374 eor w13,w13,w6 1375 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 1376 eor w6,w12,w13 1377 eor w9,w7,w15 1378 eor w6,w6,w9 1379 mov v3.s[0],w6 1380 // optimize sbox using AESE instruction 1381 tbl v0.16b, {v3.16b}, v26.16b 1382 ushr v2.16b, v0.16b, 4 1383 and v0.16b, v0.16b, v31.16b 1384 tbl v0.16b, {v28.16b}, v0.16b 1385 tbl v2.16b, {v27.16b}, v2.16b 1386 eor v0.16b, v0.16b, v2.16b 1387 eor v1.16b, v1.16b, v1.16b 1388 aese v0.16b,v1.16b 1389 ushr v2.16b, v0.16b, 4 1390 and v0.16b, v0.16b, v31.16b 1391 tbl v0.16b, {v30.16b}, v0.16b 1392 tbl v2.16b, {v29.16b}, v2.16b 1393 eor v0.16b, v0.16b, v2.16b 1394 1395 mov w7,v0.s[0] 1396 eor w6,w7,w7,ror #32-2 1397 eor w6,w6,w7,ror #32-10 1398 eor w6,w6,w7,ror #32-18 1399 eor w6,w6,w7,ror #32-24 1400 eor w14,w14,w6 1401 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 1402 eor w6,w12,w13 1403 eor w9,w14,w8 1404 eor w6,w6,w9 1405 mov v3.s[0],w6 1406 // optimize sbox using AESE instruction 1407 tbl v0.16b, {v3.16b}, v26.16b 1408 ushr v2.16b, v0.16b, 4 1409 and v0.16b, v0.16b, v31.16b 1410 tbl v0.16b, {v28.16b}, v0.16b 1411 tbl v2.16b, {v27.16b}, v2.16b 1412 eor v0.16b, v0.16b, v2.16b 1413 eor v1.16b, v1.16b, v1.16b 1414 aese v0.16b,v1.16b 1415 ushr v2.16b, v0.16b, 4 1416 and v0.16b, v0.16b, v31.16b 1417 tbl v0.16b, {v30.16b}, v0.16b 1418 tbl v2.16b, {v29.16b}, v2.16b 1419 eor v0.16b, v0.16b, v2.16b 1420 1421 mov w7,v0.s[0] 1422 eor w6,w7,w7,ror #32-2 1423 eor w6,w6,w7,ror #32-10 1424 eor w6,w6,w7,ror #32-18 1425 eor w6,w6,w7,ror #32-24 1426 eor w15,w15,w6 1427 subs w11,w11,#1 1428 b.ne 10b 1429 mov v5.s[0],w15 1430 mov v5.s[1],w14 1431 mov v5.s[2],w13 1432 mov v5.s[3],w12 1433#ifndef __AARCH64EB__ 1434 rev32 v4.16b,v4.16b 1435#endif 1436 eor v6.16b,v6.16b,v5.16b 1437 mov x10,x3 1438 mov w11,#8 1439 mov w12,v6.s[0] 1440 mov w13,v6.s[1] 1441 mov w14,v6.s[2] 1442 mov w15,v6.s[3] 144310: 1444 ldp w7,w8,[x10],8 1445 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 1446 eor w6,w14,w15 1447 eor w9,w7,w13 1448 eor w6,w6,w9 1449 mov v3.s[0],w6 1450 // optimize sbox using AESE instruction 1451 tbl v0.16b, {v3.16b}, v26.16b 1452 ushr v2.16b, v0.16b, 4 1453 and v0.16b, v0.16b, v31.16b 1454 tbl v0.16b, {v28.16b}, v0.16b 1455 tbl v2.16b, {v27.16b}, v2.16b 1456 eor v0.16b, v0.16b, v2.16b 1457 eor v1.16b, v1.16b, v1.16b 1458 aese v0.16b,v1.16b 1459 ushr v2.16b, v0.16b, 4 1460 and v0.16b, v0.16b, v31.16b 1461 tbl v0.16b, {v30.16b}, v0.16b 1462 tbl v2.16b, {v29.16b}, v2.16b 1463 eor v0.16b, v0.16b, v2.16b 1464 1465 mov w7,v0.s[0] 1466 eor w6,w7,w7,ror #32-2 1467 eor w6,w6,w7,ror #32-10 1468 eor w6,w6,w7,ror #32-18 1469 eor w6,w6,w7,ror #32-24 1470 eor w12,w12,w6 1471 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 1472 eor w6,w14,w15 1473 eor w9,w12,w8 1474 eor w6,w6,w9 1475 mov v3.s[0],w6 1476 // optimize sbox using AESE instruction 1477 tbl v0.16b, {v3.16b}, v26.16b 1478 ushr v2.16b, v0.16b, 4 1479 and v0.16b, v0.16b, v31.16b 1480 tbl v0.16b, {v28.16b}, v0.16b 1481 tbl v2.16b, {v27.16b}, v2.16b 1482 eor v0.16b, v0.16b, v2.16b 1483 eor v1.16b, v1.16b, v1.16b 1484 aese v0.16b,v1.16b 1485 ushr v2.16b, v0.16b, 4 1486 and v0.16b, v0.16b, v31.16b 1487 tbl v0.16b, {v30.16b}, v0.16b 1488 tbl v2.16b, {v29.16b}, v2.16b 1489 eor v0.16b, v0.16b, v2.16b 1490 1491 mov w7,v0.s[0] 1492 eor w6,w7,w7,ror #32-2 1493 eor w6,w6,w7,ror #32-10 1494 eor w6,w6,w7,ror #32-18 1495 eor w6,w6,w7,ror #32-24 1496 ldp w7,w8,[x10],8 1497 eor w13,w13,w6 1498 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 1499 eor w6,w12,w13 1500 eor w9,w7,w15 1501 eor w6,w6,w9 1502 mov v3.s[0],w6 1503 // optimize sbox using AESE instruction 1504 tbl v0.16b, {v3.16b}, v26.16b 1505 ushr v2.16b, v0.16b, 4 1506 and v0.16b, v0.16b, v31.16b 1507 tbl v0.16b, {v28.16b}, v0.16b 1508 tbl v2.16b, {v27.16b}, v2.16b 1509 eor v0.16b, v0.16b, v2.16b 1510 eor v1.16b, v1.16b, v1.16b 1511 aese v0.16b,v1.16b 1512 ushr v2.16b, v0.16b, 4 1513 and v0.16b, v0.16b, v31.16b 1514 tbl v0.16b, {v30.16b}, v0.16b 1515 tbl v2.16b, {v29.16b}, v2.16b 1516 eor v0.16b, v0.16b, v2.16b 1517 1518 mov w7,v0.s[0] 1519 eor w6,w7,w7,ror #32-2 1520 eor w6,w6,w7,ror #32-10 1521 eor w6,w6,w7,ror #32-18 1522 eor w6,w6,w7,ror #32-24 1523 eor w14,w14,w6 1524 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 1525 eor w6,w12,w13 1526 eor w9,w14,w8 1527 eor w6,w6,w9 1528 mov v3.s[0],w6 1529 // optimize sbox using AESE instruction 1530 tbl v0.16b, {v3.16b}, v26.16b 1531 ushr v2.16b, v0.16b, 4 1532 and v0.16b, v0.16b, v31.16b 1533 tbl v0.16b, {v28.16b}, v0.16b 1534 tbl v2.16b, {v27.16b}, v2.16b 1535 eor v0.16b, v0.16b, v2.16b 1536 eor v1.16b, v1.16b, v1.16b 1537 aese v0.16b,v1.16b 1538 ushr v2.16b, v0.16b, 4 1539 and v0.16b, v0.16b, v31.16b 1540 tbl v0.16b, {v30.16b}, v0.16b 1541 tbl v2.16b, {v29.16b}, v2.16b 1542 eor v0.16b, v0.16b, v2.16b 1543 1544 mov w7,v0.s[0] 1545 eor w6,w7,w7,ror #32-2 1546 eor w6,w6,w7,ror #32-10 1547 eor w6,w6,w7,ror #32-18 1548 eor w6,w6,w7,ror #32-24 1549 eor w15,w15,w6 1550 subs w11,w11,#1 1551 b.ne 10b 1552 mov v6.s[0],w15 1553 mov v6.s[1],w14 1554 mov v6.s[2],w13 1555 mov v6.s[3],w12 1556#ifndef __AARCH64EB__ 1557 rev32 v5.16b,v5.16b 1558#endif 1559 eor v7.16b,v7.16b,v6.16b 1560 mov x10,x3 1561 mov w11,#8 1562 mov w12,v7.s[0] 1563 mov w13,v7.s[1] 1564 mov w14,v7.s[2] 1565 mov w15,v7.s[3] 156610: 1567 ldp w7,w8,[x10],8 1568 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 1569 eor w6,w14,w15 1570 eor w9,w7,w13 1571 eor w6,w6,w9 1572 mov v3.s[0],w6 1573 // optimize sbox using AESE instruction 1574 tbl v0.16b, {v3.16b}, v26.16b 1575 ushr v2.16b, v0.16b, 4 1576 and v0.16b, v0.16b, v31.16b 1577 tbl v0.16b, {v28.16b}, v0.16b 1578 tbl v2.16b, {v27.16b}, v2.16b 1579 eor v0.16b, v0.16b, v2.16b 1580 eor v1.16b, v1.16b, v1.16b 1581 aese v0.16b,v1.16b 1582 ushr v2.16b, v0.16b, 4 1583 and v0.16b, v0.16b, v31.16b 1584 tbl v0.16b, {v30.16b}, v0.16b 1585 tbl v2.16b, {v29.16b}, v2.16b 1586 eor v0.16b, v0.16b, v2.16b 1587 1588 mov w7,v0.s[0] 1589 eor w6,w7,w7,ror #32-2 1590 eor w6,w6,w7,ror #32-10 1591 eor w6,w6,w7,ror #32-18 1592 eor w6,w6,w7,ror #32-24 1593 eor w12,w12,w6 1594 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 1595 eor w6,w14,w15 1596 eor w9,w12,w8 1597 eor w6,w6,w9 1598 mov v3.s[0],w6 1599 // optimize sbox using AESE instruction 1600 tbl v0.16b, {v3.16b}, v26.16b 1601 ushr v2.16b, v0.16b, 4 1602 and v0.16b, v0.16b, v31.16b 1603 tbl v0.16b, {v28.16b}, v0.16b 1604 tbl v2.16b, {v27.16b}, v2.16b 1605 eor v0.16b, v0.16b, v2.16b 1606 eor v1.16b, v1.16b, v1.16b 1607 aese v0.16b,v1.16b 1608 ushr v2.16b, v0.16b, 4 1609 and v0.16b, v0.16b, v31.16b 1610 tbl v0.16b, {v30.16b}, v0.16b 1611 tbl v2.16b, {v29.16b}, v2.16b 1612 eor v0.16b, v0.16b, v2.16b 1613 1614 mov w7,v0.s[0] 1615 eor w6,w7,w7,ror #32-2 1616 eor w6,w6,w7,ror #32-10 1617 eor w6,w6,w7,ror #32-18 1618 eor w6,w6,w7,ror #32-24 1619 ldp w7,w8,[x10],8 1620 eor w13,w13,w6 1621 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 1622 eor w6,w12,w13 1623 eor w9,w7,w15 1624 eor w6,w6,w9 1625 mov v3.s[0],w6 1626 // optimize sbox using AESE instruction 1627 tbl v0.16b, {v3.16b}, v26.16b 1628 ushr v2.16b, v0.16b, 4 1629 and v0.16b, v0.16b, v31.16b 1630 tbl v0.16b, {v28.16b}, v0.16b 1631 tbl v2.16b, {v27.16b}, v2.16b 1632 eor v0.16b, v0.16b, v2.16b 1633 eor v1.16b, v1.16b, v1.16b 1634 aese v0.16b,v1.16b 1635 ushr v2.16b, v0.16b, 4 1636 and v0.16b, v0.16b, v31.16b 1637 tbl v0.16b, {v30.16b}, v0.16b 1638 tbl v2.16b, {v29.16b}, v2.16b 1639 eor v0.16b, v0.16b, v2.16b 1640 1641 mov w7,v0.s[0] 1642 eor w6,w7,w7,ror #32-2 1643 eor w6,w6,w7,ror #32-10 1644 eor w6,w6,w7,ror #32-18 1645 eor w6,w6,w7,ror #32-24 1646 eor w14,w14,w6 1647 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 1648 eor w6,w12,w13 1649 eor w9,w14,w8 1650 eor w6,w6,w9 1651 mov v3.s[0],w6 1652 // optimize sbox using AESE instruction 1653 tbl v0.16b, {v3.16b}, v26.16b 1654 ushr v2.16b, v0.16b, 4 1655 and v0.16b, v0.16b, v31.16b 1656 tbl v0.16b, {v28.16b}, v0.16b 1657 tbl v2.16b, {v27.16b}, v2.16b 1658 eor v0.16b, v0.16b, v2.16b 1659 eor v1.16b, v1.16b, v1.16b 1660 aese v0.16b,v1.16b 1661 ushr v2.16b, v0.16b, 4 1662 and v0.16b, v0.16b, v31.16b 1663 tbl v0.16b, {v30.16b}, v0.16b 1664 tbl v2.16b, {v29.16b}, v2.16b 1665 eor v0.16b, v0.16b, v2.16b 1666 1667 mov w7,v0.s[0] 1668 eor w6,w7,w7,ror #32-2 1669 eor w6,w6,w7,ror #32-10 1670 eor w6,w6,w7,ror #32-18 1671 eor w6,w6,w7,ror #32-24 1672 eor w15,w15,w6 1673 subs w11,w11,#1 1674 b.ne 10b 1675 mov v7.s[0],w15 1676 mov v7.s[1],w14 1677 mov v7.s[2],w13 1678 mov v7.s[3],w12 1679#ifndef __AARCH64EB__ 1680 rev32 v6.16b,v6.16b 1681#endif 1682#ifndef __AARCH64EB__ 1683 rev32 v7.16b,v7.16b 1684#endif 1685 orr v3.16b,v7.16b,v7.16b 1686 st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64 1687 subs w2,w2,#4 1688 b.ne .Lcbc_4_blocks_enc 1689 b 2f 16901: 1691 subs w2,w2,#1 1692 b.lt 2f 1693 ld1 {v4.4s},[x0],#16 1694 eor v3.16b,v3.16b,v4.16b 1695#ifndef __AARCH64EB__ 1696 rev32 v3.16b,v3.16b 1697#endif 1698 mov x10,x3 1699 mov w11,#8 1700 mov w12,v3.s[0] 1701 mov w13,v3.s[1] 1702 mov w14,v3.s[2] 1703 mov w15,v3.s[3] 170410: 1705 ldp w7,w8,[x10],8 1706 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 1707 eor w6,w14,w15 1708 eor w9,w7,w13 1709 eor w6,w6,w9 1710 mov v3.s[0],w6 1711 // optimize sbox using AESE instruction 1712 tbl v0.16b, {v3.16b}, v26.16b 1713 ushr v2.16b, v0.16b, 4 1714 and v0.16b, v0.16b, v31.16b 1715 tbl v0.16b, {v28.16b}, v0.16b 1716 tbl v2.16b, {v27.16b}, v2.16b 1717 eor v0.16b, v0.16b, v2.16b 1718 eor v1.16b, v1.16b, v1.16b 1719 aese v0.16b,v1.16b 1720 ushr v2.16b, v0.16b, 4 1721 and v0.16b, v0.16b, v31.16b 1722 tbl v0.16b, {v30.16b}, v0.16b 1723 tbl v2.16b, {v29.16b}, v2.16b 1724 eor v0.16b, v0.16b, v2.16b 1725 1726 mov w7,v0.s[0] 1727 eor w6,w7,w7,ror #32-2 1728 eor w6,w6,w7,ror #32-10 1729 eor w6,w6,w7,ror #32-18 1730 eor w6,w6,w7,ror #32-24 1731 eor w12,w12,w6 1732 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 1733 eor w6,w14,w15 1734 eor w9,w12,w8 1735 eor w6,w6,w9 1736 mov v3.s[0],w6 1737 // optimize sbox using AESE instruction 1738 tbl v0.16b, {v3.16b}, v26.16b 1739 ushr v2.16b, v0.16b, 4 1740 and v0.16b, v0.16b, v31.16b 1741 tbl v0.16b, {v28.16b}, v0.16b 1742 tbl v2.16b, {v27.16b}, v2.16b 1743 eor v0.16b, v0.16b, v2.16b 1744 eor v1.16b, v1.16b, v1.16b 1745 aese v0.16b,v1.16b 1746 ushr v2.16b, v0.16b, 4 1747 and v0.16b, v0.16b, v31.16b 1748 tbl v0.16b, {v30.16b}, v0.16b 1749 tbl v2.16b, {v29.16b}, v2.16b 1750 eor v0.16b, v0.16b, v2.16b 1751 1752 mov w7,v0.s[0] 1753 eor w6,w7,w7,ror #32-2 1754 eor w6,w6,w7,ror #32-10 1755 eor w6,w6,w7,ror #32-18 1756 eor w6,w6,w7,ror #32-24 1757 ldp w7,w8,[x10],8 1758 eor w13,w13,w6 1759 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 1760 eor w6,w12,w13 1761 eor w9,w7,w15 1762 eor w6,w6,w9 1763 mov v3.s[0],w6 1764 // optimize sbox using AESE instruction 1765 tbl v0.16b, {v3.16b}, v26.16b 1766 ushr v2.16b, v0.16b, 4 1767 and v0.16b, v0.16b, v31.16b 1768 tbl v0.16b, {v28.16b}, v0.16b 1769 tbl v2.16b, {v27.16b}, v2.16b 1770 eor v0.16b, v0.16b, v2.16b 1771 eor v1.16b, v1.16b, v1.16b 1772 aese v0.16b,v1.16b 1773 ushr v2.16b, v0.16b, 4 1774 and v0.16b, v0.16b, v31.16b 1775 tbl v0.16b, {v30.16b}, v0.16b 1776 tbl v2.16b, {v29.16b}, v2.16b 1777 eor v0.16b, v0.16b, v2.16b 1778 1779 mov w7,v0.s[0] 1780 eor w6,w7,w7,ror #32-2 1781 eor w6,w6,w7,ror #32-10 1782 eor w6,w6,w7,ror #32-18 1783 eor w6,w6,w7,ror #32-24 1784 eor w14,w14,w6 1785 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 1786 eor w6,w12,w13 1787 eor w9,w14,w8 1788 eor w6,w6,w9 1789 mov v3.s[0],w6 1790 // optimize sbox using AESE instruction 1791 tbl v0.16b, {v3.16b}, v26.16b 1792 ushr v2.16b, v0.16b, 4 1793 and v0.16b, v0.16b, v31.16b 1794 tbl v0.16b, {v28.16b}, v0.16b 1795 tbl v2.16b, {v27.16b}, v2.16b 1796 eor v0.16b, v0.16b, v2.16b 1797 eor v1.16b, v1.16b, v1.16b 1798 aese v0.16b,v1.16b 1799 ushr v2.16b, v0.16b, 4 1800 and v0.16b, v0.16b, v31.16b 1801 tbl v0.16b, {v30.16b}, v0.16b 1802 tbl v2.16b, {v29.16b}, v2.16b 1803 eor v0.16b, v0.16b, v2.16b 1804 1805 mov w7,v0.s[0] 1806 eor w6,w7,w7,ror #32-2 1807 eor w6,w6,w7,ror #32-10 1808 eor w6,w6,w7,ror #32-18 1809 eor w6,w6,w7,ror #32-24 1810 eor w15,w15,w6 1811 subs w11,w11,#1 1812 b.ne 10b 1813 mov v3.s[0],w15 1814 mov v3.s[1],w14 1815 mov v3.s[2],w13 1816 mov v3.s[3],w12 1817#ifndef __AARCH64EB__ 1818 rev32 v3.16b,v3.16b 1819#endif 1820 st1 {v3.4s},[x1],#16 1821 b 1b 18222: 1823 // save back IV 1824 st1 {v3.4s},[x4] 1825 ret 1826 1827.Ldec: 1828 // decryption mode starts 1829 AARCH64_SIGN_LINK_REGISTER 1830 stp d8,d9,[sp,#-80]! 1831 stp d10,d11,[sp,#16] 1832 stp d12,d13,[sp,#32] 1833 stp d14,d15,[sp,#48] 1834 stp x29,x30,[sp,#64] 1835.Lcbc_8_blocks_dec: 1836 cmp w2,#8 1837 b.lt 1f 1838 ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[x0] 1839 add x10,x0,#64 1840 ld4 {v8.4s,v9.4s,v10.4s,v11.4s},[x10] 1841#ifndef __AARCH64EB__ 1842 rev32 v4.16b,v4.16b 1843#endif 1844#ifndef __AARCH64EB__ 1845 rev32 v5.16b,v5.16b 1846#endif 1847#ifndef __AARCH64EB__ 1848 rev32 v6.16b,v6.16b 1849#endif 1850#ifndef __AARCH64EB__ 1851 rev32 v7.16b,v7.16b 1852#endif 1853#ifndef __AARCH64EB__ 1854 rev32 v8.16b,v8.16b 1855#endif 1856#ifndef __AARCH64EB__ 1857 rev32 v9.16b,v9.16b 1858#endif 1859#ifndef __AARCH64EB__ 1860 rev32 v10.16b,v10.16b 1861#endif 1862#ifndef __AARCH64EB__ 1863 rev32 v11.16b,v11.16b 1864#endif 1865 bl _vpsm4_ex_enc_8blks 1866 zip1 v8.4s,v0.4s,v1.4s 1867 zip2 v9.4s,v0.4s,v1.4s 1868 zip1 v10.4s,v2.4s,v3.4s 1869 zip2 v11.4s,v2.4s,v3.4s 1870 zip1 v0.2d,v8.2d,v10.2d 1871 zip2 v1.2d,v8.2d,v10.2d 1872 zip1 v2.2d,v9.2d,v11.2d 1873 zip2 v3.2d,v9.2d,v11.2d 1874 zip1 v8.4s,v4.4s,v5.4s 1875 zip2 v9.4s,v4.4s,v5.4s 1876 zip1 v10.4s,v6.4s,v7.4s 1877 zip2 v11.4s,v6.4s,v7.4s 1878 zip1 v4.2d,v8.2d,v10.2d 1879 zip2 v5.2d,v8.2d,v10.2d 1880 zip1 v6.2d,v9.2d,v11.2d 1881 zip2 v7.2d,v9.2d,v11.2d 1882 ld1 {v15.4s},[x4] 1883 ld1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64 1884 // note ivec1 and vtmpx[3] are reusing the same register 1885 // care needs to be taken to avoid conflict 1886 eor v0.16b,v0.16b,v15.16b 1887 ld1 {v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64 1888 eor v1.16b,v1.16b,v8.16b 1889 eor v2.16b,v2.16b,v9.16b 1890 eor v3.16b,v3.16b,v10.16b 1891 // save back IV 1892 st1 {v15.4s}, [x4] 1893 eor v4.16b,v4.16b,v11.16b 1894 eor v5.16b,v5.16b,v12.16b 1895 eor v6.16b,v6.16b,v13.16b 1896 eor v7.16b,v7.16b,v14.16b 1897 st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 1898 st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64 1899 subs w2,w2,#8 1900 b.gt .Lcbc_8_blocks_dec 1901 b.eq 100f 19021: 1903 ld1 {v15.4s},[x4] 1904.Lcbc_4_blocks_dec: 1905 cmp w2,#4 1906 b.lt 1f 1907 ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[x0] 1908#ifndef __AARCH64EB__ 1909 rev32 v4.16b,v4.16b 1910#endif 1911#ifndef __AARCH64EB__ 1912 rev32 v5.16b,v5.16b 1913#endif 1914#ifndef __AARCH64EB__ 1915 rev32 v6.16b,v6.16b 1916#endif 1917#ifndef __AARCH64EB__ 1918 rev32 v7.16b,v7.16b 1919#endif 1920 bl _vpsm4_ex_enc_4blks 1921 ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 1922 zip1 v8.4s,v0.4s,v1.4s 1923 zip2 v9.4s,v0.4s,v1.4s 1924 zip1 v10.4s,v2.4s,v3.4s 1925 zip2 v11.4s,v2.4s,v3.4s 1926 zip1 v0.2d,v8.2d,v10.2d 1927 zip2 v1.2d,v8.2d,v10.2d 1928 zip1 v2.2d,v9.2d,v11.2d 1929 zip2 v3.2d,v9.2d,v11.2d 1930 eor v0.16b,v0.16b,v15.16b 1931 eor v1.16b,v1.16b,v4.16b 1932 orr v15.16b,v7.16b,v7.16b 1933 eor v2.16b,v2.16b,v5.16b 1934 eor v3.16b,v3.16b,v6.16b 1935 st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 1936 subs w2,w2,#4 1937 b.gt .Lcbc_4_blocks_dec 1938 // save back IV 1939 st1 {v7.4s}, [x4] 1940 b 100f 19411: // last block 1942 subs w2,w2,#1 1943 b.lt 100f 1944 b.gt 1f 1945 ld1 {v4.4s},[x0],#16 1946 // save back IV 1947 st1 {v4.4s}, [x4] 1948#ifndef __AARCH64EB__ 1949 rev32 v8.16b,v4.16b 1950#else 1951 mov v8.16b,v4.16b 1952#endif 1953 mov x10,x3 1954 mov w11,#8 1955 mov w12,v8.s[0] 1956 mov w13,v8.s[1] 1957 mov w14,v8.s[2] 1958 mov w15,v8.s[3] 195910: 1960 ldp w7,w8,[x10],8 1961 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 1962 eor w6,w14,w15 1963 eor w9,w7,w13 1964 eor w6,w6,w9 1965 mov v3.s[0],w6 1966 // optimize sbox using AESE instruction 1967 tbl v0.16b, {v3.16b}, v26.16b 1968 ushr v2.16b, v0.16b, 4 1969 and v0.16b, v0.16b, v31.16b 1970 tbl v0.16b, {v28.16b}, v0.16b 1971 tbl v2.16b, {v27.16b}, v2.16b 1972 eor v0.16b, v0.16b, v2.16b 1973 eor v1.16b, v1.16b, v1.16b 1974 aese v0.16b,v1.16b 1975 ushr v2.16b, v0.16b, 4 1976 and v0.16b, v0.16b, v31.16b 1977 tbl v0.16b, {v30.16b}, v0.16b 1978 tbl v2.16b, {v29.16b}, v2.16b 1979 eor v0.16b, v0.16b, v2.16b 1980 1981 mov w7,v0.s[0] 1982 eor w6,w7,w7,ror #32-2 1983 eor w6,w6,w7,ror #32-10 1984 eor w6,w6,w7,ror #32-18 1985 eor w6,w6,w7,ror #32-24 1986 eor w12,w12,w6 1987 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 1988 eor w6,w14,w15 1989 eor w9,w12,w8 1990 eor w6,w6,w9 1991 mov v3.s[0],w6 1992 // optimize sbox using AESE instruction 1993 tbl v0.16b, {v3.16b}, v26.16b 1994 ushr v2.16b, v0.16b, 4 1995 and v0.16b, v0.16b, v31.16b 1996 tbl v0.16b, {v28.16b}, v0.16b 1997 tbl v2.16b, {v27.16b}, v2.16b 1998 eor v0.16b, v0.16b, v2.16b 1999 eor v1.16b, v1.16b, v1.16b 2000 aese v0.16b,v1.16b 2001 ushr v2.16b, v0.16b, 4 2002 and v0.16b, v0.16b, v31.16b 2003 tbl v0.16b, {v30.16b}, v0.16b 2004 tbl v2.16b, {v29.16b}, v2.16b 2005 eor v0.16b, v0.16b, v2.16b 2006 2007 mov w7,v0.s[0] 2008 eor w6,w7,w7,ror #32-2 2009 eor w6,w6,w7,ror #32-10 2010 eor w6,w6,w7,ror #32-18 2011 eor w6,w6,w7,ror #32-24 2012 ldp w7,w8,[x10],8 2013 eor w13,w13,w6 2014 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 2015 eor w6,w12,w13 2016 eor w9,w7,w15 2017 eor w6,w6,w9 2018 mov v3.s[0],w6 2019 // optimize sbox using AESE instruction 2020 tbl v0.16b, {v3.16b}, v26.16b 2021 ushr v2.16b, v0.16b, 4 2022 and v0.16b, v0.16b, v31.16b 2023 tbl v0.16b, {v28.16b}, v0.16b 2024 tbl v2.16b, {v27.16b}, v2.16b 2025 eor v0.16b, v0.16b, v2.16b 2026 eor v1.16b, v1.16b, v1.16b 2027 aese v0.16b,v1.16b 2028 ushr v2.16b, v0.16b, 4 2029 and v0.16b, v0.16b, v31.16b 2030 tbl v0.16b, {v30.16b}, v0.16b 2031 tbl v2.16b, {v29.16b}, v2.16b 2032 eor v0.16b, v0.16b, v2.16b 2033 2034 mov w7,v0.s[0] 2035 eor w6,w7,w7,ror #32-2 2036 eor w6,w6,w7,ror #32-10 2037 eor w6,w6,w7,ror #32-18 2038 eor w6,w6,w7,ror #32-24 2039 eor w14,w14,w6 2040 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 2041 eor w6,w12,w13 2042 eor w9,w14,w8 2043 eor w6,w6,w9 2044 mov v3.s[0],w6 2045 // optimize sbox using AESE instruction 2046 tbl v0.16b, {v3.16b}, v26.16b 2047 ushr v2.16b, v0.16b, 4 2048 and v0.16b, v0.16b, v31.16b 2049 tbl v0.16b, {v28.16b}, v0.16b 2050 tbl v2.16b, {v27.16b}, v2.16b 2051 eor v0.16b, v0.16b, v2.16b 2052 eor v1.16b, v1.16b, v1.16b 2053 aese v0.16b,v1.16b 2054 ushr v2.16b, v0.16b, 4 2055 and v0.16b, v0.16b, v31.16b 2056 tbl v0.16b, {v30.16b}, v0.16b 2057 tbl v2.16b, {v29.16b}, v2.16b 2058 eor v0.16b, v0.16b, v2.16b 2059 2060 mov w7,v0.s[0] 2061 eor w6,w7,w7,ror #32-2 2062 eor w6,w6,w7,ror #32-10 2063 eor w6,w6,w7,ror #32-18 2064 eor w6,w6,w7,ror #32-24 2065 eor w15,w15,w6 2066 subs w11,w11,#1 2067 b.ne 10b 2068 mov v8.s[0],w15 2069 mov v8.s[1],w14 2070 mov v8.s[2],w13 2071 mov v8.s[3],w12 2072#ifndef __AARCH64EB__ 2073 rev32 v8.16b,v8.16b 2074#endif 2075 eor v8.16b,v8.16b,v15.16b 2076 st1 {v8.4s},[x1],#16 2077 b 100f 20781: // last two blocks 2079 ld4 {v4.s,v5.s,v6.s,v7.s}[0],[x0] 2080 add x10,x0,#16 2081 ld4 {v4.s,v5.s,v6.s,v7.s}[1],[x10],#16 2082 subs w2,w2,1 2083 b.gt 1f 2084#ifndef __AARCH64EB__ 2085 rev32 v4.16b,v4.16b 2086#endif 2087#ifndef __AARCH64EB__ 2088 rev32 v5.16b,v5.16b 2089#endif 2090#ifndef __AARCH64EB__ 2091 rev32 v6.16b,v6.16b 2092#endif 2093#ifndef __AARCH64EB__ 2094 rev32 v7.16b,v7.16b 2095#endif 2096 bl _vpsm4_ex_enc_4blks 2097 ld1 {v4.4s,v5.4s},[x0],#32 2098 zip1 v8.4s,v0.4s,v1.4s 2099 zip2 v9.4s,v0.4s,v1.4s 2100 zip1 v10.4s,v2.4s,v3.4s 2101 zip2 v11.4s,v2.4s,v3.4s 2102 zip1 v0.2d,v8.2d,v10.2d 2103 zip2 v1.2d,v8.2d,v10.2d 2104 zip1 v2.2d,v9.2d,v11.2d 2105 zip2 v3.2d,v9.2d,v11.2d 2106 eor v0.16b,v0.16b,v15.16b 2107 eor v1.16b,v1.16b,v4.16b 2108 st1 {v0.4s,v1.4s},[x1],#32 2109 // save back IV 2110 st1 {v5.4s}, [x4] 2111 b 100f 21121: // last 3 blocks 2113 ld4 {v4.s,v5.s,v6.s,v7.s}[2],[x10] 2114#ifndef __AARCH64EB__ 2115 rev32 v4.16b,v4.16b 2116#endif 2117#ifndef __AARCH64EB__ 2118 rev32 v5.16b,v5.16b 2119#endif 2120#ifndef __AARCH64EB__ 2121 rev32 v6.16b,v6.16b 2122#endif 2123#ifndef __AARCH64EB__ 2124 rev32 v7.16b,v7.16b 2125#endif 2126 bl _vpsm4_ex_enc_4blks 2127 ld1 {v4.4s,v5.4s,v6.4s},[x0],#48 2128 zip1 v8.4s,v0.4s,v1.4s 2129 zip2 v9.4s,v0.4s,v1.4s 2130 zip1 v10.4s,v2.4s,v3.4s 2131 zip2 v11.4s,v2.4s,v3.4s 2132 zip1 v0.2d,v8.2d,v10.2d 2133 zip2 v1.2d,v8.2d,v10.2d 2134 zip1 v2.2d,v9.2d,v11.2d 2135 zip2 v3.2d,v9.2d,v11.2d 2136 eor v0.16b,v0.16b,v15.16b 2137 eor v1.16b,v1.16b,v4.16b 2138 eor v2.16b,v2.16b,v5.16b 2139 st1 {v0.4s,v1.4s,v2.4s},[x1],#48 2140 // save back IV 2141 st1 {v6.4s}, [x4] 2142100: 2143 ldp d10,d11,[sp,#16] 2144 ldp d12,d13,[sp,#32] 2145 ldp d14,d15,[sp,#48] 2146 ldp x29,x30,[sp,#64] 2147 ldp d8,d9,[sp],#80 2148 AARCH64_VALIDATE_LINK_REGISTER 2149 ret 2150.size vpsm4_ex_cbc_encrypt,.-vpsm4_ex_cbc_encrypt 2151.globl vpsm4_ex_ctr32_encrypt_blocks 2152.type vpsm4_ex_ctr32_encrypt_blocks,%function 2153.align 5 2154vpsm4_ex_ctr32_encrypt_blocks: 2155 AARCH64_VALID_CALL_TARGET 2156 ld1 {v3.4s},[x4] 2157#ifndef __AARCH64EB__ 2158 rev32 v3.16b,v3.16b 2159#endif 2160 adrp x9, .Lsbox_magic 2161 ldr q26, [x9, #:lo12:.Lsbox_magic] 2162 ldr q27, [x9, #:lo12:.Lsbox_magic+16] 2163 ldr q28, [x9, #:lo12:.Lsbox_magic+32] 2164 ldr q29, [x9, #:lo12:.Lsbox_magic+48] 2165 ldr q30, [x9, #:lo12:.Lsbox_magic+64] 2166 ldr q31, [x9, #:lo12:.Lsbox_magic+80] 2167 cmp w2,#1 2168 b.ne 1f 2169 // fast processing for one single block without 2170 // context saving overhead 2171 mov x10,x3 2172 mov w11,#8 2173 mov w12,v3.s[0] 2174 mov w13,v3.s[1] 2175 mov w14,v3.s[2] 2176 mov w15,v3.s[3] 217710: 2178 ldp w7,w8,[x10],8 2179 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 2180 eor w6,w14,w15 2181 eor w9,w7,w13 2182 eor w6,w6,w9 2183 mov v3.s[0],w6 2184 // optimize sbox using AESE instruction 2185 tbl v0.16b, {v3.16b}, v26.16b 2186 ushr v2.16b, v0.16b, 4 2187 and v0.16b, v0.16b, v31.16b 2188 tbl v0.16b, {v28.16b}, v0.16b 2189 tbl v2.16b, {v27.16b}, v2.16b 2190 eor v0.16b, v0.16b, v2.16b 2191 eor v1.16b, v1.16b, v1.16b 2192 aese v0.16b,v1.16b 2193 ushr v2.16b, v0.16b, 4 2194 and v0.16b, v0.16b, v31.16b 2195 tbl v0.16b, {v30.16b}, v0.16b 2196 tbl v2.16b, {v29.16b}, v2.16b 2197 eor v0.16b, v0.16b, v2.16b 2198 2199 mov w7,v0.s[0] 2200 eor w6,w7,w7,ror #32-2 2201 eor w6,w6,w7,ror #32-10 2202 eor w6,w6,w7,ror #32-18 2203 eor w6,w6,w7,ror #32-24 2204 eor w12,w12,w6 2205 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 2206 eor w6,w14,w15 2207 eor w9,w12,w8 2208 eor w6,w6,w9 2209 mov v3.s[0],w6 2210 // optimize sbox using AESE instruction 2211 tbl v0.16b, {v3.16b}, v26.16b 2212 ushr v2.16b, v0.16b, 4 2213 and v0.16b, v0.16b, v31.16b 2214 tbl v0.16b, {v28.16b}, v0.16b 2215 tbl v2.16b, {v27.16b}, v2.16b 2216 eor v0.16b, v0.16b, v2.16b 2217 eor v1.16b, v1.16b, v1.16b 2218 aese v0.16b,v1.16b 2219 ushr v2.16b, v0.16b, 4 2220 and v0.16b, v0.16b, v31.16b 2221 tbl v0.16b, {v30.16b}, v0.16b 2222 tbl v2.16b, {v29.16b}, v2.16b 2223 eor v0.16b, v0.16b, v2.16b 2224 2225 mov w7,v0.s[0] 2226 eor w6,w7,w7,ror #32-2 2227 eor w6,w6,w7,ror #32-10 2228 eor w6,w6,w7,ror #32-18 2229 eor w6,w6,w7,ror #32-24 2230 ldp w7,w8,[x10],8 2231 eor w13,w13,w6 2232 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 2233 eor w6,w12,w13 2234 eor w9,w7,w15 2235 eor w6,w6,w9 2236 mov v3.s[0],w6 2237 // optimize sbox using AESE instruction 2238 tbl v0.16b, {v3.16b}, v26.16b 2239 ushr v2.16b, v0.16b, 4 2240 and v0.16b, v0.16b, v31.16b 2241 tbl v0.16b, {v28.16b}, v0.16b 2242 tbl v2.16b, {v27.16b}, v2.16b 2243 eor v0.16b, v0.16b, v2.16b 2244 eor v1.16b, v1.16b, v1.16b 2245 aese v0.16b,v1.16b 2246 ushr v2.16b, v0.16b, 4 2247 and v0.16b, v0.16b, v31.16b 2248 tbl v0.16b, {v30.16b}, v0.16b 2249 tbl v2.16b, {v29.16b}, v2.16b 2250 eor v0.16b, v0.16b, v2.16b 2251 2252 mov w7,v0.s[0] 2253 eor w6,w7,w7,ror #32-2 2254 eor w6,w6,w7,ror #32-10 2255 eor w6,w6,w7,ror #32-18 2256 eor w6,w6,w7,ror #32-24 2257 eor w14,w14,w6 2258 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 2259 eor w6,w12,w13 2260 eor w9,w14,w8 2261 eor w6,w6,w9 2262 mov v3.s[0],w6 2263 // optimize sbox using AESE instruction 2264 tbl v0.16b, {v3.16b}, v26.16b 2265 ushr v2.16b, v0.16b, 4 2266 and v0.16b, v0.16b, v31.16b 2267 tbl v0.16b, {v28.16b}, v0.16b 2268 tbl v2.16b, {v27.16b}, v2.16b 2269 eor v0.16b, v0.16b, v2.16b 2270 eor v1.16b, v1.16b, v1.16b 2271 aese v0.16b,v1.16b 2272 ushr v2.16b, v0.16b, 4 2273 and v0.16b, v0.16b, v31.16b 2274 tbl v0.16b, {v30.16b}, v0.16b 2275 tbl v2.16b, {v29.16b}, v2.16b 2276 eor v0.16b, v0.16b, v2.16b 2277 2278 mov w7,v0.s[0] 2279 eor w6,w7,w7,ror #32-2 2280 eor w6,w6,w7,ror #32-10 2281 eor w6,w6,w7,ror #32-18 2282 eor w6,w6,w7,ror #32-24 2283 eor w15,w15,w6 2284 subs w11,w11,#1 2285 b.ne 10b 2286 mov v3.s[0],w15 2287 mov v3.s[1],w14 2288 mov v3.s[2],w13 2289 mov v3.s[3],w12 2290#ifndef __AARCH64EB__ 2291 rev32 v3.16b,v3.16b 2292#endif 2293 ld1 {v4.4s},[x0] 2294 eor v4.16b,v4.16b,v3.16b 2295 st1 {v4.4s},[x1] 2296 ret 22971: 2298 AARCH64_SIGN_LINK_REGISTER 2299 stp d8,d9,[sp,#-80]! 2300 stp d10,d11,[sp,#16] 2301 stp d12,d13,[sp,#32] 2302 stp d14,d15,[sp,#48] 2303 stp x29,x30,[sp,#64] 2304 mov w12,v3.s[0] 2305 mov w13,v3.s[1] 2306 mov w14,v3.s[2] 2307 mov w5,v3.s[3] 2308.Lctr32_4_blocks_process: 2309 cmp w2,#4 2310 b.lt 1f 2311 dup v4.4s,w12 2312 dup v5.4s,w13 2313 dup v6.4s,w14 2314 mov v7.s[0],w5 2315 add w5,w5,#1 2316 mov v7.s[1],w5 2317 add w5,w5,#1 2318 mov v7.s[2],w5 2319 add w5,w5,#1 2320 mov v7.s[3],w5 2321 add w5,w5,#1 2322 cmp w2,#8 2323 b.ge .Lctr32_8_blocks_process 2324 bl _vpsm4_ex_enc_4blks 2325 ld4 {v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64 2326 eor v0.16b,v0.16b,v12.16b 2327 eor v1.16b,v1.16b,v13.16b 2328 eor v2.16b,v2.16b,v14.16b 2329 eor v3.16b,v3.16b,v15.16b 2330 st4 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 2331 subs w2,w2,#4 2332 b.ne .Lctr32_4_blocks_process 2333 b 100f 2334.Lctr32_8_blocks_process: 2335 dup v8.4s,w12 2336 dup v9.4s,w13 2337 dup v10.4s,w14 2338 mov v11.s[0],w5 2339 add w5,w5,#1 2340 mov v11.s[1],w5 2341 add w5,w5,#1 2342 mov v11.s[2],w5 2343 add w5,w5,#1 2344 mov v11.s[3],w5 2345 add w5,w5,#1 2346 bl _vpsm4_ex_enc_8blks 2347 ld4 {v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64 2348 ld4 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64 2349 eor v0.16b,v0.16b,v12.16b 2350 eor v1.16b,v1.16b,v13.16b 2351 eor v2.16b,v2.16b,v14.16b 2352 eor v3.16b,v3.16b,v15.16b 2353 eor v4.16b,v4.16b,v8.16b 2354 eor v5.16b,v5.16b,v9.16b 2355 eor v6.16b,v6.16b,v10.16b 2356 eor v7.16b,v7.16b,v11.16b 2357 st4 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 2358 st4 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64 2359 subs w2,w2,#8 2360 b.ne .Lctr32_4_blocks_process 2361 b 100f 23621: // last block processing 2363 subs w2,w2,#1 2364 b.lt 100f 2365 b.gt 1f 2366 mov v3.s[0],w12 2367 mov v3.s[1],w13 2368 mov v3.s[2],w14 2369 mov v3.s[3],w5 2370 mov x10,x3 2371 mov w11,#8 2372 mov w12,v3.s[0] 2373 mov w13,v3.s[1] 2374 mov w14,v3.s[2] 2375 mov w15,v3.s[3] 237610: 2377 ldp w7,w8,[x10],8 2378 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 2379 eor w6,w14,w15 2380 eor w9,w7,w13 2381 eor w6,w6,w9 2382 mov v3.s[0],w6 2383 // optimize sbox using AESE instruction 2384 tbl v0.16b, {v3.16b}, v26.16b 2385 ushr v2.16b, v0.16b, 4 2386 and v0.16b, v0.16b, v31.16b 2387 tbl v0.16b, {v28.16b}, v0.16b 2388 tbl v2.16b, {v27.16b}, v2.16b 2389 eor v0.16b, v0.16b, v2.16b 2390 eor v1.16b, v1.16b, v1.16b 2391 aese v0.16b,v1.16b 2392 ushr v2.16b, v0.16b, 4 2393 and v0.16b, v0.16b, v31.16b 2394 tbl v0.16b, {v30.16b}, v0.16b 2395 tbl v2.16b, {v29.16b}, v2.16b 2396 eor v0.16b, v0.16b, v2.16b 2397 2398 mov w7,v0.s[0] 2399 eor w6,w7,w7,ror #32-2 2400 eor w6,w6,w7,ror #32-10 2401 eor w6,w6,w7,ror #32-18 2402 eor w6,w6,w7,ror #32-24 2403 eor w12,w12,w6 2404 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 2405 eor w6,w14,w15 2406 eor w9,w12,w8 2407 eor w6,w6,w9 2408 mov v3.s[0],w6 2409 // optimize sbox using AESE instruction 2410 tbl v0.16b, {v3.16b}, v26.16b 2411 ushr v2.16b, v0.16b, 4 2412 and v0.16b, v0.16b, v31.16b 2413 tbl v0.16b, {v28.16b}, v0.16b 2414 tbl v2.16b, {v27.16b}, v2.16b 2415 eor v0.16b, v0.16b, v2.16b 2416 eor v1.16b, v1.16b, v1.16b 2417 aese v0.16b,v1.16b 2418 ushr v2.16b, v0.16b, 4 2419 and v0.16b, v0.16b, v31.16b 2420 tbl v0.16b, {v30.16b}, v0.16b 2421 tbl v2.16b, {v29.16b}, v2.16b 2422 eor v0.16b, v0.16b, v2.16b 2423 2424 mov w7,v0.s[0] 2425 eor w6,w7,w7,ror #32-2 2426 eor w6,w6,w7,ror #32-10 2427 eor w6,w6,w7,ror #32-18 2428 eor w6,w6,w7,ror #32-24 2429 ldp w7,w8,[x10],8 2430 eor w13,w13,w6 2431 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 2432 eor w6,w12,w13 2433 eor w9,w7,w15 2434 eor w6,w6,w9 2435 mov v3.s[0],w6 2436 // optimize sbox using AESE instruction 2437 tbl v0.16b, {v3.16b}, v26.16b 2438 ushr v2.16b, v0.16b, 4 2439 and v0.16b, v0.16b, v31.16b 2440 tbl v0.16b, {v28.16b}, v0.16b 2441 tbl v2.16b, {v27.16b}, v2.16b 2442 eor v0.16b, v0.16b, v2.16b 2443 eor v1.16b, v1.16b, v1.16b 2444 aese v0.16b,v1.16b 2445 ushr v2.16b, v0.16b, 4 2446 and v0.16b, v0.16b, v31.16b 2447 tbl v0.16b, {v30.16b}, v0.16b 2448 tbl v2.16b, {v29.16b}, v2.16b 2449 eor v0.16b, v0.16b, v2.16b 2450 2451 mov w7,v0.s[0] 2452 eor w6,w7,w7,ror #32-2 2453 eor w6,w6,w7,ror #32-10 2454 eor w6,w6,w7,ror #32-18 2455 eor w6,w6,w7,ror #32-24 2456 eor w14,w14,w6 2457 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 2458 eor w6,w12,w13 2459 eor w9,w14,w8 2460 eor w6,w6,w9 2461 mov v3.s[0],w6 2462 // optimize sbox using AESE instruction 2463 tbl v0.16b, {v3.16b}, v26.16b 2464 ushr v2.16b, v0.16b, 4 2465 and v0.16b, v0.16b, v31.16b 2466 tbl v0.16b, {v28.16b}, v0.16b 2467 tbl v2.16b, {v27.16b}, v2.16b 2468 eor v0.16b, v0.16b, v2.16b 2469 eor v1.16b, v1.16b, v1.16b 2470 aese v0.16b,v1.16b 2471 ushr v2.16b, v0.16b, 4 2472 and v0.16b, v0.16b, v31.16b 2473 tbl v0.16b, {v30.16b}, v0.16b 2474 tbl v2.16b, {v29.16b}, v2.16b 2475 eor v0.16b, v0.16b, v2.16b 2476 2477 mov w7,v0.s[0] 2478 eor w6,w7,w7,ror #32-2 2479 eor w6,w6,w7,ror #32-10 2480 eor w6,w6,w7,ror #32-18 2481 eor w6,w6,w7,ror #32-24 2482 eor w15,w15,w6 2483 subs w11,w11,#1 2484 b.ne 10b 2485 mov v3.s[0],w15 2486 mov v3.s[1],w14 2487 mov v3.s[2],w13 2488 mov v3.s[3],w12 2489#ifndef __AARCH64EB__ 2490 rev32 v3.16b,v3.16b 2491#endif 2492 ld1 {v4.4s},[x0] 2493 eor v4.16b,v4.16b,v3.16b 2494 st1 {v4.4s},[x1] 2495 b 100f 24961: // last 2 blocks processing 2497 dup v4.4s,w12 2498 dup v5.4s,w13 2499 dup v6.4s,w14 2500 mov v7.s[0],w5 2501 add w5,w5,#1 2502 mov v7.s[1],w5 2503 subs w2,w2,#1 2504 b.ne 1f 2505 bl _vpsm4_ex_enc_4blks 2506 ld4 {v12.s,v13.s,v14.s,v15.s}[0],[x0],#16 2507 ld4 {v12.s,v13.s,v14.s,v15.s}[1],[x0],#16 2508 eor v0.16b,v0.16b,v12.16b 2509 eor v1.16b,v1.16b,v13.16b 2510 eor v2.16b,v2.16b,v14.16b 2511 eor v3.16b,v3.16b,v15.16b 2512 st4 {v0.s,v1.s,v2.s,v3.s}[0],[x1],#16 2513 st4 {v0.s,v1.s,v2.s,v3.s}[1],[x1],#16 2514 b 100f 25151: // last 3 blocks processing 2516 add w5,w5,#1 2517 mov v7.s[2],w5 2518 bl _vpsm4_ex_enc_4blks 2519 ld4 {v12.s,v13.s,v14.s,v15.s}[0],[x0],#16 2520 ld4 {v12.s,v13.s,v14.s,v15.s}[1],[x0],#16 2521 ld4 {v12.s,v13.s,v14.s,v15.s}[2],[x0],#16 2522 eor v0.16b,v0.16b,v12.16b 2523 eor v1.16b,v1.16b,v13.16b 2524 eor v2.16b,v2.16b,v14.16b 2525 eor v3.16b,v3.16b,v15.16b 2526 st4 {v0.s,v1.s,v2.s,v3.s}[0],[x1],#16 2527 st4 {v0.s,v1.s,v2.s,v3.s}[1],[x1],#16 2528 st4 {v0.s,v1.s,v2.s,v3.s}[2],[x1],#16 2529100: 2530 ldp d10,d11,[sp,#16] 2531 ldp d12,d13,[sp,#32] 2532 ldp d14,d15,[sp,#48] 2533 ldp x29,x30,[sp,#64] 2534 ldp d8,d9,[sp],#80 2535 AARCH64_VALIDATE_LINK_REGISTER 2536 ret 2537.size vpsm4_ex_ctr32_encrypt_blocks,.-vpsm4_ex_ctr32_encrypt_blocks 2538.globl vpsm4_ex_xts_encrypt_gb 2539.type vpsm4_ex_xts_encrypt_gb,%function 2540.align 5 2541vpsm4_ex_xts_encrypt_gb: 2542 AARCH64_SIGN_LINK_REGISTER 2543 stp x15, x16, [sp, #-0x10]! 2544 stp x17, x18, [sp, #-0x10]! 2545 stp x19, x20, [sp, #-0x10]! 2546 stp x21, x22, [sp, #-0x10]! 2547 stp x23, x24, [sp, #-0x10]! 2548 stp x25, x26, [sp, #-0x10]! 2549 stp x27, x28, [sp, #-0x10]! 2550 stp x29, x30, [sp, #-0x10]! 2551 stp d8, d9, [sp, #-0x10]! 2552 stp d10, d11, [sp, #-0x10]! 2553 stp d12, d13, [sp, #-0x10]! 2554 stp d14, d15, [sp, #-0x10]! 2555 mov x26,x3 2556 mov x27,x4 2557 mov w28,w6 2558 ld1 {v16.4s}, [x5] 2559 mov x3,x27 2560 adrp x9, .Lsbox_magic 2561 ldr q26, [x9, #:lo12:.Lsbox_magic] 2562 ldr q27, [x9, #:lo12:.Lsbox_magic+16] 2563 ldr q28, [x9, #:lo12:.Lsbox_magic+32] 2564 ldr q29, [x9, #:lo12:.Lsbox_magic+48] 2565 ldr q30, [x9, #:lo12:.Lsbox_magic+64] 2566 ldr q31, [x9, #:lo12:.Lsbox_magic+80] 2567#ifndef __AARCH64EB__ 2568 rev32 v16.16b,v16.16b 2569#endif 2570 mov x10,x3 2571 mov w11,#8 2572 mov w12,v16.s[0] 2573 mov w13,v16.s[1] 2574 mov w14,v16.s[2] 2575 mov w15,v16.s[3] 257610: 2577 ldp w7,w8,[x10],8 2578 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 2579 eor w6,w14,w15 2580 eor w9,w7,w13 2581 eor w6,w6,w9 2582 mov v3.s[0],w6 2583 // optimize sbox using AESE instruction 2584 tbl v0.16b, {v3.16b}, v26.16b 2585 ushr v2.16b, v0.16b, 4 2586 and v0.16b, v0.16b, v31.16b 2587 tbl v0.16b, {v28.16b}, v0.16b 2588 tbl v2.16b, {v27.16b}, v2.16b 2589 eor v0.16b, v0.16b, v2.16b 2590 eor v1.16b, v1.16b, v1.16b 2591 aese v0.16b,v1.16b 2592 ushr v2.16b, v0.16b, 4 2593 and v0.16b, v0.16b, v31.16b 2594 tbl v0.16b, {v30.16b}, v0.16b 2595 tbl v2.16b, {v29.16b}, v2.16b 2596 eor v0.16b, v0.16b, v2.16b 2597 2598 mov w7,v0.s[0] 2599 eor w6,w7,w7,ror #32-2 2600 eor w6,w6,w7,ror #32-10 2601 eor w6,w6,w7,ror #32-18 2602 eor w6,w6,w7,ror #32-24 2603 eor w12,w12,w6 2604 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 2605 eor w6,w14,w15 2606 eor w9,w12,w8 2607 eor w6,w6,w9 2608 mov v3.s[0],w6 2609 // optimize sbox using AESE instruction 2610 tbl v0.16b, {v3.16b}, v26.16b 2611 ushr v2.16b, v0.16b, 4 2612 and v0.16b, v0.16b, v31.16b 2613 tbl v0.16b, {v28.16b}, v0.16b 2614 tbl v2.16b, {v27.16b}, v2.16b 2615 eor v0.16b, v0.16b, v2.16b 2616 eor v1.16b, v1.16b, v1.16b 2617 aese v0.16b,v1.16b 2618 ushr v2.16b, v0.16b, 4 2619 and v0.16b, v0.16b, v31.16b 2620 tbl v0.16b, {v30.16b}, v0.16b 2621 tbl v2.16b, {v29.16b}, v2.16b 2622 eor v0.16b, v0.16b, v2.16b 2623 2624 mov w7,v0.s[0] 2625 eor w6,w7,w7,ror #32-2 2626 eor w6,w6,w7,ror #32-10 2627 eor w6,w6,w7,ror #32-18 2628 eor w6,w6,w7,ror #32-24 2629 ldp w7,w8,[x10],8 2630 eor w13,w13,w6 2631 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 2632 eor w6,w12,w13 2633 eor w9,w7,w15 2634 eor w6,w6,w9 2635 mov v3.s[0],w6 2636 // optimize sbox using AESE instruction 2637 tbl v0.16b, {v3.16b}, v26.16b 2638 ushr v2.16b, v0.16b, 4 2639 and v0.16b, v0.16b, v31.16b 2640 tbl v0.16b, {v28.16b}, v0.16b 2641 tbl v2.16b, {v27.16b}, v2.16b 2642 eor v0.16b, v0.16b, v2.16b 2643 eor v1.16b, v1.16b, v1.16b 2644 aese v0.16b,v1.16b 2645 ushr v2.16b, v0.16b, 4 2646 and v0.16b, v0.16b, v31.16b 2647 tbl v0.16b, {v30.16b}, v0.16b 2648 tbl v2.16b, {v29.16b}, v2.16b 2649 eor v0.16b, v0.16b, v2.16b 2650 2651 mov w7,v0.s[0] 2652 eor w6,w7,w7,ror #32-2 2653 eor w6,w6,w7,ror #32-10 2654 eor w6,w6,w7,ror #32-18 2655 eor w6,w6,w7,ror #32-24 2656 eor w14,w14,w6 2657 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 2658 eor w6,w12,w13 2659 eor w9,w14,w8 2660 eor w6,w6,w9 2661 mov v3.s[0],w6 2662 // optimize sbox using AESE instruction 2663 tbl v0.16b, {v3.16b}, v26.16b 2664 ushr v2.16b, v0.16b, 4 2665 and v0.16b, v0.16b, v31.16b 2666 tbl v0.16b, {v28.16b}, v0.16b 2667 tbl v2.16b, {v27.16b}, v2.16b 2668 eor v0.16b, v0.16b, v2.16b 2669 eor v1.16b, v1.16b, v1.16b 2670 aese v0.16b,v1.16b 2671 ushr v2.16b, v0.16b, 4 2672 and v0.16b, v0.16b, v31.16b 2673 tbl v0.16b, {v30.16b}, v0.16b 2674 tbl v2.16b, {v29.16b}, v2.16b 2675 eor v0.16b, v0.16b, v2.16b 2676 2677 mov w7,v0.s[0] 2678 eor w6,w7,w7,ror #32-2 2679 eor w6,w6,w7,ror #32-10 2680 eor w6,w6,w7,ror #32-18 2681 eor w6,w6,w7,ror #32-24 2682 eor w15,w15,w6 2683 subs w11,w11,#1 2684 b.ne 10b 2685 mov v16.s[0],w15 2686 mov v16.s[1],w14 2687 mov v16.s[2],w13 2688 mov v16.s[3],w12 2689#ifndef __AARCH64EB__ 2690 rev32 v16.16b,v16.16b 2691#endif 2692 mov x3,x26 2693 and x29,x2,#0x0F 2694 // convert length into blocks 2695 lsr x2,x2,4 2696 cmp x2,#1 2697 b.lt .return_gb 2698 2699 cmp x29,0 2700 // If the encryption/decryption Length is N times of 16, 2701 // the all blocks are encrypted/decrypted in .xts_encrypt_blocks_gb 2702 b.eq .xts_encrypt_blocks_gb 2703 2704 // If the encryption/decryption length is not N times of 16, 2705 // the last two blocks are encrypted/decrypted in .last_2blks_tweak_gb or .only_2blks_tweak_gb 2706 // the other blocks are encrypted/decrypted in .xts_encrypt_blocks_gb 2707 subs x2,x2,#1 2708 b.eq .only_2blks_tweak_gb 2709.xts_encrypt_blocks_gb: 2710 rbit v16.16b,v16.16b 2711#ifdef __AARCH64EB__ 2712 rev32 v16.16b,v16.16b 2713#endif 2714 mov x12,v16.d[0] 2715 mov x13,v16.d[1] 2716 mov w7,0x87 2717 extr x9,x13,x13,#32 2718 extr x15,x13,x12,#63 2719 and w8,w7,w9,asr#31 2720 eor x14,x8,x12,lsl#1 2721 mov w7,0x87 2722 extr x9,x15,x15,#32 2723 extr x17,x15,x14,#63 2724 and w8,w7,w9,asr#31 2725 eor x16,x8,x14,lsl#1 2726 mov w7,0x87 2727 extr x9,x17,x17,#32 2728 extr x19,x17,x16,#63 2729 and w8,w7,w9,asr#31 2730 eor x18,x8,x16,lsl#1 2731 mov w7,0x87 2732 extr x9,x19,x19,#32 2733 extr x21,x19,x18,#63 2734 and w8,w7,w9,asr#31 2735 eor x20,x8,x18,lsl#1 2736 mov w7,0x87 2737 extr x9,x21,x21,#32 2738 extr x23,x21,x20,#63 2739 and w8,w7,w9,asr#31 2740 eor x22,x8,x20,lsl#1 2741 mov w7,0x87 2742 extr x9,x23,x23,#32 2743 extr x25,x23,x22,#63 2744 and w8,w7,w9,asr#31 2745 eor x24,x8,x22,lsl#1 2746 mov w7,0x87 2747 extr x9,x25,x25,#32 2748 extr x27,x25,x24,#63 2749 and w8,w7,w9,asr#31 2750 eor x26,x8,x24,lsl#1 2751.Lxts_8_blocks_process_gb: 2752 cmp x2,#8 2753 mov v16.d[0],x12 2754 mov v16.d[1],x13 2755#ifdef __AARCH64EB__ 2756 rev32 v16.16b,v16.16b 2757#endif 2758 mov w7,0x87 2759 extr x9,x27,x27,#32 2760 extr x13,x27,x26,#63 2761 and w8,w7,w9,asr#31 2762 eor x12,x8,x26,lsl#1 2763 mov v17.d[0],x14 2764 mov v17.d[1],x15 2765#ifdef __AARCH64EB__ 2766 rev32 v17.16b,v17.16b 2767#endif 2768 mov w7,0x87 2769 extr x9,x13,x13,#32 2770 extr x15,x13,x12,#63 2771 and w8,w7,w9,asr#31 2772 eor x14,x8,x12,lsl#1 2773 mov v18.d[0],x16 2774 mov v18.d[1],x17 2775#ifdef __AARCH64EB__ 2776 rev32 v18.16b,v18.16b 2777#endif 2778 mov w7,0x87 2779 extr x9,x15,x15,#32 2780 extr x17,x15,x14,#63 2781 and w8,w7,w9,asr#31 2782 eor x16,x8,x14,lsl#1 2783 mov v19.d[0],x18 2784 mov v19.d[1],x19 2785#ifdef __AARCH64EB__ 2786 rev32 v19.16b,v19.16b 2787#endif 2788 mov w7,0x87 2789 extr x9,x17,x17,#32 2790 extr x19,x17,x16,#63 2791 and w8,w7,w9,asr#31 2792 eor x18,x8,x16,lsl#1 2793 mov v20.d[0],x20 2794 mov v20.d[1],x21 2795#ifdef __AARCH64EB__ 2796 rev32 v20.16b,v20.16b 2797#endif 2798 mov w7,0x87 2799 extr x9,x19,x19,#32 2800 extr x21,x19,x18,#63 2801 and w8,w7,w9,asr#31 2802 eor x20,x8,x18,lsl#1 2803 mov v21.d[0],x22 2804 mov v21.d[1],x23 2805#ifdef __AARCH64EB__ 2806 rev32 v21.16b,v21.16b 2807#endif 2808 mov w7,0x87 2809 extr x9,x21,x21,#32 2810 extr x23,x21,x20,#63 2811 and w8,w7,w9,asr#31 2812 eor x22,x8,x20,lsl#1 2813 mov v22.d[0],x24 2814 mov v22.d[1],x25 2815#ifdef __AARCH64EB__ 2816 rev32 v22.16b,v22.16b 2817#endif 2818 mov w7,0x87 2819 extr x9,x23,x23,#32 2820 extr x25,x23,x22,#63 2821 and w8,w7,w9,asr#31 2822 eor x24,x8,x22,lsl#1 2823 mov v23.d[0],x26 2824 mov v23.d[1],x27 2825#ifdef __AARCH64EB__ 2826 rev32 v23.16b,v23.16b 2827#endif 2828 mov w7,0x87 2829 extr x9,x25,x25,#32 2830 extr x27,x25,x24,#63 2831 and w8,w7,w9,asr#31 2832 eor x26,x8,x24,lsl#1 2833 b.lt .Lxts_4_blocks_process_gb 2834 ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 2835 rbit v16.16b,v16.16b 2836 rbit v17.16b,v17.16b 2837 rbit v18.16b,v18.16b 2838 rbit v19.16b,v19.16b 2839 eor v4.16b, v4.16b, v16.16b 2840 eor v5.16b, v5.16b, v17.16b 2841 eor v6.16b, v6.16b, v18.16b 2842 eor v7.16b, v7.16b, v19.16b 2843 ld1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64 2844 rbit v20.16b,v20.16b 2845 rbit v21.16b,v21.16b 2846 rbit v22.16b,v22.16b 2847 rbit v23.16b,v23.16b 2848 eor v8.16b, v8.16b, v20.16b 2849 eor v9.16b, v9.16b, v21.16b 2850 eor v10.16b, v10.16b, v22.16b 2851 eor v11.16b, v11.16b, v23.16b 2852#ifndef __AARCH64EB__ 2853 rev32 v4.16b,v4.16b 2854#endif 2855#ifndef __AARCH64EB__ 2856 rev32 v5.16b,v5.16b 2857#endif 2858#ifndef __AARCH64EB__ 2859 rev32 v6.16b,v6.16b 2860#endif 2861#ifndef __AARCH64EB__ 2862 rev32 v7.16b,v7.16b 2863#endif 2864#ifndef __AARCH64EB__ 2865 rev32 v8.16b,v8.16b 2866#endif 2867#ifndef __AARCH64EB__ 2868 rev32 v9.16b,v9.16b 2869#endif 2870#ifndef __AARCH64EB__ 2871 rev32 v10.16b,v10.16b 2872#endif 2873#ifndef __AARCH64EB__ 2874 rev32 v11.16b,v11.16b 2875#endif 2876 zip1 v0.4s,v4.4s,v5.4s 2877 zip2 v1.4s,v4.4s,v5.4s 2878 zip1 v2.4s,v6.4s,v7.4s 2879 zip2 v3.4s,v6.4s,v7.4s 2880 zip1 v4.2d,v0.2d,v2.2d 2881 zip2 v5.2d,v0.2d,v2.2d 2882 zip1 v6.2d,v1.2d,v3.2d 2883 zip2 v7.2d,v1.2d,v3.2d 2884 zip1 v0.4s,v8.4s,v9.4s 2885 zip2 v1.4s,v8.4s,v9.4s 2886 zip1 v2.4s,v10.4s,v11.4s 2887 zip2 v3.4s,v10.4s,v11.4s 2888 zip1 v8.2d,v0.2d,v2.2d 2889 zip2 v9.2d,v0.2d,v2.2d 2890 zip1 v10.2d,v1.2d,v3.2d 2891 zip2 v11.2d,v1.2d,v3.2d 2892 bl _vpsm4_ex_enc_8blks 2893 zip1 v8.4s,v0.4s,v1.4s 2894 zip2 v9.4s,v0.4s,v1.4s 2895 zip1 v10.4s,v2.4s,v3.4s 2896 zip2 v11.4s,v2.4s,v3.4s 2897 zip1 v0.2d,v8.2d,v10.2d 2898 zip2 v1.2d,v8.2d,v10.2d 2899 zip1 v2.2d,v9.2d,v11.2d 2900 zip2 v3.2d,v9.2d,v11.2d 2901 zip1 v8.4s,v4.4s,v5.4s 2902 zip2 v9.4s,v4.4s,v5.4s 2903 zip1 v10.4s,v6.4s,v7.4s 2904 zip2 v11.4s,v6.4s,v7.4s 2905 zip1 v4.2d,v8.2d,v10.2d 2906 zip2 v5.2d,v8.2d,v10.2d 2907 zip1 v6.2d,v9.2d,v11.2d 2908 zip2 v7.2d,v9.2d,v11.2d 2909 eor v0.16b, v0.16b, v16.16b 2910 eor v1.16b, v1.16b, v17.16b 2911 eor v2.16b, v2.16b, v18.16b 2912 eor v3.16b, v3.16b, v19.16b 2913 eor v4.16b, v4.16b, v20.16b 2914 eor v5.16b, v5.16b, v21.16b 2915 eor v6.16b, v6.16b, v22.16b 2916 eor v7.16b, v7.16b, v23.16b 2917 2918 // save the last tweak 2919 mov v25.16b,v23.16b 2920 st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 2921 st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64 2922 subs x2,x2,#8 2923 b.gt .Lxts_8_blocks_process_gb 2924 b 100f 2925.Lxts_4_blocks_process_gb: 2926 cmp x2,#4 2927 b.lt 1f 2928 ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 2929 rbit v16.16b,v16.16b 2930 rbit v17.16b,v17.16b 2931 rbit v18.16b,v18.16b 2932 rbit v19.16b,v19.16b 2933 eor v4.16b, v4.16b, v16.16b 2934 eor v5.16b, v5.16b, v17.16b 2935 eor v6.16b, v6.16b, v18.16b 2936 eor v7.16b, v7.16b, v19.16b 2937#ifndef __AARCH64EB__ 2938 rev32 v4.16b,v4.16b 2939#endif 2940#ifndef __AARCH64EB__ 2941 rev32 v5.16b,v5.16b 2942#endif 2943#ifndef __AARCH64EB__ 2944 rev32 v6.16b,v6.16b 2945#endif 2946#ifndef __AARCH64EB__ 2947 rev32 v7.16b,v7.16b 2948#endif 2949 zip1 v0.4s,v4.4s,v5.4s 2950 zip2 v1.4s,v4.4s,v5.4s 2951 zip1 v2.4s,v6.4s,v7.4s 2952 zip2 v3.4s,v6.4s,v7.4s 2953 zip1 v4.2d,v0.2d,v2.2d 2954 zip2 v5.2d,v0.2d,v2.2d 2955 zip1 v6.2d,v1.2d,v3.2d 2956 zip2 v7.2d,v1.2d,v3.2d 2957 bl _vpsm4_ex_enc_4blks 2958 zip1 v4.4s,v0.4s,v1.4s 2959 zip2 v5.4s,v0.4s,v1.4s 2960 zip1 v6.4s,v2.4s,v3.4s 2961 zip2 v7.4s,v2.4s,v3.4s 2962 zip1 v0.2d,v4.2d,v6.2d 2963 zip2 v1.2d,v4.2d,v6.2d 2964 zip1 v2.2d,v5.2d,v7.2d 2965 zip2 v3.2d,v5.2d,v7.2d 2966 eor v0.16b, v0.16b, v16.16b 2967 eor v1.16b, v1.16b, v17.16b 2968 eor v2.16b, v2.16b, v18.16b 2969 eor v3.16b, v3.16b, v19.16b 2970 st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 2971 sub x2,x2,#4 2972 mov v16.16b,v20.16b 2973 mov v17.16b,v21.16b 2974 mov v18.16b,v22.16b 2975 // save the last tweak 2976 mov v25.16b,v19.16b 29771: 2978 // process last block 2979 cmp x2,#1 2980 b.lt 100f 2981 b.gt 1f 2982 ld1 {v4.4s},[x0],#16 2983 rbit v16.16b,v16.16b 2984 eor v4.16b, v4.16b, v16.16b 2985#ifndef __AARCH64EB__ 2986 rev32 v4.16b,v4.16b 2987#endif 2988 mov x10,x3 2989 mov w11,#8 2990 mov w12,v4.s[0] 2991 mov w13,v4.s[1] 2992 mov w14,v4.s[2] 2993 mov w15,v4.s[3] 299410: 2995 ldp w7,w8,[x10],8 2996 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 2997 eor w6,w14,w15 2998 eor w9,w7,w13 2999 eor w6,w6,w9 3000 mov v3.s[0],w6 3001 // optimize sbox using AESE instruction 3002 tbl v0.16b, {v3.16b}, v26.16b 3003 ushr v2.16b, v0.16b, 4 3004 and v0.16b, v0.16b, v31.16b 3005 tbl v0.16b, {v28.16b}, v0.16b 3006 tbl v2.16b, {v27.16b}, v2.16b 3007 eor v0.16b, v0.16b, v2.16b 3008 eor v1.16b, v1.16b, v1.16b 3009 aese v0.16b,v1.16b 3010 ushr v2.16b, v0.16b, 4 3011 and v0.16b, v0.16b, v31.16b 3012 tbl v0.16b, {v30.16b}, v0.16b 3013 tbl v2.16b, {v29.16b}, v2.16b 3014 eor v0.16b, v0.16b, v2.16b 3015 3016 mov w7,v0.s[0] 3017 eor w6,w7,w7,ror #32-2 3018 eor w6,w6,w7,ror #32-10 3019 eor w6,w6,w7,ror #32-18 3020 eor w6,w6,w7,ror #32-24 3021 eor w12,w12,w6 3022 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 3023 eor w6,w14,w15 3024 eor w9,w12,w8 3025 eor w6,w6,w9 3026 mov v3.s[0],w6 3027 // optimize sbox using AESE instruction 3028 tbl v0.16b, {v3.16b}, v26.16b 3029 ushr v2.16b, v0.16b, 4 3030 and v0.16b, v0.16b, v31.16b 3031 tbl v0.16b, {v28.16b}, v0.16b 3032 tbl v2.16b, {v27.16b}, v2.16b 3033 eor v0.16b, v0.16b, v2.16b 3034 eor v1.16b, v1.16b, v1.16b 3035 aese v0.16b,v1.16b 3036 ushr v2.16b, v0.16b, 4 3037 and v0.16b, v0.16b, v31.16b 3038 tbl v0.16b, {v30.16b}, v0.16b 3039 tbl v2.16b, {v29.16b}, v2.16b 3040 eor v0.16b, v0.16b, v2.16b 3041 3042 mov w7,v0.s[0] 3043 eor w6,w7,w7,ror #32-2 3044 eor w6,w6,w7,ror #32-10 3045 eor w6,w6,w7,ror #32-18 3046 eor w6,w6,w7,ror #32-24 3047 ldp w7,w8,[x10],8 3048 eor w13,w13,w6 3049 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 3050 eor w6,w12,w13 3051 eor w9,w7,w15 3052 eor w6,w6,w9 3053 mov v3.s[0],w6 3054 // optimize sbox using AESE instruction 3055 tbl v0.16b, {v3.16b}, v26.16b 3056 ushr v2.16b, v0.16b, 4 3057 and v0.16b, v0.16b, v31.16b 3058 tbl v0.16b, {v28.16b}, v0.16b 3059 tbl v2.16b, {v27.16b}, v2.16b 3060 eor v0.16b, v0.16b, v2.16b 3061 eor v1.16b, v1.16b, v1.16b 3062 aese v0.16b,v1.16b 3063 ushr v2.16b, v0.16b, 4 3064 and v0.16b, v0.16b, v31.16b 3065 tbl v0.16b, {v30.16b}, v0.16b 3066 tbl v2.16b, {v29.16b}, v2.16b 3067 eor v0.16b, v0.16b, v2.16b 3068 3069 mov w7,v0.s[0] 3070 eor w6,w7,w7,ror #32-2 3071 eor w6,w6,w7,ror #32-10 3072 eor w6,w6,w7,ror #32-18 3073 eor w6,w6,w7,ror #32-24 3074 eor w14,w14,w6 3075 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 3076 eor w6,w12,w13 3077 eor w9,w14,w8 3078 eor w6,w6,w9 3079 mov v3.s[0],w6 3080 // optimize sbox using AESE instruction 3081 tbl v0.16b, {v3.16b}, v26.16b 3082 ushr v2.16b, v0.16b, 4 3083 and v0.16b, v0.16b, v31.16b 3084 tbl v0.16b, {v28.16b}, v0.16b 3085 tbl v2.16b, {v27.16b}, v2.16b 3086 eor v0.16b, v0.16b, v2.16b 3087 eor v1.16b, v1.16b, v1.16b 3088 aese v0.16b,v1.16b 3089 ushr v2.16b, v0.16b, 4 3090 and v0.16b, v0.16b, v31.16b 3091 tbl v0.16b, {v30.16b}, v0.16b 3092 tbl v2.16b, {v29.16b}, v2.16b 3093 eor v0.16b, v0.16b, v2.16b 3094 3095 mov w7,v0.s[0] 3096 eor w6,w7,w7,ror #32-2 3097 eor w6,w6,w7,ror #32-10 3098 eor w6,w6,w7,ror #32-18 3099 eor w6,w6,w7,ror #32-24 3100 eor w15,w15,w6 3101 subs w11,w11,#1 3102 b.ne 10b 3103 mov v4.s[0],w15 3104 mov v4.s[1],w14 3105 mov v4.s[2],w13 3106 mov v4.s[3],w12 3107#ifndef __AARCH64EB__ 3108 rev32 v4.16b,v4.16b 3109#endif 3110 eor v4.16b, v4.16b, v16.16b 3111 st1 {v4.4s},[x1],#16 3112 // save the last tweak 3113 mov v25.16b,v16.16b 3114 b 100f 31151: // process last 2 blocks 3116 cmp x2,#2 3117 b.gt 1f 3118 ld1 {v4.4s,v5.4s},[x0],#32 3119 rbit v16.16b,v16.16b 3120 rbit v17.16b,v17.16b 3121 eor v4.16b, v4.16b, v16.16b 3122 eor v5.16b, v5.16b, v17.16b 3123#ifndef __AARCH64EB__ 3124 rev32 v4.16b,v4.16b 3125#endif 3126#ifndef __AARCH64EB__ 3127 rev32 v5.16b,v5.16b 3128#endif 3129 zip1 v0.4s,v4.4s,v5.4s 3130 zip2 v1.4s,v4.4s,v5.4s 3131 zip1 v2.4s,v6.4s,v7.4s 3132 zip2 v3.4s,v6.4s,v7.4s 3133 zip1 v4.2d,v0.2d,v2.2d 3134 zip2 v5.2d,v0.2d,v2.2d 3135 zip1 v6.2d,v1.2d,v3.2d 3136 zip2 v7.2d,v1.2d,v3.2d 3137 bl _vpsm4_ex_enc_4blks 3138 zip1 v4.4s,v0.4s,v1.4s 3139 zip2 v5.4s,v0.4s,v1.4s 3140 zip1 v6.4s,v2.4s,v3.4s 3141 zip2 v7.4s,v2.4s,v3.4s 3142 zip1 v0.2d,v4.2d,v6.2d 3143 zip2 v1.2d,v4.2d,v6.2d 3144 zip1 v2.2d,v5.2d,v7.2d 3145 zip2 v3.2d,v5.2d,v7.2d 3146 eor v0.16b, v0.16b, v16.16b 3147 eor v1.16b, v1.16b, v17.16b 3148 st1 {v0.4s,v1.4s},[x1],#32 3149 // save the last tweak 3150 mov v25.16b,v17.16b 3151 b 100f 31521: // process last 3 blocks 3153 ld1 {v4.4s,v5.4s,v6.4s},[x0],#48 3154 rbit v16.16b,v16.16b 3155 rbit v17.16b,v17.16b 3156 rbit v18.16b,v18.16b 3157 eor v4.16b, v4.16b, v16.16b 3158 eor v5.16b, v5.16b, v17.16b 3159 eor v6.16b, v6.16b, v18.16b 3160#ifndef __AARCH64EB__ 3161 rev32 v4.16b,v4.16b 3162#endif 3163#ifndef __AARCH64EB__ 3164 rev32 v5.16b,v5.16b 3165#endif 3166#ifndef __AARCH64EB__ 3167 rev32 v6.16b,v6.16b 3168#endif 3169 zip1 v0.4s,v4.4s,v5.4s 3170 zip2 v1.4s,v4.4s,v5.4s 3171 zip1 v2.4s,v6.4s,v7.4s 3172 zip2 v3.4s,v6.4s,v7.4s 3173 zip1 v4.2d,v0.2d,v2.2d 3174 zip2 v5.2d,v0.2d,v2.2d 3175 zip1 v6.2d,v1.2d,v3.2d 3176 zip2 v7.2d,v1.2d,v3.2d 3177 bl _vpsm4_ex_enc_4blks 3178 zip1 v4.4s,v0.4s,v1.4s 3179 zip2 v5.4s,v0.4s,v1.4s 3180 zip1 v6.4s,v2.4s,v3.4s 3181 zip2 v7.4s,v2.4s,v3.4s 3182 zip1 v0.2d,v4.2d,v6.2d 3183 zip2 v1.2d,v4.2d,v6.2d 3184 zip1 v2.2d,v5.2d,v7.2d 3185 zip2 v3.2d,v5.2d,v7.2d 3186 eor v0.16b, v0.16b, v16.16b 3187 eor v1.16b, v1.16b, v17.16b 3188 eor v2.16b, v2.16b, v18.16b 3189 st1 {v0.4s,v1.4s,v2.4s},[x1],#48 3190 // save the last tweak 3191 mov v25.16b,v18.16b 3192100: 3193 cmp x29,0 3194 b.eq .return_gb 3195 3196// This branch calculates the last two tweaks, 3197// while the encryption/decryption length is larger than 32 3198.last_2blks_tweak_gb: 3199#ifdef __AARCH64EB__ 3200 rev32 v25.16b,v25.16b 3201#endif 3202 rbit v2.16b,v25.16b 3203 adrp x9, .Lxts_magic 3204 ldr q0, [x9, #:lo12:.Lxts_magic] 3205 shl v17.16b, v2.16b, #1 3206 ext v1.16b, v2.16b, v2.16b,#15 3207 ushr v1.16b, v1.16b, #7 3208 mul v1.16b, v1.16b, v0.16b 3209 eor v17.16b, v17.16b, v1.16b 3210 rbit v17.16b,v17.16b 3211 rbit v2.16b,v17.16b 3212 adrp x9, .Lxts_magic 3213 ldr q0, [x9, #:lo12:.Lxts_magic] 3214 shl v18.16b, v2.16b, #1 3215 ext v1.16b, v2.16b, v2.16b,#15 3216 ushr v1.16b, v1.16b, #7 3217 mul v1.16b, v1.16b, v0.16b 3218 eor v18.16b, v18.16b, v1.16b 3219 rbit v18.16b,v18.16b 3220 b .check_dec_gb 3221 3222 3223// This branch calculates the last two tweaks, 3224// while the encryption/decryption length is equal to 32, who only need two tweaks 3225.only_2blks_tweak_gb: 3226 mov v17.16b,v16.16b 3227#ifdef __AARCH64EB__ 3228 rev32 v17.16b,v17.16b 3229#endif 3230 rbit v2.16b,v17.16b 3231 adrp x9, .Lxts_magic 3232 ldr q0, [x9, #:lo12:.Lxts_magic] 3233 shl v18.16b, v2.16b, #1 3234 ext v1.16b, v2.16b, v2.16b,#15 3235 ushr v1.16b, v1.16b, #7 3236 mul v1.16b, v1.16b, v0.16b 3237 eor v18.16b, v18.16b, v1.16b 3238 rbit v18.16b,v18.16b 3239 b .check_dec_gb 3240 3241 3242// Determine whether encryption or decryption is required. 3243// The last two tweaks need to be swapped for decryption. 3244.check_dec_gb: 3245 // encryption:1 decryption:0 3246 cmp w28,1 3247 b.eq .process_last_2blks_gb 3248 mov v0.16B,v17.16b 3249 mov v17.16B,v18.16b 3250 mov v18.16B,v0.16b 3251 3252.process_last_2blks_gb: 3253#ifdef __AARCH64EB__ 3254 rev32 v17.16b,v17.16b 3255#endif 3256#ifdef __AARCH64EB__ 3257 rev32 v18.16b,v18.16b 3258#endif 3259 ld1 {v4.4s},[x0],#16 3260 eor v4.16b, v4.16b, v17.16b 3261#ifndef __AARCH64EB__ 3262 rev32 v4.16b,v4.16b 3263#endif 3264 mov x10,x3 3265 mov w11,#8 3266 mov w12,v4.s[0] 3267 mov w13,v4.s[1] 3268 mov w14,v4.s[2] 3269 mov w15,v4.s[3] 327010: 3271 ldp w7,w8,[x10],8 3272 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 3273 eor w6,w14,w15 3274 eor w9,w7,w13 3275 eor w6,w6,w9 3276 mov v3.s[0],w6 3277 // optimize sbox using AESE instruction 3278 tbl v0.16b, {v3.16b}, v26.16b 3279 ushr v2.16b, v0.16b, 4 3280 and v0.16b, v0.16b, v31.16b 3281 tbl v0.16b, {v28.16b}, v0.16b 3282 tbl v2.16b, {v27.16b}, v2.16b 3283 eor v0.16b, v0.16b, v2.16b 3284 eor v1.16b, v1.16b, v1.16b 3285 aese v0.16b,v1.16b 3286 ushr v2.16b, v0.16b, 4 3287 and v0.16b, v0.16b, v31.16b 3288 tbl v0.16b, {v30.16b}, v0.16b 3289 tbl v2.16b, {v29.16b}, v2.16b 3290 eor v0.16b, v0.16b, v2.16b 3291 3292 mov w7,v0.s[0] 3293 eor w6,w7,w7,ror #32-2 3294 eor w6,w6,w7,ror #32-10 3295 eor w6,w6,w7,ror #32-18 3296 eor w6,w6,w7,ror #32-24 3297 eor w12,w12,w6 3298 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 3299 eor w6,w14,w15 3300 eor w9,w12,w8 3301 eor w6,w6,w9 3302 mov v3.s[0],w6 3303 // optimize sbox using AESE instruction 3304 tbl v0.16b, {v3.16b}, v26.16b 3305 ushr v2.16b, v0.16b, 4 3306 and v0.16b, v0.16b, v31.16b 3307 tbl v0.16b, {v28.16b}, v0.16b 3308 tbl v2.16b, {v27.16b}, v2.16b 3309 eor v0.16b, v0.16b, v2.16b 3310 eor v1.16b, v1.16b, v1.16b 3311 aese v0.16b,v1.16b 3312 ushr v2.16b, v0.16b, 4 3313 and v0.16b, v0.16b, v31.16b 3314 tbl v0.16b, {v30.16b}, v0.16b 3315 tbl v2.16b, {v29.16b}, v2.16b 3316 eor v0.16b, v0.16b, v2.16b 3317 3318 mov w7,v0.s[0] 3319 eor w6,w7,w7,ror #32-2 3320 eor w6,w6,w7,ror #32-10 3321 eor w6,w6,w7,ror #32-18 3322 eor w6,w6,w7,ror #32-24 3323 ldp w7,w8,[x10],8 3324 eor w13,w13,w6 3325 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 3326 eor w6,w12,w13 3327 eor w9,w7,w15 3328 eor w6,w6,w9 3329 mov v3.s[0],w6 3330 // optimize sbox using AESE instruction 3331 tbl v0.16b, {v3.16b}, v26.16b 3332 ushr v2.16b, v0.16b, 4 3333 and v0.16b, v0.16b, v31.16b 3334 tbl v0.16b, {v28.16b}, v0.16b 3335 tbl v2.16b, {v27.16b}, v2.16b 3336 eor v0.16b, v0.16b, v2.16b 3337 eor v1.16b, v1.16b, v1.16b 3338 aese v0.16b,v1.16b 3339 ushr v2.16b, v0.16b, 4 3340 and v0.16b, v0.16b, v31.16b 3341 tbl v0.16b, {v30.16b}, v0.16b 3342 tbl v2.16b, {v29.16b}, v2.16b 3343 eor v0.16b, v0.16b, v2.16b 3344 3345 mov w7,v0.s[0] 3346 eor w6,w7,w7,ror #32-2 3347 eor w6,w6,w7,ror #32-10 3348 eor w6,w6,w7,ror #32-18 3349 eor w6,w6,w7,ror #32-24 3350 eor w14,w14,w6 3351 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 3352 eor w6,w12,w13 3353 eor w9,w14,w8 3354 eor w6,w6,w9 3355 mov v3.s[0],w6 3356 // optimize sbox using AESE instruction 3357 tbl v0.16b, {v3.16b}, v26.16b 3358 ushr v2.16b, v0.16b, 4 3359 and v0.16b, v0.16b, v31.16b 3360 tbl v0.16b, {v28.16b}, v0.16b 3361 tbl v2.16b, {v27.16b}, v2.16b 3362 eor v0.16b, v0.16b, v2.16b 3363 eor v1.16b, v1.16b, v1.16b 3364 aese v0.16b,v1.16b 3365 ushr v2.16b, v0.16b, 4 3366 and v0.16b, v0.16b, v31.16b 3367 tbl v0.16b, {v30.16b}, v0.16b 3368 tbl v2.16b, {v29.16b}, v2.16b 3369 eor v0.16b, v0.16b, v2.16b 3370 3371 mov w7,v0.s[0] 3372 eor w6,w7,w7,ror #32-2 3373 eor w6,w6,w7,ror #32-10 3374 eor w6,w6,w7,ror #32-18 3375 eor w6,w6,w7,ror #32-24 3376 eor w15,w15,w6 3377 subs w11,w11,#1 3378 b.ne 10b 3379 mov v4.s[0],w15 3380 mov v4.s[1],w14 3381 mov v4.s[2],w13 3382 mov v4.s[3],w12 3383#ifndef __AARCH64EB__ 3384 rev32 v4.16b,v4.16b 3385#endif 3386 eor v4.16b, v4.16b, v17.16b 3387 st1 {v4.4s},[x1],#16 3388 3389 sub x26,x1,16 3390.loop_gb: 3391 subs x29,x29,1 3392 ldrb w7,[x26,x29] 3393 ldrb w8,[x0,x29] 3394 strb w8,[x26,x29] 3395 strb w7,[x1,x29] 3396 b.gt .loop_gb 3397 ld1 {v4.4s}, [x26] 3398 eor v4.16b, v4.16b, v18.16b 3399#ifndef __AARCH64EB__ 3400 rev32 v4.16b,v4.16b 3401#endif 3402 mov x10,x3 3403 mov w11,#8 3404 mov w12,v4.s[0] 3405 mov w13,v4.s[1] 3406 mov w14,v4.s[2] 3407 mov w15,v4.s[3] 340810: 3409 ldp w7,w8,[x10],8 3410 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 3411 eor w6,w14,w15 3412 eor w9,w7,w13 3413 eor w6,w6,w9 3414 mov v3.s[0],w6 3415 // optimize sbox using AESE instruction 3416 tbl v0.16b, {v3.16b}, v26.16b 3417 ushr v2.16b, v0.16b, 4 3418 and v0.16b, v0.16b, v31.16b 3419 tbl v0.16b, {v28.16b}, v0.16b 3420 tbl v2.16b, {v27.16b}, v2.16b 3421 eor v0.16b, v0.16b, v2.16b 3422 eor v1.16b, v1.16b, v1.16b 3423 aese v0.16b,v1.16b 3424 ushr v2.16b, v0.16b, 4 3425 and v0.16b, v0.16b, v31.16b 3426 tbl v0.16b, {v30.16b}, v0.16b 3427 tbl v2.16b, {v29.16b}, v2.16b 3428 eor v0.16b, v0.16b, v2.16b 3429 3430 mov w7,v0.s[0] 3431 eor w6,w7,w7,ror #32-2 3432 eor w6,w6,w7,ror #32-10 3433 eor w6,w6,w7,ror #32-18 3434 eor w6,w6,w7,ror #32-24 3435 eor w12,w12,w6 3436 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 3437 eor w6,w14,w15 3438 eor w9,w12,w8 3439 eor w6,w6,w9 3440 mov v3.s[0],w6 3441 // optimize sbox using AESE instruction 3442 tbl v0.16b, {v3.16b}, v26.16b 3443 ushr v2.16b, v0.16b, 4 3444 and v0.16b, v0.16b, v31.16b 3445 tbl v0.16b, {v28.16b}, v0.16b 3446 tbl v2.16b, {v27.16b}, v2.16b 3447 eor v0.16b, v0.16b, v2.16b 3448 eor v1.16b, v1.16b, v1.16b 3449 aese v0.16b,v1.16b 3450 ushr v2.16b, v0.16b, 4 3451 and v0.16b, v0.16b, v31.16b 3452 tbl v0.16b, {v30.16b}, v0.16b 3453 tbl v2.16b, {v29.16b}, v2.16b 3454 eor v0.16b, v0.16b, v2.16b 3455 3456 mov w7,v0.s[0] 3457 eor w6,w7,w7,ror #32-2 3458 eor w6,w6,w7,ror #32-10 3459 eor w6,w6,w7,ror #32-18 3460 eor w6,w6,w7,ror #32-24 3461 ldp w7,w8,[x10],8 3462 eor w13,w13,w6 3463 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 3464 eor w6,w12,w13 3465 eor w9,w7,w15 3466 eor w6,w6,w9 3467 mov v3.s[0],w6 3468 // optimize sbox using AESE instruction 3469 tbl v0.16b, {v3.16b}, v26.16b 3470 ushr v2.16b, v0.16b, 4 3471 and v0.16b, v0.16b, v31.16b 3472 tbl v0.16b, {v28.16b}, v0.16b 3473 tbl v2.16b, {v27.16b}, v2.16b 3474 eor v0.16b, v0.16b, v2.16b 3475 eor v1.16b, v1.16b, v1.16b 3476 aese v0.16b,v1.16b 3477 ushr v2.16b, v0.16b, 4 3478 and v0.16b, v0.16b, v31.16b 3479 tbl v0.16b, {v30.16b}, v0.16b 3480 tbl v2.16b, {v29.16b}, v2.16b 3481 eor v0.16b, v0.16b, v2.16b 3482 3483 mov w7,v0.s[0] 3484 eor w6,w7,w7,ror #32-2 3485 eor w6,w6,w7,ror #32-10 3486 eor w6,w6,w7,ror #32-18 3487 eor w6,w6,w7,ror #32-24 3488 eor w14,w14,w6 3489 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 3490 eor w6,w12,w13 3491 eor w9,w14,w8 3492 eor w6,w6,w9 3493 mov v3.s[0],w6 3494 // optimize sbox using AESE instruction 3495 tbl v0.16b, {v3.16b}, v26.16b 3496 ushr v2.16b, v0.16b, 4 3497 and v0.16b, v0.16b, v31.16b 3498 tbl v0.16b, {v28.16b}, v0.16b 3499 tbl v2.16b, {v27.16b}, v2.16b 3500 eor v0.16b, v0.16b, v2.16b 3501 eor v1.16b, v1.16b, v1.16b 3502 aese v0.16b,v1.16b 3503 ushr v2.16b, v0.16b, 4 3504 and v0.16b, v0.16b, v31.16b 3505 tbl v0.16b, {v30.16b}, v0.16b 3506 tbl v2.16b, {v29.16b}, v2.16b 3507 eor v0.16b, v0.16b, v2.16b 3508 3509 mov w7,v0.s[0] 3510 eor w6,w7,w7,ror #32-2 3511 eor w6,w6,w7,ror #32-10 3512 eor w6,w6,w7,ror #32-18 3513 eor w6,w6,w7,ror #32-24 3514 eor w15,w15,w6 3515 subs w11,w11,#1 3516 b.ne 10b 3517 mov v4.s[0],w15 3518 mov v4.s[1],w14 3519 mov v4.s[2],w13 3520 mov v4.s[3],w12 3521#ifndef __AARCH64EB__ 3522 rev32 v4.16b,v4.16b 3523#endif 3524 eor v4.16b, v4.16b, v18.16b 3525 st1 {v4.4s}, [x26] 3526.return_gb: 3527 ldp d14, d15, [sp], #0x10 3528 ldp d12, d13, [sp], #0x10 3529 ldp d10, d11, [sp], #0x10 3530 ldp d8, d9, [sp], #0x10 3531 ldp x29, x30, [sp], #0x10 3532 ldp x27, x28, [sp], #0x10 3533 ldp x25, x26, [sp], #0x10 3534 ldp x23, x24, [sp], #0x10 3535 ldp x21, x22, [sp], #0x10 3536 ldp x19, x20, [sp], #0x10 3537 ldp x17, x18, [sp], #0x10 3538 ldp x15, x16, [sp], #0x10 3539 AARCH64_VALIDATE_LINK_REGISTER 3540 ret 3541.size vpsm4_ex_xts_encrypt_gb,.-vpsm4_ex_xts_encrypt_gb 3542.globl vpsm4_ex_xts_encrypt 3543.type vpsm4_ex_xts_encrypt,%function 3544.align 5 3545vpsm4_ex_xts_encrypt: 3546 AARCH64_SIGN_LINK_REGISTER 3547 stp x15, x16, [sp, #-0x10]! 3548 stp x17, x18, [sp, #-0x10]! 3549 stp x19, x20, [sp, #-0x10]! 3550 stp x21, x22, [sp, #-0x10]! 3551 stp x23, x24, [sp, #-0x10]! 3552 stp x25, x26, [sp, #-0x10]! 3553 stp x27, x28, [sp, #-0x10]! 3554 stp x29, x30, [sp, #-0x10]! 3555 stp d8, d9, [sp, #-0x10]! 3556 stp d10, d11, [sp, #-0x10]! 3557 stp d12, d13, [sp, #-0x10]! 3558 stp d14, d15, [sp, #-0x10]! 3559 mov x26,x3 3560 mov x27,x4 3561 mov w28,w6 3562 ld1 {v16.4s}, [x5] 3563 mov x3,x27 3564 adrp x9, .Lsbox_magic 3565 ldr q26, [x9, #:lo12:.Lsbox_magic] 3566 ldr q27, [x9, #:lo12:.Lsbox_magic+16] 3567 ldr q28, [x9, #:lo12:.Lsbox_magic+32] 3568 ldr q29, [x9, #:lo12:.Lsbox_magic+48] 3569 ldr q30, [x9, #:lo12:.Lsbox_magic+64] 3570 ldr q31, [x9, #:lo12:.Lsbox_magic+80] 3571#ifndef __AARCH64EB__ 3572 rev32 v16.16b,v16.16b 3573#endif 3574 mov x10,x3 3575 mov w11,#8 3576 mov w12,v16.s[0] 3577 mov w13,v16.s[1] 3578 mov w14,v16.s[2] 3579 mov w15,v16.s[3] 358010: 3581 ldp w7,w8,[x10],8 3582 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 3583 eor w6,w14,w15 3584 eor w9,w7,w13 3585 eor w6,w6,w9 3586 mov v3.s[0],w6 3587 // optimize sbox using AESE instruction 3588 tbl v0.16b, {v3.16b}, v26.16b 3589 ushr v2.16b, v0.16b, 4 3590 and v0.16b, v0.16b, v31.16b 3591 tbl v0.16b, {v28.16b}, v0.16b 3592 tbl v2.16b, {v27.16b}, v2.16b 3593 eor v0.16b, v0.16b, v2.16b 3594 eor v1.16b, v1.16b, v1.16b 3595 aese v0.16b,v1.16b 3596 ushr v2.16b, v0.16b, 4 3597 and v0.16b, v0.16b, v31.16b 3598 tbl v0.16b, {v30.16b}, v0.16b 3599 tbl v2.16b, {v29.16b}, v2.16b 3600 eor v0.16b, v0.16b, v2.16b 3601 3602 mov w7,v0.s[0] 3603 eor w6,w7,w7,ror #32-2 3604 eor w6,w6,w7,ror #32-10 3605 eor w6,w6,w7,ror #32-18 3606 eor w6,w6,w7,ror #32-24 3607 eor w12,w12,w6 3608 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 3609 eor w6,w14,w15 3610 eor w9,w12,w8 3611 eor w6,w6,w9 3612 mov v3.s[0],w6 3613 // optimize sbox using AESE instruction 3614 tbl v0.16b, {v3.16b}, v26.16b 3615 ushr v2.16b, v0.16b, 4 3616 and v0.16b, v0.16b, v31.16b 3617 tbl v0.16b, {v28.16b}, v0.16b 3618 tbl v2.16b, {v27.16b}, v2.16b 3619 eor v0.16b, v0.16b, v2.16b 3620 eor v1.16b, v1.16b, v1.16b 3621 aese v0.16b,v1.16b 3622 ushr v2.16b, v0.16b, 4 3623 and v0.16b, v0.16b, v31.16b 3624 tbl v0.16b, {v30.16b}, v0.16b 3625 tbl v2.16b, {v29.16b}, v2.16b 3626 eor v0.16b, v0.16b, v2.16b 3627 3628 mov w7,v0.s[0] 3629 eor w6,w7,w7,ror #32-2 3630 eor w6,w6,w7,ror #32-10 3631 eor w6,w6,w7,ror #32-18 3632 eor w6,w6,w7,ror #32-24 3633 ldp w7,w8,[x10],8 3634 eor w13,w13,w6 3635 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 3636 eor w6,w12,w13 3637 eor w9,w7,w15 3638 eor w6,w6,w9 3639 mov v3.s[0],w6 3640 // optimize sbox using AESE instruction 3641 tbl v0.16b, {v3.16b}, v26.16b 3642 ushr v2.16b, v0.16b, 4 3643 and v0.16b, v0.16b, v31.16b 3644 tbl v0.16b, {v28.16b}, v0.16b 3645 tbl v2.16b, {v27.16b}, v2.16b 3646 eor v0.16b, v0.16b, v2.16b 3647 eor v1.16b, v1.16b, v1.16b 3648 aese v0.16b,v1.16b 3649 ushr v2.16b, v0.16b, 4 3650 and v0.16b, v0.16b, v31.16b 3651 tbl v0.16b, {v30.16b}, v0.16b 3652 tbl v2.16b, {v29.16b}, v2.16b 3653 eor v0.16b, v0.16b, v2.16b 3654 3655 mov w7,v0.s[0] 3656 eor w6,w7,w7,ror #32-2 3657 eor w6,w6,w7,ror #32-10 3658 eor w6,w6,w7,ror #32-18 3659 eor w6,w6,w7,ror #32-24 3660 eor w14,w14,w6 3661 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 3662 eor w6,w12,w13 3663 eor w9,w14,w8 3664 eor w6,w6,w9 3665 mov v3.s[0],w6 3666 // optimize sbox using AESE instruction 3667 tbl v0.16b, {v3.16b}, v26.16b 3668 ushr v2.16b, v0.16b, 4 3669 and v0.16b, v0.16b, v31.16b 3670 tbl v0.16b, {v28.16b}, v0.16b 3671 tbl v2.16b, {v27.16b}, v2.16b 3672 eor v0.16b, v0.16b, v2.16b 3673 eor v1.16b, v1.16b, v1.16b 3674 aese v0.16b,v1.16b 3675 ushr v2.16b, v0.16b, 4 3676 and v0.16b, v0.16b, v31.16b 3677 tbl v0.16b, {v30.16b}, v0.16b 3678 tbl v2.16b, {v29.16b}, v2.16b 3679 eor v0.16b, v0.16b, v2.16b 3680 3681 mov w7,v0.s[0] 3682 eor w6,w7,w7,ror #32-2 3683 eor w6,w6,w7,ror #32-10 3684 eor w6,w6,w7,ror #32-18 3685 eor w6,w6,w7,ror #32-24 3686 eor w15,w15,w6 3687 subs w11,w11,#1 3688 b.ne 10b 3689 mov v16.s[0],w15 3690 mov v16.s[1],w14 3691 mov v16.s[2],w13 3692 mov v16.s[3],w12 3693#ifndef __AARCH64EB__ 3694 rev32 v16.16b,v16.16b 3695#endif 3696 mov x3,x26 3697 and x29,x2,#0x0F 3698 // convert length into blocks 3699 lsr x2,x2,4 3700 cmp x2,#1 3701 b.lt .return 3702 3703 cmp x29,0 3704 // If the encryption/decryption Length is N times of 16, 3705 // the all blocks are encrypted/decrypted in .xts_encrypt_blocks 3706 b.eq .xts_encrypt_blocks 3707 3708 // If the encryption/decryption length is not N times of 16, 3709 // the last two blocks are encrypted/decrypted in .last_2blks_tweak or .only_2blks_tweak 3710 // the other blocks are encrypted/decrypted in .xts_encrypt_blocks 3711 subs x2,x2,#1 3712 b.eq .only_2blks_tweak 3713.xts_encrypt_blocks: 3714#ifdef __AARCH64EB__ 3715 rev32 v16.16b,v16.16b 3716#endif 3717 mov x12,v16.d[0] 3718 mov x13,v16.d[1] 3719 mov w7,0x87 3720 extr x9,x13,x13,#32 3721 extr x15,x13,x12,#63 3722 and w8,w7,w9,asr#31 3723 eor x14,x8,x12,lsl#1 3724 mov w7,0x87 3725 extr x9,x15,x15,#32 3726 extr x17,x15,x14,#63 3727 and w8,w7,w9,asr#31 3728 eor x16,x8,x14,lsl#1 3729 mov w7,0x87 3730 extr x9,x17,x17,#32 3731 extr x19,x17,x16,#63 3732 and w8,w7,w9,asr#31 3733 eor x18,x8,x16,lsl#1 3734 mov w7,0x87 3735 extr x9,x19,x19,#32 3736 extr x21,x19,x18,#63 3737 and w8,w7,w9,asr#31 3738 eor x20,x8,x18,lsl#1 3739 mov w7,0x87 3740 extr x9,x21,x21,#32 3741 extr x23,x21,x20,#63 3742 and w8,w7,w9,asr#31 3743 eor x22,x8,x20,lsl#1 3744 mov w7,0x87 3745 extr x9,x23,x23,#32 3746 extr x25,x23,x22,#63 3747 and w8,w7,w9,asr#31 3748 eor x24,x8,x22,lsl#1 3749 mov w7,0x87 3750 extr x9,x25,x25,#32 3751 extr x27,x25,x24,#63 3752 and w8,w7,w9,asr#31 3753 eor x26,x8,x24,lsl#1 3754.Lxts_8_blocks_process: 3755 cmp x2,#8 3756 mov v16.d[0],x12 3757 mov v16.d[1],x13 3758#ifdef __AARCH64EB__ 3759 rev32 v16.16b,v16.16b 3760#endif 3761 mov w7,0x87 3762 extr x9,x27,x27,#32 3763 extr x13,x27,x26,#63 3764 and w8,w7,w9,asr#31 3765 eor x12,x8,x26,lsl#1 3766 mov v17.d[0],x14 3767 mov v17.d[1],x15 3768#ifdef __AARCH64EB__ 3769 rev32 v17.16b,v17.16b 3770#endif 3771 mov w7,0x87 3772 extr x9,x13,x13,#32 3773 extr x15,x13,x12,#63 3774 and w8,w7,w9,asr#31 3775 eor x14,x8,x12,lsl#1 3776 mov v18.d[0],x16 3777 mov v18.d[1],x17 3778#ifdef __AARCH64EB__ 3779 rev32 v18.16b,v18.16b 3780#endif 3781 mov w7,0x87 3782 extr x9,x15,x15,#32 3783 extr x17,x15,x14,#63 3784 and w8,w7,w9,asr#31 3785 eor x16,x8,x14,lsl#1 3786 mov v19.d[0],x18 3787 mov v19.d[1],x19 3788#ifdef __AARCH64EB__ 3789 rev32 v19.16b,v19.16b 3790#endif 3791 mov w7,0x87 3792 extr x9,x17,x17,#32 3793 extr x19,x17,x16,#63 3794 and w8,w7,w9,asr#31 3795 eor x18,x8,x16,lsl#1 3796 mov v20.d[0],x20 3797 mov v20.d[1],x21 3798#ifdef __AARCH64EB__ 3799 rev32 v20.16b,v20.16b 3800#endif 3801 mov w7,0x87 3802 extr x9,x19,x19,#32 3803 extr x21,x19,x18,#63 3804 and w8,w7,w9,asr#31 3805 eor x20,x8,x18,lsl#1 3806 mov v21.d[0],x22 3807 mov v21.d[1],x23 3808#ifdef __AARCH64EB__ 3809 rev32 v21.16b,v21.16b 3810#endif 3811 mov w7,0x87 3812 extr x9,x21,x21,#32 3813 extr x23,x21,x20,#63 3814 and w8,w7,w9,asr#31 3815 eor x22,x8,x20,lsl#1 3816 mov v22.d[0],x24 3817 mov v22.d[1],x25 3818#ifdef __AARCH64EB__ 3819 rev32 v22.16b,v22.16b 3820#endif 3821 mov w7,0x87 3822 extr x9,x23,x23,#32 3823 extr x25,x23,x22,#63 3824 and w8,w7,w9,asr#31 3825 eor x24,x8,x22,lsl#1 3826 mov v23.d[0],x26 3827 mov v23.d[1],x27 3828#ifdef __AARCH64EB__ 3829 rev32 v23.16b,v23.16b 3830#endif 3831 mov w7,0x87 3832 extr x9,x25,x25,#32 3833 extr x27,x25,x24,#63 3834 and w8,w7,w9,asr#31 3835 eor x26,x8,x24,lsl#1 3836 b.lt .Lxts_4_blocks_process 3837 ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 3838 eor v4.16b, v4.16b, v16.16b 3839 eor v5.16b, v5.16b, v17.16b 3840 eor v6.16b, v6.16b, v18.16b 3841 eor v7.16b, v7.16b, v19.16b 3842 ld1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64 3843 eor v8.16b, v8.16b, v20.16b 3844 eor v9.16b, v9.16b, v21.16b 3845 eor v10.16b, v10.16b, v22.16b 3846 eor v11.16b, v11.16b, v23.16b 3847#ifndef __AARCH64EB__ 3848 rev32 v4.16b,v4.16b 3849#endif 3850#ifndef __AARCH64EB__ 3851 rev32 v5.16b,v5.16b 3852#endif 3853#ifndef __AARCH64EB__ 3854 rev32 v6.16b,v6.16b 3855#endif 3856#ifndef __AARCH64EB__ 3857 rev32 v7.16b,v7.16b 3858#endif 3859#ifndef __AARCH64EB__ 3860 rev32 v8.16b,v8.16b 3861#endif 3862#ifndef __AARCH64EB__ 3863 rev32 v9.16b,v9.16b 3864#endif 3865#ifndef __AARCH64EB__ 3866 rev32 v10.16b,v10.16b 3867#endif 3868#ifndef __AARCH64EB__ 3869 rev32 v11.16b,v11.16b 3870#endif 3871 zip1 v0.4s,v4.4s,v5.4s 3872 zip2 v1.4s,v4.4s,v5.4s 3873 zip1 v2.4s,v6.4s,v7.4s 3874 zip2 v3.4s,v6.4s,v7.4s 3875 zip1 v4.2d,v0.2d,v2.2d 3876 zip2 v5.2d,v0.2d,v2.2d 3877 zip1 v6.2d,v1.2d,v3.2d 3878 zip2 v7.2d,v1.2d,v3.2d 3879 zip1 v0.4s,v8.4s,v9.4s 3880 zip2 v1.4s,v8.4s,v9.4s 3881 zip1 v2.4s,v10.4s,v11.4s 3882 zip2 v3.4s,v10.4s,v11.4s 3883 zip1 v8.2d,v0.2d,v2.2d 3884 zip2 v9.2d,v0.2d,v2.2d 3885 zip1 v10.2d,v1.2d,v3.2d 3886 zip2 v11.2d,v1.2d,v3.2d 3887 bl _vpsm4_ex_enc_8blks 3888 zip1 v8.4s,v0.4s,v1.4s 3889 zip2 v9.4s,v0.4s,v1.4s 3890 zip1 v10.4s,v2.4s,v3.4s 3891 zip2 v11.4s,v2.4s,v3.4s 3892 zip1 v0.2d,v8.2d,v10.2d 3893 zip2 v1.2d,v8.2d,v10.2d 3894 zip1 v2.2d,v9.2d,v11.2d 3895 zip2 v3.2d,v9.2d,v11.2d 3896 zip1 v8.4s,v4.4s,v5.4s 3897 zip2 v9.4s,v4.4s,v5.4s 3898 zip1 v10.4s,v6.4s,v7.4s 3899 zip2 v11.4s,v6.4s,v7.4s 3900 zip1 v4.2d,v8.2d,v10.2d 3901 zip2 v5.2d,v8.2d,v10.2d 3902 zip1 v6.2d,v9.2d,v11.2d 3903 zip2 v7.2d,v9.2d,v11.2d 3904 eor v0.16b, v0.16b, v16.16b 3905 eor v1.16b, v1.16b, v17.16b 3906 eor v2.16b, v2.16b, v18.16b 3907 eor v3.16b, v3.16b, v19.16b 3908 eor v4.16b, v4.16b, v20.16b 3909 eor v5.16b, v5.16b, v21.16b 3910 eor v6.16b, v6.16b, v22.16b 3911 eor v7.16b, v7.16b, v23.16b 3912 3913 // save the last tweak 3914 mov v25.16b,v23.16b 3915 st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 3916 st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64 3917 subs x2,x2,#8 3918 b.gt .Lxts_8_blocks_process 3919 b 100f 3920.Lxts_4_blocks_process: 3921 cmp x2,#4 3922 b.lt 1f 3923 ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64 3924 eor v4.16b, v4.16b, v16.16b 3925 eor v5.16b, v5.16b, v17.16b 3926 eor v6.16b, v6.16b, v18.16b 3927 eor v7.16b, v7.16b, v19.16b 3928#ifndef __AARCH64EB__ 3929 rev32 v4.16b,v4.16b 3930#endif 3931#ifndef __AARCH64EB__ 3932 rev32 v5.16b,v5.16b 3933#endif 3934#ifndef __AARCH64EB__ 3935 rev32 v6.16b,v6.16b 3936#endif 3937#ifndef __AARCH64EB__ 3938 rev32 v7.16b,v7.16b 3939#endif 3940 zip1 v0.4s,v4.4s,v5.4s 3941 zip2 v1.4s,v4.4s,v5.4s 3942 zip1 v2.4s,v6.4s,v7.4s 3943 zip2 v3.4s,v6.4s,v7.4s 3944 zip1 v4.2d,v0.2d,v2.2d 3945 zip2 v5.2d,v0.2d,v2.2d 3946 zip1 v6.2d,v1.2d,v3.2d 3947 zip2 v7.2d,v1.2d,v3.2d 3948 bl _vpsm4_ex_enc_4blks 3949 zip1 v4.4s,v0.4s,v1.4s 3950 zip2 v5.4s,v0.4s,v1.4s 3951 zip1 v6.4s,v2.4s,v3.4s 3952 zip2 v7.4s,v2.4s,v3.4s 3953 zip1 v0.2d,v4.2d,v6.2d 3954 zip2 v1.2d,v4.2d,v6.2d 3955 zip1 v2.2d,v5.2d,v7.2d 3956 zip2 v3.2d,v5.2d,v7.2d 3957 eor v0.16b, v0.16b, v16.16b 3958 eor v1.16b, v1.16b, v17.16b 3959 eor v2.16b, v2.16b, v18.16b 3960 eor v3.16b, v3.16b, v19.16b 3961 st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64 3962 sub x2,x2,#4 3963 mov v16.16b,v20.16b 3964 mov v17.16b,v21.16b 3965 mov v18.16b,v22.16b 3966 // save the last tweak 3967 mov v25.16b,v19.16b 39681: 3969 // process last block 3970 cmp x2,#1 3971 b.lt 100f 3972 b.gt 1f 3973 ld1 {v4.4s},[x0],#16 3974 eor v4.16b, v4.16b, v16.16b 3975#ifndef __AARCH64EB__ 3976 rev32 v4.16b,v4.16b 3977#endif 3978 mov x10,x3 3979 mov w11,#8 3980 mov w12,v4.s[0] 3981 mov w13,v4.s[1] 3982 mov w14,v4.s[2] 3983 mov w15,v4.s[3] 398410: 3985 ldp w7,w8,[x10],8 3986 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 3987 eor w6,w14,w15 3988 eor w9,w7,w13 3989 eor w6,w6,w9 3990 mov v3.s[0],w6 3991 // optimize sbox using AESE instruction 3992 tbl v0.16b, {v3.16b}, v26.16b 3993 ushr v2.16b, v0.16b, 4 3994 and v0.16b, v0.16b, v31.16b 3995 tbl v0.16b, {v28.16b}, v0.16b 3996 tbl v2.16b, {v27.16b}, v2.16b 3997 eor v0.16b, v0.16b, v2.16b 3998 eor v1.16b, v1.16b, v1.16b 3999 aese v0.16b,v1.16b 4000 ushr v2.16b, v0.16b, 4 4001 and v0.16b, v0.16b, v31.16b 4002 tbl v0.16b, {v30.16b}, v0.16b 4003 tbl v2.16b, {v29.16b}, v2.16b 4004 eor v0.16b, v0.16b, v2.16b 4005 4006 mov w7,v0.s[0] 4007 eor w6,w7,w7,ror #32-2 4008 eor w6,w6,w7,ror #32-10 4009 eor w6,w6,w7,ror #32-18 4010 eor w6,w6,w7,ror #32-24 4011 eor w12,w12,w6 4012 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 4013 eor w6,w14,w15 4014 eor w9,w12,w8 4015 eor w6,w6,w9 4016 mov v3.s[0],w6 4017 // optimize sbox using AESE instruction 4018 tbl v0.16b, {v3.16b}, v26.16b 4019 ushr v2.16b, v0.16b, 4 4020 and v0.16b, v0.16b, v31.16b 4021 tbl v0.16b, {v28.16b}, v0.16b 4022 tbl v2.16b, {v27.16b}, v2.16b 4023 eor v0.16b, v0.16b, v2.16b 4024 eor v1.16b, v1.16b, v1.16b 4025 aese v0.16b,v1.16b 4026 ushr v2.16b, v0.16b, 4 4027 and v0.16b, v0.16b, v31.16b 4028 tbl v0.16b, {v30.16b}, v0.16b 4029 tbl v2.16b, {v29.16b}, v2.16b 4030 eor v0.16b, v0.16b, v2.16b 4031 4032 mov w7,v0.s[0] 4033 eor w6,w7,w7,ror #32-2 4034 eor w6,w6,w7,ror #32-10 4035 eor w6,w6,w7,ror #32-18 4036 eor w6,w6,w7,ror #32-24 4037 ldp w7,w8,[x10],8 4038 eor w13,w13,w6 4039 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 4040 eor w6,w12,w13 4041 eor w9,w7,w15 4042 eor w6,w6,w9 4043 mov v3.s[0],w6 4044 // optimize sbox using AESE instruction 4045 tbl v0.16b, {v3.16b}, v26.16b 4046 ushr v2.16b, v0.16b, 4 4047 and v0.16b, v0.16b, v31.16b 4048 tbl v0.16b, {v28.16b}, v0.16b 4049 tbl v2.16b, {v27.16b}, v2.16b 4050 eor v0.16b, v0.16b, v2.16b 4051 eor v1.16b, v1.16b, v1.16b 4052 aese v0.16b,v1.16b 4053 ushr v2.16b, v0.16b, 4 4054 and v0.16b, v0.16b, v31.16b 4055 tbl v0.16b, {v30.16b}, v0.16b 4056 tbl v2.16b, {v29.16b}, v2.16b 4057 eor v0.16b, v0.16b, v2.16b 4058 4059 mov w7,v0.s[0] 4060 eor w6,w7,w7,ror #32-2 4061 eor w6,w6,w7,ror #32-10 4062 eor w6,w6,w7,ror #32-18 4063 eor w6,w6,w7,ror #32-24 4064 eor w14,w14,w6 4065 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 4066 eor w6,w12,w13 4067 eor w9,w14,w8 4068 eor w6,w6,w9 4069 mov v3.s[0],w6 4070 // optimize sbox using AESE instruction 4071 tbl v0.16b, {v3.16b}, v26.16b 4072 ushr v2.16b, v0.16b, 4 4073 and v0.16b, v0.16b, v31.16b 4074 tbl v0.16b, {v28.16b}, v0.16b 4075 tbl v2.16b, {v27.16b}, v2.16b 4076 eor v0.16b, v0.16b, v2.16b 4077 eor v1.16b, v1.16b, v1.16b 4078 aese v0.16b,v1.16b 4079 ushr v2.16b, v0.16b, 4 4080 and v0.16b, v0.16b, v31.16b 4081 tbl v0.16b, {v30.16b}, v0.16b 4082 tbl v2.16b, {v29.16b}, v2.16b 4083 eor v0.16b, v0.16b, v2.16b 4084 4085 mov w7,v0.s[0] 4086 eor w6,w7,w7,ror #32-2 4087 eor w6,w6,w7,ror #32-10 4088 eor w6,w6,w7,ror #32-18 4089 eor w6,w6,w7,ror #32-24 4090 eor w15,w15,w6 4091 subs w11,w11,#1 4092 b.ne 10b 4093 mov v4.s[0],w15 4094 mov v4.s[1],w14 4095 mov v4.s[2],w13 4096 mov v4.s[3],w12 4097#ifndef __AARCH64EB__ 4098 rev32 v4.16b,v4.16b 4099#endif 4100 eor v4.16b, v4.16b, v16.16b 4101 st1 {v4.4s},[x1],#16 4102 // save the last tweak 4103 mov v25.16b,v16.16b 4104 b 100f 41051: // process last 2 blocks 4106 cmp x2,#2 4107 b.gt 1f 4108 ld1 {v4.4s,v5.4s},[x0],#32 4109 eor v4.16b, v4.16b, v16.16b 4110 eor v5.16b, v5.16b, v17.16b 4111#ifndef __AARCH64EB__ 4112 rev32 v4.16b,v4.16b 4113#endif 4114#ifndef __AARCH64EB__ 4115 rev32 v5.16b,v5.16b 4116#endif 4117 zip1 v0.4s,v4.4s,v5.4s 4118 zip2 v1.4s,v4.4s,v5.4s 4119 zip1 v2.4s,v6.4s,v7.4s 4120 zip2 v3.4s,v6.4s,v7.4s 4121 zip1 v4.2d,v0.2d,v2.2d 4122 zip2 v5.2d,v0.2d,v2.2d 4123 zip1 v6.2d,v1.2d,v3.2d 4124 zip2 v7.2d,v1.2d,v3.2d 4125 bl _vpsm4_ex_enc_4blks 4126 zip1 v4.4s,v0.4s,v1.4s 4127 zip2 v5.4s,v0.4s,v1.4s 4128 zip1 v6.4s,v2.4s,v3.4s 4129 zip2 v7.4s,v2.4s,v3.4s 4130 zip1 v0.2d,v4.2d,v6.2d 4131 zip2 v1.2d,v4.2d,v6.2d 4132 zip1 v2.2d,v5.2d,v7.2d 4133 zip2 v3.2d,v5.2d,v7.2d 4134 eor v0.16b, v0.16b, v16.16b 4135 eor v1.16b, v1.16b, v17.16b 4136 st1 {v0.4s,v1.4s},[x1],#32 4137 // save the last tweak 4138 mov v25.16b,v17.16b 4139 b 100f 41401: // process last 3 blocks 4141 ld1 {v4.4s,v5.4s,v6.4s},[x0],#48 4142 eor v4.16b, v4.16b, v16.16b 4143 eor v5.16b, v5.16b, v17.16b 4144 eor v6.16b, v6.16b, v18.16b 4145#ifndef __AARCH64EB__ 4146 rev32 v4.16b,v4.16b 4147#endif 4148#ifndef __AARCH64EB__ 4149 rev32 v5.16b,v5.16b 4150#endif 4151#ifndef __AARCH64EB__ 4152 rev32 v6.16b,v6.16b 4153#endif 4154 zip1 v0.4s,v4.4s,v5.4s 4155 zip2 v1.4s,v4.4s,v5.4s 4156 zip1 v2.4s,v6.4s,v7.4s 4157 zip2 v3.4s,v6.4s,v7.4s 4158 zip1 v4.2d,v0.2d,v2.2d 4159 zip2 v5.2d,v0.2d,v2.2d 4160 zip1 v6.2d,v1.2d,v3.2d 4161 zip2 v7.2d,v1.2d,v3.2d 4162 bl _vpsm4_ex_enc_4blks 4163 zip1 v4.4s,v0.4s,v1.4s 4164 zip2 v5.4s,v0.4s,v1.4s 4165 zip1 v6.4s,v2.4s,v3.4s 4166 zip2 v7.4s,v2.4s,v3.4s 4167 zip1 v0.2d,v4.2d,v6.2d 4168 zip2 v1.2d,v4.2d,v6.2d 4169 zip1 v2.2d,v5.2d,v7.2d 4170 zip2 v3.2d,v5.2d,v7.2d 4171 eor v0.16b, v0.16b, v16.16b 4172 eor v1.16b, v1.16b, v17.16b 4173 eor v2.16b, v2.16b, v18.16b 4174 st1 {v0.4s,v1.4s,v2.4s},[x1],#48 4175 // save the last tweak 4176 mov v25.16b,v18.16b 4177100: 4178 cmp x29,0 4179 b.eq .return 4180 4181// This branch calculates the last two tweaks, 4182// while the encryption/decryption length is larger than 32 4183.last_2blks_tweak: 4184#ifdef __AARCH64EB__ 4185 rev32 v25.16b,v25.16b 4186#endif 4187 mov v2.16b,v25.16b 4188 adrp x9, .Lxts_magic 4189 ldr q0, [x9, #:lo12:.Lxts_magic] 4190 shl v17.16b, v2.16b, #1 4191 ext v1.16b, v2.16b, v2.16b,#15 4192 ushr v1.16b, v1.16b, #7 4193 mul v1.16b, v1.16b, v0.16b 4194 eor v17.16b, v17.16b, v1.16b 4195 mov v2.16b,v17.16b 4196 adrp x9, .Lxts_magic 4197 ldr q0, [x9, #:lo12:.Lxts_magic] 4198 shl v18.16b, v2.16b, #1 4199 ext v1.16b, v2.16b, v2.16b,#15 4200 ushr v1.16b, v1.16b, #7 4201 mul v1.16b, v1.16b, v0.16b 4202 eor v18.16b, v18.16b, v1.16b 4203 b .check_dec 4204 4205 4206// This branch calculates the last two tweaks, 4207// while the encryption/decryption length is equal to 32, who only need two tweaks 4208.only_2blks_tweak: 4209 mov v17.16b,v16.16b 4210#ifdef __AARCH64EB__ 4211 rev32 v17.16b,v17.16b 4212#endif 4213 mov v2.16b,v17.16b 4214 adrp x9, .Lxts_magic 4215 ldr q0, [x9, #:lo12:.Lxts_magic] 4216 shl v18.16b, v2.16b, #1 4217 ext v1.16b, v2.16b, v2.16b,#15 4218 ushr v1.16b, v1.16b, #7 4219 mul v1.16b, v1.16b, v0.16b 4220 eor v18.16b, v18.16b, v1.16b 4221 b .check_dec 4222 4223 4224// Determine whether encryption or decryption is required. 4225// The last two tweaks need to be swapped for decryption. 4226.check_dec: 4227 // encryption:1 decryption:0 4228 cmp w28,1 4229 b.eq .process_last_2blks 4230 mov v0.16B,v17.16b 4231 mov v17.16B,v18.16b 4232 mov v18.16B,v0.16b 4233 4234.process_last_2blks: 4235#ifdef __AARCH64EB__ 4236 rev32 v17.16b,v17.16b 4237#endif 4238#ifdef __AARCH64EB__ 4239 rev32 v18.16b,v18.16b 4240#endif 4241 ld1 {v4.4s},[x0],#16 4242 eor v4.16b, v4.16b, v17.16b 4243#ifndef __AARCH64EB__ 4244 rev32 v4.16b,v4.16b 4245#endif 4246 mov x10,x3 4247 mov w11,#8 4248 mov w12,v4.s[0] 4249 mov w13,v4.s[1] 4250 mov w14,v4.s[2] 4251 mov w15,v4.s[3] 425210: 4253 ldp w7,w8,[x10],8 4254 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 4255 eor w6,w14,w15 4256 eor w9,w7,w13 4257 eor w6,w6,w9 4258 mov v3.s[0],w6 4259 // optimize sbox using AESE instruction 4260 tbl v0.16b, {v3.16b}, v26.16b 4261 ushr v2.16b, v0.16b, 4 4262 and v0.16b, v0.16b, v31.16b 4263 tbl v0.16b, {v28.16b}, v0.16b 4264 tbl v2.16b, {v27.16b}, v2.16b 4265 eor v0.16b, v0.16b, v2.16b 4266 eor v1.16b, v1.16b, v1.16b 4267 aese v0.16b,v1.16b 4268 ushr v2.16b, v0.16b, 4 4269 and v0.16b, v0.16b, v31.16b 4270 tbl v0.16b, {v30.16b}, v0.16b 4271 tbl v2.16b, {v29.16b}, v2.16b 4272 eor v0.16b, v0.16b, v2.16b 4273 4274 mov w7,v0.s[0] 4275 eor w6,w7,w7,ror #32-2 4276 eor w6,w6,w7,ror #32-10 4277 eor w6,w6,w7,ror #32-18 4278 eor w6,w6,w7,ror #32-24 4279 eor w12,w12,w6 4280 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 4281 eor w6,w14,w15 4282 eor w9,w12,w8 4283 eor w6,w6,w9 4284 mov v3.s[0],w6 4285 // optimize sbox using AESE instruction 4286 tbl v0.16b, {v3.16b}, v26.16b 4287 ushr v2.16b, v0.16b, 4 4288 and v0.16b, v0.16b, v31.16b 4289 tbl v0.16b, {v28.16b}, v0.16b 4290 tbl v2.16b, {v27.16b}, v2.16b 4291 eor v0.16b, v0.16b, v2.16b 4292 eor v1.16b, v1.16b, v1.16b 4293 aese v0.16b,v1.16b 4294 ushr v2.16b, v0.16b, 4 4295 and v0.16b, v0.16b, v31.16b 4296 tbl v0.16b, {v30.16b}, v0.16b 4297 tbl v2.16b, {v29.16b}, v2.16b 4298 eor v0.16b, v0.16b, v2.16b 4299 4300 mov w7,v0.s[0] 4301 eor w6,w7,w7,ror #32-2 4302 eor w6,w6,w7,ror #32-10 4303 eor w6,w6,w7,ror #32-18 4304 eor w6,w6,w7,ror #32-24 4305 ldp w7,w8,[x10],8 4306 eor w13,w13,w6 4307 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 4308 eor w6,w12,w13 4309 eor w9,w7,w15 4310 eor w6,w6,w9 4311 mov v3.s[0],w6 4312 // optimize sbox using AESE instruction 4313 tbl v0.16b, {v3.16b}, v26.16b 4314 ushr v2.16b, v0.16b, 4 4315 and v0.16b, v0.16b, v31.16b 4316 tbl v0.16b, {v28.16b}, v0.16b 4317 tbl v2.16b, {v27.16b}, v2.16b 4318 eor v0.16b, v0.16b, v2.16b 4319 eor v1.16b, v1.16b, v1.16b 4320 aese v0.16b,v1.16b 4321 ushr v2.16b, v0.16b, 4 4322 and v0.16b, v0.16b, v31.16b 4323 tbl v0.16b, {v30.16b}, v0.16b 4324 tbl v2.16b, {v29.16b}, v2.16b 4325 eor v0.16b, v0.16b, v2.16b 4326 4327 mov w7,v0.s[0] 4328 eor w6,w7,w7,ror #32-2 4329 eor w6,w6,w7,ror #32-10 4330 eor w6,w6,w7,ror #32-18 4331 eor w6,w6,w7,ror #32-24 4332 eor w14,w14,w6 4333 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 4334 eor w6,w12,w13 4335 eor w9,w14,w8 4336 eor w6,w6,w9 4337 mov v3.s[0],w6 4338 // optimize sbox using AESE instruction 4339 tbl v0.16b, {v3.16b}, v26.16b 4340 ushr v2.16b, v0.16b, 4 4341 and v0.16b, v0.16b, v31.16b 4342 tbl v0.16b, {v28.16b}, v0.16b 4343 tbl v2.16b, {v27.16b}, v2.16b 4344 eor v0.16b, v0.16b, v2.16b 4345 eor v1.16b, v1.16b, v1.16b 4346 aese v0.16b,v1.16b 4347 ushr v2.16b, v0.16b, 4 4348 and v0.16b, v0.16b, v31.16b 4349 tbl v0.16b, {v30.16b}, v0.16b 4350 tbl v2.16b, {v29.16b}, v2.16b 4351 eor v0.16b, v0.16b, v2.16b 4352 4353 mov w7,v0.s[0] 4354 eor w6,w7,w7,ror #32-2 4355 eor w6,w6,w7,ror #32-10 4356 eor w6,w6,w7,ror #32-18 4357 eor w6,w6,w7,ror #32-24 4358 eor w15,w15,w6 4359 subs w11,w11,#1 4360 b.ne 10b 4361 mov v4.s[0],w15 4362 mov v4.s[1],w14 4363 mov v4.s[2],w13 4364 mov v4.s[3],w12 4365#ifndef __AARCH64EB__ 4366 rev32 v4.16b,v4.16b 4367#endif 4368 eor v4.16b, v4.16b, v17.16b 4369 st1 {v4.4s},[x1],#16 4370 4371 sub x26,x1,16 4372.loop: 4373 subs x29,x29,1 4374 ldrb w7,[x26,x29] 4375 ldrb w8,[x0,x29] 4376 strb w8,[x26,x29] 4377 strb w7,[x1,x29] 4378 b.gt .loop 4379 ld1 {v4.4s}, [x26] 4380 eor v4.16b, v4.16b, v18.16b 4381#ifndef __AARCH64EB__ 4382 rev32 v4.16b,v4.16b 4383#endif 4384 mov x10,x3 4385 mov w11,#8 4386 mov w12,v4.s[0] 4387 mov w13,v4.s[1] 4388 mov w14,v4.s[2] 4389 mov w15,v4.s[3] 439010: 4391 ldp w7,w8,[x10],8 4392 // B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0) 4393 eor w6,w14,w15 4394 eor w9,w7,w13 4395 eor w6,w6,w9 4396 mov v3.s[0],w6 4397 // optimize sbox using AESE instruction 4398 tbl v0.16b, {v3.16b}, v26.16b 4399 ushr v2.16b, v0.16b, 4 4400 and v0.16b, v0.16b, v31.16b 4401 tbl v0.16b, {v28.16b}, v0.16b 4402 tbl v2.16b, {v27.16b}, v2.16b 4403 eor v0.16b, v0.16b, v2.16b 4404 eor v1.16b, v1.16b, v1.16b 4405 aese v0.16b,v1.16b 4406 ushr v2.16b, v0.16b, 4 4407 and v0.16b, v0.16b, v31.16b 4408 tbl v0.16b, {v30.16b}, v0.16b 4409 tbl v2.16b, {v29.16b}, v2.16b 4410 eor v0.16b, v0.16b, v2.16b 4411 4412 mov w7,v0.s[0] 4413 eor w6,w7,w7,ror #32-2 4414 eor w6,w6,w7,ror #32-10 4415 eor w6,w6,w7,ror #32-18 4416 eor w6,w6,w7,ror #32-24 4417 eor w12,w12,w6 4418 // B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1) 4419 eor w6,w14,w15 4420 eor w9,w12,w8 4421 eor w6,w6,w9 4422 mov v3.s[0],w6 4423 // optimize sbox using AESE instruction 4424 tbl v0.16b, {v3.16b}, v26.16b 4425 ushr v2.16b, v0.16b, 4 4426 and v0.16b, v0.16b, v31.16b 4427 tbl v0.16b, {v28.16b}, v0.16b 4428 tbl v2.16b, {v27.16b}, v2.16b 4429 eor v0.16b, v0.16b, v2.16b 4430 eor v1.16b, v1.16b, v1.16b 4431 aese v0.16b,v1.16b 4432 ushr v2.16b, v0.16b, 4 4433 and v0.16b, v0.16b, v31.16b 4434 tbl v0.16b, {v30.16b}, v0.16b 4435 tbl v2.16b, {v29.16b}, v2.16b 4436 eor v0.16b, v0.16b, v2.16b 4437 4438 mov w7,v0.s[0] 4439 eor w6,w7,w7,ror #32-2 4440 eor w6,w6,w7,ror #32-10 4441 eor w6,w6,w7,ror #32-18 4442 eor w6,w6,w7,ror #32-24 4443 ldp w7,w8,[x10],8 4444 eor w13,w13,w6 4445 // B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2) 4446 eor w6,w12,w13 4447 eor w9,w7,w15 4448 eor w6,w6,w9 4449 mov v3.s[0],w6 4450 // optimize sbox using AESE instruction 4451 tbl v0.16b, {v3.16b}, v26.16b 4452 ushr v2.16b, v0.16b, 4 4453 and v0.16b, v0.16b, v31.16b 4454 tbl v0.16b, {v28.16b}, v0.16b 4455 tbl v2.16b, {v27.16b}, v2.16b 4456 eor v0.16b, v0.16b, v2.16b 4457 eor v1.16b, v1.16b, v1.16b 4458 aese v0.16b,v1.16b 4459 ushr v2.16b, v0.16b, 4 4460 and v0.16b, v0.16b, v31.16b 4461 tbl v0.16b, {v30.16b}, v0.16b 4462 tbl v2.16b, {v29.16b}, v2.16b 4463 eor v0.16b, v0.16b, v2.16b 4464 4465 mov w7,v0.s[0] 4466 eor w6,w7,w7,ror #32-2 4467 eor w6,w6,w7,ror #32-10 4468 eor w6,w6,w7,ror #32-18 4469 eor w6,w6,w7,ror #32-24 4470 eor w14,w14,w6 4471 // B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3) 4472 eor w6,w12,w13 4473 eor w9,w14,w8 4474 eor w6,w6,w9 4475 mov v3.s[0],w6 4476 // optimize sbox using AESE instruction 4477 tbl v0.16b, {v3.16b}, v26.16b 4478 ushr v2.16b, v0.16b, 4 4479 and v0.16b, v0.16b, v31.16b 4480 tbl v0.16b, {v28.16b}, v0.16b 4481 tbl v2.16b, {v27.16b}, v2.16b 4482 eor v0.16b, v0.16b, v2.16b 4483 eor v1.16b, v1.16b, v1.16b 4484 aese v0.16b,v1.16b 4485 ushr v2.16b, v0.16b, 4 4486 and v0.16b, v0.16b, v31.16b 4487 tbl v0.16b, {v30.16b}, v0.16b 4488 tbl v2.16b, {v29.16b}, v2.16b 4489 eor v0.16b, v0.16b, v2.16b 4490 4491 mov w7,v0.s[0] 4492 eor w6,w7,w7,ror #32-2 4493 eor w6,w6,w7,ror #32-10 4494 eor w6,w6,w7,ror #32-18 4495 eor w6,w6,w7,ror #32-24 4496 eor w15,w15,w6 4497 subs w11,w11,#1 4498 b.ne 10b 4499 mov v4.s[0],w15 4500 mov v4.s[1],w14 4501 mov v4.s[2],w13 4502 mov v4.s[3],w12 4503#ifndef __AARCH64EB__ 4504 rev32 v4.16b,v4.16b 4505#endif 4506 eor v4.16b, v4.16b, v18.16b 4507 st1 {v4.4s}, [x26] 4508.return: 4509 ldp d14, d15, [sp], #0x10 4510 ldp d12, d13, [sp], #0x10 4511 ldp d10, d11, [sp], #0x10 4512 ldp d8, d9, [sp], #0x10 4513 ldp x29, x30, [sp], #0x10 4514 ldp x27, x28, [sp], #0x10 4515 ldp x25, x26, [sp], #0x10 4516 ldp x23, x24, [sp], #0x10 4517 ldp x21, x22, [sp], #0x10 4518 ldp x19, x20, [sp], #0x10 4519 ldp x17, x18, [sp], #0x10 4520 ldp x15, x16, [sp], #0x10 4521 AARCH64_VALIDATE_LINK_REGISTER 4522 ret 4523.size vpsm4_ex_xts_encrypt,.-vpsm4_ex_xts_encrypt 4524