/* Do not modify. This file is auto-generated from vpsm4-armv8.pl. */
// Copyright 2020-2025 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the Apache License 2.0 (the "License"). You may not use
// this file except in compliance with the License. You can obtain a copy
// in the file LICENSE in the source distribution or at
// https://www.openssl.org/source/license.html

//
// This module implements SM4 with ASIMD on aarch64
//
// Feb 2022
//

// $output is the last argument if it looks like a file (it has an extension)
// $flavour is the first argument if it doesn't look like a file
#include "arm_arch.h"
.arch	armv8-a
.text

.section	.rodata
.type	_vpsm4_consts,%object
.align	7
_vpsm4_consts:
.Lsbox:
.byte 0xD6,0x90,0xE9,0xFE,0xCC,0xE1,0x3D,0xB7,0x16,0xB6,0x14,0xC2,0x28,0xFB,0x2C,0x05
.byte 0x2B,0x67,0x9A,0x76,0x2A,0xBE,0x04,0xC3,0xAA,0x44,0x13,0x26,0x49,0x86,0x06,0x99
.byte 0x9C,0x42,0x50,0xF4,0x91,0xEF,0x98,0x7A,0x33,0x54,0x0B,0x43,0xED,0xCF,0xAC,0x62
.byte 0xE4,0xB3,0x1C,0xA9,0xC9,0x08,0xE8,0x95,0x80,0xDF,0x94,0xFA,0x75,0x8F,0x3F,0xA6
.byte 0x47,0x07,0xA7,0xFC,0xF3,0x73,0x17,0xBA,0x83,0x59,0x3C,0x19,0xE6,0x85,0x4F,0xA8
.byte 0x68,0x6B,0x81,0xB2,0x71,0x64,0xDA,0x8B,0xF8,0xEB,0x0F,0x4B,0x70,0x56,0x9D,0x35
.byte 0x1E,0x24,0x0E,0x5E,0x63,0x58,0xD1,0xA2,0x25,0x22,0x7C,0x3B,0x01,0x21,0x78,0x87
.byte 0xD4,0x00,0x46,0x57,0x9F,0xD3,0x27,0x52,0x4C,0x36,0x02,0xE7,0xA0,0xC4,0xC8,0x9E
.byte 0xEA,0xBF,0x8A,0xD2,0x40,0xC7,0x38,0xB5,0xA3,0xF7,0xF2,0xCE,0xF9,0x61,0x15,0xA1
.byte 0xE0,0xAE,0x5D,0xA4,0x9B,0x34,0x1A,0x55,0xAD,0x93,0x32,0x30,0xF5,0x8C,0xB1,0xE3
.byte 0x1D,0xF6,0xE2,0x2E,0x82,0x66,0xCA,0x60,0xC0,0x29,0x23,0xAB,0x0D,0x53,0x4E,0x6F
.byte 0xD5,0xDB,0x37,0x45,0xDE,0xFD,0x8E,0x2F,0x03,0xFF,0x6A,0x72,0x6D,0x6C,0x5B,0x51
.byte 0x8D,0x1B,0xAF,0x92,0xBB,0xDD,0xBC,0x7F,0x11,0xD9,0x5C,0x41,0x1F,0x10,0x5A,0xD8
.byte 0x0A,0xC1,0x31,0x88,0xA5,0xCD,0x7B,0xBD,0x2D,0x74,0xD0,0x12,0xB8,0xE5,0xB4,0xB0
.byte 0x89,0x69,0x97,0x4A,0x0C,0x96,0x77,0x7E,0x65,0xB9,0xF1,0x09,0xC5,0x6E,0xC6,0x84
.byte 0x18,0xF0,0x7D,0xEC,0x3A,0xDC,0x4D,0x20,0x79,0xEE,0x5F,0x3E,0xD7,0xCB,0x39,0x48
.Lck:
.long 0x00070E15, 0x1C232A31, 0x383F464D, 0x545B6269
.long 0x70777E85, 0x8C939AA1, 0xA8AFB6BD, 0xC4CBD2D9
.long 0xE0E7EEF5, 0xFC030A11, 0x181F262D, 0x343B4249
.long 0x50575E65, 0x6C737A81, 0x888F969D, 0xA4ABB2B9
.long 0xC0C7CED5, 0xDCE3EAF1, 0xF8FF060D, 0x141B2229
.long 0x30373E45, 0x4C535A61, 0x686F767D, 0x848B9299
.long 0xA0A7AEB5, 0xBCC3CAD1, 0xD8DFE6ED, 0xF4FB0209
.long 0x10171E25, 0x2C333A41, 0x484F565D, 0x646B7279
.Lfk:
.quad 0x56aa3350a3b1bac6,0xb27022dc677d9197
.Lshuffles:
.quad 0x0B0A090807060504,0x030201000F0E0D0C
.Lxts_magic:
.quad 0x0101010101010187,0x0101010101010101

.size _vpsm4_consts,.-_vpsm4_consts

.previous

.type _vpsm4_set_key,%function
.align 4
_vpsm4_set_key:
	AARCH64_VALID_CALL_TARGET
	ld1 {v5.4s},[x0]
	adrp x10,.Lsbox
	add x10,x10,#:lo12:.Lsbox
	ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x10],#64
	ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x10],#64
	ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x10],#64
	ld1 {v28.16b,v29.16b,v30.16b,v31.16b},[x10]
#ifndef __AARCH64EB__
	rev32 v5.16b,v5.16b
#endif
	adrp x5,.Lshuffles
	add x5,x5,#:lo12:.Lshuffles
	ld1 {v7.2d},[x5]
	adrp x5,.Lfk
	add x5,x5,#:lo12:.Lfk
	ld1 {v6.2d},[x5]
	eor v5.16b,v5.16b,v6.16b
	mov x6,#32
	adrp x5,.Lck
	add x5,x5,#:lo12:.Lck
	movi v0.16b,#64
	cbnz w2,1f
	add x1,x1,124
1:
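	// Each iteration produces one round key:
	//   rk[i] = K[i] ^ T'(K[i+1] ^ K[i+2] ^ K[i+3] ^ CK[i])
	// where T' applies the sbox bytewise and then the key-schedule
	// linear transform L'(x) = x ^ (x <<< 13) ^ (x <<< 23); the
	// "ror #19"/"ror #9" below are those left-rotates (32-13, 32-23).
	// For decryption (w2 == 0) the keys are stored in reverse order,
	// starting at offset 124.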
	mov w7,v5.s[1]
	ldr w8,[x5],#4
	eor w8,w8,w7
	mov w7,v5.s[2]
	eor w8,w8,w7
	mov w7,v5.s[3]
	eor w8,w8,w7
	// sbox lookup
	mov v4.s[0],w8
	tbl v1.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v4.16b
	sub v4.16b,v4.16b,v0.16b
	tbx v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v4.16b
	sub v4.16b,v4.16b,v0.16b
	tbx v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v4.16b
	sub v4.16b,v4.16b,v0.16b
	tbx v1.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v4.16b
	mov w7,v1.s[0]
	eor w8,w7,w7,ror #19
	eor w8,w8,w7,ror #9
	mov w7,v5.s[0]
	eor w8,w8,w7
	mov v5.s[0],w8
	cbz w2,2f
	str w8,[x1],#4
	b 3f
2:
	str w8,[x1],#-4
3:
	tbl v5.16b,{v5.16b},v7.16b
	subs x6,x6,#1
	b.ne 1b
	ret
.size _vpsm4_set_key,.-_vpsm4_set_key
.type _vpsm4_enc_4blks,%function
.align 4
_vpsm4_enc_4blks:
	AARCH64_VALID_CALL_TARGET
	mov x10,x3
	mov w11,#8
10:
	ldp w7,w8,[x10],8
	dup v12.4s,w7
	dup v13.4s,w8

	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor v14.16b,v6.16b,v7.16b
	eor v12.16b,v5.16b,v12.16b
	eor v12.16b,v14.16b,v12.16b
	movi v0.16b,#64
	movi v1.16b,#128
	movi v2.16b,#192
	sub v0.16b,v12.16b,v0.16b
	sub v1.16b,v12.16b,v1.16b
	sub v2.16b,v12.16b,v2.16b
	tbl v12.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v12.16b
	tbl v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
	tbl v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
	tbl v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
	add v0.2d,v0.2d,v1.2d
	add v2.2d,v2.2d,v12.2d
	add v12.2d,v0.2d,v2.2d

	ushr v0.4s,v12.4s,32-2
	sli v0.4s,v12.4s,2
	ushr v2.4s,v12.4s,32-10
	eor v1.16b,v0.16b,v12.16b
	sli v2.4s,v12.4s,10
	eor v1.16b,v2.16b,v1.16b
	ushr v0.4s,v12.4s,32-18
	sli v0.4s,v12.4s,18
	ushr v2.4s,v12.4s,32-24
	eor v1.16b,v0.16b,v1.16b
	sli v2.4s,v12.4s,24
	eor v12.16b,v2.16b,v1.16b
	eor v4.16b,v4.16b,v12.16b

	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor v14.16b,v14.16b,v4.16b
	eor v13.16b,v14.16b,v13.16b
	movi v0.16b,#64
	movi v1.16b,#128
	movi v2.16b,#192
	sub v0.16b,v13.16b,v0.16b
	sub v1.16b,v13.16b,v1.16b
	sub v2.16b,v13.16b,v2.16b
	tbl v13.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v13.16b
	tbl v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
	tbl v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
	tbl v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
	add v0.2d,v0.2d,v1.2d
	add v2.2d,v2.2d,v13.2d
	add v13.2d,v0.2d,v2.2d

	ushr v0.4s,v13.4s,32-2
	sli v0.4s,v13.4s,2
	ushr v2.4s,v13.4s,32-10
	eor v1.16b,v0.16b,v13.16b
	sli v2.4s,v13.4s,10
	eor v1.16b,v2.16b,v1.16b
	ushr v0.4s,v13.4s,32-18
	sli v0.4s,v13.4s,18
	ushr v2.4s,v13.4s,32-24
	eor v1.16b,v0.16b,v1.16b
	sli v2.4s,v13.4s,24
	eor v13.16b,v2.16b,v1.16b
	ldp w7,w8,[x10],8
	eor v5.16b,v5.16b,v13.16b

	dup v12.4s,w7
	dup v13.4s,w8

	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor v14.16b,v4.16b,v5.16b
	eor v12.16b,v7.16b,v12.16b
	eor v12.16b,v14.16b,v12.16b
	movi v0.16b,#64
	movi v1.16b,#128
	movi v2.16b,#192
	sub v0.16b,v12.16b,v0.16b
	sub v1.16b,v12.16b,v1.16b
	sub v2.16b,v12.16b,v2.16b
	tbl v12.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v12.16b
	tbl v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
	tbl v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
	tbl v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
	add v0.2d,v0.2d,v1.2d
	add v2.2d,v2.2d,v12.2d
	add v12.2d,v0.2d,v2.2d

	ushr v0.4s,v12.4s,32-2
	sli v0.4s,v12.4s,2
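	// the remaining rotate-XORs complete the linear transform
	// L(B) = B ^ (B <<< 2) ^ (B <<< 10) ^ (B <<< 18) ^ (B <<< 24);
	// each "ushr 32-n" plus "sli n" pair is a left-rotate by n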
	ushr v2.4s,v12.4s,32-10
	eor v1.16b,v0.16b,v12.16b
	sli v2.4s,v12.4s,10
	eor v1.16b,v2.16b,v1.16b
	ushr v0.4s,v12.4s,32-18
	sli v0.4s,v12.4s,18
	ushr v2.4s,v12.4s,32-24
	eor v1.16b,v0.16b,v1.16b
	sli v2.4s,v12.4s,24
	eor v12.16b,v2.16b,v1.16b
	eor v6.16b,v6.16b,v12.16b

	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor v14.16b,v14.16b,v6.16b
	eor v13.16b,v14.16b,v13.16b
	movi v0.16b,#64
	movi v1.16b,#128
	movi v2.16b,#192
	sub v0.16b,v13.16b,v0.16b
	sub v1.16b,v13.16b,v1.16b
	sub v2.16b,v13.16b,v2.16b
	tbl v13.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v13.16b
	tbl v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
	tbl v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
	tbl v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
	add v0.2d,v0.2d,v1.2d
	add v2.2d,v2.2d,v13.2d
	add v13.2d,v0.2d,v2.2d

	ushr v0.4s,v13.4s,32-2
	sli v0.4s,v13.4s,2
	ushr v2.4s,v13.4s,32-10
	eor v1.16b,v0.16b,v13.16b
	sli v2.4s,v13.4s,10
	eor v1.16b,v2.16b,v1.16b
	ushr v0.4s,v13.4s,32-18
	sli v0.4s,v13.4s,18
	ushr v2.4s,v13.4s,32-24
	eor v1.16b,v0.16b,v1.16b
	sli v2.4s,v13.4s,24
	eor v13.16b,v2.16b,v1.16b
	eor v7.16b,v7.16b,v13.16b
	subs w11,w11,#1
	b.ne 10b
#ifndef __AARCH64EB__
	rev32 v3.16b,v4.16b
#else
	mov v3.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32 v2.16b,v5.16b
#else
	mov v2.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32 v1.16b,v6.16b
#else
	mov v1.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32 v0.16b,v7.16b
#else
	mov v0.16b,v7.16b
#endif
	ret
.size _vpsm4_enc_4blks,.-_vpsm4_enc_4blks
.type _vpsm4_enc_8blks,%function
.align 4
_vpsm4_enc_8blks:
	AARCH64_VALID_CALL_TARGET
	mov x10,x3
	mov w11,#8
10:
	ldp w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	dup v12.4s,w7
	eor v14.16b,v6.16b,v7.16b
	eor v15.16b,v10.16b,v11.16b
	eor v0.16b,v5.16b,v12.16b
	eor v1.16b,v9.16b,v12.16b
	eor v12.16b,v14.16b,v0.16b
	eor v13.16b,v15.16b,v1.16b
	movi v3.16b,#64
	sub v0.16b,v12.16b,v3.16b
	sub v1.16b,v0.16b,v3.16b
	sub v2.16b,v1.16b,v3.16b
	tbl v12.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v12.16b
	tbl v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
	tbl v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
	tbl v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
	add v1.2d,v0.2d,v1.2d
	add v12.2d,v2.2d,v12.2d
	add v12.2d,v1.2d,v12.2d

	sub v0.16b,v13.16b,v3.16b
	sub v1.16b,v0.16b,v3.16b
	sub v2.16b,v1.16b,v3.16b
	tbl v13.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v13.16b
	tbl v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
	tbl v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
	tbl v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
	add v1.2d,v0.2d,v1.2d
	add v13.2d,v2.2d,v13.2d
	add v13.2d,v1.2d,v13.2d

	ushr v0.4s,v12.4s,32-2
	sli v0.4s,v12.4s,2
	ushr v2.4s,v13.4s,32-2
	eor v1.16b,v0.16b,v12.16b
	sli v2.4s,v13.4s,2

	ushr v0.4s,v12.4s,32-10
	eor v3.16b,v2.16b,v13.16b
	sli v0.4s,v12.4s,10
	ushr v2.4s,v13.4s,32-10
	eor v1.16b,v0.16b,v1.16b
	sli v2.4s,v13.4s,10

	ushr v0.4s,v12.4s,32-18
	eor v3.16b,v2.16b,v3.16b
	sli v0.4s,v12.4s,18
	ushr v2.4s,v13.4s,32-18
	eor v1.16b,v0.16b,v1.16b
	sli v2.4s,v13.4s,18

	ushr v0.4s,v12.4s,32-24
	eor v3.16b,v2.16b,v3.16b
	sli v0.4s,v12.4s,24
	ushr v2.4s,v13.4s,32-24
	eor v12.16b,v0.16b,v1.16b
	sli v2.4s,v13.4s,24
	eor v13.16b,v2.16b,v3.16b
	eor v4.16b,v4.16b,v12.16b
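	// the 8-block path interleaves two groups of four blocks
	// (v4-v7 and v8-v11); the same B0 update is applied to the
	// second group here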
	eor v8.16b,v8.16b,v13.16b

	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	dup v13.4s,w8
	eor v14.16b,v14.16b,v4.16b
	eor v15.16b,v15.16b,v8.16b
	eor v12.16b,v14.16b,v13.16b
	eor v13.16b,v15.16b,v13.16b
	movi v3.16b,#64
	sub v0.16b,v12.16b,v3.16b
	sub v1.16b,v0.16b,v3.16b
	sub v2.16b,v1.16b,v3.16b
	tbl v12.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v12.16b
	tbl v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
	tbl v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
	tbl v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
	add v1.2d,v0.2d,v1.2d
	add v12.2d,v2.2d,v12.2d
	add v12.2d,v1.2d,v12.2d

	sub v0.16b,v13.16b,v3.16b
	sub v1.16b,v0.16b,v3.16b
	sub v2.16b,v1.16b,v3.16b
	tbl v13.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v13.16b
	tbl v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
	tbl v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
	tbl v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
	add v1.2d,v0.2d,v1.2d
	add v13.2d,v2.2d,v13.2d
	add v13.2d,v1.2d,v13.2d

	ushr v0.4s,v12.4s,32-2
	sli v0.4s,v12.4s,2
	ushr v2.4s,v13.4s,32-2
	eor v1.16b,v0.16b,v12.16b
	sli v2.4s,v13.4s,2

	ushr v0.4s,v12.4s,32-10
	eor v3.16b,v2.16b,v13.16b
	sli v0.4s,v12.4s,10
	ushr v2.4s,v13.4s,32-10
	eor v1.16b,v0.16b,v1.16b
	sli v2.4s,v13.4s,10

	ushr v0.4s,v12.4s,32-18
	eor v3.16b,v2.16b,v3.16b
	sli v0.4s,v12.4s,18
	ushr v2.4s,v13.4s,32-18
	eor v1.16b,v0.16b,v1.16b
	sli v2.4s,v13.4s,18

	ushr v0.4s,v12.4s,32-24
	eor v3.16b,v2.16b,v3.16b
	sli v0.4s,v12.4s,24
	ushr v2.4s,v13.4s,32-24
	eor v12.16b,v0.16b,v1.16b
	sli v2.4s,v13.4s,24
	eor v13.16b,v2.16b,v3.16b
	ldp w7,w8,[x10],8
	eor v5.16b,v5.16b,v12.16b
	eor v9.16b,v9.16b,v13.16b

	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	dup v12.4s,w7
	eor v14.16b,v4.16b,v5.16b
	eor v15.16b,v8.16b,v9.16b
	eor v0.16b,v7.16b,v12.16b
	eor v1.16b,v11.16b,v12.16b
	eor v12.16b,v14.16b,v0.16b
	eor v13.16b,v15.16b,v1.16b
	movi v3.16b,#64
	sub v0.16b,v12.16b,v3.16b
	sub v1.16b,v0.16b,v3.16b
	sub v2.16b,v1.16b,v3.16b
	tbl v12.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v12.16b
	tbl v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
	tbl v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
	tbl v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
	add v1.2d,v0.2d,v1.2d
	add v12.2d,v2.2d,v12.2d
	add v12.2d,v1.2d,v12.2d

	sub v0.16b,v13.16b,v3.16b
	sub v1.16b,v0.16b,v3.16b
	sub v2.16b,v1.16b,v3.16b
	tbl v13.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v13.16b
	tbl v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
	tbl v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
	tbl v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
	add v1.2d,v0.2d,v1.2d
	add v13.2d,v2.2d,v13.2d
	add v13.2d,v1.2d,v13.2d

	ushr v0.4s,v12.4s,32-2
	sli v0.4s,v12.4s,2
	ushr v2.4s,v13.4s,32-2
	eor v1.16b,v0.16b,v12.16b
	sli v2.4s,v13.4s,2

	ushr v0.4s,v12.4s,32-10
	eor v3.16b,v2.16b,v13.16b
	sli v0.4s,v12.4s,10
	ushr v2.4s,v13.4s,32-10
	eor v1.16b,v0.16b,v1.16b
	sli v2.4s,v13.4s,10

	ushr v0.4s,v12.4s,32-18
	eor v3.16b,v2.16b,v3.16b
	sli v0.4s,v12.4s,18
	ushr v2.4s,v13.4s,32-18
	eor v1.16b,v0.16b,v1.16b
	sli v2.4s,v13.4s,18

	ushr v0.4s,v12.4s,32-24
	eor v3.16b,v2.16b,v3.16b
	sli v0.4s,v12.4s,24
	ushr v2.4s,v13.4s,32-24
	eor v12.16b,v0.16b,v1.16b
	sli v2.4s,v13.4s,24
	eor v13.16b,v2.16b,v3.16b
	eor v6.16b,v6.16b,v12.16b
	eor v10.16b,v10.16b,v13.16b

	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	dup v13.4s,w8
	eor v14.16b,v14.16b,v6.16b
	eor v15.16b,v15.16b,v10.16b
	eor v12.16b,v14.16b,v13.16b
	eor v13.16b,v15.16b,v13.16b
	movi v3.16b,#64
	sub v0.16b,v12.16b,v3.16b
	sub v1.16b,v0.16b,v3.16b
	sub v2.16b,v1.16b,v3.16b
	tbl v12.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v12.16b
	tbl v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
	tbl v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
	tbl v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
	add v1.2d,v0.2d,v1.2d
	add v12.2d,v2.2d,v12.2d
	add v12.2d,v1.2d,v12.2d

	sub v0.16b,v13.16b,v3.16b
	sub v1.16b,v0.16b,v3.16b
	sub v2.16b,v1.16b,v3.16b
	tbl v13.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v13.16b
	tbl v0.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v0.16b
	tbl v1.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v1.16b
	tbl v2.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v2.16b
	add v1.2d,v0.2d,v1.2d
	add v13.2d,v2.2d,v13.2d
	add v13.2d,v1.2d,v13.2d

	ushr v0.4s,v12.4s,32-2
	sli v0.4s,v12.4s,2
	ushr v2.4s,v13.4s,32-2
	eor v1.16b,v0.16b,v12.16b
	sli v2.4s,v13.4s,2

	ushr v0.4s,v12.4s,32-10
	eor v3.16b,v2.16b,v13.16b
	sli v0.4s,v12.4s,10
	ushr v2.4s,v13.4s,32-10
	eor v1.16b,v0.16b,v1.16b
	sli v2.4s,v13.4s,10

	ushr v0.4s,v12.4s,32-18
	eor v3.16b,v2.16b,v3.16b
	sli v0.4s,v12.4s,18
	ushr v2.4s,v13.4s,32-18
	eor v1.16b,v0.16b,v1.16b
	sli v2.4s,v13.4s,18

	ushr v0.4s,v12.4s,32-24
	eor v3.16b,v2.16b,v3.16b
	sli v0.4s,v12.4s,24
	ushr v2.4s,v13.4s,32-24
	eor v12.16b,v0.16b,v1.16b
	sli v2.4s,v13.4s,24
	eor v13.16b,v2.16b,v3.16b
	eor v7.16b,v7.16b,v12.16b
	eor v11.16b,v11.16b,v13.16b
	subs w11,w11,#1
	b.ne 10b
#ifndef __AARCH64EB__
	rev32 v3.16b,v4.16b
#else
	mov v3.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32 v2.16b,v5.16b
#else
	mov v2.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32 v1.16b,v6.16b
#else
	mov v1.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32 v0.16b,v7.16b
#else
	mov v0.16b,v7.16b
#endif
#ifndef __AARCH64EB__
	rev32 v7.16b,v8.16b
#else
	mov v7.16b,v8.16b
#endif
#ifndef __AARCH64EB__
	rev32 v6.16b,v9.16b
#else
	mov v6.16b,v9.16b
#endif
#ifndef __AARCH64EB__
	rev32 v5.16b,v10.16b
#else
	mov v5.16b,v10.16b
#endif
#ifndef __AARCH64EB__
	rev32 v4.16b,v11.16b
#else
	mov v4.16b,v11.16b
#endif
	ret
.size _vpsm4_enc_8blks,.-_vpsm4_enc_8blks
.globl vpsm4_set_encrypt_key
.type vpsm4_set_encrypt_key,%function
.align 5
vpsm4_set_encrypt_key:
	AARCH64_SIGN_LINK_REGISTER
	stp x29,x30,[sp,#-16]!
	mov w2,1
	bl _vpsm4_set_key
	ldp x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size vpsm4_set_encrypt_key,.-vpsm4_set_encrypt_key
.globl vpsm4_set_decrypt_key
.type vpsm4_set_decrypt_key,%function
.align 5
vpsm4_set_decrypt_key:
	AARCH64_SIGN_LINK_REGISTER
	stp x29,x30,[sp,#-16]!
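	// w2=0 selects decrypt key order: _vpsm4_set_key stores the
	// round keys in reverse, so decryption reuses the same rounds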
	mov w2,0
	bl _vpsm4_set_key
	ldp x29,x30,[sp],#16
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size vpsm4_set_decrypt_key,.-vpsm4_set_decrypt_key
.globl vpsm4_encrypt
.type vpsm4_encrypt,%function
.align 5
vpsm4_encrypt:
	AARCH64_VALID_CALL_TARGET
	ld1 {v4.4s},[x0]
	adrp x10,.Lsbox
	add x10,x10,#:lo12:.Lsbox
	ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x10],#64
	ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x10],#64
	ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x10],#64
	ld1 {v28.16b,v29.16b,v30.16b,v31.16b},[x10]
#ifndef __AARCH64EB__
	rev32 v4.16b,v4.16b
#endif
	mov x3,x2
	mov x10,x3
	mov w11,#8
	mov w12,v4.s[0]
	mov w13,v4.s[1]
	mov w14,v4.s[2]
	mov w15,v4.s[3]
10:
	ldp w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor w6,w14,w15
	eor w9,w7,w13
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	eor w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor w6,w14,w15
	eor w9,w12,w8
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	ldp w7,w8,[x10],8
	eor w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor w6,w12,w13
	eor w9,w7,w15
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	eor w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor w6,w12,w13
	eor w9,w14,w8
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

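	// tbl returns 0 for indices outside 0-63, so for each byte
	// exactly one of the four lookups is nonzero; the adds below
	// merge the four 64-byte sbox segments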
	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	eor w15,w15,w6
	subs w11,w11,#1
	b.ne 10b
	mov v4.s[0],w15
	mov v4.s[1],w14
	mov v4.s[2],w13
	mov v4.s[3],w12
#ifndef __AARCH64EB__
	rev32 v4.16b,v4.16b
#endif
	st1 {v4.4s},[x1]
	ret
.size vpsm4_encrypt,.-vpsm4_encrypt
.globl vpsm4_decrypt
.type vpsm4_decrypt,%function
.align 5
vpsm4_decrypt:
	AARCH64_VALID_CALL_TARGET
	ld1 {v4.4s},[x0]
	adrp x10,.Lsbox
	add x10,x10,#:lo12:.Lsbox
	ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x10],#64
	ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x10],#64
	ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x10],#64
	ld1 {v28.16b,v29.16b,v30.16b,v31.16b},[x10]
#ifndef __AARCH64EB__
	rev32 v4.16b,v4.16b
#endif
	mov x3,x2
	mov x10,x3
	mov w11,#8
	mov w12,v4.s[0]
	mov w13,v4.s[1]
	mov w14,v4.s[2]
	mov w15,v4.s[3]
10:
	ldp w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor w6,w14,w15
	eor w9,w7,w13
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	eor w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor w6,w14,w15
	eor w9,w12,w8
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	ldp w7,w8,[x10],8
	eor w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor w6,w12,w13
	eor w9,w7,w15
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	eor w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor w6,w12,w13
	eor w9,w14,w8
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
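	// note: decryption uses the identical round function; only the
	// reversed round-key order (from vpsm4_set_decrypt_key) differs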
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	eor w15,w15,w6
	subs w11,w11,#1
	b.ne 10b
	mov v4.s[0],w15
	mov v4.s[1],w14
	mov v4.s[2],w13
	mov v4.s[3],w12
#ifndef __AARCH64EB__
	rev32 v4.16b,v4.16b
#endif
	st1 {v4.4s},[x1]
	ret
.size vpsm4_decrypt,.-vpsm4_decrypt
.globl vpsm4_ecb_encrypt
.type vpsm4_ecb_encrypt,%function
.align 5
vpsm4_ecb_encrypt:
	AARCH64_SIGN_LINK_REGISTER
	// convert length into blocks
	lsr x2,x2,4
	stp d8,d9,[sp,#-80]!
	stp d10,d11,[sp,#16]
	stp d12,d13,[sp,#32]
	stp d14,d15,[sp,#48]
	stp x29,x30,[sp,#64]
	adrp x10,.Lsbox
	add x10,x10,#:lo12:.Lsbox
	ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x10],#64
	ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x10],#64
	ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x10],#64
	ld1 {v28.16b,v29.16b,v30.16b,v31.16b},[x10]
.Lecb_8_blocks_process:
	cmp w2,#8
	b.lt .Lecb_4_blocks_process
	ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
	ld4 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
#ifndef __AARCH64EB__
	rev32 v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32 v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32 v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32 v7.16b,v7.16b
#endif
#ifndef __AARCH64EB__
	rev32 v8.16b,v8.16b
#endif
#ifndef __AARCH64EB__
	rev32 v9.16b,v9.16b
#endif
#ifndef __AARCH64EB__
	rev32 v10.16b,v10.16b
#endif
#ifndef __AARCH64EB__
	rev32 v11.16b,v11.16b
#endif
	bl _vpsm4_enc_8blks
	st4 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
	st4 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
	subs w2,w2,#8
	b.gt .Lecb_8_blocks_process
	b 100f
.Lecb_4_blocks_process:
	cmp w2,#4
	b.lt 1f
	ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
#ifndef __AARCH64EB__
	rev32 v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32 v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32 v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32 v7.16b,v7.16b
#endif
	bl _vpsm4_enc_4blks
	st4 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
	sub w2,w2,#4
1:
	// process last block
	cmp w2,#1
	b.lt 100f
	b.gt 1f
	ld1 {v4.4s},[x0]
#ifndef __AARCH64EB__
	rev32 v4.16b,v4.16b
#endif
	mov x10,x3
	mov w11,#8
	mov w12,v4.s[0]
	mov w13,v4.s[1]
	mov w14,v4.s[2]
	mov w15,v4.s[3]
10:
	ldp w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor w6,w14,w15
	eor w9,w7,w13
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	eor w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor w6,w14,w15
	eor w9,w12,w8
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	ldp w7,w8,[x10],8
	eor w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor w6,w12,w13
	eor w9,w7,w15
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	eor w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor w6,w12,w13
	eor w9,w14,w8
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	eor w15,w15,w6
	subs w11,w11,#1
	b.ne 10b
	mov v4.s[0],w15
	mov v4.s[1],w14
	mov v4.s[2],w13
	mov v4.s[3],w12
#ifndef __AARCH64EB__
	rev32 v4.16b,v4.16b
#endif
	st1 {v4.4s},[x1]
	b 100f
1:	// process last 2 blocks
	ld4 {v4.s,v5.s,v6.s,v7.s}[0],[x0],#16
	ld4 {v4.s,v5.s,v6.s,v7.s}[1],[x0],#16
	cmp w2,#2
	b.gt 1f
#ifndef __AARCH64EB__
	rev32 v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32 v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32 v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32 v7.16b,v7.16b
#endif
	bl _vpsm4_enc_4blks
	st4 {v0.s,v1.s,v2.s,v3.s}[0],[x1],#16
	st4 {v0.s,v1.s,v2.s,v3.s}[1],[x1]
	b 100f
1:	// process last 3 blocks
	ld4 {v4.s,v5.s,v6.s,v7.s}[2],[x0],#16
#ifndef __AARCH64EB__
	rev32 v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32 v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32 v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32 v7.16b,v7.16b
#endif
	bl _vpsm4_enc_4blks
	st4 {v0.s,v1.s,v2.s,v3.s}[0],[x1],#16
	st4 {v0.s,v1.s,v2.s,v3.s}[1],[x1],#16
	st4 {v0.s,v1.s,v2.s,v3.s}[2],[x1]
100:
	ldp d10,d11,[sp,#16]
	ldp d12,d13,[sp,#32]
	ldp d14,d15,[sp,#48]
	ldp x29,x30,[sp,#64]
	ldp d8,d9,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size vpsm4_ecb_encrypt,.-vpsm4_ecb_encrypt
.globl vpsm4_cbc_encrypt
.type vpsm4_cbc_encrypt,%function
.align 5
vpsm4_cbc_encrypt:
	AARCH64_VALID_CALL_TARGET
	lsr x2,x2,4
	adrp x10,.Lsbox
	add x10,x10,#:lo12:.Lsbox
	ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x10],#64
	ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x10],#64
	ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x10],#64
	ld1 {v28.16b,v29.16b,v30.16b,v31.16b},[x10]
	cbz w5,.Ldec
	ld1 {v3.4s},[x4]
.Lcbc_4_blocks_enc:
	cmp w2,#4
	b.lt 1f
	ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
	eor v4.16b,v4.16b,v3.16b
#ifndef __AARCH64EB__
	rev32 v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32 v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32 v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32 v7.16b,v7.16b
#endif
	mov x10,x3
	mov w11,#8
	mov w12,v4.s[0]
	mov w13,v4.s[1]
	mov w14,v4.s[2]
	mov w15,v4.s[3]
10:
	ldp w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor w6,w14,w15
	eor w9,w7,w13
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	eor w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor w6,w14,w15
	eor w9,w12,w8
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	ldp w7,w8,[x10],8
	eor w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor w6,w12,w13
	eor w9,w7,w15
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	eor w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor w6,w12,w13
	eor w9,w14,w8
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	eor w15,w15,w6
	subs w11,w11,#1
	b.ne 10b
	mov v4.s[0],w15
	mov v4.s[1],w14
	mov v4.s[2],w13
	mov v4.s[3],w12
	eor v5.16b,v5.16b,v4.16b
	mov x10,x3
	mov w11,#8
	mov w12,v5.s[0]
	mov w13,v5.s[1]
	mov w14,v5.s[2]
	mov w15,v5.s[3]
10:
	ldp w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor w6,w14,w15
	eor w9,w7,w13
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	eor w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor w6,w14,w15
	eor w9,w12,w8
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	ldp w7,w8,[x10],8
	eor w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor w6,w12,w13
	eor w9,w7,w15
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	eor w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor w6,w12,w13
	eor w9,w14,w8
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	eor w15,w15,w6
	subs w11,w11,#1
	b.ne 10b
	mov v5.s[0],w15
	mov v5.s[1],w14
	mov v5.s[2],w13
	mov v5.s[3],w12
#ifndef __AARCH64EB__
	rev32 v4.16b,v4.16b
#endif
	eor v6.16b,v6.16b,v5.16b
	mov x10,x3
	mov w11,#8
	mov w12,v6.s[0]
	mov w13,v6.s[1]
	mov w14,v6.s[2]
	mov w15,v6.s[3]
10:
	ldp w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor w6,w14,w15
	eor w9,w7,w13
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	eor w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor w6,w14,w15
	eor w9,w12,w8
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	ldp w7,w8,[x10],8
	eor w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor w6,w12,w13
	eor w9,w7,w15
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

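	// L(B) on the scalar word: "ror #(32-n)" is a left-rotate by n,
	// giving B ^ (B <<< 2) ^ (B <<< 10) ^ (B <<< 18) ^ (B <<< 24)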
	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	eor w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor w6,w12,w13
	eor w9,w14,w8
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	eor w15,w15,w6
	subs w11,w11,#1
	b.ne 10b
	mov v6.s[0],w15
	mov v6.s[1],w14
	mov v6.s[2],w13
	mov v6.s[3],w12
#ifndef __AARCH64EB__
	rev32 v5.16b,v5.16b
#endif
	eor v7.16b,v7.16b,v6.16b
	mov x10,x3
	mov w11,#8
	mov w12,v7.s[0]
	mov w13,v7.s[1]
	mov w14,v7.s[2]
	mov w15,v7.s[3]
10:
	ldp w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor w6,w14,w15
	eor w9,w7,w13
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	eor w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor w6,w14,w15
	eor w9,w12,w8
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	ldp w7,w8,[x10],8
	eor w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor w6,w12,w13
	eor w9,w7,w15
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	eor w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor w6,w12,w13
	eor w9,w14,w8
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	eor w15,w15,w6
	subs w11,w11,#1
	b.ne 10b
	mov v7.s[0],w15
	mov v7.s[1],w14
	mov v7.s[2],w13
	mov v7.s[3],w12
#ifndef __AARCH64EB__
	rev32 v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32 v7.16b,v7.16b
#endif
	orr v3.16b,v7.16b,v7.16b
	st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
	subs w2,w2,#4
	b.ne .Lcbc_4_blocks_enc
	b 2f
1:
	subs w2,w2,#1
	b.lt 2f
	ld1 {v4.4s},[x0],#16
	eor v3.16b,v3.16b,v4.16b
#ifndef __AARCH64EB__
	rev32 v3.16b,v3.16b
#endif
	mov x10,x3
	mov w11,#8
	mov w12,v3.s[0]
	mov w13,v3.s[1]
	mov w14,v3.s[2]
	mov w15,v3.s[3]
10:
	ldp w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor w6,w14,w15
	eor w9,w7,w13
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	eor w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor w6,w14,w15
	eor w9,w12,w8
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	ldp w7,w8,[x10],8
	eor w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor w6,w12,w13
	eor w9,w7,w15
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	eor w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor w6,w12,w13
	eor w9,w14,w8
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	eor w15,w15,w6
	subs w11,w11,#1
	b.ne 10b
	mov v3.s[0],w15
	mov v3.s[1],w14
	mov v3.s[2],w13
	mov v3.s[3],w12
#ifndef __AARCH64EB__
	rev32 v3.16b,v3.16b
#endif
	st1 {v3.4s},[x1],#16
	b 1b
2:
	// save back IV
	st1 {v3.4s},[x4]
	ret

.Ldec:
	// decryption mode starts
	AARCH64_SIGN_LINK_REGISTER
	stp d8,d9,[sp,#-80]!
	stp d10,d11,[sp,#16]
	stp d12,d13,[sp,#32]
	stp d14,d15,[sp,#48]
	stp x29,x30,[sp,#64]
.Lcbc_8_blocks_dec:
	cmp w2,#8
	b.lt 1f
	ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[x0]
	add x10,x0,#64
	ld4 {v8.4s,v9.4s,v10.4s,v11.4s},[x10]
#ifndef __AARCH64EB__
	rev32 v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32 v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32 v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32 v7.16b,v7.16b
#endif
#ifndef __AARCH64EB__
	rev32 v8.16b,v8.16b
#endif
#ifndef __AARCH64EB__
	rev32 v9.16b,v9.16b
#endif
#ifndef __AARCH64EB__
	rev32 v10.16b,v10.16b
#endif
#ifndef __AARCH64EB__
	rev32 v11.16b,v11.16b
#endif
	bl _vpsm4_enc_8blks
	zip1 v8.4s,v0.4s,v1.4s
	zip2 v9.4s,v0.4s,v1.4s
	zip1 v10.4s,v2.4s,v3.4s
	zip2 v11.4s,v2.4s,v3.4s
	zip1 v0.2d,v8.2d,v10.2d
	zip2 v1.2d,v8.2d,v10.2d
	zip1 v2.2d,v9.2d,v11.2d
	zip2 v3.2d,v9.2d,v11.2d
	zip1 v8.4s,v4.4s,v5.4s
	zip2 v9.4s,v4.4s,v5.4s
	zip1 v10.4s,v6.4s,v7.4s
	zip2 v11.4s,v6.4s,v7.4s
	zip1 v4.2d,v8.2d,v10.2d
	zip2 v5.2d,v8.2d,v10.2d
	zip1 v6.2d,v9.2d,v11.2d
	zip2 v7.2d,v9.2d,v11.2d
	ld1 {v15.4s},[x4]
	ld1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
	// note ivec1 and vtmpx[3] are reusing the same register
	// care needs to be taken to avoid conflict
	eor v0.16b,v0.16b,v15.16b
	ld1 {v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64
	eor v1.16b,v1.16b,v8.16b
	eor v2.16b,v2.16b,v9.16b
	eor v3.16b,v3.16b,v10.16b
	// save back IV
	st1 {v15.4s}, [x4]
	eor v4.16b,v4.16b,v11.16b
	eor v5.16b,v5.16b,v12.16b
	eor v6.16b,v6.16b,v13.16b
	eor v7.16b,v7.16b,v14.16b
	st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
	st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
	subs w2,w2,#8
	b.gt .Lcbc_8_blocks_dec
	b.eq 100f
1:
	ld1 {v15.4s},[x4]
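	// CBC decrypt, four blocks per iteration: v15 carries the
	// chaining value (IV or previous ciphertext block)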
.Lcbc_4_blocks_dec:
	cmp w2,#4
	b.lt 1f
	ld4 {v4.4s,v5.4s,v6.4s,v7.4s},[x0]
#ifndef __AARCH64EB__
	rev32 v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32 v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32 v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32 v7.16b,v7.16b
#endif
	bl _vpsm4_enc_4blks
	ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
	zip1 v8.4s,v0.4s,v1.4s
	zip2 v9.4s,v0.4s,v1.4s
	zip1 v10.4s,v2.4s,v3.4s
	zip2 v11.4s,v2.4s,v3.4s
	zip1 v0.2d,v8.2d,v10.2d
	zip2 v1.2d,v8.2d,v10.2d
	zip1 v2.2d,v9.2d,v11.2d
	zip2 v3.2d,v9.2d,v11.2d
	eor v0.16b,v0.16b,v15.16b
	eor v1.16b,v1.16b,v4.16b
	orr v15.16b,v7.16b,v7.16b
	eor v2.16b,v2.16b,v5.16b
	eor v3.16b,v3.16b,v6.16b
	st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
	subs w2,w2,#4
	b.gt .Lcbc_4_blocks_dec
	// save back IV
	st1 {v7.4s}, [x4]
	b 100f
1:	// last block
	subs w2,w2,#1
	b.lt 100f
	b.gt 1f
	ld1 {v4.4s},[x0],#16
	// save back IV
	st1 {v4.4s}, [x4]
#ifndef __AARCH64EB__
	rev32 v8.16b,v4.16b
#else
	mov v8.16b,v4.16b
#endif
	mov x10,x3
	mov w11,#8
	mov w12,v8.s[0]
	mov w13,v8.s[1]
	mov w14,v8.s[2]
	mov w15,v8.s[3]
10:
	ldp w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor w6,w14,w15
	eor w9,w7,w13
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	eor w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor w6,w14,w15
	eor w9,w12,w8
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	ldp w7,w8,[x10],8
	eor w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor w6,w12,w13
	eor w9,w7,w15
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	eor w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor w6,w12,w13
	eor w9,w14,w8
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	eor w15,w15,w6
	subs w11,w11,#1
	b.ne 10b
	mov v8.s[0],w15
	mov v8.s[1],w14
	mov v8.s[2],w13
	mov v8.s[3],w12
#ifndef __AARCH64EB__
	rev32 v8.16b,v8.16b
#endif
	eor v8.16b,v8.16b,v15.16b
	st1 {v8.4s},[x1],#16
	b 100f
1:	// last two blocks
	ld4 {v4.s,v5.s,v6.s,v7.s}[0],[x0]
	add x10,x0,#16
	ld4 {v4.s,v5.s,v6.s,v7.s}[1],[x10],#16
	subs w2,w2,1
	b.gt 1f
#ifndef __AARCH64EB__
	rev32 v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32 v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32 v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32 v7.16b,v7.16b
#endif
	bl _vpsm4_enc_4blks
	ld1 {v4.4s,v5.4s},[x0],#32
	zip1 v8.4s,v0.4s,v1.4s
	zip2 v9.4s,v0.4s,v1.4s
	zip1 v10.4s,v2.4s,v3.4s
	zip2 v11.4s,v2.4s,v3.4s
	zip1 v0.2d,v8.2d,v10.2d
	zip2 v1.2d,v8.2d,v10.2d
	zip1 v2.2d,v9.2d,v11.2d
	zip2 v3.2d,v9.2d,v11.2d
	eor v0.16b,v0.16b,v15.16b
	eor v1.16b,v1.16b,v4.16b
	st1 {v0.4s,v1.4s},[x1],#32
	// save back IV
	st1 {v5.4s}, [x4]
	b 100f
1:	// last 3 blocks
	ld4 {v4.s,v5.s,v6.s,v7.s}[2],[x10]
#ifndef __AARCH64EB__
	rev32 v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32 v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32 v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32 v7.16b,v7.16b
#endif
	bl _vpsm4_enc_4blks
	ld1 {v4.4s,v5.4s,v6.4s},[x0],#48
	zip1 v8.4s,v0.4s,v1.4s
	zip2 v9.4s,v0.4s,v1.4s
	zip1 v10.4s,v2.4s,v3.4s
	zip2 v11.4s,v2.4s,v3.4s
	zip1 v0.2d,v8.2d,v10.2d
	zip2 v1.2d,v8.2d,v10.2d
	zip1 v2.2d,v9.2d,v11.2d
	zip2 v3.2d,v9.2d,v11.2d
	eor v0.16b,v0.16b,v15.16b
	eor v1.16b,v1.16b,v4.16b
	eor v2.16b,v2.16b,v5.16b
	st1 {v0.4s,v1.4s,v2.4s},[x1],#48
	// save back IV
	st1 {v6.4s}, [x4]
100:
	ldp d10,d11,[sp,#16]
	ldp d12,d13,[sp,#32]
	ldp d14,d15,[sp,#48]
	ldp x29,x30,[sp,#64]
	ldp d8,d9,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size vpsm4_cbc_encrypt,.-vpsm4_cbc_encrypt
.globl vpsm4_ctr32_encrypt_blocks
.type vpsm4_ctr32_encrypt_blocks,%function
.align 5
vpsm4_ctr32_encrypt_blocks:
	AARCH64_VALID_CALL_TARGET
	ld1 {v3.4s},[x4]
#ifndef __AARCH64EB__
	rev32 v3.16b,v3.16b
#endif
	adrp x10,.Lsbox
	add x10,x10,#:lo12:.Lsbox
	ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x10],#64
	ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x10],#64
	ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x10],#64
	ld1 {v28.16b,v29.16b,v30.16b,v31.16b},[x10]
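	// ctr32: only the last (big-endian) 32-bit word of the counter
	// block is incremented between blocks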
	cmp w2,#1
	b.ne 1f
	// fast path for a single block, avoiding the
	// context-saving overhead
	mov x10,x3
	mov w11,#8
	mov w12,v3.s[0]
	mov w13,v3.s[1]
	mov w14,v3.s[2]
	mov w15,v3.s[3]
10:
	ldp w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor w6,w14,w15
	eor w9,w7,w13
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	eor w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor w6,w14,w15
	eor w9,w12,w8
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	ldp w7,w8,[x10],8
	eor w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor w6,w12,w13
	eor w9,w7,w15
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	eor w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor w6,w12,w13
	eor w9,w14,w8
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	eor w15,w15,w6
	subs w11,w11,#1
	b.ne 10b
	mov v3.s[0],w15
	mov v3.s[1],w14
	mov v3.s[2],w13
	mov v3.s[3],w12
#ifndef __AARCH64EB__
	rev32 v3.16b,v3.16b
#endif
	ld1 {v4.4s},[x0]
	eor v4.16b,v4.16b,v3.16b
	st1 {v4.4s},[x1]
	ret
1:
	AARCH64_SIGN_LINK_REGISTER
	stp d8,d9,[sp,#-80]!
	stp d10,d11,[sp,#16]
	stp d12,d13,[sp,#32]
	stp d14,d15,[sp,#48]
	stp x29,x30,[sp,#64]
	mov w12,v3.s[0]
	mov w13,v3.s[1]
	mov w14,v3.s[2]
	mov w5,v3.s[3]
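// Four counter blocks are built directly in transposed (word-sliced)
// form: v4/v5/v6 broadcast the three invariant words of the counter
// block, while v7 collects four successive values of the 32-bit
// counter word kept in w5.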
.Lctr32_4_blocks_process:
	cmp w2,#4
	b.lt 1f
	dup v4.4s,w12
	dup v5.4s,w13
	dup v6.4s,w14
	mov v7.s[0],w5
	add w5,w5,#1
	mov v7.s[1],w5
	add w5,w5,#1
	mov v7.s[2],w5
	add w5,w5,#1
	mov v7.s[3],w5
	add w5,w5,#1
	cmp w2,#8
	b.ge .Lctr32_8_blocks_process
	bl _vpsm4_enc_4blks
	ld4 {v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64
	eor v0.16b,v0.16b,v12.16b
	eor v1.16b,v1.16b,v13.16b
	eor v2.16b,v2.16b,v14.16b
	eor v3.16b,v3.16b,v15.16b
	st4 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
	subs w2,w2,#4
	b.ne .Lctr32_4_blocks_process
	b 100f
.Lctr32_8_blocks_process:
	dup v8.4s,w12
	dup v9.4s,w13
	dup v10.4s,w14
	mov v11.s[0],w5
	add w5,w5,#1
	mov v11.s[1],w5
	add w5,w5,#1
	mov v11.s[2],w5
	add w5,w5,#1
	mov v11.s[3],w5
	add w5,w5,#1
	bl _vpsm4_enc_8blks
	ld4 {v12.4s,v13.4s,v14.4s,v15.4s},[x0],#64
	ld4 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
	eor v0.16b,v0.16b,v12.16b
	eor v1.16b,v1.16b,v13.16b
	eor v2.16b,v2.16b,v14.16b
	eor v3.16b,v3.16b,v15.16b
	eor v4.16b,v4.16b,v8.16b
	eor v5.16b,v5.16b,v9.16b
	eor v6.16b,v6.16b,v10.16b
	eor v7.16b,v7.16b,v11.16b
	st4 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
	st4 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
	subs w2,w2,#8
	b.ne .Lctr32_4_blocks_process
	b 100f
1:	// last block processing
	subs w2,w2,#1
	b.lt 100f
	b.gt 1f
	mov v3.s[0],w12
	mov v3.s[1],w13
	mov v3.s[2],w14
	mov v3.s[3],w5
	mov x10,x3
	mov w11,#8
	mov w12,v3.s[0]
	mov w13,v3.s[1]
	mov w14,v3.s[2]
	mov w15,v3.s[3]
10:
	ldp w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor w6,w14,w15
	eor w9,w7,w13
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	eor w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor w6,w14,w15
	eor w9,w12,w8
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	ldp w7,w8,[x10],8
	eor w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor w6,w12,w13
	eor w9,w7,w15
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	eor w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor w6,w12,w13
	eor w9,w14,w8
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	eor w15,w15,w6
	subs w11,w11,#1
	b.ne 10b
	mov v3.s[0],w15
	mov v3.s[1],w14
	mov v3.s[2],w13
	mov v3.s[3],w12
#ifndef __AARCH64EB__
	rev32 v3.16b,v3.16b
#endif
	ld1 {v4.4s},[x0]
	eor v4.16b,v4.16b,v3.16b
	st1 {v4.4s},[x1]
	b 100f
1:	// last 2 blocks processing
	dup v4.4s,w12
	dup v5.4s,w13
	dup v6.4s,w14
	mov v7.s[0],w5
	add w5,w5,#1
	mov v7.s[1],w5
	subs w2,w2,#1
	b.ne 1f
	bl _vpsm4_enc_4blks
	ld4 {v12.s,v13.s,v14.s,v15.s}[0],[x0],#16
	ld4 {v12.s,v13.s,v14.s,v15.s}[1],[x0],#16
	eor v0.16b,v0.16b,v12.16b
	eor v1.16b,v1.16b,v13.16b
	eor v2.16b,v2.16b,v14.16b
	eor v3.16b,v3.16b,v15.16b
	st4 {v0.s,v1.s,v2.s,v3.s}[0],[x1],#16
	st4 {v0.s,v1.s,v2.s,v3.s}[1],[x1],#16
	b 100f
1:	// last 3 blocks processing
	add w5,w5,#1
	mov v7.s[2],w5
	bl _vpsm4_enc_4blks
	ld4 {v12.s,v13.s,v14.s,v15.s}[0],[x0],#16
	ld4 {v12.s,v13.s,v14.s,v15.s}[1],[x0],#16
	ld4 {v12.s,v13.s,v14.s,v15.s}[2],[x0],#16
	eor v0.16b,v0.16b,v12.16b
	eor v1.16b,v1.16b,v13.16b
	eor v2.16b,v2.16b,v14.16b
	eor v3.16b,v3.16b,v15.16b
	st4 {v0.s,v1.s,v2.s,v3.s}[0],[x1],#16
	st4 {v0.s,v1.s,v2.s,v3.s}[1],[x1],#16
	st4 {v0.s,v1.s,v2.s,v3.s}[2],[x1],#16
100:
	ldp d10,d11,[sp,#16]
	ldp d12,d13,[sp,#32]
	ldp d14,d15,[sp,#48]
	ldp x29,x30,[sp,#64]
	ldp d8,d9,[sp],#80
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size vpsm4_ctr32_encrypt_blocks,.-vpsm4_ctr32_encrypt_blocks
.globl vpsm4_xts_encrypt_gb
.type vpsm4_xts_encrypt_gb,%function
.align 5
vpsm4_xts_encrypt_gb:
	AARCH64_SIGN_LINK_REGISTER
	stp x15, x16, [sp, #-0x10]!
	stp x17, x18, [sp, #-0x10]!
	stp x19, x20, [sp, #-0x10]!
	stp x21, x22, [sp, #-0x10]!
	stp x23, x24, [sp, #-0x10]!
	stp x25, x26, [sp, #-0x10]!
	stp x27, x28, [sp, #-0x10]!
	stp x29, x30, [sp, #-0x10]!
	stp d8, d9, [sp, #-0x10]!
	stp d10, d11, [sp, #-0x10]!
	stp d12, d13, [sp, #-0x10]!
	stp d14, d15, [sp, #-0x10]!
	mov x26,x3
	mov x27,x4
	mov w28,w6
	ld1 {v8.4s}, [x5]
	mov x3,x27
	adrp x10,.Lsbox
	add x10,x10,#:lo12:.Lsbox
	ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x10],#64
	ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x10],#64
	ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x10],#64
	ld1 {v28.16b,v29.16b,v30.16b,v31.16b},[x10]
#ifndef __AARCH64EB__
	rev32 v8.16b,v8.16b
#endif
	mov x10,x3
	mov w11,#8
	mov w12,v8.s[0]
	mov w13,v8.s[1]
	mov w14,v8.s[2]
	mov w15,v8.s[3]
10:
	ldp w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor w6,w14,w15
	eor w9,w7,w13
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	eor w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor w6,w14,w15
	eor w9,w12,w8
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	ldp w7,w8,[x10],8
	eor w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor w6,w12,w13
	eor w9,w7,w15
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	eor w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor w6,w12,w13
	eor w9,w14,w8
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	eor w15,w15,w6
	subs w11,w11,#1
	b.ne 10b
	mov v8.s[0],w15
	mov v8.s[1],w14
	mov v8.s[2],w13
	mov v8.s[3],w12
#ifndef __AARCH64EB__
	rev32 v8.16b,v8.16b
#endif
	mov x3,x26
	and x29,x2,#0x0F
	// convert length into blocks
	lsr x2,x2,4
	cmp x2,#1
	b.lt .return_gb

	cmp x29,0
	// If the length is a multiple of 16, all blocks are
	// processed in .xts_encrypt_blocks_gb
	b.eq .xts_encrypt_blocks_gb

	// Otherwise the last two blocks are handled in
	// .last_2blks_tweak_gb or .only_2blks_tweak_gb and the
	// remaining blocks in .xts_encrypt_blocks_gb
	subs x2,x2,#1
	b.eq .only_2blks_tweak_gb
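// Derive eight consecutive tweaks. The GB variant performs the tweak
// multiplication on the bit-reversed tweak, hence the rbit below;
// each doubling in GF(2^128) is then a 128-bit left shift (extr pair)
// with conditional reduction by the XTS constant 0x87.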
.xts_encrypt_blocks_gb:
	rbit v8.16b,v8.16b
#ifdef __AARCH64EB__
	rev32 v8.16b,v8.16b
#endif
	mov x12,v8.d[0]
	mov x13,v8.d[1]
	mov w7,0x87
	extr x9,x13,x13,#32
	extr x15,x13,x12,#63
	and w8,w7,w9,asr#31
	eor x14,x8,x12,lsl#1
	mov w7,0x87
	extr x9,x15,x15,#32
	extr x17,x15,x14,#63
	and w8,w7,w9,asr#31
	eor x16,x8,x14,lsl#1
	mov w7,0x87
	extr x9,x17,x17,#32
	extr x19,x17,x16,#63
	and w8,w7,w9,asr#31
	eor x18,x8,x16,lsl#1
	mov w7,0x87
	extr x9,x19,x19,#32
	extr x21,x19,x18,#63
	and w8,w7,w9,asr#31
	eor x20,x8,x18,lsl#1
	mov w7,0x87
	extr x9,x21,x21,#32
	extr x23,x21,x20,#63
	and w8,w7,w9,asr#31
	eor x22,x8,x20,lsl#1
	mov w7,0x87
	extr x9,x23,x23,#32
	extr x25,x23,x22,#63
	and w8,w7,w9,asr#31
	eor x24,x8,x22,lsl#1
	mov w7,0x87
	extr x9,x25,x25,#32
	extr x27,x25,x24,#63
	and w8,w7,w9,asr#31
	eor x26,x8,x24,lsl#1
.Lxts_8_blocks_process_gb:
	cmp x2,#8
	b.lt .Lxts_4_blocks_process_gb
	mov v0.d[0],x12
	mov v0.d[1],x13
#ifdef __AARCH64EB__
	rev32 v0.16b,v0.16b
#endif
	mov v1.d[0],x14
	mov v1.d[1],x15
#ifdef __AARCH64EB__
	rev32 v1.16b,v1.16b
#endif
	mov v2.d[0],x16
	mov v2.d[1],x17
#ifdef __AARCH64EB__
	rev32 v2.16b,v2.16b
#endif
	mov v3.d[0],x18
	mov v3.d[1],x19
#ifdef __AARCH64EB__
	rev32 v3.16b,v3.16b
#endif
	mov v12.d[0],x20
	mov v12.d[1],x21
#ifdef __AARCH64EB__
	rev32 v12.16b,v12.16b
#endif
	mov v13.d[0],x22
	mov v13.d[1],x23
#ifdef __AARCH64EB__
	rev32 v13.16b,v13.16b
#endif
	mov v14.d[0],x24
	mov v14.d[1],x25
#ifdef __AARCH64EB__
	rev32 v14.16b,v14.16b
#endif
	mov v15.d[0],x26
	mov v15.d[1],x27
#ifdef __AARCH64EB__
	rev32 v15.16b,v15.16b
#endif
	ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
	rbit v0.16b,v0.16b
	rbit v1.16b,v1.16b
	rbit v2.16b,v2.16b
	rbit v3.16b,v3.16b
	eor v4.16b, v4.16b, v0.16b
	eor v5.16b, v5.16b, v1.16b
	eor v6.16b, v6.16b, v2.16b
	eor v7.16b, v7.16b, v3.16b
	ld1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
	rbit v12.16b,v12.16b
	rbit v13.16b,v13.16b
	rbit v14.16b,v14.16b
	rbit v15.16b,v15.16b
	eor v8.16b, v8.16b, v12.16b
	eor v9.16b, v9.16b, v13.16b
	eor v10.16b, v10.16b, v14.16b
	eor v11.16b, v11.16b, v15.16b
#ifndef __AARCH64EB__
	rev32 v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32 v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32 v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32 v7.16b,v7.16b
#endif
#ifndef __AARCH64EB__
	rev32 v8.16b,v8.16b
#endif
#ifndef __AARCH64EB__
	rev32 v9.16b,v9.16b
#endif
#ifndef __AARCH64EB__
	rev32 v10.16b,v10.16b
#endif
#ifndef __AARCH64EB__
	rev32 v11.16b,v11.16b
#endif
	zip1 v0.4s,v4.4s,v5.4s
	zip2 v1.4s,v4.4s,v5.4s
	zip1 v2.4s,v6.4s,v7.4s
	zip2 v3.4s,v6.4s,v7.4s
	zip1 v4.2d,v0.2d,v2.2d
	zip2 v5.2d,v0.2d,v2.2d
	zip1 v6.2d,v1.2d,v3.2d
	zip2 v7.2d,v1.2d,v3.2d
	zip1 v0.4s,v8.4s,v9.4s
	zip2 v1.4s,v8.4s,v9.4s
	zip1 v2.4s,v10.4s,v11.4s
	zip2 v3.4s,v10.4s,v11.4s
	zip1 v8.2d,v0.2d,v2.2d
	zip2 v9.2d,v0.2d,v2.2d
	zip1 v10.2d,v1.2d,v3.2d
	zip2 v11.2d,v1.2d,v3.2d
	bl _vpsm4_enc_8blks
	zip1 v8.4s,v0.4s,v1.4s
	zip2 v9.4s,v0.4s,v1.4s
	zip1 v10.4s,v2.4s,v3.4s
	zip2 v11.4s,v2.4s,v3.4s
	zip1 v0.2d,v8.2d,v10.2d
	zip2 v1.2d,v8.2d,v10.2d
	zip1 v2.2d,v9.2d,v11.2d
	zip2 v3.2d,v9.2d,v11.2d
	zip1 v8.4s,v4.4s,v5.4s
	zip2 v9.4s,v4.4s,v5.4s
	zip1 v10.4s,v6.4s,v7.4s
	zip2 v11.4s,v6.4s,v7.4s
	zip1 v4.2d,v8.2d,v10.2d
	zip2 v5.2d,v8.2d,v10.2d
	zip1 v6.2d,v9.2d,v11.2d
	zip2 v7.2d,v9.2d,v11.2d
	mov v12.d[0],x12
	mov v12.d[1],x13
#ifdef __AARCH64EB__
	rev32 v12.16b,v12.16b
#endif
	mov w7,0x87
	extr x9,x27,x27,#32
	extr x13,x27,x26,#63
	and w8,w7,w9,asr#31
	eor x12,x8,x26,lsl#1
	mov v13.d[0],x14
	mov v13.d[1],x15
#ifdef __AARCH64EB__
	rev32 v13.16b,v13.16b
#endif
	mov w7,0x87
	extr x9,x13,x13,#32
	extr x15,x13,x12,#63
	and w8,w7,w9,asr#31
	eor x14,x8,x12,lsl#1
	mov v14.d[0],x16
	mov v14.d[1],x17
#ifdef __AARCH64EB__
	rev32 v14.16b,v14.16b
#endif
	mov w7,0x87
	extr x9,x15,x15,#32
	extr x17,x15,x14,#63
	and w8,w7,w9,asr#31
	eor x16,x8,x14,lsl#1
	mov v15.d[0],x18
	mov v15.d[1],x19
#ifdef __AARCH64EB__
	rev32 v15.16b,v15.16b
#endif
	mov w7,0x87
	extr x9,x17,x17,#32
	extr x19,x17,x16,#63
	and w8,w7,w9,asr#31
	eor x18,x8,x16,lsl#1
	mov v8.d[0],x20
	mov v8.d[1],x21
#ifdef __AARCH64EB__
	rev32 v8.16b,v8.16b
#endif
	mov w7,0x87
	extr x9,x19,x19,#32
	extr x21,x19,x18,#63
	and w8,w7,w9,asr#31
	eor x20,x8,x18,lsl#1
	mov v9.d[0],x22
	mov v9.d[1],x23
#ifdef __AARCH64EB__
	rev32 v9.16b,v9.16b
#endif
	mov w7,0x87
	extr x9,x21,x21,#32
	extr x23,x21,x20,#63
	and w8,w7,w9,asr#31
	eor x22,x8,x20,lsl#1
	mov v10.d[0],x24
	mov v10.d[1],x25
#ifdef __AARCH64EB__
	rev32 v10.16b,v10.16b
#endif
	mov w7,0x87
	extr x9,x23,x23,#32
	extr x25,x23,x22,#63
	and w8,w7,w9,asr#31
	eor x24,x8,x22,lsl#1
	mov v11.d[0],x26
	mov v11.d[1],x27
#ifdef __AARCH64EB__
	rev32 v11.16b,v11.16b
#endif
	mov w7,0x87
	extr x9,x25,x25,#32
	extr x27,x25,x24,#63
	and w8,w7,w9,asr#31
	eor x26,x8,x24,lsl#1
	eor v0.16b, v0.16b, v12.16b
	eor v1.16b, v1.16b, v13.16b
	eor v2.16b, v2.16b, v14.16b
	eor v3.16b, v3.16b, v15.16b
	eor v4.16b, v4.16b, v8.16b
	eor v5.16b, v5.16b, v9.16b
	eor v6.16b, v6.16b, v10.16b
	eor v7.16b, v7.16b, v11.16b

	// save the last tweak
	st1 {v11.4s},[x5]
	st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
	st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
	subs x2,x2,#8
	b.gt .Lxts_8_blocks_process_gb
	b 100f
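// _vpsm4_enc_4blks/_vpsm4_enc_8blks work on word-sliced state, so
// blocks loaded with ld1 are transposed into that layout with the
// zip1/zip2 pairs before the call and transposed back afterwards.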
.Lxts_4_blocks_process_gb:
	mov v8.d[0],x12
	mov v8.d[1],x13
#ifdef __AARCH64EB__
	rev32 v8.16b,v8.16b
#endif
	mov v9.d[0],x14
	mov v9.d[1],x15
#ifdef __AARCH64EB__
	rev32 v9.16b,v9.16b
#endif
	mov v10.d[0],x16
	mov v10.d[1],x17
#ifdef __AARCH64EB__
	rev32 v10.16b,v10.16b
#endif
	mov v11.d[0],x18
	mov v11.d[1],x19
#ifdef __AARCH64EB__
	rev32 v11.16b,v11.16b
#endif
	cmp x2,#4
	b.lt 1f
	ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
	rbit v8.16b,v8.16b
	rbit v9.16b,v9.16b
	rbit v10.16b,v10.16b
	rbit v11.16b,v11.16b
	eor v4.16b, v4.16b, v8.16b
	eor v5.16b, v5.16b, v9.16b
	eor v6.16b, v6.16b, v10.16b
	eor v7.16b, v7.16b, v11.16b
#ifndef __AARCH64EB__
	rev32 v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32 v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32 v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32 v7.16b,v7.16b
#endif
	zip1 v0.4s,v4.4s,v5.4s
	zip2 v1.4s,v4.4s,v5.4s
	zip1 v2.4s,v6.4s,v7.4s
	zip2 v3.4s,v6.4s,v7.4s
	zip1 v4.2d,v0.2d,v2.2d
	zip2 v5.2d,v0.2d,v2.2d
	zip1 v6.2d,v1.2d,v3.2d
	zip2 v7.2d,v1.2d,v3.2d
	bl _vpsm4_enc_4blks
	zip1 v4.4s,v0.4s,v1.4s
	zip2 v5.4s,v0.4s,v1.4s
	zip1 v6.4s,v2.4s,v3.4s
	zip2 v7.4s,v2.4s,v3.4s
	zip1 v0.2d,v4.2d,v6.2d
	zip2 v1.2d,v4.2d,v6.2d
	zip1 v2.2d,v5.2d,v7.2d
	zip2 v3.2d,v5.2d,v7.2d
	eor v0.16b, v0.16b, v8.16b
	eor v1.16b, v1.16b, v9.16b
	eor v2.16b, v2.16b, v10.16b
	eor v3.16b, v3.16b, v11.16b
	st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
	sub x2,x2,#4
	mov v8.d[0],x20
	mov v8.d[1],x21
#ifdef __AARCH64EB__
	rev32 v8.16b,v8.16b
#endif
	mov v9.d[0],x22
	mov v9.d[1],x23
#ifdef __AARCH64EB__
	rev32 v9.16b,v9.16b
#endif
	mov v10.d[0],x24
	mov v10.d[1],x25
#ifdef __AARCH64EB__
	rev32 v10.16b,v10.16b
#endif
	// save the last tweak
	st1 {v11.4s},[x5]
1:
	// process last block
	cmp x2,#1
	b.lt 100f
	b.gt 1f
	ld1 {v4.4s},[x0],#16
	rbit v8.16b,v8.16b
	eor v4.16b, v4.16b, v8.16b
#ifndef __AARCH64EB__
	rev32 v4.16b,v4.16b
#endif
	mov x10,x3
	mov w11,#8
	mov w12,v4.s[0]
	mov w13,v4.s[1]
	mov w14,v4.s[2]
	mov w15,v4.s[3]
10:
	ldp w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor w6,w14,w15
	eor w9,w7,w13
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	eor w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor w6,w14,w15
	eor w9,w12,w8
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	ldp w7,w8,[x10],8
	eor w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor w6,w12,w13
	eor w9,w7,w15
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	eor w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor w6,w12,w13
	eor w9,w14,w8
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	eor w15,w15,w6
	subs w11,w11,#1
	b.ne 10b
	mov v4.s[0],w15
	mov v4.s[1],w14
	mov v4.s[2],w13
	mov v4.s[3],w12
#ifndef __AARCH64EB__
	rev32 v4.16b,v4.16b
#endif
	eor v4.16b, v4.16b, v8.16b
	st1 {v4.4s},[x1],#16
	// save the last tweak
	st1 {v8.4s},[x5]
	b 100f
1:	// process last 2 blocks
	cmp x2,#2
	b.gt 1f
	ld1 {v4.4s,v5.4s},[x0],#32
	rbit v8.16b,v8.16b
	rbit v9.16b,v9.16b
	eor v4.16b, v4.16b, v8.16b
	eor v5.16b, v5.16b, v9.16b
#ifndef __AARCH64EB__
	rev32 v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32 v5.16b,v5.16b
#endif
	zip1 v0.4s,v4.4s,v5.4s
	zip2 v1.4s,v4.4s,v5.4s
	zip1 v2.4s,v6.4s,v7.4s
	zip2 v3.4s,v6.4s,v7.4s
	zip1 v4.2d,v0.2d,v2.2d
	zip2 v5.2d,v0.2d,v2.2d
	zip1 v6.2d,v1.2d,v3.2d
	zip2 v7.2d,v1.2d,v3.2d
	bl _vpsm4_enc_4blks
	zip1 v4.4s,v0.4s,v1.4s
	zip2 v5.4s,v0.4s,v1.4s
	zip1 v6.4s,v2.4s,v3.4s
	zip2 v7.4s,v2.4s,v3.4s
	zip1 v0.2d,v4.2d,v6.2d
	zip2 v1.2d,v4.2d,v6.2d
	zip1 v2.2d,v5.2d,v7.2d
	zip2 v3.2d,v5.2d,v7.2d
	eor v0.16b, v0.16b, v8.16b
	eor v1.16b, v1.16b, v9.16b
	st1 {v0.4s,v1.4s},[x1],#32
	// save the last tweak
	st1 {v9.4s},[x5]
	b 100f
1:	// process last 3 blocks
	ld1 {v4.4s,v5.4s,v6.4s},[x0],#48
	rbit v8.16b,v8.16b
	rbit v9.16b,v9.16b
	rbit v10.16b,v10.16b
	eor v4.16b, v4.16b, v8.16b
	eor v5.16b, v5.16b, v9.16b
	eor v6.16b, v6.16b, v10.16b
#ifndef __AARCH64EB__
	rev32 v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32 v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32 v6.16b,v6.16b
#endif
	zip1 v0.4s,v4.4s,v5.4s
	zip2 v1.4s,v4.4s,v5.4s
	zip1 v2.4s,v6.4s,v7.4s
	zip2 v3.4s,v6.4s,v7.4s
	zip1 v4.2d,v0.2d,v2.2d
	zip2 v5.2d,v0.2d,v2.2d
	zip1 v6.2d,v1.2d,v3.2d
	zip2 v7.2d,v1.2d,v3.2d
	bl _vpsm4_enc_4blks
	zip1 v4.4s,v0.4s,v1.4s
	zip2 v5.4s,v0.4s,v1.4s
	zip1 v6.4s,v2.4s,v3.4s
	zip2 v7.4s,v2.4s,v3.4s
	zip1 v0.2d,v4.2d,v6.2d
	zip2 v1.2d,v4.2d,v6.2d
	zip1 v2.2d,v5.2d,v7.2d
	zip2 v3.2d,v5.2d,v7.2d
	eor v0.16b, v0.16b, v8.16b
	eor v1.16b, v1.16b, v9.16b
	eor v2.16b, v2.16b, v10.16b
	st1 {v0.4s,v1.4s,v2.4s},[x1],#48
	// save the last tweak
	st1 {v10.4s},[x5]
100:
	cmp x29,0
	b.eq .return_gb

// This branch computes the last two tweaks when the
// encryption/decryption length is larger than 32
.last_2blks_tweak_gb:
	ld1 {v8.4s},[x5]
#ifdef __AARCH64EB__
	rev32 v8.16b,v8.16b
#endif
	rbit v2.16b,v8.16b
	adrp x10,.Lxts_magic
	ldr q0, [x10, #:lo12:.Lxts_magic]
	shl v9.16b, v2.16b, #1
	ext v1.16b, v2.16b, v2.16b,#15
	ushr v1.16b, v1.16b, #7
	mul v1.16b, v1.16b, v0.16b
	eor v9.16b, v9.16b, v1.16b
	rbit v9.16b,v9.16b
	rbit v2.16b,v9.16b
	adrp x10,.Lxts_magic
	ldr q0, [x10, #:lo12:.Lxts_magic]
	shl v10.16b, v2.16b, #1
	ext v1.16b, v2.16b, v2.16b,#15
	ushr v1.16b, v1.16b, #7
	mul v1.16b, v1.16b, v0.16b
	eor v10.16b, v10.16b, v1.16b
	rbit v10.16b,v10.16b
	b .check_dec_gb


// This branch computes the last two tweaks when the
// encryption/decryption length is exactly 32, which needs only these two tweaks
.only_2blks_tweak_gb:
	mov v9.16b,v8.16b
#ifdef __AARCH64EB__
	rev32 v9.16b,v9.16b
#endif
	rbit v2.16b,v9.16b
	adrp x10,.Lxts_magic
	ldr q0, [x10, #:lo12:.Lxts_magic]
	shl v10.16b, v2.16b, #1
	ext v1.16b, v2.16b, v2.16b,#15
	ushr v1.16b, v1.16b, #7
	mul v1.16b, v1.16b, v0.16b
	eor v10.16b, v10.16b, v1.16b
	rbit v10.16b,v10.16b
	b .check_dec_gb


// Determine whether encryption or decryption is required.
// The last two tweaks need to be swapped for decryption.
.check_dec_gb:
	// encryption:1 decryption:0
	cmp w28,1
	b.eq .process_last_2blks_gb
	mov v0.16B,v9.16b
	mov v9.16B,v10.16b
	mov v10.16B,v0.16b

.process_last_2blks_gb:
#ifdef __AARCH64EB__
	rev32 v9.16b,v9.16b
#endif
#ifdef __AARCH64EB__
	rev32 v10.16b,v10.16b
#endif
	ld1 {v4.4s},[x0],#16
	eor v4.16b, v4.16b, v9.16b
#ifndef __AARCH64EB__
	rev32 v4.16b,v4.16b
#endif
	mov x10,x3
	mov w11,#8
	mov w12,v4.s[0]
	mov w13,v4.s[1]
	mov w14,v4.s[2]
	mov w15,v4.s[3]
10:
	ldp w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor w6,w14,w15
	eor w9,w7,w13
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	eor w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor w6,w14,w15
	eor w9,w12,w8
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	ldp w7,w8,[x10],8
	eor w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor w6,w12,w13
	eor w9,w7,w15
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	eor w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor w6,w12,w13
	eor w9,w14,w8
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	eor w15,w15,w6
	subs w11,w11,#1
	b.ne 10b
	mov v4.s[0],w15
	mov v4.s[1],w14
	mov v4.s[2],w13
	mov v4.s[3],w12
#ifndef __AARCH64EB__
	rev32 v4.16b,v4.16b
#endif
	eor v4.16b, v4.16b, v9.16b
	st1 {v4.4s},[x1],#16

	sub x26,x1,16
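// Ciphertext stealing: swap the trailing x29 bytes of the last full
// ciphertext block (just written at x26) with the remaining input
// bytes, then re-encrypt that block under the final tweak v10.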
.loop_gb:
	subs x29,x29,1
	ldrb w7,[x26,x29]
	ldrb w8,[x0,x29]
	strb w8,[x26,x29]
	strb w7,[x1,x29]
	b.gt .loop_gb
	ld1 {v4.4s}, [x26]
	eor v4.16b, v4.16b, v10.16b
#ifndef __AARCH64EB__
	rev32 v4.16b,v4.16b
#endif
	mov x10,x3
	mov w11,#8
	mov w12,v4.s[0]
	mov w13,v4.s[1]
	mov w14,v4.s[2]
	mov w15,v4.s[3]
10:
	ldp w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor w6,w14,w15
	eor w9,w7,w13
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	eor w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor w6,w14,w15
	eor w9,w12,w8
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	ldp w7,w8,[x10],8
	eor w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor w6,w12,w13
	eor w9,w7,w15
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	eor w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor w6,w12,w13
	eor w9,w14,w8
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	eor w15,w15,w6
	subs w11,w11,#1
	b.ne 10b
	mov v4.s[0],w15
	mov v4.s[1],w14
	mov v4.s[2],w13
	mov v4.s[3],w12
#ifndef __AARCH64EB__
	rev32 v4.16b,v4.16b
#endif
	eor v4.16b, v4.16b, v10.16b
	st1 {v4.4s}, [x26]
.return_gb:
	ldp d14, d15, [sp], #0x10
	ldp d12, d13, [sp], #0x10
	ldp d10, d11, [sp], #0x10
	ldp d8, d9, [sp], #0x10
	ldp x29, x30, [sp], #0x10
	ldp x27, x28, [sp], #0x10
	ldp x25, x26, [sp], #0x10
	ldp x23, x24, [sp], #0x10
	ldp x21, x22, [sp], #0x10
	ldp x19, x20, [sp], #0x10
	ldp x17, x18, [sp], #0x10
	ldp x15, x16, [sp], #0x10
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size vpsm4_xts_encrypt_gb,.-vpsm4_xts_encrypt_gb
.globl vpsm4_xts_encrypt
.type vpsm4_xts_encrypt,%function
.align 5
vpsm4_xts_encrypt:
	AARCH64_SIGN_LINK_REGISTER
	stp x15, x16, [sp, #-0x10]!
	stp x17, x18, [sp, #-0x10]!
	stp x19, x20, [sp, #-0x10]!
	stp x21, x22, [sp, #-0x10]!
	stp x23, x24, [sp, #-0x10]!
	stp x25, x26, [sp, #-0x10]!
	stp x27, x28, [sp, #-0x10]!
	stp x29, x30, [sp, #-0x10]!
	stp d8, d9, [sp, #-0x10]!
	stp d10, d11, [sp, #-0x10]!
	stp d12, d13, [sp, #-0x10]!
	stp d14, d15, [sp, #-0x10]!
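// x26/x27 preserve the data and tweak key schedules (x3/x4) across
// the body, and w28 records the enc/dec flag (w6). x3 is pointed at
// the tweak key schedule first, so the rounds below encrypt the IV
// into the initial tweak; it is restored to the data key schedule
// afterwards.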
	mov x26,x3
	mov x27,x4
	mov w28,w6
	ld1 {v8.4s}, [x5]
	mov x3,x27
	adrp x10,.Lsbox
	add x10,x10,#:lo12:.Lsbox
	ld1 {v16.16b,v17.16b,v18.16b,v19.16b},[x10],#64
	ld1 {v20.16b,v21.16b,v22.16b,v23.16b},[x10],#64
	ld1 {v24.16b,v25.16b,v26.16b,v27.16b},[x10],#64
	ld1 {v28.16b,v29.16b,v30.16b,v31.16b},[x10]
#ifndef __AARCH64EB__
	rev32 v8.16b,v8.16b
#endif
	mov x10,x3
	mov w11,#8
	mov w12,v8.s[0]
	mov w13,v8.s[1]
	mov w14,v8.s[2]
	mov w15,v8.s[3]
10:
	ldp w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor w6,w14,w15
	eor w9,w7,w13
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	eor w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor w6,w14,w15
	eor w9,w12,w8
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	ldp w7,w8,[x10],8
	eor w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor w6,w12,w13
	eor w9,w7,w15
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	eor w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor w6,w12,w13
	eor w9,w14,w8
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	eor w15,w15,w6
	subs w11,w11,#1
	b.ne 10b
	mov v8.s[0],w15
	mov v8.s[1],w14
	mov v8.s[2],w13
	mov v8.s[3],w12
#ifndef __AARCH64EB__
	rev32 v8.16b,v8.16b
#endif
	mov x3,x26
	and x29,x2,#0x0F
	// convert length into blocks
	lsr x2,x2,4
	cmp x2,#1
	b.lt .return

	cmp x29,0
	// If the length is a multiple of 16, all blocks are
	// processed in .xts_encrypt_blocks
	b.eq .xts_encrypt_blocks

	// Otherwise the last two blocks are handled in
	// .last_2blks_tweak or .only_2blks_tweak and the
	// remaining blocks in .xts_encrypt_blocks
	subs x2,x2,#1
	b.eq .only_2blks_tweak
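// Derive eight consecutive tweaks by doubling in GF(2^128): each
// extr/and/eor group shifts the 128-bit tweak left by one bit and
// reduces with the XTS polynomial constant 0x87 when the top bit
// was set.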
.xts_encrypt_blocks:
#ifdef __AARCH64EB__
	rev32 v8.16b,v8.16b
#endif
	mov x12,v8.d[0]
	mov x13,v8.d[1]
	mov w7,0x87
	extr x9,x13,x13,#32
	extr x15,x13,x12,#63
	and w8,w7,w9,asr#31
	eor x14,x8,x12,lsl#1
	mov w7,0x87
	extr x9,x15,x15,#32
	extr x17,x15,x14,#63
	and w8,w7,w9,asr#31
	eor x16,x8,x14,lsl#1
	mov w7,0x87
	extr x9,x17,x17,#32
	extr x19,x17,x16,#63
	and w8,w7,w9,asr#31
	eor x18,x8,x16,lsl#1
	mov w7,0x87
	extr x9,x19,x19,#32
	extr x21,x19,x18,#63
	and w8,w7,w9,asr#31
	eor x20,x8,x18,lsl#1
	mov w7,0x87
	extr x9,x21,x21,#32
	extr x23,x21,x20,#63
	and w8,w7,w9,asr#31
	eor x22,x8,x20,lsl#1
	mov w7,0x87
	extr x9,x23,x23,#32
	extr x25,x23,x22,#63
	and w8,w7,w9,asr#31
	eor x24,x8,x22,lsl#1
	mov w7,0x87
	extr x9,x25,x25,#32
	extr x27,x25,x24,#63
	and w8,w7,w9,asr#31
	eor x26,x8,x24,lsl#1
.Lxts_8_blocks_process:
	cmp x2,#8
	b.lt .Lxts_4_blocks_process
	mov v0.d[0],x12
	mov v0.d[1],x13
#ifdef __AARCH64EB__
	rev32 v0.16b,v0.16b
#endif
	mov v1.d[0],x14
	mov v1.d[1],x15
#ifdef __AARCH64EB__
	rev32 v1.16b,v1.16b
#endif
	mov v2.d[0],x16
	mov v2.d[1],x17
#ifdef __AARCH64EB__
	rev32 v2.16b,v2.16b
#endif
	mov v3.d[0],x18
	mov v3.d[1],x19
#ifdef __AARCH64EB__
	rev32 v3.16b,v3.16b
#endif
	mov v12.d[0],x20
	mov v12.d[1],x21
#ifdef __AARCH64EB__
	rev32 v12.16b,v12.16b
#endif
	mov v13.d[0],x22
	mov v13.d[1],x23
#ifdef __AARCH64EB__
	rev32 v13.16b,v13.16b
#endif
	mov v14.d[0],x24
	mov v14.d[1],x25
#ifdef __AARCH64EB__
	rev32 v14.16b,v14.16b
#endif
	mov v15.d[0],x26
	mov v15.d[1],x27
#ifdef __AARCH64EB__
	rev32 v15.16b,v15.16b
#endif
	ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
	eor v4.16b, v4.16b, v0.16b
	eor v5.16b, v5.16b, v1.16b
	eor v6.16b, v6.16b, v2.16b
	eor v7.16b, v7.16b, v3.16b
	ld1 {v8.4s,v9.4s,v10.4s,v11.4s},[x0],#64
	eor v8.16b, v8.16b, v12.16b
	eor v9.16b, v9.16b, v13.16b
	eor v10.16b, v10.16b, v14.16b
	eor v11.16b, v11.16b, v15.16b
#ifndef __AARCH64EB__
	rev32 v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32 v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32 v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32 v7.16b,v7.16b
#endif
#ifndef __AARCH64EB__
	rev32 v8.16b,v8.16b
#endif
#ifndef __AARCH64EB__
	rev32 v9.16b,v9.16b
#endif
#ifndef __AARCH64EB__
	rev32 v10.16b,v10.16b
#endif
#ifndef __AARCH64EB__
	rev32 v11.16b,v11.16b
#endif
	zip1 v0.4s,v4.4s,v5.4s
	zip2 v1.4s,v4.4s,v5.4s
	zip1 v2.4s,v6.4s,v7.4s
	zip2 v3.4s,v6.4s,v7.4s
	zip1 v4.2d,v0.2d,v2.2d
	zip2 v5.2d,v0.2d,v2.2d
	zip1 v6.2d,v1.2d,v3.2d
	zip2 v7.2d,v1.2d,v3.2d
	zip1 v0.4s,v8.4s,v9.4s
	zip2 v1.4s,v8.4s,v9.4s
	zip1 v2.4s,v10.4s,v11.4s
	zip2 v3.4s,v10.4s,v11.4s
	zip1 v8.2d,v0.2d,v2.2d
	zip2 v9.2d,v0.2d,v2.2d
	zip1 v10.2d,v1.2d,v3.2d
	zip2 v11.2d,v1.2d,v3.2d
	bl _vpsm4_enc_8blks
	zip1 v8.4s,v0.4s,v1.4s
	zip2 v9.4s,v0.4s,v1.4s
	zip1 v10.4s,v2.4s,v3.4s
	zip2 v11.4s,v2.4s,v3.4s
	zip1 v0.2d,v8.2d,v10.2d
	zip2 v1.2d,v8.2d,v10.2d
	zip1 v2.2d,v9.2d,v11.2d
	zip2 v3.2d,v9.2d,v11.2d
	zip1 v8.4s,v4.4s,v5.4s
	zip2 v9.4s,v4.4s,v5.4s
	zip1 v10.4s,v6.4s,v7.4s
	zip2 v11.4s,v6.4s,v7.4s
	zip1 v4.2d,v8.2d,v10.2d
	zip2 v5.2d,v8.2d,v10.2d
	zip1 v6.2d,v9.2d,v11.2d
	zip2 v7.2d,v9.2d,v11.2d
	mov v12.d[0],x12
	mov v12.d[1],x13
#ifdef __AARCH64EB__
	rev32 v12.16b,v12.16b
#endif
	mov w7,0x87
	extr x9,x27,x27,#32
	extr x13,x27,x26,#63
	and w8,w7,w9,asr#31
	eor x12,x8,x26,lsl#1
	mov v13.d[0],x14
	mov v13.d[1],x15
#ifdef __AARCH64EB__
	rev32 v13.16b,v13.16b
#endif
	mov w7,0x87
	extr x9,x13,x13,#32
	extr x15,x13,x12,#63
	and w8,w7,w9,asr#31
	eor x14,x8,x12,lsl#1
	mov v14.d[0],x16
	mov v14.d[1],x17
#ifdef __AARCH64EB__
	rev32 v14.16b,v14.16b
#endif
	mov w7,0x87
	extr x9,x15,x15,#32
	extr x17,x15,x14,#63
	and w8,w7,w9,asr#31
	eor x16,x8,x14,lsl#1
	mov v15.d[0],x18
	mov v15.d[1],x19
#ifdef __AARCH64EB__
	rev32 v15.16b,v15.16b
#endif
	mov w7,0x87
	extr x9,x17,x17,#32
	extr x19,x17,x16,#63
	and w8,w7,w9,asr#31
	eor x18,x8,x16,lsl#1
	mov v8.d[0],x20
	mov v8.d[1],x21
#ifdef __AARCH64EB__
	rev32 v8.16b,v8.16b
#endif
	mov w7,0x87
	extr x9,x19,x19,#32
	extr x21,x19,x18,#63
	and w8,w7,w9,asr#31
	eor x20,x8,x18,lsl#1
	mov v9.d[0],x22
	mov v9.d[1],x23
#ifdef __AARCH64EB__
	rev32 v9.16b,v9.16b
#endif
	mov w7,0x87
	extr x9,x21,x21,#32
	extr x23,x21,x20,#63
	and w8,w7,w9,asr#31
	eor x22,x8,x20,lsl#1
	mov v10.d[0],x24
	mov v10.d[1],x25
#ifdef __AARCH64EB__
	rev32 v10.16b,v10.16b
#endif
	mov w7,0x87
	extr x9,x23,x23,#32
	extr x25,x23,x22,#63
	and w8,w7,w9,asr#31
	eor x24,x8,x22,lsl#1
	mov v11.d[0],x26
	mov v11.d[1],x27
#ifdef __AARCH64EB__
	rev32 v11.16b,v11.16b
#endif
	mov w7,0x87
	extr x9,x25,x25,#32
	extr x27,x25,x24,#63
	and w8,w7,w9,asr#31
	eor x26,x8,x24,lsl#1
	eor v0.16b, v0.16b, v12.16b
	eor v1.16b, v1.16b, v13.16b
	eor v2.16b, v2.16b, v14.16b
	eor v3.16b, v3.16b, v15.16b
	eor v4.16b, v4.16b, v8.16b
	eor v5.16b, v5.16b, v9.16b
	eor v6.16b, v6.16b, v10.16b
	eor v7.16b, v7.16b, v11.16b

	// save the last tweak
	st1 {v11.4s},[x5]
	st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
	st1 {v4.4s,v5.4s,v6.4s,v7.4s},[x1],#64
	subs x2,x2,#8
	b.gt .Lxts_8_blocks_process
	b 100f
.Lxts_4_blocks_process:
	mov v8.d[0],x12
	mov v8.d[1],x13
#ifdef __AARCH64EB__
	rev32 v8.16b,v8.16b
#endif
	mov v9.d[0],x14
	mov v9.d[1],x15
#ifdef __AARCH64EB__
	rev32 v9.16b,v9.16b
#endif
	mov v10.d[0],x16
	mov v10.d[1],x17
#ifdef __AARCH64EB__
	rev32 v10.16b,v10.16b
#endif
	mov v11.d[0],x18
	mov v11.d[1],x19
#ifdef __AARCH64EB__
	rev32 v11.16b,v11.16b
#endif
	cmp x2,#4
	b.lt 1f
	ld1 {v4.4s,v5.4s,v6.4s,v7.4s},[x0],#64
	eor v4.16b, v4.16b, v8.16b
	eor v5.16b, v5.16b, v9.16b
	eor v6.16b, v6.16b, v10.16b
	eor v7.16b, v7.16b, v11.16b
#ifndef __AARCH64EB__
	rev32 v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32 v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32 v6.16b,v6.16b
#endif
#ifndef __AARCH64EB__
	rev32 v7.16b,v7.16b
#endif
	zip1 v0.4s,v4.4s,v5.4s
	zip2 v1.4s,v4.4s,v5.4s
	zip1 v2.4s,v6.4s,v7.4s
	zip2 v3.4s,v6.4s,v7.4s
	zip1 v4.2d,v0.2d,v2.2d
	zip2 v5.2d,v0.2d,v2.2d
	zip1 v6.2d,v1.2d,v3.2d
	zip2 v7.2d,v1.2d,v3.2d
	bl _vpsm4_enc_4blks
	zip1 v4.4s,v0.4s,v1.4s
	zip2 v5.4s,v0.4s,v1.4s
	zip1 v6.4s,v2.4s,v3.4s
	zip2 v7.4s,v2.4s,v3.4s
	zip1 v0.2d,v4.2d,v6.2d
	zip2 v1.2d,v4.2d,v6.2d
	zip1 v2.2d,v5.2d,v7.2d
	zip2 v3.2d,v5.2d,v7.2d
	eor v0.16b, v0.16b, v8.16b
	eor v1.16b, v1.16b, v9.16b
	eor v2.16b, v2.16b, v10.16b
	eor v3.16b, v3.16b, v11.16b
	st1 {v0.4s,v1.4s,v2.4s,v3.4s},[x1],#64
	sub x2,x2,#4
	mov v8.d[0],x20
	mov v8.d[1],x21
#ifdef __AARCH64EB__
	rev32 v8.16b,v8.16b
#endif
	mov v9.d[0],x22
	mov v9.d[1],x23
#ifdef __AARCH64EB__
	rev32 v9.16b,v9.16b
#endif
	mov v10.d[0],x24
	mov v10.d[1],x25
#ifdef __AARCH64EB__
	rev32 v10.16b,v10.16b
#endif
	// save the last tweak
	st1 {v11.4s},[x5]
1:
	// process last block
	cmp x2,#1
	b.lt 100f
	b.gt 1f
	ld1 {v4.4s},[x0],#16
	eor v4.16b, v4.16b, v8.16b
#ifndef __AARCH64EB__
	rev32 v4.16b,v4.16b
#endif
	mov x10,x3
	mov w11,#8
	mov w12,v4.s[0]
	mov w13,v4.s[1]
	mov w14,v4.s[2]
	mov w15,v4.s[3]
10:
	ldp w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor w6,w14,w15
	eor w9,w7,w13
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	eor w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor w6,w14,w15
	eor w9,w12,w8
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	ldp w7,w8,[x10],8
	eor w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor w6,w12,w13
	eor w9,w7,w15
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	eor w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor w6,w12,w13
	eor w9,w14,w8
	eor w6,w6,w9
	movi v1.16b,#64
	movi v2.16b,#128
	movi v3.16b,#192
	mov v0.s[0],w6

	sub v1.16b,v0.16b,v1.16b
	sub v2.16b,v0.16b,v2.16b
	sub v3.16b,v0.16b,v3.16b

	tbl v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov w6,v0.s[0]
	mov w7,v1.s[0]
	mov w9,v2.s[0]
	add w7,w6,w7
	mov w6,v3.s[0]
	add w7,w7,w9
	add w7,w7,w6

	eor w6,w7,w7,ror #32-2
	eor w6,w6,w7,ror #32-10
	eor w6,w6,w7,ror #32-18
	eor w6,w6,w7,ror #32-24
	eor w15,w15,w6
	subs w11,w11,#1
	b.ne 10b
	mov v4.s[0],w15
	mov v4.s[1],w14
	mov v4.s[2],w13
	mov v4.s[3],w12
#ifndef __AARCH64EB__
	rev32 v4.16b,v4.16b
#endif
	eor v4.16b, v4.16b, v8.16b
	st1 {v4.4s},[x1],#16
	// save the last tweak
	st1 {v8.4s},[x5]
	b 100f
1:	// process last 2 blocks
	cmp x2,#2
	b.gt 1f
	ld1 {v4.4s,v5.4s},[x0],#32
	eor v4.16b, v4.16b, v8.16b
	eor v5.16b, v5.16b, v9.16b
#ifndef __AARCH64EB__
	rev32 v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32 v5.16b,v5.16b
#endif
	zip1 v0.4s,v4.4s,v5.4s
	zip2 v1.4s,v4.4s,v5.4s
	zip1 v2.4s,v6.4s,v7.4s
	zip2 v3.4s,v6.4s,v7.4s
	zip1 v4.2d,v0.2d,v2.2d
	zip2 v5.2d,v0.2d,v2.2d
	zip1 v6.2d,v1.2d,v3.2d
	zip2 v7.2d,v1.2d,v3.2d
	bl _vpsm4_enc_4blks
	zip1 v4.4s,v0.4s,v1.4s
	zip2 v5.4s,v0.4s,v1.4s
	zip1 v6.4s,v2.4s,v3.4s
	zip2 v7.4s,v2.4s,v3.4s
	zip1 v0.2d,v4.2d,v6.2d
	zip2 v1.2d,v4.2d,v6.2d
	zip1 v2.2d,v5.2d,v7.2d
	zip2 v3.2d,v5.2d,v7.2d
	eor v0.16b, v0.16b, v8.16b
	eor v1.16b, v1.16b, v9.16b
	st1 {v0.4s,v1.4s},[x1],#32
	// save the last tweak
	st1 {v9.4s},[x5]
	b 100f
1:	// process last 3 blocks
	ld1 {v4.4s,v5.4s,v6.4s},[x0],#48
	eor v4.16b, v4.16b, v8.16b
	eor v5.16b, v5.16b, v9.16b
	eor v6.16b, v6.16b, v10.16b
#ifndef __AARCH64EB__
	rev32 v4.16b,v4.16b
#endif
#ifndef __AARCH64EB__
	rev32 v5.16b,v5.16b
#endif
#ifndef __AARCH64EB__
	rev32 v6.16b,v6.16b
#endif
	zip1 v0.4s,v4.4s,v5.4s
	zip2 v1.4s,v4.4s,v5.4s
	zip1 v2.4s,v6.4s,v7.4s
	zip2 v3.4s,v6.4s,v7.4s
	zip1 v4.2d,v0.2d,v2.2d
	zip2 v5.2d,v0.2d,v2.2d
	zip1 v6.2d,v1.2d,v3.2d
	zip2 v7.2d,v1.2d,v3.2d
	bl _vpsm4_enc_4blks
	zip1 v4.4s,v0.4s,v1.4s
	zip2 v5.4s,v0.4s,v1.4s
	zip1	v4.4s,v0.4s,v1.4s
	zip2	v5.4s,v0.4s,v1.4s
	zip1	v6.4s,v2.4s,v3.4s
	zip2	v7.4s,v2.4s,v3.4s
	zip1	v0.2d,v4.2d,v6.2d
	zip2	v1.2d,v4.2d,v6.2d
	zip1	v2.2d,v5.2d,v7.2d
	zip2	v3.2d,v5.2d,v7.2d
	eor	v0.16b, v0.16b, v8.16b
	eor	v1.16b, v1.16b, v9.16b
	eor	v2.16b, v2.16b, v10.16b
	st1	{v0.4s,v1.4s,v2.4s},[x1],#48
	// save the last tweak
	st1	{v10.4s},[x5]
100:
	cmp	x29,0
	b.eq	.return

// This branch calculates the last two tweaks,
// used when the encryption/decryption length is larger than 32 bytes
.last_2blks_tweak:
	ld1	{v8.4s},[x5]
#ifdef __AARCH64EB__
	rev32	v8.16b,v8.16b
#endif
	mov	v2.16b,v8.16b
	adrp	x10,.Lxts_magic
	ldr	q0, [x10, #:lo12:.Lxts_magic]
	shl	v9.16b, v2.16b, #1
	ext	v1.16b, v2.16b, v2.16b,#15
	ushr	v1.16b, v1.16b, #7
	mul	v1.16b, v1.16b, v0.16b
	eor	v9.16b, v9.16b, v1.16b
	mov	v2.16b,v9.16b
	adrp	x10,.Lxts_magic
	ldr	q0, [x10, #:lo12:.Lxts_magic]
	shl	v10.16b, v2.16b, #1
	ext	v1.16b, v2.16b, v2.16b,#15
	ushr	v1.16b, v1.16b, #7
	mul	v1.16b, v1.16b, v0.16b
	eor	v10.16b, v10.16b, v1.16b
	b	.check_dec


// This branch calculates the last two tweaks,
// used when the encryption/decryption length is exactly 32 bytes,
// in which case only these two tweaks are needed
.only_2blks_tweak:
	mov	v9.16b,v8.16b
#ifdef __AARCH64EB__
	rev32	v9.16b,v9.16b
#endif
	mov	v2.16b,v9.16b
	adrp	x10,.Lxts_magic
	ldr	q0, [x10, #:lo12:.Lxts_magic]
	shl	v10.16b, v2.16b, #1
	ext	v1.16b, v2.16b, v2.16b,#15
	ushr	v1.16b, v1.16b, #7
	mul	v1.16b, v1.16b, v0.16b
	eor	v10.16b, v10.16b, v1.16b
	b	.check_dec

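// Note: each shl/ext/ushr/mul/eor group above derives the next tweak by
// multiplying the current one by x in GF(2^128) with the XTS polynomial
// x^128 + x^7 + x^2 + x + 1.  Every byte is shifted left by one (shl);
// ext/ushr rotate the block by 15 bytes and extract each byte's incoming
// carry bit; the mul by .Lxts_magic (low byte 0x87, all other bytes 0x01
// in the little-endian view) turns the carry out of the top byte into
// the 0x87 reduction folded into byte 0.  A minimal C sketch of the same
// doubling (hypothetical helper, not part of this module):
//
//	// static void xts_double(unsigned char t[16]) {
//	//	unsigned char carry = t[15] >> 7;   /* bit shifted out */
//	//	for (int i = 15; i > 0; i--)
//	//		t[i] = (t[i] << 1) | (t[i - 1] >> 7);
//	//	t[0] = (t[0] << 1) ^ (carry ? 0x87 : 0);
//	// }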
// Determine whether encryption or decryption is required.
// The last two tweaks need to be swapped for decryption.
.check_dec:
	// encryption:1 decryption:0
	cmp	w28,1
	b.eq	.process_last_2blks
	mov	v0.16B,v9.16b
	mov	v9.16B,v10.16b
	mov	v10.16B,v0.16b

.process_last_2blks:
#ifdef __AARCH64EB__
	rev32	v9.16b,v9.16b
#endif
#ifdef __AARCH64EB__
	rev32	v10.16b,v10.16b
#endif
	ld1	{v4.4s},[x0],#16
	eor	v4.16b, v4.16b, v9.16b
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	mov	x10,x3
	mov	w11,#8
	mov	w12,v4.s[0]
	mov	w13,v4.s[1]
	mov	w14,v4.s[2]
	mov	w15,v4.s[3]
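// Note: each pass through the loop below runs four SM4 rounds on the
// single block held in w12..w15 (one 32-bit word each), i.e.
//	X[i+4] = X[i] ^ L(S(X[i+1] ^ X[i+2] ^ X[i+3] ^ rk[i]))
// where S applies the 8-bit S-box to each byte (the four tbl lookups
// over the 256-byte table in v16..v31, offset by 64/128/192 for each
// 64-byte quarter) and the linear layer is
//	L(B) = B ^ (B <<< 2) ^ (B <<< 10) ^ (B <<< 18) ^ (B <<< 24),
// built from the "eor ...,ror #32-n" forms (a right-rotate by 32-n is a
// left-rotate by n).  Rough C sketch of one round (hypothetical helper
// names, not part of this module):
//
//	// t = x1 ^ x2 ^ x3 ^ rk;
//	// t = sbox_bytes(t);	/* S-box applied to each of the 4 bytes */
//	// x0 ^= t ^ rol32(t,2) ^ rol32(t,10) ^ rol32(t,18) ^ rol32(t,24);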
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	mov	v4.s[0],w15
	mov	v4.s[1],w14
	mov	v4.s[2],w13
	mov	v4.s[3],w12
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	eor	v4.16b, v4.16b, v9.16b
	st1	{v4.4s},[x1],#16

	sub	x26,x1,16
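// Note: this is the ciphertext-stealing tail; x29 holds the number of
// leftover bytes when the input is not a multiple of 16.  The byte loop
// below swaps the tail of the ciphertext block just written (at x26)
// with the remaining plaintext bytes at x0: the partial ciphertext
// becomes the final short output block, and the stolen block, refilled
// with those plaintext bytes, is re-encrypted under the last tweak (v10)
// and stored back at x26.  Rough sketch (hypothetical names, not part
// of this module):
//
//	// for (i = 0; i < tail; i++) {
//	//	out[16 + i] = prev_ct[i];   /* steal ciphertext tail */
//	//	prev_ct[i]  = in[i];        /* refill with plaintext */
//	// }
//	// encrypt_block(prev_ct, last_tweak);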
.loop:
	subs	x29,x29,1
	ldrb	w7,[x26,x29]
	ldrb	w8,[x0,x29]
	strb	w8,[x26,x29]
	strb	w7,[x1,x29]
	b.gt	.loop
	ld1	{v4.4s}, [x26]
	eor	v4.16b, v4.16b, v10.16b
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	mov	x10,x3
	mov	w11,#8
	mov	w12,v4.s[0]
	mov	w13,v4.s[1]
	mov	w14,v4.s[2]
	mov	w15,v4.s[3]
10:
	ldp	w7,w8,[x10],8
	// B0 ^= SBOX(B1 ^ B2 ^ B3 ^ RK0)
	eor	w6,w14,w15
	eor	w9,w7,w13
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w12,w12,w6
	// B1 ^= SBOX(B0 ^ B2 ^ B3 ^ RK1)
	eor	w6,w14,w15
	eor	w9,w12,w8
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	ldp	w7,w8,[x10],8
	eor	w13,w13,w6
	// B2 ^= SBOX(B0 ^ B1 ^ B3 ^ RK2)
	eor	w6,w12,w13
	eor	w9,w7,w15
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w14,w14,w6
	// B3 ^= SBOX(B0 ^ B1 ^ B2 ^ RK3)
	eor	w6,w12,w13
	eor	w9,w14,w8
	eor	w6,w6,w9
	movi	v1.16b,#64
	movi	v2.16b,#128
	movi	v3.16b,#192
	mov	v0.s[0],w6

	sub	v1.16b,v0.16b,v1.16b
	sub	v2.16b,v0.16b,v2.16b
	sub	v3.16b,v0.16b,v3.16b

	tbl	v0.16b,{v16.16b,v17.16b,v18.16b,v19.16b},v0.16b
	tbl	v1.16b,{v20.16b,v21.16b,v22.16b,v23.16b},v1.16b
	tbl	v2.16b,{v24.16b,v25.16b,v26.16b,v27.16b},v2.16b
	tbl	v3.16b,{v28.16b,v29.16b,v30.16b,v31.16b},v3.16b

	mov	w6,v0.s[0]
	mov	w7,v1.s[0]
	mov	w9,v2.s[0]
	add	w7,w6,w7
	mov	w6,v3.s[0]
	add	w7,w7,w9
	add	w7,w7,w6

	eor	w6,w7,w7,ror #32-2
	eor	w6,w6,w7,ror #32-10
	eor	w6,w6,w7,ror #32-18
	eor	w6,w6,w7,ror #32-24
	eor	w15,w15,w6
	subs	w11,w11,#1
	b.ne	10b
	mov	v4.s[0],w15
	mov	v4.s[1],w14
	mov	v4.s[2],w13
	mov	v4.s[3],w12
#ifndef __AARCH64EB__
	rev32	v4.16b,v4.16b
#endif
	eor	v4.16b, v4.16b, v10.16b
	st1	{v4.4s}, [x26]
.return:
	ldp	d14, d15, [sp], #0x10
	ldp	d12, d13, [sp], #0x10
	ldp	d10, d11, [sp], #0x10
	ldp	d8, d9, [sp], #0x10
	ldp	x29, x30, [sp], #0x10
	ldp	x27, x28, [sp], #0x10
	ldp	x25, x26, [sp], #0x10
	ldp	x23, x24, [sp], #0x10
	ldp	x21, x22, [sp], #0x10
	ldp	x19, x20, [sp], #0x10
	ldp	x17, x18, [sp], #0x10
	ldp	x15, x16, [sp], #0x10
	AARCH64_VALIDATE_LINK_REGISTER
	ret
.size	vpsm4_xts_encrypt,.-vpsm4_xts_encrypt