/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Bit sliced AES using NEON instructions
 *
 * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
 */

/*
 * The algorithm implemented here is described in detail by the paper
 * 'Faster and Timing-Attack Resistant AES-GCM' by Emilia Kaesper and
 * Peter Schwabe (https://eprint.iacr.org/2009/129.pdf)
 *
 * This implementation is based primarily on the OpenSSL implementation
 * for 32-bit ARM written by Andy Polyakov <appro@openssl.org>
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/assembler.h>

        .text

        rounds          .req    x11
        bskey           .req    x12

        .macro  in_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
        eor     \b2, \b2, \b1
        eor     \b5, \b5, \b6
        eor     \b3, \b3, \b0
        eor     \b6, \b6, \b2
        eor     \b5, \b5, \b0
        eor     \b6, \b6, \b3
        eor     \b3, \b3, \b7
        eor     \b7, \b7, \b5
        eor     \b3, \b3, \b4
        eor     \b4, \b4, \b5
        eor     \b2, \b2, \b7
        eor     \b3, \b3, \b1
        eor     \b1, \b1, \b5
        .endm

        .macro  out_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
        eor     \b0, \b0, \b6
        eor     \b1, \b1, \b4
        eor     \b4, \b4, \b6
        eor     \b2, \b2, \b0
        eor     \b6, \b6, \b1
        eor     \b1, \b1, \b5
        eor     \b5, \b5, \b3
        eor     \b3, \b3, \b7
        eor     \b7, \b7, \b5
        eor     \b2, \b2, \b5
        eor     \b4, \b4, \b7
        .endm

        .macro  inv_in_bs_ch, b6, b1, b2, b4, b7, b0, b3, b5
        eor     \b1, \b1, \b7
        eor     \b4, \b4, \b7
        eor     \b7, \b7, \b5
        eor     \b1, \b1, \b3
        eor     \b2, \b2, \b5
        eor     \b3, \b3, \b7
        eor     \b6, \b6, \b1
        eor     \b2, \b2, \b0
        eor     \b5, \b5, \b3
        eor     \b4, \b4, \b6
        eor     \b0, \b0, \b6
        eor     \b1, \b1, \b4
        .endm

        .macro  inv_out_bs_ch, b6, b5, b0, b3, b7, b1, b4, b2
        eor     \b1, \b1, \b5
        eor     \b2, \b2, \b7
        eor     \b3, \b3, \b1
        eor     \b4, \b4, \b5
        eor     \b7, \b7, \b5
        eor     \b3, \b3, \b4
        eor     \b5, \b5, \b0
        eor     \b3, \b3, \b7
        eor     \b6, \b6, \b2
        eor     \b2, \b2, \b1
        eor     \b6, \b6, \b3
        eor     \b3, \b3, \b0
        eor     \b5, \b5, \b6
        .endm

        .macro  mul_gf4, x0, x1, y0, y1, t0, t1
        eor     \t0, \y0, \y1
        and     \t0, \t0, \x0
        eor     \x0, \x0, \x1
        and     \t1, \x1, \y0
        and     \x0, \x0, \y1
        eor     \x1, \t1, \t0
        eor     \x0, \x0, \t1
        .endm

        .macro  mul_gf4_n_gf4, x0, x1, y0, y1, t0, x2, x3, y2, y3, t1
        eor     \t0, \y0, \y1
        eor     \t1, \y2, \y3
        and     \t0, \t0, \x0
        and     \t1, \t1, \x2
        eor     \x0, \x0, \x1
        eor     \x2, \x2, \x3
        and     \x1, \x1, \y0
        and     \x3, \x3, \y2
        and     \x0, \x0, \y1
        and     \x2, \x2, \y3
        eor     \x1, \x1, \x0
        eor     \x2, \x2, \x3
        eor     \x0, \x0, \t0
        eor     \x3, \x3, \t1
        .endm

        .macro  mul_gf16_2, x0, x1, x2, x3, x4, x5, x6, x7, \
                            y0, y1, y2, y3, t0, t1, t2, t3
        eor     \t0, \x0, \x2
        eor     \t1, \x1, \x3
        mul_gf4         \x0, \x1, \y0, \y1, \t2, \t3
        eor     \y0, \y0, \y2
        eor     \y1, \y1, \y3
        mul_gf4_n_gf4   \t0, \t1, \y0, \y1, \t3, \x2, \x3, \y2, \y3, \t2
        eor     \x0, \x0, \t0
        eor     \x2, \x2, \t0
        eor     \x1, \x1, \t1
        eor     \x3, \x3, \t1
        eor     \t0, \x4, \x6
        eor     \t1, \x5, \x7
        mul_gf4_n_gf4   \t0, \t1, \y0, \y1, \t3, \x6, \x7, \y2, \y3, \t2
        eor     \y0, \y0, \y2
        eor     \y1, \y1, \y3
        mul_gf4         \x4, \x5, \y0, \y1, \t2, \t3
        eor     \x4, \x4, \t0
        eor     \x6, \x6, \t0
        eor     \x5, \x5, \t1
        eor     \x7, \x7, \t1
        .endm

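        /*
         * Inversion in GF(2^8), following the tower field decomposition
         * described in the Kaesper/Schwabe paper cited above: the field
         * inverse is built as a fixed Boolean circuit out of the GF(2^2)
         * and GF(2^4) helpers above (mul_gf4, mul_gf16_2), using only
         * bitwise eor/and/orr/not/bsl operations on the bit slices so
         * that no data dependent table lookups are needed.
         */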
        .macro  inv_gf256, x0, x1, x2, x3, x4, x5, x6, x7, \
                           t0, t1, t2, t3, s0, s1, s2, s3
        eor     \t3, \x4, \x6
        eor     \t0, \x5, \x7
        eor     \t1, \x1, \x3
        eor     \s1, \x7, \x6
        eor     \s0, \x0, \x2
        eor     \s3, \t3, \t0
        orr     \t2, \t0, \t1
        and     \s2, \t3, \s0
        orr     \t3, \t3, \s0
        eor     \s0, \s0, \t1
        and     \t0, \t0, \t1
        eor     \t1, \x3, \x2
        and     \s3, \s3, \s0
        and     \s1, \s1, \t1
        eor     \t1, \x4, \x5
        eor     \s0, \x1, \x0
        eor     \t3, \t3, \s1
        eor     \t2, \t2, \s1
        and     \s1, \t1, \s0
        orr     \t1, \t1, \s0
        eor     \t3, \t3, \s3
        eor     \t0, \t0, \s1
        eor     \t2, \t2, \s2
        eor     \t1, \t1, \s3
        eor     \t0, \t0, \s2
        and     \s0, \x7, \x3
        eor     \t1, \t1, \s2
        and     \s1, \x6, \x2
        and     \s2, \x5, \x1
        orr     \s3, \x4, \x0
        eor     \t3, \t3, \s0
        eor     \t1, \t1, \s2
        eor     \s0, \t0, \s3
        eor     \t2, \t2, \s1
        and     \s2, \t3, \t1
        eor     \s1, \t2, \s2
        eor     \s3, \s0, \s2
        bsl     \s1, \t1, \s0
        not     \t0, \s0
        bsl     \s0, \s1, \s3
        bsl     \t0, \s1, \s3
        bsl     \s3, \t3, \t2
        eor     \t3, \t3, \t2
        and     \s2, \s0, \s3
        eor     \t1, \t1, \t0
        eor     \s2, \s2, \t3
        mul_gf16_2      \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
                        \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
        .endm

        .macro  sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
                      t0, t1, t2, t3, s0, s1, s2, s3
        in_bs_ch        \b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
                        \b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b
        inv_gf256       \b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b, \
                        \b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
                        \t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
                        \s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b
        out_bs_ch       \b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
                        \b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b
        .endm

        .macro  inv_sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
                          t0, t1, t2, t3, s0, s1, s2, s3
        inv_in_bs_ch    \b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
                        \b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b
        inv_gf256       \b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b, \
                        \b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
                        \t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
                        \s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b
        inv_out_bs_ch   \b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
                        \b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b
        .endm

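        /*
         * The bit sliced key schedule produced by aesbs_convert_key() below
         * stores the round 0 key as 16 bytes, each intermediate round key as
         * 8 x 16 bytes (one vector per bit slice, hence the 128 byte strides
         * used here), and the final round key as 16 bytes again.
         */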
        .macro  enc_next_rk
        ldp     q16, q17, [bskey], #128
        ldp     q18, q19, [bskey, #-96]
        ldp     q20, q21, [bskey, #-64]
        ldp     q22, q23, [bskey, #-32]
        .endm

        .macro  dec_next_rk
        ldp     q16, q17, [bskey, #-128]!
        ldp     q18, q19, [bskey, #32]
        ldp     q20, q21, [bskey, #64]
        ldp     q22, q23, [bskey, #96]
        .endm

        .macro  add_round_key, x0, x1, x2, x3, x4, x5, x6, x7
        eor     \x0\().16b, \x0\().16b, v16.16b
        eor     \x1\().16b, \x1\().16b, v17.16b
        eor     \x2\().16b, \x2\().16b, v18.16b
        eor     \x3\().16b, \x3\().16b, v19.16b
        eor     \x4\().16b, \x4\().16b, v20.16b
        eor     \x5\().16b, \x5\().16b, v21.16b
        eor     \x6\().16b, \x6\().16b, v22.16b
        eor     \x7\().16b, \x7\().16b, v23.16b
        .endm

        .macro  shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, mask
        tbl     \x0\().16b, {\x0\().16b}, \mask\().16b
        tbl     \x1\().16b, {\x1\().16b}, \mask\().16b
        tbl     \x2\().16b, {\x2\().16b}, \mask\().16b
        tbl     \x3\().16b, {\x3\().16b}, \mask\().16b
        tbl     \x4\().16b, {\x4\().16b}, \mask\().16b
        tbl     \x5\().16b, {\x5\().16b}, \mask\().16b
        tbl     \x6\().16b, {\x6\().16b}, \mask\().16b
        tbl     \x7\().16b, {\x7\().16b}, \mask\().16b
        .endm

        .macro  mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
                          t0, t1, t2, t3, t4, t5, t6, t7, inv
        ext     \t0\().16b, \x0\().16b, \x0\().16b, #12
        ext     \t1\().16b, \x1\().16b, \x1\().16b, #12
        eor     \x0\().16b, \x0\().16b, \t0\().16b
        ext     \t2\().16b, \x2\().16b, \x2\().16b, #12
        eor     \x1\().16b, \x1\().16b, \t1\().16b
        ext     \t3\().16b, \x3\().16b, \x3\().16b, #12
        eor     \x2\().16b, \x2\().16b, \t2\().16b
        ext     \t4\().16b, \x4\().16b, \x4\().16b, #12
        eor     \x3\().16b, \x3\().16b, \t3\().16b
        ext     \t5\().16b, \x5\().16b, \x5\().16b, #12
        eor     \x4\().16b, \x4\().16b, \t4\().16b
        ext     \t6\().16b, \x6\().16b, \x6\().16b, #12
        eor     \x5\().16b, \x5\().16b, \t5\().16b
        ext     \t7\().16b, \x7\().16b, \x7\().16b, #12
        eor     \x6\().16b, \x6\().16b, \t6\().16b
        eor     \t1\().16b, \t1\().16b, \x0\().16b
        eor     \x7\().16b, \x7\().16b, \t7\().16b
        ext     \x0\().16b, \x0\().16b, \x0\().16b, #8
        eor     \t2\().16b, \t2\().16b, \x1\().16b
        eor     \t0\().16b, \t0\().16b, \x7\().16b
        eor     \t1\().16b, \t1\().16b, \x7\().16b
        ext     \x1\().16b, \x1\().16b, \x1\().16b, #8
        eor     \t5\().16b, \t5\().16b, \x4\().16b
        eor     \x0\().16b, \x0\().16b, \t0\().16b
        eor     \t6\().16b, \t6\().16b, \x5\().16b
        eor     \x1\().16b, \x1\().16b, \t1\().16b
        ext     \t0\().16b, \x4\().16b, \x4\().16b, #8
        eor     \t4\().16b, \t4\().16b, \x3\().16b
        ext     \t1\().16b, \x5\().16b, \x5\().16b, #8
        eor     \t7\().16b, \t7\().16b, \x6\().16b
        ext     \x4\().16b, \x3\().16b, \x3\().16b, #8
        eor     \t3\().16b, \t3\().16b, \x2\().16b
        ext     \x5\().16b, \x7\().16b, \x7\().16b, #8
        eor     \t4\().16b, \t4\().16b, \x7\().16b
        ext     \x3\().16b, \x6\().16b, \x6\().16b, #8
        eor     \t3\().16b, \t3\().16b, \x7\().16b
        ext     \x6\().16b, \x2\().16b, \x2\().16b, #8
        eor     \x7\().16b, \t1\().16b, \t5\().16b
        .ifb    \inv
        eor     \x2\().16b, \t0\().16b, \t4\().16b
        eor     \x4\().16b, \x4\().16b, \t3\().16b
        eor     \x5\().16b, \x5\().16b, \t7\().16b
        eor     \x3\().16b, \x3\().16b, \t6\().16b
        eor     \x6\().16b, \x6\().16b, \t2\().16b
        .else
        eor     \t3\().16b, \t3\().16b, \x4\().16b
        eor     \x5\().16b, \x5\().16b, \t7\().16b
        eor     \x2\().16b, \x3\().16b, \t6\().16b
        eor     \x3\().16b, \t0\().16b, \t4\().16b
        eor     \x4\().16b, \x6\().16b, \t2\().16b
        mov     \x6\().16b, \t3\().16b
        .endif
        .endm

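        /*
         * As in the OpenSSL code this is derived from, InvMixColumns is not
         * implemented as a separate circuit: the macro below applies an
         * additional layer of rotations and XORs to the slices and then
         * reuses the forward mix_cols circuit above (invoked with its \inv
         * argument set), which avoids implementing the more expensive
         * inverse matrix coefficients directly.
         */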
        .macro  inv_mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
                              t0, t1, t2, t3, t4, t5, t6, t7
        ext     \t0\().16b, \x0\().16b, \x0\().16b, #8
        ext     \t6\().16b, \x6\().16b, \x6\().16b, #8
        ext     \t7\().16b, \x7\().16b, \x7\().16b, #8
        eor     \t0\().16b, \t0\().16b, \x0\().16b
        ext     \t1\().16b, \x1\().16b, \x1\().16b, #8
        eor     \t6\().16b, \t6\().16b, \x6\().16b
        ext     \t2\().16b, \x2\().16b, \x2\().16b, #8
        eor     \t7\().16b, \t7\().16b, \x7\().16b
        ext     \t3\().16b, \x3\().16b, \x3\().16b, #8
        eor     \t1\().16b, \t1\().16b, \x1\().16b
        ext     \t4\().16b, \x4\().16b, \x4\().16b, #8
        eor     \t2\().16b, \t2\().16b, \x2\().16b
        ext     \t5\().16b, \x5\().16b, \x5\().16b, #8
        eor     \t3\().16b, \t3\().16b, \x3\().16b
        eor     \t4\().16b, \t4\().16b, \x4\().16b
        eor     \t5\().16b, \t5\().16b, \x5\().16b
        eor     \x0\().16b, \x0\().16b, \t6\().16b
        eor     \x1\().16b, \x1\().16b, \t6\().16b
        eor     \x2\().16b, \x2\().16b, \t0\().16b
        eor     \x4\().16b, \x4\().16b, \t2\().16b
        eor     \x3\().16b, \x3\().16b, \t1\().16b
        eor     \x1\().16b, \x1\().16b, \t7\().16b
        eor     \x2\().16b, \x2\().16b, \t7\().16b
        eor     \x4\().16b, \x4\().16b, \t6\().16b
        eor     \x5\().16b, \x5\().16b, \t3\().16b
        eor     \x3\().16b, \x3\().16b, \t6\().16b
        eor     \x6\().16b, \x6\().16b, \t4\().16b
        eor     \x4\().16b, \x4\().16b, \t7\().16b
        eor     \x5\().16b, \x5\().16b, \t7\().16b
        eor     \x7\().16b, \x7\().16b, \t5\().16b
        mix_cols        \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
                        \t0, \t1, \t2, \t3, \t4, \t5, \t6, \t7, 1
        .endm

        .macro  swapmove_2x, a0, b0, a1, b1, n, mask, t0, t1
        ushr    \t0\().2d, \b0\().2d, #\n
        ushr    \t1\().2d, \b1\().2d, #\n
        eor     \t0\().16b, \t0\().16b, \a0\().16b
        eor     \t1\().16b, \t1\().16b, \a1\().16b
        and     \t0\().16b, \t0\().16b, \mask\().16b
        and     \t1\().16b, \t1\().16b, \mask\().16b
        eor     \a0\().16b, \a0\().16b, \t0\().16b
        shl     \t0\().2d, \t0\().2d, #\n
        eor     \a1\().16b, \a1\().16b, \t1\().16b
        shl     \t1\().2d, \t1\().2d, #\n
        eor     \b0\().16b, \b0\().16b, \t0\().16b
        eor     \b1\().16b, \b1\().16b, \t1\().16b
        .endm

        .macro  bitslice, x7, x6, x5, x4, x3, x2, x1, x0, t0, t1, t2, t3
        movi    \t0\().16b, #0x55
        movi    \t1\().16b, #0x33
        swapmove_2x     \x0, \x1, \x2, \x3, 1, \t0, \t2, \t3
        swapmove_2x     \x4, \x5, \x6, \x7, 1, \t0, \t2, \t3
        movi    \t0\().16b, #0x0f
        swapmove_2x     \x0, \x2, \x1, \x3, 2, \t1, \t2, \t3
        swapmove_2x     \x4, \x6, \x5, \x7, 2, \t1, \t2, \t3
        swapmove_2x     \x0, \x4, \x1, \x5, 4, \t0, \t2, \t3
        swapmove_2x     \x2, \x6, \x3, \x7, 4, \t0, \t2, \t3
        .endm

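        /*
         * swapmove_2x/bitslice implement the usual SWAPMOVE bit matrix
         * transposition: three passes with masks 0x55/0x33/0x0f and shifts
         * of 1/2/4 exchange bits between register pairs so that afterwards
         * each of the eight vectors holds one bit position of every byte of
         * the eight blocks being processed.  The .octa constants below are
         * tbl permutation masks used with this representation: SR/ISR apply
         * (Inv)ShiftRows identically to each slice via shift_rows, while the
         * M0 variants additionally fold in the byte interleave used when
         * entering or leaving the bit sliced domain.
         */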
        .align  6
M0:     .octa   0x0004080c0105090d02060a0e03070b0f

M0SR:   .octa   0x0004080c05090d010a0e02060f03070b
SR:     .octa   0x0f0e0d0c0a09080b0504070600030201
SRM0:   .octa   0x01060b0c0207080d0304090e00050a0f

M0ISR:  .octa   0x0004080c0d0105090a0e0206070b0f03
ISR:    .octa   0x0f0e0d0c080b0a090504070602010003
ISRM0:  .octa   0x0306090c00070a0d01040b0e0205080f

        /*
         * void aesbs_convert_key(u8 out[], u32 const rk[], int rounds)
         */
SYM_FUNC_START(aesbs_convert_key)
        ld1     {v7.4s}, [x1], #16              // load round 0 key
        ld1     {v17.4s}, [x1], #16             // load round 1 key

        movi    v8.16b,  #0x01                  // bit masks
        movi    v9.16b,  #0x02
        movi    v10.16b, #0x04
        movi    v11.16b, #0x08
        movi    v12.16b, #0x10
        movi    v13.16b, #0x20
        movi    v14.16b, #0x40
        movi    v15.16b, #0x80
        ldr     q16, M0

        sub     x2, x2, #1
        str     q7, [x0], #16                   // save round 0 key

.Lkey_loop:
        tbl     v7.16b, {v17.16b}, v16.16b
        ld1     {v17.4s}, [x1], #16             // load next round key

        cmtst   v0.16b, v7.16b, v8.16b
        cmtst   v1.16b, v7.16b, v9.16b
        cmtst   v2.16b, v7.16b, v10.16b
        cmtst   v3.16b, v7.16b, v11.16b
        cmtst   v4.16b, v7.16b, v12.16b
        cmtst   v5.16b, v7.16b, v13.16b
        cmtst   v6.16b, v7.16b, v14.16b
        cmtst   v7.16b, v7.16b, v15.16b
        not     v0.16b, v0.16b
        not     v1.16b, v1.16b
        not     v5.16b, v5.16b
        not     v6.16b, v6.16b

        subs    x2, x2, #1
        stp     q0, q1, [x0], #128
        stp     q2, q3, [x0, #-96]
        stp     q4, q5, [x0, #-64]
        stp     q6, q7, [x0, #-32]
        b.ne    .Lkey_loop

        movi    v7.16b, #0x63                   // compose .L63
        eor     v17.16b, v17.16b, v7.16b
        str     q17, [x0]
        ret
SYM_FUNC_END(aesbs_convert_key)

        .align  4
SYM_FUNC_START_LOCAL(aesbs_encrypt8)
        ldr     q9, [bskey], #16                // round 0 key
        ldr     q8, M0SR
        ldr     q24, SR

        eor     v10.16b, v0.16b, v9.16b         // xor with round0 key
        eor     v11.16b, v1.16b, v9.16b
        tbl     v0.16b, {v10.16b}, v8.16b
        eor     v12.16b, v2.16b, v9.16b
        tbl     v1.16b, {v11.16b}, v8.16b
        eor     v13.16b, v3.16b, v9.16b
        tbl     v2.16b, {v12.16b}, v8.16b
        eor     v14.16b, v4.16b, v9.16b
        tbl     v3.16b, {v13.16b}, v8.16b
        eor     v15.16b, v5.16b, v9.16b
        tbl     v4.16b, {v14.16b}, v8.16b
        eor     v10.16b, v6.16b, v9.16b
        tbl     v5.16b, {v15.16b}, v8.16b
        eor     v11.16b, v7.16b, v9.16b
        tbl     v6.16b, {v10.16b}, v8.16b
        tbl     v7.16b, {v11.16b}, v8.16b

        bitslice        v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11

        sub     rounds, rounds, #1
        b       .Lenc_sbox

.Lenc_loop:
        shift_rows      v0, v1, v2, v3, v4, v5, v6, v7, v24
.Lenc_sbox:
        sbox    v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \
                v13, v14, v15
        subs    rounds, rounds, #1
        b.cc    .Lenc_done

        enc_next_rk

        mix_cols        v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11, v12, \
                        v13, v14, v15

        add_round_key   v0, v1, v2, v3, v4, v5, v6, v7

        b.ne    .Lenc_loop
        ldr     q24, SRM0
        b       .Lenc_loop

.Lenc_done:
        ldr     q12, [bskey]                    // last round key

        bitslice        v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11

        eor     v0.16b, v0.16b, v12.16b
        eor     v1.16b, v1.16b, v12.16b
        eor     v4.16b, v4.16b, v12.16b
        eor     v6.16b, v6.16b, v12.16b
        eor     v3.16b, v3.16b, v12.16b
        eor     v7.16b, v7.16b, v12.16b
        eor     v2.16b, v2.16b, v12.16b
        eor     v5.16b, v5.16b, v12.16b
        ret
SYM_FUNC_END(aesbs_encrypt8)

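        /*
         * aesbs_decrypt8 walks the key schedule backwards: 'rounds << 7'
         * skips rounds x 128 bytes, and the pre-indexed load below then
         * lands on the final 16 byte round key, so that dec_next_rk can
         * step back through the 128 byte bit sliced round keys and the
         * round 0 key is applied last at .Ldec_done.
         */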
        .align  4
SYM_FUNC_START_LOCAL(aesbs_decrypt8)
        lsl     x9, rounds, #7
        add     bskey, bskey, x9

        ldr     q9, [bskey, #-112]!             // round 0 key
        ldr     q8, M0ISR
        ldr     q24, ISR

        eor     v10.16b, v0.16b, v9.16b         // xor with round0 key
        eor     v11.16b, v1.16b, v9.16b
        tbl     v0.16b, {v10.16b}, v8.16b
        eor     v12.16b, v2.16b, v9.16b
        tbl     v1.16b, {v11.16b}, v8.16b
        eor     v13.16b, v3.16b, v9.16b
        tbl     v2.16b, {v12.16b}, v8.16b
        eor     v14.16b, v4.16b, v9.16b
        tbl     v3.16b, {v13.16b}, v8.16b
        eor     v15.16b, v5.16b, v9.16b
        tbl     v4.16b, {v14.16b}, v8.16b
        eor     v10.16b, v6.16b, v9.16b
        tbl     v5.16b, {v15.16b}, v8.16b
        eor     v11.16b, v7.16b, v9.16b
        tbl     v6.16b, {v10.16b}, v8.16b
        tbl     v7.16b, {v11.16b}, v8.16b

        bitslice        v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11

        sub     rounds, rounds, #1
        b       .Ldec_sbox

.Ldec_loop:
        shift_rows      v0, v1, v2, v3, v4, v5, v6, v7, v24
.Ldec_sbox:
        inv_sbox        v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \
                        v13, v14, v15
        subs    rounds, rounds, #1
        b.cc    .Ldec_done

        dec_next_rk

        add_round_key   v0, v1, v6, v4, v2, v7, v3, v5

        inv_mix_cols    v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11, v12, \
                        v13, v14, v15

        b.ne    .Ldec_loop
        ldr     q24, ISRM0
        b       .Ldec_loop
.Ldec_done:
        ldr     q12, [bskey, #-16]              // last round key

        bitslice        v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11

        eor     v0.16b, v0.16b, v12.16b
        eor     v1.16b, v1.16b, v12.16b
        eor     v6.16b, v6.16b, v12.16b
        eor     v4.16b, v4.16b, v12.16b
        eor     v2.16b, v2.16b, v12.16b
        eor     v7.16b, v7.16b, v12.16b
        eor     v3.16b, v3.16b, v12.16b
        eor     v5.16b, v5.16b, v12.16b
        ret
SYM_FUNC_END(aesbs_decrypt8)

        /*
         * aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                   int blocks)
         * aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                   int blocks)
         */
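        /*
         * The __ecb_crypt helper below consumes the input in groups of up to
         * eight blocks.  For a final group of fewer than eight, x5 is set to
         * (1 << remaining blocks) before the count goes negative; the tbnz
         * tests on successive bits of x5 then skip the remaining loads and
         * stores once that many blocks have been handled.
         */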
        .macro  __ecb_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
        frame_push      5

        mov     x19, x0
        mov     x20, x1
        mov     x21, x2
        mov     x22, x3
        mov     x23, x4

99:     mov     x5, #1
        lsl     x5, x5, x23
        subs    w23, w23, #8
        csel    x23, x23, xzr, pl
        csel    x5, x5, xzr, mi

        ld1     {v0.16b}, [x20], #16
        tbnz    x5, #1, 0f
        ld1     {v1.16b}, [x20], #16
        tbnz    x5, #2, 0f
        ld1     {v2.16b}, [x20], #16
        tbnz    x5, #3, 0f
        ld1     {v3.16b}, [x20], #16
        tbnz    x5, #4, 0f
        ld1     {v4.16b}, [x20], #16
        tbnz    x5, #5, 0f
        ld1     {v5.16b}, [x20], #16
        tbnz    x5, #6, 0f
        ld1     {v6.16b}, [x20], #16
        tbnz    x5, #7, 0f
        ld1     {v7.16b}, [x20], #16

0:      mov     bskey, x21
        mov     rounds, x22
        bl      \do8

        st1     {\o0\().16b}, [x19], #16
        tbnz    x5, #1, 1f
        st1     {\o1\().16b}, [x19], #16
        tbnz    x5, #2, 1f
        st1     {\o2\().16b}, [x19], #16
        tbnz    x5, #3, 1f
        st1     {\o3\().16b}, [x19], #16
        tbnz    x5, #4, 1f
        st1     {\o4\().16b}, [x19], #16
        tbnz    x5, #5, 1f
        st1     {\o5\().16b}, [x19], #16
        tbnz    x5, #6, 1f
        st1     {\o6\().16b}, [x19], #16
        tbnz    x5, #7, 1f
        st1     {\o7\().16b}, [x19], #16

        cbz     x23, 1f
        b       99b

1:      frame_pop
        ret
        .endm

        .align  4
SYM_TYPED_FUNC_START(aesbs_ecb_encrypt)
        __ecb_crypt     aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
SYM_FUNC_END(aesbs_ecb_encrypt)

        .align  4
SYM_TYPED_FUNC_START(aesbs_ecb_decrypt)
        __ecb_crypt     aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
SYM_FUNC_END(aesbs_ecb_decrypt)

        /*
         * aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                   int blocks, u8 iv[])
         */
        .align  4
SYM_FUNC_START(aesbs_cbc_decrypt)
        frame_push      6

        mov     x19, x0
        mov     x20, x1
        mov     x21, x2
        mov     x22, x3
        mov     x23, x4
        mov     x24, x5

99:     mov     x6, #1
        lsl     x6, x6, x23
        subs    w23, w23, #8
        csel    x23, x23, xzr, pl
        csel    x6, x6, xzr, mi

        ld1     {v0.16b}, [x20], #16
        mov     v25.16b, v0.16b
        tbnz    x6, #1, 0f
        ld1     {v1.16b}, [x20], #16
        mov     v26.16b, v1.16b
        tbnz    x6, #2, 0f
        ld1     {v2.16b}, [x20], #16
        mov     v27.16b, v2.16b
        tbnz    x6, #3, 0f
        ld1     {v3.16b}, [x20], #16
        mov     v28.16b, v3.16b
        tbnz    x6, #4, 0f
        ld1     {v4.16b}, [x20], #16
        mov     v29.16b, v4.16b
        tbnz    x6, #5, 0f
        ld1     {v5.16b}, [x20], #16
        mov     v30.16b, v5.16b
        tbnz    x6, #6, 0f
        ld1     {v6.16b}, [x20], #16
        mov     v31.16b, v6.16b
        tbnz    x6, #7, 0f
        ld1     {v7.16b}, [x20]

0:      mov     bskey, x21
        mov     rounds, x22
        bl      aesbs_decrypt8

        ld1     {v24.16b}, [x24]                // load IV

        eor     v1.16b, v1.16b, v25.16b
        eor     v6.16b, v6.16b, v26.16b
        eor     v4.16b, v4.16b, v27.16b
        eor     v2.16b, v2.16b, v28.16b
        eor     v7.16b, v7.16b, v29.16b
        eor     v0.16b, v0.16b, v24.16b
        eor     v3.16b, v3.16b, v30.16b
        eor     v5.16b, v5.16b, v31.16b

        st1     {v0.16b}, [x19], #16
        mov     v24.16b, v25.16b
        tbnz    x6, #1, 1f
        st1     {v1.16b}, [x19], #16
        mov     v24.16b, v26.16b
        tbnz    x6, #2, 1f
        st1     {v6.16b}, [x19], #16
        mov     v24.16b, v27.16b
        tbnz    x6, #3, 1f
        st1     {v4.16b}, [x19], #16
        mov     v24.16b, v28.16b
        tbnz    x6, #4, 1f
        st1     {v2.16b}, [x19], #16
        mov     v24.16b, v29.16b
        tbnz    x6, #5, 1f
        st1     {v7.16b}, [x19], #16
        mov     v24.16b, v30.16b
        tbnz    x6, #6, 1f
        st1     {v3.16b}, [x19], #16
        mov     v24.16b, v31.16b
        tbnz    x6, #7, 1f
        ld1     {v24.16b}, [x20], #16
        st1     {v5.16b}, [x19], #16
1:      st1     {v24.16b}, [x24]                // store IV

        cbz     x23, 2f
        b       99b

2:      frame_pop
        ret
SYM_FUNC_END(aesbs_cbc_decrypt)

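        /*
         * next_tweak performs the usual XTS tweak update, i.e. multiplication
         * by x in GF(2^128): the 128-bit tweak is doubled one 64-bit half at
         * a time (the 'add'), and the bit shifted out of each half is fed
         * back into the other half, with the carry out of the top half
         * reduced using the polynomial constant 0x87.
         */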
        .macro  next_tweak, out, in, const, tmp
        sshr    \tmp\().2d, \in\().2d, #63
        and     \tmp\().16b, \tmp\().16b, \const\().16b
        add     \out\().2d, \in\().2d, \in\().2d
        ext     \tmp\().16b, \tmp\().16b, \tmp\().16b, #8
        eor     \out\().16b, \out\().16b, \tmp\().16b
        .endm

        /*
         * aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                   int blocks, u8 iv[])
         * aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
         *                   int blocks, u8 iv[])
         */
SYM_FUNC_START_LOCAL(__xts_crypt8)
        movi    v18.2s, #0x1
        movi    v19.2s, #0x87
        uzp1    v18.4s, v18.4s, v19.4s

        ld1     {v0.16b-v3.16b}, [x1], #64
        ld1     {v4.16b-v7.16b}, [x1], #64

        next_tweak      v26, v25, v18, v19
        next_tweak      v27, v26, v18, v19
        next_tweak      v28, v27, v18, v19
        next_tweak      v29, v28, v18, v19
        next_tweak      v30, v29, v18, v19
        next_tweak      v31, v30, v18, v19
        next_tweak      v16, v31, v18, v19
        next_tweak      v17, v16, v18, v19

        eor     v0.16b, v0.16b, v25.16b
        eor     v1.16b, v1.16b, v26.16b
        eor     v2.16b, v2.16b, v27.16b
        eor     v3.16b, v3.16b, v28.16b
        eor     v4.16b, v4.16b, v29.16b
        eor     v5.16b, v5.16b, v30.16b
        eor     v6.16b, v6.16b, v31.16b
        eor     v7.16b, v7.16b, v16.16b

        stp     q16, q17, [x6]

        mov     bskey, x2
        mov     rounds, x3
        br      x16
SYM_FUNC_END(__xts_crypt8)

        .macro  __xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
        frame_push      0, 32
        add     x6, sp, #.Lframe_local_offset

        ld1     {v25.16b}, [x5]

0:      adr     x16, \do8
        bl      __xts_crypt8

        eor     v16.16b, \o0\().16b, v25.16b
        eor     v17.16b, \o1\().16b, v26.16b
        eor     v18.16b, \o2\().16b, v27.16b
        eor     v19.16b, \o3\().16b, v28.16b

        ldp     q24, q25, [x6]

        eor     v20.16b, \o4\().16b, v29.16b
        eor     v21.16b, \o5\().16b, v30.16b
        eor     v22.16b, \o6\().16b, v31.16b
        eor     v23.16b, \o7\().16b, v24.16b

        st1     {v16.16b-v19.16b}, [x0], #64
        st1     {v20.16b-v23.16b}, [x0], #64

        subs    x4, x4, #8
        b.gt    0b

        st1     {v25.16b}, [x5]
        frame_pop
        ret
        .endm

SYM_TYPED_FUNC_START(aesbs_xts_encrypt)
        __xts_crypt     aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
SYM_FUNC_END(aesbs_xts_encrypt)

SYM_TYPED_FUNC_START(aesbs_xts_decrypt)
        __xts_crypt     aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
SYM_FUNC_END(aesbs_xts_decrypt)

        .macro  next_ctr, v
        mov     \v\().d[1], x8
        adds    x8, x8, #1
        mov     \v\().d[0], x7
        adc     x7, x7, xzr
        rev64   \v\().16b, \v\().16b
        .endm

        /*
         * aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
         *                   int rounds, int blocks, u8 iv[])
         */
SYM_FUNC_START(aesbs_ctr_encrypt)
        frame_push      0
        ldp     x7, x8, [x5]
        ld1     {v0.16b}, [x5]
CPU_LE( rev     x7, x7          )
CPU_LE( rev     x8, x8          )
        adds    x8, x8, #1
        adc     x7, x7, xzr

0:      next_ctr        v1
        next_ctr        v2
        next_ctr        v3
        next_ctr        v4
        next_ctr        v5
        next_ctr        v6
        next_ctr        v7

        mov     bskey, x2
        mov     rounds, x3
        bl      aesbs_encrypt8

        ld1     { v8.16b-v11.16b}, [x1], #64
        ld1     {v12.16b-v15.16b}, [x1], #64

        eor     v8.16b, v0.16b, v8.16b
        eor     v9.16b, v1.16b, v9.16b
        eor     v10.16b, v4.16b, v10.16b
        eor     v11.16b, v6.16b, v11.16b
        eor     v12.16b, v3.16b, v12.16b
        eor     v13.16b, v7.16b, v13.16b
        eor     v14.16b, v2.16b, v14.16b
        eor     v15.16b, v5.16b, v15.16b

        st1     { v8.16b-v11.16b}, [x0], #64
        st1     {v12.16b-v15.16b}, [x0], #64

        next_ctr        v0
        subs    x4, x4, #8
        b.gt    0b

        st1     {v0.16b}, [x5]
        frame_pop
        ret
SYM_FUNC_END(aesbs_ctr_encrypt)
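
/*
 * For reference, a rough sketch of how C glue code might drive
 * aesbs_ctr_encrypt() above (the real callers live in aes-neonbs-glue.c;
 * the helper name and the assumption that 'blocks' is a multiple of eight
 * here are purely illustrative):
 *
 *	#include <asm/neon.h>
 *
 *	asmlinkage void aesbs_ctr_encrypt(u8 out[], u8 const in[],
 *					  u8 const rk[], int rounds,
 *					  int blocks, u8 iv[]);
 *
 *	// hypothetical helper: encrypt 'blocks' (multiple of 8) CTR blocks
 *	static void ctr_crypt_blocks(u8 *dst, const u8 *src, const u8 *rk,
 *				     int rounds, int blocks, u8 iv[16])
 *	{
 *		kernel_neon_begin();	// NEON registers may only be used
 *		aesbs_ctr_encrypt(dst, src, rk, rounds, blocks, iv);
 *		kernel_neon_end();	// inside a begin/end section
 *	}
 */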