/* SPDX-License-Identifier: GPL-2.0-only */
/*
 * Bit sliced AES using NEON instructions
 *
 * Copyright (C) 2016 Linaro Ltd <ard.biesheuvel@linaro.org>
 */

/*
 * The algorithm implemented here is described in detail by the paper
 * 'Faster and Timing-Attack Resistant AES-GCM' by Emilia Kaesper and
 * Peter Schwabe (https://eprint.iacr.org/2009/129.pdf)
 *
 * This implementation is based primarily on the OpenSSL implementation
 * for 32-bit ARM written by Andy Polyakov <appro@openssl.org>
 */

#include <linux/linkage.h>
#include <asm/assembler.h>

	.text

	/*
	 * Register aliases used by the 8-block cipher cores. Every caller in
	 * this file loads both right before branching to aesbs_encrypt8 or
	 * aesbs_decrypt8, and both are modified by those routines.
	 */
	rounds		.req	x11
	bskey		.req	x12

	/*
	 * in_bs_ch - in-place XOR network over the eight operands b0..b7.
	 *
	 * Invoked by the sbox macro immediately before inv_gf256.
	 * NOTE(review): the name suggests the input basis change of the
	 * bit-sliced S-box described in the paper cited above - confirm.
	 */
	.macro		in_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
	eor		\b2, \b2, \b1
	eor		\b5, \b5, \b6
	eor		\b3, \b3, \b0
	eor		\b6, \b6, \b2
	eor		\b5, \b5, \b0
	eor		\b6, \b6, \b3
	eor		\b3, \b3, \b7
	eor		\b7, \b7, \b5
	eor		\b3, \b3, \b4
	eor		\b4, \b4, \b5
	eor		\b2, \b2, \b7
	eor		\b3, \b3, \b1
	eor		\b1, \b1, \b5
	.endm

	/*
	 * out_bs_ch - in-place XOR network over the eight operands b0..b7.
	 *
	 * Invoked by the sbox macro immediately after inv_gf256.
	 */
	.macro		out_bs_ch, b0, b1, b2, b3, b4, b5, b6, b7
	eor		\b0, \b0, \b6
	eor		\b1, \b1, \b4
	eor		\b4, \b4, \b6
	eor		\b2, \b2, \b0
	eor		\b6, \b6, \b1
	eor		\b1, \b1, \b5
	eor		\b5, \b5, \b3
	eor		\b3, \b3, \b7
	eor		\b7, \b7, \b5
	eor		\b2, \b2, \b5
	eor		\b4, \b4, \b7
	.endm

	/*
	 * inv_in_bs_ch - in-place XOR network used by inv_sbox ahead of
	 * inv_gf256. Mind the permuted parameter names in the prototype: the
	 * first positional argument is referred to as b6, the second as b1,
	 * and so on.
	 */
	.macro		inv_in_bs_ch, b6, b1, b2, b4, b7, b0, b3, b5
	eor		\b1, \b1, \b7
	eor		\b4, \b4, \b7
	eor		\b7, \b7, \b5
	eor		\b1, \b1, \b3
	eor		\b2, \b2, \b5
	eor		\b3, \b3, \b7
	eor		\b6, \b6, \b1
	eor		\b2, \b2, \b0
	eor		\b5, \b5, \b3
	eor		\b4, \b4, \b6
	eor		\b0, \b0, \b6
	eor		\b1, \b1, \b4
	.endm

	/*
	 * inv_out_bs_ch - in-place XOR network used by inv_sbox after
	 * inv_gf256. The parameter names are permuted in the prototype here
	 * as well.
	 */
	.macro		inv_out_bs_ch, b6, b5, b0, b3, b7, b1, b4, b2
	eor		\b1, \b1, \b5
	eor		\b2, \b2, \b7
	eor		\b3, \b3, \b1
	eor		\b4, \b4, \b5
	eor		\b7, \b7, \b5
	eor		\b3, \b3, \b4
	eor		\b5, \b5, \b0
	eor		\b3, \b3, \b7
	eor		\b6, \b6, \b2
	eor		\b2, \b2, \b1
	eor		\b6, \b6, \b3
	eor		\b3, \b3, \b0
	eor		\b5, \b5, \b6
	.endm

	/*
	 * mul_gf4 - combine the pair (x0, x1) with the pair (y0, y1) using
	 * AND/XOR only.
	 *
	 * The result replaces x0 and x1. y0 and y1 are only read. t0 and t1
	 * are scratch and are overwritten.
	 * NOTE(review): by its name this is a GF(2^2) multiplication of
	 * bit-sliced operands - confirm against the paper.
	 */
	.macro		mul_gf4, x0, x1, y0, y1, t0, t1
	eor		\t0, \y0, \y1
	and		\t0, \t0, \x0
	eor		\x0, \x0, \x1
	and		\t1, \x1, \y0
	and		\x0, \x0, \y1
	eor		\x1, \t1, \t0
	eor		\x0, \x0, \t1
	.endm

	/*
	 * mul_gf4_n_gf4 - two interleaved AND/XOR products: (x0, x1) with
	 * (y0, y1) using scratch t0, and (x2, x3) with (y2, y3) using
	 * scratch t1.
	 *
	 * The results replace x0..x3. The y operands are only read.
	 */
	.macro		mul_gf4_n_gf4, x0, x1, y0, y1, t0, x2, x3, y2, y3, t1
	eor		\t0, \y0, \y1
	eor		\t1, \y2, \y3
	and		\t0, \t0, \x0
	and		\t1, \t1, \x2
	eor		\x0, \x0, \x1
	eor		\x2, \x2, \x3
	and		\x1, \x1, \y0
	and		\x3, \x3, \y2
	and		\x0, \x0, \y1
	and		\x2, \x2, \y3
	eor		\x1, \x1, \x0
	eor		\x2, \x2, \x3
	eor		\x0, \x0, \t0
	eor		\x3, \x3, \t1
	.endm

	/*
	 * mul_gf16_2 - process x0..x3 and then x4..x7 against the common
	 * operand y0..y3, built from mul_gf4 and mul_gf4_n_gf4.
	 *
	 * x0..x7 are updated in place and t0..t3 are scratch. y0 and y1 are
	 * XORed with y2 and y3 before the first mul_gf4_n_gf4 and XORed with
	 * them again after the second one, which restores their entry values.
	 */
	.macro		mul_gf16_2, x0, x1, x2, x3, x4, x5, x6, x7, \
				    y0, y1, y2, y3, t0, t1, t2, t3
	eor		\t0, \x0, \x2
	eor		\t1, \x1, \x3
	mul_gf4		\x0, \x1, \y0, \y1, \t2, \t3
	eor		\y0, \y0, \y2
	eor		\y1, \y1, \y3
	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x2, \x3, \y2, \y3, \t2
	eor		\x0, \x0, \t0
	eor		\x2, \x2, \t0
	eor		\x1, \x1, \t1
	eor		\x3, \x3, \t1
	eor		\t0, \x4, \x6
	eor		\t1, \x5, \x7
	mul_gf4_n_gf4	\t0, \t1, \y0, \y1, \t3, \x6, \x7, \y2, \y3, \t2
	eor		\y0, \y0, \y2
	eor		\y1, \y1, \y3
	mul_gf4		\x4, \x5, \y0, \y1, \t2, \t3
	eor		\x4, \x4, \t0
	eor		\x6, \x6, \t0
	eor		\x5, \x5, \t1
	eor		\x7, \x7, \t1
	.endm

	/*
	 * inv_gf256 - nonlinear core shared by sbox and inv_sbox.
	 *
	 * x0..x7 are transformed in place. t0..t3 and s0..s3 are scratch and
	 * hold no meaningful value on exit. The first part derives four
	 * intermediate values from the inputs, which are then handed to
	 * mul_gf16_2 as its y0..y3 operand.
	 * NOTE(review): by its name this is the GF(2^8) inversion step of the
	 * AES S-box - confirm against the paper.
	 */
	.macro		inv_gf256, x0, x1, x2, x3, x4, x5, x6, x7, \
				   t0, t1, t2, t3, s0, s1, s2, s3
	eor		\t3, \x4, \x6
	eor		\t0, \x5, \x7
	eor		\t1, \x1, \x3
	eor		\s1, \x7, \x6
	eor		\s0, \x0, \x2
	eor		\s3, \t3, \t0
	orr		\t2, \t0, \t1
	and		\s2, \t3, \s0
	orr		\t3, \t3, \s0
	eor		\s0, \s0, \t1
	and		\t0, \t0, \t1
	eor		\t1, \x3, \x2
	and		\s3, \s3, \s0
	and		\s1, \s1, \t1
	eor		\t1, \x4, \x5
	eor		\s0, \x1, \x0
	eor		\t3, \t3, \s1
	eor		\t2, \t2, \s1
	and		\s1, \t1, \s0
	orr		\t1, \t1, \s0
	eor		\t3, \t3, \s3
	eor		\t0, \t0, \s1
	eor		\t2, \t2, \s2
	eor		\t1, \t1, \s3
	eor		\t0, \t0, \s2
	and		\s0, \x7, \x3
	eor		\t1, \t1, \s2
	and		\s1, \x6, \x2
	and		\s2, \x5, \x1
	orr		\s3, \x4, \x0
	eor		\t3, \t3, \s0
	eor		\t1, \t1, \s2
	eor		\s0, \t0, \s3
	eor		\t2, \t2, \s1
	and		\s2, \t3, \t1
	eor		\s1, \t2, \s2
	eor		\s3, \s0, \s2
	bsl		\s1, \t1, \s0
	not		\t0, \s0
	bsl		\s0, \s1, \s3
	bsl		\t0, \s1, \s3
	bsl		\s3, \t3, \t2
	eor		\t3, \t3, \t2
	and		\s2, \s0, \s3
	eor		\t1, \t1, \t0
	eor		\s2, \s2, \t3
	mul_gf16_2	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
			\s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
	.endm

	/*
	 * sbox - run in_bs_ch, inv_gf256 and out_bs_ch over the eight vector
	 * registers b0..b7, with t0..t3 and s0..s3 as scratch.
	 *
	 * Arguments are bare register names (v0 etc.); the .16b arrangement
	 * is appended here. The result is left in a permuted register order:
	 * aesbs_encrypt8 goes on to use b0, b1, b4, b6, b3, b7, b2, b5.
	 */
	.macro		sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
			      t0, t1, t2, t3, s0, s1, s2, s3
	in_bs_ch	\b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
			\b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b
	inv_gf256	\b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b, \
			\b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
			\t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
			\s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b
	out_bs_ch	\b7\().16b, \b1\().16b, \b4\().16b, \b2\().16b, \
			\b6\().16b, \b5\().16b, \b0\().16b, \b3\().16b
	.endm

	/*
	 * inv_sbox - decryption counterpart of sbox: inv_in_bs_ch, inv_gf256
	 * and inv_out_bs_ch over b0..b7.
	 *
	 * The result is left in a permuted register order: aesbs_decrypt8
	 * goes on to use b0, b1, b6, b4, b2, b7, b3, b5.
	 */
	.macro		inv_sbox, b0, b1, b2, b3, b4, b5, b6, b7, \
				  t0, t1, t2, t3, s0, s1, s2, s3
	inv_in_bs_ch	\b0\().16b, \b1\().16b, \b2\().16b, \b3\().16b, \
			\b4\().16b, \b5\().16b, \b6\().16b, \b7\().16b
	inv_gf256	\b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b, \
			\b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
			\t0\().16b, \t1\().16b, \t2\().16b, \t3\().16b, \
			\s0\().16b, \s1\().16b, \s2\().16b, \s3\().16b
	inv_out_bs_ch	\b3\().16b, \b7\().16b, \b0\().16b, \b4\().16b, \
			\b5\().16b, \b1\().16b, \b2\().16b, \b6\().16b
	.endm

	/*
	 * enc_next_rk - load the next 128-byte bit-sliced round key into
	 * q16..q23. The first load post-increments bskey by 128, so the
	 * remaining three use negative offsets from the updated pointer.
	 */
	.macro		enc_next_rk
	ldp		q16, q17, [bskey], #128
	ldp		q18, q19, [bskey, #-96]
	ldp		q20, q21, [bskey, #-64]
	ldp		q22, q23, [bskey, #-32]
	.endm

	/*
	 * dec_next_rk - step bskey back by 128 bytes (pre-indexed writeback)
	 * and load the round key found there into q16..q23.
	 */
	.macro		dec_next_rk
	ldp		q16, q17, [bskey, #-128]!
	ldp		q18, q19, [bskey, #32]
	ldp		q20, q21, [bskey, #64]
	ldp		q22, q23, [bskey, #96]
	.endm

	/*
	 * add_round_key - XOR x0..x7 with v16..v23 respectively, i.e. with
	 * the round key fetched by enc_next_rk or dec_next_rk.
	 */
	.macro		add_round_key, x0, x1, x2, x3, x4, x5, x6, x7
	eor		\x0\().16b, \x0\().16b, v16.16b
	eor		\x1\().16b, \x1\().16b, v17.16b
	eor		\x2\().16b, \x2\().16b, v18.16b
	eor		\x3\().16b, \x3\().16b, v19.16b
	eor		\x4\().16b, \x4\().16b, v20.16b
	eor		\x5\().16b, \x5\().16b, v21.16b
	eor		\x6\().16b, \x6\().16b, v22.16b
	eor		\x7\().16b, \x7\().16b, v23.16b
	.endm

	/*
	 * shift_rows - apply the same byte permutation to each of x0..x7,
	 * using the table lookup index vector supplied in mask.
	 */
	.macro		shift_rows, x0, x1, x2, x3, x4, x5, x6, x7, mask
	tbl		\x0\().16b, {\x0\().16b}, \mask\().16b
	tbl		\x1\().16b, {\x1\().16b}, \mask\().16b
	tbl		\x2\().16b, {\x2\().16b}, \mask\().16b
	tbl		\x3\().16b, {\x3\().16b}, \mask\().16b
	tbl		\x4\().16b, {\x4\().16b}, \mask\().16b
	tbl		\x5\().16b, {\x5\().16b}, \mask\().16b
	tbl		\x6\().16b, {\x6\().16b}, \mask\().16b
	tbl		\x7\().16b, {\x7\().16b}, \mask\().16b
	.endm

	/*
	 * mix_cols - column mixing step over the eight vectors x0..x7, with
	 * t0..t7 as scratch.
	 *
	 * Each ext of a register with itself is a byte rotation of that
	 * vector (by 12 or by 8 bytes). The optional last argument inv picks
	 * the final register assignment: when it is left blank the variant
	 * used by aesbs_encrypt8 is emitted, otherwise the variant that
	 * inv_mix_cols relies on.
	 */
	.macro		mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
				  t0, t1, t2, t3, t4, t5, t6, t7, inv
	ext		\t0\().16b, \x0\().16b, \x0\().16b, #12
	ext		\t1\().16b, \x1\().16b, \x1\().16b, #12
	eor		\x0\().16b, \x0\().16b, \t0\().16b
	ext		\t2\().16b, \x2\().16b, \x2\().16b, #12
	eor		\x1\().16b, \x1\().16b, \t1\().16b
	ext		\t3\().16b, \x3\().16b, \x3\().16b, #12
	eor		\x2\().16b, \x2\().16b, \t2\().16b
	ext		\t4\().16b, \x4\().16b, \x4\().16b, #12
	eor		\x3\().16b, \x3\().16b, \t3\().16b
	ext		\t5\().16b, \x5\().16b, \x5\().16b, #12
	eor		\x4\().16b, \x4\().16b, \t4\().16b
	ext		\t6\().16b, \x6\().16b, \x6\().16b, #12
	eor		\x5\().16b, \x5\().16b, \t5\().16b
	ext		\t7\().16b, \x7\().16b, \x7\().16b, #12
	eor		\x6\().16b, \x6\().16b, \t6\().16b
	eor		\t1\().16b, \t1\().16b, \x0\().16b
	eor		\x7\().16b, \x7\().16b, \t7\().16b
	ext		\x0\().16b, \x0\().16b, \x0\().16b, #8
	eor		\t2\().16b, \t2\().16b, \x1\().16b
	eor		\t0\().16b, \t0\().16b, \x7\().16b
	eor		\t1\().16b, \t1\().16b, \x7\().16b
	ext		\x1\().16b, \x1\().16b, \x1\().16b, #8
	eor		\t5\().16b, \t5\().16b, \x4\().16b
	eor		\x0\().16b, \x0\().16b, \t0\().16b
	eor		\t6\().16b, \t6\().16b, \x5\().16b
	eor		\x1\().16b, \x1\().16b, \t1\().16b
	ext		\t0\().16b, \x4\().16b, \x4\().16b, #8
	eor		\t4\().16b, \t4\().16b, \x3\().16b
	ext		\t1\().16b, \x5\().16b, \x5\().16b, #8
	eor		\t7\().16b, \t7\().16b, \x6\().16b
	ext		\x4\().16b, \x3\().16b, \x3\().16b, #8
	eor		\t3\().16b, \t3\().16b, \x2\().16b
	ext		\x5\().16b, \x7\().16b, \x7\().16b, #8
	eor		\t4\().16b, \t4\().16b, \x7\().16b
	ext		\x3\().16b, \x6\().16b, \x6\().16b, #8
	eor		\t3\().16b, \t3\().16b, \x7\().16b
	ext		\x6\().16b, \x2\().16b, \x2\().16b, #8
	eor		\x7\().16b, \t1\().16b, \t5\().16b
	.ifb		\inv
	eor		\x2\().16b, \t0\().16b, \t4\().16b
	eor		\x4\().16b, \x4\().16b, \t3\().16b
	eor		\x5\().16b, \x5\().16b, \t7\().16b
	eor		\x3\().16b, \x3\().16b, \t6\().16b
	eor		\x6\().16b, \x6\().16b, \t2\().16b
	.else
	eor		\t3\().16b, \t3\().16b, \x4\().16b
	eor		\x5\().16b, \x5\().16b, \t7\().16b
	eor		\x2\().16b, \x3\().16b, \t6\().16b
	eor		\x3\().16b, \t0\().16b, \t4\().16b
	eor		\x4\().16b, \x6\().16b, \t2\().16b
	mov		\x6\().16b, \t3\().16b
	.endif
	.endm

	/*
	 * inv_mix_cols - inverse column mixing over x0..x7, t0..t7 scratch.
	 *
	 * A preprocessing pass XORs the inputs with 8-byte rotated copies of
	 * themselves, after which the forward mix_cols is reused with its inv
	 * argument set.
	 */
	.macro		inv_mix_cols, x0, x1, x2, x3, x4, x5, x6, x7, \
				      t0, t1, t2, t3, t4, t5, t6, t7
	ext		\t0\().16b, \x0\().16b, \x0\().16b, #8
	ext		\t6\().16b, \x6\().16b, \x6\().16b, #8
	ext		\t7\().16b, \x7\().16b, \x7\().16b, #8
	eor		\t0\().16b, \t0\().16b, \x0\().16b
	ext		\t1\().16b, \x1\().16b, \x1\().16b, #8
	eor		\t6\().16b, \t6\().16b, \x6\().16b
	ext		\t2\().16b, \x2\().16b, \x2\().16b, #8
	eor		\t7\().16b, \t7\().16b, \x7\().16b
	ext		\t3\().16b, \x3\().16b, \x3\().16b, #8
	eor		\t1\().16b, \t1\().16b, \x1\().16b
	ext		\t4\().16b, \x4\().16b, \x4\().16b, #8
	eor		\t2\().16b, \t2\().16b, \x2\().16b
	ext		\t5\().16b, \x5\().16b, \x5\().16b, #8
	eor		\t3\().16b, \t3\().16b, \x3\().16b
	eor		\t4\().16b, \t4\().16b, \x4\().16b
	eor		\t5\().16b, \t5\().16b, \x5\().16b
	eor		\x0\().16b, \x0\().16b, \t6\().16b
	eor		\x1\().16b, \x1\().16b, \t6\().16b
	eor		\x2\().16b, \x2\().16b, \t0\().16b
	eor		\x4\().16b, \x4\().16b, \t2\().16b
	eor		\x3\().16b, \x3\().16b, \t1\().16b
	eor		\x1\().16b, \x1\().16b, \t7\().16b
	eor		\x2\().16b, \x2\().16b, \t7\().16b
	eor		\x4\().16b, \x4\().16b, \t6\().16b
	eor		\x5\().16b, \x5\().16b, \t3\().16b
	eor		\x3\().16b, \x3\().16b, \t6\().16b
	eor		\x6\().16b, \x6\().16b, \t4\().16b
	eor		\x4\().16b, \x4\().16b, \t7\().16b
	eor		\x5\().16b, \x5\().16b, \t7\().16b
	eor		\x7\().16b, \x7\().16b, \t5\().16b
	mix_cols	\x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \
			\t0, \t1, \t2, \t3, \t4, \t5, \t6, \t7, 1
	.endm

	/*
	 * swapmove_2x - masked bit exchange on two register pairs at once.
	 *
	 * For each pair (a, b):
	 *	t  = ((b >> n) ^ a) & mask
	 *	a ^= t
	 *	b ^= t << n
	 * The shifts operate on 64-bit lanes. t0 and t1 are scratch.
	 */
	.macro		swapmove_2x, a0, b0, a1, b1, n, mask, t0, t1
	ushr		\t0\().2d, \b0\().2d, #\n
	ushr		\t1\().2d, \b1\().2d, #\n
	eor		\t0\().16b, \t0\().16b, \a0\().16b
	eor		\t1\().16b, \t1\().16b, \a1\().16b
	and		\t0\().16b, \t0\().16b, \mask\().16b
	and		\t1\().16b, \t1\().16b, \mask\().16b
	eor		\a0\().16b, \a0\().16b, \t0\().16b
	shl		\t0\().2d, \t0\().2d, #\n
	eor		\a1\().16b, \a1\().16b, \t1\().16b
	shl		\t1\().2d, \t1\().2d, #\n
	eor		\b0\().16b, \b0\().16b, \t0\().16b
	eor		\b1\().16b, \b1\().16b, \t1\().16b
	.endm

	/*
	 * bitslice - three rounds of swapmove_2x over eight registers, using
	 * shift distances 1, 2 and 4 with byte masks 0x55, 0x33 and 0x0f.
	 *
	 * Note that the prototype names its register arguments in descending
	 * order (x7 first, x0 last). t0 and t1 hold the masks, t2 and t3 are
	 * scratch for swapmove_2x.
	 */
	.macro		bitslice, x7, x6, x5, x4, x3, x2, x1, x0, t0, t1, t2, t3
	movi		\t0\().16b, #0x55
	movi		\t1\().16b, #0x33
	swapmove_2x	\x0, \x1, \x2, \x3, 1, \t0, \t2, \t3
	swapmove_2x	\x4, \x5, \x6, \x7, 1, \t0, \t2, \t3
	movi		\t0\().16b, #0x0f		// t0 is reused for the third mask
	swapmove_2x	\x0, \x2, \x1, \x3, 2, \t1, \t2, \t3
	swapmove_2x	\x4, \x6, \x5, \x7, 2, \t1, \t2, \t3
	swapmove_2x	\x0, \x4, \x1, \x5, 4, \t0, \t2, \t3
	swapmove_2x	\x2, \x6, \x3, \x7, 4, \t0, \t2, \t3
	.endm


	/*
	 * Byte permutation vectors, loaded as literals and used as tbl index
	 * operands. M0 is used by aesbs_convert_key; M0SR and SR by
	 * aesbs_encrypt8.
	 */
	.align		6
M0:	.octa		0x0004080c0105090d02060a0e03070b0f

M0SR:	.octa		0x0004080c05090d010a0e02060f03070b
SR:	.octa		0x0f0e0d0c0a09080b0504070600030201
SRM0:	.octa		0x01060b0c0207080d0304090e00050a0f

	/*
	 * Permutation vectors for the decryption direction: M0ISR and ISR are
	 * loaded on entry to aesbs_decrypt8, ISRM0 ahead of its last round.
	 * (SRM0 above plays the same part in aesbs_encrypt8.)
	 */
M0ISR:	.octa		0x0004080c0d0105090a0e0206070b0f03
ISR:	.octa		0x0f0e0d0c080b0a090504070602010003
ISRM0:	.octa		0x0306090c00070a0d01040b0e0205080f

	/*
	 * void aesbs_convert_key(u8 out[], u32 const rk[], int rounds)
	 *
	 * Expand a conventional key schedule into the layout consumed by
	 * aesbs_encrypt8 and aesbs_decrypt8:
	 *
	 *   - the round 0 key is copied unchanged (16 bytes);
	 *   - each of the next (rounds - 1) keys is permuted through M0 and
	 *     expanded into eight 16-byte masks, one per bit position
	 *     (128 bytes per key). The masks for bits 0, 1, 5 and 6 are stored
	 *     inverted;
	 *   - the last key is XORed with 0x63 in every byte and stored as 16
	 *     bytes.
	 *
	 * In:  x0 = out, x1 = rk, x2 = rounds
	 * Clobbers x0-x2, v0-v17 and the condition flags.
	 */
ENTRY(aesbs_convert_key)
	ld1		{v7.4s}, [x1], #16		// load round 0 key
	ld1		{v17.4s}, [x1], #16		// load round 1 key

	movi		v8.16b, #0x01			// one mask per bit position
	movi		v9.16b, #0x02
	movi		v10.16b, #0x04
	movi		v11.16b, #0x08
	movi		v12.16b, #0x10
	movi		v13.16b, #0x20
	movi		v14.16b, #0x40
	movi		v15.16b, #0x80
	ldr		q16, M0

	sub		x2, x2, #1			// rounds - 1 keys get expanded
	str		q7, [x0], #16			// round 0 key is stored as is

.Lkey_loop:
	tbl		v7.16b, {v17.16b}, v16.16b	// permute the current key
	ld1		{v17.4s}, [x1], #16		// prefetch the following key

	cmtst		v0.16b, v7.16b, v8.16b		// byte := 0xff where the bit is set
	cmtst		v1.16b, v7.16b, v9.16b
	cmtst		v2.16b, v7.16b, v10.16b
	cmtst		v3.16b, v7.16b, v11.16b
	cmtst		v4.16b, v7.16b, v12.16b
	cmtst		v5.16b, v7.16b, v13.16b
	cmtst		v6.16b, v7.16b, v14.16b
	cmtst		v7.16b, v7.16b, v15.16b
	not		v0.16b, v0.16b			// invert bits 0, 1, 5 and 6
	not		v1.16b, v1.16b
	not		v5.16b, v5.16b
	not		v6.16b, v6.16b

	subs		x2, x2, #1
	stp		q0, q1, [x0], #128
	stp		q2, q3, [x0, #-96]
	stp		q4, q5, [x0, #-64]
	stp		q6, q7, [x0, #-32]
	b.ne		.Lkey_loop

	movi		v7.16b, #0x63			// compose .L63
	eor		v17.16b, v17.16b, v7.16b	// v17 holds the final round key
	str		q17, [x0]
	ret
ENDPROC(aesbs_convert_key)

	/*
	 * aesbs_encrypt8 - encrypt the eight blocks held in v0-v7.
	 *
	 * In:  v0-v7 = input blocks
	 *      bskey = key schedule produced by aesbs_convert_key
	 *      rounds = number of rounds
	 * Out: the results end up in v0, v1, v4, v6, v3, v7, v2, v5 (that is
	 *      the order for blocks 0..7)
	 *
	 * Uses v8-v15 as scratch, v16-v23 for the current round key and v24
	 * for the row permutation. bskey, rounds and the flags are modified.
	 */
	.align		4
aesbs_encrypt8:
	ldr		q9, [bskey], #16		// round 0 key
	ldr		q8, M0SR
	ldr		q24, SR

	eor		v10.16b, v0.16b, v9.16b		// xor with round0 key
	eor		v11.16b, v1.16b, v9.16b
	tbl		v0.16b, {v10.16b}, v8.16b	// then permute through M0SR
	eor		v12.16b, v2.16b, v9.16b
	tbl		v1.16b, {v11.16b}, v8.16b
	eor		v13.16b, v3.16b, v9.16b
	tbl		v2.16b, {v12.16b}, v8.16b
	eor		v14.16b, v4.16b, v9.16b
	tbl		v3.16b, {v13.16b}, v8.16b
	eor		v15.16b, v5.16b, v9.16b
	tbl		v4.16b, {v14.16b}, v8.16b
	eor		v10.16b, v6.16b, v9.16b
	tbl		v5.16b, {v15.16b}, v8.16b
	eor		v11.16b, v7.16b, v9.16b
	tbl		v6.16b, {v10.16b}, v8.16b
	tbl		v7.16b, {v11.16b}, v8.16b

	bitslice	v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11

	sub		rounds, rounds, #1
	b		.Lenc_sbox			// first pass skips shift_rows

.Lenc_loop:
	shift_rows	v0, v1, v2, v3, v4, v5, v6, v7, v24
.Lenc_sbox:
	sbox		v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \
								v13, v14, v15
	subs		rounds, rounds, #1
	b.cc		.Lenc_done			// borrow: that was the last sbox

	enc_next_rk

	mix_cols	v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11, v12, \
								v13, v14, v15

	add_round_key	v0, v1, v2, v3, v4, v5, v6, v7

	b.ne		.Lenc_loop			// flags still from the subs above
	ldr		q24, SRM0			// one pass left: switch to SRM0
	b		.Lenc_loop

.Lenc_done:
	ldr		q12, [bskey]			// last round key

	bitslice	v0, v1, v4, v6, v3, v7, v2, v5, v8, v9, v10, v11

	eor		v0.16b, v0.16b, v12.16b
	eor		v1.16b, v1.16b, v12.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v6.16b, v6.16b, v12.16b
	eor		v3.16b, v3.16b, v12.16b
	eor		v7.16b, v7.16b, v12.16b
	eor		v2.16b, v2.16b, v12.16b
	eor		v5.16b, v5.16b, v12.16b
	ret
ENDPROC(aesbs_encrypt8)

	/*
	 * aesbs_decrypt8 - decrypt the eight blocks held in v0-v7.
	 *
	 * In:  v0-v7 = input blocks
	 *      bskey = key schedule produced by aesbs_convert_key
	 *      rounds = number of rounds
	 * Out: the results end up in v0, v1, v6, v4, v2, v7, v3, v5 (that is
	 *      the order for blocks 0..7)
	 *
	 * The key schedule is walked backwards, starting from its final 16
	 * bytes. Uses x9 and v8-v15 as scratch, v16-v23 for the current round
	 * key and v24 for the row permutation. bskey, rounds and the flags
	 * are modified.
	 */
	.align		4
aesbs_decrypt8:
	lsl		x9, rounds, #7			// rounds * 128
	add		bskey, bskey, x9

	ldr		q9, [bskey, #-112]!		// round 0 key
	ldr		q8, M0ISR
	ldr		q24, ISR

	eor		v10.16b, v0.16b, v9.16b		// xor with round0 key
	eor		v11.16b, v1.16b, v9.16b
	tbl		v0.16b, {v10.16b}, v8.16b	// then permute through M0ISR
	eor		v12.16b, v2.16b, v9.16b
	tbl		v1.16b, {v11.16b}, v8.16b
	eor		v13.16b, v3.16b, v9.16b
	tbl		v2.16b, {v12.16b}, v8.16b
	eor		v14.16b, v4.16b, v9.16b
	tbl		v3.16b, {v13.16b}, v8.16b
	eor		v15.16b, v5.16b, v9.16b
	tbl		v4.16b, {v14.16b}, v8.16b
	eor		v10.16b, v6.16b, v9.16b
	tbl		v5.16b, {v15.16b}, v8.16b
	eor		v11.16b, v7.16b, v9.16b
	tbl		v6.16b, {v10.16b}, v8.16b
	tbl		v7.16b, {v11.16b}, v8.16b

	bitslice	v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11

	sub		rounds, rounds, #1
	b		.Ldec_sbox			// first pass skips shift_rows

.Ldec_loop:
	shift_rows	v0, v1, v2, v3, v4, v5, v6, v7, v24
.Ldec_sbox:
	inv_sbox	v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, \
								v13, v14, v15
	subs		rounds, rounds, #1
	b.cc		.Ldec_done			// borrow: that was the last sbox

	dec_next_rk

	add_round_key	v0, v1, v6, v4, v2, v7, v3, v5

	inv_mix_cols	v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11, v12, \
								v13, v14, v15

	b.ne		.Ldec_loop			// flags still from the subs above
	ldr		q24, ISRM0			// one pass left: switch to ISRM0
	b		.Ldec_loop
.Ldec_done:
	ldr		q12, [bskey, #-16]		// last round key

	bitslice	v0, v1, v6, v4, v2, v7, v3, v5, v8, v9, v10, v11

	eor		v0.16b, v0.16b, v12.16b
	eor		v1.16b, v1.16b, v12.16b
	eor		v6.16b, v6.16b, v12.16b
	eor		v4.16b, v4.16b, v12.16b
	eor		v2.16b, v2.16b, v12.16b
	eor		v7.16b, v7.16b, v12.16b
	eor		v3.16b, v3.16b, v12.16b
	eor		v5.16b, v5.16b, v12.16b
	ret
ENDPROC(aesbs_decrypt8)

	/*
	 * aesbs_ecb_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks)
	 * aesbs_ecb_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks)
	 *
	 * Common body, instantiated with the cipher core to call (do8) and the
	 * registers in which that core leaves blocks 0..7 (o0..o7).
	 *
	 * The arguments are kept in x19-x23 across the calls. Per iteration,
	 * x5 = 1 << blocks is computed and then zeroed unless fewer than eight
	 * blocks remain, so bit k of x5 (1 <= k <= 7) is set exactly when k
	 * blocks are left; the tbnz chains use it to stop loading and storing
	 * early. x23 is reduced by eight, saturating at zero.
	 *
	 * NOTE(review): frame_push, frame_pop and cond_yield_neon come from
	 * asm/assembler.h; presumably frame_push 5 preserves x19-x23 - confirm.
	 */
	.macro		__ecb_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
	frame_push	5

	mov		x19, x0				// out
	mov		x20, x1				// in
	mov		x21, x2				// rk
	mov		x22, x3				// rounds
	mov		x23, x4				// blocks

99:	mov		x5, #1
	lsl		x5, x5, x23
	subs		w23, w23, #8
	csel		x23, x23, xzr, pl		// blocks left after this pass
	csel		x5, x5, xzr, mi			// keep the mask for a short pass only

	ld1		{v0.16b}, [x20], #16
	tbnz		x5, #1, 0f
	ld1		{v1.16b}, [x20], #16
	tbnz		x5, #2, 0f
	ld1		{v2.16b}, [x20], #16
	tbnz		x5, #3, 0f
	ld1		{v3.16b}, [x20], #16
	tbnz		x5, #4, 0f
	ld1		{v4.16b}, [x20], #16
	tbnz		x5, #5, 0f
	ld1		{v5.16b}, [x20], #16
	tbnz		x5, #6, 0f
	ld1		{v6.16b}, [x20], #16
	tbnz		x5, #7, 0f
	ld1		{v7.16b}, [x20], #16

0:	mov		bskey, x21
	mov		rounds, x22
	bl		\do8

	st1		{\o0\().16b}, [x19], #16
	tbnz		x5, #1, 1f
	st1		{\o1\().16b}, [x19], #16
	tbnz		x5, #2, 1f
	st1		{\o2\().16b}, [x19], #16
	tbnz		x5, #3, 1f
	st1		{\o3\().16b}, [x19], #16
	tbnz		x5, #4, 1f
	st1		{\o4\().16b}, [x19], #16
	tbnz		x5, #5, 1f
	st1		{\o5\().16b}, [x19], #16
	tbnz		x5, #6, 1f
	st1		{\o6\().16b}, [x19], #16
	tbnz		x5, #7, 1f
	st1		{\o7\().16b}, [x19], #16

	cbz		x23, 1f
	cond_yield_neon
	b		99b

1:	frame_pop
	ret
	.endm

	.align		4
ENTRY(aesbs_ecb_encrypt)
	__ecb_crypt	aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
ENDPROC(aesbs_ecb_encrypt)

	.align		4
ENTRY(aesbs_ecb_decrypt)
	__ecb_crypt	aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
ENDPROC(aesbs_ecb_decrypt)

	/*
	 * aesbs_cbc_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks, u8 iv[])
	 *
	 * The arguments are kept in x19-x24. x6 carries the same "fewer than
	 * eight blocks left" bit mask as x5 does in __ecb_crypt.
	 *
	 * The first seven input blocks of a pass are copied to v25-v31 while
	 * they are loaded, because each is needed again as the chaining value
	 * of the block that follows it. The eighth input block is loaded
	 * without advancing x20 and is read from memory a second time after
	 * decryption, at which point x20 is advanced; it becomes the IV of
	 * the next pass.
	 */
	.align		4
ENTRY(aesbs_cbc_decrypt)
	frame_push	6

	mov		x19, x0				// out
	mov		x20, x1				// in
	mov		x21, x2				// rk
	mov		x22, x3				// rounds
	mov		x23, x4				// blocks
	mov		x24, x5				// iv

99:	mov		x6, #1
	lsl		x6, x6, x23
	subs		w23, w23, #8
	csel		x23, x23, xzr, pl		// blocks left after this pass
	csel		x6, x6, xzr, mi			// keep the mask for a short pass only

	ld1		{v0.16b}, [x20], #16
	mov		v25.16b, v0.16b
	tbnz		x6, #1, 0f
	ld1		{v1.16b}, [x20], #16
	mov		v26.16b, v1.16b
	tbnz		x6, #2, 0f
	ld1		{v2.16b}, [x20], #16
	mov		v27.16b, v2.16b
	tbnz		x6, #3, 0f
	ld1		{v3.16b}, [x20], #16
	mov		v28.16b, v3.16b
	tbnz		x6, #4, 0f
	ld1		{v4.16b}, [x20], #16
	mov		v29.16b, v4.16b
	tbnz		x6, #5, 0f
	ld1		{v5.16b}, [x20], #16
	mov		v30.16b, v5.16b
	tbnz		x6, #6, 0f
	ld1		{v6.16b}, [x20], #16
	mov		v31.16b, v6.16b
	tbnz		x6, #7, 0f
	ld1		{v7.16b}, [x20]			// no writeback, see above

0:	mov		bskey, x21
	mov		rounds, x22
	bl		aesbs_decrypt8

	ld1		{v24.16b}, [x24]		// load IV

	eor		v1.16b, v1.16b, v25.16b		// block k ^= input block k - 1
	eor		v6.16b, v6.16b, v26.16b
	eor		v4.16b, v4.16b, v27.16b
	eor		v2.16b, v2.16b, v28.16b
	eor		v7.16b, v7.16b, v29.16b
	eor		v0.16b, v0.16b, v24.16b		// block 0 ^= IV
	eor		v3.16b, v3.16b, v30.16b
	eor		v5.16b, v5.16b, v31.16b

	/* v24 tracks the last input block consumed, i.e. the next IV */
	st1		{v0.16b}, [x19], #16
	mov		v24.16b, v25.16b
	tbnz		x6, #1, 1f
	st1		{v1.16b}, [x19], #16
	mov		v24.16b, v26.16b
	tbnz		x6, #2, 1f
	st1		{v6.16b}, [x19], #16
	mov		v24.16b, v27.16b
	tbnz		x6, #3, 1f
	st1		{v4.16b}, [x19], #16
	mov		v24.16b, v28.16b
	tbnz		x6, #4, 1f
	st1		{v2.16b}, [x19], #16
	mov		v24.16b, v29.16b
	tbnz		x6, #5, 1f
	st1		{v7.16b}, [x19], #16
	mov		v24.16b, v30.16b
	tbnz		x6, #6, 1f
	st1		{v3.16b}, [x19], #16
	mov		v24.16b, v31.16b
	tbnz		x6, #7, 1f
	ld1		{v24.16b}, [x20], #16		// reread the eighth input block
	st1		{v5.16b}, [x19], #16
1:	st1		{v24.16b}, [x24]		// store IV

	cbz		x23, 2f
	cond_yield_neon
	b		99b

2:	frame_pop
	ret
ENDPROC(aesbs_cbc_decrypt)

	/*
	 * next_tweak - out = in doubled as a 128-bit value, with conditional
	 * reduction.
	 *
	 * Both 64-bit lanes are doubled by adding them to themselves. The sign
	 * of each lane is turned into an all-ones or all-zeroes mask, ANDed
	 * with const and swapped across lanes by the ext, so that the bit
	 * shifted out of the low lane enters the high lane and the bit shifted
	 * out of the high lane folds the high half of const into the low lane.
	 * __xts_crypt sets const to { 1, 0x87 }. tmp is scratch.
	 */
	.macro		next_tweak, out, in, const, tmp
	sshr		\tmp\().2d,  \in\().2d,   #63
	and		\tmp\().16b, \tmp\().16b, \const\().16b
	add		\out\().2d,  \in\().2d,   \in\().2d
	ext		\tmp\().16b, \tmp\().16b, \tmp\().16b, #8
	eor		\out\().16b, \out\().16b, \tmp\().16b
	.endm

	/*
	 * aesbs_xts_encrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks, u8 iv[])
	 * aesbs_xts_decrypt(u8 out[], u8 const in[], u8 const rk[], int rounds,
	 *		     int blocks, u8 iv[])
	 */

	/*
	 * __xts_crypt8 - load up to eight blocks, XOR each with its tweak and
	 * tail-call the cipher core whose address is in x7.
	 *
	 * In:  x20 = input pointer, x21 = rk, x22 = rounds, x23 = blocks left
	 *      v25 = tweak for the first block, v30 = next_tweak constant
	 *      x7  = address of aesbs_encrypt8 or aesbs_decrypt8
	 * Out: x6  = "fewer than eight blocks left" bit mask
	 *      v25-v28 = tweaks of blocks 0-3; the tweaks of blocks 4-7 are
	 *      written to the 64 bytes at sp + .Lframe_local_offset
	 *      v29 = tweak following the last block that was loaded, once at
	 *      least four blocks have been loaded
	 *
	 * The routine leaves through br x7 and so does not return itself: the
	 * ret of the cipher core goes straight back to the caller of
	 * __xts_crypt8. v31 is scratch.
	 */
__xts_crypt8:
	mov		x6, #1
	lsl		x6, x6, x23
	subs		w23, w23, #8
	csel		x23, x23, xzr, pl		// blocks left after this pass
	csel		x6, x6, xzr, mi			// keep the mask for a short pass only

	ld1		{v0.16b}, [x20], #16
	next_tweak	v26, v25, v30, v31
	eor		v0.16b, v0.16b, v25.16b
	tbnz		x6, #1, 0f

	ld1		{v1.16b}, [x20], #16
	next_tweak	v27, v26, v30, v31
	eor		v1.16b, v1.16b, v26.16b
	tbnz		x6, #2, 0f

	ld1		{v2.16b}, [x20], #16
	next_tweak	v28, v27, v30, v31
	eor		v2.16b, v2.16b, v27.16b
	tbnz		x6, #3, 0f

	ld1		{v3.16b}, [x20], #16
	next_tweak	v29, v28, v30, v31
	eor		v3.16b, v3.16b, v28.16b
	tbnz		x6, #4, 0f

	/* from here on the tweaks are spilled, v29 is advanced in place */
	ld1		{v4.16b}, [x20], #16
	str		q29, [sp, #.Lframe_local_offset]
	eor		v4.16b, v4.16b, v29.16b
	next_tweak	v29, v29, v30, v31
	tbnz		x6, #5, 0f

	ld1		{v5.16b}, [x20], #16
	str		q29, [sp, #.Lframe_local_offset + 16]
	eor		v5.16b, v5.16b, v29.16b
	next_tweak	v29, v29, v30, v31
	tbnz		x6, #6, 0f

	ld1		{v6.16b}, [x20], #16
	str		q29, [sp, #.Lframe_local_offset + 32]
	eor		v6.16b, v6.16b, v29.16b
	next_tweak	v29, v29, v30, v31
	tbnz		x6, #7, 0f

	ld1		{v7.16b}, [x20], #16
	str		q29, [sp, #.Lframe_local_offset + 48]
	eor		v7.16b, v7.16b, v29.16b
	next_tweak	v29, v29, v30, v31

0:	mov		bskey, x21
	mov		rounds, x22
	br		x7				// tail call into the cipher core
ENDPROC(__xts_crypt8)

	/*
	 * Common XTS body, instantiated with the cipher core to call (do8)
	 * and the registers in which that core leaves blocks 0..7 (o0..o7).
	 *
	 * The arguments are kept in x19-x24. frame_push is given 64 as its
	 * second argument; __xts_crypt8 spills four tweaks to the 64 bytes at
	 * .Lframe_local_offset. v25 always holds the tweak of the next block
	 * to be processed and is written back to iv[] before yielding and on
	 * exit. Label 0 is also the target handed to cond_yield_neon, so that
	 * the constant in v30 and the tweak in v25 are set up again from
	 * there.
	 */
	.macro		__xts_crypt, do8, o0, o1, o2, o3, o4, o5, o6, o7
	frame_push	6, 64

	mov		x19, x0				// out
	mov		x20, x1				// in
	mov		x21, x2				// rk
	mov		x22, x3				// rounds
	mov		x23, x4				// blocks
	mov		x24, x5				// iv

0:	movi		v30.2s, #0x1
	movi		v25.2s, #0x87
	uzp1		v30.4s, v30.4s, v25.4s		// v30.2d = { 1, 0x87 }
	ld1		{v25.16b}, [x24]

99:	adr		x7, \do8
	bl		__xts_crypt8

	ldp		q16, q17, [sp, #.Lframe_local_offset]
	ldp		q18, q19, [sp, #.Lframe_local_offset + 32]

	eor		\o0\().16b, \o0\().16b, v25.16b
	eor		\o1\().16b, \o1\().16b, v26.16b
	eor		\o2\().16b, \o2\().16b, v27.16b
	eor		\o3\().16b, \o3\().16b, v28.16b

	st1		{\o0\().16b}, [x19], #16
	mov		v25.16b, v26.16b
	tbnz		x6, #1, 1f
	st1		{\o1\().16b}, [x19], #16
	mov		v25.16b, v27.16b
	tbnz		x6, #2, 1f
	st1		{\o2\().16b}, [x19], #16
	mov		v25.16b, v28.16b
	tbnz		x6, #3, 1f
	st1		{\o3\().16b}, [x19], #16
	mov		v25.16b, v29.16b
	tbnz		x6, #4, 1f

	eor		\o4\().16b, \o4\().16b, v16.16b
	eor		\o5\().16b, \o5\().16b, v17.16b
	eor		\o6\().16b, \o6\().16b, v18.16b
	eor		\o7\().16b, \o7\().16b, v19.16b

	st1		{\o4\().16b}, [x19], #16
	tbnz		x6, #5, 1f
	st1		{\o5\().16b}, [x19], #16
	tbnz		x6, #6, 1f
	st1		{\o6\().16b}, [x19], #16
	tbnz		x6, #7, 1f
	st1		{\o7\().16b}, [x19], #16

	cbz		x23, 1f
	st1		{v25.16b}, [x24]		// save the tweak before yielding

	cond_yield_neon	0b
	b		99b

1:	st1		{v25.16b}, [x24]
	frame_pop
	ret
	.endm

ENTRY(aesbs_xts_encrypt)
	__xts_crypt	aesbs_encrypt8, v0, v1, v4, v6, v3, v7, v2, v5
ENDPROC(aesbs_xts_encrypt)

ENTRY(aesbs_xts_decrypt)
	__xts_crypt	aesbs_decrypt8, v0, v1, v6, v4, v2, v7, v3, v5
ENDPROC(aesbs_xts_decrypt)

	/*
	 * next_ctr - materialise the counter held in x7 (upper 64 bits) and
	 * x8 (lower 64 bits) in vector register v, byte-reversing each 64-bit
	 * half, and then increment the x7:x8 pair by one with carry.
	 */
	.macro		next_ctr, v
	mov		\v\().d[1], x8
	adds		x8, x8, #1
	mov		\v\().d[0], x7
	adc		x7, x7, xzr
	rev64		\v\().16b, \v\().16b
	.endm

	/*
	 * aesbs_ctr_encrypt(u8 out[], u8 const in[], u8 const rk[],
	 *		     int rounds, int blocks, u8 iv[], u8 final[])
	 *
	 * The arguments are kept in x19-x25. x26 is 1 when final is not NULL
	 * and 0 otherwise; it is added to the block count so that one more
	 * keystream block is generated in that case.
	 *
	 * x9 is the usual bit mask, except that it is retained when eight or
	 * fewer blocks (including the extra one) remain. After encryption it
	 * is shifted right by x26, so that the bit marking the end of the
	 * data lands on the extra block. The branch taken there leads to one
	 * of the stubs at the end of the function, which writes that block
	 * unXORed to final[] instead of to out[].
	 */
ENTRY(aesbs_ctr_encrypt)
	frame_push	8

	mov		x19, x0				// out
	mov		x20, x1				// in
	mov		x21, x2				// rk
	mov		x22, x3				// rounds
	mov		x23, x4				// blocks
	mov		x24, x5				// iv
	mov		x25, x6				// final

	cmp		x25, #0
	cset		x26, ne
	add		x23, x23, x26		// do one extra block if final

98:	ldp		x7, x8, [x24]
	ld1		{v0.16b}, [x24]			// block 0 uses the counter as stored
CPU_LE(	rev		x7, x7		)
CPU_LE(	rev		x8, x8		)
	adds		x8, x8, #1			// x7:x8 = counter for block 1
	adc		x7, x7, xzr

99:	mov		x9, #1
	lsl		x9, x9, x23
	subs		w23, w23, #8
	csel		x23, x23, xzr, pl		// blocks left after this pass
	csel		x9, x9, xzr, le			// keep the mask for the last pass

	tbnz		x9, #1, 0f
	next_ctr	v1
	tbnz		x9, #2, 0f
	next_ctr	v2
	tbnz		x9, #3, 0f
	next_ctr	v3
	tbnz		x9, #4, 0f
	next_ctr	v4
	tbnz		x9, #5, 0f
	next_ctr	v5
	tbnz		x9, #6, 0f
	next_ctr	v6
	tbnz		x9, #7, 0f
	next_ctr	v7

0:	mov		bskey, x21
	mov		rounds, x22
	bl		aesbs_encrypt8

	lsr		x9, x9, x26		// disregard the extra block
	tbnz		x9, #0, 0f

	ld1		{v8.16b}, [x20], #16
	eor		v0.16b, v0.16b, v8.16b
	st1		{v0.16b}, [x19], #16
	tbnz		x9, #1, 1f

	ld1		{v9.16b}, [x20], #16
	eor		v1.16b, v1.16b, v9.16b
	st1		{v1.16b}, [x19], #16
	tbnz		x9, #2, 2f

	ld1		{v10.16b}, [x20], #16
	eor		v4.16b, v4.16b, v10.16b
	st1		{v4.16b}, [x19], #16
	tbnz		x9, #3, 3f

	ld1		{v11.16b}, [x20], #16
	eor		v6.16b, v6.16b, v11.16b
	st1		{v6.16b}, [x19], #16
	tbnz		x9, #4, 4f

	ld1		{v12.16b}, [x20], #16
	eor		v3.16b, v3.16b, v12.16b
	st1		{v3.16b}, [x19], #16
	tbnz		x9, #5, 5f

	ld1		{v13.16b}, [x20], #16
	eor		v7.16b, v7.16b, v13.16b
	st1		{v7.16b}, [x19], #16
	tbnz		x9, #6, 6f

	ld1		{v14.16b}, [x20], #16
	eor		v2.16b, v2.16b, v14.16b
	st1		{v2.16b}, [x19], #16
	tbnz		x9, #7, 7f

	ld1		{v15.16b}, [x20], #16
	eor		v5.16b, v5.16b, v15.16b
	st1		{v5.16b}, [x19], #16

8:	next_ctr	v0				// counter for the next pass
	st1		{v0.16b}, [x24]			// and written back to iv[]
	cbz		x23, .Lctr_done

	cond_yield_neon	98b
	b		99b

.Lctr_done:
	frame_pop
	ret

	/*
	 * If we are handling the tail of the input (x6 != NULL), return the
	 * final keystream block back to the caller.
	 */
0:	cbz		x25, 8b
	st1		{v0.16b}, [x25]
	b		8b
1:	cbz		x25, 8b
	st1		{v1.16b}, [x25]
	b		8b
2:	cbz		x25, 8b
	st1		{v4.16b}, [x25]
	b		8b
3:	cbz		x25, 8b
	st1		{v6.16b}, [x25]
	b		8b
4:	cbz		x25, 8b
	st1		{v3.16b}, [x25]
	b		8b
5:	cbz		x25, 8b
	st1		{v7.16b}, [x25]
	b		8b
6:	cbz		x25, 8b
	st1		{v2.16b}, [x25]
	b		8b
7:	cbz		x25, 8b
	st1		{v5.16b}, [x25]
	b		8b
ENDPROC(aesbs_ctr_encrypt)