/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * ARIA Cipher 16-way parallel algorithm (AVX)
 *
 * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com>
 *
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/asm-offsets.h>
#include <asm/frame.h>

/* register macros */
#define CTX %rdi


#define BV8(a0, a1, a2, a3, a4, a5, a6, a7) \
	( (((a0) & 1) << 0) | \
	  (((a1) & 1) << 1) | \
	  (((a2) & 1) << 2) | \
	  (((a3) & 1) << 3) | \
	  (((a4) & 1) << 4) | \
	  (((a5) & 1) << 5) | \
	  (((a6) & 1) << 6) | \
	  (((a7) & 1) << 7) )

#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7) \
	( ((l7) << (0 * 8)) | \
	  ((l6) << (1 * 8)) | \
	  ((l5) << (2 * 8)) | \
	  ((l4) << (3 * 8)) | \
	  ((l3) << (4 * 8)) | \
	  ((l2) << (5 * 8)) | \
	  ((l1) << (6 * 8)) | \
	  ((l0) << (7 * 8)) )
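
/*
 * Note: BV8() packs eight bits into one matrix-row byte and BM8X8() packs
 * eight such rows into the 64-bit operand used by vgf2p8affineqb, with row
 * l0 placed in the most significant byte.  As a quick sanity check, the
 * identity rows used for .Ltf_id_bitmatrix further below evaluate to
 * 0x0102040810204080.
 */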

#define inc_le128(x, minus_one, tmp) \
	vpcmpeqq minus_one, x, tmp; \
	vpsubq minus_one, x, x; \
	vpslldq $8, tmp, tmp; \
	vpsubq tmp, x, x;

#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
	vpand x, mask4bit, tmp0; \
	vpandn x, mask4bit, x; \
	vpsrld $4, x, x; \
	\
	vpshufb tmp0, lo_t, tmp0; \
	vpshufb x, hi_t, x; \
	vpxor tmp0, x, x;
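
/*
 * Notes on the two helpers above: inc_le128() expects minus_one to hold -1
 * in the low qword and 0 in the high qword, so the first vpsubq adds one to
 * the low 64 bits only, and the vpcmpeqq/vpslldq pair propagates the carry
 * into the high 64 bits when the low qword wraps.  filter_8bit() applies an
 * 8-bit mapping as two 4-bit table lookups: the low and high nibbles index
 * lo_t and hi_t via vpshufb and the results are XORed together (mask4bit
 * must hold 0x0f in every byte).
 */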

#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x1, x0, x0; \
	\
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x2; \
	\
	vpunpckhqdq t1, x0, x1; \
	vpunpcklqdq t1, x0, x0; \
	\
	vpunpckhqdq x2, t2, x3; \
	vpunpcklqdq x2, t2, x2;

#define byteslice_16x16b(a0, b0, c0, d0, \
			 a1, b1, c1, d1, \
			 a2, b2, c2, d2, \
			 a3, b3, c3, d3, \
			 st0, st1) \
	vmovdqu d2, st0; \
	vmovdqu d3, st1; \
	transpose_4x4(a0, a1, a2, a3, d2, d3); \
	transpose_4x4(b0, b1, b2, b3, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu a0, st0; \
	vmovdqu a1, st1; \
	transpose_4x4(c0, c1, c2, c3, a0, a1); \
	transpose_4x4(d0, d1, d2, d3, a0, a1); \
	\
	vmovdqu .Lshufb_16x16b(%rip), a0; \
	vmovdqu st1, a1; \
	vpshufb a0, a2, a2; \
	vpshufb a0, a3, a3; \
	vpshufb a0, b0, b0; \
	vpshufb a0, b1, b1; \
	vpshufb a0, b2, b2; \
	vpshufb a0, b3, b3; \
	vpshufb a0, a1, a1; \
	vpshufb a0, c0, c0; \
	vpshufb a0, c1, c1; \
	vpshufb a0, c2, c2; \
	vpshufb a0, c3, c3; \
	vpshufb a0, d0, d0; \
	vpshufb a0, d1, d1; \
	vpshufb a0, d2, d2; \
	vpshufb a0, d3, d3; \
	vmovdqu d3, st1; \
	vmovdqu st0, d3; \
	vpshufb a0, d3, a0; \
	vmovdqu d2, st0; \
	\
	transpose_4x4(a0, b0, c0, d0, d2, d3); \
	transpose_4x4(a1, b1, c1, d1, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu b0, st0; \
	vmovdqu b1, st1; \
	transpose_4x4(a2, b2, c2, d2, b0, b1); \
	transpose_4x4(a3, b3, c3, d3, b0, b1); \
	vmovdqu st0, b0; \
	vmovdqu st1, b1; \
	/* does not adjust output bytes inside vectors */

#define debyteslice_16x16b(a0, b0, c0, d0, \
			   a1, b1, c1, d1, \
			   a2, b2, c2, d2, \
			   a3, b3, c3, d3, \
			   st0, st1) \
	vmovdqu d2, st0; \
	vmovdqu d3, st1; \
	transpose_4x4(a0, a1, a2, a3, d2, d3); \
	transpose_4x4(b0, b1, b2, b3, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu a0, st0; \
	vmovdqu a1, st1; \
	transpose_4x4(c0, c1, c2, c3, a0, a1); \
	transpose_4x4(d0, d1, d2, d3, a0, a1); \
	\
	vmovdqu .Lshufb_16x16b(%rip), a0; \
	vmovdqu st1, a1; \
	vpshufb a0, a2, a2; \
	vpshufb a0, a3, a3; \
	vpshufb a0, b0, b0; \
	vpshufb a0, b1, b1; \
	vpshufb a0, b2, b2; \
	vpshufb a0, b3, b3; \
	vpshufb a0, a1, a1; \
	vpshufb a0, c0, c0; \
	vpshufb a0, c1, c1; \
	vpshufb a0, c2, c2; \
	vpshufb a0, c3, c3; \
	vpshufb a0, d0, d0; \
	vpshufb a0, d1, d1; \
	vpshufb a0, d2, d2; \
	vpshufb a0, d3, d3; \
	vmovdqu d3, st1; \
	vmovdqu st0, d3; \
	vpshufb a0, d3, a0; \
	vmovdqu d2, st0; \
	\
	transpose_4x4(c0, d0, a0, b0, d2, d3); \
	transpose_4x4(c1, d1, a1, b1, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu b0, st0; \
	vmovdqu b1, st1; \
	transpose_4x4(c2, d2, a2, b2, b0, b1); \
	transpose_4x4(c3, d3, a3, b3, b0, b1); \
	vmovdqu st0, b0; \
	vmovdqu st1, b1; \
	/* does not adjust output bytes inside vectors */

/* load blocks to registers and apply pre-whitening */
#define inpack16_pre(x0, x1, x2, x3, \
		     x4, x5, x6, x7, \
		     y0, y1, y2, y3, \
		     y4, y5, y6, y7, \
		     rio) \
	vmovdqu (0 * 16)(rio), x0; \
	vmovdqu (1 * 16)(rio), x1; \
	vmovdqu (2 * 16)(rio), x2; \
	vmovdqu (3 * 16)(rio), x3; \
	vmovdqu (4 * 16)(rio), x4; \
	vmovdqu (5 * 16)(rio), x5; \
	vmovdqu (6 * 16)(rio), x6; \
	vmovdqu (7 * 16)(rio), x7; \
	vmovdqu (8 * 16)(rio), y0; \
	vmovdqu (9 * 16)(rio), y1; \
	vmovdqu (10 * 16)(rio), y2; \
	vmovdqu (11 * 16)(rio), y3; \
	vmovdqu (12 * 16)(rio), y4; \
	vmovdqu (13 * 16)(rio), y5; \
	vmovdqu (14 * 16)(rio), y6; \
	vmovdqu (15 * 16)(rio), y7;

/* byteslice pre-whitened blocks and store to temporary memory */
#define inpack16_post(x0, x1, x2, x3, \
		      x4, x5, x6, x7, \
		      y0, y1, y2, y3, \
		      y4, y5, y6, y7, \
		      mem_ab, mem_cd) \
	byteslice_16x16b(x0, x1, x2, x3, \
			 x4, x5, x6, x7, \
			 y0, y1, y2, y3, \
			 y4, y5, y6, y7, \
			 (mem_ab), (mem_cd)); \
	\
	vmovdqu x0, 0 * 16(mem_ab); \
	vmovdqu x1, 1 * 16(mem_ab); \
	vmovdqu x2, 2 * 16(mem_ab); \
	vmovdqu x3, 3 * 16(mem_ab); \
	vmovdqu x4, 4 * 16(mem_ab); \
	vmovdqu x5, 5 * 16(mem_ab); \
	vmovdqu x6, 6 * 16(mem_ab); \
	vmovdqu x7, 7 * 16(mem_ab); \
	vmovdqu y0, 0 * 16(mem_cd); \
	vmovdqu y1, 1 * 16(mem_cd); \
	vmovdqu y2, 2 * 16(mem_cd); \
	vmovdqu y3, 3 * 16(mem_cd); \
	vmovdqu y4, 4 * 16(mem_cd); \
	vmovdqu y5, 5 * 16(mem_cd); \
	vmovdqu y6, 6 * 16(mem_cd); \
	vmovdqu y7, 7 * 16(mem_cd);

#define write_output(x0, x1, x2, x3, \
		     x4, x5, x6, x7, \
		     y0, y1, y2, y3, \
		     y4, y5, y6, y7, \
		     mem) \
	vmovdqu x0, 0 * 16(mem); \
	vmovdqu x1, 1 * 16(mem); \
	vmovdqu x2, 2 * 16(mem); \
	vmovdqu x3, 3 * 16(mem); \
	vmovdqu x4, 4 * 16(mem); \
	vmovdqu x5, 5 * 16(mem); \
	vmovdqu x6, 6 * 16(mem); \
	vmovdqu x7, 7 * 16(mem); \
	vmovdqu y0, 8 * 16(mem); \
	vmovdqu y1, 9 * 16(mem); \
	vmovdqu y2, 10 * 16(mem); \
	vmovdqu y3, 11 * 16(mem); \
	vmovdqu y4, 12 * 16(mem); \
	vmovdqu y5, 13 * 16(mem); \
	vmovdqu y6, 14 * 16(mem); \
	vmovdqu y7, 15 * 16(mem); \

#define aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, idx) \
	vmovdqu x0, ((idx + 0) * 16)(mem_tmp); \
	vmovdqu x1, ((idx + 1) * 16)(mem_tmp); \
	vmovdqu x2, ((idx + 2) * 16)(mem_tmp); \
	vmovdqu x3, ((idx + 3) * 16)(mem_tmp); \
	vmovdqu x4, ((idx + 4) * 16)(mem_tmp); \
	vmovdqu x5, ((idx + 5) * 16)(mem_tmp); \
	vmovdqu x6, ((idx + 6) * 16)(mem_tmp); \
	vmovdqu x7, ((idx + 7) * 16)(mem_tmp);

#define aria_load_state_8way(x0, x1, x2, x3, \
			     x4, x5, x6, x7, \
			     mem_tmp, idx) \
	vmovdqu ((idx + 0) * 16)(mem_tmp), x0; \
	vmovdqu ((idx + 1) * 16)(mem_tmp), x1; \
	vmovdqu ((idx + 2) * 16)(mem_tmp), x2; \
	vmovdqu ((idx + 3) * 16)(mem_tmp), x3; \
	vmovdqu ((idx + 4) * 16)(mem_tmp), x4; \
	vmovdqu ((idx + 5) * 16)(mem_tmp), x5; \
	vmovdqu ((idx + 6) * 16)(mem_tmp), x6; \
	vmovdqu ((idx + 7) * 16)(mem_tmp), x7;

#define aria_ark_8way(x0, x1, x2, x3, \
		      x4, x5, x6, x7, \
		      t0, t1, t2, rk, \
		      idx, round) \
	/* AddRoundKey */ \
	vbroadcastss ((round * 16) + idx + 0)(rk), t0; \
	vpsrld $24, t0, t2; \
	vpshufb t1, t2, t2; \
	vpxor t2, x0, x0; \
	vpsrld $16, t0, t2; \
	vpshufb t1, t2, t2; \
	vpxor t2, x1, x1; \
	vpsrld $8, t0, t2; \
	vpshufb t1, t2, t2; \
	vpxor t2, x2, x2; \
	vpshufb t1, t0, t2; \
	vpxor t2, x3, x3; \
	vbroadcastss ((round * 16) + idx + 4)(rk), t0; \
	vpsrld $24, t0, t2; \
	vpshufb t1, t2, t2; \
	vpxor t2, x4, x4; \
	vpsrld $16, t0, t2; \
	vpshufb t1, t2, t2; \
	vpxor t2, x5, x5; \
	vpsrld $8, t0, t2; \
	vpshufb t1, t2, t2; \
	vpxor t2, x6, x6; \
	vpshufb t1, t0, t2; \
	vpxor t2, x7, x7;
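
/*
 * Note: aria_ark_8way() XORs eight round-key bytes (the two 32-bit words at
 * rk[round * 16 + idx]) into eight byte-sliced registers.  vbroadcastss
 * replicates a key word, and since the caller passes an all-zero register
 * as t1, each vpshufb with t1 as the shuffle mask broadcasts byte 0 of the
 * shifted word to every lane.  The four bytes of the first key word are
 * XORed into x0..x3 and the four bytes of the second into x4..x7.
 */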

#define aria_sbox_8way_gfni(x0, x1, x2, x3, \
			    x4, x5, x6, x7, \
			    t0, t1, t2, t3, \
			    t4, t5, t6, t7) \
	vmovdqa .Ltf_s2_bitmatrix(%rip), t0; \
	vmovdqa .Ltf_inv_bitmatrix(%rip), t1; \
	vmovdqa .Ltf_id_bitmatrix(%rip), t2; \
	vmovdqa .Ltf_aff_bitmatrix(%rip), t3; \
	vmovdqa .Ltf_x2_bitmatrix(%rip), t4; \
	vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \
	vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \
	vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \
	vgf2p8affineqb $(tf_inv_const), t1, x6, x6; \
	vgf2p8affineinvqb $0, t2, x2, x2; \
	vgf2p8affineinvqb $0, t2, x6, x6; \
	vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0; \
	vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4; \
	vgf2p8affineqb $(tf_x2_const), t4, x3, x3; \
	vgf2p8affineqb $(tf_x2_const), t4, x7, x7; \
	vgf2p8affineinvqb $0, t2, x3, x3; \
	vgf2p8affineinvqb $0, t2, x7, x7
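
/*
 * In the GFNI variant above, vgf2p8affineinvqb first inverts each byte in
 * GF(2^8) and then applies the given bit matrix and constant, while
 * vgf2p8affineqb applies only the affine map.  Using the identity matrix
 * with a zero constant therefore gives a plain field inversion; combined
 * with the tf_* matrices defined below this builds the four byte
 * substitutions used by ARIA.
 */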

#define aria_sbox_8way(x0, x1, x2, x3, \
		       x4, x5, x6, x7, \
		       t0, t1, t2, t3, \
		       t4, t5, t6, t7) \
	vmovdqa .Linv_shift_row(%rip), t0; \
	vmovdqa .Lshift_row(%rip), t1; \
	vbroadcastss .L0f0f0f0f(%rip), t6; \
	vmovdqa .Ltf_lo__inv_aff__and__s2(%rip), t2; \
	vmovdqa .Ltf_hi__inv_aff__and__s2(%rip), t3; \
	vmovdqa .Ltf_lo__x2__and__fwd_aff(%rip), t4; \
	vmovdqa .Ltf_hi__x2__and__fwd_aff(%rip), t5; \
	\
	vaesenclast t7, x0, x0; \
	vaesenclast t7, x4, x4; \
	vaesenclast t7, x1, x1; \
	vaesenclast t7, x5, x5; \
	vaesdeclast t7, x2, x2; \
	vaesdeclast t7, x6, x6; \
	\
	/* AES inverse shift rows */ \
	vpshufb t0, x0, x0; \
	vpshufb t0, x4, x4; \
	vpshufb t0, x1, x1; \
	vpshufb t0, x5, x5; \
	vpshufb t1, x3, x3; \
	vpshufb t1, x7, x7; \
	vpshufb t1, x2, x2; \
	vpshufb t1, x6, x6; \
	\
	/* affine transformation for S2 */ \
	filter_8bit(x1, t2, t3, t6, t0); \
	/* affine transformation for S2 */ \
	filter_8bit(x5, t2, t3, t6, t0); \
	\
	/* affine transformation for X2 */ \
	filter_8bit(x3, t4, t5, t6, t0); \
	/* affine transformation for X2 */ \
	filter_8bit(x7, t4, t5, t6, t0); \
	vaesdeclast t7, x3, x3; \
	vaesdeclast t7, x7, x7;
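
/*
 * The AES-NI variant above relies on t7 being all-zero on entry (the round
 * macros zero y7 first): AESENCLAST/AESDECLAST with a zero round key reduce
 * to (Inv)ShiftRows plus (Inv)SubBytes, and the .Linv_shift_row/.Lshift_row
 * shuffles undo the row permutation so only the byte substitution remains.
 * filter_8bit() then supplies the additional affine transforms, stored as
 * split 4-bit lookup tables below, needed to turn the AES substitutions
 * into ARIA's S2 and X2 boxes.
 */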

#define aria_diff_m(x0, x1, x2, x3, \
		    t0, t1, t2, t3) \
	/* T = rotr32(X, 8); */ \
	/* X ^= T */ \
	vpxor x0, x3, t0; \
	vpxor x1, x0, t1; \
	vpxor x2, x1, t2; \
	vpxor x3, x2, t3; \
	/* X = T ^ rotr(X, 16); */ \
	vpxor t2, x0, x0; \
	vpxor x1, t3, t3; \
	vpxor t0, x2, x2; \
	vpxor t1, x3, x1; \
	vmovdqu t3, x3;

#define aria_diff_word(x0, x1, x2, x3, \
		       x4, x5, x6, x7, \
		       y0, y1, y2, y3, \
		       y4, y5, y6, y7) \
	/* t1 ^= t2; */ \
	vpxor y0, x4, x4; \
	vpxor y1, x5, x5; \
	vpxor y2, x6, x6; \
	vpxor y3, x7, x7; \
	\
	/* t2 ^= t3; */ \
	vpxor y4, y0, y0; \
	vpxor y5, y1, y1; \
	vpxor y6, y2, y2; \
	vpxor y7, y3, y3; \
	\
	/* t0 ^= t1; */ \
	vpxor x4, x0, x0; \
	vpxor x5, x1, x1; \
	vpxor x6, x2, x2; \
	vpxor x7, x3, x3; \
	\
	/* t3 ^= t1; */ \
	vpxor x4, y4, y4; \
	vpxor x5, y5, y5; \
	vpxor x6, y6, y6; \
	vpxor x7, y7, y7; \
	\
	/* t2 ^= t0; */ \
	vpxor x0, y0, y0; \
	vpxor x1, y1, y1; \
	vpxor x2, y2, y2; \
	vpxor x3, y3, y3; \
	\
	/* t1 ^= t2; */ \
	vpxor y0, x4, x4; \
	vpxor y1, x5, x5; \
	vpxor y2, x6, x6; \
	vpxor y3, x7, x7;
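
/*
 * Because the state is byte-sliced (one register per byte lane), the 32-bit
 * rotations in aria_diff_m() above reduce to XORs between whole registers
 * rather than per-word shifts.  In aria_diff_word() the register groups map
 * to ARIA's diffusion words as T0 = x0..x3, T1 = x4..x7, T2 = y0..y3 and
 * T3 = y4..y7, matching the t0..t3 comments inside the macro.
 */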

#define aria_fe(x0, x1, x2, x3, \
		x4, x5, x6, x7, \
		y0, y1, y2, y3, \
		y4, y5, y6, y7, \
		mem_tmp, rk, round) \
	vpxor y7, y7, y7; \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, y7, y2, rk, 8, round); \
	\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
		       y0, y1, y2, y3, y4, y5, y6, y7); \
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 8); \
	\
	aria_load_state_8way(x0, x1, x2, x3, \
			     x4, x5, x6, x7, \
			     mem_tmp, 0); \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, y7, y2, rk, 0, round); \
	\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
		       y0, y1, y2, y3, y4, y5, y6, y7); \
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 0); \
	aria_load_state_8way(y0, y1, y2, y3, \
			     y4, y5, y6, y7, \
			     mem_tmp, 8); \
	aria_diff_word(x0, x1, x2, x3, \
		       x4, x5, x6, x7, \
		       y0, y1, y2, y3, \
		       y4, y5, y6, y7); \
	/* aria_diff_byte() \
	 * T3 = ABCD -> BADC \
	 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \
	 * T0 = ABCD -> CDAB \
	 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \
	 * T1 = ABCD -> DCBA \
	 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \
	 */ \
	aria_diff_word(x2, x3, x0, x1, \
		       x7, x6, x5, x4, \
		       y0, y1, y2, y3, \
		       y5, y4, y7, y6); \
	aria_store_state_8way(x3, x2, x1, x0, \
			      x6, x7, x4, x5, \
			      mem_tmp, 0);

#define aria_fo(x0, x1, x2, x3, \
		x4, x5, x6, x7, \
		y0, y1, y2, y3, \
		y4, y5, y6, y7, \
		mem_tmp, rk, round) \
	vpxor y7, y7, y7; \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, y7, y2, rk, 8, round); \
	\
	aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		       y0, y1, y2, y3, y4, y5, y6, y7); \
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 8); \
	\
	aria_load_state_8way(x0, x1, x2, x3, \
			     x4, x5, x6, x7, \
			     mem_tmp, 0); \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, y7, y2, rk, 0, round); \
	\
	aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		       y0, y1, y2, y3, y4, y5, y6, y7); \
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 0); \
	aria_load_state_8way(y0, y1, y2, y3, \
			     y4, y5, y6, y7, \
			     mem_tmp, 8); \
	aria_diff_word(x0, x1, x2, x3, \
		       x4, x5, x6, x7, \
		       y0, y1, y2, y3, \
		       y4, y5, y6, y7); \
	/* aria_diff_byte() \
	 * T1 = ABCD -> BADC \
	 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \
	 * T2 = ABCD -> CDAB \
	 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \
	 * T3 = ABCD -> DCBA \
	 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \
	 */ \
	aria_diff_word(x0, x1, x2, x3, \
		       x5, x4, x7, x6, \
		       y2, y3, y0, y1, \
		       y7, y6, y5, y4); \
	aria_store_state_8way(x3, x2, x1, x0, \
			      x6, x7, x4, x5, \
			      mem_tmp, 0);

#define aria_ff(x0, x1, x2, x3, \
		x4, x5, x6, x7, \
		y0, y1, y2, y3, \
		y4, y5, y6, y7, \
		mem_tmp, rk, round, last_round) \
	vpxor y7, y7, y7; \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, y7, y2, rk, 8, round); \
	\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
		       y0, y1, y2, y3, y4, y5, y6, y7); \
	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, y7, y2, rk, 8, last_round); \
	\
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 8); \
	\
	aria_load_state_8way(x0, x1, x2, x3, \
			     x4, x5, x6, x7, \
			     mem_tmp, 0); \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, y7, y2, rk, 0, round); \
	\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
		       y0, y1, y2, y3, y4, y5, y6, y7); \
	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, y7, y2, rk, 0, last_round); \
	\
	aria_load_state_8way(y0, y1, y2, y3, \
			     y4, y5, y6, y7, \
			     mem_tmp, 8);
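
/*
 * Each round macro above (and its *_gfni twin below) runs the AddRoundKey,
 * S-box and diffusion steps twice, once per batch of eight registers, using
 * mem_tmp at offsets 0 and 8 as a spill area so the full sixteen-register
 * byte-sliced state can be processed with only eight temporaries.  aria_ff()
 * handles the final round, which applies a second key addition in place of
 * the diffusion layer.
 */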

#define aria_fe_gfni(x0, x1, x2, x3, \
		     x4, x5, x6, x7, \
		     y0, y1, y2, y3, \
		     y4, y5, y6, y7, \
		     mem_tmp, rk, round) \
	vpxor y7, y7, y7; \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, y7, y2, rk, 8, round); \
	\
	aria_sbox_8way_gfni(x2, x3, x0, x1, \
			    x6, x7, x4, x5, \
			    y0, y1, y2, y3, \
			    y4, y5, y6, y7); \
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 8); \
	\
	aria_load_state_8way(x0, x1, x2, x3, \
			     x4, x5, x6, x7, \
			     mem_tmp, 0); \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, y7, y2, rk, 0, round); \
	\
	aria_sbox_8way_gfni(x2, x3, x0, x1, \
			    x6, x7, x4, x5, \
			    y0, y1, y2, y3, \
			    y4, y5, y6, y7); \
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 0); \
	aria_load_state_8way(y0, y1, y2, y3, \
			     y4, y5, y6, y7, \
			     mem_tmp, 8); \
	aria_diff_word(x0, x1, x2, x3, \
		       x4, x5, x6, x7, \
		       y0, y1, y2, y3, \
		       y4, y5, y6, y7); \
	/* aria_diff_byte() \
	 * T3 = ABCD -> BADC \
	 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \
	 * T0 = ABCD -> CDAB \
	 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \
	 * T1 = ABCD -> DCBA \
	 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \
	 */ \
	aria_diff_word(x2, x3, x0, x1, \
		       x7, x6, x5, x4, \
		       y0, y1, y2, y3, \
		       y5, y4, y7, y6); \
	aria_store_state_8way(x3, x2, x1, x0, \
			      x6, x7, x4, x5, \
			      mem_tmp, 0);

#define aria_fo_gfni(x0, x1, x2, x3, \
		     x4, x5, x6, x7, \
		     y0, y1, y2, y3, \
		     y4, y5, y6, y7, \
		     mem_tmp, rk, round) \
	vpxor y7, y7, y7; \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, y7, y2, rk, 8, round); \
	\
	aria_sbox_8way_gfni(x0, x1, x2, x3, \
			    x4, x5, x6, x7, \
			    y0, y1, y2, y3, \
			    y4, y5, y6, y7); \
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 8); \
	\
	aria_load_state_8way(x0, x1, x2, x3, \
			     x4, x5, x6, x7, \
			     mem_tmp, 0); \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, y7, y2, rk, 0, round); \
	\
	aria_sbox_8way_gfni(x0, x1, x2, x3, \
			    x4, x5, x6, x7, \
			    y0, y1, y2, y3, \
			    y4, y5, y6, y7); \
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 0); \
	aria_load_state_8way(y0, y1, y2, y3, \
			     y4, y5, y6, y7, \
			     mem_tmp, 8); \
	aria_diff_word(x0, x1, x2, x3, \
		       x4, x5, x6, x7, \
		       y0, y1, y2, y3, \
		       y4, y5, y6, y7); \
	/* aria_diff_byte() \
	 * T1 = ABCD -> BADC \
	 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \
	 * T2 = ABCD -> CDAB \
	 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \
	 * T3 = ABCD -> DCBA \
	 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \
	 */ \
	aria_diff_word(x0, x1, x2, x3, \
		       x5, x4, x7, x6, \
		       y2, y3, y0, y1, \
		       y7, y6, y5, y4); \
	aria_store_state_8way(x3, x2, x1, x0, \
			      x6, x7, x4, x5, \
			      mem_tmp, 0);

#define aria_ff_gfni(x0, x1, x2, x3, \
		     x4, x5, x6, x7, \
		     y0, y1, y2, y3, \
		     y4, y5, y6, y7, \
		     mem_tmp, rk, round, last_round) \
	vpxor y7, y7, y7; \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, y7, y2, rk, 8, round); \
	\
	aria_sbox_8way_gfni(x2, x3, x0, x1, \
			    x6, x7, x4, x5, \
			    y0, y1, y2, y3, \
			    y4, y5, y6, y7); \
	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, y7, y2, rk, 8, last_round); \
	\
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 8); \
	\
	aria_load_state_8way(x0, x1, x2, x3, \
			     x4, x5, x6, x7, \
			     mem_tmp, 0); \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, y7, y2, rk, 0, round); \
	\
	aria_sbox_8way_gfni(x2, x3, x0, x1, \
			    x6, x7, x4, x5, \
			    y0, y1, y2, y3, \
			    y4, y5, y6, y7); \
	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, y7, y2, rk, 0, last_round); \
	\
	aria_load_state_8way(y0, y1, y2, y3, \
			     y4, y5, y6, y7, \
			     mem_tmp, 8);
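
/*
 * The constants below provide the shared shuffle masks, the split 4-bit
 * S-box filter tables used by aria_sbox_8way(), and the GFNI bit matrices
 * used by aria_sbox_8way_gfni().  Each bit matrix is emitted as two
 * identical .quad values because vgf2p8affineqb applies its 8x8 matrix
 * independently to each 64-bit lane of the XMM operand.
 */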

/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
.section	.rodata.cst16, "aM", @progbits, 16
.align 16

#define SHUFB_BYTES(idx) \
	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)

.Lshufb_16x16b:
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3);
/* For isolating SubBytes from AESENCLAST, inverse shift row */
.Linv_shift_row:
	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
.Lshift_row:
	.byte 0x00, 0x05, 0x0a, 0x0f, 0x04, 0x09, 0x0e, 0x03
	.byte 0x08, 0x0d, 0x02, 0x07, 0x0c, 0x01, 0x06, 0x0b
/* For CTR-mode IV byteswap */
.Lbswap128_mask:
	.byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
	.byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00

/* AES inverse affine and S2 combined:
 *      1 1 0 0 0 0 0 1     x0     0
 *      0 1 0 0 1 0 0 0     x1     0
 *      1 1 0 0 1 1 1 1     x2     0
 *      0 1 1 0 1 0 0 1     x3     1
 *      0 1 0 0 1 1 0 0  *  x4  +  0
 *      0 1 0 1 1 0 0 0     x5     0
 *      0 0 0 0 0 1 0 1     x6     0
 *      1 1 1 0 0 1 1 1     x7     1
 */
.Ltf_lo__inv_aff__and__s2:
	.octa 0x92172DA81A9FA520B2370D883ABF8500
.Ltf_hi__inv_aff__and__s2:
	.octa 0x2B15FFC1AF917B45E6D8320C625CB688

/* X2 and AES forward affine combined:
 *      1 0 1 1 0 0 0 1     x0     0
 *      0 1 1 1 1 0 1 1     x1     0
 *      0 0 0 1 1 0 1 0     x2     1
 *      0 1 0 0 0 1 0 0     x3     0
 *      0 0 1 1 1 0 1 1  *  x4  +  0
 *      0 1 0 0 1 0 0 0     x5     0
 *      1 1 0 1 0 0 1 1     x6     0
 *      0 1 0 0 1 0 1 0     x7     0
 */
.Ltf_lo__x2__and__fwd_aff:
	.octa 0xEFAE0544FCBD1657B8F95213ABEA4100
.Ltf_hi__x2__and__fwd_aff:
	.octa 0x3F893781E95FE1576CDA64D2BA0CB204

/* AES affine: */
#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
.Ltf_aff_bitmatrix:
	.quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
		    BV8(1, 1, 0, 0, 0, 1, 1, 1),
		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 1, 0, 0, 0, 1),
		    BV8(1, 1, 1, 1, 1, 0, 0, 0),
		    BV8(0, 1, 1, 1, 1, 1, 0, 0),
		    BV8(0, 0, 1, 1, 1, 1, 1, 0),
		    BV8(0, 0, 0, 1, 1, 1, 1, 1))
	.quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
		    BV8(1, 1, 0, 0, 0, 1, 1, 1),
		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 1, 0, 0, 0, 1),
		    BV8(1, 1, 1, 1, 1, 0, 0, 0),
		    BV8(0, 1, 1, 1, 1, 1, 0, 0),
		    BV8(0, 0, 1, 1, 1, 1, 1, 0),
		    BV8(0, 0, 0, 1, 1, 1, 1, 1))

/* AES inverse affine: */
#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
.Ltf_inv_bitmatrix:
	.quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
		    BV8(1, 0, 0, 1, 0, 0, 1, 0),
		    BV8(0, 1, 0, 0, 1, 0, 0, 1),
		    BV8(1, 0, 1, 0, 0, 1, 0, 0),
		    BV8(0, 1, 0, 1, 0, 0, 1, 0),
		    BV8(0, 0, 1, 0, 1, 0, 0, 1),
		    BV8(1, 0, 0, 1, 0, 1, 0, 0),
		    BV8(0, 1, 0, 0, 1, 0, 1, 0))
	.quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
		    BV8(1, 0, 0, 1, 0, 0, 1, 0),
		    BV8(0, 1, 0, 0, 1, 0, 0, 1),
		    BV8(1, 0, 1, 0, 0, 1, 0, 0),
		    BV8(0, 1, 0, 1, 0, 0, 1, 0),
		    BV8(0, 0, 1, 0, 1, 0, 0, 1),
		    BV8(1, 0, 0, 1, 0, 1, 0, 0),
		    BV8(0, 1, 0, 0, 1, 0, 1, 0))

/* S2: */
#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
.Ltf_s2_bitmatrix:
	.quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
		    BV8(0, 0, 1, 1, 1, 1, 1, 1),
		    BV8(1, 1, 1, 0, 1, 1, 0, 1),
		    BV8(1, 1, 0, 0, 0, 0, 1, 1),
		    BV8(0, 1, 0, 0, 0, 0, 1, 1),
		    BV8(1, 1, 0, 0, 1, 1, 1, 0),
		    BV8(0, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 1, 0, 1, 1, 0))
	.quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
		    BV8(0, 0, 1, 1, 1, 1, 1, 1),
		    BV8(1, 1, 1, 0, 1, 1, 0, 1),
		    BV8(1, 1, 0, 0, 0, 0, 1, 1),
		    BV8(0, 1, 0, 0, 0, 0, 1, 1),
		    BV8(1, 1, 0, 0, 1, 1, 1, 0),
		    BV8(0, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 1, 0, 1, 1, 0))

/* X2: */
#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
.Ltf_x2_bitmatrix:
	.quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
		    BV8(0, 0, 1, 0, 0, 1, 1, 0),
		    BV8(0, 0, 0, 0, 1, 0, 1, 0),
		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 0, 1, 1, 0, 0),
		    BV8(0, 1, 1, 0, 1, 0, 1, 1),
		    BV8(1, 0, 1, 1, 1, 1, 0, 1),
		    BV8(1, 0, 0, 1, 0, 0, 1, 1))
	.quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
		    BV8(0, 0, 1, 0, 0, 1, 1, 0),
		    BV8(0, 0, 0, 0, 1, 0, 1, 0),
		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 0, 1, 1, 0, 0),
		    BV8(0, 1, 1, 0, 1, 0, 1, 1),
		    BV8(1, 0, 1, 1, 1, 1, 0, 1),
		    BV8(1, 0, 0, 1, 0, 0, 1, 1))

/* Identity matrix: */
.Ltf_id_bitmatrix:
	.quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
		    BV8(0, 1, 0, 0, 0, 0, 0, 0),
		    BV8(0, 0, 1, 0, 0, 0, 0, 0),
		    BV8(0, 0, 0, 1, 0, 0, 0, 0),
		    BV8(0, 0, 0, 0, 1, 0, 0, 0),
		    BV8(0, 0, 0, 0, 0, 1, 0, 0),
		    BV8(0, 0, 0, 0, 0, 0, 1, 0),
		    BV8(0, 0, 0, 0, 0, 0, 0, 1))
	.quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
		    BV8(0, 1, 0, 0, 0, 0, 0, 0),
		    BV8(0, 0, 1, 0, 0, 0, 0, 0),
		    BV8(0, 0, 0, 1, 0, 0, 0, 0),
		    BV8(0, 0, 0, 0, 1, 0, 0, 0),
		    BV8(0, 0, 0, 0, 0, 1, 0, 0),
		    BV8(0, 0, 0, 0, 0, 0, 1, 0),
		    BV8(0, 0, 0, 0, 0, 0, 0, 1))

/* 4-bit mask */
.section	.rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
.align 4
.L0f0f0f0f:
	.long 0x0f0f0f0f

.text

SYM_FUNC_START_LOCAL(__aria_aesni_avx_crypt_16way)
	/* input:
	 *	%r9: rk
	 *	%rsi: dst
	 *	%rdx: src
	 *	%xmm0..%xmm15: 16 byte-sliced blocks
	 */

	FRAME_BEGIN

	movq %rsi, %rax;
	leaq 8 * 16(%rax), %r8;

	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		      %xmm15, %rax, %r8);
	aria_fo(%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 0);
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 1);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 2);
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 3);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 4);
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 5);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 6);
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 7);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 8);
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 9);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 10);
	cmpl $12, ARIA_CTX_rounds(CTX);
	jne .Laria_192;
	aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 11, 12);
	jmp .Laria_end;
.Laria_192:
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 11);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 12);
	cmpl $14, ARIA_CTX_rounds(CTX);
	jne .Laria_256;
	aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 13, 14);
	jmp .Laria_end;
.Laria_256:
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 13);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 14);
	aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 15, 16);
.Laria_end:
	debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
			   %xmm9, %xmm13, %xmm0, %xmm5,
			   %xmm10, %xmm14, %xmm3, %xmm6,
			   %xmm11, %xmm15, %xmm2, %xmm7,
			   (%rax), (%r8));

	FRAME_END
	RET;
SYM_FUNC_END(__aria_aesni_avx_crypt_16way)

SYM_TYPED_FUNC_START(aria_aesni_avx_encrypt_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	FRAME_BEGIN

	leaq ARIA_CTX_enc_key(CTX), %r9;

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx);

	call __aria_aesni_avx_crypt_16way;

	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_encrypt_16way)

SYM_TYPED_FUNC_START(aria_aesni_avx_decrypt_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	FRAME_BEGIN

	leaq ARIA_CTX_dec_key(CTX), %r9;

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx);

	call __aria_aesni_avx_crypt_16way;

	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_decrypt_16way)
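
/*
 * The helper below expands the big-endian IV at %r8 into sixteen consecutive
 * counter blocks: the IV is byteswapped to little endian, incremented with
 * inc_le128(), and each value is swapped back before use.  Blocks 0..7 are
 * stored to the keystream buffer at %rcx and reloaded into %xmm0..%xmm7,
 * blocks 8..15 stay in %xmm8..%xmm15, and the IV is advanced by sixteen for
 * the next call.
 */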

SYM_FUNC_START_LOCAL(__aria_aesni_avx_ctr_gen_keystream_16way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: keystream
	 *	%r8: iv (big endian, 128bit)
	 */

	FRAME_BEGIN
	/* load IV and byteswap */
	vmovdqu (%r8), %xmm8;

	vmovdqa .Lbswap128_mask (%rip), %xmm1;
	vpshufb %xmm1, %xmm8, %xmm3; /* be => le */

	vpcmpeqd %xmm0, %xmm0, %xmm0;
	vpsrldq $8, %xmm0, %xmm0; /* low: -1, high: 0 */

	/* construct IVs */
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm9;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm10;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm11;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm12;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm13;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm14;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm15;
	vmovdqu %xmm8, (0 * 16)(%rcx);
	vmovdqu %xmm9, (1 * 16)(%rcx);
	vmovdqu %xmm10, (2 * 16)(%rcx);
	vmovdqu %xmm11, (3 * 16)(%rcx);
	vmovdqu %xmm12, (4 * 16)(%rcx);
	vmovdqu %xmm13, (5 * 16)(%rcx);
	vmovdqu %xmm14, (6 * 16)(%rcx);
	vmovdqu %xmm15, (7 * 16)(%rcx);

	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm8;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm9;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm10;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm11;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm12;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm13;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm14;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm15;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm4;
	vmovdqu %xmm4, (%r8);

	vmovdqu (0 * 16)(%rcx), %xmm0;
	vmovdqu (1 * 16)(%rcx), %xmm1;
	vmovdqu (2 * 16)(%rcx), %xmm2;
	vmovdqu (3 * 16)(%rcx), %xmm3;
	vmovdqu (4 * 16)(%rcx), %xmm4;
	vmovdqu (5 * 16)(%rcx), %xmm5;
	vmovdqu (6 * 16)(%rcx), %xmm6;
	vmovdqu (7 * 16)(%rcx), %xmm7;

	FRAME_END
	RET;
SYM_FUNC_END(__aria_aesni_avx_ctr_gen_keystream_16way)
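
/*
 * CTR entry point below: after generating the keystream blocks, the real
 * dst and src pointers are parked in %r10/%r11 and %rsi/%rdx are pointed at
 * the keystream buffer, since the crypt core uses dst as its scratch area.
 * The encrypted counter blocks are then XORed with the source and written
 * out in the same register order as the other entry points.
 */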

SYM_TYPED_FUNC_START(aria_aesni_avx_ctr_crypt_16way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: keystream
	 *	%r8: iv (big endian, 128bit)
	 */
	FRAME_BEGIN

	call __aria_aesni_avx_ctr_gen_keystream_16way;

	leaq (%rsi), %r10;
	leaq (%rdx), %r11;
	leaq (%rcx), %rsi;
	leaq (%rcx), %rdx;
	leaq ARIA_CTX_enc_key(CTX), %r9;

	call __aria_aesni_avx_crypt_16way;

	vpxor (0 * 16)(%r11), %xmm1, %xmm1;
	vpxor (1 * 16)(%r11), %xmm0, %xmm0;
	vpxor (2 * 16)(%r11), %xmm3, %xmm3;
	vpxor (3 * 16)(%r11), %xmm2, %xmm2;
	vpxor (4 * 16)(%r11), %xmm4, %xmm4;
	vpxor (5 * 16)(%r11), %xmm5, %xmm5;
	vpxor (6 * 16)(%r11), %xmm6, %xmm6;
	vpxor (7 * 16)(%r11), %xmm7, %xmm7;
	vpxor (8 * 16)(%r11), %xmm8, %xmm8;
	vpxor (9 * 16)(%r11), %xmm9, %xmm9;
	vpxor (10 * 16)(%r11), %xmm10, %xmm10;
	vpxor (11 * 16)(%r11), %xmm11, %xmm11;
	vpxor (12 * 16)(%r11), %xmm12, %xmm12;
	vpxor (13 * 16)(%r11), %xmm13, %xmm13;
	vpxor (14 * 16)(%r11), %xmm14, %xmm14;
	vpxor (15 * 16)(%r11), %xmm15, %xmm15;
	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %r10);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_ctr_crypt_16way)

SYM_FUNC_START_LOCAL(__aria_aesni_avx_gfni_crypt_16way)
	/* input:
	 *	%r9: rk
	 *	%rsi: dst
	 *	%rdx: src
	 *	%xmm0..%xmm15: 16 byte-sliced blocks
	 */

	FRAME_BEGIN

	movq %rsi, %rax;
	leaq 8 * 16(%rax), %r8;

	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3,
		      %xmm4, %xmm5, %xmm6, %xmm7,
		      %xmm8, %xmm9, %xmm10, %xmm11,
		      %xmm12, %xmm13, %xmm14,
		      %xmm15, %rax, %r8);
	aria_fo_gfni(%xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 0);
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 1);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 2);
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 3);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 4);
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 5);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 6);
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 7);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 8);
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 9);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 10);
	cmpl $12, ARIA_CTX_rounds(CTX);
	jne .Laria_gfni_192;
	aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 11, 12);
	jmp .Laria_gfni_end;
.Laria_gfni_192:
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 11);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 12);
	cmpl $14, ARIA_CTX_rounds(CTX);
	jne .Laria_gfni_256;
	aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 13, 14);
	jmp .Laria_gfni_end;
.Laria_gfni_256:
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 13);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 14);
	aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 15, 16);
.Laria_gfni_end:
	debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
			   %xmm9, %xmm13, %xmm0, %xmm5,
			   %xmm10, %xmm14, %xmm3, %xmm6,
			   %xmm11, %xmm15, %xmm2, %xmm7,
			   (%rax), (%r8));

	FRAME_END
	RET;
SYM_FUNC_END(__aria_aesni_avx_gfni_crypt_16way)

SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_encrypt_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	FRAME_BEGIN

	leaq ARIA_CTX_enc_key(CTX), %r9;

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx);

	call __aria_aesni_avx_gfni_crypt_16way;

	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_gfni_encrypt_16way)

SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_decrypt_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	FRAME_BEGIN

	leaq ARIA_CTX_dec_key(CTX), %r9;

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx);

	call __aria_aesni_avx_gfni_crypt_16way;

	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_gfni_decrypt_16way)

SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_ctr_crypt_16way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: keystream
	 *	%r8: iv (big endian, 128bit)
	 */
	FRAME_BEGIN

	call __aria_aesni_avx_ctr_gen_keystream_16way

	leaq (%rsi), %r10;
	leaq (%rdx), %r11;
	leaq (%rcx), %rsi;
	leaq (%rcx), %rdx;
	leaq ARIA_CTX_enc_key(CTX), %r9;

	call __aria_aesni_avx_gfni_crypt_16way;

	vpxor (0 * 16)(%r11), %xmm1, %xmm1;
	vpxor (1 * 16)(%r11), %xmm0, %xmm0;
	vpxor (2 * 16)(%r11), %xmm3, %xmm3;
	vpxor (3 * 16)(%r11), %xmm2, %xmm2;
	vpxor (4 * 16)(%r11), %xmm4, %xmm4;
	vpxor (5 * 16)(%r11), %xmm5, %xmm5;
	vpxor (6 * 16)(%r11), %xmm6, %xmm6;
	vpxor (7 * 16)(%r11), %xmm7, %xmm7;
	vpxor (8 * 16)(%r11), %xmm8, %xmm8;
	vpxor (9 * 16)(%r11), %xmm9, %xmm9;
	vpxor (10 * 16)(%r11), %xmm10, %xmm10;
	vpxor (11 * 16)(%r11), %xmm11, %xmm11;
	vpxor (12 * 16)(%r11), %xmm12, %xmm12;
	vpxor (13 * 16)(%r11), %xmm13, %xmm13;
	vpxor (14 * 16)(%r11), %xmm14, %xmm14;
	vpxor (15 * 16)(%r11), %xmm15, %xmm15;
	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %r10);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_gfni_ctr_crypt_16way)