/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * ARIA Cipher 32-way parallel algorithm (AVX2)
 *
 * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com>
 *
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include <asm/asm-offsets.h>
#include <linux/cfi_types.h>

/* register macros */
#define CTX %rdi

#define ymm0_x xmm0
#define ymm1_x xmm1
#define ymm2_x xmm2
#define ymm3_x xmm3
#define ymm4_x xmm4
#define ymm5_x xmm5
#define ymm6_x xmm6
#define ymm7_x xmm7
#define ymm8_x xmm8
#define ymm9_x xmm9
#define ymm10_x xmm10
#define ymm11_x xmm11
#define ymm12_x xmm12
#define ymm13_x xmm13
#define ymm14_x xmm14
#define ymm15_x xmm15

#define BV8(a0, a1, a2, a3, a4, a5, a6, a7) \
        ( (((a0) & 1) << 0) | \
          (((a1) & 1) << 1) | \
          (((a2) & 1) << 2) | \
          (((a3) & 1) << 3) | \
          (((a4) & 1) << 4) | \
          (((a5) & 1) << 5) | \
          (((a6) & 1) << 6) | \
          (((a7) & 1) << 7) )

#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7) \
        ( ((l7) << (0 * 8)) | \
          ((l6) << (1 * 8)) | \
          ((l5) << (2 * 8)) | \
          ((l4) << (3 * 8)) | \
          ((l3) << (4 * 8)) | \
          ((l2) << (5 * 8)) | \
          ((l1) << (6 * 8)) | \
          ((l0) << (7 * 8)) )

#define inc_le128(x, minus_one, tmp) \
        vpcmpeqq minus_one, x, tmp; \
        vpsubq minus_one, x, x; \
        vpslldq $8, tmp, tmp; \
        vpsubq tmp, x, x;
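
/*
 * Note on inc_le128: the counter is kept as a 128-bit little-endian
 * integer per 128-bit lane.  minus_one holds -1 in the low qword and 0
 * in the high qword, so the first vpsubq adds 1 to the low half only.
 * vpcmpeqq leaves an all-ones low qword exactly when the low half was
 * 0xffffffffffffffff (about to wrap); vpslldq moves that mask into the
 * high-qword position and the final vpsubq adds the carry into the
 * upper 64 bits.
 */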

#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
        vpand x, mask4bit, tmp0; \
        vpandn x, mask4bit, x; \
        vpsrld $4, x, x; \
        \
        vpshufb tmp0, lo_t, tmp0; \
        vpshufb x, hi_t, x; \
        vpxor tmp0, x, x;
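
/*
 * Note on filter_8bit: a byte mapping that is affine over GF(2) splits
 * into two 16-entry tables indexed by the low and the high nibble.
 * mask4bit must hold 0x0f in every byte; the two vpshufb lookups are
 * XORed to produce the transformed byte.
 */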

#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
        vpunpckhdq x1, x0, t2; \
        vpunpckldq x1, x0, x0; \
        \
        vpunpckldq x3, x2, t1; \
        vpunpckhdq x3, x2, x2; \
        \
        vpunpckhqdq t1, x0, x1; \
        vpunpcklqdq t1, x0, x0; \
        \
        vpunpckhqdq x2, t2, x3; \
        vpunpcklqdq x2, t2, x2;

#define byteslice_16x16b(a0, b0, c0, d0, \
                         a1, b1, c1, d1, \
                         a2, b2, c2, d2, \
                         a3, b3, c3, d3, \
                         st0, st1) \
        vmovdqu d2, st0; \
        vmovdqu d3, st1; \
        transpose_4x4(a0, a1, a2, a3, d2, d3); \
        transpose_4x4(b0, b1, b2, b3, d2, d3); \
        vmovdqu st0, d2; \
        vmovdqu st1, d3; \
        \
        vmovdqu a0, st0; \
        vmovdqu a1, st1; \
        transpose_4x4(c0, c1, c2, c3, a0, a1); \
        transpose_4x4(d0, d1, d2, d3, a0, a1); \
        \
        vbroadcasti128 .Lshufb_16x16b(%rip), a0; \
        vmovdqu st1, a1; \
        vpshufb a0, a2, a2; \
        vpshufb a0, a3, a3; \
        vpshufb a0, b0, b0; \
        vpshufb a0, b1, b1; \
        vpshufb a0, b2, b2; \
        vpshufb a0, b3, b3; \
        vpshufb a0, a1, a1; \
        vpshufb a0, c0, c0; \
        vpshufb a0, c1, c1; \
        vpshufb a0, c2, c2; \
        vpshufb a0, c3, c3; \
        vpshufb a0, d0, d0; \
        vpshufb a0, d1, d1; \
        vpshufb a0, d2, d2; \
        vpshufb a0, d3, d3; \
        vmovdqu d3, st1; \
        vmovdqu st0, d3; \
        vpshufb a0, d3, a0; \
        vmovdqu d2, st0; \
        \
        transpose_4x4(a0, b0, c0, d0, d2, d3); \
        transpose_4x4(a1, b1, c1, d1, d2, d3); \
        vmovdqu st0, d2; \
        vmovdqu st1, d3; \
        \
        vmovdqu b0, st0; \
        vmovdqu b1, st1; \
        transpose_4x4(a2, b2, c2, d2, b0, b1); \
        transpose_4x4(a3, b3, c3, d3, b0, b1); \
        vmovdqu st0, b0; \
        vmovdqu st1, b1; \
        /* does not adjust output bytes inside vectors */

#define debyteslice_16x16b(a0, b0, c0, d0, \
                           a1, b1, c1, d1, \
                           a2, b2, c2, d2, \
                           a3, b3, c3, d3, \
                           st0, st1) \
        vmovdqu d2, st0; \
        vmovdqu d3, st1; \
        transpose_4x4(a0, a1, a2, a3, d2, d3); \
        transpose_4x4(b0, b1, b2, b3, d2, d3); \
        vmovdqu st0, d2; \
        vmovdqu st1, d3; \
        \
        vmovdqu a0, st0; \
        vmovdqu a1, st1; \
        transpose_4x4(c0, c1, c2, c3, a0, a1); \
        transpose_4x4(d0, d1, d2, d3, a0, a1); \
        \
        vbroadcasti128 .Lshufb_16x16b(%rip), a0; \
        vmovdqu st1, a1; \
        vpshufb a0, a2, a2; \
        vpshufb a0, a3, a3; \
        vpshufb a0, b0, b0; \
        vpshufb a0, b1, b1; \
        vpshufb a0, b2, b2; \
        vpshufb a0, b3, b3; \
        vpshufb a0, a1, a1; \
        vpshufb a0, c0, c0; \
        vpshufb a0, c1, c1; \
        vpshufb a0, c2, c2; \
        vpshufb a0, c3, c3; \
        vpshufb a0, d0, d0; \
        vpshufb a0, d1, d1; \
        vpshufb a0, d2, d2; \
        vpshufb a0, d3, d3; \
        vmovdqu d3, st1; \
        vmovdqu st0, d3; \
        vpshufb a0, d3, a0; \
        vmovdqu d2, st0; \
        \
        transpose_4x4(c0, d0, a0, b0, d2, d3); \
        transpose_4x4(c1, d1, a1, b1, d2, d3); \
        vmovdqu st0, d2; \
        vmovdqu st1, d3; \
        \
        vmovdqu b0, st0; \
        vmovdqu b1, st1; \
        transpose_4x4(c2, d2, a2, b2, b0, b1); \
        transpose_4x4(c3, d3, a3, b3, b0, b1); \
        vmovdqu st0, b0; \
        vmovdqu st1, b1; \
        /* does not adjust output bytes inside vectors */

/* load blocks to registers and apply pre-whitening */
#define inpack16_pre(x0, x1, x2, x3, \
                     x4, x5, x6, x7, \
                     y0, y1, y2, y3, \
                     y4, y5, y6, y7, \
                     rio) \
        vmovdqu (0 * 32)(rio), x0; \
        vmovdqu (1 * 32)(rio), x1; \
        vmovdqu (2 * 32)(rio), x2; \
        vmovdqu (3 * 32)(rio), x3; \
        vmovdqu (4 * 32)(rio), x4; \
        vmovdqu (5 * 32)(rio), x5; \
        vmovdqu (6 * 32)(rio), x6; \
        vmovdqu (7 * 32)(rio), x7; \
        vmovdqu (8 * 32)(rio), y0; \
        vmovdqu (9 * 32)(rio), y1; \
        vmovdqu (10 * 32)(rio), y2; \
        vmovdqu (11 * 32)(rio), y3; \
        vmovdqu (12 * 32)(rio), y4; \
        vmovdqu (13 * 32)(rio), y5; \
        vmovdqu (14 * 32)(rio), y6; \
        vmovdqu (15 * 32)(rio), y7;

/* byteslice pre-whitened blocks and store to temporary memory */
#define inpack16_post(x0, x1, x2, x3, \
                      x4, x5, x6, x7, \
                      y0, y1, y2, y3, \
                      y4, y5, y6, y7, \
                      mem_ab, mem_cd) \
        byteslice_16x16b(x0, x1, x2, x3, \
                         x4, x5, x6, x7, \
                         y0, y1, y2, y3, \
                         y4, y5, y6, y7, \
                         (mem_ab), (mem_cd)); \
        \
        vmovdqu x0, 0 * 32(mem_ab); \
        vmovdqu x1, 1 * 32(mem_ab); \
        vmovdqu x2, 2 * 32(mem_ab); \
        vmovdqu x3, 3 * 32(mem_ab); \
        vmovdqu x4, 4 * 32(mem_ab); \
        vmovdqu x5, 5 * 32(mem_ab); \
        vmovdqu x6, 6 * 32(mem_ab); \
        vmovdqu x7, 7 * 32(mem_ab); \
        vmovdqu y0, 0 * 32(mem_cd); \
        vmovdqu y1, 1 * 32(mem_cd); \
        vmovdqu y2, 2 * 32(mem_cd); \
        vmovdqu y3, 3 * 32(mem_cd); \
        vmovdqu y4, 4 * 32(mem_cd); \
        vmovdqu y5, 5 * 32(mem_cd); \
        vmovdqu y6, 6 * 32(mem_cd); \
        vmovdqu y7, 7 * 32(mem_cd);

#define write_output(x0, x1, x2, x3, \
                     x4, x5, x6, x7, \
                     y0, y1, y2, y3, \
                     y4, y5, y6, y7, \
                     mem) \
        vmovdqu x0, 0 * 32(mem); \
        vmovdqu x1, 1 * 32(mem); \
        vmovdqu x2, 2 * 32(mem); \
        vmovdqu x3, 3 * 32(mem); \
        vmovdqu x4, 4 * 32(mem); \
        vmovdqu x5, 5 * 32(mem); \
        vmovdqu x6, 6 * 32(mem); \
        vmovdqu x7, 7 * 32(mem); \
        vmovdqu y0, 8 * 32(mem); \
        vmovdqu y1, 9 * 32(mem); \
        vmovdqu y2, 10 * 32(mem); \
        vmovdqu y3, 11 * 32(mem); \
        vmovdqu y4, 12 * 32(mem); \
        vmovdqu y5, 13 * 32(mem); \
        vmovdqu y6, 14 * 32(mem); \
        vmovdqu y7, 15 * 32(mem); \

#define aria_store_state_8way(x0, x1, x2, x3, \
                              x4, x5, x6, x7, \
                              mem_tmp, idx) \
        vmovdqu x0, ((idx + 0) * 32)(mem_tmp); \
        vmovdqu x1, ((idx + 1) * 32)(mem_tmp); \
        vmovdqu x2, ((idx + 2) * 32)(mem_tmp); \
        vmovdqu x3, ((idx + 3) * 32)(mem_tmp); \
        vmovdqu x4, ((idx + 4) * 32)(mem_tmp); \
        vmovdqu x5, ((idx + 5) * 32)(mem_tmp); \
        vmovdqu x6, ((idx + 6) * 32)(mem_tmp); \
        vmovdqu x7, ((idx + 7) * 32)(mem_tmp);

#define aria_load_state_8way(x0, x1, x2, x3, \
                             x4, x5, x6, x7, \
                             mem_tmp, idx) \
        vmovdqu ((idx + 0) * 32)(mem_tmp), x0; \
        vmovdqu ((idx + 1) * 32)(mem_tmp), x1; \
        vmovdqu ((idx + 2) * 32)(mem_tmp), x2; \
        vmovdqu ((idx + 3) * 32)(mem_tmp), x3; \
        vmovdqu ((idx + 4) * 32)(mem_tmp), x4; \
        vmovdqu ((idx + 5) * 32)(mem_tmp), x5; \
        vmovdqu ((idx + 6) * 32)(mem_tmp), x6; \
        vmovdqu ((idx + 7) * 32)(mem_tmp), x7;

#define aria_ark_8way(x0, x1, x2, x3, \
                      x4, x5, x6, x7, \
                      t0, rk, idx, round) \
        /* AddRoundKey */ \
        vpbroadcastb ((round * 16) + idx + 3)(rk), t0; \
        vpxor t0, x0, x0; \
        vpbroadcastb ((round * 16) + idx + 2)(rk), t0; \
        vpxor t0, x1, x1; \
        vpbroadcastb ((round * 16) + idx + 1)(rk), t0; \
        vpxor t0, x2, x2; \
        vpbroadcastb ((round * 16) + idx + 0)(rk), t0; \
        vpxor t0, x3, x3; \
        vpbroadcastb ((round * 16) + idx + 7)(rk), t0; \
        vpxor t0, x4, x4; \
        vpbroadcastb ((round * 16) + idx + 6)(rk), t0; \
        vpxor t0, x5, x5; \
        vpbroadcastb ((round * 16) + idx + 5)(rk), t0; \
        vpxor t0, x6, x6; \
        vpbroadcastb ((round * 16) + idx + 4)(rk), t0; \
        vpxor t0, x7, x7;
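
/*
 * Note on aria_ark_8way: in the byte-sliced layout each ymm register
 * holds the same byte position of 32 blocks, so AddRoundKey reduces to
 * broadcasting a single round-key byte (vpbroadcastb) and XORing it into
 * the matching byte plane.  The (round * 16) + idx + N arithmetic picks
 * the key byte within the 16-byte round key that corresponds to each
 * plane.
 */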

#define aria_sbox_8way_gfni(x0, x1, x2, x3, \
                            x4, x5, x6, x7, \
                            t0, t1, t2, t3, \
                            t4, t5, t6, t7) \
        vpbroadcastq .Ltf_s2_bitmatrix(%rip), t0; \
        vpbroadcastq .Ltf_inv_bitmatrix(%rip), t1; \
        vpbroadcastq .Ltf_id_bitmatrix(%rip), t2; \
        vpbroadcastq .Ltf_aff_bitmatrix(%rip), t3; \
        vpbroadcastq .Ltf_x2_bitmatrix(%rip), t4; \
        vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \
        vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \
        vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \
        vgf2p8affineqb $(tf_inv_const), t1, x6, x6; \
        vgf2p8affineinvqb $0, t2, x2, x2; \
        vgf2p8affineinvqb $0, t2, x6, x6; \
        vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0; \
        vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4; \
        vgf2p8affineqb $(tf_x2_const), t4, x3, x3; \
        vgf2p8affineqb $(tf_x2_const), t4, x7, x7; \
        vgf2p8affineinvqb $0, t2, x3, x3; \
        vgf2p8affineinvqb $0, t2, x7, x7
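
/*
 * Note on aria_sbox_8way_gfni: the four ARIA S-boxes (S1, S2 and their
 * inverses X1, X2) share the GF(2^8) inversion and differ only in the
 * affine transforms around it.  vgf2p8affineqb applies an 8x8 bit matrix
 * plus the constant supplied in the immediate; vgf2p8affineinvqb first
 * inverts the byte in GF(2^8) and then applies the affine map.  The bit
 * matrices are the .Ltf_*_bitmatrix constants defined below.
 */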

#define aria_sbox_8way(x0, x1, x2, x3, \
                       x4, x5, x6, x7, \
                       t0, t1, t2, t3, \
                       t4, t5, t6, t7) \
        vpxor t7, t7, t7; \
        vpxor t6, t6, t6; \
        vbroadcasti128 .Linv_shift_row(%rip), t0; \
        vbroadcasti128 .Lshift_row(%rip), t1; \
        vbroadcasti128 .Ltf_lo__inv_aff__and__s2(%rip), t2; \
        vbroadcasti128 .Ltf_hi__inv_aff__and__s2(%rip), t3; \
        vbroadcasti128 .Ltf_lo__x2__and__fwd_aff(%rip), t4; \
        vbroadcasti128 .Ltf_hi__x2__and__fwd_aff(%rip), t5; \
        \
        vextracti128 $1, x0, t6##_x; \
        vaesenclast t7##_x, x0##_x, x0##_x; \
        vaesenclast t7##_x, t6##_x, t6##_x; \
        vinserti128 $1, t6##_x, x0, x0; \
        \
        vextracti128 $1, x4, t6##_x; \
        vaesenclast t7##_x, x4##_x, x4##_x; \
        vaesenclast t7##_x, t6##_x, t6##_x; \
        vinserti128 $1, t6##_x, x4, x4; \
        \
        vextracti128 $1, x1, t6##_x; \
        vaesenclast t7##_x, x1##_x, x1##_x; \
        vaesenclast t7##_x, t6##_x, t6##_x; \
        vinserti128 $1, t6##_x, x1, x1; \
        \
        vextracti128 $1, x5, t6##_x; \
        vaesenclast t7##_x, x5##_x, x5##_x; \
        vaesenclast t7##_x, t6##_x, t6##_x; \
        vinserti128 $1, t6##_x, x5, x5; \
        \
        vextracti128 $1, x2, t6##_x; \
        vaesdeclast t7##_x, x2##_x, x2##_x; \
        vaesdeclast t7##_x, t6##_x, t6##_x; \
        vinserti128 $1, t6##_x, x2, x2; \
        \
        vextracti128 $1, x6, t6##_x; \
        vaesdeclast t7##_x, x6##_x, x6##_x; \
        vaesdeclast t7##_x, t6##_x, t6##_x; \
        vinserti128 $1, t6##_x, x6, x6; \
        \
        vpbroadcastd .L0f0f0f0f(%rip), t6; \
        \
        /* AES inverse shift rows */ \
        vpshufb t0, x0, x0; \
        vpshufb t0, x4, x4; \
        vpshufb t0, x1, x1; \
        vpshufb t0, x5, x5; \
        vpshufb t1, x3, x3; \
        vpshufb t1, x7, x7; \
        vpshufb t1, x2, x2; \
        vpshufb t1, x6, x6; \
        \
        /* affine transformation for S2 */ \
        filter_8bit(x1, t2, t3, t6, t0); \
        /* affine transformation for S2 */ \
        filter_8bit(x5, t2, t3, t6, t0); \
        \
        /* affine transformation for X2 */ \
        filter_8bit(x3, t4, t5, t6, t0); \
        /* affine transformation for X2 */ \
        filter_8bit(x7, t4, t5, t6, t0); \
        \
        vpxor t6, t6, t6; \
        vextracti128 $1, x3, t6##_x; \
        vaesdeclast t7##_x, x3##_x, x3##_x; \
        vaesdeclast t7##_x, t6##_x, t6##_x; \
        vinserti128 $1, t6##_x, x3, x3; \
        \
        vextracti128 $1, x7, t6##_x; \
        vaesdeclast t7##_x, x7##_x, x7##_x; \
        vaesdeclast t7##_x, t6##_x, t6##_x; \
        vinserti128 $1, t6##_x, x7, x7; \

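/*
 * Note on aria_sbox_8way: vaesenclast/vaesdeclast with an all-zero round
 * key reduce to SubBytes+ShiftRows (resp. InvSubBytes+InvShiftRows), and
 * the following vpshufb with .Linv_shift_row/.Lshift_row undoes the row
 * shift, leaving the bare AES S-box and its inverse (ARIA's S1 and X1).
 * S2 and X2 are then reached through the combined affine corrections
 * applied with filter_8bit.  The 256-bit registers are split into
 * 128-bit halves around the AES instructions because this path assumes
 * AES-NI only (no VAES).
 */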
#define aria_diff_m(x0, x1, x2, x3, \
                    t0, t1, t2, t3) \
        /* T = rotr32(X, 8); */ \
        /* X ^= T */ \
        vpxor x0, x3, t0; \
        vpxor x1, x0, t1; \
        vpxor x2, x1, t2; \
        vpxor x3, x2, t3; \
        /* X = T ^ rotr(X, 16); */ \
        vpxor t2, x0, x0; \
        vpxor x1, t3, t3; \
        vpxor t0, x2, x2; \
        vpxor t1, x3, x1; \
        vmovdqu t3, x3;
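
/*
 * Note on aria_diff_m: x0..x3 hold the four byte planes of each 32-bit
 * word, so the rotr32() steps quoted in the comments above are plain
 * permutations of planes and the whole mixing step collapses into this
 * XOR/move sequence with no actual rotate instructions.
 */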

#define aria_diff_word(x0, x1, x2, x3, \
                       x4, x5, x6, x7, \
                       y0, y1, y2, y3, \
                       y4, y5, y6, y7) \
        /* t1 ^= t2; */ \
        vpxor y0, x4, x4; \
        vpxor y1, x5, x5; \
        vpxor y2, x6, x6; \
        vpxor y3, x7, x7; \
        \
        /* t2 ^= t3; */ \
        vpxor y4, y0, y0; \
        vpxor y5, y1, y1; \
        vpxor y6, y2, y2; \
        vpxor y7, y3, y3; \
        \
        /* t0 ^= t1; */ \
        vpxor x4, x0, x0; \
        vpxor x5, x1, x1; \
        vpxor x6, x2, x2; \
        vpxor x7, x3, x3; \
        \
        /* t3 ^= t1; */ \
        vpxor x4, y4, y4; \
        vpxor x5, y5, y5; \
        vpxor x6, y6, y6; \
        vpxor x7, y7, y7; \
        \
        /* t2 ^= t0; */ \
        vpxor x0, y0, y0; \
        vpxor x1, y1, y1; \
        vpxor x2, y2, y2; \
        vpxor x3, y3, y3; \
        \
        /* t1 ^= t2; */ \
        vpxor y0, x4, x4; \
        vpxor y1, x5, x5; \
        vpxor y2, x6, x6; \
        vpxor y3, x7, x7;

#define aria_fe(x0, x1, x2, x3, \
                x4, x5, x6, x7, \
                y0, y1, y2, y3, \
                y4, y5, y6, y7, \
                mem_tmp, rk, round) \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
                      y0, rk, 8, round); \
        \
        aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
                       y0, y1, y2, y3, y4, y5, y6, y7); \
        \
        aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
        aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
        aria_store_state_8way(x0, x1, x2, x3, \
                              x4, x5, x6, x7, \
                              mem_tmp, 8); \
        \
        aria_load_state_8way(x0, x1, x2, x3, \
                             x4, x5, x6, x7, \
                             mem_tmp, 0); \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
                      y0, rk, 0, round); \
        \
        aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
                       y0, y1, y2, y3, y4, y5, y6, y7); \
        \
        aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
        aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
        aria_store_state_8way(x0, x1, x2, x3, \
                              x4, x5, x6, x7, \
                              mem_tmp, 0); \
        aria_load_state_8way(y0, y1, y2, y3, \
                             y4, y5, y6, y7, \
                             mem_tmp, 8); \
        aria_diff_word(x0, x1, x2, x3, \
                       x4, x5, x6, x7, \
                       y0, y1, y2, y3, \
                       y4, y5, y6, y7); \
        /* aria_diff_byte() \
         * T3 = ABCD -> BADC \
         * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \
         * T0 = ABCD -> CDAB \
         * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \
         * T1 = ABCD -> DCBA \
         * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \
         */ \
        aria_diff_word(x2, x3, x0, x1, \
                       x7, x6, x5, x4, \
                       y0, y1, y2, y3, \
                       y5, y4, y7, y6); \
        aria_store_state_8way(x3, x2, x1, x0, \
                              x6, x7, x4, x5, \
                              mem_tmp, 0);

#define aria_fo(x0, x1, x2, x3, \
                x4, x5, x6, x7, \
                y0, y1, y2, y3, \
                y4, y5, y6, y7, \
                mem_tmp, rk, round) \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
                      y0, rk, 8, round); \
        \
        aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
                       y0, y1, y2, y3, y4, y5, y6, y7); \
        \
        aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
        aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
        aria_store_state_8way(x0, x1, x2, x3, \
                              x4, x5, x6, x7, \
                              mem_tmp, 8); \
        \
        aria_load_state_8way(x0, x1, x2, x3, \
                             x4, x5, x6, x7, \
                             mem_tmp, 0); \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
                      y0, rk, 0, round); \
        \
        aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
                       y0, y1, y2, y3, y4, y5, y6, y7); \
        \
        aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
        aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
        aria_store_state_8way(x0, x1, x2, x3, \
                              x4, x5, x6, x7, \
                              mem_tmp, 0); \
        aria_load_state_8way(y0, y1, y2, y3, \
                             y4, y5, y6, y7, \
                             mem_tmp, 8); \
        aria_diff_word(x0, x1, x2, x3, \
                       x4, x5, x6, x7, \
                       y0, y1, y2, y3, \
                       y4, y5, y6, y7); \
        /* aria_diff_byte() \
         * T1 = ABCD -> BADC \
         * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \
         * T2 = ABCD -> CDAB \
         * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \
         * T3 = ABCD -> DCBA \
         * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \
         */ \
        aria_diff_word(x0, x1, x2, x3, \
                       x5, x4, x7, x6, \
                       y2, y3, y0, y1, \
                       y7, y6, y5, y4); \
        aria_store_state_8way(x3, x2, x1, x0, \
                              x6, x7, x4, x5, \
                              mem_tmp, 0);

#define aria_ff(x0, x1, x2, x3, \
                x4, x5, x6, x7, \
                y0, y1, y2, y3, \
                y4, y5, y6, y7, \
                mem_tmp, rk, round, last_round) \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
                      y0, rk, 8, round); \
        \
        aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
                       y0, y1, y2, y3, y4, y5, y6, y7); \
        \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
                      y0, rk, 8, last_round); \
        \
        aria_store_state_8way(x0, x1, x2, x3, \
                              x4, x5, x6, x7, \
                              mem_tmp, 8); \
        \
        aria_load_state_8way(x0, x1, x2, x3, \
                             x4, x5, x6, x7, \
                             mem_tmp, 0); \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
                      y0, rk, 0, round); \
        \
        aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
                       y0, y1, y2, y3, y4, y5, y6, y7); \
        \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
                      y0, rk, 0, last_round); \
        \
        aria_load_state_8way(y0, y1, y2, y3, \
                             y4, y5, y6, y7, \
                             mem_tmp, 8);

#define aria_fe_gfni(x0, x1, x2, x3, \
                     x4, x5, x6, x7, \
                     y0, y1, y2, y3, \
                     y4, y5, y6, y7, \
                     mem_tmp, rk, round) \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
                      y0, rk, 8, round); \
        \
        aria_sbox_8way_gfni(x2, x3, x0, x1, \
                            x6, x7, x4, x5, \
                            y0, y1, y2, y3, \
                            y4, y5, y6, y7); \
        \
        aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
        aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
        aria_store_state_8way(x0, x1, x2, x3, \
                              x4, x5, x6, x7, \
                              mem_tmp, 8); \
        \
        aria_load_state_8way(x0, x1, x2, x3, \
                             x4, x5, x6, x7, \
                             mem_tmp, 0); \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
                      y0, rk, 0, round); \
        \
        aria_sbox_8way_gfni(x2, x3, x0, x1, \
                            x6, x7, x4, x5, \
                            y0, y1, y2, y3, \
                            y4, y5, y6, y7); \
        \
        aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
        aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
        aria_store_state_8way(x0, x1, x2, x3, \
                              x4, x5, x6, x7, \
                              mem_tmp, 0); \
        aria_load_state_8way(y0, y1, y2, y3, \
                             y4, y5, y6, y7, \
                             mem_tmp, 8); \
        aria_diff_word(x0, x1, x2, x3, \
                       x4, x5, x6, x7, \
                       y0, y1, y2, y3, \
                       y4, y5, y6, y7); \
        /* aria_diff_byte() \
         * T3 = ABCD -> BADC \
         * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \
         * T0 = ABCD -> CDAB \
         * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \
         * T1 = ABCD -> DCBA \
         * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \
         */ \
        aria_diff_word(x2, x3, x0, x1, \
                       x7, x6, x5, x4, \
                       y0, y1, y2, y3, \
                       y5, y4, y7, y6); \
        aria_store_state_8way(x3, x2, x1, x0, \
                              x6, x7, x4, x5, \
                              mem_tmp, 0);

#define aria_fo_gfni(x0, x1, x2, x3, \
                     x4, x5, x6, x7, \
                     y0, y1, y2, y3, \
                     y4, y5, y6, y7, \
                     mem_tmp, rk, round) \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
                      y0, rk, 8, round); \
        \
        aria_sbox_8way_gfni(x0, x1, x2, x3, \
                            x4, x5, x6, x7, \
                            y0, y1, y2, y3, \
                            y4, y5, y6, y7); \
        \
        aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
        aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
        aria_store_state_8way(x0, x1, x2, x3, \
                              x4, x5, x6, x7, \
                              mem_tmp, 8); \
        \
        aria_load_state_8way(x0, x1, x2, x3, \
                             x4, x5, x6, x7, \
                             mem_tmp, 0); \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
                      y0, rk, 0, round); \
        \
        aria_sbox_8way_gfni(x0, x1, x2, x3, \
                            x4, x5, x6, x7, \
                            y0, y1, y2, y3, \
                            y4, y5, y6, y7); \
        \
        aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
        aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
        aria_store_state_8way(x0, x1, x2, x3, \
                              x4, x5, x6, x7, \
                              mem_tmp, 0); \
        aria_load_state_8way(y0, y1, y2, y3, \
                             y4, y5, y6, y7, \
                             mem_tmp, 8); \
        aria_diff_word(x0, x1, x2, x3, \
                       x4, x5, x6, x7, \
                       y0, y1, y2, y3, \
                       y4, y5, y6, y7); \
        /* aria_diff_byte() \
         * T1 = ABCD -> BADC \
         * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \
         * T2 = ABCD -> CDAB \
         * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \
         * T3 = ABCD -> DCBA \
         * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \
         */ \
        aria_diff_word(x0, x1, x2, x3, \
                       x5, x4, x7, x6, \
                       y2, y3, y0, y1, \
                       y7, y6, y5, y4); \
        aria_store_state_8way(x3, x2, x1, x0, \
                              x6, x7, x4, x5, \
                              mem_tmp, 0);

#define aria_ff_gfni(x0, x1, x2, x3, \
                     x4, x5, x6, x7, \
                     y0, y1, y2, y3, \
                     y4, y5, y6, y7, \
                     mem_tmp, rk, round, last_round) \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
                      y0, rk, 8, round); \
        \
        aria_sbox_8way_gfni(x2, x3, x0, x1, \
                            x6, x7, x4, x5, \
                            y0, y1, y2, y3, \
                            y4, y5, y6, y7); \
        \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
                      y0, rk, 8, last_round); \
        \
        aria_store_state_8way(x0, x1, x2, x3, \
                              x4, x5, x6, x7, \
                              mem_tmp, 8); \
        \
        aria_load_state_8way(x0, x1, x2, x3, \
                             x4, x5, x6, x7, \
                             mem_tmp, 0); \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
                      y0, rk, 0, round); \
        \
        aria_sbox_8way_gfni(x2, x3, x0, x1, \
                            x6, x7, x4, x5, \
                            y0, y1, y2, y3, \
                            y4, y5, y6, y7); \
        \
        aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
                      y0, rk, 0, last_round); \
        \
        aria_load_state_8way(y0, y1, y2, y3, \
                             y4, y5, y6, y7, \
                             mem_tmp, 8);

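/*
 * Note on the round macros: aria_fo/aria_fe are the odd/even round
 * functions (AddRoundKey, substitution layer, diffusion layer) and
 * aria_ff is the final round, where the diffusion layer is replaced by a
 * second AddRoundKey.  The *_gfni variants differ only in the S-box
 * implementation.  Each macro processes the 16 byte planes as two halves
 * of eight registers, spilling the inactive half to mem_tmp.
 */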
.section .rodata.cst32.shufb_16x16b, "aM", @progbits, 32
.align 32
#define SHUFB_BYTES(idx) \
        0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)
.Lshufb_16x16b:
        .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)
        .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3)

.section .rodata.cst16, "aM", @progbits, 16
.align 16
/* For isolating SubBytes from AESENCLAST, inverse shift row */
.Linv_shift_row:
        .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
        .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
.Lshift_row:
        .byte 0x00, 0x05, 0x0a, 0x0f, 0x04, 0x09, 0x0e, 0x03
        .byte 0x08, 0x0d, 0x02, 0x07, 0x0c, 0x01, 0x06, 0x0b
/* For CTR-mode IV byteswap */
.Lbswap128_mask:
        .byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
        .byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00

/* AES inverse affine and S2 combined:
 *      1 1 0 0 0 0 0 1     x0     0
 *      0 1 0 0 1 0 0 0     x1     0
 *      1 1 0 0 1 1 1 1     x2     0
 *      0 1 1 0 1 0 0 1     x3     1
 *      0 1 0 0 1 1 0 0  *  x4  +  0
 *      0 1 0 1 1 0 0 0     x5     0
 *      0 0 0 0 0 1 0 1     x6     0
 *      1 1 1 0 0 1 1 1     x7     1
 */
.Ltf_lo__inv_aff__and__s2:
        .octa 0x92172DA81A9FA520B2370D883ABF8500
.Ltf_hi__inv_aff__and__s2:
        .octa 0x2B15FFC1AF917B45E6D8320C625CB688

/* X2 and AES forward affine combined:
 *      1 0 1 1 0 0 0 1     x0     0
 *      0 1 1 1 1 0 1 1     x1     0
 *      0 0 0 1 1 0 1 0     x2     1
 *      0 1 0 0 0 1 0 0     x3     0
 *      0 0 1 1 1 0 1 1  *  x4  +  0
 *      0 1 0 0 1 0 0 0     x5     0
 *      1 1 0 1 0 0 1 1     x6     0
 *      0 1 0 0 1 0 1 0     x7     0
 */
.Ltf_lo__x2__and__fwd_aff:
        .octa 0xEFAE0544FCBD1657B8F95213ABEA4100
.Ltf_hi__x2__and__fwd_aff:
        .octa 0x3F893781E95FE1576CDA64D2BA0CB204

.section .rodata.cst8, "aM", @progbits, 8
.align 8
/* AES affine: */
#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
.Ltf_aff_bitmatrix:
        .quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
                    BV8(1, 1, 0, 0, 0, 1, 1, 1),
                    BV8(1, 1, 1, 0, 0, 0, 1, 1),
                    BV8(1, 1, 1, 1, 0, 0, 0, 1),
                    BV8(1, 1, 1, 1, 1, 0, 0, 0),
                    BV8(0, 1, 1, 1, 1, 1, 0, 0),
                    BV8(0, 0, 1, 1, 1, 1, 1, 0),
                    BV8(0, 0, 0, 1, 1, 1, 1, 1))

/* AES inverse affine: */
#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
.Ltf_inv_bitmatrix:
        .quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
                    BV8(1, 0, 0, 1, 0, 0, 1, 0),
                    BV8(0, 1, 0, 0, 1, 0, 0, 1),
                    BV8(1, 0, 1, 0, 0, 1, 0, 0),
                    BV8(0, 1, 0, 1, 0, 0, 1, 0),
                    BV8(0, 0, 1, 0, 1, 0, 0, 1),
                    BV8(1, 0, 0, 1, 0, 1, 0, 0),
                    BV8(0, 1, 0, 0, 1, 0, 1, 0))

/* S2: */
#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
.Ltf_s2_bitmatrix:
        .quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
                    BV8(0, 0, 1, 1, 1, 1, 1, 1),
                    BV8(1, 1, 1, 0, 1, 1, 0, 1),
                    BV8(1, 1, 0, 0, 0, 0, 1, 1),
                    BV8(0, 1, 0, 0, 0, 0, 1, 1),
                    BV8(1, 1, 0, 0, 1, 1, 1, 0),
                    BV8(0, 1, 1, 0, 0, 0, 1, 1),
                    BV8(1, 1, 1, 1, 0, 1, 1, 0))

/* X2: */
#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
.Ltf_x2_bitmatrix:
        .quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
                    BV8(0, 0, 1, 0, 0, 1, 1, 0),
                    BV8(0, 0, 0, 0, 1, 0, 1, 0),
                    BV8(1, 1, 1, 0, 0, 0, 1, 1),
                    BV8(1, 1, 1, 0, 1, 1, 0, 0),
                    BV8(0, 1, 1, 0, 1, 0, 1, 1),
                    BV8(1, 0, 1, 1, 1, 1, 0, 1),
                    BV8(1, 0, 0, 1, 0, 0, 1, 1))

/* Identity matrix: */
.Ltf_id_bitmatrix:
        .quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
                    BV8(0, 1, 0, 0, 0, 0, 0, 0),
                    BV8(0, 0, 1, 0, 0, 0, 0, 0),
                    BV8(0, 0, 0, 1, 0, 0, 0, 0),
                    BV8(0, 0, 0, 0, 1, 0, 0, 0),
                    BV8(0, 0, 0, 0, 0, 1, 0, 0),
                    BV8(0, 0, 0, 0, 0, 0, 1, 0),
                    BV8(0, 0, 0, 0, 0, 0, 0, 1))

/* 4-bit mask */
.section .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
.align 4
.L0f0f0f0f:
        .long 0x0f0f0f0f

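/*
 * Note on the GFNI constants: BV8() packs one matrix row from its bit
 * list and BM8X8() packs eight rows into the 64-bit matrix operand used
 * by vgf2p8affineqb/vgf2p8affineinvqb, with the first listed row placed
 * in the most significant byte.  The tf_*_const values are passed as the
 * instruction immediate, which supplies the additive constant of the
 * affine transform.
 */
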
.text

SYM_FUNC_START_LOCAL(__aria_aesni_avx2_crypt_32way)
        /* input:
         *      %r9: rk
         *      %rsi: dst
         *      %rdx: src
         *      %ymm0..%ymm15: byte-sliced blocks
         */

        FRAME_BEGIN

        movq %rsi, %rax;
        leaq 8 * 32(%rax), %r8;

        inpack16_post(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                      %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                      %ymm15, %rax, %r8);
        aria_fo(%ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14, %ymm15,
                %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                %rax, %r9, 0);
        aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
                %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                %ymm15, %rax, %r9, 1);
        aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
                %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                %rax, %r9, 2);
        aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
                %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                %ymm15, %rax, %r9, 3);
        aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
                %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                %rax, %r9, 4);
        aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
                %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                %ymm15, %rax, %r9, 5);
        aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
                %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                %rax, %r9, 6);
        aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
                %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                %ymm15, %rax, %r9, 7);
        aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
                %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                %rax, %r9, 8);
        aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
                %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                %ymm15, %rax, %r9, 9);
        aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
                %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                %rax, %r9, 10);
        cmpl $12, ARIA_CTX_rounds(CTX);
        jne .Laria_192;
        aria_ff(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
                %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                %ymm15, %rax, %r9, 11, 12);
        jmp .Laria_end;
.Laria_192:
        aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
                %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                %ymm15, %rax, %r9, 11);
        aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
                %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                %rax, %r9, 12);
        cmpl $14, ARIA_CTX_rounds(CTX);
        jne .Laria_256;
        aria_ff(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
                %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                %ymm15, %rax, %r9, 13, 14);
        jmp .Laria_end;
.Laria_256:
        aria_fe(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
                %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                %ymm15, %rax, %r9, 13);
        aria_fo(%ymm9, %ymm8, %ymm11, %ymm10, %ymm12, %ymm13, %ymm14, %ymm15,
                %ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                %rax, %r9, 14);
        aria_ff(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
                %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                %ymm15, %rax, %r9, 15, 16);
.Laria_end:
        debyteslice_16x16b(%ymm8, %ymm12, %ymm1, %ymm4,
                           %ymm9, %ymm13, %ymm0, %ymm5,
                           %ymm10, %ymm14, %ymm3, %ymm6,
                           %ymm11, %ymm15, %ymm2, %ymm7,
                           (%rax), (%r8));

        FRAME_END
        RET;
SYM_FUNC_END(__aria_aesni_avx2_crypt_32way)

SYM_TYPED_FUNC_START(aria_aesni_avx2_encrypt_32way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst
         *      %rdx: src
         */

        FRAME_BEGIN

        leaq ARIA_CTX_enc_key(CTX), %r9;

        inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                     %ymm15, %rdx);

        call __aria_aesni_avx2_crypt_32way;

        write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                     %ymm15, %rax);

        FRAME_END
        RET;
SYM_FUNC_END(aria_aesni_avx2_encrypt_32way)

SYM_TYPED_FUNC_START(aria_aesni_avx2_decrypt_32way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst
         *      %rdx: src
         */

        FRAME_BEGIN

        leaq ARIA_CTX_dec_key(CTX), %r9;

        inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                     %ymm15, %rdx);

        call __aria_aesni_avx2_crypt_32way;

        write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                     %ymm15, %rax);

        FRAME_END
        RET;
SYM_FUNC_END(aria_aesni_avx2_decrypt_32way)

SYM_FUNC_START_LOCAL(__aria_aesni_avx2_ctr_gen_keystream_32way)
        /* input:
         *      %rdi: ctx
         *      %rsi: dst
         *      %rdx: src
         *      %rcx: keystream
         *      %r8: iv (big endian, 128bit)
         */

        FRAME_BEGIN
        movq 8(%r8), %r11;
        bswapq %r11;

        vbroadcasti128 .Lbswap128_mask (%rip), %ymm6;
        vpcmpeqd %ymm0, %ymm0, %ymm0;
        vpsrldq $8, %ymm0, %ymm0; /* ab: -1:0 ; cd: -1:0 */
        vpaddq %ymm0, %ymm0, %ymm5; /* ab: -2:0 ; cd: -2:0 */

        /* load IV and byteswap */
        vmovdqu (%r8), %xmm7;
        vpshufb %xmm6, %xmm7, %xmm7;
        vmovdqa %xmm7, %xmm3;
        inc_le128(%xmm7, %xmm0, %xmm4);
        vinserti128 $1, %xmm7, %ymm3, %ymm3;
        vpshufb %ymm6, %ymm3, %ymm8; /* +1 ; +0 */

        /* check need for handling 64-bit overflow and carry */
        cmpq $(0xffffffffffffffff - 32), %r11;
        ja .Lhandle_ctr_carry;

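        /*
         * Fast path: producing 32 counter blocks cannot wrap the low
         * 64 bits here, so each 128-bit lane is advanced by subtracting
         * the -2 constant in %ymm5 (i.e. adding 2) with no cross-qword
         * carry handling.  The slow path at .Lhandle_ctr_carry uses
         * inc_le128 for full 128-bit increments instead.
         */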
        /* construct IVs */
        vpsubq %ymm5, %ymm3, %ymm3; /* +3 ; +2 */
        vpshufb %ymm6, %ymm3, %ymm9;
        vpsubq %ymm5, %ymm3, %ymm3; /* +5 ; +4 */
        vpshufb %ymm6, %ymm3, %ymm10;
        vpsubq %ymm5, %ymm3, %ymm3; /* +7 ; +6 */
        vpshufb %ymm6, %ymm3, %ymm11;
        vpsubq %ymm5, %ymm3, %ymm3; /* +9 ; +8 */
        vpshufb %ymm6, %ymm3, %ymm12;
        vpsubq %ymm5, %ymm3, %ymm3; /* +11 ; +10 */
        vpshufb %ymm6, %ymm3, %ymm13;
        vpsubq %ymm5, %ymm3, %ymm3; /* +13 ; +12 */
        vpshufb %ymm6, %ymm3, %ymm14;
        vpsubq %ymm5, %ymm3, %ymm3; /* +15 ; +14 */
        vpshufb %ymm6, %ymm3, %ymm15;
        vmovdqu %ymm8, (0 * 32)(%rcx);
        vmovdqu %ymm9, (1 * 32)(%rcx);
        vmovdqu %ymm10, (2 * 32)(%rcx);
        vmovdqu %ymm11, (3 * 32)(%rcx);
        vmovdqu %ymm12, (4 * 32)(%rcx);
        vmovdqu %ymm13, (5 * 32)(%rcx);
        vmovdqu %ymm14, (6 * 32)(%rcx);
        vmovdqu %ymm15, (7 * 32)(%rcx);

        vpsubq %ymm5, %ymm3, %ymm3; /* +17 ; +16 */
        vpshufb %ymm6, %ymm3, %ymm8;
        vpsubq %ymm5, %ymm3, %ymm3; /* +19 ; +18 */
        vpshufb %ymm6, %ymm3, %ymm9;
        vpsubq %ymm5, %ymm3, %ymm3; /* +21 ; +20 */
        vpshufb %ymm6, %ymm3, %ymm10;
        vpsubq %ymm5, %ymm3, %ymm3; /* +23 ; +22 */
        vpshufb %ymm6, %ymm3, %ymm11;
        vpsubq %ymm5, %ymm3, %ymm3; /* +25 ; +24 */
        vpshufb %ymm6, %ymm3, %ymm12;
        vpsubq %ymm5, %ymm3, %ymm3; /* +27 ; +26 */
        vpshufb %ymm6, %ymm3, %ymm13;
        vpsubq %ymm5, %ymm3, %ymm3; /* +29 ; +28 */
        vpshufb %ymm6, %ymm3, %ymm14;
        vpsubq %ymm5, %ymm3, %ymm3; /* +31 ; +30 */
        vpshufb %ymm6, %ymm3, %ymm15;
        vpsubq %ymm5, %ymm3, %ymm3; /* +32 */
        vpshufb %xmm6, %xmm3, %xmm3;
        vmovdqu %xmm3, (%r8);
        vmovdqu (0 * 32)(%rcx), %ymm0;
        vmovdqu (1 * 32)(%rcx), %ymm1;
        vmovdqu (2 * 32)(%rcx), %ymm2;
        vmovdqu (3 * 32)(%rcx), %ymm3;
        vmovdqu (4 * 32)(%rcx), %ymm4;
        vmovdqu (5 * 32)(%rcx), %ymm5;
        vmovdqu (6 * 32)(%rcx), %ymm6;
        vmovdqu (7 * 32)(%rcx), %ymm7;
        jmp .Lctr_carry_done;

.Lhandle_ctr_carry:
        /* construct IVs */
        inc_le128(%ymm3, %ymm0, %ymm4);
        inc_le128(%ymm3, %ymm0, %ymm4);
        vpshufb %ymm6, %ymm3, %ymm9; /* +3 ; +2 */
        inc_le128(%ymm3, %ymm0, %ymm4);
        inc_le128(%ymm3, %ymm0, %ymm4);
        vpshufb %ymm6, %ymm3, %ymm10; /* +5 ; +4 */
        inc_le128(%ymm3, %ymm0, %ymm4);
        inc_le128(%ymm3, %ymm0, %ymm4);
        vpshufb %ymm6, %ymm3, %ymm11; /* +7 ; +6 */
        inc_le128(%ymm3, %ymm0, %ymm4);
        inc_le128(%ymm3, %ymm0, %ymm4);
        vpshufb %ymm6, %ymm3, %ymm12; /* +9 ; +8 */
        inc_le128(%ymm3, %ymm0, %ymm4);
        inc_le128(%ymm3, %ymm0, %ymm4);
        vpshufb %ymm6, %ymm3, %ymm13; /* +11 ; +10 */
        inc_le128(%ymm3, %ymm0, %ymm4);
        inc_le128(%ymm3, %ymm0, %ymm4);
        vpshufb %ymm6, %ymm3, %ymm14; /* +13 ; +12 */
        inc_le128(%ymm3, %ymm0, %ymm4);
        inc_le128(%ymm3, %ymm0, %ymm4);
        vpshufb %ymm6, %ymm3, %ymm15; /* +15 ; +14 */
        vmovdqu %ymm8, (0 * 32)(%rcx);
        vmovdqu %ymm9, (1 * 32)(%rcx);
        vmovdqu %ymm10, (2 * 32)(%rcx);
        vmovdqu %ymm11, (3 * 32)(%rcx);
        vmovdqu %ymm12, (4 * 32)(%rcx);
        vmovdqu %ymm13, (5 * 32)(%rcx);
        vmovdqu %ymm14, (6 * 32)(%rcx);
        vmovdqu %ymm15, (7 * 32)(%rcx);

        inc_le128(%ymm3, %ymm0, %ymm4);
        inc_le128(%ymm3, %ymm0, %ymm4);
        vpshufb %ymm6, %ymm3, %ymm8; /* +17 ; +16 */
        inc_le128(%ymm3, %ymm0, %ymm4);
        inc_le128(%ymm3, %ymm0, %ymm4);
        vpshufb %ymm6, %ymm3, %ymm9; /* +19 ; +18 */
        inc_le128(%ymm3, %ymm0, %ymm4);
        inc_le128(%ymm3, %ymm0, %ymm4);
        vpshufb %ymm6, %ymm3, %ymm10; /* +21 ; +20 */
        inc_le128(%ymm3, %ymm0, %ymm4);
        inc_le128(%ymm3, %ymm0, %ymm4);
        vpshufb %ymm6, %ymm3, %ymm11; /* +23 ; +22 */
        inc_le128(%ymm3, %ymm0, %ymm4);
        inc_le128(%ymm3, %ymm0, %ymm4);
        vpshufb %ymm6, %ymm3, %ymm12; /* +25 ; +24 */
        inc_le128(%ymm3, %ymm0, %ymm4);
        inc_le128(%ymm3, %ymm0, %ymm4);
        vpshufb %ymm6, %ymm3, %ymm13; /* +27 ; +26 */
        inc_le128(%ymm3, %ymm0, %ymm4);
        inc_le128(%ymm3, %ymm0, %ymm4);
        vpshufb %ymm6, %ymm3, %ymm14; /* +29 ; +28 */
        inc_le128(%ymm3, %ymm0, %ymm4);
        inc_le128(%ymm3, %ymm0, %ymm4);
        vpshufb %ymm6, %ymm3, %ymm15; /* +31 ; +30 */
        inc_le128(%ymm3, %ymm0, %ymm4);
        vextracti128 $1, %ymm3, %xmm3;
        vpshufb %xmm6, %xmm3, %xmm3; /* +32 */
        vmovdqu %xmm3, (%r8);
        vmovdqu (0 * 32)(%rcx), %ymm0;
        vmovdqu (1 * 32)(%rcx), %ymm1;
        vmovdqu (2 * 32)(%rcx), %ymm2;
        vmovdqu (3 * 32)(%rcx), %ymm3;
        vmovdqu (4 * 32)(%rcx), %ymm4;
        vmovdqu (5 * 32)(%rcx), %ymm5;
        vmovdqu (6 * 32)(%rcx), %ymm6;
        vmovdqu (7 * 32)(%rcx), %ymm7;

.Lctr_carry_done:

        FRAME_END
        RET;
SYM_FUNC_END(__aria_aesni_avx2_ctr_gen_keystream_32way)

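/*
 * Note on the CTR entry points below: the generated counter blocks are
 * written to the keystream buffer, dst/src are stashed in %r10/%r11, and
 * the buffer is then run through the block-cipher core as both source
 * and destination.  The encrypted counters are finally XORed with the
 * original src and written to the original dst.
 */
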
SYM_TYPED_FUNC_START(aria_aesni_avx2_ctr_crypt_32way)
        /* input:
         *      %rdi: ctx
         *      %rsi: dst
         *      %rdx: src
         *      %rcx: keystream
         *      %r8: iv (big endian, 128bit)
         */
        FRAME_BEGIN

        call __aria_aesni_avx2_ctr_gen_keystream_32way;

        leaq (%rsi), %r10;
        leaq (%rdx), %r11;
        leaq (%rcx), %rsi;
        leaq (%rcx), %rdx;
        leaq ARIA_CTX_enc_key(CTX), %r9;

        call __aria_aesni_avx2_crypt_32way;

        vpxor (0 * 32)(%r11), %ymm1, %ymm1;
        vpxor (1 * 32)(%r11), %ymm0, %ymm0;
        vpxor (2 * 32)(%r11), %ymm3, %ymm3;
        vpxor (3 * 32)(%r11), %ymm2, %ymm2;
        vpxor (4 * 32)(%r11), %ymm4, %ymm4;
        vpxor (5 * 32)(%r11), %ymm5, %ymm5;
        vpxor (6 * 32)(%r11), %ymm6, %ymm6;
        vpxor (7 * 32)(%r11), %ymm7, %ymm7;
        vpxor (8 * 32)(%r11), %ymm8, %ymm8;
        vpxor (9 * 32)(%r11), %ymm9, %ymm9;
        vpxor (10 * 32)(%r11), %ymm10, %ymm10;
        vpxor (11 * 32)(%r11), %ymm11, %ymm11;
        vpxor (12 * 32)(%r11), %ymm12, %ymm12;
        vpxor (13 * 32)(%r11), %ymm13, %ymm13;
        vpxor (14 * 32)(%r11), %ymm14, %ymm14;
        vpxor (15 * 32)(%r11), %ymm15, %ymm15;
        write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                     %ymm15, %r10);

        FRAME_END
        RET;
SYM_FUNC_END(aria_aesni_avx2_ctr_crypt_32way)

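/*
 * Note on the rounds dispatch in the crypt cores: ARIA_CTX_rounds is 12,
 * 14 or 16 for 128-, 192- and 256-bit keys respectively.  The cores run
 * the shared prefix of rounds unconditionally and the two compares then
 * select the matching tail, with the final-round macro consuming both
 * the last round key and the output whitening key.
 */
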
SYM_FUNC_START_LOCAL(__aria_aesni_avx2_gfni_crypt_32way)
        /* input:
         *      %r9: rk
         *      %rsi: dst
         *      %rdx: src
         *      %ymm0..%ymm15: 16 byte-sliced blocks
         */

        FRAME_BEGIN

        movq %rsi, %rax;
        leaq 8 * 32(%rax), %r8;

        inpack16_post(%ymm0, %ymm1, %ymm2, %ymm3,
                      %ymm4, %ymm5, %ymm6, %ymm7,
                      %ymm8, %ymm9, %ymm10, %ymm11,
                      %ymm12, %ymm13, %ymm14,
                      %ymm15, %rax, %r8);
        aria_fo_gfni(%ymm8, %ymm9, %ymm10, %ymm11,
                     %ymm12, %ymm13, %ymm14, %ymm15,
                     %ymm0, %ymm1, %ymm2, %ymm3,
                     %ymm4, %ymm5, %ymm6, %ymm7,
                     %rax, %r9, 0);
        aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
                     %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11,
                     %ymm12, %ymm13, %ymm14,
                     %ymm15, %rax, %r9, 1);
        aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
                     %ymm12, %ymm13, %ymm14, %ymm15,
                     %ymm0, %ymm1, %ymm2, %ymm3,
                     %ymm4, %ymm5, %ymm6, %ymm7,
                     %rax, %r9, 2);
        aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
                     %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11,
                     %ymm12, %ymm13, %ymm14,
                     %ymm15, %rax, %r9, 3);
        aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
                     %ymm12, %ymm13, %ymm14, %ymm15,
                     %ymm0, %ymm1, %ymm2, %ymm3,
                     %ymm4, %ymm5, %ymm6, %ymm7,
                     %rax, %r9, 4);
        aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
                     %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11,
                     %ymm12, %ymm13, %ymm14,
                     %ymm15, %rax, %r9, 5);
        aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
                     %ymm12, %ymm13, %ymm14, %ymm15,
                     %ymm0, %ymm1, %ymm2, %ymm3,
                     %ymm4, %ymm5, %ymm6, %ymm7,
                     %rax, %r9, 6);
        aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
                     %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11,
                     %ymm12, %ymm13, %ymm14,
                     %ymm15, %rax, %r9, 7);
        aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
                     %ymm12, %ymm13, %ymm14, %ymm15,
                     %ymm0, %ymm1, %ymm2, %ymm3,
                     %ymm4, %ymm5, %ymm6, %ymm7,
                     %rax, %r9, 8);
        aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
                     %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11,
                     %ymm12, %ymm13, %ymm14,
                     %ymm15, %rax, %r9, 9);
        aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
                     %ymm12, %ymm13, %ymm14, %ymm15,
                     %ymm0, %ymm1, %ymm2, %ymm3,
                     %ymm4, %ymm5, %ymm6, %ymm7,
                     %rax, %r9, 10);
        cmpl $12, ARIA_CTX_rounds(CTX);
        jne .Laria_gfni_192;
        aria_ff_gfni(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                     %ymm15, %rax, %r9, 11, 12);
        jmp .Laria_gfni_end;
.Laria_gfni_192:
        aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
                     %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11,
                     %ymm12, %ymm13, %ymm14,
                     %ymm15, %rax, %r9, 11);
        aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
                     %ymm12, %ymm13, %ymm14, %ymm15,
                     %ymm0, %ymm1, %ymm2, %ymm3,
                     %ymm4, %ymm5, %ymm6, %ymm7,
                     %rax, %r9, 12);
        cmpl $14, ARIA_CTX_rounds(CTX);
        jne .Laria_gfni_256;
        aria_ff_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
                     %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11,
                     %ymm12, %ymm13, %ymm14,
                     %ymm15, %rax, %r9, 13, 14);
        jmp .Laria_gfni_end;
.Laria_gfni_256:
        aria_fe_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
                     %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11,
                     %ymm12, %ymm13, %ymm14,
                     %ymm15, %rax, %r9, 13);
        aria_fo_gfni(%ymm9, %ymm8, %ymm11, %ymm10,
                     %ymm12, %ymm13, %ymm14, %ymm15,
                     %ymm0, %ymm1, %ymm2, %ymm3,
                     %ymm4, %ymm5, %ymm6, %ymm7,
                     %rax, %r9, 14);
        aria_ff_gfni(%ymm1, %ymm0, %ymm3, %ymm2,
                     %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11,
                     %ymm12, %ymm13, %ymm14,
                     %ymm15, %rax, %r9, 15, 16);
.Laria_gfni_end:
        debyteslice_16x16b(%ymm8, %ymm12, %ymm1, %ymm4,
                           %ymm9, %ymm13, %ymm0, %ymm5,
                           %ymm10, %ymm14, %ymm3, %ymm6,
                           %ymm11, %ymm15, %ymm2, %ymm7,
                           (%rax), (%r8));

        FRAME_END
        RET;
SYM_FUNC_END(__aria_aesni_avx2_gfni_crypt_32way)

SYM_TYPED_FUNC_START(aria_aesni_avx2_gfni_encrypt_32way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst
         *      %rdx: src
         */

        FRAME_BEGIN

        leaq ARIA_CTX_enc_key(CTX), %r9;

        inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                     %ymm15, %rdx);

        call __aria_aesni_avx2_gfni_crypt_32way;

        write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                     %ymm15, %rax);

        FRAME_END
        RET;
SYM_FUNC_END(aria_aesni_avx2_gfni_encrypt_32way)

SYM_TYPED_FUNC_START(aria_aesni_avx2_gfni_decrypt_32way)
        /* input:
         *      %rdi: ctx, CTX
         *      %rsi: dst
         *      %rdx: src
         */

        FRAME_BEGIN

        leaq ARIA_CTX_dec_key(CTX), %r9;

        inpack16_pre(%ymm0, %ymm1, %ymm2, %ymm3, %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                     %ymm15, %rdx);

        call __aria_aesni_avx2_gfni_crypt_32way;

        write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                     %ymm15, %rax);

        FRAME_END
        RET;
SYM_FUNC_END(aria_aesni_avx2_gfni_decrypt_32way)

SYM_TYPED_FUNC_START(aria_aesni_avx2_gfni_ctr_crypt_32way)
        /* input:
         *      %rdi: ctx
         *      %rsi: dst
         *      %rdx: src
         *      %rcx: keystream
         *      %r8: iv (big endian, 128bit)
         */
        FRAME_BEGIN

        call __aria_aesni_avx2_ctr_gen_keystream_32way

        leaq (%rsi), %r10;
        leaq (%rdx), %r11;
        leaq (%rcx), %rsi;
        leaq (%rcx), %rdx;
        leaq ARIA_CTX_enc_key(CTX), %r9;

        call __aria_aesni_avx2_gfni_crypt_32way;

        vpxor (0 * 32)(%r11), %ymm1, %ymm1;
        vpxor (1 * 32)(%r11), %ymm0, %ymm0;
        vpxor (2 * 32)(%r11), %ymm3, %ymm3;
        vpxor (3 * 32)(%r11), %ymm2, %ymm2;
        vpxor (4 * 32)(%r11), %ymm4, %ymm4;
        vpxor (5 * 32)(%r11), %ymm5, %ymm5;
        vpxor (6 * 32)(%r11), %ymm6, %ymm6;
        vpxor (7 * 32)(%r11), %ymm7, %ymm7;
        vpxor (8 * 32)(%r11), %ymm8, %ymm8;
        vpxor (9 * 32)(%r11), %ymm9, %ymm9;
        vpxor (10 * 32)(%r11), %ymm10, %ymm10;
        vpxor (11 * 32)(%r11), %ymm11, %ymm11;
        vpxor (12 * 32)(%r11), %ymm12, %ymm12;
        vpxor (13 * 32)(%r11), %ymm13, %ymm13;
        vpxor (14 * 32)(%r11), %ymm14, %ymm14;
        vpxor (15 * 32)(%r11), %ymm15, %ymm15;
        write_output(%ymm1, %ymm0, %ymm3, %ymm2, %ymm4, %ymm5, %ymm6, %ymm7,
                     %ymm8, %ymm9, %ymm10, %ymm11, %ymm12, %ymm13, %ymm14,
                     %ymm15, %r10);

        FRAME_END
        RET;
SYM_FUNC_END(aria_aesni_avx2_gfni_ctr_crypt_32way)