/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * ARIA Cipher 16-way parallel algorithm (AVX)
 *
 * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com>
 *
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>
#include <asm/frame.h>

/* struct aria_ctx: */
#define enc_key 0
#define dec_key 272
#define rounds 544

/* register macros */
#define CTX %rdi


#define BV8(a0, a1, a2, a3, a4, a5, a6, a7) \
	( (((a0) & 1) << 0) | \
	  (((a1) & 1) << 1) | \
	  (((a2) & 1) << 2) | \
	  (((a3) & 1) << 3) | \
	  (((a4) & 1) << 4) | \
	  (((a5) & 1) << 5) | \
	  (((a6) & 1) << 6) | \
	  (((a7) & 1) << 7) )

#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7) \
	( ((l7) << (0 * 8)) | \
	  ((l6) << (1 * 8)) | \
	  ((l5) << (2 * 8)) | \
	  ((l4) << (3 * 8)) | \
	  ((l3) << (4 * 8)) | \
	  ((l2) << (5 * 8)) | \
	  ((l1) << (6 * 8)) | \
	  ((l0) << (7 * 8)) )

#define inc_le128(x, minus_one, tmp) \
	vpcmpeqq minus_one, x, tmp; \
	vpsubq minus_one, x, x; \
	vpslldq $8, tmp, tmp; \
	vpsubq tmp, x, x;

#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \
	vpand x, mask4bit, tmp0; \
	vpandn x, mask4bit, x; \
	vpsrld $4, x, x; \
	\
	vpshufb tmp0, lo_t, tmp0; \
	vpshufb x, hi_t, x; \
	vpxor tmp0, x, x;

#define transpose_4x4(x0, x1, x2, x3, t1, t2) \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x1, x0, x0; \
	\
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x2; \
	\
	vpunpckhqdq t1, x0, x1; \
	vpunpcklqdq t1, x0, x0; \
	\
	vpunpckhqdq x2, t2, x3; \
	vpunpcklqdq x2, t2, x2;

#define byteslice_16x16b(a0, b0, c0, d0, \
			 a1, b1, c1, d1, \
			 a2, b2, c2, d2, \
			 a3, b3, c3, d3, \
			 st0, st1) \
	vmovdqu d2, st0; \
	vmovdqu d3, st1; \
	transpose_4x4(a0, a1, a2, a3, d2, d3); \
	transpose_4x4(b0, b1, b2, b3, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu a0, st0; \
	vmovdqu a1, st1; \
	transpose_4x4(c0, c1, c2, c3, a0, a1); \
	transpose_4x4(d0, d1, d2, d3, a0, a1); \
	\
	vmovdqu .Lshufb_16x16b, a0; \
	vmovdqu st1, a1; \
	vpshufb a0, a2, a2; \
	vpshufb a0, a3, a3; \
	vpshufb a0, b0, b0; \
	vpshufb a0, b1, b1; \
	vpshufb a0, b2, b2; \
	vpshufb a0, b3, b3; \
	vpshufb a0, a1, a1; \
	vpshufb a0, c0, c0; \
	vpshufb a0, c1, c1; \
	vpshufb a0, c2, c2; \
	vpshufb a0, c3, c3; \
	vpshufb a0, d0, d0; \
	vpshufb a0, d1, d1; \
	vpshufb a0, d2, d2; \
	vpshufb a0, d3, d3; \
	vmovdqu d3, st1; \
	vmovdqu st0, d3; \
	vpshufb a0, d3, a0; \
	vmovdqu d2, st0; \
	\
	transpose_4x4(a0, b0, c0, d0, d2, d3); \
	transpose_4x4(a1, b1, c1, d1, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu b0, st0; \
	vmovdqu b1, st1; \
	transpose_4x4(a2, b2, c2, d2, b0, b1); \
	transpose_4x4(a3, b3, c3, d3, b0, b1); \
	vmovdqu st0, b0; \
	vmovdqu st1, b1; \
	/* does not adjust output bytes inside vectors */

#define debyteslice_16x16b(a0, b0, c0, d0, \
			   a1, b1, c1, d1, \
			   a2, b2, c2, d2, \
			   a3, b3, c3, d3, \
			   st0, st1) \
	vmovdqu d2, st0; \
	vmovdqu d3, st1; \
	transpose_4x4(a0, a1, a2, a3, d2, d3); \
	transpose_4x4(b0, b1, b2, b3, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu a0, st0; \
	vmovdqu a1, st1; \
	transpose_4x4(c0, c1, c2, c3, a0, a1); \
	transpose_4x4(d0, d1, d2, d3, a0, a1); \
	\
	vmovdqu .Lshufb_16x16b, a0; \
	vmovdqu st1, a1; \
	vpshufb a0, a2, a2; \
	vpshufb a0, a3, a3; \
	vpshufb a0, b0, b0; \
	vpshufb a0, b1, b1; \
	vpshufb a0, b2, b2; \
	vpshufb a0, b3, b3; \
	vpshufb a0, a1, a1; \
	vpshufb a0, c0, c0; \
	vpshufb a0, c1, c1; \
	vpshufb a0, c2, c2; \
	vpshufb a0, c3, c3; \
	vpshufb a0, d0, d0; \
	vpshufb a0, d1, d1; \
	vpshufb a0, d2, d2; \
	vpshufb a0, d3, d3; \
	vmovdqu d3, st1; \
	vmovdqu st0, d3; \
	vpshufb a0, d3, a0; \
	vmovdqu d2, st0; \
	\
	transpose_4x4(c0, d0, a0, b0, d2, d3); \
	transpose_4x4(c1, d1, a1, b1, d2, d3); \
	vmovdqu st0, d2; \
	vmovdqu st1, d3; \
	\
	vmovdqu b0, st0; \
	vmovdqu b1, st1; \
	transpose_4x4(c2, d2, a2, b2, b0, b1); \
	transpose_4x4(c3, d3, a3, b3, b0, b1); \
	vmovdqu st0, b0; \
	vmovdqu st1, b1; \
	/* does not adjust output bytes inside vectors */

/* load blocks to registers and apply pre-whitening */
#define inpack16_pre(x0, x1, x2, x3, \
		     x4, x5, x6, x7, \
		     y0, y1, y2, y3, \
		     y4, y5, y6, y7, \
		     rio) \
	vmovdqu (0 * 16)(rio), x0; \
	vmovdqu (1 * 16)(rio), x1; \
	vmovdqu (2 * 16)(rio), x2; \
	vmovdqu (3 * 16)(rio), x3; \
	vmovdqu (4 * 16)(rio), x4; \
	vmovdqu (5 * 16)(rio), x5; \
	vmovdqu (6 * 16)(rio), x6; \
	vmovdqu (7 * 16)(rio), x7; \
	vmovdqu (8 * 16)(rio), y0; \
	vmovdqu (9 * 16)(rio), y1; \
	vmovdqu (10 * 16)(rio), y2; \
	vmovdqu (11 * 16)(rio), y3; \
	vmovdqu (12 * 16)(rio), y4; \
	vmovdqu (13 * 16)(rio), y5; \
	vmovdqu (14 * 16)(rio), y6; \
	vmovdqu (15 * 16)(rio), y7;

/* byteslice pre-whitened blocks and store to temporary memory */
#define inpack16_post(x0, x1, x2, x3, \
		      x4, x5, x6, x7, \
		      y0, y1, y2, y3, \
		      y4, y5, y6, y7, \
		      mem_ab, mem_cd) \
	byteslice_16x16b(x0, x1, x2, x3, \
			 x4, x5, x6, x7, \
			 y0, y1, y2, y3, \
			 y4, y5, y6, y7, \
			 (mem_ab), (mem_cd)); \
	\
	vmovdqu x0, 0 * 16(mem_ab); \
	vmovdqu x1, 1 * 16(mem_ab); \
	vmovdqu x2, 2 * 16(mem_ab); \
	vmovdqu x3, 3 * 16(mem_ab); \
	vmovdqu x4, 4 * 16(mem_ab); \
	vmovdqu x5, 5 * 16(mem_ab); \
	vmovdqu x6, 6 * 16(mem_ab); \
	vmovdqu x7, 7 * 16(mem_ab); \
	vmovdqu y0, 0 * 16(mem_cd); \
	vmovdqu y1, 1 * 16(mem_cd); \
	vmovdqu y2, 2 * 16(mem_cd); \
	vmovdqu y3, 3 * 16(mem_cd); \
	vmovdqu y4, 4 * 16(mem_cd); \
	vmovdqu y5, 5 * 16(mem_cd); \
	vmovdqu y6, 6 * 16(mem_cd); \
	vmovdqu y7, 7 * 16(mem_cd);

#define write_output(x0, x1, x2, x3, \
		     x4, x5, x6, x7, \
		     y0, y1, y2, y3, \
		     y4, y5, y6, y7, \
		     mem) \
	vmovdqu x0, 0 * 16(mem); \
	vmovdqu x1, 1 * 16(mem); \
	vmovdqu x2, 2 * 16(mem); \
	vmovdqu x3, 3 * 16(mem); \
	vmovdqu x4, 4 * 16(mem); \
	vmovdqu x5, 5 * 16(mem); \
	vmovdqu x6, 6 * 16(mem); \
	vmovdqu x7, 7 * 16(mem); \
	vmovdqu y0, 8 * 16(mem); \
	vmovdqu y1, 9 * 16(mem); \
	vmovdqu y2, 10 * 16(mem); \
	vmovdqu y3, 11 * 16(mem); \
	vmovdqu y4, 12 * 16(mem); \
	vmovdqu y5, 13 * 16(mem); \
	vmovdqu y6, 14 * 16(mem); \
	vmovdqu y7, 15 * 16(mem); \

#define aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, idx) \
	vmovdqu x0, ((idx + 0) * 16)(mem_tmp); \
	vmovdqu x1, ((idx + 1) * 16)(mem_tmp); \
	vmovdqu x2, ((idx + 2) * 16)(mem_tmp); \
	vmovdqu x3, ((idx + 3) * 16)(mem_tmp); \
	vmovdqu x4, ((idx + 4) * 16)(mem_tmp); \
	vmovdqu x5, ((idx + 5) * 16)(mem_tmp); \
	vmovdqu x6, ((idx + 6) * 16)(mem_tmp); \
	vmovdqu x7, ((idx + 7) * 16)(mem_tmp);

#define aria_load_state_8way(x0, x1, x2, x3, \
			     x4, x5, x6, x7, \
			     mem_tmp, idx) \
	vmovdqu ((idx + 0) * 16)(mem_tmp), x0; \
	vmovdqu ((idx + 1) * 16)(mem_tmp), x1; \
	vmovdqu ((idx + 2) * 16)(mem_tmp), x2; \
	vmovdqu ((idx + 3) * 16)(mem_tmp), x3; \
	vmovdqu ((idx + 4) * 16)(mem_tmp), x4; \
	vmovdqu ((idx + 5) * 16)(mem_tmp), x5; \
	vmovdqu ((idx + 6) * 16)(mem_tmp), x6; \
	vmovdqu ((idx + 7) * 16)(mem_tmp), x7;

#define aria_ark_8way(x0, x1, x2, x3, \
		      x4, x5, x6, x7, \
		      t0, rk, idx, round) \
	/* AddRoundKey */ \
	vpbroadcastb ((round * 16) + idx + 3)(rk), t0; \
	vpxor t0, x0, x0; \
	vpbroadcastb ((round * 16) + idx + 2)(rk), t0; \
	vpxor t0, x1, x1; \
	vpbroadcastb ((round * 16) + idx + 1)(rk), t0; \
	vpxor t0, x2, x2; \
	vpbroadcastb ((round * 16) + idx + 0)(rk), t0; \
	vpxor t0, x3, x3; \
	vpbroadcastb ((round * 16) + idx + 7)(rk), t0; \
	vpxor t0, x4, x4; \
	vpbroadcastb ((round * 16) + idx + 6)(rk), t0; \
	vpxor t0, x5, x5; \
	vpbroadcastb ((round * 16) + idx + 5)(rk), t0; \
	vpxor t0, x6, x6; \
	vpbroadcastb ((round * 16) + idx + 4)(rk), t0; \
	vpxor t0, x7, x7;

#define aria_sbox_8way_gfni(x0, x1, x2, x3, \
			    x4, x5, x6, x7, \
			    t0, t1, t2, t3, \
			    t4, t5, t6, t7) \
	vpbroadcastq .Ltf_s2_bitmatrix, t0; \
	vpbroadcastq .Ltf_inv_bitmatrix, t1; \
	vpbroadcastq .Ltf_id_bitmatrix, t2; \
	vpbroadcastq .Ltf_aff_bitmatrix, t3; \
	vpbroadcastq .Ltf_x2_bitmatrix, t4; \
	vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \
	vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \
	vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \
	vgf2p8affineqb $(tf_inv_const), t1, x6, x6; \
	vgf2p8affineinvqb $0, t2, x2, x2; \
	vgf2p8affineinvqb $0, t2, x6, x6; \
	vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0; \
	vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4; \
	vgf2p8affineqb $(tf_x2_const), t4, x3, x3; \
	vgf2p8affineqb $(tf_x2_const), t4, x7, x7; \
	vgf2p8affineinvqb $0, t2, x3, x3; \
	vgf2p8affineinvqb $0, t2, x7, x7

#define aria_sbox_8way(x0, x1, x2, x3, \
		       x4, x5, x6, x7, \
		       t0, t1, t2, t3, \
		       t4, t5, t6, t7) \
	vpxor t7, t7, t7; \
	vmovdqa .Linv_shift_row, t0; \
	vmovdqa .Lshift_row, t1; \
	vpbroadcastd .L0f0f0f0f, t6; \
	vmovdqa .Ltf_lo__inv_aff__and__s2, t2; \
	vmovdqa .Ltf_hi__inv_aff__and__s2, t3; \
	vmovdqa .Ltf_lo__x2__and__fwd_aff, t4; \
	vmovdqa .Ltf_hi__x2__and__fwd_aff, t5; \
	\
	vaesenclast t7, x0, x0; \
	vaesenclast t7, x4, x4; \
	vaesenclast t7, x1, x1; \
	vaesenclast t7, x5, x5; \
	vaesdeclast t7, x2, x2; \
	vaesdeclast t7, x6, x6; \
	\
	/* AES inverse shift rows */ \
	vpshufb t0, x0, x0; \
	vpshufb t0, x4, x4; \
	vpshufb t0, x1, x1; \
	vpshufb t0, x5, x5; \
	vpshufb t1, x3, x3; \
	vpshufb t1, x7, x7; \
	vpshufb t1, x2, x2; \
	vpshufb t1, x6, x6; \
	\
	/* affine transformation for S2 */ \
	filter_8bit(x1, t2, t3, t6, t0); \
	/* affine transformation for S2 */ \
	filter_8bit(x5, t2, t3, t6, t0); \
	\
	/* affine transformation for X2 */ \
	filter_8bit(x3, t4, t5, t6, t0); \
	/* affine transformation for X2 */ \
	filter_8bit(x7, t4, t5, t6, t0); \
	vaesdeclast t7, x3, x3; \
	vaesdeclast t7, x7, x7;

#define aria_diff_m(x0, x1, x2, x3, \
		    t0, t1, t2, t3) \
	/* T = rotr32(X, 8); */ \
	/* X ^= T */ \
	vpxor x0, x3, t0; \
	vpxor x1, x0, t1; \
	vpxor x2, x1, t2; \
	vpxor x3, x2, t3; \
	/* X = T ^ rotr(X, 16); */ \
	vpxor t2, x0, x0; \
	vpxor x1, t3, t3; \
	vpxor t0, x2, x2; \
	vpxor t1, x3, x1; \
	vmovdqu t3, x3;

#define aria_diff_word(x0, x1, x2, x3, \
		       x4, x5, x6, x7, \
		       y0, y1, y2, y3, \
		       y4, y5, y6, y7) \
	/* t1 ^= t2; */ \
	vpxor y0, x4, x4; \
	vpxor y1, x5, x5; \
	vpxor y2, x6, x6; \
	vpxor y3, x7, x7; \
	\
	/* t2 ^= t3; */ \
	vpxor y4, y0, y0; \
	vpxor y5, y1, y1; \
	vpxor y6, y2, y2; \
	vpxor y7, y3, y3; \
	\
	/* t0 ^= t1; */ \
	vpxor x4, x0, x0; \
	vpxor x5, x1, x1; \
	vpxor x6, x2, x2; \
	vpxor x7, x3, x3; \
	\
	/* t3 ^= t1; */ \
	vpxor x4, y4, y4; \
	vpxor x5, y5, y5; \
	vpxor x6, y6, y6; \
	vpxor x7, y7, y7; \
	\
	/* t2 ^= t0; */ \
	vpxor x0, y0, y0; \
	vpxor x1, y1, y1; \
	vpxor x2, y2, y2; \
	vpxor x3, y3, y3; \
	\
	/* t1 ^= t2; */ \
	vpxor y0, x4, x4; \
	vpxor y1, x5, x5; \
	vpxor y2, x6, x6; \
	vpxor y3, x7, x7;

#define aria_fe(x0, x1, x2, x3, \
		x4, x5, x6, x7, \
		y0, y1, y2, y3, \
		y4, y5, y6, y7, \
		mem_tmp, rk, round) \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 8, round); \
	\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
		       y0, y1, y2, y3, y4, y5, y6, y7); \
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 8); \
	\
	aria_load_state_8way(x0, x1, x2, x3, \
			     x4, x5, x6, x7, \
			     mem_tmp, 0); \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 0, round); \
	\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
		       y0, y1, y2, y3, y4, y5, y6, y7); \
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 0); \
	aria_load_state_8way(y0, y1, y2, y3, \
			     y4, y5, y6, y7, \
			     mem_tmp, 8); \
	aria_diff_word(x0, x1, x2, x3, \
		       x4, x5, x6, x7, \
		       y0, y1, y2, y3, \
		       y4, y5, y6, y7); \
	/* aria_diff_byte() \
	 * T3 = ABCD -> BADC \
	 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \
	 * T0 = ABCD -> CDAB \
	 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \
	 * T1 = ABCD -> DCBA \
	 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \
	 */ \
	aria_diff_word(x2, x3, x0, x1, \
		       x7, x6, x5, x4, \
		       y0, y1, y2, y3, \
		       y5, y4, y7, y6); \
	aria_store_state_8way(x3, x2, x1, x0, \
			      x6, x7, x4, x5, \
			      mem_tmp, 0);

#define aria_fo(x0, x1, x2, x3, \
		x4, x5, x6, x7, \
		y0, y1, y2, y3, \
		y4, y5, y6, y7, \
		mem_tmp, rk, round) \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 8, round); \
	\
	aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		       y0, y1, y2, y3, y4, y5, y6, y7); \
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 8); \
	\
	aria_load_state_8way(x0, x1, x2, x3, \
			     x4, x5, x6, x7, \
			     mem_tmp, 0); \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 0, round); \
	\
	aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		       y0, y1, y2, y3, y4, y5, y6, y7); \
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 0); \
	aria_load_state_8way(y0, y1, y2, y3, \
			     y4, y5, y6, y7, \
			     mem_tmp, 8); \
	aria_diff_word(x0, x1, x2, x3, \
		       x4, x5, x6, x7, \
		       y0, y1, y2, y3, \
		       y4, y5, y6, y7); \
	/* aria_diff_byte() \
	 * T1 = ABCD -> BADC \
	 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \
	 * T2 = ABCD -> CDAB \
	 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \
	 * T3 = ABCD -> DCBA \
	 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \
	 */ \
	aria_diff_word(x0, x1, x2, x3, \
		       x5, x4, x7, x6, \
		       y2, y3, y0, y1, \
		       y7, y6, y5, y4); \
	aria_store_state_8way(x3, x2, x1, x0, \
			      x6, x7, x4, x5, \
			      mem_tmp, 0);

#define aria_ff(x0, x1, x2, x3, \
		x4, x5, x6, x7, \
		y0, y1, y2, y3, \
		y4, y5, y6, y7, \
		mem_tmp, rk, round, last_round) \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 8, round); \
	\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
		       y0, y1, y2, y3, y4, y5, y6, y7); \
	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 8, last_round); \
	\
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 8); \
	\
	aria_load_state_8way(x0, x1, x2, x3, \
			     x4, x5, x6, x7, \
			     mem_tmp, 0); \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 0, round); \
	\
	aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \
		       y0, y1, y2, y3, y4, y5, y6, y7); \
	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 0, last_round); \
	\
	aria_load_state_8way(y0, y1, y2, y3, \
			     y4, y5, y6, y7, \
			     mem_tmp, 8);

#define aria_fe_gfni(x0, x1, x2, x3, \
		     x4, x5, x6, x7, \
		     y0, y1, y2, y3, \
		     y4, y5, y6, y7, \
		     mem_tmp, rk, round) \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 8, round); \
	\
	aria_sbox_8way_gfni(x2, x3, x0, x1, \
			    x6, x7, x4, x5, \
			    y0, y1, y2, y3, \
			    y4, y5, y6, y7); \
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 8); \
	\
	aria_load_state_8way(x0, x1, x2, x3, \
			     x4, x5, x6, x7, \
			     mem_tmp, 0); \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 0, round); \
	\
	aria_sbox_8way_gfni(x2, x3, x0, x1, \
			    x6, x7, x4, x5, \
			    y0, y1, y2, y3, \
			    y4, y5, y6, y7); \
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 0); \
	aria_load_state_8way(y0, y1, y2, y3, \
			     y4, y5, y6, y7, \
			     mem_tmp, 8); \
	aria_diff_word(x0, x1, x2, x3, \
		       x4, x5, x6, x7, \
		       y0, y1, y2, y3, \
		       y4, y5, y6, y7); \
	/* aria_diff_byte() \
	 * T3 = ABCD -> BADC \
	 * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \
	 * T0 = ABCD -> CDAB \
	 * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \
	 * T1 = ABCD -> DCBA \
	 * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \
	 */ \
	aria_diff_word(x2, x3, x0, x1, \
		       x7, x6, x5, x4, \
		       y0, y1, y2, y3, \
		       y5, y4, y7, y6); \
	aria_store_state_8way(x3, x2, x1, x0, \
			      x6, x7, x4, x5, \
			      mem_tmp, 0);

#define aria_fo_gfni(x0, x1, x2, x3, \
		     x4, x5, x6, x7, \
		     y0, y1, y2, y3, \
		     y4, y5, y6, y7, \
		     mem_tmp, rk, round) \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 8, round); \
	\
	aria_sbox_8way_gfni(x0, x1, x2, x3, \
			    x4, x5, x6, x7, \
			    y0, y1, y2, y3, \
			    y4, y5, y6, y7); \
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 8); \
	\
	aria_load_state_8way(x0, x1, x2, x3, \
			     x4, x5, x6, x7, \
			     mem_tmp, 0); \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 0, round); \
	\
	aria_sbox_8way_gfni(x0, x1, x2, x3, \
			    x4, x5, x6, x7, \
			    y0, y1, y2, y3, \
			    y4, y5, y6, y7); \
	\
	aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \
	aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 0); \
	aria_load_state_8way(y0, y1, y2, y3, \
			     y4, y5, y6, y7, \
			     mem_tmp, 8); \
	aria_diff_word(x0, x1, x2, x3, \
		       x4, x5, x6, x7, \
		       y0, y1, y2, y3, \
		       y4, y5, y6, y7); \
	/* aria_diff_byte() \
	 * T1 = ABCD -> BADC \
	 * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \
	 * T2 = ABCD -> CDAB \
	 * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \
	 * T3 = ABCD -> DCBA \
	 * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \
	 */ \
	aria_diff_word(x0, x1, x2, x3, \
		       x5, x4, x7, x6, \
		       y2, y3, y0, y1, \
		       y7, y6, y5, y4); \
	aria_store_state_8way(x3, x2, x1, x0, \
			      x6, x7, x4, x5, \
			      mem_tmp, 0);

#define aria_ff_gfni(x0, x1, x2, x3, \
		     x4, x5, x6, x7, \
		     y0, y1, y2, y3, \
		     y4, y5, y6, y7, \
		     mem_tmp, rk, round, last_round) \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 8, round); \
	\
	aria_sbox_8way_gfni(x2, x3, x0, x1, \
			    x6, x7, x4, x5, \
			    y0, y1, y2, y3, \
			    y4, y5, y6, y7); \
	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 8, last_round); \
	\
	aria_store_state_8way(x0, x1, x2, x3, \
			      x4, x5, x6, x7, \
			      mem_tmp, 8); \
	\
	aria_load_state_8way(x0, x1, x2, x3, \
			     x4, x5, x6, x7, \
			     mem_tmp, 0); \
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 0, round); \
	\
	aria_sbox_8way_gfni(x2, x3, x0, x1, \
			    x6, x7, x4, x5, \
			    y0, y1, y2, y3, \
			    y4, y5, y6, y7); \
	\
	aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \
		      y0, rk, 0, last_round); \
	\
	aria_load_state_8way(y0, y1, y2, y3, \
			     y4, y5, y6, y7, \
			     mem_tmp, 8);

/* NB: section is mergeable, all elements must be aligned 16-byte blocks */
.section .rodata.cst16, "aM", @progbits, 16
.align 16

#define SHUFB_BYTES(idx) \
	0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx)

.Lshufb_16x16b:
	.byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3);
/* For isolating SubBytes from AESENCLAST, inverse shift row */
.Linv_shift_row:
	.byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b
	.byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03
.Lshift_row:
	.byte 0x00, 0x05, 0x0a, 0x0f, 0x04, 0x09, 0x0e, 0x03
	.byte 0x08, 0x0d, 0x02, 0x07, 0x0c, 0x01, 0x06, 0x0b
/* For CTR-mode IV byteswap */
.Lbswap128_mask:
	.byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08
	.byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00

/* AES inverse affine and S2 combined:
 *      1 1 0 0 0 0 0 1     x0     0
 *      0 1 0 0 1 0 0 0     x1     0
 *      1 1 0 0 1 1 1 1     x2     0
 *      0 1 1 0 1 0 0 1     x3     1
 *      0 1 0 0 1 1 0 0  *  x4  +  0
 *      0 1 0 1 1 0 0 0     x5     0
 *      0 0 0 0 0 1 0 1     x6     0
 *      1 1 1 0 0 1 1 1     x7     1
 */
.Ltf_lo__inv_aff__and__s2:
	.octa 0x92172DA81A9FA520B2370D883ABF8500
.Ltf_hi__inv_aff__and__s2:
	.octa 0x2B15FFC1AF917B45E6D8320C625CB688

/* X2 and AES forward affine combined:
 *      1 0 1 1 0 0 0 1     x0     0
 *      0 1 1 1 1 0 1 1     x1     0
 *      0 0 0 1 1 0 1 0     x2     1
 *      0 1 0 0 0 1 0 0     x3     0
 *      0 0 1 1 1 0 1 1  *  x4  +  0
 *      0 1 0 0 1 0 0 0     x5     0
 *      1 1 0 1 0 0 1 1     x6     0
 *      0 1 0 0 1 0 1 0     x7     0
 */
.Ltf_lo__x2__and__fwd_aff:
	.octa 0xEFAE0544FCBD1657B8F95213ABEA4100
.Ltf_hi__x2__and__fwd_aff:
	.octa 0x3F893781E95FE1576CDA64D2BA0CB204

.section .rodata.cst8, "aM", @progbits, 8
.align 8
/* AES affine: */
#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0)
.Ltf_aff_bitmatrix:
	.quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1),
		    BV8(1, 1, 0, 0, 0, 1, 1, 1),
		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 1, 0, 0, 0, 1),
		    BV8(1, 1, 1, 1, 1, 0, 0, 0),
		    BV8(0, 1, 1, 1, 1, 1, 0, 0),
		    BV8(0, 0, 1, 1, 1, 1, 1, 0),
		    BV8(0, 0, 0, 1, 1, 1, 1, 1))

/* AES inverse affine: */
#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0)
.Ltf_inv_bitmatrix:
	.quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1),
		    BV8(1, 0, 0, 1, 0, 0, 1, 0),
		    BV8(0, 1, 0, 0, 1, 0, 0, 1),
		    BV8(1, 0, 1, 0, 0, 1, 0, 0),
		    BV8(0, 1, 0, 1, 0, 0, 1, 0),
		    BV8(0, 0, 1, 0, 1, 0, 0, 1),
		    BV8(1, 0, 0, 1, 0, 1, 0, 0),
		    BV8(0, 1, 0, 0, 1, 0, 1, 0))

/* S2: */
#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1)
.Ltf_s2_bitmatrix:
	.quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1),
		    BV8(0, 0, 1, 1, 1, 1, 1, 1),
		    BV8(1, 1, 1, 0, 1, 1, 0, 1),
		    BV8(1, 1, 0, 0, 0, 0, 1, 1),
		    BV8(0, 1, 0, 0, 0, 0, 1, 1),
		    BV8(1, 1, 0, 0, 1, 1, 1, 0),
		    BV8(0, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 1, 0, 1, 1, 0))

/* X2: */
#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0)
.Ltf_x2_bitmatrix:
	.quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0),
		    BV8(0, 0, 1, 0, 0, 1, 1, 0),
		    BV8(0, 0, 0, 0, 1, 0, 1, 0),
		    BV8(1, 1, 1, 0, 0, 0, 1, 1),
		    BV8(1, 1, 1, 0, 1, 1, 0, 0),
		    BV8(0, 1, 1, 0, 1, 0, 1, 1),
		    BV8(1, 0, 1, 1, 1, 1, 0, 1),
		    BV8(1, 0, 0, 1, 0, 0, 1, 1))

/* Identity matrix: */
.Ltf_id_bitmatrix:
	.quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0),
		    BV8(0, 1, 0, 0, 0, 0, 0, 0),
		    BV8(0, 0, 1, 0, 0, 0, 0, 0),
		    BV8(0, 0, 0, 1, 0, 0, 0, 0),
		    BV8(0, 0, 0, 0, 1, 0, 0, 0),
		    BV8(0, 0, 0, 0, 0, 1, 0, 0),
		    BV8(0, 0, 0, 0, 0, 0, 1, 0),
		    BV8(0, 0, 0, 0, 0, 0, 0, 1))

/* 4-bit mask */
.section .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4
.align 4
.L0f0f0f0f:
	.long 0x0f0f0f0f

.text

SYM_FUNC_START_LOCAL(__aria_aesni_avx_crypt_16way)
	/* input:
	 *      %r9: rk
	 *      %rsi: dst
	 *      %rdx: src
	 *      %xmm0..%xmm15: 16 byte-sliced blocks
	 */

	FRAME_BEGIN

	movq %rsi, %rax;
	leaq 8 * 16(%rax), %r8;

	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		      %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		      %xmm15, %rax, %r8);
	aria_fo(%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 0);
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 1);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 2);
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 3);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 4);
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 5);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 6);
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 7);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 8);
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 9);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 10);
	cmpl $12, rounds(CTX);
	jne .Laria_192;
	aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 11, 12);
	jmp .Laria_end;
.Laria_192:
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 11);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 12);
	cmpl $14, rounds(CTX);
	jne .Laria_256;
	aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 13, 14);
	jmp .Laria_end;
.Laria_256:
	aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 13);
	aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15,
		%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		%rax, %r9, 14);
	aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		%xmm15, %rax, %r9, 15, 16);
.Laria_end:
	debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
			   %xmm9, %xmm13, %xmm0, %xmm5,
			   %xmm10, %xmm14, %xmm3, %xmm6,
			   %xmm11, %xmm15, %xmm2, %xmm7,
			   (%rax), (%r8));

	FRAME_END
	RET;
SYM_FUNC_END(__aria_aesni_avx_crypt_16way)

SYM_TYPED_FUNC_START(aria_aesni_avx_encrypt_16way)
	/* input:
	 *      %rdi: ctx, CTX
	 *      %rsi: dst
	 *      %rdx: src
	 */

	FRAME_BEGIN

	leaq enc_key(CTX), %r9;

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx);

	call __aria_aesni_avx_crypt_16way;

	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_encrypt_16way)

SYM_TYPED_FUNC_START(aria_aesni_avx_decrypt_16way)
	/* input:
	 *      %rdi: ctx, CTX
	 *      %rsi: dst
	 *      %rdx: src
	 */

	FRAME_BEGIN

	leaq dec_key(CTX), %r9;

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx);

	call __aria_aesni_avx_crypt_16way;

	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_decrypt_16way)

SYM_FUNC_START_LOCAL(__aria_aesni_avx_ctr_gen_keystream_16way)
	/* input:
	 *      %rdi: ctx
	 *      %rsi: dst
	 *      %rdx: src
	 *      %rcx: keystream
	 *      %r8: iv (big endian, 128bit)
	 */

	FRAME_BEGIN
	/* load IV and byteswap */
	vmovdqu (%r8), %xmm8;

	vmovdqa .Lbswap128_mask (%rip), %xmm1;
	vpshufb %xmm1, %xmm8, %xmm3; /* be => le */

	vpcmpeqd %xmm0, %xmm0, %xmm0;
	vpsrldq $8, %xmm0, %xmm0; /* low: -1, high: 0 */

	/* construct IVs */
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm9;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm10;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm11;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm12;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm13;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm14;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm15;
	vmovdqu %xmm8, (0 * 16)(%rcx);
	vmovdqu %xmm9, (1 * 16)(%rcx);
	vmovdqu %xmm10, (2 * 16)(%rcx);
	vmovdqu %xmm11, (3 * 16)(%rcx);
	vmovdqu %xmm12, (4 * 16)(%rcx);
	vmovdqu %xmm13, (5 * 16)(%rcx);
	vmovdqu %xmm14, (6 * 16)(%rcx);
	vmovdqu %xmm15, (7 * 16)(%rcx);

	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm8;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm9;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm10;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm11;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm12;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm13;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm14;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm15;
	inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */
	vpshufb %xmm1, %xmm3, %xmm4;
	vmovdqu %xmm4, (%r8);

	vmovdqu (0 * 16)(%rcx), %xmm0;
	vmovdqu (1 * 16)(%rcx), %xmm1;
	vmovdqu (2 * 16)(%rcx), %xmm2;
	vmovdqu (3 * 16)(%rcx), %xmm3;
	vmovdqu (4 * 16)(%rcx), %xmm4;
	vmovdqu (5 * 16)(%rcx), %xmm5;
	vmovdqu (6 * 16)(%rcx), %xmm6;
	vmovdqu (7 * 16)(%rcx), %xmm7;

	FRAME_END
	RET;
SYM_FUNC_END(__aria_aesni_avx_ctr_gen_keystream_16way)

SYM_TYPED_FUNC_START(aria_aesni_avx_ctr_crypt_16way)
	/* input:
	 *      %rdi: ctx
	 *      %rsi: dst
	 *      %rdx: src
	 *      %rcx: keystream
	 *      %r8: iv (big endian, 128bit)
	 */
	FRAME_BEGIN

	call __aria_aesni_avx_ctr_gen_keystream_16way;

	leaq (%rsi), %r10;
	leaq (%rdx), %r11;
	leaq (%rcx), %rsi;
	leaq (%rcx), %rdx;
	leaq enc_key(CTX), %r9;

	call __aria_aesni_avx_crypt_16way;

	vpxor (0 * 16)(%r11), %xmm1, %xmm1;
	vpxor (1 * 16)(%r11), %xmm0, %xmm0;
	vpxor (2 * 16)(%r11), %xmm3, %xmm3;
	vpxor (3 * 16)(%r11), %xmm2, %xmm2;
	vpxor (4 * 16)(%r11), %xmm4, %xmm4;
	vpxor (5 * 16)(%r11), %xmm5, %xmm5;
	vpxor (6 * 16)(%r11), %xmm6, %xmm6;
	vpxor (7 * 16)(%r11), %xmm7, %xmm7;
	vpxor (8 * 16)(%r11), %xmm8, %xmm8;
	vpxor (9 * 16)(%r11), %xmm9, %xmm9;
	vpxor (10 * 16)(%r11), %xmm10, %xmm10;
	vpxor (11 * 16)(%r11), %xmm11, %xmm11;
	vpxor (12 * 16)(%r11), %xmm12, %xmm12;
	vpxor (13 * 16)(%r11), %xmm13, %xmm13;
	vpxor (14 * 16)(%r11), %xmm14, %xmm14;
	vpxor (15 * 16)(%r11), %xmm15, %xmm15;
	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %r10);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_ctr_crypt_16way)

SYM_FUNC_START_LOCAL(__aria_aesni_avx_gfni_crypt_16way)
	/* input:
	 *      %r9: rk
	 *      %rsi: dst
	 *      %rdx: src
	 *      %xmm0..%xmm15: 16 byte-sliced blocks
	 */

	FRAME_BEGIN

	movq %rsi, %rax;
	leaq 8 * 16(%rax), %r8;

	inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3,
		      %xmm4, %xmm5, %xmm6, %xmm7,
		      %xmm8, %xmm9, %xmm10, %xmm11,
		      %xmm12, %xmm13, %xmm14,
		      %xmm15, %rax, %r8);
	aria_fo_gfni(%xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 0);
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 1);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 2);
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 3);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 4);
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 5);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 6);
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 7);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 8);
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 9);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 10);
	cmpl $12, rounds(CTX);
	jne .Laria_gfni_192;
	aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 11, 12);
	jmp .Laria_gfni_end;
.Laria_gfni_192:
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 11);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 12);
	cmpl $14, rounds(CTX);
	jne .Laria_gfni_256;
	aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 13, 14);
	jmp .Laria_gfni_end;
.Laria_gfni_256:
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 13);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 14);
	aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 15, 16);
.Laria_gfni_end:
	debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
			   %xmm9, %xmm13, %xmm0, %xmm5,
			   %xmm10, %xmm14, %xmm3, %xmm6,
			   %xmm11, %xmm15, %xmm2, %xmm7,
			   (%rax), (%r8));

	FRAME_END
	RET;
SYM_FUNC_END(__aria_aesni_avx_gfni_crypt_16way)

SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_encrypt_16way)
	/* input:
	 *      %rdi: ctx, CTX
	 *      %rsi: dst
	 *      %rdx: src
	 */

	FRAME_BEGIN

	leaq enc_key(CTX), %r9;

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx);

	call __aria_aesni_avx_gfni_crypt_16way;

	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_gfni_encrypt_16way)

SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_decrypt_16way)
	/* input:
	 *      %rdi: ctx, CTX
	 *      %rsi: dst
	 *      %rdx: src
	 */

	FRAME_BEGIN

	leaq dec_key(CTX), %r9;

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx);

	call __aria_aesni_avx_gfni_crypt_16way;

	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_gfni_decrypt_16way)

SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_ctr_crypt_16way)
	/* input:
	 *      %rdi: ctx
	 *      %rsi: dst
	 *      %rdx: src
	 *      %rcx: keystream
	 *      %r8: iv (big endian, 128bit)
	 */
	FRAME_BEGIN

	call __aria_aesni_avx_ctr_gen_keystream_16way

	leaq (%rsi), %r10;
	leaq (%rdx), %r11;
	leaq (%rcx), %rsi;
	leaq (%rcx), %rdx;
	leaq enc_key(CTX), %r9;

	call __aria_aesni_avx_gfni_crypt_16way;

	vpxor (0 * 16)(%r11), %xmm1, %xmm1;
	vpxor (1 * 16)(%r11), %xmm0, %xmm0;
	vpxor (2 * 16)(%r11), %xmm3, %xmm3;
	vpxor (3 * 16)(%r11), %xmm2, %xmm2;
	vpxor (4 * 16)(%r11), %xmm4, %xmm4;
	vpxor (5 * 16)(%r11), %xmm5, %xmm5;
	vpxor (6 * 16)(%r11), %xmm6, %xmm6;
	vpxor (7 * 16)(%r11), %xmm7, %xmm7;
	vpxor (8 * 16)(%r11), %xmm8, %xmm8;
	vpxor (9 * 16)(%r11), %xmm9, %xmm9;
	vpxor (10 * 16)(%r11), %xmm10, %xmm10;
	vpxor (11 * 16)(%r11), %xmm11, %xmm11;
	vpxor (12 * 16)(%r11), %xmm12, %xmm12;
	vpxor (13 * 16)(%r11), %xmm13, %xmm13;
	vpxor (14 * 16)(%r11), %xmm14, %xmm14;
	vpxor (15 * 16)(%r11), %xmm15, %xmm15;
	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %r10);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_gfni_ctr_crypt_16way)
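
/*
 * For reference: a minimal sketch of the C-side prototypes these
 * SYM_TYPED_FUNC_START entry points are assumed to match, inferred from the
 * register-usage comments above (%rdi: ctx, %rsi: dst, %rdx: src,
 * %rcx: keystream, %r8: iv).  The authoritative asmlinkage declarations live
 * in the AVX glue code, not in this file; types here are an assumption:
 *
 *	void aria_aesni_avx_encrypt_16way(const void *ctx, u8 *dst,
 *					  const u8 *src);
 *	void aria_aesni_avx_decrypt_16way(const void *ctx, u8 *dst,
 *					  const u8 *src);
 *	void aria_aesni_avx_ctr_crypt_16way(const void *ctx, u8 *dst,
 *					    const u8 *src, u8 *keystream,
 *					    u8 *iv);
 *
 * The _gfni_ variants are assumed to take the same arguments.
 */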