1ba3579e6STaehee Yoo/* SPDX-License-Identifier: GPL-2.0-or-later */ 2ba3579e6STaehee Yoo/* 3ba3579e6STaehee Yoo * ARIA Cipher 16-way parallel algorithm (AVX) 4ba3579e6STaehee Yoo * 5ba3579e6STaehee Yoo * Copyright (c) 2022 Taehee Yoo <ap420073@gmail.com> 6ba3579e6STaehee Yoo * 7ba3579e6STaehee Yoo */ 8ba3579e6STaehee Yoo 9ba3579e6STaehee Yoo#include <linux/linkage.h> 10c67b553aSEric Biggers#include <linux/cfi_types.h> 1135344cf3STaehee Yoo#include <asm/asm-offsets.h> 12ba3579e6STaehee Yoo#include <asm/frame.h> 13ba3579e6STaehee Yoo 14ba3579e6STaehee Yoo/* register macros */ 15ba3579e6STaehee Yoo#define CTX %rdi 16ba3579e6STaehee Yoo 17ba3579e6STaehee Yoo 18ba3579e6STaehee Yoo#define BV8(a0, a1, a2, a3, a4, a5, a6, a7) \ 19ba3579e6STaehee Yoo ( (((a0) & 1) << 0) | \ 20ba3579e6STaehee Yoo (((a1) & 1) << 1) | \ 21ba3579e6STaehee Yoo (((a2) & 1) << 2) | \ 22ba3579e6STaehee Yoo (((a3) & 1) << 3) | \ 23ba3579e6STaehee Yoo (((a4) & 1) << 4) | \ 24ba3579e6STaehee Yoo (((a5) & 1) << 5) | \ 25ba3579e6STaehee Yoo (((a6) & 1) << 6) | \ 26ba3579e6STaehee Yoo (((a7) & 1) << 7) ) 27ba3579e6STaehee Yoo 28ba3579e6STaehee Yoo#define BM8X8(l0, l1, l2, l3, l4, l5, l6, l7) \ 29ba3579e6STaehee Yoo ( ((l7) << (0 * 8)) | \ 30ba3579e6STaehee Yoo ((l6) << (1 * 8)) | \ 31ba3579e6STaehee Yoo ((l5) << (2 * 8)) | \ 32ba3579e6STaehee Yoo ((l4) << (3 * 8)) | \ 33ba3579e6STaehee Yoo ((l3) << (4 * 8)) | \ 34ba3579e6STaehee Yoo ((l2) << (5 * 8)) | \ 35ba3579e6STaehee Yoo ((l1) << (6 * 8)) | \ 36ba3579e6STaehee Yoo ((l0) << (7 * 8)) ) 37ba3579e6STaehee Yoo 38ba3579e6STaehee Yoo#define inc_le128(x, minus_one, tmp) \ 39ba3579e6STaehee Yoo vpcmpeqq minus_one, x, tmp; \ 40ba3579e6STaehee Yoo vpsubq minus_one, x, x; \ 41ba3579e6STaehee Yoo vpslldq $8, tmp, tmp; \ 42ba3579e6STaehee Yoo vpsubq tmp, x, x; 43ba3579e6STaehee Yoo 44ba3579e6STaehee Yoo#define filter_8bit(x, lo_t, hi_t, mask4bit, tmp0) \ 45ba3579e6STaehee Yoo vpand x, mask4bit, tmp0; \ 46ba3579e6STaehee Yoo vpandn x, mask4bit, x; \ 47ba3579e6STaehee Yoo vpsrld $4, x, x; \ 48ba3579e6STaehee Yoo \ 49ba3579e6STaehee Yoo vpshufb tmp0, lo_t, tmp0; \ 50ba3579e6STaehee Yoo vpshufb x, hi_t, x; \ 51ba3579e6STaehee Yoo vpxor tmp0, x, x; 52ba3579e6STaehee Yoo 53ba3579e6STaehee Yoo#define transpose_4x4(x0, x1, x2, x3, t1, t2) \ 54ba3579e6STaehee Yoo vpunpckhdq x1, x0, t2; \ 55ba3579e6STaehee Yoo vpunpckldq x1, x0, x0; \ 56ba3579e6STaehee Yoo \ 57ba3579e6STaehee Yoo vpunpckldq x3, x2, t1; \ 58ba3579e6STaehee Yoo vpunpckhdq x3, x2, x2; \ 59ba3579e6STaehee Yoo \ 60ba3579e6STaehee Yoo vpunpckhqdq t1, x0, x1; \ 61ba3579e6STaehee Yoo vpunpcklqdq t1, x0, x0; \ 62ba3579e6STaehee Yoo \ 63ba3579e6STaehee Yoo vpunpckhqdq x2, t2, x3; \ 64ba3579e6STaehee Yoo vpunpcklqdq x2, t2, x2; 65ba3579e6STaehee Yoo 66ba3579e6STaehee Yoo#define byteslice_16x16b(a0, b0, c0, d0, \ 67ba3579e6STaehee Yoo a1, b1, c1, d1, \ 68ba3579e6STaehee Yoo a2, b2, c2, d2, \ 69ba3579e6STaehee Yoo a3, b3, c3, d3, \ 70ba3579e6STaehee Yoo st0, st1) \ 71ba3579e6STaehee Yoo vmovdqu d2, st0; \ 72ba3579e6STaehee Yoo vmovdqu d3, st1; \ 73ba3579e6STaehee Yoo transpose_4x4(a0, a1, a2, a3, d2, d3); \ 74ba3579e6STaehee Yoo transpose_4x4(b0, b1, b2, b3, d2, d3); \ 75ba3579e6STaehee Yoo vmovdqu st0, d2; \ 76ba3579e6STaehee Yoo vmovdqu st1, d3; \ 77ba3579e6STaehee Yoo \ 78ba3579e6STaehee Yoo vmovdqu a0, st0; \ 79ba3579e6STaehee Yoo vmovdqu a1, st1; \ 80ba3579e6STaehee Yoo transpose_4x4(c0, c1, c2, c3, a0, a1); \ 81ba3579e6STaehee Yoo transpose_4x4(d0, d1, d2, d3, a0, a1); \ 82ba3579e6STaehee Yoo \ 83*52fc482aSArd Biesheuvel vmovdqu 
.Lshufb_16x16b(%rip), a0; \ 84ba3579e6STaehee Yoo vmovdqu st1, a1; \ 85ba3579e6STaehee Yoo vpshufb a0, a2, a2; \ 86ba3579e6STaehee Yoo vpshufb a0, a3, a3; \ 87ba3579e6STaehee Yoo vpshufb a0, b0, b0; \ 88ba3579e6STaehee Yoo vpshufb a0, b1, b1; \ 89ba3579e6STaehee Yoo vpshufb a0, b2, b2; \ 90ba3579e6STaehee Yoo vpshufb a0, b3, b3; \ 91ba3579e6STaehee Yoo vpshufb a0, a1, a1; \ 92ba3579e6STaehee Yoo vpshufb a0, c0, c0; \ 93ba3579e6STaehee Yoo vpshufb a0, c1, c1; \ 94ba3579e6STaehee Yoo vpshufb a0, c2, c2; \ 95ba3579e6STaehee Yoo vpshufb a0, c3, c3; \ 96ba3579e6STaehee Yoo vpshufb a0, d0, d0; \ 97ba3579e6STaehee Yoo vpshufb a0, d1, d1; \ 98ba3579e6STaehee Yoo vpshufb a0, d2, d2; \ 99ba3579e6STaehee Yoo vpshufb a0, d3, d3; \ 100ba3579e6STaehee Yoo vmovdqu d3, st1; \ 101ba3579e6STaehee Yoo vmovdqu st0, d3; \ 102ba3579e6STaehee Yoo vpshufb a0, d3, a0; \ 103ba3579e6STaehee Yoo vmovdqu d2, st0; \ 104ba3579e6STaehee Yoo \ 105ba3579e6STaehee Yoo transpose_4x4(a0, b0, c0, d0, d2, d3); \ 106ba3579e6STaehee Yoo transpose_4x4(a1, b1, c1, d1, d2, d3); \ 107ba3579e6STaehee Yoo vmovdqu st0, d2; \ 108ba3579e6STaehee Yoo vmovdqu st1, d3; \ 109ba3579e6STaehee Yoo \ 110ba3579e6STaehee Yoo vmovdqu b0, st0; \ 111ba3579e6STaehee Yoo vmovdqu b1, st1; \ 112ba3579e6STaehee Yoo transpose_4x4(a2, b2, c2, d2, b0, b1); \ 113ba3579e6STaehee Yoo transpose_4x4(a3, b3, c3, d3, b0, b1); \ 114ba3579e6STaehee Yoo vmovdqu st0, b0; \ 115ba3579e6STaehee Yoo vmovdqu st1, b1; \ 116ba3579e6STaehee Yoo /* does not adjust output bytes inside vectors */ 117ba3579e6STaehee Yoo 118ba3579e6STaehee Yoo#define debyteslice_16x16b(a0, b0, c0, d0, \ 119ba3579e6STaehee Yoo a1, b1, c1, d1, \ 120ba3579e6STaehee Yoo a2, b2, c2, d2, \ 121ba3579e6STaehee Yoo a3, b3, c3, d3, \ 122ba3579e6STaehee Yoo st0, st1) \ 123ba3579e6STaehee Yoo vmovdqu d2, st0; \ 124ba3579e6STaehee Yoo vmovdqu d3, st1; \ 125ba3579e6STaehee Yoo transpose_4x4(a0, a1, a2, a3, d2, d3); \ 126ba3579e6STaehee Yoo transpose_4x4(b0, b1, b2, b3, d2, d3); \ 127ba3579e6STaehee Yoo vmovdqu st0, d2; \ 128ba3579e6STaehee Yoo vmovdqu st1, d3; \ 129ba3579e6STaehee Yoo \ 130ba3579e6STaehee Yoo vmovdqu a0, st0; \ 131ba3579e6STaehee Yoo vmovdqu a1, st1; \ 132ba3579e6STaehee Yoo transpose_4x4(c0, c1, c2, c3, a0, a1); \ 133ba3579e6STaehee Yoo transpose_4x4(d0, d1, d2, d3, a0, a1); \ 134ba3579e6STaehee Yoo \ 135*52fc482aSArd Biesheuvel vmovdqu .Lshufb_16x16b(%rip), a0; \ 136ba3579e6STaehee Yoo vmovdqu st1, a1; \ 137ba3579e6STaehee Yoo vpshufb a0, a2, a2; \ 138ba3579e6STaehee Yoo vpshufb a0, a3, a3; \ 139ba3579e6STaehee Yoo vpshufb a0, b0, b0; \ 140ba3579e6STaehee Yoo vpshufb a0, b1, b1; \ 141ba3579e6STaehee Yoo vpshufb a0, b2, b2; \ 142ba3579e6STaehee Yoo vpshufb a0, b3, b3; \ 143ba3579e6STaehee Yoo vpshufb a0, a1, a1; \ 144ba3579e6STaehee Yoo vpshufb a0, c0, c0; \ 145ba3579e6STaehee Yoo vpshufb a0, c1, c1; \ 146ba3579e6STaehee Yoo vpshufb a0, c2, c2; \ 147ba3579e6STaehee Yoo vpshufb a0, c3, c3; \ 148ba3579e6STaehee Yoo vpshufb a0, d0, d0; \ 149ba3579e6STaehee Yoo vpshufb a0, d1, d1; \ 150ba3579e6STaehee Yoo vpshufb a0, d2, d2; \ 151ba3579e6STaehee Yoo vpshufb a0, d3, d3; \ 152ba3579e6STaehee Yoo vmovdqu d3, st1; \ 153ba3579e6STaehee Yoo vmovdqu st0, d3; \ 154ba3579e6STaehee Yoo vpshufb a0, d3, a0; \ 155ba3579e6STaehee Yoo vmovdqu d2, st0; \ 156ba3579e6STaehee Yoo \ 157ba3579e6STaehee Yoo transpose_4x4(c0, d0, a0, b0, d2, d3); \ 158ba3579e6STaehee Yoo transpose_4x4(c1, d1, a1, b1, d2, d3); \ 159ba3579e6STaehee Yoo vmovdqu st0, d2; \ 160ba3579e6STaehee Yoo vmovdqu st1, d3; \ 161ba3579e6STaehee Yoo \ 
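	/* second half: 4x4 dword transposes of the c2/d2/a2/b2 and c3/d3/a3/b3 groups, with b0/b1 spilled to st0/st1 as scratch */ \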
162ba3579e6STaehee Yoo vmovdqu b0, st0; \ 163ba3579e6STaehee Yoo vmovdqu b1, st1; \ 164ba3579e6STaehee Yoo transpose_4x4(c2, d2, a2, b2, b0, b1); \ 165ba3579e6STaehee Yoo transpose_4x4(c3, d3, a3, b3, b0, b1); \ 166ba3579e6STaehee Yoo vmovdqu st0, b0; \ 167ba3579e6STaehee Yoo vmovdqu st1, b1; \ 168ba3579e6STaehee Yoo /* does not adjust output bytes inside vectors */ 169ba3579e6STaehee Yoo 170ba3579e6STaehee Yoo/* load blocks to registers and apply pre-whitening */ 171ba3579e6STaehee Yoo#define inpack16_pre(x0, x1, x2, x3, \ 172ba3579e6STaehee Yoo x4, x5, x6, x7, \ 173ba3579e6STaehee Yoo y0, y1, y2, y3, \ 174ba3579e6STaehee Yoo y4, y5, y6, y7, \ 175ba3579e6STaehee Yoo rio) \ 176ba3579e6STaehee Yoo vmovdqu (0 * 16)(rio), x0; \ 177ba3579e6STaehee Yoo vmovdqu (1 * 16)(rio), x1; \ 178ba3579e6STaehee Yoo vmovdqu (2 * 16)(rio), x2; \ 179ba3579e6STaehee Yoo vmovdqu (3 * 16)(rio), x3; \ 180ba3579e6STaehee Yoo vmovdqu (4 * 16)(rio), x4; \ 181ba3579e6STaehee Yoo vmovdqu (5 * 16)(rio), x5; \ 182ba3579e6STaehee Yoo vmovdqu (6 * 16)(rio), x6; \ 183ba3579e6STaehee Yoo vmovdqu (7 * 16)(rio), x7; \ 184ba3579e6STaehee Yoo vmovdqu (8 * 16)(rio), y0; \ 185ba3579e6STaehee Yoo vmovdqu (9 * 16)(rio), y1; \ 186ba3579e6STaehee Yoo vmovdqu (10 * 16)(rio), y2; \ 187ba3579e6STaehee Yoo vmovdqu (11 * 16)(rio), y3; \ 188ba3579e6STaehee Yoo vmovdqu (12 * 16)(rio), y4; \ 189ba3579e6STaehee Yoo vmovdqu (13 * 16)(rio), y5; \ 190ba3579e6STaehee Yoo vmovdqu (14 * 16)(rio), y6; \ 191ba3579e6STaehee Yoo vmovdqu (15 * 16)(rio), y7; 192ba3579e6STaehee Yoo 193ba3579e6STaehee Yoo/* byteslice pre-whitened blocks and store to temporary memory */ 194ba3579e6STaehee Yoo#define inpack16_post(x0, x1, x2, x3, \ 195ba3579e6STaehee Yoo x4, x5, x6, x7, \ 196ba3579e6STaehee Yoo y0, y1, y2, y3, \ 197ba3579e6STaehee Yoo y4, y5, y6, y7, \ 198ba3579e6STaehee Yoo mem_ab, mem_cd) \ 199ba3579e6STaehee Yoo byteslice_16x16b(x0, x1, x2, x3, \ 200ba3579e6STaehee Yoo x4, x5, x6, x7, \ 201ba3579e6STaehee Yoo y0, y1, y2, y3, \ 202ba3579e6STaehee Yoo y4, y5, y6, y7, \ 203ba3579e6STaehee Yoo (mem_ab), (mem_cd)); \ 204ba3579e6STaehee Yoo \ 205ba3579e6STaehee Yoo vmovdqu x0, 0 * 16(mem_ab); \ 206ba3579e6STaehee Yoo vmovdqu x1, 1 * 16(mem_ab); \ 207ba3579e6STaehee Yoo vmovdqu x2, 2 * 16(mem_ab); \ 208ba3579e6STaehee Yoo vmovdqu x3, 3 * 16(mem_ab); \ 209ba3579e6STaehee Yoo vmovdqu x4, 4 * 16(mem_ab); \ 210ba3579e6STaehee Yoo vmovdqu x5, 5 * 16(mem_ab); \ 211ba3579e6STaehee Yoo vmovdqu x6, 6 * 16(mem_ab); \ 212ba3579e6STaehee Yoo vmovdqu x7, 7 * 16(mem_ab); \ 213ba3579e6STaehee Yoo vmovdqu y0, 0 * 16(mem_cd); \ 214ba3579e6STaehee Yoo vmovdqu y1, 1 * 16(mem_cd); \ 215ba3579e6STaehee Yoo vmovdqu y2, 2 * 16(mem_cd); \ 216ba3579e6STaehee Yoo vmovdqu y3, 3 * 16(mem_cd); \ 217ba3579e6STaehee Yoo vmovdqu y4, 4 * 16(mem_cd); \ 218ba3579e6STaehee Yoo vmovdqu y5, 5 * 16(mem_cd); \ 219ba3579e6STaehee Yoo vmovdqu y6, 6 * 16(mem_cd); \ 220ba3579e6STaehee Yoo vmovdqu y7, 7 * 16(mem_cd); 221ba3579e6STaehee Yoo 222ba3579e6STaehee Yoo#define write_output(x0, x1, x2, x3, \ 223ba3579e6STaehee Yoo x4, x5, x6, x7, \ 224ba3579e6STaehee Yoo y0, y1, y2, y3, \ 225ba3579e6STaehee Yoo y4, y5, y6, y7, \ 226ba3579e6STaehee Yoo mem) \ 227ba3579e6STaehee Yoo vmovdqu x0, 0 * 16(mem); \ 228ba3579e6STaehee Yoo vmovdqu x1, 1 * 16(mem); \ 229ba3579e6STaehee Yoo vmovdqu x2, 2 * 16(mem); \ 230ba3579e6STaehee Yoo vmovdqu x3, 3 * 16(mem); \ 231ba3579e6STaehee Yoo vmovdqu x4, 4 * 16(mem); \ 232ba3579e6STaehee Yoo vmovdqu x5, 5 * 16(mem); \ 233ba3579e6STaehee Yoo vmovdqu x6, 6 * 16(mem); \ 
234ba3579e6STaehee Yoo vmovdqu x7, 7 * 16(mem); \ 235ba3579e6STaehee Yoo vmovdqu y0, 8 * 16(mem); \ 236ba3579e6STaehee Yoo vmovdqu y1, 9 * 16(mem); \ 237ba3579e6STaehee Yoo vmovdqu y2, 10 * 16(mem); \ 238ba3579e6STaehee Yoo vmovdqu y3, 11 * 16(mem); \ 239ba3579e6STaehee Yoo vmovdqu y4, 12 * 16(mem); \ 240ba3579e6STaehee Yoo vmovdqu y5, 13 * 16(mem); \ 241ba3579e6STaehee Yoo vmovdqu y6, 14 * 16(mem); \ 242ba3579e6STaehee Yoo vmovdqu y7, 15 * 16(mem); \ 243ba3579e6STaehee Yoo 244ba3579e6STaehee Yoo#define aria_store_state_8way(x0, x1, x2, x3, \ 245ba3579e6STaehee Yoo x4, x5, x6, x7, \ 246ba3579e6STaehee Yoo mem_tmp, idx) \ 247ba3579e6STaehee Yoo vmovdqu x0, ((idx + 0) * 16)(mem_tmp); \ 248ba3579e6STaehee Yoo vmovdqu x1, ((idx + 1) * 16)(mem_tmp); \ 249ba3579e6STaehee Yoo vmovdqu x2, ((idx + 2) * 16)(mem_tmp); \ 250ba3579e6STaehee Yoo vmovdqu x3, ((idx + 3) * 16)(mem_tmp); \ 251ba3579e6STaehee Yoo vmovdqu x4, ((idx + 4) * 16)(mem_tmp); \ 252ba3579e6STaehee Yoo vmovdqu x5, ((idx + 5) * 16)(mem_tmp); \ 253ba3579e6STaehee Yoo vmovdqu x6, ((idx + 6) * 16)(mem_tmp); \ 254ba3579e6STaehee Yoo vmovdqu x7, ((idx + 7) * 16)(mem_tmp); 255ba3579e6STaehee Yoo 256ba3579e6STaehee Yoo#define aria_load_state_8way(x0, x1, x2, x3, \ 257ba3579e6STaehee Yoo x4, x5, x6, x7, \ 258ba3579e6STaehee Yoo mem_tmp, idx) \ 259ba3579e6STaehee Yoo vmovdqu ((idx + 0) * 16)(mem_tmp), x0; \ 260ba3579e6STaehee Yoo vmovdqu ((idx + 1) * 16)(mem_tmp), x1; \ 261ba3579e6STaehee Yoo vmovdqu ((idx + 2) * 16)(mem_tmp), x2; \ 262ba3579e6STaehee Yoo vmovdqu ((idx + 3) * 16)(mem_tmp), x3; \ 263ba3579e6STaehee Yoo vmovdqu ((idx + 4) * 16)(mem_tmp), x4; \ 264ba3579e6STaehee Yoo vmovdqu ((idx + 5) * 16)(mem_tmp), x5; \ 265ba3579e6STaehee Yoo vmovdqu ((idx + 6) * 16)(mem_tmp), x6; \ 266ba3579e6STaehee Yoo vmovdqu ((idx + 7) * 16)(mem_tmp), x7; 267ba3579e6STaehee Yoo 268ba3579e6STaehee Yoo#define aria_ark_8way(x0, x1, x2, x3, \ 269ba3579e6STaehee Yoo x4, x5, x6, x7, \ 2708b844753STaehee Yoo t0, t1, t2, rk, \ 2718b844753STaehee Yoo idx, round) \ 272ba3579e6STaehee Yoo /* AddRoundKey */ \ 2738b844753STaehee Yoo vbroadcastss ((round * 16) + idx + 0)(rk), t0; \ 2748b844753STaehee Yoo vpsrld $24, t0, t2; \ 2758b844753STaehee Yoo vpshufb t1, t2, t2; \ 2768b844753STaehee Yoo vpxor t2, x0, x0; \ 2778b844753STaehee Yoo vpsrld $16, t0, t2; \ 2788b844753STaehee Yoo vpshufb t1, t2, t2; \ 2798b844753STaehee Yoo vpxor t2, x1, x1; \ 2808b844753STaehee Yoo vpsrld $8, t0, t2; \ 2818b844753STaehee Yoo vpshufb t1, t2, t2; \ 2828b844753STaehee Yoo vpxor t2, x2, x2; \ 2838b844753STaehee Yoo vpshufb t1, t0, t2; \ 2848b844753STaehee Yoo vpxor t2, x3, x3; \ 2858b844753STaehee Yoo vbroadcastss ((round * 16) + idx + 4)(rk), t0; \ 2868b844753STaehee Yoo vpsrld $24, t0, t2; \ 2878b844753STaehee Yoo vpshufb t1, t2, t2; \ 2888b844753STaehee Yoo vpxor t2, x4, x4; \ 2898b844753STaehee Yoo vpsrld $16, t0, t2; \ 2908b844753STaehee Yoo vpshufb t1, t2, t2; \ 2918b844753STaehee Yoo vpxor t2, x5, x5; \ 2928b844753STaehee Yoo vpsrld $8, t0, t2; \ 2938b844753STaehee Yoo vpshufb t1, t2, t2; \ 2948b844753STaehee Yoo vpxor t2, x6, x6; \ 2958b844753STaehee Yoo vpshufb t1, t0, t2; \ 2968b844753STaehee Yoo vpxor t2, x7, x7; 297ba3579e6STaehee Yoo 298e3cf2f87STaehee Yoo#ifdef CONFIG_AS_GFNI 299ba3579e6STaehee Yoo#define aria_sbox_8way_gfni(x0, x1, x2, x3, \ 300ba3579e6STaehee Yoo x4, x5, x6, x7, \ 301ba3579e6STaehee Yoo t0, t1, t2, t3, \ 302ba3579e6STaehee Yoo t4, t5, t6, t7) \ 303*52fc482aSArd Biesheuvel vmovdqa .Ltf_s2_bitmatrix(%rip), t0; \ 304*52fc482aSArd Biesheuvel vmovdqa 
.Ltf_inv_bitmatrix(%rip), t1; \ 305*52fc482aSArd Biesheuvel vmovdqa .Ltf_id_bitmatrix(%rip), t2; \ 306*52fc482aSArd Biesheuvel vmovdqa .Ltf_aff_bitmatrix(%rip), t3; \ 307*52fc482aSArd Biesheuvel vmovdqa .Ltf_x2_bitmatrix(%rip), t4; \ 308ba3579e6STaehee Yoo vgf2p8affineinvqb $(tf_s2_const), t0, x1, x1; \ 309ba3579e6STaehee Yoo vgf2p8affineinvqb $(tf_s2_const), t0, x5, x5; \ 310ba3579e6STaehee Yoo vgf2p8affineqb $(tf_inv_const), t1, x2, x2; \ 311ba3579e6STaehee Yoo vgf2p8affineqb $(tf_inv_const), t1, x6, x6; \ 312ba3579e6STaehee Yoo vgf2p8affineinvqb $0, t2, x2, x2; \ 313ba3579e6STaehee Yoo vgf2p8affineinvqb $0, t2, x6, x6; \ 314ba3579e6STaehee Yoo vgf2p8affineinvqb $(tf_aff_const), t3, x0, x0; \ 315ba3579e6STaehee Yoo vgf2p8affineinvqb $(tf_aff_const), t3, x4, x4; \ 316ba3579e6STaehee Yoo vgf2p8affineqb $(tf_x2_const), t4, x3, x3; \ 317ba3579e6STaehee Yoo vgf2p8affineqb $(tf_x2_const), t4, x7, x7; \ 318ba3579e6STaehee Yoo vgf2p8affineinvqb $0, t2, x3, x3; \ 319ba3579e6STaehee Yoo vgf2p8affineinvqb $0, t2, x7, x7 320ba3579e6STaehee Yoo 321e3cf2f87STaehee Yoo#endif /* CONFIG_AS_GFNI */ 322e3cf2f87STaehee Yoo 323ba3579e6STaehee Yoo#define aria_sbox_8way(x0, x1, x2, x3, \ 324ba3579e6STaehee Yoo x4, x5, x6, x7, \ 325ba3579e6STaehee Yoo t0, t1, t2, t3, \ 326ba3579e6STaehee Yoo t4, t5, t6, t7) \ 327*52fc482aSArd Biesheuvel vmovdqa .Linv_shift_row(%rip), t0; \ 328*52fc482aSArd Biesheuvel vmovdqa .Lshift_row(%rip), t1; \ 329*52fc482aSArd Biesheuvel vbroadcastss .L0f0f0f0f(%rip), t6; \ 330*52fc482aSArd Biesheuvel vmovdqa .Ltf_lo__inv_aff__and__s2(%rip), t2; \ 331*52fc482aSArd Biesheuvel vmovdqa .Ltf_hi__inv_aff__and__s2(%rip), t3; \ 332*52fc482aSArd Biesheuvel vmovdqa .Ltf_lo__x2__and__fwd_aff(%rip), t4; \ 333*52fc482aSArd Biesheuvel vmovdqa .Ltf_hi__x2__and__fwd_aff(%rip), t5; \ 334ba3579e6STaehee Yoo \ 335ba3579e6STaehee Yoo vaesenclast t7, x0, x0; \ 336ba3579e6STaehee Yoo vaesenclast t7, x4, x4; \ 337ba3579e6STaehee Yoo vaesenclast t7, x1, x1; \ 338ba3579e6STaehee Yoo vaesenclast t7, x5, x5; \ 339ba3579e6STaehee Yoo vaesdeclast t7, x2, x2; \ 340ba3579e6STaehee Yoo vaesdeclast t7, x6, x6; \ 341ba3579e6STaehee Yoo \ 342ba3579e6STaehee Yoo /* AES inverse shift rows */ \ 343ba3579e6STaehee Yoo vpshufb t0, x0, x0; \ 344ba3579e6STaehee Yoo vpshufb t0, x4, x4; \ 345ba3579e6STaehee Yoo vpshufb t0, x1, x1; \ 346ba3579e6STaehee Yoo vpshufb t0, x5, x5; \ 347ba3579e6STaehee Yoo vpshufb t1, x3, x3; \ 348ba3579e6STaehee Yoo vpshufb t1, x7, x7; \ 349ba3579e6STaehee Yoo vpshufb t1, x2, x2; \ 350ba3579e6STaehee Yoo vpshufb t1, x6, x6; \ 351ba3579e6STaehee Yoo \ 352ba3579e6STaehee Yoo /* affine transformation for S2 */ \ 353ba3579e6STaehee Yoo filter_8bit(x1, t2, t3, t6, t0); \ 354ba3579e6STaehee Yoo /* affine transformation for S2 */ \ 355ba3579e6STaehee Yoo filter_8bit(x5, t2, t3, t6, t0); \ 356ba3579e6STaehee Yoo \ 357ba3579e6STaehee Yoo /* affine transformation for X2 */ \ 358ba3579e6STaehee Yoo filter_8bit(x3, t4, t5, t6, t0); \ 359ba3579e6STaehee Yoo /* affine transformation for X2 */ \ 360ba3579e6STaehee Yoo filter_8bit(x7, t4, t5, t6, t0); \ 361ba3579e6STaehee Yoo vaesdeclast t7, x3, x3; \ 362ba3579e6STaehee Yoo vaesdeclast t7, x7, x7; 363ba3579e6STaehee Yoo 364ba3579e6STaehee Yoo#define aria_diff_m(x0, x1, x2, x3, \ 365ba3579e6STaehee Yoo t0, t1, t2, t3) \ 366ba3579e6STaehee Yoo /* T = rotr32(X, 8); */ \ 367ba3579e6STaehee Yoo /* X ^= T */ \ 368ba3579e6STaehee Yoo vpxor x0, x3, t0; \ 369ba3579e6STaehee Yoo vpxor x1, x0, t1; \ 370ba3579e6STaehee Yoo vpxor x2, x1, t2; \ 371ba3579e6STaehee Yoo vpxor x3, x2, 
t3; \ 372ba3579e6STaehee Yoo /* X = T ^ rotr(X, 16); */ \ 373ba3579e6STaehee Yoo vpxor t2, x0, x0; \ 374ba3579e6STaehee Yoo vpxor x1, t3, t3; \ 375ba3579e6STaehee Yoo vpxor t0, x2, x2; \ 376ba3579e6STaehee Yoo vpxor t1, x3, x1; \ 377ba3579e6STaehee Yoo vmovdqu t3, x3; 378ba3579e6STaehee Yoo 379ba3579e6STaehee Yoo#define aria_diff_word(x0, x1, x2, x3, \ 380ba3579e6STaehee Yoo x4, x5, x6, x7, \ 381ba3579e6STaehee Yoo y0, y1, y2, y3, \ 382ba3579e6STaehee Yoo y4, y5, y6, y7) \ 383ba3579e6STaehee Yoo /* t1 ^= t2; */ \ 384ba3579e6STaehee Yoo vpxor y0, x4, x4; \ 385ba3579e6STaehee Yoo vpxor y1, x5, x5; \ 386ba3579e6STaehee Yoo vpxor y2, x6, x6; \ 387ba3579e6STaehee Yoo vpxor y3, x7, x7; \ 388ba3579e6STaehee Yoo \ 389ba3579e6STaehee Yoo /* t2 ^= t3; */ \ 390ba3579e6STaehee Yoo vpxor y4, y0, y0; \ 391ba3579e6STaehee Yoo vpxor y5, y1, y1; \ 392ba3579e6STaehee Yoo vpxor y6, y2, y2; \ 393ba3579e6STaehee Yoo vpxor y7, y3, y3; \ 394ba3579e6STaehee Yoo \ 395ba3579e6STaehee Yoo /* t0 ^= t1; */ \ 396ba3579e6STaehee Yoo vpxor x4, x0, x0; \ 397ba3579e6STaehee Yoo vpxor x5, x1, x1; \ 398ba3579e6STaehee Yoo vpxor x6, x2, x2; \ 399ba3579e6STaehee Yoo vpxor x7, x3, x3; \ 400ba3579e6STaehee Yoo \ 401ba3579e6STaehee Yoo /* t3 ^= t1; */ \ 402ba3579e6STaehee Yoo vpxor x4, y4, y4; \ 403ba3579e6STaehee Yoo vpxor x5, y5, y5; \ 404ba3579e6STaehee Yoo vpxor x6, y6, y6; \ 405ba3579e6STaehee Yoo vpxor x7, y7, y7; \ 406ba3579e6STaehee Yoo \ 407ba3579e6STaehee Yoo /* t2 ^= t0; */ \ 408ba3579e6STaehee Yoo vpxor x0, y0, y0; \ 409ba3579e6STaehee Yoo vpxor x1, y1, y1; \ 410ba3579e6STaehee Yoo vpxor x2, y2, y2; \ 411ba3579e6STaehee Yoo vpxor x3, y3, y3; \ 412ba3579e6STaehee Yoo \ 413ba3579e6STaehee Yoo /* t1 ^= t2; */ \ 414ba3579e6STaehee Yoo vpxor y0, x4, x4; \ 415ba3579e6STaehee Yoo vpxor y1, x5, x5; \ 416ba3579e6STaehee Yoo vpxor y2, x6, x6; \ 417ba3579e6STaehee Yoo vpxor y3, x7, x7; 418ba3579e6STaehee Yoo 419ba3579e6STaehee Yoo#define aria_fe(x0, x1, x2, x3, \ 420ba3579e6STaehee Yoo x4, x5, x6, x7, \ 421ba3579e6STaehee Yoo y0, y1, y2, y3, \ 422ba3579e6STaehee Yoo y4, y5, y6, y7, \ 423ba3579e6STaehee Yoo mem_tmp, rk, round) \ 4248b844753STaehee Yoo vpxor y7, y7, y7; \ 425ba3579e6STaehee Yoo aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ 4268b844753STaehee Yoo y0, y7, y2, rk, 8, round); \ 427ba3579e6STaehee Yoo \ 428ba3579e6STaehee Yoo aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \ 429ba3579e6STaehee Yoo y0, y1, y2, y3, y4, y5, y6, y7); \ 430ba3579e6STaehee Yoo \ 431ba3579e6STaehee Yoo aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \ 432ba3579e6STaehee Yoo aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \ 433ba3579e6STaehee Yoo aria_store_state_8way(x0, x1, x2, x3, \ 434ba3579e6STaehee Yoo x4, x5, x6, x7, \ 435ba3579e6STaehee Yoo mem_tmp, 8); \ 436ba3579e6STaehee Yoo \ 437ba3579e6STaehee Yoo aria_load_state_8way(x0, x1, x2, x3, \ 438ba3579e6STaehee Yoo x4, x5, x6, x7, \ 439ba3579e6STaehee Yoo mem_tmp, 0); \ 440ba3579e6STaehee Yoo aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ 4418b844753STaehee Yoo y0, y7, y2, rk, 0, round); \ 442ba3579e6STaehee Yoo \ 443ba3579e6STaehee Yoo aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \ 444ba3579e6STaehee Yoo y0, y1, y2, y3, y4, y5, y6, y7); \ 445ba3579e6STaehee Yoo \ 446ba3579e6STaehee Yoo aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \ 447ba3579e6STaehee Yoo aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \ 448ba3579e6STaehee Yoo aria_store_state_8way(x0, x1, x2, x3, \ 449ba3579e6STaehee Yoo x4, x5, x6, x7, \ 450ba3579e6STaehee Yoo mem_tmp, 0); \ 451ba3579e6STaehee Yoo aria_load_state_8way(y0, 
y1, y2, y3, \ 452ba3579e6STaehee Yoo y4, y5, y6, y7, \ 453ba3579e6STaehee Yoo mem_tmp, 8); \ 454ba3579e6STaehee Yoo aria_diff_word(x0, x1, x2, x3, \ 455ba3579e6STaehee Yoo x4, x5, x6, x7, \ 456ba3579e6STaehee Yoo y0, y1, y2, y3, \ 457ba3579e6STaehee Yoo y4, y5, y6, y7); \ 458ba3579e6STaehee Yoo /* aria_diff_byte() \ 459ba3579e6STaehee Yoo * T3 = ABCD -> BADC \ 460ba3579e6STaehee Yoo * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \ 461ba3579e6STaehee Yoo * T0 = ABCD -> CDAB \ 462ba3579e6STaehee Yoo * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \ 463ba3579e6STaehee Yoo * T1 = ABCD -> DCBA \ 464ba3579e6STaehee Yoo * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \ 465ba3579e6STaehee Yoo */ \ 466ba3579e6STaehee Yoo aria_diff_word(x2, x3, x0, x1, \ 467ba3579e6STaehee Yoo x7, x6, x5, x4, \ 468ba3579e6STaehee Yoo y0, y1, y2, y3, \ 469ba3579e6STaehee Yoo y5, y4, y7, y6); \ 470ba3579e6STaehee Yoo aria_store_state_8way(x3, x2, x1, x0, \ 471ba3579e6STaehee Yoo x6, x7, x4, x5, \ 472ba3579e6STaehee Yoo mem_tmp, 0); 473ba3579e6STaehee Yoo 474ba3579e6STaehee Yoo#define aria_fo(x0, x1, x2, x3, \ 475ba3579e6STaehee Yoo x4, x5, x6, x7, \ 476ba3579e6STaehee Yoo y0, y1, y2, y3, \ 477ba3579e6STaehee Yoo y4, y5, y6, y7, \ 478ba3579e6STaehee Yoo mem_tmp, rk, round) \ 4798b844753STaehee Yoo vpxor y7, y7, y7; \ 480ba3579e6STaehee Yoo aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ 4818b844753STaehee Yoo y0, y7, y2, rk, 8, round); \ 482ba3579e6STaehee Yoo \ 483ba3579e6STaehee Yoo aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ 484ba3579e6STaehee Yoo y0, y1, y2, y3, y4, y5, y6, y7); \ 485ba3579e6STaehee Yoo \ 486ba3579e6STaehee Yoo aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \ 487ba3579e6STaehee Yoo aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \ 488ba3579e6STaehee Yoo aria_store_state_8way(x0, x1, x2, x3, \ 489ba3579e6STaehee Yoo x4, x5, x6, x7, \ 490ba3579e6STaehee Yoo mem_tmp, 8); \ 491ba3579e6STaehee Yoo \ 492ba3579e6STaehee Yoo aria_load_state_8way(x0, x1, x2, x3, \ 493ba3579e6STaehee Yoo x4, x5, x6, x7, \ 494ba3579e6STaehee Yoo mem_tmp, 0); \ 495ba3579e6STaehee Yoo aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ 4968b844753STaehee Yoo y0, y7, y2, rk, 0, round); \ 497ba3579e6STaehee Yoo \ 498ba3579e6STaehee Yoo aria_sbox_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ 499ba3579e6STaehee Yoo y0, y1, y2, y3, y4, y5, y6, y7); \ 500ba3579e6STaehee Yoo \ 501ba3579e6STaehee Yoo aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \ 502ba3579e6STaehee Yoo aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \ 503ba3579e6STaehee Yoo aria_store_state_8way(x0, x1, x2, x3, \ 504ba3579e6STaehee Yoo x4, x5, x6, x7, \ 505ba3579e6STaehee Yoo mem_tmp, 0); \ 506ba3579e6STaehee Yoo aria_load_state_8way(y0, y1, y2, y3, \ 507ba3579e6STaehee Yoo y4, y5, y6, y7, \ 508ba3579e6STaehee Yoo mem_tmp, 8); \ 509ba3579e6STaehee Yoo aria_diff_word(x0, x1, x2, x3, \ 510ba3579e6STaehee Yoo x4, x5, x6, x7, \ 511ba3579e6STaehee Yoo y0, y1, y2, y3, \ 512ba3579e6STaehee Yoo y4, y5, y6, y7); \ 513ba3579e6STaehee Yoo /* aria_diff_byte() \ 514ba3579e6STaehee Yoo * T1 = ABCD -> BADC \ 515ba3579e6STaehee Yoo * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \ 516ba3579e6STaehee Yoo * T2 = ABCD -> CDAB \ 517ba3579e6STaehee Yoo * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \ 518ba3579e6STaehee Yoo * T3 = ABCD -> DCBA \ 519ba3579e6STaehee Yoo * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \ 520ba3579e6STaehee Yoo */ \ 521ba3579e6STaehee Yoo aria_diff_word(x0, x1, x2, x3, \ 522ba3579e6STaehee Yoo x5, x4, x7, x6, \ 523ba3579e6STaehee Yoo y2, y3, y0, y1, \ 524ba3579e6STaehee Yoo y7, y6, y5, y4); \ 525ba3579e6STaehee Yoo 
aria_store_state_8way(x3, x2, x1, x0, \ 526ba3579e6STaehee Yoo x6, x7, x4, x5, \ 527ba3579e6STaehee Yoo mem_tmp, 0); 528ba3579e6STaehee Yoo 529ba3579e6STaehee Yoo#define aria_ff(x0, x1, x2, x3, \ 530ba3579e6STaehee Yoo x4, x5, x6, x7, \ 531ba3579e6STaehee Yoo y0, y1, y2, y3, \ 532ba3579e6STaehee Yoo y4, y5, y6, y7, \ 533ba3579e6STaehee Yoo mem_tmp, rk, round, last_round) \ 5348b844753STaehee Yoo vpxor y7, y7, y7; \ 535ba3579e6STaehee Yoo aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ 5368b844753STaehee Yoo y0, y7, y2, rk, 8, round); \ 537ba3579e6STaehee Yoo \ 538ba3579e6STaehee Yoo aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \ 539ba3579e6STaehee Yoo y0, y1, y2, y3, y4, y5, y6, y7); \ 540ba3579e6STaehee Yoo \ 541ba3579e6STaehee Yoo aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ 5428b844753STaehee Yoo y0, y7, y2, rk, 8, last_round); \ 543ba3579e6STaehee Yoo \ 544ba3579e6STaehee Yoo aria_store_state_8way(x0, x1, x2, x3, \ 545ba3579e6STaehee Yoo x4, x5, x6, x7, \ 546ba3579e6STaehee Yoo mem_tmp, 8); \ 547ba3579e6STaehee Yoo \ 548ba3579e6STaehee Yoo aria_load_state_8way(x0, x1, x2, x3, \ 549ba3579e6STaehee Yoo x4, x5, x6, x7, \ 550ba3579e6STaehee Yoo mem_tmp, 0); \ 551ba3579e6STaehee Yoo aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ 5528b844753STaehee Yoo y0, y7, y2, rk, 0, round); \ 553ba3579e6STaehee Yoo \ 554ba3579e6STaehee Yoo aria_sbox_8way(x2, x3, x0, x1, x6, x7, x4, x5, \ 555ba3579e6STaehee Yoo y0, y1, y2, y3, y4, y5, y6, y7); \ 556ba3579e6STaehee Yoo \ 557ba3579e6STaehee Yoo aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ 5588b844753STaehee Yoo y0, y7, y2, rk, 0, last_round); \ 559ba3579e6STaehee Yoo \ 560ba3579e6STaehee Yoo aria_load_state_8way(y0, y1, y2, y3, \ 561ba3579e6STaehee Yoo y4, y5, y6, y7, \ 562ba3579e6STaehee Yoo mem_tmp, 8); 563ba3579e6STaehee Yoo 564e3cf2f87STaehee Yoo#ifdef CONFIG_AS_GFNI 565ba3579e6STaehee Yoo#define aria_fe_gfni(x0, x1, x2, x3, \ 566ba3579e6STaehee Yoo x4, x5, x6, x7, \ 567ba3579e6STaehee Yoo y0, y1, y2, y3, \ 568ba3579e6STaehee Yoo y4, y5, y6, y7, \ 569ba3579e6STaehee Yoo mem_tmp, rk, round) \ 5708b844753STaehee Yoo vpxor y7, y7, y7; \ 571ba3579e6STaehee Yoo aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ 5728b844753STaehee Yoo y0, y7, y2, rk, 8, round); \ 573ba3579e6STaehee Yoo \ 574ba3579e6STaehee Yoo aria_sbox_8way_gfni(x2, x3, x0, x1, \ 575ba3579e6STaehee Yoo x6, x7, x4, x5, \ 576ba3579e6STaehee Yoo y0, y1, y2, y3, \ 577ba3579e6STaehee Yoo y4, y5, y6, y7); \ 578ba3579e6STaehee Yoo \ 579ba3579e6STaehee Yoo aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \ 580ba3579e6STaehee Yoo aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \ 581ba3579e6STaehee Yoo aria_store_state_8way(x0, x1, x2, x3, \ 582ba3579e6STaehee Yoo x4, x5, x6, x7, \ 583ba3579e6STaehee Yoo mem_tmp, 8); \ 584ba3579e6STaehee Yoo \ 585ba3579e6STaehee Yoo aria_load_state_8way(x0, x1, x2, x3, \ 586ba3579e6STaehee Yoo x4, x5, x6, x7, \ 587ba3579e6STaehee Yoo mem_tmp, 0); \ 588ba3579e6STaehee Yoo aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ 5898b844753STaehee Yoo y0, y7, y2, rk, 0, round); \ 590ba3579e6STaehee Yoo \ 591ba3579e6STaehee Yoo aria_sbox_8way_gfni(x2, x3, x0, x1, \ 592ba3579e6STaehee Yoo x6, x7, x4, x5, \ 593ba3579e6STaehee Yoo y0, y1, y2, y3, \ 594ba3579e6STaehee Yoo y4, y5, y6, y7); \ 595ba3579e6STaehee Yoo \ 596ba3579e6STaehee Yoo aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \ 597ba3579e6STaehee Yoo aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \ 598ba3579e6STaehee Yoo aria_store_state_8way(x0, x1, x2, x3, \ 599ba3579e6STaehee Yoo x4, x5, x6, x7, \ 600ba3579e6STaehee Yoo 
mem_tmp, 0); \ 601ba3579e6STaehee Yoo aria_load_state_8way(y0, y1, y2, y3, \ 602ba3579e6STaehee Yoo y4, y5, y6, y7, \ 603ba3579e6STaehee Yoo mem_tmp, 8); \ 604ba3579e6STaehee Yoo aria_diff_word(x0, x1, x2, x3, \ 605ba3579e6STaehee Yoo x4, x5, x6, x7, \ 606ba3579e6STaehee Yoo y0, y1, y2, y3, \ 607ba3579e6STaehee Yoo y4, y5, y6, y7); \ 608ba3579e6STaehee Yoo /* aria_diff_byte() \ 609ba3579e6STaehee Yoo * T3 = ABCD -> BADC \ 610ba3579e6STaehee Yoo * T3 = y4, y5, y6, y7 -> y5, y4, y7, y6 \ 611ba3579e6STaehee Yoo * T0 = ABCD -> CDAB \ 612ba3579e6STaehee Yoo * T0 = x0, x1, x2, x3 -> x2, x3, x0, x1 \ 613ba3579e6STaehee Yoo * T1 = ABCD -> DCBA \ 614ba3579e6STaehee Yoo * T1 = x4, x5, x6, x7 -> x7, x6, x5, x4 \ 615ba3579e6STaehee Yoo */ \ 616ba3579e6STaehee Yoo aria_diff_word(x2, x3, x0, x1, \ 617ba3579e6STaehee Yoo x7, x6, x5, x4, \ 618ba3579e6STaehee Yoo y0, y1, y2, y3, \ 619ba3579e6STaehee Yoo y5, y4, y7, y6); \ 620ba3579e6STaehee Yoo aria_store_state_8way(x3, x2, x1, x0, \ 621ba3579e6STaehee Yoo x6, x7, x4, x5, \ 622ba3579e6STaehee Yoo mem_tmp, 0); 623ba3579e6STaehee Yoo 624ba3579e6STaehee Yoo#define aria_fo_gfni(x0, x1, x2, x3, \ 625ba3579e6STaehee Yoo x4, x5, x6, x7, \ 626ba3579e6STaehee Yoo y0, y1, y2, y3, \ 627ba3579e6STaehee Yoo y4, y5, y6, y7, \ 628ba3579e6STaehee Yoo mem_tmp, rk, round) \ 6298b844753STaehee Yoo vpxor y7, y7, y7; \ 630ba3579e6STaehee Yoo aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ 6318b844753STaehee Yoo y0, y7, y2, rk, 8, round); \ 632ba3579e6STaehee Yoo \ 633ba3579e6STaehee Yoo aria_sbox_8way_gfni(x0, x1, x2, x3, \ 634ba3579e6STaehee Yoo x4, x5, x6, x7, \ 635ba3579e6STaehee Yoo y0, y1, y2, y3, \ 636ba3579e6STaehee Yoo y4, y5, y6, y7); \ 637ba3579e6STaehee Yoo \ 638ba3579e6STaehee Yoo aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \ 639ba3579e6STaehee Yoo aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \ 640ba3579e6STaehee Yoo aria_store_state_8way(x0, x1, x2, x3, \ 641ba3579e6STaehee Yoo x4, x5, x6, x7, \ 642ba3579e6STaehee Yoo mem_tmp, 8); \ 643ba3579e6STaehee Yoo \ 644ba3579e6STaehee Yoo aria_load_state_8way(x0, x1, x2, x3, \ 645ba3579e6STaehee Yoo x4, x5, x6, x7, \ 646ba3579e6STaehee Yoo mem_tmp, 0); \ 647ba3579e6STaehee Yoo aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ 6488b844753STaehee Yoo y0, y7, y2, rk, 0, round); \ 649ba3579e6STaehee Yoo \ 650ba3579e6STaehee Yoo aria_sbox_8way_gfni(x0, x1, x2, x3, \ 651ba3579e6STaehee Yoo x4, x5, x6, x7, \ 652ba3579e6STaehee Yoo y0, y1, y2, y3, \ 653ba3579e6STaehee Yoo y4, y5, y6, y7); \ 654ba3579e6STaehee Yoo \ 655ba3579e6STaehee Yoo aria_diff_m(x0, x1, x2, x3, y0, y1, y2, y3); \ 656ba3579e6STaehee Yoo aria_diff_m(x4, x5, x6, x7, y0, y1, y2, y3); \ 657ba3579e6STaehee Yoo aria_store_state_8way(x0, x1, x2, x3, \ 658ba3579e6STaehee Yoo x4, x5, x6, x7, \ 659ba3579e6STaehee Yoo mem_tmp, 0); \ 660ba3579e6STaehee Yoo aria_load_state_8way(y0, y1, y2, y3, \ 661ba3579e6STaehee Yoo y4, y5, y6, y7, \ 662ba3579e6STaehee Yoo mem_tmp, 8); \ 663ba3579e6STaehee Yoo aria_diff_word(x0, x1, x2, x3, \ 664ba3579e6STaehee Yoo x4, x5, x6, x7, \ 665ba3579e6STaehee Yoo y0, y1, y2, y3, \ 666ba3579e6STaehee Yoo y4, y5, y6, y7); \ 667ba3579e6STaehee Yoo /* aria_diff_byte() \ 668ba3579e6STaehee Yoo * T1 = ABCD -> BADC \ 669ba3579e6STaehee Yoo * T1 = x4, x5, x6, x7 -> x5, x4, x7, x6 \ 670ba3579e6STaehee Yoo * T2 = ABCD -> CDAB \ 671ba3579e6STaehee Yoo * T2 = y0, y1, y2, y3, -> y2, y3, y0, y1 \ 672ba3579e6STaehee Yoo * T3 = ABCD -> DCBA \ 673ba3579e6STaehee Yoo * T3 = y4, y5, y6, y7 -> y7, y6, y5, y4 \ 674ba3579e6STaehee Yoo */ \ 675ba3579e6STaehee Yoo 
aria_diff_word(x0, x1, x2, x3, \ 676ba3579e6STaehee Yoo x5, x4, x7, x6, \ 677ba3579e6STaehee Yoo y2, y3, y0, y1, \ 678ba3579e6STaehee Yoo y7, y6, y5, y4); \ 679ba3579e6STaehee Yoo aria_store_state_8way(x3, x2, x1, x0, \ 680ba3579e6STaehee Yoo x6, x7, x4, x5, \ 681ba3579e6STaehee Yoo mem_tmp, 0); 682ba3579e6STaehee Yoo 683ba3579e6STaehee Yoo#define aria_ff_gfni(x0, x1, x2, x3, \ 684ba3579e6STaehee Yoo x4, x5, x6, x7, \ 685ba3579e6STaehee Yoo y0, y1, y2, y3, \ 686ba3579e6STaehee Yoo y4, y5, y6, y7, \ 687ba3579e6STaehee Yoo mem_tmp, rk, round, last_round) \ 6888b844753STaehee Yoo vpxor y7, y7, y7; \ 689ba3579e6STaehee Yoo aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ 6908b844753STaehee Yoo y0, y7, y2, rk, 8, round); \ 691ba3579e6STaehee Yoo \ 692ba3579e6STaehee Yoo aria_sbox_8way_gfni(x2, x3, x0, x1, \ 693ba3579e6STaehee Yoo x6, x7, x4, x5, \ 694ba3579e6STaehee Yoo y0, y1, y2, y3, \ 695ba3579e6STaehee Yoo y4, y5, y6, y7); \ 696ba3579e6STaehee Yoo \ 697ba3579e6STaehee Yoo aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ 6988b844753STaehee Yoo y0, y7, y2, rk, 8, last_round); \ 699ba3579e6STaehee Yoo \ 700ba3579e6STaehee Yoo aria_store_state_8way(x0, x1, x2, x3, \ 701ba3579e6STaehee Yoo x4, x5, x6, x7, \ 702ba3579e6STaehee Yoo mem_tmp, 8); \ 703ba3579e6STaehee Yoo \ 704ba3579e6STaehee Yoo aria_load_state_8way(x0, x1, x2, x3, \ 705ba3579e6STaehee Yoo x4, x5, x6, x7, \ 706ba3579e6STaehee Yoo mem_tmp, 0); \ 707ba3579e6STaehee Yoo aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ 7088b844753STaehee Yoo y0, y7, y2, rk, 0, round); \ 709ba3579e6STaehee Yoo \ 710ba3579e6STaehee Yoo aria_sbox_8way_gfni(x2, x3, x0, x1, \ 711ba3579e6STaehee Yoo x6, x7, x4, x5, \ 712ba3579e6STaehee Yoo y0, y1, y2, y3, \ 713ba3579e6STaehee Yoo y4, y5, y6, y7); \ 714ba3579e6STaehee Yoo \ 715ba3579e6STaehee Yoo aria_ark_8way(x0, x1, x2, x3, x4, x5, x6, x7, \ 7168b844753STaehee Yoo y0, y7, y2, rk, 0, last_round); \ 717ba3579e6STaehee Yoo \ 718ba3579e6STaehee Yoo aria_load_state_8way(y0, y1, y2, y3, \ 719ba3579e6STaehee Yoo y4, y5, y6, y7, \ 720ba3579e6STaehee Yoo mem_tmp, 8); 721ba3579e6STaehee Yoo 722e3cf2f87STaehee Yoo#endif /* CONFIG_AS_GFNI */ 723e3cf2f87STaehee Yoo 724ba3579e6STaehee Yoo/* NB: section is mergeable, all elements must be aligned 16-byte blocks */ 725ba3579e6STaehee Yoo.section .rodata.cst16, "aM", @progbits, 16 726ba3579e6STaehee Yoo.align 16 727ba3579e6STaehee Yoo 728ba3579e6STaehee Yoo#define SHUFB_BYTES(idx) \ 729ba3579e6STaehee Yoo 0 + (idx), 4 + (idx), 8 + (idx), 12 + (idx) 730ba3579e6STaehee Yoo 731ba3579e6STaehee Yoo.Lshufb_16x16b: 732ba3579e6STaehee Yoo .byte SHUFB_BYTES(0), SHUFB_BYTES(1), SHUFB_BYTES(2), SHUFB_BYTES(3); 733ba3579e6STaehee Yoo/* For isolating SubBytes from AESENCLAST, inverse shift row */ 734ba3579e6STaehee Yoo.Linv_shift_row: 735ba3579e6STaehee Yoo .byte 0x00, 0x0d, 0x0a, 0x07, 0x04, 0x01, 0x0e, 0x0b 736ba3579e6STaehee Yoo .byte 0x08, 0x05, 0x02, 0x0f, 0x0c, 0x09, 0x06, 0x03 737ba3579e6STaehee Yoo.Lshift_row: 738ba3579e6STaehee Yoo .byte 0x00, 0x05, 0x0a, 0x0f, 0x04, 0x09, 0x0e, 0x03 739ba3579e6STaehee Yoo .byte 0x08, 0x0d, 0x02, 0x07, 0x0c, 0x01, 0x06, 0x0b 740ba3579e6STaehee Yoo/* For CTR-mode IV byteswap */ 741ba3579e6STaehee Yoo.Lbswap128_mask: 742ba3579e6STaehee Yoo .byte 0x0f, 0x0e, 0x0d, 0x0c, 0x0b, 0x0a, 0x09, 0x08 743ba3579e6STaehee Yoo .byte 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00 744ba3579e6STaehee Yoo 745ba3579e6STaehee Yoo/* AES inverse affine and S2 combined: 746ba3579e6STaehee Yoo * 1 1 0 0 0 0 0 1 x0 0 747ba3579e6STaehee Yoo * 0 1 0 0 1 0 0 0 x1 0 
748ba3579e6STaehee Yoo * 1 1 0 0 1 1 1 1 x2 0 749ba3579e6STaehee Yoo * 0 1 1 0 1 0 0 1 x3 1 750ba3579e6STaehee Yoo * 0 1 0 0 1 1 0 0 * x4 + 0 751ba3579e6STaehee Yoo * 0 1 0 1 1 0 0 0 x5 0 752ba3579e6STaehee Yoo * 0 0 0 0 0 1 0 1 x6 0 753ba3579e6STaehee Yoo * 1 1 1 0 0 1 1 1 x7 1 754ba3579e6STaehee Yoo */ 755ba3579e6STaehee Yoo.Ltf_lo__inv_aff__and__s2: 756ba3579e6STaehee Yoo .octa 0x92172DA81A9FA520B2370D883ABF8500 757ba3579e6STaehee Yoo.Ltf_hi__inv_aff__and__s2: 758ba3579e6STaehee Yoo .octa 0x2B15FFC1AF917B45E6D8320C625CB688 759ba3579e6STaehee Yoo 760ba3579e6STaehee Yoo/* X2 and AES forward affine combined: 761ba3579e6STaehee Yoo * 1 0 1 1 0 0 0 1 x0 0 762ba3579e6STaehee Yoo * 0 1 1 1 1 0 1 1 x1 0 763ba3579e6STaehee Yoo * 0 0 0 1 1 0 1 0 x2 1 764ba3579e6STaehee Yoo * 0 1 0 0 0 1 0 0 x3 0 765ba3579e6STaehee Yoo * 0 0 1 1 1 0 1 1 * x4 + 0 766ba3579e6STaehee Yoo * 0 1 0 0 1 0 0 0 x5 0 767ba3579e6STaehee Yoo * 1 1 0 1 0 0 1 1 x6 0 768ba3579e6STaehee Yoo * 0 1 0 0 1 0 1 0 x7 0 769ba3579e6STaehee Yoo */ 770ba3579e6STaehee Yoo.Ltf_lo__x2__and__fwd_aff: 771ba3579e6STaehee Yoo .octa 0xEFAE0544FCBD1657B8F95213ABEA4100 772ba3579e6STaehee Yoo.Ltf_hi__x2__and__fwd_aff: 773ba3579e6STaehee Yoo .octa 0x3F893781E95FE1576CDA64D2BA0CB204 774ba3579e6STaehee Yoo 775e3cf2f87STaehee Yoo#ifdef CONFIG_AS_GFNI 776ba3579e6STaehee Yoo/* AES affine: */ 777ba3579e6STaehee Yoo#define tf_aff_const BV8(1, 1, 0, 0, 0, 1, 1, 0) 778ba3579e6STaehee Yoo.Ltf_aff_bitmatrix: 779ba3579e6STaehee Yoo .quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1), 780ba3579e6STaehee Yoo BV8(1, 1, 0, 0, 0, 1, 1, 1), 781ba3579e6STaehee Yoo BV8(1, 1, 1, 0, 0, 0, 1, 1), 782ba3579e6STaehee Yoo BV8(1, 1, 1, 1, 0, 0, 0, 1), 783ba3579e6STaehee Yoo BV8(1, 1, 1, 1, 1, 0, 0, 0), 784ba3579e6STaehee Yoo BV8(0, 1, 1, 1, 1, 1, 0, 0), 785ba3579e6STaehee Yoo BV8(0, 0, 1, 1, 1, 1, 1, 0), 786ba3579e6STaehee Yoo BV8(0, 0, 0, 1, 1, 1, 1, 1)) 7878b844753STaehee Yoo .quad BM8X8(BV8(1, 0, 0, 0, 1, 1, 1, 1), 7888b844753STaehee Yoo BV8(1, 1, 0, 0, 0, 1, 1, 1), 7898b844753STaehee Yoo BV8(1, 1, 1, 0, 0, 0, 1, 1), 7908b844753STaehee Yoo BV8(1, 1, 1, 1, 0, 0, 0, 1), 7918b844753STaehee Yoo BV8(1, 1, 1, 1, 1, 0, 0, 0), 7928b844753STaehee Yoo BV8(0, 1, 1, 1, 1, 1, 0, 0), 7938b844753STaehee Yoo BV8(0, 0, 1, 1, 1, 1, 1, 0), 7948b844753STaehee Yoo BV8(0, 0, 0, 1, 1, 1, 1, 1)) 795ba3579e6STaehee Yoo 796ba3579e6STaehee Yoo/* AES inverse affine: */ 797ba3579e6STaehee Yoo#define tf_inv_const BV8(1, 0, 1, 0, 0, 0, 0, 0) 798ba3579e6STaehee Yoo.Ltf_inv_bitmatrix: 799ba3579e6STaehee Yoo .quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1), 800ba3579e6STaehee Yoo BV8(1, 0, 0, 1, 0, 0, 1, 0), 801ba3579e6STaehee Yoo BV8(0, 1, 0, 0, 1, 0, 0, 1), 802ba3579e6STaehee Yoo BV8(1, 0, 1, 0, 0, 1, 0, 0), 803ba3579e6STaehee Yoo BV8(0, 1, 0, 1, 0, 0, 1, 0), 804ba3579e6STaehee Yoo BV8(0, 0, 1, 0, 1, 0, 0, 1), 805ba3579e6STaehee Yoo BV8(1, 0, 0, 1, 0, 1, 0, 0), 806ba3579e6STaehee Yoo BV8(0, 1, 0, 0, 1, 0, 1, 0)) 8078b844753STaehee Yoo .quad BM8X8(BV8(0, 0, 1, 0, 0, 1, 0, 1), 8088b844753STaehee Yoo BV8(1, 0, 0, 1, 0, 0, 1, 0), 8098b844753STaehee Yoo BV8(0, 1, 0, 0, 1, 0, 0, 1), 8108b844753STaehee Yoo BV8(1, 0, 1, 0, 0, 1, 0, 0), 8118b844753STaehee Yoo BV8(0, 1, 0, 1, 0, 0, 1, 0), 8128b844753STaehee Yoo BV8(0, 0, 1, 0, 1, 0, 0, 1), 8138b844753STaehee Yoo BV8(1, 0, 0, 1, 0, 1, 0, 0), 8148b844753STaehee Yoo BV8(0, 1, 0, 0, 1, 0, 1, 0)) 815ba3579e6STaehee Yoo 816ba3579e6STaehee Yoo/* S2: */ 817ba3579e6STaehee Yoo#define tf_s2_const BV8(0, 1, 0, 0, 0, 1, 1, 1) 818ba3579e6STaehee Yoo.Ltf_s2_bitmatrix: 819ba3579e6STaehee Yoo .quad 
BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1), 820ba3579e6STaehee Yoo BV8(0, 0, 1, 1, 1, 1, 1, 1), 821ba3579e6STaehee Yoo BV8(1, 1, 1, 0, 1, 1, 0, 1), 822ba3579e6STaehee Yoo BV8(1, 1, 0, 0, 0, 0, 1, 1), 823ba3579e6STaehee Yoo BV8(0, 1, 0, 0, 0, 0, 1, 1), 824ba3579e6STaehee Yoo BV8(1, 1, 0, 0, 1, 1, 1, 0), 825ba3579e6STaehee Yoo BV8(0, 1, 1, 0, 0, 0, 1, 1), 826ba3579e6STaehee Yoo BV8(1, 1, 1, 1, 0, 1, 1, 0)) 8278b844753STaehee Yoo .quad BM8X8(BV8(0, 1, 0, 1, 0, 1, 1, 1), 8288b844753STaehee Yoo BV8(0, 0, 1, 1, 1, 1, 1, 1), 8298b844753STaehee Yoo BV8(1, 1, 1, 0, 1, 1, 0, 1), 8308b844753STaehee Yoo BV8(1, 1, 0, 0, 0, 0, 1, 1), 8318b844753STaehee Yoo BV8(0, 1, 0, 0, 0, 0, 1, 1), 8328b844753STaehee Yoo BV8(1, 1, 0, 0, 1, 1, 1, 0), 8338b844753STaehee Yoo BV8(0, 1, 1, 0, 0, 0, 1, 1), 8348b844753STaehee Yoo BV8(1, 1, 1, 1, 0, 1, 1, 0)) 835ba3579e6STaehee Yoo 836ba3579e6STaehee Yoo/* X2: */ 837ba3579e6STaehee Yoo#define tf_x2_const BV8(0, 0, 1, 1, 0, 1, 0, 0) 838ba3579e6STaehee Yoo.Ltf_x2_bitmatrix: 839ba3579e6STaehee Yoo .quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0), 840ba3579e6STaehee Yoo BV8(0, 0, 1, 0, 0, 1, 1, 0), 841ba3579e6STaehee Yoo BV8(0, 0, 0, 0, 1, 0, 1, 0), 842ba3579e6STaehee Yoo BV8(1, 1, 1, 0, 0, 0, 1, 1), 843ba3579e6STaehee Yoo BV8(1, 1, 1, 0, 1, 1, 0, 0), 844ba3579e6STaehee Yoo BV8(0, 1, 1, 0, 1, 0, 1, 1), 845ba3579e6STaehee Yoo BV8(1, 0, 1, 1, 1, 1, 0, 1), 846ba3579e6STaehee Yoo BV8(1, 0, 0, 1, 0, 0, 1, 1)) 8478b844753STaehee Yoo .quad BM8X8(BV8(0, 0, 0, 1, 1, 0, 0, 0), 8488b844753STaehee Yoo BV8(0, 0, 1, 0, 0, 1, 1, 0), 8498b844753STaehee Yoo BV8(0, 0, 0, 0, 1, 0, 1, 0), 8508b844753STaehee Yoo BV8(1, 1, 1, 0, 0, 0, 1, 1), 8518b844753STaehee Yoo BV8(1, 1, 1, 0, 1, 1, 0, 0), 8528b844753STaehee Yoo BV8(0, 1, 1, 0, 1, 0, 1, 1), 8538b844753STaehee Yoo BV8(1, 0, 1, 1, 1, 1, 0, 1), 8548b844753STaehee Yoo BV8(1, 0, 0, 1, 0, 0, 1, 1)) 855ba3579e6STaehee Yoo 856ba3579e6STaehee Yoo/* Identity matrix: */ 857ba3579e6STaehee Yoo.Ltf_id_bitmatrix: 858ba3579e6STaehee Yoo .quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0), 859ba3579e6STaehee Yoo BV8(0, 1, 0, 0, 0, 0, 0, 0), 860ba3579e6STaehee Yoo BV8(0, 0, 1, 0, 0, 0, 0, 0), 861ba3579e6STaehee Yoo BV8(0, 0, 0, 1, 0, 0, 0, 0), 862ba3579e6STaehee Yoo BV8(0, 0, 0, 0, 1, 0, 0, 0), 863ba3579e6STaehee Yoo BV8(0, 0, 0, 0, 0, 1, 0, 0), 864ba3579e6STaehee Yoo BV8(0, 0, 0, 0, 0, 0, 1, 0), 865ba3579e6STaehee Yoo BV8(0, 0, 0, 0, 0, 0, 0, 1)) 8668b844753STaehee Yoo .quad BM8X8(BV8(1, 0, 0, 0, 0, 0, 0, 0), 8678b844753STaehee Yoo BV8(0, 1, 0, 0, 0, 0, 0, 0), 8688b844753STaehee Yoo BV8(0, 0, 1, 0, 0, 0, 0, 0), 8698b844753STaehee Yoo BV8(0, 0, 0, 1, 0, 0, 0, 0), 8708b844753STaehee Yoo BV8(0, 0, 0, 0, 1, 0, 0, 0), 8718b844753STaehee Yoo BV8(0, 0, 0, 0, 0, 1, 0, 0), 8728b844753STaehee Yoo BV8(0, 0, 0, 0, 0, 0, 1, 0), 8738b844753STaehee Yoo BV8(0, 0, 0, 0, 0, 0, 0, 1)) 874e3cf2f87STaehee Yoo#endif /* CONFIG_AS_GFNI */ 875ba3579e6STaehee Yoo 876ba3579e6STaehee Yoo/* 4-bit mask */ 877ba3579e6STaehee Yoo.section .rodata.cst4.L0f0f0f0f, "aM", @progbits, 4 878ba3579e6STaehee Yoo.align 4 879ba3579e6STaehee Yoo.L0f0f0f0f: 880ba3579e6STaehee Yoo .long 0x0f0f0f0f 881ba3579e6STaehee Yoo 882ba3579e6STaehee Yoo.text 883ba3579e6STaehee Yoo 884ba3579e6STaehee YooSYM_FUNC_START_LOCAL(__aria_aesni_avx_crypt_16way) 885ba3579e6STaehee Yoo /* input: 886ba3579e6STaehee Yoo * %r9: rk 887ba3579e6STaehee Yoo * %rsi: dst 888ba3579e6STaehee Yoo * %rdx: src 889ba3579e6STaehee Yoo * %xmm0..%xmm15: 16 byte-sliced blocks 890ba3579e6STaehee Yoo */ 891ba3579e6STaehee Yoo 892ba3579e6STaehee Yoo FRAME_BEGIN 893ba3579e6STaehee 
Yoo 894ba3579e6STaehee Yoo movq %rsi, %rax; 895ba3579e6STaehee Yoo leaq 8 * 16(%rax), %r8; 896ba3579e6STaehee Yoo 897ba3579e6STaehee Yoo inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, 898ba3579e6STaehee Yoo %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 899ba3579e6STaehee Yoo %xmm15, %rax, %r8); 900ba3579e6STaehee Yoo aria_fo(%xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, %xmm15, 901ba3579e6STaehee Yoo %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, 902ba3579e6STaehee Yoo %rax, %r9, 0); 903ba3579e6STaehee Yoo aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7, 904ba3579e6STaehee Yoo %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 905ba3579e6STaehee Yoo %xmm15, %rax, %r9, 1); 906ba3579e6STaehee Yoo aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15, 907ba3579e6STaehee Yoo %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, 908ba3579e6STaehee Yoo %rax, %r9, 2); 909ba3579e6STaehee Yoo aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7, 910ba3579e6STaehee Yoo %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 911ba3579e6STaehee Yoo %xmm15, %rax, %r9, 3); 912ba3579e6STaehee Yoo aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15, 913ba3579e6STaehee Yoo %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, 914ba3579e6STaehee Yoo %rax, %r9, 4); 915ba3579e6STaehee Yoo aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7, 916ba3579e6STaehee Yoo %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 917ba3579e6STaehee Yoo %xmm15, %rax, %r9, 5); 918ba3579e6STaehee Yoo aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15, 919ba3579e6STaehee Yoo %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, 920ba3579e6STaehee Yoo %rax, %r9, 6); 921ba3579e6STaehee Yoo aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7, 922ba3579e6STaehee Yoo %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 923ba3579e6STaehee Yoo %xmm15, %rax, %r9, 7); 924ba3579e6STaehee Yoo aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15, 925ba3579e6STaehee Yoo %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, 926ba3579e6STaehee Yoo %rax, %r9, 8); 927ba3579e6STaehee Yoo aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7, 928ba3579e6STaehee Yoo %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 929ba3579e6STaehee Yoo %xmm15, %rax, %r9, 9); 930ba3579e6STaehee Yoo aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15, 931ba3579e6STaehee Yoo %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, 932ba3579e6STaehee Yoo %rax, %r9, 10); 93335344cf3STaehee Yoo cmpl $12, ARIA_CTX_rounds(CTX); 934ba3579e6STaehee Yoo jne .Laria_192; 935ba3579e6STaehee Yoo aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7, 936ba3579e6STaehee Yoo %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 937ba3579e6STaehee Yoo %xmm15, %rax, %r9, 11, 12); 938ba3579e6STaehee Yoo jmp .Laria_end; 939ba3579e6STaehee Yoo.Laria_192: 940ba3579e6STaehee Yoo aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7, 941ba3579e6STaehee Yoo %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 942ba3579e6STaehee Yoo %xmm15, %rax, %r9, 11); 943ba3579e6STaehee Yoo aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15, 944ba3579e6STaehee Yoo %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, 945ba3579e6STaehee Yoo %rax, %r9, 12); 94635344cf3STaehee Yoo cmpl $14, ARIA_CTX_rounds(CTX); 947ba3579e6STaehee Yoo jne .Laria_256; 948ba3579e6STaehee 
Yoo aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7, 949ba3579e6STaehee Yoo %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 950ba3579e6STaehee Yoo %xmm15, %rax, %r9, 13, 14); 951ba3579e6STaehee Yoo jmp .Laria_end; 952ba3579e6STaehee Yoo.Laria_256: 953ba3579e6STaehee Yoo aria_fe(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7, 954ba3579e6STaehee Yoo %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 955ba3579e6STaehee Yoo %xmm15, %rax, %r9, 13); 956ba3579e6STaehee Yoo aria_fo(%xmm9, %xmm8, %xmm11, %xmm10, %xmm12, %xmm13, %xmm14, %xmm15, 957ba3579e6STaehee Yoo %xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, 958ba3579e6STaehee Yoo %rax, %r9, 14); 959ba3579e6STaehee Yoo aria_ff(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7, 960ba3579e6STaehee Yoo %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 961ba3579e6STaehee Yoo %xmm15, %rax, %r9, 15, 16); 962ba3579e6STaehee Yoo.Laria_end: 963ba3579e6STaehee Yoo debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4, 964ba3579e6STaehee Yoo %xmm9, %xmm13, %xmm0, %xmm5, 965ba3579e6STaehee Yoo %xmm10, %xmm14, %xmm3, %xmm6, 966ba3579e6STaehee Yoo %xmm11, %xmm15, %xmm2, %xmm7, 967ba3579e6STaehee Yoo (%rax), (%r8)); 968ba3579e6STaehee Yoo 969ba3579e6STaehee Yoo FRAME_END 970ba3579e6STaehee Yoo RET; 971ba3579e6STaehee YooSYM_FUNC_END(__aria_aesni_avx_crypt_16way) 972ba3579e6STaehee Yoo 973c67b553aSEric BiggersSYM_TYPED_FUNC_START(aria_aesni_avx_encrypt_16way) 974ba3579e6STaehee Yoo /* input: 975ba3579e6STaehee Yoo * %rdi: ctx, CTX 976ba3579e6STaehee Yoo * %rsi: dst 977ba3579e6STaehee Yoo * %rdx: src 978ba3579e6STaehee Yoo */ 979ba3579e6STaehee Yoo 980ba3579e6STaehee Yoo FRAME_BEGIN 981ba3579e6STaehee Yoo 98235344cf3STaehee Yoo leaq ARIA_CTX_enc_key(CTX), %r9; 983ba3579e6STaehee Yoo 984ba3579e6STaehee Yoo inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, 985ba3579e6STaehee Yoo %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 986ba3579e6STaehee Yoo %xmm15, %rdx); 987ba3579e6STaehee Yoo 988ba3579e6STaehee Yoo call __aria_aesni_avx_crypt_16way; 989ba3579e6STaehee Yoo 990ba3579e6STaehee Yoo write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7, 991ba3579e6STaehee Yoo %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 992ba3579e6STaehee Yoo %xmm15, %rax); 993ba3579e6STaehee Yoo 994ba3579e6STaehee Yoo FRAME_END 995ba3579e6STaehee Yoo RET; 996ba3579e6STaehee YooSYM_FUNC_END(aria_aesni_avx_encrypt_16way) 997ba3579e6STaehee Yoo 998c67b553aSEric BiggersSYM_TYPED_FUNC_START(aria_aesni_avx_decrypt_16way) 999ba3579e6STaehee Yoo /* input: 1000ba3579e6STaehee Yoo * %rdi: ctx, CTX 1001ba3579e6STaehee Yoo * %rsi: dst 1002ba3579e6STaehee Yoo * %rdx: src 1003ba3579e6STaehee Yoo */ 1004ba3579e6STaehee Yoo 1005ba3579e6STaehee Yoo FRAME_BEGIN 1006ba3579e6STaehee Yoo 100735344cf3STaehee Yoo leaq ARIA_CTX_dec_key(CTX), %r9; 1008ba3579e6STaehee Yoo 1009ba3579e6STaehee Yoo inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7, 1010ba3579e6STaehee Yoo %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 1011ba3579e6STaehee Yoo %xmm15, %rdx); 1012ba3579e6STaehee Yoo 1013ba3579e6STaehee Yoo call __aria_aesni_avx_crypt_16way; 1014ba3579e6STaehee Yoo 1015ba3579e6STaehee Yoo write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7, 1016ba3579e6STaehee Yoo %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 1017ba3579e6STaehee Yoo %xmm15, %rax); 1018ba3579e6STaehee Yoo 1019ba3579e6STaehee Yoo FRAME_END 1020ba3579e6STaehee Yoo RET; 1021ba3579e6STaehee 
YooSYM_FUNC_END(aria_aesni_avx_decrypt_16way) 1022ba3579e6STaehee Yoo 1023ba3579e6STaehee YooSYM_FUNC_START_LOCAL(__aria_aesni_avx_ctr_gen_keystream_16way) 1024ba3579e6STaehee Yoo /* input: 1025ba3579e6STaehee Yoo * %rdi: ctx 1026ba3579e6STaehee Yoo * %rsi: dst 1027ba3579e6STaehee Yoo * %rdx: src 1028ba3579e6STaehee Yoo * %rcx: keystream 1029ba3579e6STaehee Yoo * %r8: iv (big endian, 128bit) 1030ba3579e6STaehee Yoo */ 1031ba3579e6STaehee Yoo 1032ba3579e6STaehee Yoo FRAME_BEGIN 1033ba3579e6STaehee Yoo /* load IV and byteswap */ 1034ba3579e6STaehee Yoo vmovdqu (%r8), %xmm8; 1035ba3579e6STaehee Yoo 1036ba3579e6STaehee Yoo vmovdqa .Lbswap128_mask (%rip), %xmm1; 1037ba3579e6STaehee Yoo vpshufb %xmm1, %xmm8, %xmm3; /* be => le */ 1038ba3579e6STaehee Yoo 1039ba3579e6STaehee Yoo vpcmpeqd %xmm0, %xmm0, %xmm0; 1040ba3579e6STaehee Yoo vpsrldq $8, %xmm0, %xmm0; /* low: -1, high: 0 */ 1041ba3579e6STaehee Yoo 1042ba3579e6STaehee Yoo /* construct IVs */ 1043ba3579e6STaehee Yoo inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */ 1044ba3579e6STaehee Yoo vpshufb %xmm1, %xmm3, %xmm9; 1045ba3579e6STaehee Yoo inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */ 1046ba3579e6STaehee Yoo vpshufb %xmm1, %xmm3, %xmm10; 1047ba3579e6STaehee Yoo inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */ 1048ba3579e6STaehee Yoo vpshufb %xmm1, %xmm3, %xmm11; 1049ba3579e6STaehee Yoo inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */ 1050ba3579e6STaehee Yoo vpshufb %xmm1, %xmm3, %xmm12; 1051ba3579e6STaehee Yoo inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */ 1052ba3579e6STaehee Yoo vpshufb %xmm1, %xmm3, %xmm13; 1053ba3579e6STaehee Yoo inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */ 1054ba3579e6STaehee Yoo vpshufb %xmm1, %xmm3, %xmm14; 1055ba3579e6STaehee Yoo inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */ 1056ba3579e6STaehee Yoo vpshufb %xmm1, %xmm3, %xmm15; 1057ba3579e6STaehee Yoo vmovdqu %xmm8, (0 * 16)(%rcx); 1058ba3579e6STaehee Yoo vmovdqu %xmm9, (1 * 16)(%rcx); 1059ba3579e6STaehee Yoo vmovdqu %xmm10, (2 * 16)(%rcx); 1060ba3579e6STaehee Yoo vmovdqu %xmm11, (3 * 16)(%rcx); 1061ba3579e6STaehee Yoo vmovdqu %xmm12, (4 * 16)(%rcx); 1062ba3579e6STaehee Yoo vmovdqu %xmm13, (5 * 16)(%rcx); 1063ba3579e6STaehee Yoo vmovdqu %xmm14, (6 * 16)(%rcx); 1064ba3579e6STaehee Yoo vmovdqu %xmm15, (7 * 16)(%rcx); 1065ba3579e6STaehee Yoo 1066ba3579e6STaehee Yoo inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */ 1067ba3579e6STaehee Yoo vpshufb %xmm1, %xmm3, %xmm8; 1068ba3579e6STaehee Yoo inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */ 1069ba3579e6STaehee Yoo vpshufb %xmm1, %xmm3, %xmm9; 1070ba3579e6STaehee Yoo inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */ 1071ba3579e6STaehee Yoo vpshufb %xmm1, %xmm3, %xmm10; 1072ba3579e6STaehee Yoo inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */ 1073ba3579e6STaehee Yoo vpshufb %xmm1, %xmm3, %xmm11; 1074ba3579e6STaehee Yoo inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */ 1075ba3579e6STaehee Yoo vpshufb %xmm1, %xmm3, %xmm12; 1076ba3579e6STaehee Yoo inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */ 1077ba3579e6STaehee Yoo vpshufb %xmm1, %xmm3, %xmm13; 1078ba3579e6STaehee Yoo inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */ 1079ba3579e6STaehee Yoo vpshufb %xmm1, %xmm3, %xmm14; 1080ba3579e6STaehee Yoo inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */ 1081ba3579e6STaehee Yoo vpshufb %xmm1, %xmm3, %xmm15; 1082ba3579e6STaehee Yoo inc_le128(%xmm3, %xmm0, %xmm5); /* +1 */ 1083ba3579e6STaehee Yoo vpshufb %xmm1, %xmm3, %xmm4; 1084ba3579e6STaehee Yoo vmovdqu %xmm4, (%r8); 1085ba3579e6STaehee Yoo 1086ba3579e6STaehee Yoo vmovdqu (0 * 16)(%rcx), %xmm0; 1087ba3579e6STaehee Yoo vmovdqu (1 * 16)(%rcx), %xmm1; 1088ba3579e6STaehee Yoo vmovdqu (2 * 16)(%rcx), 
%xmm2; 1089ba3579e6STaehee Yoo vmovdqu (3 * 16)(%rcx), %xmm3; 1090ba3579e6STaehee Yoo vmovdqu (4 * 16)(%rcx), %xmm4; 1091ba3579e6STaehee Yoo vmovdqu (5 * 16)(%rcx), %xmm5; 1092ba3579e6STaehee Yoo vmovdqu (6 * 16)(%rcx), %xmm6; 1093ba3579e6STaehee Yoo vmovdqu (7 * 16)(%rcx), %xmm7; 1094ba3579e6STaehee Yoo 1095ba3579e6STaehee Yoo FRAME_END 1096ba3579e6STaehee Yoo RET; 1097ba3579e6STaehee YooSYM_FUNC_END(__aria_aesni_avx_ctr_gen_keystream_16way) 1098ba3579e6STaehee Yoo 1099c67b553aSEric BiggersSYM_TYPED_FUNC_START(aria_aesni_avx_ctr_crypt_16way) 1100ba3579e6STaehee Yoo /* input: 1101ba3579e6STaehee Yoo * %rdi: ctx 1102ba3579e6STaehee Yoo * %rsi: dst 1103ba3579e6STaehee Yoo * %rdx: src 1104ba3579e6STaehee Yoo * %rcx: keystream 1105ba3579e6STaehee Yoo * %r8: iv (big endian, 128bit) 1106ba3579e6STaehee Yoo */ 1107ba3579e6STaehee Yoo FRAME_BEGIN 1108ba3579e6STaehee Yoo 1109ba3579e6STaehee Yoo call __aria_aesni_avx_ctr_gen_keystream_16way; 1110ba3579e6STaehee Yoo 1111ba3579e6STaehee Yoo leaq (%rsi), %r10; 1112ba3579e6STaehee Yoo leaq (%rdx), %r11; 1113ba3579e6STaehee Yoo leaq (%rcx), %rsi; 1114ba3579e6STaehee Yoo leaq (%rcx), %rdx; 111535344cf3STaehee Yoo leaq ARIA_CTX_enc_key(CTX), %r9; 1116ba3579e6STaehee Yoo 1117ba3579e6STaehee Yoo call __aria_aesni_avx_crypt_16way; 1118ba3579e6STaehee Yoo 1119ba3579e6STaehee Yoo vpxor (0 * 16)(%r11), %xmm1, %xmm1; 1120ba3579e6STaehee Yoo vpxor (1 * 16)(%r11), %xmm0, %xmm0; 1121ba3579e6STaehee Yoo vpxor (2 * 16)(%r11), %xmm3, %xmm3; 1122ba3579e6STaehee Yoo vpxor (3 * 16)(%r11), %xmm2, %xmm2; 1123ba3579e6STaehee Yoo vpxor (4 * 16)(%r11), %xmm4, %xmm4; 1124ba3579e6STaehee Yoo vpxor (5 * 16)(%r11), %xmm5, %xmm5; 1125ba3579e6STaehee Yoo vpxor (6 * 16)(%r11), %xmm6, %xmm6; 1126ba3579e6STaehee Yoo vpxor (7 * 16)(%r11), %xmm7, %xmm7; 1127ba3579e6STaehee Yoo vpxor (8 * 16)(%r11), %xmm8, %xmm8; 1128ba3579e6STaehee Yoo vpxor (9 * 16)(%r11), %xmm9, %xmm9; 1129ba3579e6STaehee Yoo vpxor (10 * 16)(%r11), %xmm10, %xmm10; 1130ba3579e6STaehee Yoo vpxor (11 * 16)(%r11), %xmm11, %xmm11; 1131ba3579e6STaehee Yoo vpxor (12 * 16)(%r11), %xmm12, %xmm12; 1132ba3579e6STaehee Yoo vpxor (13 * 16)(%r11), %xmm13, %xmm13; 1133ba3579e6STaehee Yoo vpxor (14 * 16)(%r11), %xmm14, %xmm14; 1134ba3579e6STaehee Yoo vpxor (15 * 16)(%r11), %xmm15, %xmm15; 1135ba3579e6STaehee Yoo write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7, 1136ba3579e6STaehee Yoo %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 1137ba3579e6STaehee Yoo %xmm15, %r10); 1138ba3579e6STaehee Yoo 1139ba3579e6STaehee Yoo FRAME_END 1140ba3579e6STaehee Yoo RET; 1141ba3579e6STaehee YooSYM_FUNC_END(aria_aesni_avx_ctr_crypt_16way) 1142ba3579e6STaehee Yoo 1143e3cf2f87STaehee Yoo#ifdef CONFIG_AS_GFNI 1144ba3579e6STaehee YooSYM_FUNC_START_LOCAL(__aria_aesni_avx_gfni_crypt_16way) 1145ba3579e6STaehee Yoo /* input: 1146ba3579e6STaehee Yoo * %r9: rk 1147ba3579e6STaehee Yoo * %rsi: dst 1148ba3579e6STaehee Yoo * %rdx: src 1149ba3579e6STaehee Yoo * %xmm0..%xmm15: 16 byte-sliced blocks 1150ba3579e6STaehee Yoo */ 1151ba3579e6STaehee Yoo 1152ba3579e6STaehee Yoo FRAME_BEGIN 1153ba3579e6STaehee Yoo 1154ba3579e6STaehee Yoo movq %rsi, %rax; 1155ba3579e6STaehee Yoo leaq 8 * 16(%rax), %r8; 1156ba3579e6STaehee Yoo 1157ba3579e6STaehee Yoo inpack16_post(%xmm0, %xmm1, %xmm2, %xmm3, 1158ba3579e6STaehee Yoo %xmm4, %xmm5, %xmm6, %xmm7, 1159ba3579e6STaehee Yoo %xmm8, %xmm9, %xmm10, %xmm11, 1160ba3579e6STaehee Yoo %xmm12, %xmm13, %xmm14, 1161ba3579e6STaehee Yoo %xmm15, %rax, %r8); 1162ba3579e6STaehee Yoo aria_fo_gfni(%xmm8, %xmm9, %xmm10, %xmm11, 
1163ba3579e6STaehee Yoo %xmm12, %xmm13, %xmm14, %xmm15, 1164ba3579e6STaehee Yoo %xmm0, %xmm1, %xmm2, %xmm3, 1165ba3579e6STaehee Yoo %xmm4, %xmm5, %xmm6, %xmm7, 1166ba3579e6STaehee Yoo %rax, %r9, 0); 1167ba3579e6STaehee Yoo aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2, 1168ba3579e6STaehee Yoo %xmm4, %xmm5, %xmm6, %xmm7, 1169ba3579e6STaehee Yoo %xmm8, %xmm9, %xmm10, %xmm11, 1170ba3579e6STaehee Yoo %xmm12, %xmm13, %xmm14, 1171ba3579e6STaehee Yoo %xmm15, %rax, %r9, 1); 1172ba3579e6STaehee Yoo aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10, 1173ba3579e6STaehee Yoo %xmm12, %xmm13, %xmm14, %xmm15, 1174ba3579e6STaehee Yoo %xmm0, %xmm1, %xmm2, %xmm3, 1175ba3579e6STaehee Yoo %xmm4, %xmm5, %xmm6, %xmm7, 1176ba3579e6STaehee Yoo %rax, %r9, 2); 1177ba3579e6STaehee Yoo aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2, 1178ba3579e6STaehee Yoo %xmm4, %xmm5, %xmm6, %xmm7, 1179ba3579e6STaehee Yoo %xmm8, %xmm9, %xmm10, %xmm11, 1180ba3579e6STaehee Yoo %xmm12, %xmm13, %xmm14, 1181ba3579e6STaehee Yoo %xmm15, %rax, %r9, 3); 1182ba3579e6STaehee Yoo aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10, 1183ba3579e6STaehee Yoo %xmm12, %xmm13, %xmm14, %xmm15, 1184ba3579e6STaehee Yoo %xmm0, %xmm1, %xmm2, %xmm3, 1185ba3579e6STaehee Yoo %xmm4, %xmm5, %xmm6, %xmm7, 1186ba3579e6STaehee Yoo %rax, %r9, 4); 1187ba3579e6STaehee Yoo aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2, 1188ba3579e6STaehee Yoo %xmm4, %xmm5, %xmm6, %xmm7, 1189ba3579e6STaehee Yoo %xmm8, %xmm9, %xmm10, %xmm11, 1190ba3579e6STaehee Yoo %xmm12, %xmm13, %xmm14, 1191ba3579e6STaehee Yoo %xmm15, %rax, %r9, 5); 1192ba3579e6STaehee Yoo aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10, 1193ba3579e6STaehee Yoo %xmm12, %xmm13, %xmm14, %xmm15, 1194ba3579e6STaehee Yoo %xmm0, %xmm1, %xmm2, %xmm3, 1195ba3579e6STaehee Yoo %xmm4, %xmm5, %xmm6, %xmm7, 1196ba3579e6STaehee Yoo %rax, %r9, 6); 1197ba3579e6STaehee Yoo aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2, 1198ba3579e6STaehee Yoo %xmm4, %xmm5, %xmm6, %xmm7, 1199ba3579e6STaehee Yoo %xmm8, %xmm9, %xmm10, %xmm11, 1200ba3579e6STaehee Yoo %xmm12, %xmm13, %xmm14, 1201ba3579e6STaehee Yoo %xmm15, %rax, %r9, 7); 1202ba3579e6STaehee Yoo aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10, 1203ba3579e6STaehee Yoo %xmm12, %xmm13, %xmm14, %xmm15, 1204ba3579e6STaehee Yoo %xmm0, %xmm1, %xmm2, %xmm3, 1205ba3579e6STaehee Yoo %xmm4, %xmm5, %xmm6, %xmm7, 1206ba3579e6STaehee Yoo %rax, %r9, 8); 1207ba3579e6STaehee Yoo aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2, 1208ba3579e6STaehee Yoo %xmm4, %xmm5, %xmm6, %xmm7, 1209ba3579e6STaehee Yoo %xmm8, %xmm9, %xmm10, %xmm11, 1210ba3579e6STaehee Yoo %xmm12, %xmm13, %xmm14, 1211ba3579e6STaehee Yoo %xmm15, %rax, %r9, 9); 1212ba3579e6STaehee Yoo aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10, 1213ba3579e6STaehee Yoo %xmm12, %xmm13, %xmm14, %xmm15, 1214ba3579e6STaehee Yoo %xmm0, %xmm1, %xmm2, %xmm3, 1215ba3579e6STaehee Yoo %xmm4, %xmm5, %xmm6, %xmm7, 1216ba3579e6STaehee Yoo %rax, %r9, 10); 121735344cf3STaehee Yoo cmpl $12, ARIA_CTX_rounds(CTX); 1218ba3579e6STaehee Yoo jne .Laria_gfni_192; 1219ba3579e6STaehee Yoo aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7, 1220ba3579e6STaehee Yoo %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14, 1221ba3579e6STaehee Yoo %xmm15, %rax, %r9, 11, 12); 1222ba3579e6STaehee Yoo jmp .Laria_gfni_end; 1223ba3579e6STaehee Yoo.Laria_gfni_192: 1224ba3579e6STaehee Yoo aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2, 1225ba3579e6STaehee Yoo %xmm4, %xmm5, %xmm6, %xmm7, 1226ba3579e6STaehee Yoo %xmm8, %xmm9, %xmm10, %xmm11, 1227ba3579e6STaehee Yoo %xmm12, %xmm13, %xmm14, 1228ba3579e6STaehee Yoo %xmm15, %rax, %r9, 11); 
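	/* Only 192-/256-bit key schedules (14 or 16 rounds) reach this point;
	 * the 12-round (128-bit key) case already exited through the
	 * aria_ff_gfni() call above.  As a rough scalar sketch of what each
	 * bytesliced round macro computes (hypothetical helper names,
	 * illustration only):
	 *
	 *	st = diff_a(sl1(st ^ rk[r]));     // aria_fo*: odd rounds,  r = 0, 2, ...
	 *	st = diff_a(sl2(st ^ rk[r]));     // aria_fe*: even rounds, r = 1, 3, ...
	 *	st = sl2(st ^ rk[n - 1]) ^ rk[n]; // aria_ff*: final round
	 */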
	cmpl $12, ARIA_CTX_rounds(CTX);
	jne .Laria_gfni_192;
	aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 11, 12);
	jmp .Laria_gfni_end;
.Laria_gfni_192:
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 11);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 12);
	cmpl $14, ARIA_CTX_rounds(CTX);
	jne .Laria_gfni_256;
	aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 13, 14);
	jmp .Laria_gfni_end;
.Laria_gfni_256:
	aria_fe_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 13);
	aria_fo_gfni(%xmm9, %xmm8, %xmm11, %xmm10,
		     %xmm12, %xmm13, %xmm14, %xmm15,
		     %xmm0, %xmm1, %xmm2, %xmm3,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %rax, %r9, 14);
	aria_ff_gfni(%xmm1, %xmm0, %xmm3, %xmm2,
		     %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11,
		     %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax, %r9, 15, 16);
.Laria_gfni_end:
	debyteslice_16x16b(%xmm8, %xmm12, %xmm1, %xmm4,
			   %xmm9, %xmm13, %xmm0, %xmm5,
			   %xmm10, %xmm14, %xmm3, %xmm6,
			   %xmm11, %xmm15, %xmm2, %xmm7,
			   (%rax), (%r8));

	FRAME_END
	RET;
SYM_FUNC_END(__aria_aesni_avx_gfni_crypt_16way)

SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_encrypt_16way)
	/* input:
	 * %rdi: ctx, CTX
	 * %rsi: dst
	 * %rdx: src
	 */

	FRAME_BEGIN

	leaq ARIA_CTX_enc_key(CTX), %r9;

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx);

	call __aria_aesni_avx_gfni_crypt_16way;

	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_gfni_encrypt_16way)
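
/*
 * The encrypt and decrypt entry points both run the shared
 * __aria_aesni_avx_gfni_crypt_16way core; the only difference is whether
 * %r9 points at the encryption key schedule (ARIA_CTX_enc_key) above or
 * at the decryption key schedule (ARIA_CTX_dec_key) in the wrapper below.
 */
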
SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_decrypt_16way)
	/* input:
	 * %rdi: ctx, CTX
	 * %rsi: dst
	 * %rdx: src
	 */

	FRAME_BEGIN

	leaq ARIA_CTX_dec_key(CTX), %r9;

	inpack16_pre(%xmm0, %xmm1, %xmm2, %xmm3, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rdx);

	call __aria_aesni_avx_gfni_crypt_16way;

	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %rax);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_gfni_decrypt_16way)

SYM_TYPED_FUNC_START(aria_aesni_avx_gfni_ctr_crypt_16way)
	/* input:
	 * %rdi: ctx
	 * %rsi: dst
	 * %rdx: src
	 * %rcx: keystream
	 * %r8: iv (big endian, 128bit)
	 */
	FRAME_BEGIN

	call __aria_aesni_avx_ctr_gen_keystream_16way;

	leaq (%rsi), %r10;
	leaq (%rdx), %r11;
	leaq (%rcx), %rsi;
	leaq (%rcx), %rdx;
	leaq ARIA_CTX_enc_key(CTX), %r9;

	call __aria_aesni_avx_gfni_crypt_16way;

	vpxor (0 * 16)(%r11), %xmm1, %xmm1;
	vpxor (1 * 16)(%r11), %xmm0, %xmm0;
	vpxor (2 * 16)(%r11), %xmm3, %xmm3;
	vpxor (3 * 16)(%r11), %xmm2, %xmm2;
	vpxor (4 * 16)(%r11), %xmm4, %xmm4;
	vpxor (5 * 16)(%r11), %xmm5, %xmm5;
	vpxor (6 * 16)(%r11), %xmm6, %xmm6;
	vpxor (7 * 16)(%r11), %xmm7, %xmm7;
	vpxor (8 * 16)(%r11), %xmm8, %xmm8;
	vpxor (9 * 16)(%r11), %xmm9, %xmm9;
	vpxor (10 * 16)(%r11), %xmm10, %xmm10;
	vpxor (11 * 16)(%r11), %xmm11, %xmm11;
	vpxor (12 * 16)(%r11), %xmm12, %xmm12;
	vpxor (13 * 16)(%r11), %xmm13, %xmm13;
	vpxor (14 * 16)(%r11), %xmm14, %xmm14;
	vpxor (15 * 16)(%r11), %xmm15, %xmm15;
	write_output(%xmm1, %xmm0, %xmm3, %xmm2, %xmm4, %xmm5, %xmm6, %xmm7,
		     %xmm8, %xmm9, %xmm10, %xmm11, %xmm12, %xmm13, %xmm14,
		     %xmm15, %r10);

	FRAME_END
	RET;
SYM_FUNC_END(aria_aesni_avx_gfni_ctr_crypt_16way)
#endif /* CONFIG_AS_GFNI */
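
/*
 * The GFNI variants above are assembled only when the toolchain supports
 * the GFNI instructions (CONFIG_AS_GFNI). A minimal sketch, assuming the
 * usual kernel glue conventions, of how the C side might declare these
 * exported entry points; parameter types are an assumption:
 *
 *	asmlinkage void aria_aesni_avx_gfni_encrypt_16way(const void *ctx,
 *							  u8 *dst, const u8 *src);
 *	asmlinkage void aria_aesni_avx_gfni_decrypt_16way(const void *ctx,
 *							  u8 *dst, const u8 *src);
 *	asmlinkage void aria_aesni_avx_gfni_ctr_crypt_16way(const void *ctx,
 *							    u8 *dst, const u8 *src,
 *							    u8 *keystream, u8 *iv);
 */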