/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Cast6 Cipher 8-way parallel algorithm (AVX/x86_64)
 *
 * Copyright (C) 2012 Johannes Goetzfried
 *	<Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@iki.fi>
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include "glue_helper-asm-avx.S"

.file "cast6-avx-x86_64-asm_64.S"

/* CAST s-boxes (1024 bytes each), defined by the shared CAST C code */
.extern cast_s1
.extern cast_s2
.extern cast_s3
.extern cast_s4

/*
 * structure of crypto context:
 * km = 12 rounds x 4 32-bit masking keys at offset 0,
 * kr = the rotation-key bytes immediately after them.
 */
#define km 0
#define kr (12*4*4)

/* s-boxes */
#define s1 cast_s1
#define s2 cast_s2
#define s3 cast_s3
#define s4 cast_s4

/**********************************************************************
  8-way AVX cast6
 **********************************************************************/
#define CTX %r15

/* first group of 4 blocks (one 32-bit word column per register) */
#define RA1 %xmm0
#define RB1 %xmm1
#define RC1 %xmm2
#define RD1 %xmm3

/* second group of 4 blocks */
#define RA2 %xmm4
#define RB2 %xmm5
#define RC2 %xmm6
#define RD2 %xmm7

#define RX %xmm8	/* F-function intermediate */

#define RKM  %xmm9	/* broadcast 32-bit masking key of current sub-round */
#define RKR  %xmm10	/* queued rotation bytes; lowest byte is consumed next */
#define RKRF %xmm11	/* left-rotate count (current kr byte & 0x1f) */
#define RKRR %xmm12	/* complementary right-rotate count (32 - RKRF) */
#define R32  %xmm13	/* constant 32 (from .L32_mask) */
#define R1ST %xmm14	/* constant 0x1f mask (from .Lfirst_mask) */

#define RTMP %xmm15

/* scratch GPRs used as s-box indices inside lookup_32bit */
#define RID1  %rdi
#define RID1d %edi
#define RID2  %rsi
#define RID2d %esi

/* the four 32-bit lanes extracted from an xmm for scalar s-box lookups */
#define RGI1   %rdx
#define RGI1bl %dl
#define RGI1bh %dh
#define RGI2   %rcx
#define RGI2bl %cl
#define RGI2bh %ch

#define RGI3   %rax
#define RGI3bl %al
#define RGI3bh %ah
#define RGI4   %rbx	/* callee-saved; pushed/popped in the blk8 helpers */
#define RGI4bl %bl
#define RGI4bh %bh

/* scratch for combining the per-lane lookup results */
#define RFS1  %r8
#define RFS1d %r8d
#define RFS2  %r9
#define RFS2d %r9d
#define RFS3  %r10
#define RFS3d %r10d


/*
 * Scalar half of the CAST6 F function for one 32-bit value in GPR 'src':
 * four byte-indexed s-box lookups combined into dst with op1/op2/op3
 * (xorl/subl/addl, rotated per F variant).  'interleave_op(il_reg)' lets
 * the caller slip in the next 16-bit shift (shr_next) between lookups to
 * overlap latency; pass 'dummy' when no interleave is wanted.
 * Clobbers RID1/RID2 (s-box bases are loaded RIP-relative).
 */
#define lookup_32bit(src, dst, op1, op2, op3, interleave_op, il_reg) \
	movzbl		src ## bh,     RID1d;    \
	leaq		s1(%rip),      RID2;     \
	movl		(RID2,RID1,4), dst ## d; \
	movzbl		src ## bl,     RID2d;    \
	leaq		s2(%rip),      RID1;     \
	op1		(RID1,RID2,4), dst ## d; \
	shrq $16,	src;                     \
	movzbl		src ## bh,     RID1d;    \
	leaq		s3(%rip),      RID2;     \
	op2		(RID2,RID1,4), dst ## d; \
	movzbl		src ## bl,     RID2d;    \
	interleave_op(il_reg);			 \
	leaq		s4(%rip),      RID1;     \
	op3		(RID1,RID2,4), dst ## d;

/* no-op interleave for the final lookup_32bit of a lane */
#define dummy(d) /* do nothing */

/* advance 'reg' to its next byte pair for the following lookup_32bit */
#define shr_next(reg) \
	shrq $16,	reg;

/*
 * SIMD half of the F function: combine column 'a' with the round key RKM
 * using op0 (vpaddd/vpxor/vpsubd), rotate each 32-bit lane left by the
 * round's rotation amount (vpslld RKRF | vpsrld RKRR), then spill the
 * four lanes into two GPRs (gi1 = low two lanes, gi2 = high two) for the
 * scalar s-box phase.
 */
#define F_head(a, x, gi1, gi2, op0) \
	op0	a, RKM, x; \
	vpslld	RKRF, x, RTMP; \
	vpsrld	RKRR, x, x; \
	vpor	RTMP, x, x; \
	\
	vmovq		x, gi1; \
	vpextrq $1,	x, gi2;

/*
 * Run the s-box lookups for all four lanes held in gi1/gi2 and pack the
 * four 32-bit results back into xmm register 'x' (lane order preserved
 * via the shlq/orq merging into RFS2/RFS3).
 */
#define F_tail(a, x, gi1, gi2, op1, op2, op3) \
	lookup_32bit(##gi1, RFS1, op1, op2, op3, shr_next, ##gi1); \
	lookup_32bit(##gi2, RFS3, op1, op2, op3, shr_next, ##gi2); \
	\
	lookup_32bit(##gi1, RFS2, op1, op2, op3, dummy, none); \
	shlq $32,	RFS2; \
	orq		RFS1, RFS2; \
	lookup_32bit(##gi2, RFS1, op1, op2, op3, dummy, none); \
	shlq $32,	RFS1; \
	orq		RFS1, RFS3; \
	\
	vmovq		RFS2, x; \
	vpinsrq $1,	RFS3, x, x;

/*
 * Full F function applied to both 4-block groups: a1 ^= F(b1) and
 * a2 ^= F(b2).  Both F_heads run first so the two scalar lookup chains
 * can overlap; group 1 keeps its value in RX, group 2 in RTMP.
 */
#define F_2(a1, b1, a2, b2, op0, op1, op2, op3) \
	F_head(b1, RX, RGI1, RGI2, op0); \
	F_head(b2, RX, RGI3, RGI4, op0); \
	\
	F_tail(b1, RX, RGI1, RGI2, op1, op2, op3); \
	F_tail(b2, RTMP, RGI3, RGI4, op1, op2, op3); \
	\
	vpxor		a1, RX, a1; \
	vpxor		a2, RTMP, a2;

/* the three CAST6 F variants differ only in the op applied at each step */
#define F1_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpaddd, xorl, subl, addl)
#define F2_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpxor, subl, addl, xorl)
#define F3_2(a1, b1, a2, b2) \
	F_2(a1, b1, a2, b2, vpsubd, addl, xorl, subl)

/* one sub-round on both groups: out ^= Ff(in) */
#define qop(in, out, f) \
	F ## f ## _2(out ## 1, in ## 1, out ## 2, in ## 2);

/*
 * Load the keys for sub-round 'nn': broadcast masking key km[nn], take
 * the low 5 bits of the current rotation byte of RKR as the left-rotate
 * count (RKRF), derive the right-rotate count RKRR = 32 - RKRF, and
 * shift RKR down one byte to expose the next rotation.
 */
#define get_round_keys(nn) \
	vbroadcastss	(km+(4*(nn)))(CTX), RKM; \
	vpand		R1ST, RKR, RKRF; \
	vpsubq		RKRF, R32, RKRR; \
	vpsrldq $1,	RKR, RKR;

/* forward quad-round n: C^=f1(D), B^=f2(C), A^=f3(B), D^=f1(A) */
#define Q(n) \
	get_round_keys(4*n+0); \
	qop(RD, RC, 1); \
	\
	get_round_keys(4*n+1); \
	qop(RC, RB, 2); \
	\
	get_round_keys(4*n+2); \
	qop(RB, RA, 3); \
	\
	get_round_keys(4*n+3); \
	qop(RA, RD, 1);

/* inverse quad-round n: the same sub-rounds in reverse key order */
#define QBAR(n) \
	get_round_keys(4*n+3); \
	qop(RA, RD, 1); \
	\
	get_round_keys(4*n+2); \
	qop(RB, RA, 3); \
	\
	get_round_keys(4*n+1); \
	qop(RC, RB, 2); \
	\
	get_round_keys(4*n+0); \
	qop(RD, RC, 1);

/* reorder the 16 queued rotation bytes in RKR with a shuffle mask */
#define shuffle(mask) \
	vpshufb		mask(%rip), RKR, RKR;

/*
 * Load the 16 rotation-key bytes covering the next four quad-rounds,
 * then permute them (via do_mask) to match the Q/QBAR consumption order
 * of the caller.
 */
#define preload_rkr(n, do_mask, mask) \
	vbroadcastss	.L16_mask(%rip), RKR;      \
	/* add 16-bit rotation to key rotations (mod 32) */ \
	vpxor		(kr+n*16)(CTX), RKR, RKR;  \
	do_mask(mask);

/* 4x4 transpose of 32-bit words across x0..x3 (t0..t2 are scratch) */
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	vpunpckldq		x1, x0, t0; \
	vpunpckhdq		x1, x0, t2; \
	vpunpckldq		x3, x2, t1; \
	vpunpckhdq		x3, x2, x3; \
	\
	vpunpcklqdq		t1, t0, x0; \
	vpunpckhqdq		t1, t0, x1; \
	vpunpcklqdq		x3, t2, x2; \
	vpunpckhqdq		x3, t2, x3;

/* byte-swap each block to big-endian, then transpose into column form */
#define inpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
	vpshufb rmask, x0, x0; \
	vpshufb rmask, x1, x1; \
	vpshufb rmask, x2, x2; \
	vpshufb rmask, x3, x3; \
	\
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

/* inverse of inpack_blocks: transpose back, then byte-swap */
#define outunpack_blocks(x0, x1, x2, x3, t0, t1, t2, rmask) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	vpshufb rmask, x0, x0; \
	vpshufb rmask, x1, x1; \
	vpshufb rmask, x2, x2; \
	vpshufb rmask, x3, x3;

.section	.rodata.cst16, "aM", @progbits, 16
.align 16
/* per-32-bit-word byte swap */
.Lbswap_mask:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
/* full 128-bit byte swap (presumably used by the glue_helper macros) */
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0
/* rotation-byte orderings for each mix of Q/QBAR quad-rounds below */
.Lrkr_enc_Q_Q_QBAR_QBAR:
	.byte 0, 1, 2, 3, 4, 5, 6, 7, 11, 10, 9, 8, 15, 14, 13, 12
.Lrkr_enc_QBAR_QBAR_QBAR_QBAR:
	.byte 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
.Lrkr_dec_Q_Q_Q_Q:
	.byte 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3
.Lrkr_dec_Q_Q_QBAR_QBAR:
	.byte 12, 13, 14, 15, 8, 9, 10, 11, 7, 6, 5, 4, 3, 2, 1, 0
.Lrkr_dec_QBAR_QBAR_QBAR_QBAR:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

.section	.rodata.cst4.L16_mask, "aM", @progbits, 4
.align 4
.L16_mask:
	.byte 16, 16, 16, 16

.section	.rodata.cst4.L32_mask, "aM", @progbits, 4
.align 4
.L32_mask:
	.byte 32, 0, 0, 0

.section	.rodata.cst4.first_mask, "aM", @progbits, 4
.align 4
.Lfirst_mask:
	.byte 0x1f, 0, 0, 0

.text

.align 8
SYM_FUNC_START_LOCAL(__cast6_enc_blk8)
	/* input:
	 *	%rdi: ctx
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 */

	/* %r15 (CTX) and %rbx (RGI4) are callee-saved */
	pushq %r15;
	pushq %rbx;

	movq %rdi, CTX;

	vmovdqa .Lbswap_mask(%rip), RKM;
	vmovd .Lfirst_mask(%rip), R1ST;
	vmovd .L32_mask(%rip), R32;

	inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	/* 6 forward quad-rounds followed by 6 inverse quad-rounds */
	preload_rkr(0, dummy, none);
	Q(0);
	Q(1);
	Q(2);
	Q(3);
	preload_rkr(1, shuffle, .Lrkr_enc_Q_Q_QBAR_QBAR);
	Q(4);
	Q(5);
	QBAR(6);
	QBAR(7);
	preload_rkr(2, shuffle, .Lrkr_enc_QBAR_QBAR_QBAR_QBAR);
	QBAR(8);
	QBAR(9);
	QBAR(10);
	QBAR(11);

	popq %rbx;
	popq %r15;

	/* RKM is free again; reuse it for the output byte-swap mask */
	vmovdqa .Lbswap_mask(%rip), RKM;

	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	RET;
SYM_FUNC_END(__cast6_enc_blk8)

.align 8
SYM_FUNC_START_LOCAL(__cast6_dec_blk8)
	/* input:
	 *	%rdi: ctx
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: decrypted blocks
	 */

	/* %r15 (CTX) and %rbx (RGI4) are callee-saved */
	pushq %r15;
	pushq %rbx;

	movq %rdi, CTX;

	vmovdqa .Lbswap_mask(%rip), RKM;
	vmovd .Lfirst_mask(%rip), R1ST;
	vmovd .L32_mask(%rip), R32;

	inpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	inpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	/* encryption schedule run backwards: rounds 11..0 */
	preload_rkr(2, shuffle, .Lrkr_dec_Q_Q_Q_Q);
	Q(11);
	Q(10);
	Q(9);
	Q(8);
	preload_rkr(1, shuffle, .Lrkr_dec_Q_Q_QBAR_QBAR);
	Q(7);
	Q(6);
	QBAR(5);
	QBAR(4);
	preload_rkr(0, shuffle, .Lrkr_dec_QBAR_QBAR_QBAR_QBAR);
	QBAR(3);
	QBAR(2);
	QBAR(1);
	QBAR(0);

	popq %rbx;
	popq %r15;

	/* RKM is free again; reuse it for the output byte-swap mask */
	vmovdqa .Lbswap_mask(%rip), RKM;
	outunpack_blocks(RA1, RB1, RC1, RD1, RTMP, RX, RKRF, RKM);
	outunpack_blocks(RA2, RB2, RC2, RD2, RTMP, RX, RKRF, RKM);

	RET;
SYM_FUNC_END(__cast6_dec_blk8)

SYM_FUNC_START(cast6_ecb_enc_8way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN
	pushq %r15;

	movq %rdi, CTX;
	/* dst moves to %r11: blk8 helper clobbers %rsi (RID2) */
	movq %rsi, %r11;

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_enc_blk8;

	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r15;
	FRAME_END
	RET;
SYM_FUNC_END(cast6_ecb_enc_8way)

SYM_FUNC_START(cast6_ecb_dec_8way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN
	pushq %r15;

	movq %rdi, CTX;
	/* dst moves to %r11: blk8 helper clobbers %rsi (RID2) */
	movq %rsi, %r11;

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_dec_blk8;

	store_8way(%r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r15;
	FRAME_END
	RET;
SYM_FUNC_END(cast6_ecb_dec_8way)

SYM_FUNC_START(cast6_cbc_dec_8way)
	/* input:
	 *	%rdi: ctx
	 *	%rsi: dst
	 *	%rdx: src
	 */
	FRAME_BEGIN
	pushq %r12;
	pushq %r15;

	movq %rdi, CTX;
	/* dst -> %r11, src -> %r12: both survive the call below */
	movq %rsi, %r11;
	movq %rdx, %r12;

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __cast6_dec_blk8;

	/* CBC chaining: xor each plaintext with the previous ciphertext */
	store_cbc_8way(%r12, %r11, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	popq %r15;
	popq %r12;
	FRAME_END
	RET;
SYM_FUNC_END(cast6_cbc_dec_8way)