/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Camellia Cipher Algorithm (x86_64)
 *
 * Copyright (C) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 */

#include <linux/linkage.h>

.file "camellia-x86_64-asm_64.S"
.text

/*
 * Lookup tables provided by another object file. xor2ror16 below indexes
 * them with a byte value scaled by 8, i.e. as arrays of 8-byte entries.
 */
.extern camellia_sp10011110;
.extern camellia_sp22000222;
.extern camellia_sp03303033;
.extern camellia_sp00444404;
.extern camellia_sp02220222;
.extern camellia_sp30333033;
.extern camellia_sp44044404;
.extern camellia_sp11101110;

/* Short aliases for the table symbols, used by the round macros. */
#define sp10011110 camellia_sp10011110
#define sp22000222 camellia_sp22000222
#define sp03303033 camellia_sp03303033
#define sp00444404 camellia_sp00444404
#define sp02220222 camellia_sp02220222
#define sp30333033 camellia_sp30333033
#define sp44044404 camellia_sp44044404
#define sp11101110 camellia_sp11101110

/*
 * Used below as the byte offset of key_length inside the context.
 * NOTE(review): the name suggests it is the byte size of key_table - confirm
 * against the C definition of struct camellia_ctx.
 */
#define CAMELLIA_TABLE_BYTE_LEN 272

/* struct camellia_ctx: byte offsets from CTX */
#define key_table 0
#define key_length CAMELLIA_TABLE_BYTE_LEN

/*
 * register macros
 *
 * CTX          context pointer; key_table / key_length are offsets from it.
 * RIO          pointer the inpack/outunpack macros read from / write to.
 * RABn, RCDn   the two 64-bit words of block n (n = 0 for the 1-way code,
 *              n = 0 and 1 for the 2-way code). The d / bl / bh variants
 *              name the 32-bit, lowest-byte and second-byte views of the
 *              same registers; the macros build those names with ##.
 * RT0..RT2     scratch. RT0 is the same register as RIO (%rsi), so RIO does
 *              not survive any macro that uses RT0. RT1 is %r12, which the
 *              entry points copy to RR12 before use and copy back before
 *              returning.
 * RXOR, RR12, RDST
 *              %r9-%r11, used by the entry points to park values while the
 *              round macros run.
 */
#define CTX %rdi
#define RIO %rsi
#define RIOd %esi

#define RAB0 %rax
#define RCD0 %rcx
#define RAB1 %rbx
#define RCD1 %rdx

#define RAB0d %eax
#define RCD0d %ecx
#define RAB1d %ebx
#define RCD1d %edx

#define RAB0bl %al
#define RCD0bl %cl
#define RAB1bl %bl
#define RCD1bl %dl

#define RAB0bh %ah
#define RCD0bh %ch
#define RAB1bh %bh
#define RCD1bh %dh

#define RT0 %rsi
#define RT1 %r12
#define RT2 %r8

#define RT0d %esi
#define RT1d %r12d
#define RT2d %r8d

#define RT2bl %r8b

#define RXOR %r9
#define RR12 %r10
#define RDST %r11

#define RXORd %r9d
#define RXORbl %r9b

/*
 * xor2ror16(T0, T1, tmp1, tmp2, ab, dst)
 *
 * XOR two table entries into dst, selected by the two lowest bytes of ab,
 * then rotate ab right by 16 bits:
 *
 *	dst ^= T0[ab & 0xff]
 *	dst ^= T1[(ab >> 8) & 0xff]
 *	ab   = ror64(ab, 16)
 *
 * T0/T1 are addressed RIP-relative and indexed with scale 8.
 * ab must be a register macro stem that has bl and bh variants (RAB0, RCD0,
 * RAB1, RCD1). tmp1 and tmp2 must have d variants; both are clobbered.
 * tmp1 receives the bh byte: high-byte registers cannot be encoded together
 * with a REX prefix, so tmp1 has to be a legacy register (callers pass
 * RT0 = %rsi) while tmp2 may be an extended one (RT1 = %r12).
 * The bh byte is read before the rotate, so both indices come from the
 * un-rotated value.
 */
#define xor2ror16(T0, T1, tmp1, tmp2, ab, dst) \
	leaq T0(%rip), tmp1; \
	movzbl ab ## bl, tmp2 ## d; \
	xorq (tmp1, tmp2, 8), dst; \
	leaq T1(%rip), tmp2; \
	movzbl ab ## bh, tmp1 ## d; \
	rorq $16, ab; \
	xorq (tmp2, tmp1, 8), dst;

/**********************************************************************
  1-way camellia
 **********************************************************************/

/*
 * roundsm(ab, subkey, cd): one round on block 0.
 *
 * RT2 is loaded with the 8-byte key_table entry number subkey
 * (byte offset subkey * 2 * 4). Four xor2ror16 steps then walk through all
 * eight bytes of ab0, XORing table entries alternately into cd0 and RT2;
 * after four rotations by 16 bits ab0 holds its original value again.
 * Finally RT2 (subkey plus the entries collected in it) is XORed into cd0.
 *
 * ab and cd are the stems RAB / RCD; the trailing 0 is pasted on here.
 * Clobbers RT0 (and therefore RIO), RT1 and RT2.
 */
#define roundsm(ab, subkey, cd) \
	movq (key_table + ((subkey) * 2) * 4)(CTX), RT2; \
	\
	xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
	xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
	\
	xorq RT2, cd ## 0;

/*
 * fls(l, r, kl, kr): mixing layer applied to both words of block 0 between
 * groups of rounds. With K(x) the 8-byte key_table entry number x, the
 * four steps are, in this order (each step sees the result of the earlier
 * ones):
 *
 *	hi32(l0) ^= rol32(lo32(l0) & lo32(K(kl)), 1)
 *	lo32(r0) ^= hi32(r0 | K(kr))
 *	lo32(l0) ^= hi32(l0 | K(kl))
 *	hi32(r0) ^= rol32(lo32(r0) & lo32(K(kr)), 1)
 *
 * The 32-bit movl/andl/roll results are zero-extended, so the following
 * shlq $32 leaves a clean high half to XOR in.
 * NOTE(review): presumably this is Camellia's FL / FL^-1 pair on the
 * register layout produced by the inpack macros - confirm against the spec.
 * Clobbers RT0 (and therefore RIO), RT1 and RT2.
 */
#define fls(l, r, kl, kr) \
	movl (key_table + ((kl) * 2) * 4)(CTX), RT0d; \
	andl l ## 0d, RT0d; \
	roll $1, RT0d; \
	shlq $32, RT0; \
	xorq RT0, l ## 0; \
	movq (key_table + ((kr) * 2) * 4)(CTX), RT1; \
	orq r ## 0, RT1; \
	shrq $32, RT1; \
	xorq RT1, r ## 0; \
	\
	movq (key_table + ((kl) * 2) * 4)(CTX), RT2; \
	orq l ## 0, RT2; \
	shrq $32, RT2; \
	xorq RT2, l ## 0; \
	movl (key_table + ((kr) * 2) * 4)(CTX), RT0d; \
	andl r ## 0d, RT0d; \
	roll $1, RT0d; \
	shlq $32, RT0; \
	xorq RT0, r ## 0;

/*
 * enc_rounds(i): six rounds using key_table entries i+2 .. i+7 in ascending
 * order, alternating which word is the source and which is updated.
 */
#define enc_rounds(i) \
	roundsm(RAB, i + 2, RCD); \
	roundsm(RCD, i + 3, RAB); \
	roundsm(RAB, i + 4, RCD); \
	roundsm(RCD, i + 5, RAB); \
	roundsm(RAB, i + 6, RCD); \
	roundsm(RCD, i + 7, RAB);

/* enc_fls(i): fls with kl = entry i and kr = entry i+1. */
#define enc_fls(i) \
	fls(RAB, RCD, i + 0, i + 1);

/*
 * enc_inpack(): load one 16-byte block from (RIO) into RAB0 / RCD0.
 * Each 8-byte half is byte-swapped and then rotated by 32 bits, which swaps
 * its two 32-bit words. RAB0 is then XORed with key_table entry 0.
 * (4*2 is the byte offset 8 of the second half.)
 */
#define enc_inpack() \
	movq (RIO), RAB0; \
	bswapq RAB0; \
	rolq $32, RAB0; \
	movq 4*2(RIO), RCD0; \
	bswapq RCD0; \
	rorq $32, RCD0; \
	xorq key_table(CTX), RAB0;

/*
 * enc_outunpack(op, max): write block 0 to (RIO).
 *
 * max is a register holding a key_table entry index (scaled by 8 in the
 * address); that entry is XORed into RCD0. Both words then have the
 * rotate / byte-swap of enc_inpack undone. RCD0 goes to offset 0 and RAB0
 * to offset 8, i.e. the two words leave in the opposite order to the one
 * they were loaded in.
 * op is pasted with q to form the store instruction; the callers in this
 * file pass mov (overwrite) or xor (XOR into the bytes already at RIO).
 */
#define enc_outunpack(op, max) \
	xorq key_table(CTX, max, 8), RCD0; \
	rorq $32, RCD0; \
	bswapq RCD0; \
	op ## q RCD0, (RIO); \
	rolq $32, RAB0; \
	bswapq RAB0; \
	op ## q RAB0, 4*2(RIO);

/*
 * dec_rounds(i): the same six rounds as enc_rounds(i) but with the
 * key_table entries taken in descending order, i+7 down to i+2.
 */
#define dec_rounds(i) \
	roundsm(RAB, i + 7, RCD); \
	roundsm(RCD, i + 6, RAB); \
	roundsm(RAB, i + 5, RCD); \
	roundsm(RCD, i + 4, RAB); \
	roundsm(RAB, i + 3, RCD); \
	roundsm(RCD, i + 2, RAB);

/* dec_fls(i): fls with the two entries swapped, kl = i+1 and kr = i. */
#define dec_fls(i) \
	fls(RAB, RCD, i + 1, i + 0);
/*
 * dec_inpack(max): load one 16-byte block from (RIO) into RAB0 / RCD0.
 * Same byte-swap and 32-bit rotate as the encrypt-side load, but RAB0 is
 * XORed with key_table entry number max (max is a register holding the
 * entry index, scaled by 8 in the address) instead of entry 0.
 */
#define dec_inpack(max) \
	movq (RIO), RAB0; \
	bswapq RAB0; \
	rolq $32, RAB0; \
	movq 4*2(RIO), RCD0; \
	bswapq RCD0; \
	rorq $32, RCD0; \
	xorq key_table(CTX, max, 8), RAB0;

/*
 * dec_outunpack(): write block 0 to (RIO).
 * RCD0 is XORed with key_table entry 0, both words have the rotate /
 * byte-swap undone, then RCD0 is stored at offset 0 and RAB0 at offset 8.
 * Always a plain store; there is no XOR-into-destination variant here.
 */
#define dec_outunpack() \
	xorq key_table(CTX), RCD0; \
	rorq $32, RCD0; \
	bswapq RCD0; \
	movq RCD0, (RIO); \
	rolq $32, RAB0; \
	bswapq RAB0; \
	movq RAB0, 4*2(RIO);

/*
 * __camellia_enc_blk: encrypt one 16-byte block.
 *
 * The block is read from src and the result is either stored to dst
 * (xor flag byte == 0) or XORed into the 16 bytes already at dst
 * (xor flag byte != 0).
 *
 * Three groups of six rounds (key_table entries 2-7, 10-15, 18-23) are
 * always run, separated by fls steps on entries 8/9 and 16/17. When the
 * byte at key_length is 16 the output uses entry 24; otherwise one more
 * fls (24/25) and six more rounds (26-31) run and the output uses entry 32.
 *
 * %r12 (RT1) is kept in RR12 and restored before each RET; the stack is not
 * used. Modifies %rax, %rcx, %rsi, %r8-%r11 and the flags.
 */
SYM_FUNC_START(__camellia_enc_blk)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool xor
	 */
	movq %r12, RR12;

	/* %rcx is RCD0 and %rsi is RT0: move both arguments out of the way */
	movq %rcx, RXOR;
	movq %rsi, RDST;
	movq %rdx, RIO;

	enc_inpack();

	enc_rounds(0);
	enc_fls(8);
	enc_rounds(8);
	enc_fls(16);
	enc_rounds(16);
	movl $24, RT1d; /* max */

	/*
	 * NOTE(review): byte-sized compare here, while the decrypt routines
	 * compare the same field with cmpl - confirm the field width.
	 */
	cmpb $16, key_length(CTX);
	je .L__enc_done;

	enc_fls(24);
	enc_rounds(24);
	movl $32, RT1d; /* max */

.L__enc_done:
	/* movq leaves the flags alone, so jnz still acts on the testb result */
	testb RXORbl, RXORbl;
	movq RDST, RIO;

	jnz .L__enc_xor;

	enc_outunpack(mov, RT1);

	movq RR12, %r12;
	RET;

.L__enc_xor:
	enc_outunpack(xor, RT1);

	movq RR12, %r12;
	RET;
SYM_FUNC_END(__camellia_enc_blk)

/*
 * camellia_dec_blk: decrypt one 16-byte block from src and store it to dst.
 *
 * max (RT2) is 24 when key_length == 16 and 32 otherwise. It selects the
 * key_table entry XORed in on load, and for max == 32 the extra group of
 * rounds on entries 31-26 plus the fls on entries 25/24 is run first.
 * The remaining groups walk the entries downwards, mirroring the encrypt
 * routine, and the output uses entry 0.
 *
 * %r12 (RT1) is kept in RR12 and restored before RET; the stack is not
 * used. Modifies %rax, %rcx, %rsi, %r8-%r11 and the flags.
 */
SYM_FUNC_START(camellia_dec_blk)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	/*
	 * The two movl do not change the flags, so cmovel still sees the
	 * result of cmpl. RXORd is only a holder for the constant 24 here.
	 */
	cmpl $16, key_length(CTX);
	movl $32, RT2d;
	movl $24, RXORd;
	cmovel RXORd, RT2d; /* max */

	movq %r12, RR12;
	movq %rsi, RDST;
	movq %rdx, RIO;

	dec_inpack(RT2);

	/* test max now: the round macros overwrite RT2 */
	cmpb $24, RT2bl;
	je .L__dec_rounds16;

	dec_rounds(24);
	dec_fls(24);

.L__dec_rounds16:
	dec_rounds(16);
	dec_fls(16);
	dec_rounds(8);
	dec_fls(8);
	dec_rounds(0);

	movq RDST, RIO;

	dec_outunpack();

	movq RR12, %r12;
	RET;
SYM_FUNC_END(camellia_dec_blk)

/**********************************************************************
  2-way camellia
 **********************************************************************/

/*
 * roundsm2(ab, subkey, cd): one round on block 0 and block 1, interleaved.
 *
 * RT2 is loaded with key_table entry number subkey.
 * Block 1: the subkey is XORed into cd1 straight away and all four
 *          xor2ror16 steps on ab1 accumulate directly into cd1.
 * Block 0: as in the 1-way round, the xor2ror16 steps on ab0 alternate
 *          between cd0 and RT2, and RT2 is folded into cd0 afterwards.
 * That fold is placed after the first block-1 step; it does not depend on
 * it. ab0 and ab1 are each rotated by 4 * 16 bits and so end up unchanged.
 *
 * Clobbers RT0 (and therefore RIO), RT1 and RT2.
 */
#define roundsm2(ab, subkey, cd) \
	movq (key_table + ((subkey) * 2) * 4)(CTX), RT2; \
	xorq RT2, cd ## 1; \
	\
	xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
	xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
	\
		xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 1, cd ## 1); \
		xorq RT2, cd ## 0; \
		xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 1, cd ## 1); \
		xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 1, cd ## 1); \
		xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 1, cd ## 1);

/*
 * fls2(l, r, kl, kr): the mixing layer of the 1-way fls macro applied to
 * block 0 and block 1. With K(x) the 8-byte key_table entry number x, each
 * block n goes through, in this order:
 *
 *	hi32(ln) ^= rol32(lo32(ln) & lo32(K(kl)), 1)
 *	lo32(rn) ^= hi32(rn | K(kr))
 *	lo32(ln) ^= hi32(ln | K(kl))
 *	hi32(rn) ^= rol32(lo32(rn) & lo32(K(kr)), 1)
 *
 * The code is laid out as four groups: first two steps for block 0, first
 * two steps for block 1, last two steps for block 0, last two steps for
 * block 1. The scratch register changes from step to step (RT0, RT1, RT2,
 * RT0, ...), so neighbouring steps do not share a temporary.
 * Clobbers RT0 (and therefore RIO), RT1 and RT2.
 */
#define fls2(l, r, kl, kr) \
	movl (key_table + ((kl) * 2) * 4)(CTX), RT0d; \
	andl l ## 0d, RT0d; \
	roll $1, RT0d; \
	shlq $32, RT0; \
	xorq RT0, l ## 0; \
	movq (key_table + ((kr) * 2) * 4)(CTX), RT1; \
	orq r ## 0, RT1; \
	shrq $32, RT1; \
	xorq RT1, r ## 0; \
	\
	movl (key_table + ((kl) * 2) * 4)(CTX), RT2d; \
	andl l ## 1d, RT2d; \
	roll $1, RT2d; \
	shlq $32, RT2; \
	xorq RT2, l ## 1; \
	movq (key_table + ((kr) * 2) * 4)(CTX), RT0; \
	orq r ## 1, RT0; \
	shrq $32, RT0; \
	xorq RT0, r ## 1; \
	\
	movq (key_table + ((kl) * 2) * 4)(CTX), RT1; \
	orq l ## 0, RT1; \
	shrq $32, RT1; \
	xorq RT1, l ## 0; \
	movl (key_table + ((kr) * 2) * 4)(CTX), RT2d; \
	andl r ## 0d, RT2d; \
	roll $1, RT2d; \
	shlq $32, RT2; \
	xorq RT2, r ## 0; \
	\
	movq (key_table + ((kl) * 2) * 4)(CTX), RT0; \
	orq l ## 1, RT0; \
	shrq $32, RT0; \
	xorq RT0, l ## 1; \
	movl (key_table + ((kr) * 2) * 4)(CTX), RT1d; \
	andl r ## 1d, RT1d; \
	roll $1, RT1d; \
	shlq $32, RT1; \
	xorq RT1, r ## 1;

/*
 * enc_rounds2(i): six 2-way rounds using key_table entries i+2 .. i+7 in
 * ascending order, alternating source and destination words.
 */
#define enc_rounds2(i) \
	roundsm2(RAB, i + 2, RCD); \
	roundsm2(RCD, i + 3, RAB); \
	roundsm2(RAB, i + 4, RCD); \
	roundsm2(RCD, i + 5, RAB); \
	roundsm2(RAB, i + 6, RCD); \
	roundsm2(RCD, i + 7, RAB);

/* enc_fls2(i): fls2 with kl = entry i and kr = entry i+1. */
#define enc_fls2(i) \
	fls2(RAB, RCD, i + 0, i + 1);

/*
 * enc_inpack2(): load two consecutive 16-byte blocks from (RIO).
 * Block 0 comes from byte offsets 0 and 8 (4*2), block 1 from offsets 16
 * (8*2) and 24 (12*2). Every 8-byte half is byte-swapped and rotated by 32
 * bits; both RAB0 and RAB1 are then XORed with key_table entry 0.
 * The rotate directions are the reverse of the 1-way macros, which makes no
 * difference for a 32-bit rotate of a 64-bit register.
 * RCD1 is %rdx, so the third argument register is overwritten here.
 */
#define enc_inpack2() \
	movq (RIO), RAB0; \
	bswapq RAB0; \
	rorq $32, RAB0; \
	movq 4*2(RIO), RCD0; \
	bswapq RCD0; \
	rolq $32, RCD0; \
	xorq key_table(CTX), RAB0; \
	\
	movq 8*2(RIO), RAB1; \
	bswapq RAB1; \
	rorq $32, RAB1; \
	movq 12*2(RIO), RCD1; \
	bswapq RCD1; \
	rolq $32, RCD1; \
	xorq key_table(CTX), RAB1;

/*
 * enc_outunpack2(op, max): write both blocks to (RIO).
 * For each block n, RCDn is XORed with key_table entry number max (a
 * register, scaled by 8), the rotate / byte-swap is undone, and RCDn is
 * written before RABn: offsets 0 / 8 for block 0 and 16 / 24 for block 1.
 * op is pasted with q; the callers in this file pass mov or xor.
 */
#define enc_outunpack2(op, max) \
	xorq key_table(CTX, max, 8), RCD0; \
	rolq $32, RCD0; \
	bswapq RCD0; \
	op ## q RCD0, (RIO); \
	rorq $32, RAB0; \
	bswapq RAB0; \
	op ## q RAB0, 4*2(RIO); \
	\
	xorq key_table(CTX, max, 8), RCD1; \
	rolq $32, RCD1; \
	bswapq RCD1; \
	op ## q RCD1, 8*2(RIO); \
	rorq $32, RAB1; \
	bswapq RAB1; \
	op ## q RAB1, 12*2(RIO);

/*
 * dec_rounds2(i): the six 2-way rounds with key_table entries in
 * descending order, i+7 down to i+2.
 */
#define dec_rounds2(i) \
	roundsm2(RAB, i + 7, RCD); \
	roundsm2(RCD, i + 6, RAB); \
	roundsm2(RAB, i + 5, RCD); \
	roundsm2(RCD, i + 4, RAB); \
	roundsm2(RAB, i + 3, RCD); \
	roundsm2(RCD, i + 2, RAB);

/* dec_fls2(i): fls2 with the two entries swapped, kl = i+1 and kr = i. */
#define dec_fls2(i) \
	fls2(RAB, RCD, i + 1, i + 0);

/*
 * dec_inpack2(max): load two consecutive 16-byte blocks from (RIO), as
 * enc_inpack2 does, but XOR RAB0 and RAB1 with key_table entry number max
 * (a register, scaled by 8) instead of entry 0.
 */
#define dec_inpack2(max) \
	movq (RIO), RAB0; \
	bswapq RAB0; \
	rorq $32, RAB0; \
	movq 4*2(RIO), RCD0; \
	bswapq RCD0; \
	rolq $32, RCD0; \
	xorq key_table(CTX, max, 8), RAB0; \
	\
	movq 8*2(RIO), RAB1; \
	bswapq RAB1; \
	rorq $32, RAB1; \
	movq 12*2(RIO), RCD1; \
	bswapq RCD1; \
	rolq $32, RCD1; \
	xorq key_table(CTX, max, 8), RAB1;

/*
 * dec_outunpack2(): write both blocks to (RIO) with plain stores.
 * RCD0 and RCD1 are XORed with key_table entry 0; the layout and word order
 * are the same as in enc_outunpack2.
 */
#define dec_outunpack2() \
	xorq key_table(CTX), RCD0; \
	rolq $32, RCD0; \
	bswapq RCD0; \
	movq RCD0, (RIO); \
	rorq $32, RAB0; \
	bswapq RAB0; \
	movq RAB0, 4*2(RIO); \
	\
	xorq key_table(CTX), RCD1; \
	rolq $32, RCD1; \
	bswapq RCD1; \
	movq RCD1, 8*2(RIO); \
	rorq $32, RAB1; \
	bswapq RAB1; \
	movq RAB1, 12*2(RIO);

/*
 * __camellia_enc_blk_2way: encrypt two consecutive 16-byte blocks.
 *
 * 32 bytes are read from src; the 32 result bytes are stored to dst
 * (xor flag byte == 0) or XORed into the bytes already at dst
 * (xor flag byte != 0). The round schedule and the choice of the final
 * key_table entry (24 when the byte at key_length is 16, else 32) are the
 * same as in the 1-way encrypt routine.
 *
 * %rbx (RAB1) is saved on the stack and %r12 (RT1) in RR12; both are
 * restored before each RET. Modifies %rax, %rcx, %rdx, %rsi, %r8-%r11 and
 * the flags.
 */
SYM_FUNC_START(__camellia_enc_blk_2way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool xor
	 */
	pushq %rbx;

	movq %r12, RR12;
	/* %rcx is RCD0 and %rsi is RT0: move both arguments out of the way */
	movq %rcx, RXOR;
	movq %rsi, RDST;
	movq %rdx, RIO;

	enc_inpack2();

	enc_rounds2(0);
	enc_fls2(8);
	enc_rounds2(8);
	enc_fls2(16);
	enc_rounds2(16);
	/* set only after the rounds: roundsm2 uses RT2 as scratch */
	movl $24, RT2d; /* max */

	/*
	 * NOTE(review): byte-sized compare here, while the decrypt routines
	 * compare the same field with cmpl - confirm the field width.
	 */
	cmpb $16, key_length(CTX);
	je .L__enc2_done;

	enc_fls2(24);
	enc_rounds2(24);
	movl $32, RT2d; /* max */

.L__enc2_done:
	/*
	 * Explicit byte suffix, matching the 1-way routine. movq leaves the
	 * flags alone, so jnz still acts on the testb result.
	 */
	testb RXORbl, RXORbl;
	movq RDST, RIO;
	jnz .L__enc2_xor;

	enc_outunpack2(mov, RT2);

	movq RR12, %r12;
	popq %rbx;
	RET;

.L__enc2_xor:
	enc_outunpack2(xor, RT2);

	movq RR12, %r12;
	popq %rbx;
	RET;
SYM_FUNC_END(__camellia_enc_blk_2way)

/*
 * camellia_dec_blk_2way: decrypt two consecutive 16-byte blocks from src
 * and store the 32 result bytes to dst.
 *
 * max (RT2) is 24 when key_length == 16 and 32 otherwise, with the same
 * meaning as in the 1-way decrypt routine.
 *
 * There is no xor flag here, so RXOR is free: it first holds the constant
 * 24 for cmovel and then keeps the caller's %rbx (RAB1) for the rest of the
 * routine. %r12 (RT1) is kept in RR12. The stack is not used.
 * Modifies %rax, %rcx, %rdx, %rsi, %r8-%r11 and the flags.
 */
SYM_FUNC_START(camellia_dec_blk_2way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	/* the two movl do not change the flags set by cmpl */
	cmpl $16, key_length(CTX);
	movl $32, RT2d;
	movl $24, RXORd;
	cmovel RXORd, RT2d; /* max */

	/* RXOR is done serving as the constant holder; reuse it for %rbx */
	movq %rbx, RXOR;
	movq %r12, RR12;
	movq %rsi, RDST;
	movq %rdx, RIO;

	dec_inpack2(RT2);

	/* test max now: the round macros overwrite RT2 */
	cmpb $24, RT2bl;
	je .L__dec2_rounds16;

	dec_rounds2(24);
	dec_fls2(24);

.L__dec2_rounds16:
	dec_rounds2(16);
	dec_fls2(16);
	dec_rounds2(8);
	dec_fls2(8);
	dec_rounds2(0);

	movq RDST, RIO;

	dec_outunpack2();

	movq RR12, %r12;
	movq RXOR, %rbx;
	RET;
SYM_FUNC_END(camellia_dec_blk_2way)