/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Camellia Cipher Algorithm (x86_64)
 *
 * Copyright (C) 2012 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>

.file "camellia-x86_64-asm_64.S"
.text

/*
 * Lookup tables defined outside this file.  xor2ror16 indexes them with a
 * zero-extended byte and a scale of 8, i.e. as arrays of 8-byte entries.
 */
.extern camellia_sp10011110;
.extern camellia_sp22000222;
.extern camellia_sp03303033;
.extern camellia_sp00444404;
.extern camellia_sp02220222;
.extern camellia_sp30333033;
.extern camellia_sp44044404;
.extern camellia_sp11101110;

/* Short aliases for the external tables, used by the round macros. */
#define sp10011110 camellia_sp10011110
#define sp22000222 camellia_sp22000222
#define sp03303033 camellia_sp03303033
#define sp00444404 camellia_sp00444404
#define sp02220222 camellia_sp02220222
#define sp30333033 camellia_sp30333033
#define sp44044404 camellia_sp44044404
#define sp11101110 camellia_sp11101110

/* Byte offset of key_length within the context (see the offsets below). */
#define CAMELLIA_TABLE_BYTE_LEN 272

/* struct camellia_ctx: */
/* Byte offsets from CTX: key_table at 0, key_length directly after it. */
#define key_table 0
#define key_length CAMELLIA_TABLE_BYTE_LEN

/* register macros */
/*
 * CTX: context pointer; the entry points receive it in %rdi.
 * RIO: pointer the *_inpack / *_outunpack macros load from and store to.
 *      It is the same register as RT0, so it does not survive a round or
 *      FL macro; the entry points keep dst in RDST and reload RIO from it
 *      before writing the result.
 * RIOd is not referenced in this file as far as visible here.
 */
#define CTX %rdi
#define RIO %rsi
#define RIOd %esi

/*
 * Cipher state.  Suffix 0 is the first block, suffix 1 the second block
 * handled by the 2-way code.  RAB1 is %rbx, which the 2-way entry points
 * save and restore around their bodies.
 */
#define RAB0 %rax
#define RCD0 %rcx
#define RAB1 %rbx
#define RCD1 %rdx

/* 32-bit views of the state registers (used by fls/fls2). */
#define RAB0d %eax
#define RCD0d %ecx
#define RAB1d %ebx
#define RCD1d %edx

/* Lowest byte of each state register (table index in xor2ror16). */
#define RAB0bl %al
#define RCD0bl %cl
#define RAB1bl %bl
#define RCD1bl %dl

/* Bits 8..15 of each state register (table index in xor2ror16). */
#define RAB0bh %ah
#define RCD0bh %ch
#define RAB1bh %bh
#define RCD1bh %dh

/*
 * Scratch registers.  RT0 aliases RIO (%rsi).  RT1 is %r12; every entry
 * point copies %r12 to RR12 on entry and copies it back before RET.
 */
#define RT0 %rsi
#define RT1 %r12
#define RT2 %r8

#define RT0d %esi
#define RT1d %r12d
#define RT2d %r8d

#define RT2bl %r8b

/*
 * RXOR: holds the "xor" argument in the encrypt entry points; the decrypt
 *       entry points reuse it as a temporary (and, in the 2-way variant,
 *       as the place where %rbx is parked).
 * RR12: copy of the caller's %r12.
 * RDST: copy of the dst pointer.
 */
#define RXOR %r9
#define RR12 %r10
#define RDST %r11

#define RXORd %r9d
#define RXORbl %r9b

/*
 * xor2ror16(T0, T1, tmp1, tmp2, ab, dst)
 *
 * Performs two table lookups keyed by the two lowest bytes of ab and
 * accumulates them into dst:
 *
 *	dst ^= T0[ab & 0xff];
 *	dst ^= T1[(ab >> 8) & 0xff];
 *	ab   = ror64(ab, 16);
 *
 * T0/T1 are addressed RIP-relatively and indexed with scale 8.
 *
 * ab must be a name for which the "bl" and "bh" suffixed macros exist
 * (RAB0, RCD0, RAB1, RCD1); tmp1/tmp2 must have a "d" suffixed macro
 * (RT0 and RT1 are what the callers in this file pass).
 *
 * Clobbers tmp1, tmp2 and the flags.  ab is left rotated right by 16, so
 * four consecutive invocations on the same ab bring it back to its
 * original value.
 */
#define xor2ror16(T0, T1, tmp1, tmp2, ab, dst) \
	leaq T0(%rip), tmp1; \
	movzbl ab ## bl, tmp2 ## d; \
	xorq (tmp1, tmp2, 8), dst; \
	leaq T1(%rip), tmp2; \
	movzbl ab ## bh, tmp1 ## d; \
	rorq $16, ab; \
	xorq (tmp2, tmp1, 8), dst;

/**********************************************************************
  1-way camellia
 **********************************************************************/
/*
 * roundsm(ab, subkey, cd)
 *
 * One round on block 0: cd0 ^= subkey ^ (xor of eight table lookups
 * indexed by the eight bytes of ab0).
 *
 * The 8-byte subkey is read from key_table at byte offset subkey * 8.
 * Lookups are accumulated alternately into cd0 and RT2 (which starts out
 * holding the subkey); RT2 is folded into cd0 at the end.  xor2ror16 is
 * applied to ab0 four times, i.e. ab0 is rotated by 4 * 16 = 64 bits in
 * total and therefore ends up unchanged.
 *
 * ab/cd are passed without the block suffix (RAB / RCD).
 * Clobbers RT0 (and thus RIO), RT1, RT2 and the flags.
 */
#define roundsm(ab, subkey, cd) \
	movq (key_table + ((subkey) * 2) * 4)(CTX), RT2; \
	\
	xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
	xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
	\
	xorq RT2, cd ## 0;

/*
 * fls(l, r, kl, kr)
 *
 * Keyed AND/OR mixing step applied to both state halves of block 0
 * (presumably Camellia's FL / FL^-1 layer - confirm against the cipher
 * specification).  With K(n) the 8-byte key_table entry at offset n * 8,
 * lo()/hi() the low/high 32 bits, the four steps are, in order:
 *
 *	hi(l0) ^= rol32(lo(l0) & lo(K(kl)), 1);
 *	lo(r0) ^= hi(r0) | hi(K(kr));
 *	lo(l0) ^= hi(l0) | hi(K(kl));	 uses hi(l0) from step 1
 *	hi(r0) ^= rol32(lo(r0) & lo(K(kr)), 1);	 uses lo(r0) from step 2
 *
 * Clobbers RT0 (and thus RIO), RT1, RT2 and the flags.
 */
#define fls(l, r, kl, kr) \
	movl (key_table + ((kl) * 2) * 4)(CTX), RT0d; \
	andl l ## 0d, RT0d; \
	roll $1, RT0d; \
	shlq $32, RT0; \
	xorq RT0, l ## 0; \
	movq (key_table + ((kr) * 2) * 4)(CTX), RT1; \
	orq r ## 0, RT1; \
	shrq $32, RT1; \
	xorq RT1, r ## 0; \
	\
	movq (key_table + ((kl) * 2) * 4)(CTX), RT2; \
	orq l ## 0, RT2; \
	shrq $32, RT2; \
	xorq RT2, l ## 0; \
	movl (key_table + ((kr) * 2) * 4)(CTX), RT0d; \
	andl r ## 0d, RT0d; \
	roll $1, RT0d; \
	shlq $32, RT0; \
	xorq RT0, r ## 0;

/*
 * Six encryption rounds using subkeys i + 2 .. i + 7 in ascending order,
 * alternating which half is the input and which is updated.
 */
#define enc_rounds(i) \
	roundsm(RAB, i + 2, RCD); \
	roundsm(RCD, i + 3, RAB); \
	roundsm(RAB, i + 4, RCD); \
	roundsm(RCD, i + 5, RAB); \
	roundsm(RAB, i + 6, RCD); \
	roundsm(RCD, i + 7, RAB);

/* fls step for encryption: key entries i (for RAB) and i + 1 (for RCD). */
#define enc_fls(i) \
	fls(RAB, RCD, i + 0, i + 1);

/*
 * Load one 16-byte block from (RIO): each 8-byte half is byte-swapped
 * and then rotated by 32 bits (which swaps its two 32-bit words).  The
 * first key_table entry is xored into RAB0; RCD0 gets no key material
 * here.
 */
#define enc_inpack() \
	movq (RIO), RAB0; \
	bswapq RAB0; \
	rolq $32, RAB0; \
	movq 4*2(RIO), RCD0; \
	bswapq RCD0; \
	rorq $32, RCD0; \
	xorq key_table(CTX), RAB0;

/*
 * Write the result block to (RIO).  key_table entry number `max`
 * (register, scaled by 8) is xored into RCD0 first; then RCD0 goes to
 * offset 0 and RAB0 to offset 8, each rotated by 32 bits and
 * byte-swapped.  `op` is pasted with "q" to form the store instruction:
 * "mov" overwrites the destination, "xor" xors into what is already
 * there.
 */
#define enc_outunpack(op, max) \
	xorq key_table(CTX, max, 8), RCD0; \
	rorq $32, RCD0; \
	bswapq RCD0; \
	op ## q RCD0, (RIO); \
	rolq $32, RAB0; \
	bswapq RAB0; \
	op ## q RAB0, 4*2(RIO);

/* Six decryption rounds: same as enc_rounds with subkeys i + 7 .. i + 2. */
#define dec_rounds(i) \
	roundsm(RAB, i + 7, RCD); \
	roundsm(RCD, i + 6, RAB); \
	roundsm(RAB, i + 5, RCD); \
	roundsm(RCD, i + 4, RAB); \
	roundsm(RAB, i + 3, RCD); \
	roundsm(RCD, i + 2, RAB);

/* fls step for decryption: key entries swapped relative to enc_fls. */
#define dec_fls(i) \
	fls(RAB, RCD, i + 1, i + 0);

/*
 * Same load as enc_inpack, but RAB0 is xored with key_table entry number
 * `max` (register, scaled by 8) instead of entry 0.
 */
#define dec_inpack(max) \
	movq (RIO), RAB0; \
	bswapq RAB0; \
	rolq $32, RAB0; \
	movq 4*2(RIO), RCD0; \
	bswapq RCD0; \
	rorq $32, RCD0; \
	xorq key_table(CTX, max, 8), RAB0;

/*
 * Same store as enc_outunpack(mov, ...), but RCD0 is xored with
 * key_table entry 0 and the destination is always overwritten.
 */
#define dec_outunpack() \
	xorq key_table(CTX), RCD0; \
	rorq $32, RCD0; \
	bswapq RCD0; \
	movq RCD0, (RIO); \
	rolq $32, RAB0; \
	bswapq RAB0; \
	movq RAB0, 4*2(RIO);

/*
 * __camellia_enc_blk: encrypt one 16-byte block.
 *
 * Reads 16 bytes at src and either stores the result to dst or, when the
 * low byte of the xor argument is non-zero, xors the result into the 16
 * bytes already at dst.
 *
 * Uses %rax, %rcx, %rsi, %r8-%r11 and the flags as scratch; %r12 is used
 * as scratch too but is restored before returning.
 */
SYM_TYPED_FUNC_START(__camellia_enc_blk)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool xor
	 */
	/* RT1 is %r12: keep the caller's value in RR12. */
	movq %r12, RR12;

	/*
	 * %rcx becomes RCD0 and %rsi becomes RT0/RIO, so move the xor flag
	 * and dst out of the way before loading the block from src.
	 */
	movq %rcx, RXOR;
	movq %rsi, RDST;
	movq %rdx, RIO;

	enc_inpack();

	enc_rounds(0);
	enc_fls(8);
	enc_rounds(8);
	enc_fls(16);
	enc_rounds(16);
	movl $24, RT1d; /* max */

	/*
	 * A key_length of 16 stops after three groups of rounds.
	 * NOTE(review): this is a byte compare, while camellia_dec_blk
	 * compares the same field with cmpl - confirm the field width.
	 */
	cmpb $16, key_length(CTX);
	je .L__enc_done;

	/* Other key lengths: one more fls step and group of rounds. */
	enc_fls(24);
	enc_rounds(24);
	movl $32, RT1d; /* max */

.L__enc_done:
	testb RXORbl, RXORbl;
	/* The rounds clobbered RIO; movq leaves the flags from testb intact. */
	movq RDST, RIO;

	jnz .L__enc_xor;

	enc_outunpack(mov, RT1);

	movq RR12, %r12;
	RET;

.L__enc_xor:
	enc_outunpack(xor, RT1);

	movq RR12, %r12;
	RET;
SYM_FUNC_END(__camellia_enc_blk)

/*
 * camellia_dec_blk: decrypt one 16-byte block.
 *
 * Reads 16 bytes at src and stores the result to dst.
 *
 * Uses %rax, %rcx, %rsi, %r8-%r11 and the flags as scratch; %r12 is used
 * as scratch too but is restored before returning.
 */
SYM_TYPED_FUNC_START(camellia_dec_blk)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	/*
	 * max = (key_length == 16) ? 24 : 32.  The two movl instructions do
	 * not touch the flags, so cmovel still sees the result of cmpl.
	 * RXOR is only a temporary here.
	 */
	cmpl $16, key_length(CTX);
	movl $32, RT2d;
	movl $24, RXORd;
	cmovel RXORd, RT2d; /* max */

	movq %r12, RR12;
	movq %rsi, RDST;
	movq %rdx, RIO;

	dec_inpack(RT2);

	/* RT2 must be tested now: the round macros overwrite it. */
	cmpb $24, RT2bl;
	je .L__dec_rounds16;

	dec_rounds(24);
	dec_fls(24);

.L__dec_rounds16:
	dec_rounds(16);
	dec_fls(16);
	dec_rounds(8);
	dec_fls(8);
	dec_rounds(0);

	/* The rounds clobbered RIO; point it at dst for the store. */
	movq RDST, RIO;

	dec_outunpack();

	movq RR12, %r12;
	RET;
SYM_FUNC_END(camellia_dec_blk)

/**********************************************************************
  2-way camellia
 **********************************************************************/
/*
 * roundsm2(ab, subkey, cd)
 *
 * One round applied to two independent blocks with the same subkey:
 *
 *	cd0 ^= subkey ^ (eight table lookups indexed by the bytes of ab0)
 *	cd1 ^= subkey ^ (eight table lookups indexed by the bytes of ab1)
 *
 * The subkey is loaded into RT2 and xored into cd1 straight away.  For
 * block 0 the lookups are accumulated alternately into cd0 and RT2, and
 * RT2 is folded into cd0 afterwards; for block 1 all lookups go directly
 * into cd1.  Each of ab0/ab1 goes through xor2ror16 four times, so both
 * end up unchanged.
 *
 * Clobbers RT0 (and thus RIO), RT1, RT2 and the flags.
 */
#define roundsm2(ab, subkey, cd) \
	movq (key_table + ((subkey) * 2) * 4)(CTX), RT2; \
	xorq RT2, cd ## 1; \
	\
	xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 0, RT2); \
	xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 0, cd ## 0); \
	xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 0, RT2); \
	\
	xor2ror16(sp00444404, sp03303033, RT0, RT1, ab ## 1, cd ## 1); \
	xorq RT2, cd ## 0; \
	xor2ror16(sp22000222, sp10011110, RT0, RT1, ab ## 1, cd ## 1); \
	xor2ror16(sp11101110, sp44044404, RT0, RT1, ab ## 1, cd ## 1); \
	xor2ror16(sp30333033, sp02220222, RT0, RT1, ab ## 1, cd ## 1);

/*
 * fls2(l, r, kl, kr)
 *
 * Keyed AND/OR mixing step applied to both blocks (presumably Camellia's
 * FL / FL^-1 layer - confirm against the cipher specification).  With
 * K(n) the 8-byte key_table entry at offset n * 8 and lo()/hi() the
 * low/high 32 bits, each block n in {0, 1} undergoes, in this order:
 *
 *	hi(l_n) ^= rol32(lo(l_n) & lo(K(kl)), 1);
 *	lo(r_n) ^= hi(r_n) | hi(K(kr));
 *	lo(l_n) ^= hi(l_n) | hi(K(kl));
 *	hi(r_n) ^= rol32(lo(r_n) & lo(K(kr)), 1);
 *
 * The first two steps are issued for block 0, then for block 1, followed
 * by the last two steps for block 0, then for block 1.  The scratch
 * register used changes from group to group (RT0/RT1, RT2/RT0, RT1/RT2,
 * RT0/RT1).
 *
 * Clobbers RT0 (and thus RIO), RT1, RT2 and the flags.
 */
#define fls2(l, r, kl, kr) \
	movl (key_table + ((kl) * 2) * 4)(CTX), RT0d; \
	andl l ## 0d, RT0d; \
	roll $1, RT0d; \
	shlq $32, RT0; \
	xorq RT0, l ## 0; \
	movq (key_table + ((kr) * 2) * 4)(CTX), RT1; \
	orq r ## 0, RT1; \
	shrq $32, RT1; \
	xorq RT1, r ## 0; \
	\
	movl (key_table + ((kl) * 2) * 4)(CTX), RT2d; \
	andl l ## 1d, RT2d; \
	roll $1, RT2d; \
	shlq $32, RT2; \
	xorq RT2, l ## 1; \
	movq (key_table + ((kr) * 2) * 4)(CTX), RT0; \
	orq r ## 1, RT0; \
	shrq $32, RT0; \
	xorq RT0, r ## 1; \
	\
	movq (key_table + ((kl) * 2) * 4)(CTX), RT1; \
	orq l ## 0, RT1; \
	shrq $32, RT1; \
	xorq RT1, l ## 0; \
	movl (key_table + ((kr) * 2) * 4)(CTX), RT2d; \
	andl r ## 0d, RT2d; \
	roll $1, RT2d; \
	shlq $32, RT2; \
	xorq RT2, r ## 0; \
	\
	movq (key_table + ((kl) * 2) * 4)(CTX), RT0; \
	orq l ## 1, RT0; \
	shrq $32, RT0; \
	xorq RT0, l ## 1; \
	movl (key_table + ((kr) * 2) * 4)(CTX), RT1d; \
	andl r ## 1d, RT1d; \
	roll $1, RT1d; \
	shlq $32, RT1; \
	xorq RT1, r ## 1;

/*
 * Six 2-way encryption rounds using subkeys i + 2 .. i + 7 in ascending
 * order, alternating which half is the input and which is updated.
 */
#define enc_rounds2(i) \
	roundsm2(RAB, i + 2, RCD); \
	roundsm2(RCD, i + 3, RAB); \
	roundsm2(RAB, i + 4, RCD); \
	roundsm2(RCD, i + 5, RAB); \
	roundsm2(RAB, i + 6, RCD); \
	roundsm2(RCD, i + 7, RAB);

/* fls2 step for encryption: key entries i (for RAB) and i + 1 (for RCD). */
#define enc_fls2(i) \
	fls2(RAB, RCD, i + 0, i + 1);

/*
 * Load two consecutive 16-byte blocks from (RIO): block 0 from byte
 * offsets 0 and 8, block 1 from byte offsets 16 and 24.  Each 8-byte
 * half is byte-swapped and rotated by 32 bits; the first key_table entry
 * is xored into RAB0 and RAB1.
 */
#define enc_inpack2() \
	movq (RIO), RAB0; \
	bswapq RAB0; \
	rorq $32, RAB0; \
	movq 4*2(RIO), RCD0; \
	bswapq RCD0; \
	rolq $32, RCD0; \
	xorq key_table(CTX), RAB0; \
	\
	movq 8*2(RIO), RAB1; \
	bswapq RAB1; \
	rorq $32, RAB1; \
	movq 12*2(RIO), RCD1; \
	bswapq RCD1; \
	rolq $32, RCD1; \
	xorq key_table(CTX), RAB1;

/*
 * Write both result blocks to (RIO).  key_table entry number `max`
 * (register, scaled by 8) is xored into RCD0 and RCD1; for each block
 * the RCD half is written first and the RAB half second, both rotated by
 * 32 bits and byte-swapped.  `op` is pasted with "q" to form the store
 * instruction: "mov" overwrites, "xor" xors into the existing contents.
 */
#define enc_outunpack2(op, max) \
	xorq key_table(CTX, max, 8), RCD0; \
	rolq $32, RCD0; \
	bswapq RCD0; \
	op ## q RCD0, (RIO); \
	rorq $32, RAB0; \
	bswapq RAB0; \
	op ## q RAB0, 4*2(RIO); \
	\
	xorq key_table(CTX, max, 8), RCD1; \
	rolq $32, RCD1; \
	bswapq RCD1; \
	op ## q RCD1, 8*2(RIO); \
	rorq $32, RAB1; \
	bswapq RAB1; \
	op ## q RAB1, 12*2(RIO);

/* Six 2-way decryption rounds: subkeys i + 7 .. i + 2 in descending order. */
#define dec_rounds2(i) \
	roundsm2(RAB, i + 7, RCD); \
	roundsm2(RCD, i + 6, RAB); \
	roundsm2(RAB, i + 5, RCD); \
	roundsm2(RCD, i + 4, RAB); \
	roundsm2(RAB, i + 3, RCD); \
	roundsm2(RCD, i + 2, RAB);

/* fls2 step for decryption: key entries swapped relative to enc_fls2. */
#define dec_fls2(i) \
	fls2(RAB, RCD, i + 1, i + 0);

/*
 * Same load as enc_inpack2, but RAB0 and RAB1 are xored with key_table
 * entry number `max` (register, scaled by 8) instead of entry 0.
 */
#define dec_inpack2(max) \
	movq (RIO), RAB0; \
	bswapq RAB0; \
	rorq $32, RAB0; \
	movq 4*2(RIO), RCD0; \
	bswapq RCD0; \
	rolq $32, RCD0; \
	xorq key_table(CTX, max, 8), RAB0; \
	\
	movq 8*2(RIO), RAB1; \
	bswapq RAB1; \
	rorq $32, RAB1; \
	movq 12*2(RIO), RCD1; \
	bswapq RCD1; \
	rolq $32, RCD1; \
	xorq key_table(CTX, max, 8), RAB1;

/*
 * Same store as enc_outunpack2(mov, ...), but RCD0 and RCD1 are xored
 * with key_table entry 0 and the destination is always overwritten.
 */
#define dec_outunpack2() \
	xorq key_table(CTX), RCD0; \
	rolq $32, RCD0; \
	bswapq RCD0; \
	movq RCD0, (RIO); \
	rorq $32, RAB0; \
	bswapq RAB0; \
	movq RAB0, 4*2(RIO); \
	\
	xorq key_table(CTX), RCD1; \
	rolq $32, RCD1; \
	bswapq RCD1; \
	movq RCD1, 8*2(RIO); \
	rorq $32, RAB1; \
	bswapq RAB1; \
	movq RAB1, 12*2(RIO);

/*
 * __camellia_enc_blk_2way: encrypt two consecutive 16-byte blocks.
 *
 * Reads 32 bytes at src and either stores the result to dst or, when the
 * low byte of the xor argument is non-zero, xors the result into the 32
 * bytes already at dst.
 *
 * %rbx (RAB1) is preserved on the stack and %r12 (RT1) in RR12; both are
 * restored on each return path.
 */
SYM_TYPED_FUNC_START(__camellia_enc_blk_2way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: bool xor
	 */
	/* RXOR carries the xor flag here, so %rbx goes on the stack. */
	pushq %rbx;

	/*
	 * %rcx becomes RCD0, %rsi becomes RT0/RIO and %rdx becomes RCD1, so
	 * the arguments are moved before the blocks are loaded.
	 */
	movq %r12, RR12;
	movq %rcx, RXOR;
	movq %rsi, RDST;
	movq %rdx, RIO;

	enc_inpack2();

	enc_rounds2(0);
	enc_fls2(8);
	enc_rounds2(8);
	enc_fls2(16);
	enc_rounds2(16);
	movl $24, RT2d; /* max */

	/*
	 * A key_length of 16 stops after three groups of rounds.
	 * NOTE(review): this is a byte compare, while
	 * camellia_dec_blk_2way compares the same field with cmpl -
	 * confirm the field width.
	 */
	cmpb $16, key_length(CTX);
	je .L__enc2_done;

	/* Other key lengths: one more fls2 step and group of rounds. */
	enc_fls2(24);
	enc_rounds2(24);
	movl $32, RT2d; /* max */

.L__enc2_done:
	test RXORbl, RXORbl;
	/* The rounds clobbered RIO; movq leaves the flags from test intact. */
	movq RDST, RIO;
	jnz .L__enc2_xor;

	enc_outunpack2(mov, RT2);

	movq RR12, %r12;
	popq %rbx;
	RET;

.L__enc2_xor:
	enc_outunpack2(xor, RT2);

	movq RR12, %r12;
	popq %rbx;
	RET;
SYM_FUNC_END(__camellia_enc_blk_2way)

/*
 * camellia_dec_blk_2way: decrypt two consecutive 16-byte blocks.
 *
 * Reads 32 bytes at src and stores the result to dst.
 *
 * %rbx (RAB1) is parked in RXOR and %r12 (RT1) in RR12 for the duration
 * of the function; both are restored before returning.
 */
SYM_TYPED_FUNC_START(camellia_dec_blk_2way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */
	/*
	 * max = (key_length == 16) ? 24 : 32.  The two movl instructions do
	 * not touch the flags, so cmovel still sees the result of cmpl.
	 */
	cmpl $16, key_length(CTX);
	movl $32, RT2d;
	movl $24, RXORd;
	cmovel RXORd, RT2d; /* max */

	/* RXOR is free again after the cmovel above: keep %rbx in it. */
	movq %rbx, RXOR;
	movq %r12, RR12;
	movq %rsi, RDST;
	movq %rdx, RIO;

	dec_inpack2(RT2);

	/* RT2 must be tested now: the round macros overwrite it. */
	cmpb $24, RT2bl;
	je .L__dec2_rounds16;

	dec_rounds2(24);
	dec_fls2(24);

.L__dec2_rounds16:
	dec_rounds2(16);
	dec_fls2(16);
	dec_rounds2(8);
	dec_fls2(8);
	dec_rounds2(0);

	/* The rounds clobbered RIO; point it at dst for the store. */
	movq RDST, RIO;

	dec_outunpack2();

	movq RR12, %r12;
	movq RXOR, %rbx;
	RET;
SYM_FUNC_END(camellia_dec_blk_2way)