/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * Blowfish Cipher Algorithm (x86_64)
 *
 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 */

#include <linux/linkage.h>
#include <linux/cfi_types.h>

.file "blowfish-x86_64-asm.S"
.text

/*
 * Structure of crypto context (byte offsets from the context pointer).
 *
 * p covers the first (16 + 2) 32-bit words; s0..s3 follow back to back,
 * each 256 32-bit words long.
 */
#define p	0
#define s0	((16 + 2) * 4)
#define s1	((16 + 2 + (1 * 256)) * 4)
#define s2	((16 + 2 + (2 * 256)) * 4)
#define s3	((16 + 2 + (3 * 256)) * 4)

/*
 * Register macros.
 *
 *	CTX	context pointer (callee-saved %r12; every function below
 *		saves and restores it)
 *	RIO	current input/output pointer
 *	RX0-3	one 64-bit block each; these are %rax/%rbx/%rcx/%rdx so
 *		that the second-lowest byte is addressable as %ah/%bh/%ch/%dh
 *	RT0-3	scratch registers for the table lookups
 *	RKEY	preloaded round-key pair (4-way code only)
 *
 * NOTE: RIO and RT1 are both %rsi.  F() and F4() overwrite RT1, so RIO is
 * only valid before the first round and after it has been reloaded once
 * the last round is done.
 */
#define CTX %r12
#define RIO %rsi

#define RX0 %rax
#define RX1 %rbx
#define RX2 %rcx
#define RX3 %rdx

#define RX0d %eax
#define RX1d %ebx
#define RX2d %ecx
#define RX3d %edx

#define RX0bl %al
#define RX1bl %bl
#define RX2bl %cl
#define RX3bl %dl

#define RX0bh %ah
#define RX1bh %bh
#define RX2bh %ch
#define RX3bh %dh

#define RT0 %rdi
#define RT1 %rsi
#define RT2 %r8
#define RT3 %r9

#define RT0d %edi
#define RT1d %esi
#define RT2d %r8d
#define RT3d %r9d

#define RKEY %r10

/***********************************************************************
 * 1-way blowfish
 ***********************************************************************/

/*
 * One F step on RX0.
 *
 * With a:b:c:d being the four bytes of the low 32 bits of RX0 (a = most
 * significant), computes ((s0[a] + s1[b]) ^ s2[c]) + s3[d] as a 32-bit
 * value.  RX0 is rotated by 32 bits in total (ror 16, rol 16, rol 32) and
 * the value is XORed into its new low half, i.e.:
 *
 *	new_low  = old_high ^ f(old_low)
 *	new_high = old_low
 *
 * Clobbers RT0, RT1 (== RIO), RT2 and the flags.
 */
#define F() \
	rorq $16,		RX0; \
	movzbl RX0bh,		RT0d; \
	movzbl RX0bl,		RT1d; \
	rolq $16,		RX0; \
	movl s0(CTX,RT0,4),	RT0d; \
	addl s1(CTX,RT1,4),	RT0d; \
	movzbl RX0bh,		RT1d; \
	movzbl RX0bl,		RT2d; \
	rolq $32,		RX0; \
	xorl s2(CTX,RT1,4),	RT0d; \
	addl s3(CTX,RT2,4),	RT0d; \
	xorq RT0,		RX0;

/*
 * XOR the 64 bits at p[n] into RX0: p[n] lands on the low half and
 * p[n + 1] on the high half.
 */
#define add_roundkey_enc(n) \
	xorq p+4*(n)(CTX),	RX0;

/* Round-key pair n/n+1 followed by two F steps. */
#define round_enc(n) \
	add_roundkey_enc(n); \
	\
	F(); \
	F();

/*
 * Load the 64 bits at p[n - 1], swap the halves and XOR into RX0: p[n]
 * lands on the low half and p[n - 1] on the high half.  Clobbers RT0.
 */
#define add_roundkey_dec(n) \
	movq p+4*((n)-1)(CTX),	RT0; \
	rorq $32,		RT0; \
	xorq RT0,		RX0;

/* Round-key pair n/n-1 followed by two F steps. */
#define round_dec(n) \
	add_roundkey_dec(n); \
	\
	F(); \
	F();

/*
 * Load 8 bytes from (RIO) into RX0, then swap the 32-bit halves and
 * byte-swap the whole register.
 */
#define read_block() \
	movq (RIO),		RX0; \
	rorq $32,		RX0; \
	bswapq			RX0;

/* Byte-swap RX0 and store it to (RIO). */
#define write_block() \
	bswapq			RX0; \
	movq RX0,		(RIO);

/* Byte-swap RX0 and XOR it into the 8 bytes at (RIO). */
#define xor_block() \
	bswapq			RX0; \
	xorq RX0,		(RIO);

/*
 * __blowfish_enc_blk: encrypt one 8-byte block.
 *
 * input:
 *	%rdi: ctx
 *	%rsi: dst
 *	%rdx: src
 *	%rcx: bool, if true: xor output
 *
 * If %cl is non-zero the result is XORed into dst, otherwise it is stored
 * to dst.  %r12 is preserved by parking it in %r11; %rax, %rdi, %rsi, %r8,
 * %r10, %r11 and the flags are overwritten.
 */
SYM_FUNC_START(__blowfish_enc_blk)
	movq %r12, %r11;	/* park callee-saved %r12 while it serves as CTX */

	movq %rdi, CTX;
	movq %rsi, %r10;	/* F() overwrites %rsi: keep dst in %r10 */
	movq %rdx, RIO;

	read_block();

	round_enc(0);
	round_enc(2);
	round_enc(4);
	round_enc(6);
	round_enc(8);
	round_enc(10);
	round_enc(12);
	round_enc(14);
	add_roundkey_enc(16);

	movq %r11, %r12;	/* CTX no longer needed: restore %r12 */

	movq %r10, RIO;		/* RIO = dst */
	test %cl, %cl;
	jnz .L__enc_xor;

	write_block();
	RET;
.L__enc_xor:
	xor_block();
	RET;
SYM_FUNC_END(__blowfish_enc_blk)

/*
 * blowfish_dec_blk: decrypt one 8-byte block.
 *
 * input:
 *	%rdi: ctx
 *	%rsi: dst
 *	%rdx: src
 *
 * %r12 is preserved by parking it in %r11; %rax, %rdi, %rsi, %r8, %r10,
 * %r11 and the flags are overwritten.
 */
SYM_TYPED_FUNC_START(blowfish_dec_blk)
	movq %r12, %r11;	/* park callee-saved %r12 while it serves as CTX */

	movq %rdi, CTX;
	movq %rsi, %r10;	/* F() overwrites %rsi: keep dst in %r10 */
	movq %rdx, RIO;

	read_block();

	round_dec(17);
	round_dec(15);
	round_dec(13);
	round_dec(11);
	round_dec(9);
	round_dec(7);
	round_dec(5);
	round_dec(3);
	add_roundkey_dec(1);

	movq %r10, RIO;		/* RIO = dst */
	write_block();

	movq %r11, %r12;	/* restore %r12 */

	RET;
SYM_FUNC_END(blowfish_dec_blk)

/**********************************************************************
  4-way blowfish, four blocks parallel
 **********************************************************************/

/*
 * F() for 4-way. Slower when used alone/1-way, but faster when used
 * parallel/4-way (tested on AMD Phenom II & Intel Xeon E7330).
 *
 * Same result as F() but on register x (one of RX0-RX3): x is rotated by
 * 32 bits in total (two ror 16) and the 32-bit F value computed from its
 * old low half is XORed into its new low half.
 *
 * Clobbers RT0, RT1 (== RIO), RT2, RT3 and the flags.
 */
#define F4(x) \
	movzbl x ## bh,		RT1d; \
	movzbl x ## bl,		RT3d; \
	rorq $16,		x; \
	movzbl x ## bh,		RT0d; \
	movzbl x ## bl,		RT2d; \
	rorq $16,		x; \
	movl s0(CTX,RT0,4),	RT0d; \
	addl s1(CTX,RT2,4),	RT0d; \
	xorl s2(CTX,RT1,4),	RT0d; \
	addl s3(CTX,RT3,4),	RT0d; \
	xorq RT0,		x;

/* One F4 step on each of the four blocks, in RX0..RX3 order. */
#define F4_all() \
	F4(RX0); \
	F4(RX1); \
	F4(RX2); \
	F4(RX3);

/* XOR the round-key pair held in RKEY into all four blocks. */
#define add_preloaded_roundkey4() \
	xorq RKEY,		RX0; \
	xorq RKEY,		RX1; \
	xorq RKEY,		RX2; \
	xorq RKEY,		RX3;

/* RKEY = the 64 bits at p[n] (p[n] in the low half, p[n + 1] in the high). */
#define preload_roundkey_enc(n) \
	movq p+4*(n)(CTX),	RKEY;

/* Apply the key pair already in RKEY, then fetch the pair for round n + 2. */
#define add_roundkey_enc4(n) \
	add_preloaded_roundkey4(); \
	preload_roundkey_enc((n) + 2);

/* Round-key pair followed by two F4 steps on every block. */
#define round_enc4(n) \
	add_roundkey_enc4(n); \
	\
	F4_all(); \
	\
	F4_all();

/*
 * RKEY = the 64 bits at p[n - 1] with the halves swapped (p[n] in the low
 * half, p[n - 1] in the high half).
 */
#define preload_roundkey_dec(n) \
	movq p+4*((n)-1)(CTX),	RKEY; \
	rorq $32,		RKEY;

/* Apply the key pair already in RKEY, then fetch the pair for round n - 2. */
#define add_roundkey_dec4(n) \
	add_preloaded_roundkey4(); \
	preload_roundkey_dec((n) - 2);

/* Round-key pair followed by two F4 steps on every block. */
#define round_dec4(n) \
	add_roundkey_dec4(n); \
	\
	F4_all(); \
	\
	F4_all();

/*
 * Load four consecutive 8-byte blocks from (RIO) into RX0..RX3, applying
 * the same half swap and byte swap as read_block() to each.
 */
#define read_block4() \
	movq (RIO),		RX0; \
	rorq $32,		RX0; \
	bswapq			RX0; \
	\
	movq 8(RIO),		RX1; \
	rorq $32,		RX1; \
	bswapq			RX1; \
	\
	movq 16(RIO),		RX2; \
	rorq $32,		RX2; \
	bswapq			RX2; \
	\
	movq 24(RIO),		RX3; \
	rorq $32,		RX3; \
	bswapq			RX3;

/* Byte-swap RX0..RX3 and store them to the 32 bytes at (RIO). */
#define write_block4() \
	bswapq			RX0; \
	movq RX0,		(RIO); \
	\
	bswapq			RX1; \
	movq RX1,		8(RIO); \
	\
	bswapq			RX2; \
	movq RX2,		16(RIO); \
	\
	bswapq			RX3; \
	movq RX3,		24(RIO);

/* Byte-swap RX0..RX3 and XOR them into the 32 bytes at (RIO). */
#define xor_block4() \
	bswapq			RX0; \
	xorq RX0,		(RIO); \
	\
	bswapq			RX1; \
	xorq RX1,		8(RIO); \
	\
	bswapq			RX2; \
	xorq RX2,		16(RIO); \
	\
	bswapq			RX3; \
	xorq RX3,		24(RIO);

/*
 * __blowfish_enc_blk_4way: encrypt four consecutive 8-byte blocks.
 *
 * input:
 *	%rdi: ctx
 *	%rsi: dst
 *	%rdx: src
 *	%rcx: bool, if true: xor output
 *
 * If the low byte of the bool is non-zero the results are XORed into dst,
 * otherwise they are stored to dst.  %r12 and %rbx are saved on the stack
 * and restored; %rax, %rcx, %rdx, %rdi, %rsi, %r8-%r11 and the flags are
 * overwritten.
 */
SYM_FUNC_START(__blowfish_enc_blk_4way)
	pushq %r12;
	pushq %rbx;
	pushq %rcx;		/* %rcx becomes RX2: keep the xor flag on the stack */

	movq %rdi, CTX;
	movq %rsi, %r11;	/* F4() overwrites %rsi: keep dst in %r11 */
	movq %rdx, RIO;

	preload_roundkey_enc(0);

	read_block4();

	round_enc4(0);
	round_enc4(2);
	round_enc4(4);
	round_enc4(6);
	round_enc4(8);
	round_enc4(10);
	round_enc4(12);
	round_enc4(14);
	add_preloaded_roundkey4();

	popq %r12;		/* xor flag; CTX is not read past this point */
	movq %r11, RIO;		/* RIO = dst */

	test %r12b, %r12b;
	jnz .L__enc_xor4;

	write_block4();

	popq %rbx;
	popq %r12;
	RET;

.L__enc_xor4:
	xor_block4();

	popq %rbx;
	popq %r12;
	RET;
SYM_FUNC_END(__blowfish_enc_blk_4way)

/*
 * blowfish_dec_blk_4way: decrypt four consecutive 8-byte blocks.
 *
 * input:
 *	%rdi: ctx
 *	%rsi: dst
 *	%rdx: src
 *
 * %r12 and %rbx are saved on the stack and restored; %rax, %rcx, %rdx,
 * %rdi, %rsi, %r8-%r11 and the flags are overwritten.
 */
SYM_TYPED_FUNC_START(blowfish_dec_blk_4way)
	pushq %r12;
	pushq %rbx;

	movq %rdi, CTX;
	movq %rsi, %r11;	/* F4() overwrites %rsi: keep dst in %r11 */
	movq %rdx, RIO;

	preload_roundkey_dec(17);
	read_block4();

	round_dec4(17);
	round_dec4(15);
	round_dec4(13);
	round_dec4(11);
	round_dec4(9);
	round_dec4(7);
	round_dec4(5);
	round_dec4(3);
	add_preloaded_roundkey4();

	movq %r11, RIO;		/* RIO = dst */
	write_block4();

	popq %rbx;
	popq %r12;

	RET;
SYM_FUNC_END(blowfish_dec_blk_4way)