/*
 * Serpent Cipher 8-way parallel algorithm (x86_64/AVX)
 *
 * Copyright (C) 2012 Johannes Goetzfried
 *     <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 *
 * Based on arch/x86/crypto/serpent-sse2-x86_64-asm_64.S by
 *  Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
 * USA
 *
 */

#include "glue_helper-asm-avx.S"

.file "serpent-avx-x86_64-asm_64.S"

.data
.align 16

.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

.text

#define CTX %rdi

/**********************************************************************
  8-way AVX serpent
 **********************************************************************/
#define RA1 %xmm0
#define RB1 %xmm1
#define RC1 %xmm2
#define RD1 %xmm3
#define RE1 %xmm4

#define tp  %xmm5

#define RA2 %xmm6
#define RB2 %xmm7
#define RC2 %xmm8
#define RD2 %xmm9
#define RE2 %xmm10

#define RNOT %xmm11

#define RK0 %xmm12
#define RK1 %xmm13
#define RK2 %xmm14
#define RK3 %xmm15

/*
 * Serpent S-boxes S0..S7 (and, further below, the inverse S-boxes
 * SI0..SI7) implemented as sequences of AVX bitwise operations on five
 * working registers.  Each S-box is split into two halves (_1/_2) so
 * that round-key loads can be interleaved between them; see SP() below.
 */
#define S0_1(x0, x1, x2, x3, x4) \
	vpor x0, x3, tp; \
	vpxor x3, x0, x0; \
	vpxor x2, x3, x4; \
	vpxor RNOT, x4, x4; \
	vpxor x1, tp, x3; \
	vpand x0, x1, x1; \
	vpxor x4, x1, x1; \
	vpxor x0, x2, x2;
#define S0_2(x0, x1, x2, x3, x4) \
	vpxor x3, x0, x0; \
	vpor x0, x4, x4; \
	vpxor x2, x0, x0; \
	vpand x1, x2, x2; \
	vpxor x2, x3, x3; \
	vpxor RNOT, x1, x1; \
	vpxor x4, x2, x2; \
	vpxor x2, x1, x1;

#define S1_1(x0, x1, x2, x3, x4) \
	vpxor x0, x1, tp; \
	vpxor x3, x0, x0; \
	vpxor RNOT, x3, x3; \
	vpand tp, x1, x4; \
	vpor tp, x0, x0; \
	vpxor x2, x3, x3; \
	vpxor x3, x0, x0; \
	vpxor x3, tp, x1;
#define S1_2(x0, x1, x2, x3, x4) \
	vpxor x4, x3, x3; \
	vpor x4, x1, x1; \
	vpxor x2, x4, x4; \
	vpand x0, x2, x2; \
	vpxor x1, x2, x2; \
	vpor x0, x1, x1; \
	vpxor RNOT, x0, x0; \
	vpxor x2, x0, x0; \
	vpxor x1, x4, x4;

#define S2_1(x0, x1, x2, x3, x4) \
	vpxor RNOT, x3, x3; \
	vpxor x0, x1, x1; \
	vpand x2, x0, tp; \
	vpxor x3, tp, tp; \
	vpor x0, x3, x3; \
	vpxor x1, x2, x2; \
	vpxor x1, x3, x3; \
	vpand tp, x1, x1;
#define S2_2(x0, x1, x2, x3, x4) \
	vpxor x2, tp, tp; \
	vpand x3, x2, x2; \
	vpor x1, x3, x3; \
	vpxor RNOT, tp, tp; \
	vpxor tp, x3, x3; \
	vpxor tp, x0, x4; \
	vpxor x2, tp, x0; \
	vpor x2, x1, x1;

#define S3_1(x0, x1, x2, x3, x4) \
	vpxor x3, x1, tp; \
	vpor x0, x3, x3; \
	vpand x0, x1, x4; \
	vpxor x2, x0, x0; \
	vpxor tp, x2, x2; \
	vpand x3, tp, x1; \
	vpxor x3, x2, x2; \
	vpor x4, x0, x0; \
	vpxor x3, x4, x4;
#define S3_2(x0, x1, x2, x3, x4) \
	vpxor x0, x1, x1; \
	vpand x3, x0, x0; \
	vpand x4, x3, x3; \
	vpxor x2, x3, x3; \
	vpor x1, x4, x4; \
	vpand x1, x2, x2; \
	vpxor x3, x4, x4; \
	vpxor x3, x0, x0; \
	vpxor x2, x3, x3;

#define S4_1(x0, x1, x2, x3, x4) \
	vpand x0, x3, tp; \
	vpxor x3, x0, x0; \
	vpxor x2, tp, tp; \
	vpor x3, x2, x2; \
	vpxor x1, x0, x0; \
	vpxor tp, x3, x4; \
	vpor x0, x2, x2; \
	vpxor x1, x2, x2;
#define S4_2(x0, x1, x2, x3, x4) \
	vpand x0, x1, x1; \
	vpxor x4, x1, x1; \
	vpand x2, x4, x4; \
	vpxor tp, x2, x2; \
	vpxor x0, x4, x4; \
	vpor x1, tp, x3; \
	vpxor RNOT, x1, x1; \
	vpxor x0, x3, x3;

#define S5_1(x0, x1, x2, x3, x4) \
	vpor x0, x1, tp; \
	vpxor tp, x2, x2; \
	vpxor RNOT, x3, x3; \
	vpxor x0, x1, x4; \
	vpxor x2, x0, x0; \
	vpand x4, tp, x1; \
	vpor x3, x4, x4; \
	vpxor x0, x4, x4;
#define S5_2(x0, x1, x2, x3, x4) \
	vpand x3, x0, x0; \
	vpxor x3, x1, x1; \
	vpxor x2, x3, x3; \
	vpxor x1, x0, x0; \
	vpand x4, x2, x2; \
	vpxor x2, x1, x1; \
	vpand x0, x2, x2; \
	vpxor x2, x3, x3;

#define S6_1(x0, x1, x2, x3, x4) \
	vpxor x0, x3, x3; \
	vpxor x2, x1, tp; \
	vpxor x0, x2, x2; \
	vpand x3, x0, x0; \
	vpor x3, tp, tp; \
	vpxor RNOT, x1, x4; \
	vpxor tp, x0, x0; \
	vpxor x2, tp, x1;
#define S6_2(x0, x1, x2, x3, x4) \
	vpxor x4, x3, x3; \
	vpxor x0, x4, x4; \
	vpand x0, x2, x2; \
	vpxor x1, x4, x4; \
	vpxor x3, x2, x2; \
	vpand x1, x3, x3; \
	vpxor x0, x3, x3; \
	vpxor x2, x1, x1;

#define S7_1(x0, x1, x2, x3, x4) \
	vpxor RNOT, x1, tp; \
	vpxor RNOT, x0, x0; \
	vpand x2, tp, x1; \
	vpxor x3, x1, x1; \
	vpor tp, x3, x3; \
	vpxor x2, tp, x4; \
	vpxor x3, x2, x2; \
	vpxor x0, x3, x3; \
	vpor x1, x0, x0;
#define S7_2(x0, x1, x2, x3, x4) \
	vpand x0, x2, x2; \
	vpxor x4, x0, x0; \
	vpxor x3, x4, x4; \
	vpand x0, x3, x3; \
	vpxor x1, x4, x4; \
	vpxor x4, x2, x2; \
	vpxor x1, x3, x3; \
	vpor x0, x4, x4; \
	vpxor x1, x4, x4;

#define SI0_1(x0, x1, x2, x3, x4) \
	vpxor x0, x1, x1; \
	vpor x1, x3, tp; \
	vpxor x1, x3, x4; \
	vpxor RNOT, x0, x0; \
	vpxor tp, x2, x2; \
	vpxor x0, tp, x3; \
	vpand x1, x0, x0; \
	vpxor x2, x0, x0;
#define SI0_2(x0, x1, x2, x3, x4) \
	vpand x3, x2, x2; \
	vpxor x4, x3, x3; \
	vpxor x3, x2, x2; \
	vpxor x3, x1, x1; \
	vpand x0, x3, x3; \
	vpxor x0, x1, x1; \
	vpxor x2, x0, x0; \
	vpxor x3, x4, x4;

#define SI1_1(x0, x1, x2, x3, x4) \
	vpxor x3, x1, x1; \
	vpxor x2, x0, tp; \
	vpxor RNOT, x2, x2; \
	vpor x1, x0, x4; \
	vpxor x3, x4, x4; \
	vpand x1, x3, x3; \
	vpxor x2, x1, x1; \
	vpand x4, x2, x2;
#define SI1_2(x0, x1, x2, x3, x4) \
	vpxor x1, x4, x4; \
	vpor x3, x1, x1; \
	vpxor tp, x3, x3; \
	vpxor tp, x2, x2; \
	vpor x4, tp, x0; \
	vpxor x4, x2, x2; \
	vpxor x0, x1, x1; \
	vpxor x1, x4, x4;

#define SI2_1(x0, x1, x2, x3, x4) \
	vpxor x1, x2, x2; \
	vpxor RNOT, x3, tp; \
	vpor x2, tp, tp; \
	vpxor x3, x2, x2; \
	vpxor x0, x3, x4; \
	vpxor x1, tp, x3; \
	vpor x2, x1, x1; \
	vpxor x0, x2, x2;
#define SI2_2(x0, x1, x2, x3, x4) \
	vpxor x4, x1, x1; \
	vpor x3, x4, x4; \
	vpxor x3, x2, x2; \
	vpxor x2, x4, x4; \
	vpand x1, x2, x2; \
	vpxor x3, x2, x2; \
	vpxor x4, x3, x3; \
	vpxor x0, x4, x4;

#define SI3_1(x0, x1, x2, x3, x4) \
	vpxor x1, x2, x2; \
	vpand x2, x1, tp; \
	vpxor x0, tp, tp; \
	vpor x1, x0, x0; \
	vpxor x3, x1, x4; \
	vpxor x3, x0, x0; \
	vpor tp, x3, x3; \
	vpxor x2, tp, x1;
#define SI3_2(x0, x1, x2, x3, x4) \
	vpxor x3, x1, x1; \
	vpxor x2, x0, x0; \
	vpxor x3, x2, x2; \
	vpand x1, x3, x3; \
	vpxor x0, x1, x1; \
	vpand x2, x0, x0; \
	vpxor x3, x4, x4; \
	vpxor x0, x3, x3; \
	vpxor x1, x0, x0;

#define SI4_1(x0, x1, x2, x3, x4) \
	vpxor x3, x2, x2; \
	vpand x1, x0, tp; \
	vpxor x2, tp, tp; \
	vpor x3, x2, x2; \
	vpxor RNOT, x0, x4; \
	vpxor tp, x1, x1; \
	vpxor x2, tp, x0; \
	vpand x4, x2, x2;
#define SI4_2(x0, x1, x2, x3, x4) \
	vpxor x0, x2, x2; \
	vpor x4, x0, x0; \
	vpxor x3, x0, x0; \
	vpand x2, x3, x3; \
	vpxor x3, x4, x4; \
	vpxor x1, x3, x3; \
	vpand x0, x1, x1; \
	vpxor x1, x4, x4; \
	vpxor x3, x0, x0;

#define SI5_1(x0, x1, x2, x3, x4) \
	vpor x2, x1, tp; \
	vpxor x1, x2, x2; \
	vpxor x3, tp, tp; \
	vpand x1, x3, x3; \
	vpxor x3, x2, x2; \
	vpor x0, x3, x3; \
	vpxor RNOT, x0, x0; \
	vpxor x2, x3, x3; \
	vpor x0, x2, x2;
#define SI5_2(x0, x1, x2, x3, x4) \
	vpxor tp, x1, x4; \
	vpxor x4, x2, x2; \
	vpand x0, x4, x4; \
	vpxor tp, x0, x0; \
	vpxor x3, tp, x1; \
	vpand x2, x0, x0; \
	vpxor x3, x2, x2; \
	vpxor x2, x0, x0; \
	vpxor x4, x2, x2; \
	vpxor x3, x4, x4;

#define SI6_1(x0, x1, x2, x3, x4) \
	vpxor x2, x0, x0; \
	vpand x3, x0, tp; \
	vpxor x3, x2, x2; \
	vpxor x2, tp, tp; \
	vpxor x1, x3, x3; \
	vpor x0, x2, x2; \
	vpxor x3, x2, x2; \
	vpand tp, x3, x3;
#define SI6_2(x0, x1, x2, x3, x4) \
	vpxor RNOT, tp, tp; \
	vpxor x1, x3, x3; \
	vpand x2, x1, x1; \
	vpxor tp, x0, x4; \
	vpxor x4, x3, x3; \
	vpxor x2, x4, x4; \
	vpxor x1, tp, x0; \
	vpxor x0, x2, x2;

#define SI7_1(x0, x1, x2, x3, x4) \
	vpand x0, x3, tp; \
	vpxor x2, x0, x0; \
	vpor x3, x2, x2; \
	vpxor x1, x3, x4; \
	vpxor RNOT, x0, x0; \
	vpor tp, x1, x1; \
	vpxor x0, x4, x4; \
	vpand x2, x0, x0; \
	vpxor x1, x0, x0;
#define SI7_2(x0, x1, x2, x3, x4) \
	vpand x2, x1, x1; \
	vpxor x2, tp, x3; \
	vpxor x3, x4, x4; \
	vpand x3, x2, x2; \
	vpor x0, x3, x3; \
	vpxor x4, x1, x1; \
	vpxor x4, x3, x3; \
	vpand x0, x4, x4; \
	vpxor x2, x4, x4;

/* Broadcast 32-bit word j of round key i from the key schedule at CTX. */
#define get_key(i, j, t) \
	vbroadcastss (4*(i)+(j))*4(CTX), t;

/* XOR round key i into both four-block groups (no linear transformation). */
#define K2(x0, x1, x2, x3, x4, i) \
	get_key(i, 0, RK0); \
	get_key(i, 1, RK1); \
	get_key(i, 2, RK2); \
	get_key(i, 3, RK3); \
	vpxor RK0, x0 ## 1, x0 ## 1; \
	vpxor RK1, x1 ## 1, x1 ## 1; \
	vpxor RK2, x2 ## 1, x2 ## 1; \
	vpxor RK3, x3 ## 1, x3 ## 1; \
	vpxor RK0, x0 ## 2, x0 ## 2; \
	vpxor RK1, x1 ## 2, x1 ## 2; \
	vpxor RK2, x2 ## 2, x2 ## 2; \
	vpxor RK3, x3 ## 2, x3 ## 2;

/* Serpent linear transformation followed by XOR of round key i (encryption). */
#define LK2(x0, x1, x2, x3, x4, i) \
	vpslld $13, x0 ## 1, x4 ## 1; \
	vpsrld $(32 - 13), x0 ## 1, x0 ## 1; \
	vpor x4 ## 1, x0 ## 1, x0 ## 1; \
	vpxor x0 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $3, x2 ## 1, x4 ## 1; \
	vpsrld $(32 - 3), x2 ## 1, x2 ## 1; \
	vpor x4 ## 1, x2 ## 1, x2 ## 1; \
	vpxor x2 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $13, x0 ## 2, x4 ## 2; \
	vpsrld $(32 - 13), x0 ## 2, x0 ## 2; \
	vpor x4 ## 2, x0 ## 2, x0 ## 2; \
	vpxor x0 ## 2, x1 ## 2, x1 ## 2; \
	vpslld $3, x2 ## 2, x4 ## 2; \
	vpsrld $(32 - 3), x2 ## 2, x2 ## 2; \
	vpor x4 ## 2, x2 ## 2, x2 ## 2; \
	vpxor x2 ## 2, x1 ## 2, x1 ## 2; \
	vpslld $1, x1 ## 1, x4 ## 1; \
	vpsrld $(32 - 1), x1 ## 1, x1 ## 1; \
	vpor x4 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $3, x0 ## 1, x4 ## 1; \
	vpxor x2 ## 1, x3 ## 1, x3 ## 1; \
	vpxor x4 ## 1, x3 ## 1, x3 ## 1; \
	get_key(i, 1, RK1); \
	vpslld $1, x1 ## 2, x4 ## 2; \
	vpsrld $(32 - 1), x1 ## 2, x1 ## 2; \
	vpor x4 ## 2, x1 ## 2, x1 ## 2; \
	vpslld $3, x0 ## 2, x4 ## 2; \
	vpxor x2 ## 2, x3 ## 2, x3 ## 2; \
	vpxor x4 ## 2, x3 ## 2, x3 ## 2; \
	get_key(i, 3, RK3); \
	vpslld $7, x3 ## 1, x4 ## 1; \
	vpsrld $(32 - 7), x3 ## 1, x3 ## 1; \
	vpor x4 ## 1, x3 ## 1, x3 ## 1; \
	vpslld $7, x1 ## 1, x4 ## 1; \
	vpxor x1 ## 1, x0 ## 1, x0 ## 1; \
	vpxor x3 ## 1, x0 ## 1, x0 ## 1; \
	vpxor x3 ## 1, x2 ## 1, x2 ## 1; \
	vpxor x4 ## 1, x2 ## 1, x2 ## 1; \
	get_key(i, 0, RK0); \
	vpslld $7, x3 ## 2, x4 ## 2; \
	vpsrld $(32 - 7), x3 ## 2, x3 ## 2; \
	vpor x4 ## 2, x3 ## 2, x3 ## 2; \
	vpslld $7, x1 ## 2, x4 ## 2; \
	vpxor x1 ## 2, x0 ## 2, x0 ## 2; \
	vpxor x3 ## 2, x0 ## 2, x0 ## 2; \
	vpxor x3 ## 2, x2 ## 2, x2 ## 2; \
	vpxor x4 ## 2, x2 ## 2, x2 ## 2; \
	get_key(i, 2, RK2); \
	vpxor RK1, x1 ## 1, x1 ## 1; \
	vpxor RK3, x3 ## 1, x3 ## 1; \
	vpslld $5, x0 ## 1, x4 ## 1; \
	vpsrld $(32 - 5), x0 ## 1, x0 ## 1; \
	vpor x4 ## 1, x0 ## 1, x0 ## 1; \
	vpslld $22, x2 ## 1, x4 ## 1; \
	vpsrld $(32 - 22), x2 ## 1, x2 ## 1; \
	vpor x4 ## 1, x2 ## 1, x2 ## 1; \
	vpxor RK0, x0 ## 1, x0 ## 1; \
	vpxor RK2, x2 ## 1, x2 ## 1; \
	vpxor RK1, x1 ## 2, x1 ## 2; \
	vpxor RK3, x3 ## 2, x3 ## 2; \
	vpslld $5, x0 ## 2, x4 ## 2; \
	vpsrld $(32 - 5), x0 ## 2, x0 ## 2; \
	vpor x4 ## 2, x0 ## 2, x0 ## 2; \
	vpslld $22, x2 ## 2, x4 ## 2; \
	vpsrld $(32 - 22), x2 ## 2, x2 ## 2; \
	vpor x4 ## 2, x2 ## 2, x2 ## 2; \
	vpxor RK0, x0 ## 2, x0 ## 2; \
	vpxor RK2, x2 ## 2, x2 ## 2;

/* XOR of round key i followed by the inverse linear transformation (decryption). */
#define KL2(x0, x1, x2, x3, x4, i) \
	vpxor RK0, x0 ## 1, x0 ## 1; \
	vpxor RK2, x2 ## 1, x2 ## 1; \
	vpsrld $5, x0 ## 1, x4 ## 1; \
	vpslld $(32 - 5), x0 ## 1, x0 ## 1; \
	vpor x4 ## 1, x0 ## 1, x0 ## 1; \
	vpxor RK3, x3 ## 1, x3 ## 1; \
	vpxor RK1, x1 ## 1, x1 ## 1; \
	vpsrld $22, x2 ## 1, x4 ## 1; \
	vpslld $(32 - 22), x2 ## 1, x2 ## 1; \
	vpor x4 ## 1, x2 ## 1, x2 ## 1; \
	vpxor x3 ## 1, x2 ## 1, x2 ## 1; \
	vpxor RK0, x0 ## 2, x0 ## 2; \
	vpxor RK2, x2 ## 2, x2 ## 2; \
	vpsrld $5, x0 ## 2, x4 ## 2; \
	vpslld $(32 - 5), x0 ## 2, x0 ## 2; \
	vpor x4 ## 2, x0 ## 2, x0 ## 2; \
	vpxor RK3, x3 ## 2, x3 ## 2; \
	vpxor RK1, x1 ## 2, x1 ## 2; \
	vpsrld $22, x2 ## 2, x4 ## 2; \
	vpslld $(32 - 22), x2 ## 2, x2 ## 2; \
	vpor x4 ## 2, x2 ## 2, x2 ## 2; \
	vpxor x3 ## 2, x2 ## 2, x2 ## 2; \
	vpxor x3 ## 1, x0 ## 1, x0 ## 1; \
	vpslld $7, x1 ## 1, x4 ## 1; \
	vpxor x1 ## 1, x0 ## 1, x0 ## 1; \
	vpxor x4 ## 1, x2 ## 1, x2 ## 1; \
	vpsrld $1, x1 ## 1, x4 ## 1; \
	vpslld $(32 - 1), x1 ## 1, x1 ## 1; \
	vpor x4 ## 1, x1 ## 1, x1 ## 1; \
	vpxor x3 ## 2, x0 ## 2, x0 ## 2; \
	vpslld $7, x1 ## 2, x4 ## 2; \
	vpxor x1 ## 2, x0 ## 2, x0 ## 2; \
	vpxor x4 ## 2, x2 ## 2, x2 ## 2; \
	vpsrld $1, x1 ## 2, x4 ## 2; \
	vpslld $(32 - 1), x1 ## 2, x1 ## 2; \
	vpor x4 ## 2, x1 ## 2, x1 ## 2; \
	vpsrld $7, x3 ## 1, x4 ## 1; \
	vpslld $(32 - 7), x3 ## 1, x3 ## 1; \
	vpor x4 ## 1, x3 ## 1, x3 ## 1; \
	vpxor x0 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $3, x0 ## 1, x4 ## 1; \
	vpxor x4 ## 1, x3 ## 1, x3 ## 1; \
	vpsrld $7, x3 ## 2, x4 ## 2; \
	vpslld $(32 - 7), x3 ## 2, x3 ## 2; \
	vpor x4 ## 2, x3 ## 2, x3 ## 2; \
	vpxor x0 ## 2, x1 ## 2, x1 ## 2; \
	vpslld $3, x0 ## 2, x4 ## 2; \
	vpxor x4 ## 2, x3 ## 2, x3 ## 2; \
	vpsrld $13, x0 ## 1, x4 ## 1; \
	vpslld $(32 - 13), x0 ## 1, x0 ## 1; \
	vpor x4 ## 1, x0 ## 1, x0 ## 1; \
	vpxor x2 ## 1, x1 ## 1, x1 ## 1; \
	vpxor x2 ## 1, x3 ## 1, x3 ## 1; \
	vpsrld $3, x2 ## 1, x4 ## 1; \
	vpslld $(32 - 3), x2 ## 1, x2 ## 1; \
	vpor x4 ## 1, x2 ## 1, x2 ## 1; \
	vpsrld $13, x0 ## 2, x4 ## 2; \
	vpslld $(32 - 13), x0 ## 2, x0 ## 2; \
	vpor x4 ## 2, x0 ## 2, x0 ## 2; \
	vpxor x2 ## 2, x1 ## 2, x1 ## 2; \
	vpxor x2 ## 2, x3 ## 2, x3 ## 2; \
	vpsrld $3, x2 ## 2, x4 ## 2; \
	vpslld $(32 - 3), x2 ## 2, x2 ## 2; \
	vpor x4 ## 2, x2 ## 2, x2 ## 2;

#define S(SBOX, x0, x1, x2, x3, x4) \
	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);

#define SP(SBOX, x0, x1, x2, x3, x4, i) \
	get_key(i, 0, RK0); \
	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	get_key(i, 2, RK2); \
	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	get_key(i, 3, RK3); \
	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
	get_key(i, 1, RK1); \
	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \

/* Transpose a 4x4 matrix of 32-bit words: converts four 128-bit blocks
 * between their natural layout and the word-sliced layout used by the
 * round macros above. */
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	vpunpckldq x1, x0, t0; \
	vpunpckhdq x1, x0, t2; \
	vpunpckldq x3, x2, t1; \
	vpunpckhdq x3, x2, x3; \
	\
	vpunpcklqdq t1, t0, x0; \
	vpunpckhqdq t1, t0, x1; \
	vpunpcklqdq x3, t2, x2; \
	vpunpckhqdq x3, t2, x3;

#define read_blocks(x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

#define write_blocks(x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

.align 8
.type __serpent_enc_blk8_avx,@function;

__serpent_enc_blk8_avx:
	/* input:
	 *	%rdi: ctx, CTX
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: blocks
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 */

	vpcmpeqd RNOT, RNOT, RNOT;

	read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

					K2(RA, RB, RC, RD, RE, 0);
	S(S0, RA, RB, RC, RD, RE);	LK2(RC, RB, RD, RA, RE, 1);
	S(S1, RC, RB, RD, RA, RE);	LK2(RE, RD, RA, RC, RB, 2);
	S(S2, RE, RD, RA, RC, RB);	LK2(RB, RD, RE, RC, RA, 3);
	S(S3, RB, RD, RE, RC, RA);	LK2(RC, RA, RD, RB, RE, 4);
	S(S4, RC, RA, RD, RB, RE);	LK2(RA, RD, RB, RE, RC, 5);
	S(S5, RA, RD, RB, RE, RC);	LK2(RC, RA, RD, RE, RB, 6);
	S(S6, RC, RA, RD, RE, RB);	LK2(RD, RB, RA, RE, RC, 7);
	S(S7, RD, RB, RA, RE, RC);	LK2(RC, RA, RE, RD, RB, 8);
	S(S0, RC, RA, RE, RD, RB);	LK2(RE, RA, RD, RC, RB, 9);
	S(S1, RE, RA, RD, RC, RB);	LK2(RB, RD, RC, RE, RA, 10);
	S(S2, RB, RD, RC, RE, RA);	LK2(RA, RD, RB, RE, RC, 11);
	S(S3, RA, RD, RB, RE, RC);	LK2(RE, RC, RD, RA, RB, 12);
	S(S4, RE, RC, RD, RA, RB);	LK2(RC, RD, RA, RB, RE, 13);
	S(S5, RC, RD, RA, RB, RE);	LK2(RE, RC, RD, RB, RA, 14);
	S(S6, RE, RC, RD, RB, RA);	LK2(RD, RA, RC, RB, RE, 15);
	S(S7, RD, RA, RC, RB, RE);	LK2(RE, RC, RB, RD, RA, 16);
	S(S0, RE, RC, RB, RD, RA);	LK2(RB, RC, RD, RE, RA, 17);
	S(S1, RB, RC, RD, RE, RA);	LK2(RA, RD, RE, RB, RC, 18);
	S(S2, RA, RD, RE, RB, RC);	LK2(RC, RD, RA, RB, RE, 19);
	S(S3, RC, RD, RA, RB, RE);	LK2(RB, RE, RD, RC, RA, 20);
	S(S4, RB, RE, RD, RC, RA);	LK2(RE, RD, RC, RA, RB, 21);
	S(S5, RE, RD, RC, RA, RB);	LK2(RB, RE, RD, RA, RC, 22);
	S(S6, RB, RE, RD, RA, RC);	LK2(RD, RC, RE, RA, RB, 23);
	S(S7, RD, RC, RE, RA, RB);	LK2(RB, RE, RA, RD, RC, 24);
	S(S0, RB, RE, RA, RD, RC);	LK2(RA, RE, RD, RB, RC, 25);
	S(S1, RA, RE, RD, RB, RC);	LK2(RC, RD, RB, RA, RE, 26);
	S(S2, RC, RD, RB, RA, RE);	LK2(RE, RD, RC, RA, RB, 27);
	S(S3, RE, RD, RC, RA, RB);	LK2(RA, RB, RD, RE, RC, 28);
	S(S4, RA, RB, RD, RE, RC);	LK2(RB, RD, RE, RC, RA, 29);
	S(S5, RB, RD, RE, RC, RA);	LK2(RA, RB, RD, RC, RE, 30);
	S(S6, RA, RB, RD, RC, RE);	LK2(RD, RE, RB, RC, RA, 31);
	S(S7, RD, RE, RB, RC, RA);	K2(RA, RB, RC, RD, RE, 32);

	write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	ret;

.align 8
.type __serpent_dec_blk8_avx,@function;

__serpent_dec_blk8_avx:
	/* input:
	 *	%rdi: ctx, CTX
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: encrypted blocks
	 * output:
	 *	RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: decrypted blocks
	 */

	vpcmpeqd RNOT, RNOT, RNOT;

	read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

					K2(RA, RB, RC, RD, RE, 32);
	SP(SI7, RA, RB, RC, RD, RE, 31);	KL2(RB, RD, RA, RE, RC, 31);
	SP(SI6, RB, RD, RA, RE, RC, 30);	KL2(RA, RC, RE, RB, RD, 30);
	SP(SI5, RA, RC, RE, RB, RD, 29);	KL2(RC, RD, RA, RE, RB, 29);
	SP(SI4, RC, RD, RA, RE, RB, 28);	KL2(RC, RA, RB, RE, RD, 28);
	SP(SI3, RC, RA, RB, RE, RD, 27);	KL2(RB, RC, RD, RE, RA, 27);
	SP(SI2, RB, RC, RD, RE, RA, 26);	KL2(RC, RA, RE, RD, RB, 26);
	SP(SI1, RC, RA, RE, RD, RB, 25);	KL2(RB, RA, RE, RD, RC, 25);
	SP(SI0, RB, RA, RE, RD, RC, 24);	KL2(RE, RC, RA, RB, RD, 24);
	SP(SI7, RE, RC, RA, RB, RD, 23);	KL2(RC, RB, RE, RD, RA, 23);
	SP(SI6, RC, RB, RE, RD, RA, 22);	KL2(RE, RA, RD, RC, RB, 22);
	SP(SI5, RE, RA, RD, RC, RB, 21);	KL2(RA, RB, RE, RD, RC, 21);
	SP(SI4, RA, RB, RE, RD, RC, 20);	KL2(RA, RE, RC, RD, RB, 20);
	SP(SI3, RA, RE, RC, RD, RB, 19);	KL2(RC, RA, RB, RD, RE, 19);
	SP(SI2, RC, RA, RB, RD, RE, 18);	KL2(RA, RE, RD, RB, RC, 18);
	SP(SI1, RA, RE, RD, RB, RC, 17);	KL2(RC, RE, RD, RB, RA, 17);
	SP(SI0, RC, RE, RD, RB, RA, 16);	KL2(RD, RA, RE, RC, RB, 16);
	SP(SI7, RD, RA, RE, RC, RB, 15);	KL2(RA, RC, RD, RB, RE, 15);
	SP(SI6, RA, RC, RD, RB, RE, 14);	KL2(RD, RE, RB, RA, RC, 14);
	SP(SI5, RD, RE, RB, RA, RC, 13);	KL2(RE, RC, RD, RB, RA, 13);
	SP(SI4, RE, RC, RD, RB, RA, 12);	KL2(RE, RD, RA, RB, RC, 12);
	SP(SI3, RE, RD, RA, RB, RC, 11);	KL2(RA, RE, RC, RB, RD, 11);
	SP(SI2, RA, RE, RC, RB, RD, 10);	KL2(RE, RD, RB, RC, RA, 10);
	SP(SI1, RE, RD, RB, RC, RA, 9);		KL2(RA, RD, RB, RC, RE, 9);
	SP(SI0, RA, RD, RB, RC, RE, 8);		KL2(RB, RE, RD, RA, RC, 8);
	SP(SI7, RB, RE, RD, RA, RC, 7);		KL2(RE, RA, RB, RC, RD, 7);
	SP(SI6, RE, RA, RB, RC, RD, 6);		KL2(RB, RD, RC, RE, RA, 6);
	SP(SI5, RB, RD, RC, RE, RA, 5);		KL2(RD, RA, RB, RC, RE, 5);
	SP(SI4, RD, RA, RB, RC, RE, 4);		KL2(RD, RB, RE, RC, RA, 4);
	SP(SI3, RD, RB, RE, RC, RA, 3);		KL2(RE, RD, RA, RC, RB, 3);
	SP(SI2, RE, RD, RA, RC, RB, 2);		KL2(RD, RB, RC, RA, RE, 2);
	SP(SI1, RD, RB, RC, RA, RE, 1);		KL2(RE, RB, RC, RA, RD, 1);
	S(SI0, RE, RB, RC, RA, RD);		K2(RC, RD, RB, RE, RA, 0);

	write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2);
	write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2);

	ret;

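/*
 * Externally visible 8-way ECB/CBC/CTR entry points.  Each loads eight
 * 16-byte blocks with the load_8way/load_ctr_8way macros from
 * glue_helper-asm-avx.S, runs the shared encryption or decryption core
 * above, and writes the result back with the matching store macro.
 */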
.align 8
.global serpent_ecb_enc_8way_avx
.type serpent_ecb_enc_8way_avx,@function;

serpent_ecb_enc_8way_avx:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __serpent_enc_blk8_avx;

	store_8way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	ret;

.align 8
.global serpent_ecb_dec_8way_avx
.type serpent_ecb_dec_8way_avx,@function;

serpent_ecb_dec_8way_avx:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __serpent_dec_blk8_avx;

	store_8way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);

	ret;

.align 8
.global serpent_cbc_dec_8way_avx
.type serpent_cbc_dec_8way_avx,@function;

serpent_cbc_dec_8way_avx:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 */

	load_8way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __serpent_dec_blk8_avx;

	store_cbc_8way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);

	ret;

.align 8
.global serpent_ctr_8way_avx
.type serpent_ctr_8way_avx,@function;

serpent_ctr_8way_avx:
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *	%rcx: iv (little endian, 128bit)
	 */

	load_ctr_8way(%rcx, .Lbswap128_mask, RA1, RB1, RC1, RD1, RA2, RB2, RC2,
		      RD2, RK0, RK1, RK2);

	call __serpent_enc_blk8_avx;

	store_ctr_8way(%rdx, %rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	ret;
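/*
 * For reference, a sketch of how these entry points look from the C side,
 * inferred from the register-usage comments above and the SysV x86_64
 * calling convention (%rdi, %rsi, %rdx, %rcx).  The authoritative
 * declarations live in the serpent-avx glue code; the context type is
 * simplified to void * here.
 *
 *	asmlinkage void serpent_ecb_enc_8way_avx(void *ctx, u8 *dst,
 *						 const u8 *src);
 *	asmlinkage void serpent_ecb_dec_8way_avx(void *ctx, u8 *dst,
 *						 const u8 *src);
 *	asmlinkage void serpent_cbc_dec_8way_avx(void *ctx, u8 *dst,
 *						 const u8 *src);
 *	asmlinkage void serpent_ctr_8way_avx(void *ctx, u8 *dst,
 *					     const u8 *src, u8 *iv);
 *
 * Each call processes eight 16-byte blocks.
 */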