/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * x86_64/AVX2 assembler optimized version of Serpent
 *
 * Copyright © 2012-2013 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
 * Based on AVX assembler implementation of Serpent by:
 *  Copyright © 2012 Johannes Goetzfried
 *      <Johannes.Goetzfried@informatik.stud.uni-erlangen.de>
 */

#include <linux/linkage.h>
#include <asm/frame.h>
#include "glue_helper-asm-avx2.S"

.file "serpent-avx2-asm_64.S"

/*
 * 16-byte constant holding the byte indices 15..0, i.e. a byte-reversal
 * permutation for a 128-bit value.
 * NOTE(review): nothing in this file references .Lbswap128_mask directly;
 * check whether the included glue helper still needs it.
 */
.section	.rodata.cst16.bswap128_mask, "aM", @progbits, 16
.align 16
.Lbswap128_mask:
	.byte 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0

.text

/* Context pointer argument ("%rdi: ctx" in the function headers below). */
#define CTX %rdi

/*
 * RNOT is set to all-ones (vpcmpeqd RNOT, RNOT, RNOT) at the start of the
 * block functions, so "vpxor RNOT, x, y" computes a bitwise NOT.
 * tp is a scratch register used inside the S-box macros.
 */
#define RNOT %ymm0
#define tp  %ymm1

/*
 * Cipher state: five registers (A..E) for each of two independent register
 * sets.  The trailing 1/2 is appended by the helper macros through token
 * pasting (x0 ## 1, x0 ## 2), so callers pass just RA, RB, ...
 */
#define RA1 %ymm2
#define RA2 %ymm3
#define RB1 %ymm4
#define RB2 %ymm5
#define RC1 %ymm6
#define RC2 %ymm7
#define RD1 %ymm8
#define RD2 %ymm9
#define RE1 %ymm10
#define RE2 %ymm11

/* Round-key words, filled by get_key(); also lent out as transpose scratch. */
#define RK0 %ymm12
#define RK1 %ymm13
#define RK2 %ymm14
#define RK3 %ymm15

/* 128-bit views of RK0..RK3 (not referenced by the code in this file). */
#define RK0x %xmm12
#define RK1x %xmm13
#define RK2x %xmm14
#define RK3x %xmm15

/*
 * S-box macros.  All instructions use AT&T operand order (destination
 * last).  Every S-box is split into a _1 and a _2 half that must be run
 * back to back on the same registers; tp carries state from _1 to _2.
 * x4 is written before it is read, so it needs no defined value on entry.
 *
 * S0: the encryption routine continues with the registers in the order
 * (x2, x1, x3, x0) and reuses x4 as scratch.
 */
#define S0_1(x0, x1, x2, x3, x4)      \
	vpor		x0,   x3, tp; \
	vpxor		x3,   x0, x0; \
	vpxor		x2,   x3, x4; \
	vpxor		RNOT, x4, x4; \
	vpxor		x1,   tp, x3; \
	vpand		x0,   x1, x1; \
	vpxor		x4,   x1, x1; \
	vpxor		x0,   x2, x2;
#define S0_2(x0, x1, x2, x3, x4)      \
	vpxor		x3,   x0, x0; \
	vpor		x0,   x4, x4; \
	vpxor		x2,   x0, x0; \
	vpand		x1,   x2, x2; \
	vpxor		x2,   x3, x3; \
	vpxor		RNOT, x1, x1; \
	vpxor		x4,   x2, x2; \
	vpxor		x2,   x1, x1;

/*
 * S1: the encryption routine continues with the registers in the order
 * (x4, x2, x3, x0) and reuses x1 as scratch.
 */
#define S1_1(x0, x1, x2, x3, x4)      \
	vpxor		x0,   x1, tp; \
	vpxor		x3,   x0, x0; \
	vpxor		RNOT, x3, x3; \
	vpand		tp,   x1, x4; \
	vpor		tp,   x0, x0; \
	vpxor		x2,   x3, x3; \
	vpxor		x3,   x0, x0; \
	vpxor		x3,   tp, x1;
#define S1_2(x0, x1, x2, x3, x4)      \
	vpxor		x4,   x3, x3; \
	vpor		x4,   x1, x1; \
	vpxor		x2,   x4, x4; \
	vpand		x0,   x2, x2; \
	vpxor		x1,   x2, x2; \
	vpor		x0,   x1, x1; \
	vpxor		RNOT, x0, x0; \
	vpxor		x2,   x0, x0; \
	vpxor		x1,   x4, x4;

/*
 * Remaining forward S-boxes (S2..S7), used by __serpent_enc_blk16.
 *
 * Each S-box is split into a _1 and a _2 half that must run back to back
 * on the same five registers; tp carries intermediate state between the
 * halves.  Operands are in AT&T order (destination last), and
 * "vpxor RNOT, a, b" is a bitwise NOT because RNOT holds all-ones.
 * The result is left in a permuted subset of x0..x4; the call sites pass
 * the registers to the next step in the matching order.
 */
#define S2_1(x0, x1, x2, x3, x4)      \
	vpxor		RNOT, x3, x3; \
	vpxor		x0,   x1, x1; \
	vpand		x2,   x0, tp; \
	vpxor		x3,   tp, tp; \
	vpor		x0,   x3, x3; \
	vpxor		x1,   x2, x2; \
	vpxor		x1,   x3, x3; \
	vpand		tp,   x1, x1;
#define S2_2(x0, x1, x2, x3, x4)      \
	vpxor		x2,   tp, tp; \
	vpand		x3,   x2, x2; \
	vpor		x1,   x3, x3; \
	vpxor		RNOT, tp, tp; \
	vpxor		tp,   x3, x3; \
	vpxor		tp,   x0, x4; \
	vpxor		x2,   tp, x0; \
	vpor		x2,   x1, x1;

#define S3_1(x0, x1, x2, x3, x4)      \
	vpxor		x3,   x1, tp; \
	vpor		x0,   x3, x3; \
	vpand		x0,   x1, x4; \
	vpxor		x2,   x0, x0; \
	vpxor		tp,   x2, x2; \
	vpand		x3,   tp, x1; \
	vpxor		x3,   x2, x2; \
	vpor		x4,   x0, x0; \
	vpxor		x3,   x4, x4;
#define S3_2(x0, x1, x2, x3, x4)      \
	vpxor		x0,   x1, x1; \
	vpand		x3,   x0, x0; \
	vpand		x4,   x3, x3; \
	vpxor		x2,   x3, x3; \
	vpor		x1,   x4, x4; \
	vpand		x1,   x2, x2; \
	vpxor		x3,   x4, x4; \
	vpxor		x3,   x0, x0; \
	vpxor		x2,   x3, x3;

#define S4_1(x0, x1, x2, x3, x4)      \
	vpand		x0,   x3, tp; \
	vpxor		x3,   x0, x0; \
	vpxor		x2,   tp, tp; \
	vpor		x3,   x2, x2; \
	vpxor		x1,   x0, x0; \
	vpxor		tp,   x3, x4; \
	vpor		x0,   x2, x2; \
	vpxor		x1,   x2, x2;
#define S4_2(x0, x1, x2, x3, x4)      \
	vpand		x0,   x1, x1; \
	vpxor		x4,   x1, x1; \
	vpand		x2,   x4, x4; \
	vpxor		tp,   x2, x2; \
	vpxor		x0,   x4, x4; \
	vpor		x1,   tp, x3; \
	vpxor		RNOT, x1, x1; \
	vpxor		x0,   x3, x3;

#define S5_1(x0, x1, x2, x3, x4)      \
	vpor		x0,   x1, tp; \
	vpxor		tp,   x2, x2; \
	vpxor		RNOT, x3, x3; \
	vpxor		x0,   x1, x4; \
	vpxor		x2,   x0, x0; \
	vpand		x4,   tp, x1; \
	vpor		x3,   x4, x4; \
	vpxor		x0,   x4, x4;
#define S5_2(x0, x1, x2, x3, x4)      \
	vpand		x3,   x0, x0; \
	vpxor		x3,   x1, x1; \
	vpxor		x2,   x3, x3; \
	vpxor		x1,   x0, x0; \
	vpand		x4,   x2, x2; \
	vpxor		x2,   x1, x1; \
	vpand		x0,   x2, x2; \
	vpxor		x2,   x3, x3;

#define S6_1(x0, x1, x2, x3, x4)      \
	vpxor		x0,   x3, x3; \
	vpxor		x2,   x1, tp; \
	vpxor		x0,   x2, x2; \
	vpand		x3,   x0, x0; \
	vpor		x3,   tp, tp; \
	vpxor		RNOT, x1, x4; \
	vpxor		tp,   x0, x0; \
	vpxor		x2,   tp, x1;
#define S6_2(x0, x1, x2, x3, x4)      \
	vpxor		x4,   x3, x3; \
	vpxor		x0,   x4, x4; \
	vpand		x0,   x2, x2; \
	vpxor		x1,   x4, x4; \
	vpxor		x3,   x2, x2; \
	vpand		x1,   x3, x3; \
	vpxor		x0,   x3, x3; \
	vpxor		x2,   x1, x1;

#define S7_1(x0, x1, x2, x3, x4)      \
	vpxor		RNOT, x1, tp; \
	vpxor		RNOT, x0, x0; \
	vpand		x2,   tp, x1; \
	vpxor		x3,   x1, x1; \
	vpor		tp,   x3, x3; \
	vpxor		x2,   tp, x4; \
	vpxor		x3,   x2, x2; \
	vpxor		x0,   x3, x3; \
	vpor		x1,   x0, x0;
#define S7_2(x0, x1, x2, x3, x4)      \
	vpand		x0,   x2, x2; \
	vpxor		x4,   x0, x0; \
	vpxor		x3,   x4, x4; \
	vpand		x0,   x3, x3; \
	vpxor		x1,   x4, x4; \
	vpxor		x4,   x2, x2; \
	vpxor		x1,   x3, x3; \
	vpor		x0,   x4, x4; \
	vpxor		x1,   x4, x4;

/*
 * Inverse S-boxes (SI0..SI7), used by __serpent_dec_blk16.  Same
 * conventions as the forward S-boxes: two halves, tp shared between
 * them, results left in a permuted register order that the call sites
 * track.
 */
#define SI0_1(x0, x1, x2, x3, x4)     \
	vpxor		x0,   x1, x1; \
	vpor		x1,   x3, tp; \
	vpxor		x1,   x3, x4; \
	vpxor		RNOT, x0, x0; \
	vpxor		tp,   x2, x2; \
	vpxor		x0,   tp, x3; \
	vpand		x1,   x0, x0; \
	vpxor		x2,   x0, x0;
#define SI0_2(x0, x1, x2, x3, x4)     \
	vpand		x3,   x2, x2; \
	vpxor		x4,   x3, x3; \
	vpxor		x3,   x2, x2; \
	vpxor		x3,   x1, x1; \
	vpand		x0,   x3, x3; \
	vpxor		x0,   x1, x1; \
	vpxor		x2,   x0, x0; \
	vpxor		x3,   x4, x4;

#define SI1_1(x0, x1, x2, x3, x4)     \
	vpxor		x3,   x1, x1; \
	vpxor		x2,   x0, tp; \
	vpxor		RNOT, x2, x2; \
	vpor		x1,   x0, x4; \
	vpxor		x3,   x4, x4; \
	vpand		x1,   x3, x3; \
	vpxor		x2,   x1, x1; \
	vpand		x4,   x2, x2;
#define SI1_2(x0, x1, x2, x3, x4)     \
	vpxor		x1,   x4, x4; \
	vpor		x3,   x1, x1; \
	vpxor		tp,   x3, x3; \
	vpxor		tp,   x2, x2; \
	vpor		x4,   tp, x0; \
	vpxor		x4,   x2, x2; \
	vpxor		x0,   x1, x1; \
	vpxor		x1,   x4, x4;

#define SI2_1(x0, x1, x2, x3, x4)     \
	vpxor		x1,   x2, x2; \
	vpxor		RNOT, x3, tp; \
	vpor		x2,   tp, tp; \
	vpxor		x3,   x2, x2; \
	vpxor		x0,   x3, x4; \
	vpxor		x1,   tp, x3; \
	vpor		x2,   x1, x1; \
	vpxor		x0,   x2, x2;
#define SI2_2(x0, x1, x2, x3, x4)     \
	vpxor		x4,   x1, x1; \
	vpor		x3,   x4, x4; \
	vpxor		x3,   x2, x2; \
	vpxor		x2,   x4, x4; \
	vpand		x1,   x2, x2; \
	vpxor		x3,   x2, x2; \
	vpxor		x4,   x3, x3; \
	vpxor		x0,   x4, x4;

#define SI3_1(x0, x1, x2, x3, x4)     \
	vpxor		x1,   x2, x2; \
	vpand		x2,   x1, tp; \
	vpxor		x0,   tp, tp; \
	vpor		x1,   x0, x0; \
	vpxor		x3,   x1, x4; \
	vpxor		x3,   x0, x0; \
	vpor		tp,   x3, x3; \
	vpxor		x2,   tp, x1;
#define SI3_2(x0, x1, x2, x3, x4)     \
	vpxor		x3,   x1, x1; \
	vpxor		x2,   x0, x0; \
	vpxor		x3,   x2, x2; \
	vpand		x1,   x3, x3; \
	vpxor		x0,   x1, x1; \
	vpand		x2,   x0, x0; \
	vpxor		x3,   x4, x4; \
	vpxor		x0,   x3, x3; \
	vpxor		x1,   x0, x0;

#define SI4_1(x0, x1, x2, x3, x4)     \
	vpxor		x3,   x2, x2; \
	vpand		x1,   x0, tp; \
	vpxor		x2,   tp, tp; \
	vpor		x3,   x2, x2; \
	vpxor		RNOT, x0, x4; \
	vpxor		tp,   x1, x1; \
	vpxor		x2,   tp, x0; \
	vpand		x4,   x2, x2;
#define SI4_2(x0, x1, x2, x3, x4)     \
	vpxor		x0,   x2, x2; \
	vpor		x4,   x0, x0; \
	vpxor		x3,   x0, x0; \
	vpand		x2,   x3, x3; \
	vpxor		x3,   x4, x4; \
	vpxor		x1,   x3, x3; \
	vpand		x0,   x1, x1; \
	vpxor		x1,   x4, x4; \
	vpxor		x3,   x0, x0;

#define SI5_1(x0, x1, x2, x3, x4)     \
	vpor		x2,   x1, tp; \
	vpxor		x1,   x2, x2; \
	vpxor		x3,   tp, tp; \
	vpand		x1,   x3, x3; \
	vpxor		x3,   x2, x2; \
	vpor		x0,   x3, x3; \
	vpxor		RNOT, x0, x0; \
	vpxor		x2,   x3, x3; \
	vpor		x0,   x2, x2;
#define SI5_2(x0, x1, x2, x3, x4)     \
	vpxor		tp,   x1, x4; \
	vpxor		x4,   x2, x2; \
	vpand		x0,   x4, x4; \
	vpxor		tp,   x0, x0; \
	vpxor		x3,   tp, x1; \
	vpand		x2,   x0, x0; \
	vpxor		x3,   x2, x2; \
	vpxor		x2,   x0, x0; \
	vpxor		x4,   x2, x2; \
	vpxor		x3,   x4, x4;

#define SI6_1(x0, x1, x2, x3, x4)     \
	vpxor		x2,   x0, x0; \
	vpand		x3,   x0, tp; \
	vpxor		x3,   x2, x2; \
	vpxor		x2,   tp, tp; \
	vpxor		x1,   x3, x3; \
	vpor		x0,   x2, x2; \
	vpxor		x3,   x2, x2; \
	vpand		tp,   x3, x3;
#define SI6_2(x0, x1, x2, x3, x4)     \
	vpxor		RNOT, tp, tp; \
	vpxor		x1,   x3, x3; \
	vpand		x2,   x1, x1; \
	vpxor		tp,   x0, x4; \
	vpxor		x4,   x3, x3; \
	vpxor		x2,   x4, x4; \
	vpxor		x1,   tp, x0; \
	vpxor		x0,   x2, x2;

#define SI7_1(x0, x1, x2, x3, x4)     \
	vpand		x0,   x3, tp; \
	vpxor		x2,   x0, x0; \
	vpor		x3,   x2, x2; \
	vpxor		x1,   x3, x4; \
	vpxor		RNOT, x0, x0; \
	vpor		tp,   x1, x1; \
	vpxor		x0,   x4, x4; \
	vpand		x2,   x0, x0; \
	vpxor		x1,   x0, x0;
#define SI7_2(x0, x1, x2, x3, x4)     \
	vpand		x2,   x1, x1; \
	vpxor		x2,   tp, x3; \
	vpxor		x3,   x4, x4; \
	vpand		x3,   x2, x2; \
	vpor		x0,   x3, x3; \
	vpxor		x4,   x1, x1; \
	vpxor		x4,   x3, x3; \
	vpand		x0,   x4, x4; \
	vpxor		x2,   x4, x4;

/*
 * get_key(i, j, t): broadcast the 32-bit word at byte offset (4*i + j)*4
 * from CTX into every 32-bit element of t, i.e. word j of the four-word
 * round key i.
 */
#define get_key(i,j,t) \
	vpbroadcastd (4*(i)+(j))*4(CTX), t;

/*
 * K2: load the four words of round key i into RK0..RK3 and XOR them into
 * x0..x3 of both register sets.  x4 is accepted for signature symmetry
 * with the other round macros and is not touched.
 */
#define K2(x0, x1, x2, x3, x4, i) \
	get_key(i, 0, RK0); \
	get_key(i, 1, RK1); \
	get_key(i, 2, RK2); \
	get_key(i, 3, RK3); \
	vpxor RK0,	x0 ## 1, x0 ## 1; \
	vpxor RK1,	x1 ## 1, x1 ## 1; \
	vpxor RK2,	x2 ## 1, x2 ## 1; \
	vpxor RK3,	x3 ## 1, x3 ## 1; \
		vpxor RK0,	x0 ## 2, x0 ## 2; \
		vpxor RK1,	x1 ## 2, x1 ## 2; \
		vpxor RK2,	x2 ## 2, x2 ## 2; \
		vpxor RK3,	x3 ## 2, x3 ## 2;

/*
 * LK2: forward linear mixing followed by the XOR with round key i, applied
 * to both register sets (set 2 is shown with an extra indent).
 *
 * Rotations are built from a shift pair plus OR:
 *   vpslld $n / vpsrld $(32 - n) / vpor  ==  rotate left by n.
 * Sequence: x0 <<<= 13, x2 <<<= 3, x1 ^= x0 ^ x2, x3 ^= x2 ^ (x0 << 3),
 * x1 <<<= 1, x3 <<<= 7, x0 ^= x1 ^ x3, x2 ^= x3 ^ (x1 << 7),
 * x0 <<<= 5, x2 <<<= 22, then x0..x3 ^= RK0..RK3.
 * The get_key() loads are spread between the arithmetic steps; x4 is
 * scratch and is overwritten.
 */
#define LK2(x0, x1, x2, x3, x4, i) \
	vpslld $13,		x0 ## 1, x4 ## 1;          \
	vpsrld $(32 - 13),	x0 ## 1, x0 ## 1;          \
	vpor			x4 ## 1, x0 ## 1, x0 ## 1; \
	vpxor			x0 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $3,		x2 ## 1, x4 ## 1;          \
	vpsrld $(32 - 3),	x2 ## 1, x2 ## 1;          \
	vpor			x4 ## 1, x2 ## 1, x2 ## 1; \
	vpxor			x2 ## 1, x1 ## 1, x1 ## 1; \
		vpslld $13,		x0 ## 2, x4 ## 2;          \
		vpsrld $(32 - 13),	x0 ## 2, x0 ## 2;          \
		vpor			x4 ## 2, x0 ## 2, x0 ## 2; \
		vpxor			x0 ## 2, x1 ## 2, x1 ## 2; \
		vpslld $3,		x2 ## 2, x4 ## 2;          \
		vpsrld $(32 - 3),	x2 ## 2, x2 ## 2;          \
		vpor			x4 ## 2, x2 ## 2, x2 ## 2; \
		vpxor			x2 ## 2, x1 ## 2, x1 ## 2; \
	vpslld $1,		x1 ## 1, x4 ## 1;          \
	vpsrld $(32 - 1),	x1 ## 1, x1 ## 1;          \
	vpor			x4 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $3,		x0 ## 1, x4 ## 1;          \
	vpxor			x2 ## 1, x3 ## 1, x3 ## 1; \
	vpxor			x4 ## 1, x3 ## 1, x3 ## 1; \
	get_key(i, 1, RK1); \
		vpslld $1,		x1 ## 2, x4 ## 2;          \
		vpsrld $(32 - 1),	x1 ## 2, x1 ## 2;          \
		vpor			x4 ## 2, x1 ## 2, x1 ## 2; \
		vpslld $3,		x0 ## 2, x4 ## 2;          \
		vpxor			x2 ## 2, x3 ## 2, x3 ## 2; \
		vpxor			x4 ## 2, x3 ## 2, x3 ## 2; \
		get_key(i, 3, RK3); \
	vpslld $7,		x3 ## 1, x4 ## 1;          \
	vpsrld $(32 - 7),	x3 ## 1, x3 ## 1;          \
	vpor			x4 ## 1, x3 ## 1, x3 ## 1; \
	vpslld $7,		x1 ## 1, x4 ## 1;          \
	vpxor			x1 ## 1, x0 ## 1, x0 ## 1; \
	vpxor			x3 ## 1, x0 ## 1, x0 ## 1; \
	vpxor			x3 ## 1, x2 ## 1, x2 ## 1; \
	vpxor			x4 ## 1, x2 ## 1, x2 ## 1; \
	get_key(i, 0, RK0); \
		vpslld $7,		x3 ## 2, x4 ## 2;          \
		vpsrld $(32 - 7),	x3 ## 2, x3 ## 2;          \
		vpor			x4 ## 2, x3 ## 2, x3 ## 2; \
		vpslld $7,		x1 ## 2, x4 ## 2;          \
		vpxor			x1 ## 2, x0 ## 2, x0 ## 2; \
		vpxor			x3 ## 2, x0 ## 2, x0 ## 2; \
		vpxor			x3 ## 2, x2 ## 2, x2 ## 2; \
		vpxor			x4 ## 2, x2 ## 2, x2 ## 2; \
		get_key(i, 2, RK2); \
	vpxor			RK1, x1 ## 1, x1 ## 1;     \
	vpxor			RK3, x3 ## 1, x3 ## 1;     \
	vpslld $5,		x0 ## 1, x4 ## 1;          \
	vpsrld $(32 - 5),	x0 ## 1, x0 ## 1;          \
	vpor			x4 ## 1, x0 ## 1, x0 ## 1; \
	vpslld $22,		x2 ## 1, x4 ## 1;          \
	vpsrld $(32 - 22),	x2 ## 1, x2 ## 1;          \
	vpor			x4 ## 1, x2 ## 1, x2 ## 1; \
	vpxor			RK0, x0 ## 1, x0 ## 1;     \
	vpxor			RK2, x2 ## 1, x2 ## 1;     \
		vpxor			RK1, x1 ## 2, x1 ## 2;     \
		vpxor			RK3, x3 ## 2, x3 ## 2;     \
		vpslld $5,		x0 ## 2, x4 ## 2;          \
		vpsrld $(32 - 5),	x0 ## 2, x0 ## 2;          \
		vpor			x4 ## 2, x0 ## 2, x0 ## 2; \
		vpslld $22,		x2 ## 2, x4 ## 2;          \
		vpsrld $(32 - 22),	x2 ## 2, x2 ## 2;          \
		vpor			x4 ## 2, x2 ## 2, x2 ## 2; \
		vpxor			RK0, x0 ## 2, x0 ## 2;     \
		vpxor			RK2, x2 ## 2, x2 ## 2;

/*
 * KL2: XOR with the round key already present in RK0..RK3, followed by the
 * inverse of the LK2 mixing (right rotations by 5, 22, 1, 7, 13 and 3,
 * each built as vpsrld $n / vpslld $(32 - n) / vpor).
 *
 * This macro does not load any key material itself and its "i" argument
 * is not used in the body: the preceding SP(..., i) invocation is what
 * fills RK0..RK3.  x4 is scratch and is overwritten.
 */
#define KL2(x0, x1, x2, x3, x4, i) \
	vpxor			RK0, x0 ## 1, x0 ## 1;     \
	vpxor			RK2, x2 ## 1, x2 ## 1;     \
	vpsrld $5,		x0 ## 1, x4 ## 1;          \
	vpslld $(32 - 5),	x0 ## 1, x0 ## 1;          \
	vpor			x4 ## 1, x0 ## 1, x0 ## 1; \
	vpxor			RK3, x3 ## 1, x3 ## 1;     \
	vpxor			RK1, x1 ## 1, x1 ## 1;     \
	vpsrld $22,		x2 ## 1, x4 ## 1;          \
	vpslld $(32 - 22),	x2 ## 1, x2 ## 1;          \
	vpor			x4 ## 1, x2 ## 1, x2 ## 1; \
	vpxor			x3 ## 1, x2 ## 1, x2 ## 1; \
		vpxor			RK0, x0 ## 2, x0 ## 2;     \
		vpxor			RK2, x2 ## 2, x2 ## 2;     \
		vpsrld $5,		x0 ## 2, x4 ## 2;          \
		vpslld $(32 - 5),	x0 ## 2, x0 ## 2;          \
		vpor			x4 ## 2, x0 ## 2, x0 ## 2; \
		vpxor			RK3, x3 ## 2, x3 ## 2;     \
		vpxor			RK1, x1 ## 2, x1 ## 2;     \
		vpsrld $22,		x2 ## 2, x4 ## 2;          \
		vpslld $(32 - 22),	x2 ## 2, x2 ## 2;          \
		vpor			x4 ## 2, x2 ## 2, x2 ## 2; \
		vpxor			x3 ## 2, x2 ## 2, x2 ## 2; \
	vpxor			x3 ## 1, x0 ## 1, x0 ## 1; \
	vpslld $7,		x1 ## 1, x4 ## 1;          \
	vpxor			x1 ## 1, x0 ## 1, x0 ## 1; \
	vpxor			x4 ## 1, x2 ## 1, x2 ## 1; \
	vpsrld $1,		x1 ## 1, x4 ## 1;          \
	vpslld $(32 - 1),	x1 ## 1, x1 ## 1;          \
	vpor			x4 ## 1, x1 ## 1, x1 ## 1; \
		vpxor			x3 ## 2, x0 ## 2, x0 ## 2; \
		vpslld $7,		x1 ## 2, x4 ## 2;          \
		vpxor			x1 ## 2, x0 ## 2, x0 ## 2; \
		vpxor			x4 ## 2, x2 ## 2, x2 ## 2; \
		vpsrld $1,		x1 ## 2, x4 ## 2;          \
		vpslld $(32 - 1),	x1 ## 2, x1 ## 2;          \
		vpor			x4 ## 2, x1 ## 2, x1 ## 2; \
	vpsrld $7,		x3 ## 1, x4 ## 1;          \
	vpslld $(32 - 7),	x3 ## 1, x3 ## 1;          \
	vpor			x4 ## 1, x3 ## 1, x3 ## 1; \
	vpxor			x0 ## 1, x1 ## 1, x1 ## 1; \
	vpslld $3,		x0 ## 1, x4 ## 1;          \
	vpxor			x4 ## 1, x3 ## 1, x3 ## 1; \
		vpsrld $7,		x3 ## 2, x4 ## 2;          \
		vpslld $(32 - 7),	x3 ## 2, x3 ## 2;          \
		vpor			x4 ## 2, x3 ## 2, x3 ## 2; \
		vpxor			x0 ## 2, x1 ## 2, x1 ## 2; \
		vpslld $3,		x0 ## 2, x4 ## 2;          \
		vpxor			x4 ## 2, x3 ## 2, x3 ## 2; \
	vpsrld $13,		x0 ## 1, x4 ## 1;          \
	vpslld $(32 - 13),	x0 ## 1, x0 ## 1;          \
	vpor			x4 ## 1, x0 ## 1, x0 ## 1; \
	vpxor			x2 ## 1, x1 ## 1, x1 ## 1; \
	vpxor			x2 ## 1, x3 ## 1, x3 ## 1; \
	vpsrld $3,		x2 ## 1, x4 ## 1;          \
	vpslld $(32 - 3),	x2 ## 1, x2 ## 1;          \
	vpor			x4 ## 1, x2 ## 1, x2 ## 1; \
		vpsrld $13,		x0 ## 2, x4 ## 2;          \
		vpslld $(32 - 13),	x0 ## 2, x0 ## 2;          \
		vpor			x4 ## 2, x0 ## 2, x0 ## 2; \
		vpxor			x2 ## 2, x1 ## 2, x1 ## 2; \
		vpxor			x2 ## 2, x3 ## 2, x3 ## 2; \
		vpsrld $3,		x2 ## 2, x4 ## 2;          \
		vpslld $(32 - 3),	x2 ## 2, x2 ## 2;          \
		vpor			x4 ## 2, x2 ## 2, x2 ## 2;

/*
 * S: run both halves of S-box SBOX on register set 1, then on register
 * set 2.
 */
#define S(SBOX, x0, x1, x2, x3, x4) \
	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);

/*
 * SP: same S-box work as S(), with the four get_key() loads for round key
 * i placed between the S-box halves so that RK0..RK3 are ready for the
 * KL2 that follows.
 */
#define SP(SBOX, x0, x1, x2, x3, x4, i) \
	get_key(i, 0, RK0); \
	SBOX ## _1(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	get_key(i, 2, RK2); \
	SBOX ## _2(x0 ## 1, x1 ## 1, x2 ## 1, x3 ## 1, x4 ## 1); \
	get_key(i, 3, RK3); \
	SBOX ## _1(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2); \
	get_key(i, 1, RK1); \
	SBOX ## _2(x0 ## 2, x1 ## 2, x2 ## 2, x3 ## 2, x4 ## 2);

/*
 * transpose_4x4: transpose a 4x4 matrix of 32-bit words held one row per
 * register in x0..x3.  The unpack instructions work on each 128-bit lane
 * separately, so the two lanes of every ymm register are transposed
 * independently.  t0..t2 are scratch; x3 doubles as the fourth temporary.
 */
#define transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	vpunpckldq		x1, x0, t0; \
	vpunpckhdq		x1, x0, t2; \
	vpunpckldq		x3, x2, t1; \
	vpunpckhdq		x3, x2, x3; \
	\
	vpunpcklqdq		t1, t0, x0; \
	vpunpckhqdq		t1, t0, x1; \
	vpunpcklqdq		x3, t2, x2; \
	vpunpckhqdq		x3, t2, x3;

/* Convert between the in-memory layout and the sliced working layout. */
#define read_blocks(x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

#define write_blocks(x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

SYM_FUNC_START_LOCAL(__serpent_enc_blk16)
	/* input:
	 *	%rdi: ctx, CTX
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: plaintext
	 * output:
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: ciphertext
	 *
	 * Clobbers RNOT, tp, RE1, RE2 and RK0..RK3.  The register order
	 * passed to each step follows the permutation left behind by the
	 * previous S-box, so the argument lists must not be reordered.
	 */

	/* RNOT = all-ones, used by the S-boxes for bitwise NOT */
	vpcmpeqd RNOT, RNOT, RNOT;

	/* RK0..RK2 serve as transpose scratch before any key is loaded */
	read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	/* initial key XOR, then 32 rounds cycling through S0..S7 */
						 K2(RA, RB, RC, RD, RE, 0);
	S(S0, RA, RB, RC, RD, RE);		LK2(RC, RB, RD, RA, RE, 1);
	S(S1, RC, RB, RD, RA, RE);		LK2(RE, RD, RA, RC, RB, 2);
	S(S2, RE, RD, RA, RC, RB);		LK2(RB, RD, RE, RC, RA, 3);
	S(S3, RB, RD, RE, RC, RA);		LK2(RC, RA, RD, RB, RE, 4);
	S(S4, RC, RA, RD, RB, RE);		LK2(RA, RD, RB, RE, RC, 5);
	S(S5, RA, RD, RB, RE, RC);		LK2(RC, RA, RD, RE, RB, 6);
	S(S6, RC, RA, RD, RE, RB);		LK2(RD, RB, RA, RE, RC, 7);
	S(S7, RD, RB, RA, RE, RC);		LK2(RC, RA, RE, RD, RB, 8);
	S(S0, RC, RA, RE, RD, RB);		LK2(RE, RA, RD, RC, RB, 9);
	S(S1, RE, RA, RD, RC, RB);		LK2(RB, RD, RC, RE, RA, 10);
	S(S2, RB, RD, RC, RE, RA);		LK2(RA, RD, RB, RE, RC, 11);
	S(S3, RA, RD, RB, RE, RC);		LK2(RE, RC, RD, RA, RB, 12);
	S(S4, RE, RC, RD, RA, RB);		LK2(RC, RD, RA, RB, RE, 13);
	S(S5, RC, RD, RA, RB, RE);		LK2(RE, RC, RD, RB, RA, 14);
	S(S6, RE, RC, RD, RB, RA);		LK2(RD, RA, RC, RB, RE, 15);
	S(S7, RD, RA, RC, RB, RE);		LK2(RE, RC, RB, RD, RA, 16);
	S(S0, RE, RC, RB, RD, RA);		LK2(RB, RC, RD, RE, RA, 17);
	S(S1, RB, RC, RD, RE, RA);		LK2(RA, RD, RE, RB, RC, 18);
	S(S2, RA, RD, RE, RB, RC);		LK2(RC, RD, RA, RB, RE, 19);
	S(S3, RC, RD, RA, RB, RE);		LK2(RB, RE, RD, RC, RA, 20);
	S(S4, RB, RE, RD, RC, RA);		LK2(RE, RD, RC, RA, RB, 21);
	S(S5, RE, RD, RC, RA, RB);		LK2(RB, RE, RD, RA, RC, 22);
	S(S6, RB, RE, RD, RA, RC);		LK2(RD, RC, RE, RA, RB, 23);
	S(S7, RD, RC, RE, RA, RB);		LK2(RB, RE, RA, RD, RC, 24);
	S(S0, RB, RE, RA, RD, RC);		LK2(RA, RE, RD, RB, RC, 25);
	S(S1, RA, RE, RD, RB, RC);		LK2(RC, RD, RB, RA, RE, 26);
	S(S2, RC, RD, RB, RA, RE);		LK2(RE, RD, RC, RA, RB, 27);
	S(S3, RE, RD, RC, RA, RB);		LK2(RA, RB, RD, RE, RC, 28);
	S(S4, RA, RB, RD, RE, RC);		LK2(RB, RD, RE, RC, RA, 29);
	S(S5, RB, RD, RE, RC, RA);		LK2(RA, RB, RD, RC, RE, 30);
	S(S6, RA, RB, RD, RC, RE);		LK2(RD, RE, RB, RC, RA, 31);
	/* last round: S-box followed by a plain key XOR, no linear mixing */
	S(S7, RD, RE, RB, RC, RA);		 K2(RA, RB, RC, RD, RE, 32);

	write_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	write_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	RET;
SYM_FUNC_END(__serpent_enc_blk16)

SYM_FUNC_START_LOCAL(__serpent_dec_blk16)
	/* input:
	 *	%rdi: ctx, CTX
	 *	RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2: ciphertext
	 * output:
	 *	RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2: plaintext
	 *
	 * Note that the result is returned in a different set of registers
	 * than the input arrived in.  Clobbers RNOT, tp, RA1, RA2 and
	 * RK0..RK3.
	 */

	/* RNOT = all-ones, used by the S-boxes for bitwise NOT */
	vpcmpeqd RNOT, RNOT, RNOT;

	/* RK0..RK2 serve as transpose scratch before any key is loaded */
	read_blocks(RA1, RB1, RC1, RD1, RK0, RK1, RK2);
	read_blocks(RA2, RB2, RC2, RD2, RK0, RK1, RK2);

	/*
	 * Undo the final key XOR, then walk the rounds backwards.  Each
	 * SP() loads the round key that the KL2() on the same line consumes.
	 */
						 K2(RA, RB, RC, RD, RE, 32);
	SP(SI7, RA, RB, RC, RD, RE, 31);	KL2(RB, RD, RA, RE, RC, 31);
	SP(SI6, RB, RD, RA, RE, RC, 30);	KL2(RA, RC, RE, RB, RD, 30);
	SP(SI5, RA, RC, RE, RB, RD, 29);	KL2(RC, RD, RA, RE, RB, 29);
	SP(SI4, RC, RD, RA, RE, RB, 28);	KL2(RC, RA, RB, RE, RD, 28);
	SP(SI3, RC, RA, RB, RE, RD, 27);	KL2(RB, RC, RD, RE, RA, 27);
	SP(SI2, RB, RC, RD, RE, RA, 26);	KL2(RC, RA, RE, RD, RB, 26);
	SP(SI1, RC, RA, RE, RD, RB, 25);	KL2(RB, RA, RE, RD, RC, 25);
	SP(SI0, RB, RA, RE, RD, RC, 24);	KL2(RE, RC, RA, RB, RD, 24);
	SP(SI7, RE, RC, RA, RB, RD, 23);	KL2(RC, RB, RE, RD, RA, 23);
	SP(SI6, RC, RB, RE, RD, RA, 22);	KL2(RE, RA, RD, RC, RB, 22);
	SP(SI5, RE, RA, RD, RC, RB, 21);	KL2(RA, RB, RE, RD, RC, 21);
	SP(SI4, RA, RB, RE, RD, RC, 20);	KL2(RA, RE, RC, RD, RB, 20);
	SP(SI3, RA, RE, RC, RD, RB, 19);	KL2(RC, RA, RB, RD, RE, 19);
	SP(SI2, RC, RA, RB, RD, RE, 18);	KL2(RA, RE, RD, RB, RC, 18);
	SP(SI1, RA, RE, RD, RB, RC, 17);	KL2(RC, RE, RD, RB, RA, 17);
	SP(SI0, RC, RE, RD, RB, RA, 16);	KL2(RD, RA, RE, RC, RB, 16);
	SP(SI7, RD, RA, RE, RC, RB, 15);	KL2(RA, RC, RD, RB, RE, 15);
	SP(SI6, RA, RC, RD, RB, RE, 14);	KL2(RD, RE, RB, RA, RC, 14);
	SP(SI5, RD, RE, RB, RA, RC, 13);	KL2(RE, RC, RD, RB, RA, 13);
	SP(SI4, RE, RC, RD, RB, RA, 12);	KL2(RE, RD, RA, RB, RC, 12);
	SP(SI3, RE, RD, RA, RB, RC, 11);	KL2(RA, RE, RC, RB, RD, 11);
	SP(SI2, RA, RE, RC, RB, RD, 10);	KL2(RE, RD, RB, RC, RA, 10);
	SP(SI1, RE, RD, RB, RC, RA, 9);		KL2(RA, RD, RB, RC, RE, 9);
	SP(SI0, RA, RD, RB, RC, RE, 8);		KL2(RB, RE, RD, RA, RC, 8);
	SP(SI7, RB, RE, RD, RA, RC, 7);		KL2(RE, RA, RB, RC, RD, 7);
	SP(SI6, RE, RA, RB, RC, RD, 6);		KL2(RB, RD, RC, RE, RA, 6);
	SP(SI5, RB, RD, RC, RE, RA, 5);		KL2(RD, RA, RB, RC, RE, 5);
	SP(SI4, RD, RA, RB, RC, RE, 4);		KL2(RD, RB, RE, RC, RA, 4);
	SP(SI3, RD, RB, RE, RC, RA, 3);		KL2(RE, RD, RA, RC, RB, 3);
	SP(SI2, RE, RD, RA, RC, RB, 2);		KL2(RD, RB, RC, RA, RE, 2);
	SP(SI1, RD, RB, RC, RA, RE, 1);		KL2(RE, RB, RC, RA, RD, 1);
	/* first round undone last: inverse S-box plus key 0, K2 loads it */
	S(SI0, RE, RB, RC, RA, RD);		 K2(RC, RD, RB, RE, RA, 0);

	write_blocks(RC1, RD1, RB1, RE1, RK0, RK1, RK2);
	write_blocks(RC2, RD2, RB2, RE2, RK0, RK1, RK2);

	RET;
SYM_FUNC_END(__serpent_dec_blk16)

SYM_FUNC_START(serpent_ecb_enc_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *
	 * Encrypts the data at src and stores the result at dst.
	 * load_16way/store_16way come from glue_helper-asm-avx2.S;
	 * NOTE(review): presumably they move sixteen 16-byte blocks through
	 * the eight ymm registers listed - confirm in that helper.
	 */
	FRAME_BEGIN

	/* zero the upper 128 bits of all ymm registers before AVX2 work */
	vzeroupper;

	load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __serpent_enc_blk16;

	store_16way(%rsi, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	vzeroupper;

	FRAME_END
	RET;
SYM_FUNC_END(serpent_ecb_enc_16way)

SYM_FUNC_START(serpent_ecb_dec_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *
	 * Decrypts the data at src and stores the result at dst.  The store
	 * uses RC, RD, RB, RE because that is where __serpent_dec_blk16
	 * leaves the plaintext.
	 */
	FRAME_BEGIN

	/* zero the upper 128 bits of all ymm registers before AVX2 work */
	vzeroupper;

	load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __serpent_dec_blk16;

	store_16way(%rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2);

	vzeroupper;

	FRAME_END
	RET;
SYM_FUNC_END(serpent_ecb_dec_16way)

SYM_FUNC_START(serpent_cbc_dec_16way)
	/* input:
	 *	%rdi: ctx, CTX
	 *	%rsi: dst
	 *	%rdx: src
	 *
	 * Block-decrypts the data at src, then hands both src and dst to
	 * store_cbc_16way (from glue_helper-asm-avx2.S) with RK0 as scratch.
	 * NOTE(review): presumably that helper XORs each decrypted block
	 * with the preceding ciphertext block read from src and leaves the
	 * IV handling of the first block to the caller - confirm there.
	 */
	FRAME_BEGIN

	/* zero the upper 128 bits of all ymm registers before AVX2 work */
	vzeroupper;

	load_16way(%rdx, RA1, RB1, RC1, RD1, RA2, RB2, RC2, RD2);

	call __serpent_dec_blk16;

	store_cbc_16way(%rdx, %rsi, RC1, RD1, RB1, RE1, RC2, RD2, RB2, RE2,
			RK0);

	vzeroupper;

	FRAME_END
	RET;
SYM_FUNC_END(serpent_cbc_dec_16way)