/*
 * Serpent Cipher 4-way parallel algorithm (i586/SSE2)
 *
 * Copyright (C) 2011 Jussi Kivilinna <jussi.kivilinna@mbnet.fi>
 *
 * Based on crypto/serpent.c by
 *  Copyright (C) 2002 Dag Arne Osvik <osvik@ii.uib.no>
 *                2003 Herbert Valerio Riedel <hvr@gnu.org>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307
 * USA
 *
 */

/*
 * Syntax: GNU as, AT&T operand order (source, destination), preprocessed
 * by cpp.  Target: 32-bit x86 with SSE2.
 *
 * Both entry points read their arguments from the stack, relative to %esp
 * as it is on entry (return address at 0(%esp)).  They use %eax, %edx and
 * %xmm0-%xmm7 as scratch and never adjust %esp.
 */

.file "serpent-sse2-i586-asm_32.S"
.text

/* Byte offsets of the stack arguments from %esp at function entry. */
#define arg_ctx 4
#define arg_dst 8
#define arg_src 12
#define arg_xor 16

/**********************************************************************
  4-way SSE2 serpent
 **********************************************************************/

/* Pointer to the key context; round key words are read from offset 0. */
#define CTX %edx

/*
 * Five working registers.  After read_blocks, each of RA..RD holds one
 * 32-bit word position of all four blocks (one block per dword lane).
 * RE is the fifth register the S-box macros need as scratch/output.
 */
#define RA %xmm0
#define RB %xmm1
#define RC %xmm2
#define RD %xmm3
#define RE %xmm4

/* Temporaries used by the key-mixing and transpose macros. */
#define RT0 %xmm5
#define RT1 %xmm6

/*
 * All-ones constant, set up with pcmpeqd at the start of each function.
 * "pxor RNOT, x" is therefore a bitwise NOT of x.
 */
#define RNOT %xmm7

/*
 * get_key(i, j, t): load 32-bit word j of round key i, i.e. the word at
 * byte offset (4*i + j)*4 from CTX, and replicate it into all four dword
 * lanes of t.
 */
#define get_key(i, j, t) \
	movd (4*(i)+(j))*4(CTX), t; \
	pshufd $0, t, t;

/*
 * K(x0..x3, x4, i): XOR the four words of round key i into x0..x3.
 * Clobbers x4, RT0 and RT1.
 */
#define K(x0, x1, x2, x3, x4, i) \
	get_key(i, 0, x4); \
	get_key(i, 1, RT0); \
	get_key(i, 2, RT1); \
	pxor x4,		x0; \
	pxor RT0,		x1; \
	pxor RT1,		x2; \
	get_key(i, 3, x4); \
	pxor x4,		x3;

/*
 * LK(x0..x3, x4, i): linear mixing step followed by XOR with round key i.
 * Each rotate is built from a shift-left, a shift-right and an OR because
 * the code uses no packed rotate instruction.  In order:
 *
 *	x0 = rol(x0, 13);  x1 ^= x0;
 *	x2 = rol(x2, 3);   x1 ^= x2;
 *	x1 = rol(x1, 1);
 *	x3 ^= x2 ^ (x0 << 3);   x3 = rol(x3, 7);
 *	x0 ^= x1 ^ x3;          x2 ^= x3 ^ (x1 << 7);
 *	x0 = rol(x0, 5);        x2 = rol(x2, 22);
 *	x0..x3 ^= round key i (the key XORs are interleaved with the last
 *	two rotates).
 *
 * Clobbers x4 and RT0.
 */
#define LK(x0, x1, x2, x3, x4, i) \
	movdqa x0,		x4; \
	pslld $13,		x0; \
	psrld $(32 - 13),	x4; \
	por x4,			x0; \
	pxor x0,		x1; \
	movdqa x2,		x4; \
	pslld $3,		x2; \
	psrld $(32 - 3),	x4; \
	por x4,			x2; \
	pxor x2,		x1; \
	movdqa x1,		x4; \
	pslld $1,		x1; \
	psrld $(32 - 1),	x4; \
	por x4,			x1; \
	movdqa x0,		x4; \
	pslld $3,		x4; \
	pxor x2,		x3; \
	pxor x4,		x3; \
	movdqa x3,		x4; \
	pslld $7,		x3; \
	psrld $(32 - 7),	x4; \
	por x4,			x3; \
	movdqa x1,		x4; \
	pslld $7,		x4; \
	pxor x1,		x0; \
	pxor x3,		x0; \
	pxor x3,		x2; \
	pxor x4,		x2; \
	movdqa x0,		x4; \
	get_key(i, 1, RT0); \
	pxor RT0,		x1; \
	get_key(i, 3, RT0); \
	pxor RT0,		x3; \
	pslld $5,		x0; \
	psrld $(32 - 5),	x4; \
	por x4,			x0; \
	movdqa x2,		x4; \
	pslld $22,		x2; \
	psrld $(32 - 22),	x4; \
	por x4,			x2; \
	get_key(i, 0, RT0); \
	pxor RT0,		x0; \
	get_key(i, 2, RT0); \
	pxor RT0,		x2;

/*
 * KL(x0..x3, x4, i): XOR with round key i, then undo the mixing step that
 * LK applies (same operations in reverse order, rotating right instead of
 * left):
 *
 *	x0..x3 ^= round key i;
 *	x0 = ror(x0, 5);        x2 = ror(x2, 22);
 *	x2 ^= x3 ^ (x1 << 7);   x0 ^= x3 ^ x1;
 *	x1 = ror(x1, 1);        x3 = ror(x3, 7);
 *	x1 ^= x0;               x3 ^= x0 << 3;
 *	x0 = ror(x0, 13);
 *	x1 ^= x2;               x3 ^= x2;
 *	x2 = ror(x2, 3);
 *
 * Clobbers x4, RT0 and RT1 (the latter two through K).
 */
#define KL(x0, x1, x2, x3, x4, i) \
	K(x0, x1, x2, x3, x4, i); \
	movdqa x0,		x4; \
	psrld $5,		x0; \
	pslld $(32 - 5),	x4; \
	por x4,			x0; \
	movdqa x2,		x4; \
	psrld $22,		x2; \
	pslld $(32 - 22),	x4; \
	por x4,			x2; \
	pxor x3,		x2; \
	pxor x3,		x0; \
	movdqa x1,		x4; \
	pslld $7,		x4; \
	pxor x1,		x0; \
	pxor x4,		x2; \
	movdqa x1,		x4; \
	psrld $1,		x1; \
	pslld $(32 - 1),	x4; \
	por x4,			x1; \
	movdqa x3,		x4; \
	psrld $7,		x3; \
	pslld $(32 - 7),	x4; \
	por x4,			x3; \
	pxor x0,		x1; \
	movdqa x0,		x4; \
	pslld $3,		x4; \
	pxor x4,		x3; \
	movdqa x0,		x4; \
	psrld $13,		x0; \
	pslld $(32 - 13),	x4; \
	por x4,			x0; \
	pxor x2,		x1; \
	pxor x2,		x3; \
	movdqa x2,		x4; \
	psrld $3,		x2; \
	pslld $(32 - 3),	x4; \
	por x4,			x2;

/*
 * S0..S7(x0, x1, x2, x3, x4): the eight substitution steps used by the
 * encryption rounds, written as straight-line AND/OR/XOR/NOT sequences
 * over whole registers.  x0..x3 are the inputs; x4 is overwritten.  The
 * four result words are left in a macro-specific selection of the five
 * registers, which is why every call site passes a differently ordered
 * register list to the macro that follows.  Requires RNOT to be all-ones.
 */
#define S0(x0, x1, x2, x3, x4) \
	movdqa x3,		x4; \
	por x0,			x3; \
	pxor x4,		x0; \
	pxor x2,		x4; \
	pxor RNOT,		x4; \
	pxor x1,		x3; \
	pand x0,		x1; \
	pxor x4,		x1; \
	pxor x0,		x2; \
	pxor x3,		x0; \
	por x0,			x4; \
	pxor x2,		x0; \
	pand x1,		x2; \
	pxor x2,		x3; \
	pxor RNOT,		x1; \
	pxor x4,		x2; \
	pxor x2,		x1;

#define S1(x0, x1, x2, x3, x4) \
	movdqa x1,		x4; \
	pxor x0,		x1; \
	pxor x3,		x0; \
	pxor RNOT,		x3; \
	pand x1,		x4; \
	por x1,			x0; \
	pxor x2,		x3; \
	pxor x3,		x0; \
	pxor x3,		x1; \
	pxor x4,		x3; \
	por x4,			x1; \
	pxor x2,		x4; \
	pand x0,		x2; \
	pxor x1,		x2; \
	por x0,			x1; \
	pxor RNOT,		x0; \
	pxor x2,		x0; \
	pxor x1,		x4;

#define S2(x0, x1, x2, x3, x4) \
	pxor RNOT,		x3; \
	pxor x0,		x1; \
	movdqa x0,		x4; \
	pand x2,		x0; \
	pxor x3,		x0; \
	por x4,			x3; \
	pxor x1,		x2; \
	pxor x1,		x3; \
	pand x0,		x1; \
	pxor x2,		x0; \
	pand x3,		x2; \
	por x1,			x3; \
	pxor RNOT,		x0; \
	pxor x0,		x3; \
	pxor x0,		x4; \
	pxor x2,		x0; \
	por x2,			x1;

#define S3(x0, x1, x2, x3, x4) \
	movdqa x1,		x4; \
	pxor x3,		x1; \
	por x0,			x3; \
	pand x0,		x4; \
	pxor x2,		x0; \
	pxor x1,		x2; \
	pand x3,		x1; \
	pxor x3,		x2; \
	por x4,			x0; \
	pxor x3,		x4; \
	pxor x0,		x1; \
	pand x3,		x0; \
	pand x4,		x3; \
	pxor x2,		x3; \
	por x1,			x4; \
	pand x1,		x2; \
	pxor x3,		x4; \
	pxor x3,		x0; \
	pxor x2,		x3;

#define S4(x0, x1, x2, x3, x4) \
	movdqa x3,		x4; \
	pand x0,		x3; \
	pxor x4,		x0; \
	pxor x2,		x3; \
	por x4,			x2; \
	pxor x1,		x0; \
	pxor x3,		x4; \
	por x0,			x2; \
	pxor x1,		x2; \
	pand x0,		x1; \
	pxor x4,		x1; \
	pand x2,		x4; \
	pxor x3,		x2; \
	pxor x0,		x4; \
	por x1,			x3; \
	pxor RNOT,		x1; \
	pxor x0,		x3;

#define S5(x0, x1, x2, x3, x4) \
	movdqa x1,		x4; \
	por x0,			x1; \
	pxor x1,		x2; \
	pxor RNOT,		x3; \
	pxor x0,		x4; \
	pxor x2,		x0; \
	pand x4,		x1; \
	por x3,			x4; \
	pxor x0,		x4; \
	pand x3,		x0; \
	pxor x3,		x1; \
	pxor x2,		x3; \
	pxor x1,		x0; \
	pand x4,		x2; \
	pxor x2,		x1; \
	pand x0,		x2; \
	pxor x2,		x3;

#define S6(x0, x1, x2, x3, x4) \
	movdqa x1,		x4; \
	pxor x0,		x3; \
	pxor x2,		x1; \
	pxor x0,		x2; \
	pand x3,		x0; \
	por x3,			x1; \
	pxor RNOT,		x4; \
	pxor x1,		x0; \
	pxor x2,		x1; \
	pxor x4,		x3; \
	pxor x0,		x4; \
	pand x0,		x2; \
	pxor x1,		x4; \
	pxor x3,		x2; \
	pand x1,		x3; \
	pxor x0,		x3; \
	pxor x2,		x1;

#define S7(x0, x1, x2, x3, x4) \
	pxor RNOT,		x1; \
	movdqa x1,		x4; \
	pxor RNOT,		x0; \
	pand x2,		x1; \
	pxor x3,		x1; \
	por x4,			x3; \
	pxor x2,		x4; \
	pxor x3,		x2; \
	pxor x0,		x3; \
	por x1,			x0; \
	pand x0,		x2; \
	pxor x4,		x0; \
	pxor x3,		x4; \
	pand x0,		x3; \
	pxor x1,		x4; \
	pxor x4,		x2; \
	pxor x1,		x3; \
	por x0,			x4; \
	pxor x1,		x4;

/*
 * SI0..SI7(x0, x1, x2, x3, x4): the substitution steps used by the
 * decryption rounds (presumably the inverses of S0..S7; they are only
 * referenced from serpent_dec_blk_4way).  Same conventions as S0..S7:
 * x4 is overwritten, results land in a macro-specific selection of the
 * five registers, and RNOT must be all-ones.
 */
#define SI0(x0, x1, x2, x3, x4) \
	movdqa x3,		x4; \
	pxor x0,		x1; \
	por x1,			x3; \
	pxor x1,		x4; \
	pxor RNOT,		x0; \
	pxor x3,		x2; \
	pxor x0,		x3; \
	pand x1,		x0; \
	pxor x2,		x0; \
	pand x3,		x2; \
	pxor x4,		x3; \
	pxor x3,		x2; \
	pxor x3,		x1; \
	pand x0,		x3; \
	pxor x0,		x1; \
	pxor x2,		x0; \
	pxor x3,		x4;

#define SI1(x0, x1, x2, x3, x4) \
	pxor x3,		x1; \
	movdqa x0,		x4; \
	pxor x2,		x0; \
	pxor RNOT,		x2; \
	por x1,			x4; \
	pxor x3,		x4; \
	pand x1,		x3; \
	pxor x2,		x1; \
	pand x4,		x2; \
	pxor x1,		x4; \
	por x3,			x1; \
	pxor x0,		x3; \
	pxor x0,		x2; \
	por x4,			x0; \
	pxor x4,		x2; \
	pxor x0,		x1; \
	pxor x1,		x4;

#define SI2(x0, x1, x2, x3, x4) \
	pxor x1,		x2; \
	movdqa x3,		x4; \
	pxor RNOT,		x3; \
	por x2,			x3; \
	pxor x4,		x2; \
	pxor x0,		x4; \
	pxor x1,		x3; \
	por x2,			x1; \
	pxor x0,		x2; \
	pxor x4,		x1; \
	por x3,			x4; \
	pxor x3,		x2; \
	pxor x2,		x4; \
	pand x1,		x2; \
	pxor x3,		x2; \
	pxor x4,		x3; \
	pxor x0,		x4;

#define SI3(x0, x1, x2, x3, x4) \
	pxor x1,		x2; \
	movdqa x1,		x4; \
	pand x2,		x1; \
	pxor x0,		x1; \
	por x4,			x0; \
	pxor x3,		x4; \
	pxor x3,		x0; \
	por x1,			x3; \
	pxor x2,		x1; \
	pxor x3,		x1; \
	pxor x2,		x0; \
	pxor x3,		x2; \
	pand x1,		x3; \
	pxor x0,		x1; \
	pand x2,		x0; \
	pxor x3,		x4; \
	pxor x0,		x3; \
	pxor x1,		x0;

#define SI4(x0, x1, x2, x3, x4) \
	pxor x3,		x2; \
	movdqa x0,		x4; \
	pand x1,		x0; \
	pxor x2,		x0; \
	por x3,			x2; \
	pxor RNOT,		x4; \
	pxor x0,		x1; \
	pxor x2,		x0; \
	pand x4,		x2; \
	pxor x0,		x2; \
	por x4,			x0; \
	pxor x3,		x0; \
	pand x2,		x3; \
	pxor x3,		x4; \
	pxor x1,		x3; \
	pand x0,		x1; \
	pxor x1,		x4; \
	pxor x3,		x0;

#define SI5(x0, x1, x2, x3, x4) \
	movdqa x1,		x4; \
	por x2,			x1; \
	pxor x4,		x2; \
	pxor x3,		x1; \
	pand x4,		x3; \
	pxor x3,		x2; \
	por x0,			x3; \
	pxor RNOT,		x0; \
	pxor x2,		x3; \
	por x0,			x2; \
	pxor x1,		x4; \
	pxor x4,		x2; \
	pand x0,		x4; \
	pxor x1,		x0; \
	pxor x3,		x1; \
	pand x2,		x0; \
	pxor x3,		x2; \
	pxor x2,		x0; \
	pxor x4,		x2; \
	pxor x3,		x4;

#define SI6(x0, x1, x2, x3, x4) \
	pxor x2,		x0; \
	movdqa x0,		x4; \
	pand x3,		x0; \
	pxor x3,		x2; \
	pxor x2,		x0; \
	pxor x1,		x3; \
	por x4,			x2; \
	pxor x3,		x2; \
	pand x0,		x3; \
	pxor RNOT,		x0; \
	pxor x1,		x3; \
	pand x2,		x1; \
	pxor x0,		x4; \
	pxor x4,		x3; \
	pxor x2,		x4; \
	pxor x1,		x0; \
	pxor x0,		x2;

#define SI7(x0, x1, x2, x3, x4) \
	movdqa x3,		x4; \
	pand x0,		x3; \
	pxor x2,		x0; \
	por x4,			x2; \
	pxor x1,		x4; \
	pxor RNOT,		x0; \
	por x3,			x1; \
	pxor x0,		x4; \
	pand x2,		x0; \
	pxor x1,		x0; \
	pand x2,		x1; \
	pxor x2,		x3; \
	pxor x3,		x4; \
	pand x3,		x2; \
	por x0,			x3; \
	pxor x4,		x1; \
	pxor x4,		x3; \
	pand x0,		x4; \
	pxor x2,		x4;

/*
 * transpose_4x4(x0..x3, t1, t2, t3): treat x0..x3 as the rows of a 4x4
 * matrix of 32-bit words and transpose it in place, so that afterwards
 * lane k of xN holds what lane N of xk held before.  Clobbers t1, t2, t3.
 */
#define transpose_4x4(x0, x1, x2, x3, t1, t2, t3) \
	movdqa x2,		t3; \
	movdqa x0,		t1; \
	unpcklps x3,		t3; \
	movdqa x0,		t2; \
	unpcklps x1,		t1; \
	unpckhps x1,		t2; \
	movdqa t3,		x1; \
	unpckhps x3,		x2; \
	movdqa t1,		x0; \
	movhlps t1,		x1; \
	movdqa t2,		t1; \
	movlhps t3,		x0; \
	movlhps x2,		t1; \
	movhlps t2,		x2; \
	movdqa x2,		x3; \
	movdqa t1,		x2;

/*
 * read_blocks(in, x0..x3, t0, t1, t2): load 4 * 16 bytes from "in" with
 * unaligned loads and transpose them, so that xN holds 32-bit word N of
 * each of the four 16-byte blocks.  Clobbers t0, t1, t2.
 */
#define read_blocks(in, x0, x1, x2, x3, t0, t1, t2) \
	movdqu (0*4*4)(in),	x0; \
	movdqu (1*4*4)(in),	x1; \
	movdqu (2*4*4)(in),	x2; \
	movdqu (3*4*4)(in),	x3; \
	\
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2)

/*
 * write_blocks(out, x0..x3, t0, t1, t2): transpose back to one block per
 * register and store 4 * 16 bytes to "out" with unaligned stores.
 * Clobbers x0..x3 and t0, t1, t2.
 */
#define write_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	movdqu x0,		(0*4*4)(out); \
	movdqu x1,		(1*4*4)(out); \
	movdqu x2,		(2*4*4)(out); \
	movdqu x3,		(3*4*4)(out);

/*
 * xor_blocks(out, x0..x3, t0, t1, t2): like write_blocks, but each
 * 16-byte block is XORed with the 16 bytes already present at "out"
 * before being stored back.  Clobbers x0..x3 and t0, t1, t2.
 */
#define xor_blocks(out, x0, x1, x2, x3, t0, t1, t2) \
	transpose_4x4(x0, x1, x2, x3, t0, t1, t2) \
	\
	movdqu (0*4*4)(out),	t0; \
	pxor t0,		x0; \
	movdqu x0,		(0*4*4)(out); \
	movdqu (1*4*4)(out),	t0; \
	pxor t0,		x1; \
	movdqu x1,		(1*4*4)(out); \
	movdqu (2*4*4)(out),	t0; \
	pxor t0,		x2; \
	movdqu x2,		(2*4*4)(out); \
	movdqu (3*4*4)(out),	t0; \
	pxor t0,		x3; \
	movdqu x3,		(3*4*4)(out);

/*
 * __serpent_enc_blk_4way: encrypt four 16-byte blocks in parallel.
 *
 * Reads 64 bytes from src, applies round keys 0..32 taken from the start
 * of ctx (132 32-bit words), and either stores the 64-byte result to dst
 * or XORs it into the 64 bytes already at dst, depending on the xor flag.
 * src and dst are accessed with unaligned moves.
 *
 * Clobbers: %eax, %edx, %xmm0-%xmm7, flags.
 */
.align 8
.global __serpent_enc_blk_4way
.type   __serpent_enc_blk_4way,@function;

__serpent_enc_blk_4way:
	/* input:
	 *	arg_ctx(%esp): ctx, CTX
	 *	arg_dst(%esp): dst
	 *	arg_src(%esp): src
	 *	arg_xor(%esp): bool, if true: xor output
	 */

	/* RNOT = all-ones, used by the S-box macros as a NOT mask. */
	pcmpeqd RNOT, RNOT;

	movl arg_ctx(%esp), CTX;

	movl arg_src(%esp), %eax;
	read_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);

	/*
	 * Initial key XOR, then S-box + LK for each round.  The register
	 * order passed to each macro follows where the previous S-box left
	 * its outputs; the final round uses a plain key XOR instead of LK.
	 */
					 K(RA, RB, RC, RD, RE, 0);
	S0(RA, RB, RC, RD, RE);		LK(RC, RB, RD, RA, RE, 1);
	S1(RC, RB, RD, RA, RE);		LK(RE, RD, RA, RC, RB, 2);
	S2(RE, RD, RA, RC, RB);		LK(RB, RD, RE, RC, RA, 3);
	S3(RB, RD, RE, RC, RA);		LK(RC, RA, RD, RB, RE, 4);
	S4(RC, RA, RD, RB, RE);		LK(RA, RD, RB, RE, RC, 5);
	S5(RA, RD, RB, RE, RC);		LK(RC, RA, RD, RE, RB, 6);
	S6(RC, RA, RD, RE, RB);		LK(RD, RB, RA, RE, RC, 7);
	S7(RD, RB, RA, RE, RC);		LK(RC, RA, RE, RD, RB, 8);
	S0(RC, RA, RE, RD, RB);		LK(RE, RA, RD, RC, RB, 9);
	S1(RE, RA, RD, RC, RB);		LK(RB, RD, RC, RE, RA, 10);
	S2(RB, RD, RC, RE, RA);		LK(RA, RD, RB, RE, RC, 11);
	S3(RA, RD, RB, RE, RC);		LK(RE, RC, RD, RA, RB, 12);
	S4(RE, RC, RD, RA, RB);		LK(RC, RD, RA, RB, RE, 13);
	S5(RC, RD, RA, RB, RE);		LK(RE, RC, RD, RB, RA, 14);
	S6(RE, RC, RD, RB, RA);		LK(RD, RA, RC, RB, RE, 15);
	S7(RD, RA, RC, RB, RE);		LK(RE, RC, RB, RD, RA, 16);
	S0(RE, RC, RB, RD, RA);		LK(RB, RC, RD, RE, RA, 17);
	S1(RB, RC, RD, RE, RA);		LK(RA, RD, RE, RB, RC, 18);
	S2(RA, RD, RE, RB, RC);		LK(RC, RD, RA, RB, RE, 19);
	S3(RC, RD, RA, RB, RE);		LK(RB, RE, RD, RC, RA, 20);
	S4(RB, RE, RD, RC, RA);		LK(RE, RD, RC, RA, RB, 21);
	S5(RE, RD, RC, RA, RB);		LK(RB, RE, RD, RA, RC, 22);
	S6(RB, RE, RD, RA, RC);		LK(RD, RC, RE, RA, RB, 23);
	S7(RD, RC, RE, RA, RB);		LK(RB, RE, RA, RD, RC, 24);
	S0(RB, RE, RA, RD, RC);		LK(RA, RE, RD, RB, RC, 25);
	S1(RA, RE, RD, RB, RC);		LK(RC, RD, RB, RA, RE, 26);
	S2(RC, RD, RB, RA, RE);		LK(RE, RD, RC, RA, RB, 27);
	S3(RE, RD, RC, RA, RB);		LK(RA, RB, RD, RE, RC, 28);
	S4(RA, RB, RD, RE, RC);		LK(RB, RD, RE, RC, RA, 29);
	S5(RB, RD, RE, RC, RA);		LK(RA, RB, RD, RC, RE, 30);
	S6(RA, RB, RD, RC, RE);		LK(RD, RE, RB, RC, RA, 31);
	S7(RD, RE, RB, RC, RA);		 K(RA, RB, RC, RD, RE, 32);

	movl arg_dst(%esp), %eax;

	/* Only the low byte of the flag argument is tested. */
	cmpb $0, arg_xor(%esp);
	jnz .L__enc_xor4;

	write_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);

	ret;

	/*
	 * Assembler-local label (.L prefix): it is an internal branch target,
	 * so it is kept out of the object's symbol table.
	 */
.L__enc_xor4:
	xor_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);

	ret;

	/* Record the function's extent in the ELF symbol table. */
.size __serpent_enc_blk_4way, .-__serpent_enc_blk_4way

/*
 * serpent_dec_blk_4way: decrypt four 16-byte blocks in parallel.
 *
 * Reads 64 bytes from src, applies round keys 32..0 taken from the start
 * of ctx (132 32-bit words) and stores the 64-byte result to dst.
 * src and dst are accessed with unaligned moves.
 *
 * Clobbers: %eax, %edx, %xmm0-%xmm7.
 */
.align 8
.global serpent_dec_blk_4way
.type   serpent_dec_blk_4way,@function;

serpent_dec_blk_4way:
	/* input:
	 *	arg_ctx(%esp): ctx, CTX
	 *	arg_dst(%esp): dst
	 *	arg_src(%esp): src
	 */

	/* RNOT = all-ones, used by the SI macros as a NOT mask. */
	pcmpeqd RNOT, RNOT;

	movl arg_ctx(%esp), CTX;

	movl arg_src(%esp), %eax;
	read_blocks(%eax, RA, RB, RC, RD, RT0, RT1, RE);

	/*
	 * Round keys are applied from 32 down to 0.  As on the encryption
	 * path, each macro's register order follows where the previous
	 * SI macro left its outputs.
	 */
					 K(RA, RB, RC, RD, RE, 32);
	SI7(RA, RB, RC, RD, RE);	KL(RB, RD, RA, RE, RC, 31);
	SI6(RB, RD, RA, RE, RC);	KL(RA, RC, RE, RB, RD, 30);
	SI5(RA, RC, RE, RB, RD);	KL(RC, RD, RA, RE, RB, 29);
	SI4(RC, RD, RA, RE, RB);	KL(RC, RA, RB, RE, RD, 28);
	SI3(RC, RA, RB, RE, RD);	KL(RB, RC, RD, RE, RA, 27);
	SI2(RB, RC, RD, RE, RA);	KL(RC, RA, RE, RD, RB, 26);
	SI1(RC, RA, RE, RD, RB);	KL(RB, RA, RE, RD, RC, 25);
	SI0(RB, RA, RE, RD, RC);	KL(RE, RC, RA, RB, RD, 24);
	SI7(RE, RC, RA, RB, RD);	KL(RC, RB, RE, RD, RA, 23);
	SI6(RC, RB, RE, RD, RA);	KL(RE, RA, RD, RC, RB, 22);
	SI5(RE, RA, RD, RC, RB);	KL(RA, RB, RE, RD, RC, 21);
	SI4(RA, RB, RE, RD, RC);	KL(RA, RE, RC, RD, RB, 20);
	SI3(RA, RE, RC, RD, RB);	KL(RC, RA, RB, RD, RE, 19);
	SI2(RC, RA, RB, RD, RE);	KL(RA, RE, RD, RB, RC, 18);
	SI1(RA, RE, RD, RB, RC);	KL(RC, RE, RD, RB, RA, 17);
	SI0(RC, RE, RD, RB, RA);	KL(RD, RA, RE, RC, RB, 16);
	SI7(RD, RA, RE, RC, RB);	KL(RA, RC, RD, RB, RE, 15);
	SI6(RA, RC, RD, RB, RE);	KL(RD, RE, RB, RA, RC, 14);
	SI5(RD, RE, RB, RA, RC);	KL(RE, RC, RD, RB, RA, 13);
	SI4(RE, RC, RD, RB, RA);	KL(RE, RD, RA, RB, RC, 12);
	SI3(RE, RD, RA, RB, RC);	KL(RA, RE, RC, RB, RD, 11);
	SI2(RA, RE, RC, RB, RD);	KL(RE, RD, RB, RC, RA, 10);
	SI1(RE, RD, RB, RC, RA);	KL(RA, RD, RB, RC, RE, 9);
	SI0(RA, RD, RB, RC, RE);	KL(RB, RE, RD, RA, RC, 8);
	SI7(RB, RE, RD, RA, RC);	KL(RE, RA, RB, RC, RD, 7);
	SI6(RE, RA, RB, RC, RD);	KL(RB, RD, RC, RE, RA, 6);
	SI5(RB, RD, RC, RE, RA);	KL(RD, RA, RB, RC, RE, 5);
	SI4(RD, RA, RB, RC, RE);	KL(RD, RB, RE, RC, RA, 4);
	SI3(RD, RB, RE, RC, RA);	KL(RE, RD, RA, RC, RB, 3);
	SI2(RE, RD, RA, RC, RB);	KL(RD, RB, RC, RA, RE, 2);
	SI1(RD, RB, RC, RA, RE);	KL(RE, RB, RC, RA, RD, 1);
	SI0(RE, RB, RC, RA, RD);	 K(RC, RD, RB, RE, RA, 0);

	movl arg_dst(%esp), %eax;
	write_blocks(%eax, RC, RD, RB, RE, RT0, RT1, RA);

	ret;

	/* Record the function's extent in the ELF symbol table. */
.size serpent_dec_blk_4way, .-serpent_dec_blk_4way