1 /* 2 * Implementation of the Skein block functions. 3 * Source code author: Doug Whiting, 2008. 4 * This algorithm and source code is released to the public domain. 5 * Compile-time switches: 6 * SKEIN_USE_ASM -- set bits (256/512/1024) to select which 7 * versions use ASM code for block processing 8 * [default: use C for all block sizes] 9 */ 10 /* Copyright 2013 Doug Whiting. This code is released to the public domain. */ 11 12 #include <sys/skein.h> 13 #include "skein_impl.h" 14 #include <sys/isa_defs.h> /* for _ILP32 */ 15 16 #ifndef SKEIN_USE_ASM 17 #define SKEIN_USE_ASM (0) /* default is all C code (no ASM) */ 18 #endif 19 20 #ifndef SKEIN_LOOP 21 /* 22 * The low-level checksum routines use a lot of stack space. On systems where 23 * small stacks frame are enforced (like 32-bit kernel builds), do not unroll 24 * checksum calculations to save stack space. 25 * 26 * Even with no loops unrolled, we still can exceed the 1k stack frame limit 27 * in Skein1024_Process_Block() (it hits 1272 bytes on ARM32). We can 28 * safely ignore it though, since that the checksum functions will be called 29 * from a worker thread that won't be using much stack. That's why we have 30 * the #pragma here to ignore the warning. 31 */ 32 #if defined(_ILP32) || defined(__powerpc) /* Assume small stack */ 33 #if defined(__GNUC__) && !defined(__clang__) 34 #pragma GCC diagnostic ignored "-Wframe-larger-than=" 35 #endif 36 /* 37 * We're running on 32-bit, don't unroll loops to save stack frame space 38 * 39 * Due to the ways the calculations on SKEIN_LOOP are done in 40 * Skein_*_Process_Block(), a value of 111 disables unrolling loops 41 * in any of those functions. 42 */ 43 #define SKEIN_LOOP 111 44 #else 45 /* We're compiling with large stacks */ 46 #define SKEIN_LOOP 001 /* default: unroll 256 and 512, but not 1024 */ 47 #endif 48 #endif 49 50 /* some useful definitions for code here */ 51 #define BLK_BITS (WCNT*64) 52 #define KW_TWK_BASE (0) 53 #define KW_KEY_BASE (3) 54 #define ks (kw + KW_KEY_BASE) 55 #define ts (kw + KW_TWK_BASE) 56 57 /* no debugging in Illumos version */ 58 #define DebugSaveTweak(ctx) 59 60 /* Skein_256 */ 61 #if !(SKEIN_USE_ASM & 256) 62 void 63 Skein_256_Process_Block(Skein_256_Ctxt_t *ctx, const uint8_t *blkPtr, 64 size_t blkCnt, size_t byteCntAdd) 65 { 66 enum { 67 WCNT = SKEIN_256_STATE_WORDS 68 }; 69 #undef RCNT 70 #define RCNT (SKEIN_256_ROUNDS_TOTAL / 8) 71 72 #ifdef SKEIN_LOOP /* configure how much to unroll the loop */ 73 #define SKEIN_UNROLL_256 (((SKEIN_LOOP) / 100) % 10) 74 #else 75 #define SKEIN_UNROLL_256 (0) 76 #endif 77 78 #if SKEIN_UNROLL_256 79 #if (RCNT % SKEIN_UNROLL_256) 80 #error "Invalid SKEIN_UNROLL_256" /* sanity check on unroll count */ 81 #endif 82 size_t r; 83 /* key schedule words : chaining vars + tweak + "rotation" */ 84 uint64_t kw[WCNT + 4 + RCNT * 2]; 85 #else 86 uint64_t kw[WCNT + 4]; /* key schedule words : chaining vars + tweak */ 87 #endif 88 /* local copy of context vars, for speed */ 89 uint64_t X0, X1, X2, X3; 90 uint64_t w[WCNT]; /* local copy of input block */ 91 #ifdef SKEIN_DEBUG 92 /* use for debugging (help compiler put Xn in registers) */ 93 const uint64_t *Xptr[4]; 94 Xptr[0] = &X0; 95 Xptr[1] = &X1; 96 Xptr[2] = &X2; 97 Xptr[3] = &X3; 98 #endif 99 Skein_assert(blkCnt != 0); /* never call with blkCnt == 0! */ 100 ts[0] = ctx->h.T[0]; 101 ts[1] = ctx->h.T[1]; 102 do { 103 /* 104 * this implementation only supports 2**64 input bytes 105 * (no carry out here) 106 */ 107 ts[0] += byteCntAdd; /* update processed length */ 108 109 /* precompute the key schedule for this block */ 110 ks[0] = ctx->X[0]; 111 ks[1] = ctx->X[1]; 112 ks[2] = ctx->X[2]; 113 ks[3] = ctx->X[3]; 114 ks[4] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^ SKEIN_KS_PARITY; 115 116 ts[2] = ts[0] ^ ts[1]; 117 118 /* get input block in little-endian format */ 119 Skein_Get64_LSB_First(w, blkPtr, WCNT); 120 DebugSaveTweak(ctx); 121 Skein_Show_Block(BLK_BITS, &ctx->h, ctx->X, blkPtr, w, ks, ts); 122 123 X0 = w[0] + ks[0]; /* do the first full key injection */ 124 X1 = w[1] + ks[1] + ts[0]; 125 X2 = w[2] + ks[2] + ts[1]; 126 X3 = w[3] + ks[3]; 127 128 Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INITIAL, 129 Xptr); /* show starting state values */ 130 131 blkPtr += SKEIN_256_BLOCK_BYTES; 132 133 /* run the rounds */ 134 135 #define Round256(p0, p1, p2, p3, ROT, rNum) \ 136 X##p0 += X##p1; X##p1 = RotL_64(X##p1, ROT##_0); X##p1 ^= X##p0; \ 137 X##p2 += X##p3; X##p3 = RotL_64(X##p3, ROT##_1); X##p3 ^= X##p2; \ 138 139 #if SKEIN_UNROLL_256 == 0 140 #define R256(p0, p1, p2, p3, ROT, rNum) /* fully unrolled */ \ 141 Round256(p0, p1, p2, p3, ROT, rNum) \ 142 Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rNum, Xptr); 143 144 #define I256(R) \ 145 X0 += ks[((R) + 1) % 5]; /* inject the key schedule value */ \ 146 X1 += ks[((R) + 2) % 5] + ts[((R) + 1) % 3]; \ 147 X2 += ks[((R) + 3) % 5] + ts[((R) + 2) % 3]; \ 148 X3 += ks[((R) + 4) % 5] + (R) + 1; \ 149 Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr); 150 #else /* looping version */ 151 #define R256(p0, p1, p2, p3, ROT, rNum) \ 152 Round256(p0, p1, p2, p3, ROT, rNum) \ 153 Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rNum, Xptr); 154 155 #define I256(R) \ 156 X0 += ks[r + (R) + 0]; /* inject the key schedule value */ \ 157 X1 += ks[r + (R) + 1] + ts[r + (R) + 0]; \ 158 X2 += ks[r + (R) + 2] + ts[r + (R) + 1]; \ 159 X3 += ks[r + (R) + 3] + r + (R); \ 160 ks[r + (R) + 4] = ks[r + (R) - 1]; /* rotate key schedule */ \ 161 ts[r + (R) + 2] = ts[r + (R) - 1]; \ 162 Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr); 163 164 /* loop through it */ 165 for (r = 1; r < 2 * RCNT; r += 2 * SKEIN_UNROLL_256) 166 #endif 167 { 168 #define R256_8_rounds(R) \ 169 R256(0, 1, 2, 3, R_256_0, 8 * (R) + 1); \ 170 R256(0, 3, 2, 1, R_256_1, 8 * (R) + 2); \ 171 R256(0, 1, 2, 3, R_256_2, 8 * (R) + 3); \ 172 R256(0, 3, 2, 1, R_256_3, 8 * (R) + 4); \ 173 I256(2 * (R)); \ 174 R256(0, 1, 2, 3, R_256_4, 8 * (R) + 5); \ 175 R256(0, 3, 2, 1, R_256_5, 8 * (R) + 6); \ 176 R256(0, 1, 2, 3, R_256_6, 8 * (R) + 7); \ 177 R256(0, 3, 2, 1, R_256_7, 8 * (R) + 8); \ 178 I256(2 * (R) + 1); 179 180 R256_8_rounds(0); 181 182 #define R256_Unroll_R(NN) \ 183 ((SKEIN_UNROLL_256 == 0 && SKEIN_256_ROUNDS_TOTAL / 8 > (NN)) || \ 184 (SKEIN_UNROLL_256 > (NN))) 185 186 #if R256_Unroll_R(1) 187 R256_8_rounds(1); 188 #endif 189 #if R256_Unroll_R(2) 190 R256_8_rounds(2); 191 #endif 192 #if R256_Unroll_R(3) 193 R256_8_rounds(3); 194 #endif 195 #if R256_Unroll_R(4) 196 R256_8_rounds(4); 197 #endif 198 #if R256_Unroll_R(5) 199 R256_8_rounds(5); 200 #endif 201 #if R256_Unroll_R(6) 202 R256_8_rounds(6); 203 #endif 204 #if R256_Unroll_R(7) 205 R256_8_rounds(7); 206 #endif 207 #if R256_Unroll_R(8) 208 R256_8_rounds(8); 209 #endif 210 #if R256_Unroll_R(9) 211 R256_8_rounds(9); 212 #endif 213 #if R256_Unroll_R(10) 214 R256_8_rounds(10); 215 #endif 216 #if R256_Unroll_R(11) 217 R256_8_rounds(11); 218 #endif 219 #if R256_Unroll_R(12) 220 R256_8_rounds(12); 221 #endif 222 #if R256_Unroll_R(13) 223 R256_8_rounds(13); 224 #endif 225 #if R256_Unroll_R(14) 226 R256_8_rounds(14); 227 #endif 228 #if (SKEIN_UNROLL_256 > 14) 229 #error "need more unrolling in Skein_256_Process_Block" 230 #endif 231 } 232 /* 233 * do the final "feedforward" xor, update context chaining vars 234 */ 235 ctx->X[0] = X0 ^ w[0]; 236 ctx->X[1] = X1 ^ w[1]; 237 ctx->X[2] = X2 ^ w[2]; 238 ctx->X[3] = X3 ^ w[3]; 239 240 Skein_Show_Round(BLK_BITS, &ctx->h, SKEIN_RND_FEED_FWD, ctx->X); 241 242 ts[1] &= ~SKEIN_T1_FLAG_FIRST; 243 } while (--blkCnt); 244 ctx->h.T[0] = ts[0]; 245 ctx->h.T[1] = ts[1]; 246 } 247 248 #if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF) 249 size_t 250 Skein_256_Process_Block_CodeSize(void) 251 { 252 return ((uint8_t *)Skein_256_Process_Block_CodeSize) - 253 ((uint8_t *)Skein_256_Process_Block); 254 } 255 256 uint_t 257 Skein_256_Unroll_Cnt(void) 258 { 259 return (SKEIN_UNROLL_256); 260 } 261 #endif 262 #endif 263 264 /* Skein_512 */ 265 #if !(SKEIN_USE_ASM & 512) 266 void 267 Skein_512_Process_Block(Skein_512_Ctxt_t *ctx, const uint8_t *blkPtr, 268 size_t blkCnt, size_t byteCntAdd) 269 { 270 enum { 271 WCNT = SKEIN_512_STATE_WORDS 272 }; 273 #undef RCNT 274 #define RCNT (SKEIN_512_ROUNDS_TOTAL / 8) 275 276 #ifdef SKEIN_LOOP /* configure how much to unroll the loop */ 277 #define SKEIN_UNROLL_512 (((SKEIN_LOOP) / 10) % 10) 278 #else 279 #define SKEIN_UNROLL_512 (0) 280 #endif 281 282 #if SKEIN_UNROLL_512 283 #if (RCNT % SKEIN_UNROLL_512) 284 #error "Invalid SKEIN_UNROLL_512" /* sanity check on unroll count */ 285 #endif 286 size_t r; 287 /* key schedule words : chaining vars + tweak + "rotation" */ 288 uint64_t kw[WCNT + 4 + RCNT * 2]; 289 #else 290 uint64_t kw[WCNT + 4]; /* key schedule words : chaining vars + tweak */ 291 #endif 292 /* local copy of vars, for speed */ 293 uint64_t X0, X1, X2, X3, X4, X5, X6, X7; 294 uint64_t w[WCNT]; /* local copy of input block */ 295 #ifdef SKEIN_DEBUG 296 /* use for debugging (help compiler put Xn in registers) */ 297 const uint64_t *Xptr[8]; 298 Xptr[0] = &X0; 299 Xptr[1] = &X1; 300 Xptr[2] = &X2; 301 Xptr[3] = &X3; 302 Xptr[4] = &X4; 303 Xptr[5] = &X5; 304 Xptr[6] = &X6; 305 Xptr[7] = &X7; 306 #endif 307 308 Skein_assert(blkCnt != 0); /* never call with blkCnt == 0! */ 309 ts[0] = ctx->h.T[0]; 310 ts[1] = ctx->h.T[1]; 311 do { 312 /* 313 * this implementation only supports 2**64 input bytes 314 * (no carry out here) 315 */ 316 ts[0] += byteCntAdd; /* update processed length */ 317 318 /* precompute the key schedule for this block */ 319 ks[0] = ctx->X[0]; 320 ks[1] = ctx->X[1]; 321 ks[2] = ctx->X[2]; 322 ks[3] = ctx->X[3]; 323 ks[4] = ctx->X[4]; 324 ks[5] = ctx->X[5]; 325 ks[6] = ctx->X[6]; 326 ks[7] = ctx->X[7]; 327 ks[8] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^ 328 ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^ SKEIN_KS_PARITY; 329 330 ts[2] = ts[0] ^ ts[1]; 331 332 /* get input block in little-endian format */ 333 Skein_Get64_LSB_First(w, blkPtr, WCNT); 334 DebugSaveTweak(ctx); 335 Skein_Show_Block(BLK_BITS, &ctx->h, ctx->X, blkPtr, w, ks, ts); 336 337 X0 = w[0] + ks[0]; /* do the first full key injection */ 338 X1 = w[1] + ks[1]; 339 X2 = w[2] + ks[2]; 340 X3 = w[3] + ks[3]; 341 X4 = w[4] + ks[4]; 342 X5 = w[5] + ks[5] + ts[0]; 343 X6 = w[6] + ks[6] + ts[1]; 344 X7 = w[7] + ks[7]; 345 346 blkPtr += SKEIN_512_BLOCK_BYTES; 347 348 Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INITIAL, 349 Xptr); 350 /* run the rounds */ 351 #define Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) \ 352 X##p0 += X##p1; X##p1 = RotL_64(X##p1, ROT##_0); X##p1 ^= X##p0;\ 353 X##p2 += X##p3; X##p3 = RotL_64(X##p3, ROT##_1); X##p3 ^= X##p2;\ 354 X##p4 += X##p5; X##p5 = RotL_64(X##p5, ROT##_2); X##p5 ^= X##p4;\ 355 X##p6 += X##p7; X##p7 = RotL_64(X##p7, ROT##_3); X##p7 ^= X##p6; 356 357 #if SKEIN_UNROLL_512 == 0 358 #define R512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) /* unrolled */ \ 359 Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) \ 360 Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rNum, Xptr); 361 362 #define I512(R) \ 363 X0 += ks[((R) + 1) % 9]; /* inject the key schedule value */\ 364 X1 += ks[((R) + 2) % 9]; \ 365 X2 += ks[((R) + 3) % 9]; \ 366 X3 += ks[((R) + 4) % 9]; \ 367 X4 += ks[((R) + 5) % 9]; \ 368 X5 += ks[((R) + 6) % 9] + ts[((R) + 1) % 3]; \ 369 X6 += ks[((R) + 7) % 9] + ts[((R) + 2) % 3]; \ 370 X7 += ks[((R) + 8) % 9] + (R) + 1; \ 371 Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr); 372 #else /* looping version */ 373 #define R512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) \ 374 Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) \ 375 Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rNum, Xptr); 376 377 #define I512(R) \ 378 X0 += ks[r + (R) + 0]; /* inject the key schedule value */ \ 379 X1 += ks[r + (R) + 1]; \ 380 X2 += ks[r + (R) + 2]; \ 381 X3 += ks[r + (R) + 3]; \ 382 X4 += ks[r + (R) + 4]; \ 383 X5 += ks[r + (R) + 5] + ts[r + (R) + 0]; \ 384 X6 += ks[r + (R) + 6] + ts[r + (R) + 1]; \ 385 X7 += ks[r + (R) + 7] + r + (R); \ 386 ks[r + (R)+8] = ks[r + (R) - 1]; /* rotate key schedule */\ 387 ts[r + (R)+2] = ts[r + (R) - 1]; \ 388 Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr); 389 390 /* loop through it */ 391 for (r = 1; r < 2 * RCNT; r += 2 * SKEIN_UNROLL_512) 392 #endif /* end of looped code definitions */ 393 { 394 #define R512_8_rounds(R) /* do 8 full rounds */ \ 395 R512(0, 1, 2, 3, 4, 5, 6, 7, R_512_0, 8 * (R) + 1); \ 396 R512(2, 1, 4, 7, 6, 5, 0, 3, R_512_1, 8 * (R) + 2); \ 397 R512(4, 1, 6, 3, 0, 5, 2, 7, R_512_2, 8 * (R) + 3); \ 398 R512(6, 1, 0, 7, 2, 5, 4, 3, R_512_3, 8 * (R) + 4); \ 399 I512(2 * (R)); \ 400 R512(0, 1, 2, 3, 4, 5, 6, 7, R_512_4, 8 * (R) + 5); \ 401 R512(2, 1, 4, 7, 6, 5, 0, 3, R_512_5, 8 * (R) + 6); \ 402 R512(4, 1, 6, 3, 0, 5, 2, 7, R_512_6, 8 * (R) + 7); \ 403 R512(6, 1, 0, 7, 2, 5, 4, 3, R_512_7, 8 * (R) + 8); \ 404 I512(2*(R) + 1); /* and key injection */ 405 406 R512_8_rounds(0); 407 408 #define R512_Unroll_R(NN) \ 409 ((SKEIN_UNROLL_512 == 0 && SKEIN_512_ROUNDS_TOTAL / 8 > (NN)) || \ 410 (SKEIN_UNROLL_512 > (NN))) 411 412 #if R512_Unroll_R(1) 413 R512_8_rounds(1); 414 #endif 415 #if R512_Unroll_R(2) 416 R512_8_rounds(2); 417 #endif 418 #if R512_Unroll_R(3) 419 R512_8_rounds(3); 420 #endif 421 #if R512_Unroll_R(4) 422 R512_8_rounds(4); 423 #endif 424 #if R512_Unroll_R(5) 425 R512_8_rounds(5); 426 #endif 427 #if R512_Unroll_R(6) 428 R512_8_rounds(6); 429 #endif 430 #if R512_Unroll_R(7) 431 R512_8_rounds(7); 432 #endif 433 #if R512_Unroll_R(8) 434 R512_8_rounds(8); 435 #endif 436 #if R512_Unroll_R(9) 437 R512_8_rounds(9); 438 #endif 439 #if R512_Unroll_R(10) 440 R512_8_rounds(10); 441 #endif 442 #if R512_Unroll_R(11) 443 R512_8_rounds(11); 444 #endif 445 #if R512_Unroll_R(12) 446 R512_8_rounds(12); 447 #endif 448 #if R512_Unroll_R(13) 449 R512_8_rounds(13); 450 #endif 451 #if R512_Unroll_R(14) 452 R512_8_rounds(14); 453 #endif 454 #if (SKEIN_UNROLL_512 > 14) 455 #error "need more unrolling in Skein_512_Process_Block" 456 #endif 457 } 458 459 /* 460 * do the final "feedforward" xor, update context chaining vars 461 */ 462 ctx->X[0] = X0 ^ w[0]; 463 ctx->X[1] = X1 ^ w[1]; 464 ctx->X[2] = X2 ^ w[2]; 465 ctx->X[3] = X3 ^ w[3]; 466 ctx->X[4] = X4 ^ w[4]; 467 ctx->X[5] = X5 ^ w[5]; 468 ctx->X[6] = X6 ^ w[6]; 469 ctx->X[7] = X7 ^ w[7]; 470 Skein_Show_Round(BLK_BITS, &ctx->h, SKEIN_RND_FEED_FWD, ctx->X); 471 472 ts[1] &= ~SKEIN_T1_FLAG_FIRST; 473 } while (--blkCnt); 474 ctx->h.T[0] = ts[0]; 475 ctx->h.T[1] = ts[1]; 476 } 477 478 #if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF) 479 size_t 480 Skein_512_Process_Block_CodeSize(void) 481 { 482 return ((uint8_t *)Skein_512_Process_Block_CodeSize) - 483 ((uint8_t *)Skein_512_Process_Block); 484 } 485 486 uint_t 487 Skein_512_Unroll_Cnt(void) 488 { 489 return (SKEIN_UNROLL_512); 490 } 491 #endif 492 #endif 493 494 /* Skein1024 */ 495 #if !(SKEIN_USE_ASM & 1024) 496 void 497 Skein1024_Process_Block(Skein1024_Ctxt_t *ctx, const uint8_t *blkPtr, 498 size_t blkCnt, size_t byteCntAdd) 499 { 500 /* do it in C, always looping (unrolled is bigger AND slower!) */ 501 enum { 502 WCNT = SKEIN1024_STATE_WORDS 503 }; 504 #undef RCNT 505 #define RCNT (SKEIN1024_ROUNDS_TOTAL/8) 506 507 #ifdef SKEIN_LOOP /* configure how much to unroll the loop */ 508 #define SKEIN_UNROLL_1024 ((SKEIN_LOOP)%10) 509 #else 510 #define SKEIN_UNROLL_1024 (0) 511 #endif 512 513 #if (SKEIN_UNROLL_1024 != 0) 514 #if (RCNT % SKEIN_UNROLL_1024) 515 #error "Invalid SKEIN_UNROLL_1024" /* sanity check on unroll count */ 516 #endif 517 size_t r; 518 /* key schedule words : chaining vars + tweak + "rotation" */ 519 uint64_t kw[WCNT + 4 + RCNT * 2]; 520 #else 521 uint64_t kw[WCNT + 4]; /* key schedule words : chaining vars + tweak */ 522 #endif 523 524 /* local copy of vars, for speed */ 525 uint64_t X00, X01, X02, X03, X04, X05, X06, X07, X08, X09, X10, X11, 526 X12, X13, X14, X15; 527 uint64_t w[WCNT]; /* local copy of input block */ 528 #ifdef SKEIN_DEBUG 529 /* use for debugging (help compiler put Xn in registers) */ 530 const uint64_t *Xptr[16]; 531 Xptr[0] = &X00; 532 Xptr[1] = &X01; 533 Xptr[2] = &X02; 534 Xptr[3] = &X03; 535 Xptr[4] = &X04; 536 Xptr[5] = &X05; 537 Xptr[6] = &X06; 538 Xptr[7] = &X07; 539 Xptr[8] = &X08; 540 Xptr[9] = &X09; 541 Xptr[10] = &X10; 542 Xptr[11] = &X11; 543 Xptr[12] = &X12; 544 Xptr[13] = &X13; 545 Xptr[14] = &X14; 546 Xptr[15] = &X15; 547 #endif 548 549 Skein_assert(blkCnt != 0); /* never call with blkCnt == 0! */ 550 ts[0] = ctx->h.T[0]; 551 ts[1] = ctx->h.T[1]; 552 do { 553 /* 554 * this implementation only supports 2**64 input bytes 555 * (no carry out here) 556 */ 557 ts[0] += byteCntAdd; /* update processed length */ 558 559 /* precompute the key schedule for this block */ 560 ks[0] = ctx->X[0]; 561 ks[1] = ctx->X[1]; 562 ks[2] = ctx->X[2]; 563 ks[3] = ctx->X[3]; 564 ks[4] = ctx->X[4]; 565 ks[5] = ctx->X[5]; 566 ks[6] = ctx->X[6]; 567 ks[7] = ctx->X[7]; 568 ks[8] = ctx->X[8]; 569 ks[9] = ctx->X[9]; 570 ks[10] = ctx->X[10]; 571 ks[11] = ctx->X[11]; 572 ks[12] = ctx->X[12]; 573 ks[13] = ctx->X[13]; 574 ks[14] = ctx->X[14]; 575 ks[15] = ctx->X[15]; 576 ks[16] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^ 577 ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^ 578 ks[8] ^ ks[9] ^ ks[10] ^ ks[11] ^ 579 ks[12] ^ ks[13] ^ ks[14] ^ ks[15] ^ SKEIN_KS_PARITY; 580 581 ts[2] = ts[0] ^ ts[1]; 582 583 /* get input block in little-endian format */ 584 Skein_Get64_LSB_First(w, blkPtr, WCNT); 585 DebugSaveTweak(ctx); 586 Skein_Show_Block(BLK_BITS, &ctx->h, ctx->X, blkPtr, w, ks, ts); 587 588 X00 = w[0] + ks[0]; /* do the first full key injection */ 589 X01 = w[1] + ks[1]; 590 X02 = w[2] + ks[2]; 591 X03 = w[3] + ks[3]; 592 X04 = w[4] + ks[4]; 593 X05 = w[5] + ks[5]; 594 X06 = w[6] + ks[6]; 595 X07 = w[7] + ks[7]; 596 X08 = w[8] + ks[8]; 597 X09 = w[9] + ks[9]; 598 X10 = w[10] + ks[10]; 599 X11 = w[11] + ks[11]; 600 X12 = w[12] + ks[12]; 601 X13 = w[13] + ks[13] + ts[0]; 602 X14 = w[14] + ks[14] + ts[1]; 603 X15 = w[15] + ks[15]; 604 605 Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INITIAL, 606 Xptr); 607 608 #define Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, \ 609 pD, pE, pF, ROT, rNum) \ 610 X##p0 += X##p1; X##p1 = RotL_64(X##p1, ROT##_0); X##p1 ^= X##p0;\ 611 X##p2 += X##p3; X##p3 = RotL_64(X##p3, ROT##_1); X##p3 ^= X##p2;\ 612 X##p4 += X##p5; X##p5 = RotL_64(X##p5, ROT##_2); X##p5 ^= X##p4;\ 613 X##p6 += X##p7; X##p7 = RotL_64(X##p7, ROT##_3); X##p7 ^= X##p6;\ 614 X##p8 += X##p9; X##p9 = RotL_64(X##p9, ROT##_4); X##p9 ^= X##p8;\ 615 X##pA += X##pB; X##pB = RotL_64(X##pB, ROT##_5); X##pB ^= X##pA;\ 616 X##pC += X##pD; X##pD = RotL_64(X##pD, ROT##_6); X##pD ^= X##pC;\ 617 X##pE += X##pF; X##pF = RotL_64(X##pF, ROT##_7); X##pF ^= X##pE; 618 619 #if SKEIN_UNROLL_1024 == 0 620 #define R1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD, \ 621 pE, pF, ROT, rn) \ 622 Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, \ 623 pD, pE, pF, ROT, rn) \ 624 Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rn, Xptr); 625 626 #define I1024(R) \ 627 X00 += ks[((R) + 1) % 17]; /* inject the key schedule value */\ 628 X01 += ks[((R) + 2) % 17]; \ 629 X02 += ks[((R) + 3) % 17]; \ 630 X03 += ks[((R) + 4) % 17]; \ 631 X04 += ks[((R) + 5) % 17]; \ 632 X05 += ks[((R) + 6) % 17]; \ 633 X06 += ks[((R) + 7) % 17]; \ 634 X07 += ks[((R) + 8) % 17]; \ 635 X08 += ks[((R) + 9) % 17]; \ 636 X09 += ks[((R) + 10) % 17]; \ 637 X10 += ks[((R) + 11) % 17]; \ 638 X11 += ks[((R) + 12) % 17]; \ 639 X12 += ks[((R) + 13) % 17]; \ 640 X13 += ks[((R) + 14) % 17] + ts[((R) + 1) % 3]; \ 641 X14 += ks[((R) + 15) % 17] + ts[((R) + 2) % 3]; \ 642 X15 += ks[((R) + 16) % 17] + (R) +1; \ 643 Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr); 644 #else /* looping version */ 645 #define R1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD, \ 646 pE, pF, ROT, rn) \ 647 Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, \ 648 pD, pE, pF, ROT, rn) \ 649 Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rn, Xptr); 650 651 #define I1024(R) \ 652 X00 += ks[r + (R) + 0]; /* inject the key schedule value */ \ 653 X01 += ks[r + (R) + 1]; \ 654 X02 += ks[r + (R) + 2]; \ 655 X03 += ks[r + (R) + 3]; \ 656 X04 += ks[r + (R) + 4]; \ 657 X05 += ks[r + (R) + 5]; \ 658 X06 += ks[r + (R) + 6]; \ 659 X07 += ks[r + (R) + 7]; \ 660 X08 += ks[r + (R) + 8]; \ 661 X09 += ks[r + (R) + 9]; \ 662 X10 += ks[r + (R) + 10]; \ 663 X11 += ks[r + (R) + 11]; \ 664 X12 += ks[r + (R) + 12]; \ 665 X13 += ks[r + (R) + 13] + ts[r + (R) + 0]; \ 666 X14 += ks[r + (R) + 14] + ts[r + (R) + 1]; \ 667 X15 += ks[r + (R) + 15] + r + (R); \ 668 ks[r + (R) + 16] = ks[r + (R) - 1]; /* rotate key schedule */\ 669 ts[r + (R) + 2] = ts[r + (R) - 1]; \ 670 Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr); 671 672 /* loop through it */ 673 for (r = 1; r <= 2 * RCNT; r += 2 * SKEIN_UNROLL_1024) 674 #endif 675 { 676 #define R1024_8_rounds(R) /* do 8 full rounds */ \ 677 R1024(00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12, 13, \ 678 14, 15, R1024_0, 8 * (R) + 1); \ 679 R1024(00, 09, 02, 13, 06, 11, 04, 15, 10, 07, 12, 03, 14, 05, \ 680 08, 01, R1024_1, 8 * (R) + 2); \ 681 R1024(00, 07, 02, 05, 04, 03, 06, 01, 12, 15, 14, 13, 08, 11, \ 682 10, 09, R1024_2, 8 * (R) + 3); \ 683 R1024(00, 15, 02, 11, 06, 13, 04, 09, 14, 01, 08, 05, 10, 03, \ 684 12, 07, R1024_3, 8 * (R) + 4); \ 685 I1024(2 * (R)); \ 686 R1024(00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12, 13, \ 687 14, 15, R1024_4, 8 * (R) + 5); \ 688 R1024(00, 09, 02, 13, 06, 11, 04, 15, 10, 07, 12, 03, 14, 05, \ 689 08, 01, R1024_5, 8 * (R) + 6); \ 690 R1024(00, 07, 02, 05, 04, 03, 06, 01, 12, 15, 14, 13, 08, 11, \ 691 10, 09, R1024_6, 8 * (R) + 7); \ 692 R1024(00, 15, 02, 11, 06, 13, 04, 09, 14, 01, 08, 05, 10, 03, \ 693 12, 07, R1024_7, 8 * (R) + 8); \ 694 I1024(2 * (R) + 1); 695 696 R1024_8_rounds(0); 697 698 #define R1024_Unroll_R(NN) \ 699 ((SKEIN_UNROLL_1024 == 0 && SKEIN1024_ROUNDS_TOTAL/8 > (NN)) || \ 700 (SKEIN_UNROLL_1024 > (NN))) 701 702 #if R1024_Unroll_R(1) 703 R1024_8_rounds(1); 704 #endif 705 #if R1024_Unroll_R(2) 706 R1024_8_rounds(2); 707 #endif 708 #if R1024_Unroll_R(3) 709 R1024_8_rounds(3); 710 #endif 711 #if R1024_Unroll_R(4) 712 R1024_8_rounds(4); 713 #endif 714 #if R1024_Unroll_R(5) 715 R1024_8_rounds(5); 716 #endif 717 #if R1024_Unroll_R(6) 718 R1024_8_rounds(6); 719 #endif 720 #if R1024_Unroll_R(7) 721 R1024_8_rounds(7); 722 #endif 723 #if R1024_Unroll_R(8) 724 R1024_8_rounds(8); 725 #endif 726 #if R1024_Unroll_R(9) 727 R1024_8_rounds(9); 728 #endif 729 #if R1024_Unroll_R(10) 730 R1024_8_rounds(10); 731 #endif 732 #if R1024_Unroll_R(11) 733 R1024_8_rounds(11); 734 #endif 735 #if R1024_Unroll_R(12) 736 R1024_8_rounds(12); 737 #endif 738 #if R1024_Unroll_R(13) 739 R1024_8_rounds(13); 740 #endif 741 #if R1024_Unroll_R(14) 742 R1024_8_rounds(14); 743 #endif 744 #if (SKEIN_UNROLL_1024 > 14) 745 #error "need more unrolling in Skein_1024_Process_Block" 746 #endif 747 } 748 /* 749 * do the final "feedforward" xor, update context chaining vars 750 */ 751 752 ctx->X[0] = X00 ^ w[0]; 753 ctx->X[1] = X01 ^ w[1]; 754 ctx->X[2] = X02 ^ w[2]; 755 ctx->X[3] = X03 ^ w[3]; 756 ctx->X[4] = X04 ^ w[4]; 757 ctx->X[5] = X05 ^ w[5]; 758 ctx->X[6] = X06 ^ w[6]; 759 ctx->X[7] = X07 ^ w[7]; 760 ctx->X[8] = X08 ^ w[8]; 761 ctx->X[9] = X09 ^ w[9]; 762 ctx->X[10] = X10 ^ w[10]; 763 ctx->X[11] = X11 ^ w[11]; 764 ctx->X[12] = X12 ^ w[12]; 765 ctx->X[13] = X13 ^ w[13]; 766 ctx->X[14] = X14 ^ w[14]; 767 ctx->X[15] = X15 ^ w[15]; 768 769 Skein_Show_Round(BLK_BITS, &ctx->h, SKEIN_RND_FEED_FWD, ctx->X); 770 771 ts[1] &= ~SKEIN_T1_FLAG_FIRST; 772 blkPtr += SKEIN1024_BLOCK_BYTES; 773 } while (--blkCnt); 774 ctx->h.T[0] = ts[0]; 775 ctx->h.T[1] = ts[1]; 776 } 777 778 #if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF) 779 size_t 780 Skein1024_Process_Block_CodeSize(void) 781 { 782 return ((uint8_t *)Skein1024_Process_Block_CodeSize) - 783 ((uint8_t *)Skein1024_Process_Block); 784 } 785 786 uint_t 787 Skein1024_Unroll_Cnt(void) 788 { 789 return (SKEIN_UNROLL_1024); 790 } 791 #endif 792 #endif 793