// SPDX-License-Identifier: LicenseRef-OpenZFS-ThirdParty-PublicDomain
/*
 * Implementation of the Skein block functions.
 * Source code author: Doug Whiting, 2008.
 * This algorithm and source code is released to the public domain.
 * Compile-time switches:
 *  SKEIN_USE_ASM  -- set bits (256/512/1024) to select which
 *    versions use ASM code for block processing
 *    [default: use C for all block sizes]
 */
/* Copyright 2013 Doug Whiting. This code is released to the public domain. */

#include <sys/skein.h>
#include "skein_impl.h"
#include <sys/isa_defs.h>	/* for _ILP32 */

#ifndef SKEIN_USE_ASM
#define	SKEIN_USE_ASM (0)	/* default is all C code (no ASM) */
#endif

#ifndef SKEIN_LOOP
/*
 * The low-level checksum routines use a lot of stack space. On systems where
 * small stack frames are enforced (like 32-bit kernel builds), do not unroll
 * checksum calculations to save stack space.
 *
 * Even with no loops unrolled, we still can exceed the 1k stack frame limit
 * in Skein1024_Process_Block() (it hits 1272 bytes on ARM32). We can
 * safely ignore it though, since the checksum functions will be called
 * from a worker thread that won't be using much stack. That's why we have
 * the #pragma here to ignore the warning.
 *
 * SKEIN_LOOP is read below as three decimal digits: the hundreds digit is
 * the unroll count for Skein_256, the tens digit for Skein_512 and the ones
 * digit for Skein1024. A digit of 0 selects the fully unrolled code path;
 * a non-zero digit N selects the looping code path with N groups of eight
 * rounds per loop iteration.
 */
#if defined(_ILP32) || defined(__powerpc)	/* Assume small stack */
#if defined(__GNUC__) && !defined(__clang__)
#pragma GCC diagnostic ignored "-Wframe-larger-than="
#endif
/*
 * We're running on 32-bit, don't unroll loops to save stack frame space
 *
 * Due to the ways the calculations on SKEIN_LOOP are done in
 * Skein_*_Process_Block(), a value of 111 disables unrolling loops
 * in any of those functions.
 */
#define	SKEIN_LOOP 111
#else
/* We're compiling with large stacks */
/* (001 is an octal literal, but its value is still 1) */
#define	SKEIN_LOOP 001	/* default: unroll 256 and 512, but not 1024 */
#endif
#endif

/*
 * some useful definitions for code here
 *
 * Each block function declares a local array kw[] and a local constant WCNT.
 * ts (the tweak words) starts at kw[0] and ks (the key schedule words)
 * starts at kw[3], so ts[i + 3] and ks[i] name the same storage. The fully
 * unrolled code only touches ts[0..2]; the looping code deliberately relies
 * on the overlap (see the looping I256/I512/I1024 macros).
 */
#define	BLK_BITS (WCNT*64)
#define	KW_TWK_BASE (0)
#define	KW_KEY_BASE (3)
#define	ks (kw + KW_KEY_BASE)
#define	ts (kw + KW_TWK_BASE)

/* no debugging in Illumos version */
#define	DebugSaveTweak(ctx)

/* Skein_256 */
#if !(SKEIN_USE_ASM & 256)
/*
 * Run the Skein_256 block function over blkCnt consecutive blocks.
 *
 *  ctx        - context; ctx->X[0..3] supplies the key schedule words for
 *               each block and receives the feed-forward result, and
 *               ctx->h.T[0..1] is loaded on entry and stored back on exit.
 *  blkPtr     - input data; advanced by SKEIN_256_BLOCK_BYTES per block and
 *               loaded into 64-bit words with Skein_Get64_LSB_First().
 *  blkCnt     - number of blocks to process. Must not be 0: the do/while
 *               loop runs its body before testing --blkCnt.
 *  byteCntAdd - added to tweak word T[0] once per block.
 *
 * After every block SKEIN_T1_FLAG_FIRST is cleared in tweak word T[1].
 *
 * NOTE(review): the Skein_Show_* and Skein_assert macros are defined outside
 * this file. Xptr is only declared under SKEIN_DEBUG, so the Skein_Show_R_Ptr
 * invocations presumably expand to nothing otherwise -- confirm in
 * skein_impl.h.
 */
void
Skein_256_Process_Block(Skein_256_Ctxt_t *ctx, const uint8_t *blkPtr,
    size_t blkCnt, size_t byteCntAdd)
{
	enum {
		WCNT = SKEIN_256_STATE_WORDS
	};
	/* RCNT: number of 8-round groups in the full cipher */
#undef RCNT
#define	RCNT (SKEIN_256_ROUNDS_TOTAL / 8)

#ifdef SKEIN_LOOP	/* configure how much to unroll the loop */
#define	SKEIN_UNROLL_256 (((SKEIN_LOOP) / 100) % 10)
#else
#define	SKEIN_UNROLL_256 (0)
#endif

#if SKEIN_UNROLL_256
#if (RCNT % SKEIN_UNROLL_256)
#error "Invalid SKEIN_UNROLL_256"	/* sanity check on unroll count */
#endif
	size_t r;
	/* key schedule words : chaining vars + tweak + "rotation" */
	uint64_t kw[WCNT + 4 + RCNT * 2];
#else
	uint64_t kw[WCNT + 4];	/* key schedule words : chaining vars + tweak */
#endif
	/* local copy of context vars, for speed */
	uint64_t X0, X1, X2, X3;
	uint64_t w[WCNT];	/* local copy of input block */
#ifdef SKEIN_DEBUG
	/* use for debugging (help compiler put Xn in registers) */
	const uint64_t *Xptr[4];
	Xptr[0] = &X0;
	Xptr[1] = &X1;
	Xptr[2] = &X2;
	Xptr[3] = &X3;
#endif
	Skein_assert(blkCnt != 0);	/* never call with blkCnt == 0! */
	/* work on a local copy of the tweak; written back after the loop */
	ts[0] = ctx->h.T[0];
	ts[1] = ctx->h.T[1];
	do {
		/*
		 * this implementation only supports 2**64 input bytes
		 * (no carry out here)
		 */
		ts[0] += byteCntAdd;	/* update processed length */

		/* precompute the key schedule for this block */
		ks[0] = ctx->X[0];
		ks[1] = ctx->X[1];
		ks[2] = ctx->X[2];
		ks[3] = ctx->X[3];
		/* extra key word: xor of all chaining words and the parity */
		ks[4] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^ SKEIN_KS_PARITY;

		/* third tweak word is the xor of the first two */
		ts[2] = ts[0] ^ ts[1];

		/* get input block in little-endian format */
		Skein_Get64_LSB_First(w, blkPtr, WCNT);
		DebugSaveTweak(ctx);
		Skein_Show_Block(BLK_BITS, &ctx->h, ctx->X, blkPtr, w, ks, ts);

		X0 = w[0] + ks[0];	/* do the first full key injection */
		X1 = w[1] + ks[1] + ts[0];
		X2 = w[2] + ks[2] + ts[1];
		X3 = w[3] + ks[3];

		Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INITIAL,
		    Xptr);	/* show starting state values */

		blkPtr += SKEIN_256_BLOCK_BYTES;

		/* run the rounds */

		/*
		 * One round: two add / rotate-left / xor mixes on the word
		 * pairs (p0, p1) and (p2, p3). The rotation amounts are the
		 * constants ROT_0 and ROT_1, formed by token pasting.
		 *
		 * The last line of Round256 ends in a continuation
		 * backslash, so the blank line after it is part of the
		 * definition and must stay blank.
		 */
#define	Round256(p0, p1, p2, p3, ROT, rNum) \
	X##p0 += X##p1; X##p1 = RotL_64(X##p1, ROT##_0); X##p1 ^= X##p0; \
	X##p2 += X##p3; X##p3 = RotL_64(X##p3, ROT##_1); X##p3 ^= X##p2; \

#if SKEIN_UNROLL_256 == 0
#define	R256(p0, p1, p2, p3, ROT, rNum)	/* fully unrolled */ \
	Round256(p0, p1, p2, p3, ROT, rNum) \
	Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rNum, Xptr);

		/*
		 * Key injection number (R) + 1 for the unrolled code: R is a
		 * constant, so the ks/ts indices are reduced modulo 5 and 3
		 * and nothing in kw[] is modified.
		 */
#define	I256(R) \
	X0 += ks[((R) + 1) % 5];	/* inject the key schedule value */ \
	X1 += ks[((R) + 2) % 5] + ts[((R) + 1) % 3]; \
	X2 += ks[((R) + 3) % 5] + ts[((R) + 2) % 3]; \
	X3 += ks[((R) + 4) % 5] + (R) + 1; \
	Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
#else	/* looping version */
#define	R256(p0, p1, p2, p3, ROT, rNum) \
	Round256(p0, p1, p2, p3, ROT, rNum) \
	Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rNum, Xptr);

		/*
		 * Key injection for the looping code. Instead of reducing
		 * the indices, the key and tweak words are copied forward in
		 * kw[] so that ks[r + (R) ...] and ts[r + (R) ...] can be
		 * indexed directly.
		 *
		 * Statement order matters: ts[r + (R) + 2] is the same
		 * storage as ks[r + (R) - 1] (ts is kw + 0, ks is kw + 3),
		 * so the key word has to be copied to ks[r + (R) + 4] before
		 * the tweak store overwrites it.
		 */
#define	I256(R) \
	X0 += ks[r + (R) + 0];	/* inject the key schedule value */ \
	X1 += ks[r + (R) + 1] + ts[r + (R) + 0]; \
	X2 += ks[r + (R) + 2] + ts[r + (R) + 1]; \
	X3 += ks[r + (R) + 3] + r + (R); \
	ks[r + (R) + 4] = ks[r + (R) - 1];	/* rotate key schedule */ \
	ts[r + (R) + 2] = ts[r + (R) - 1]; \
	Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);

		/* loop through it */
		for (r = 1; r < 2 * RCNT; r += 2 * SKEIN_UNROLL_256)
#endif
		{
			/*
			 * Group number R: four rounds, a key injection, four
			 * more rounds and a second key injection.
			 */
#define	R256_8_rounds(R) \
	R256(0, 1, 2, 3, R_256_0, 8 * (R) + 1); \
	R256(0, 3, 2, 1, R_256_1, 8 * (R) + 2); \
	R256(0, 1, 2, 3, R_256_2, 8 * (R) + 3); \
	R256(0, 3, 2, 1, R_256_3, 8 * (R) + 4); \
	I256(2 * (R)); \
	R256(0, 1, 2, 3, R_256_4, 8 * (R) + 5); \
	R256(0, 3, 2, 1, R_256_5, 8 * (R) + 6); \
	R256(0, 1, 2, 3, R_256_6, 8 * (R) + 7); \
	R256(0, 3, 2, 1, R_256_7, 8 * (R) + 8); \
	I256(2 * (R) + 1);

			R256_8_rounds(0);

			/*
			 * True when group NN has to be emitted here: for
			 * every group of the cipher in the fully unrolled
			 * build, otherwise for the first SKEIN_UNROLL_256
			 * groups of one loop iteration.
			 */
#define	R256_Unroll_R(NN) \
	((SKEIN_UNROLL_256 == 0 && SKEIN_256_ROUNDS_TOTAL / 8 > (NN)) || \
	(SKEIN_UNROLL_256 > (NN)))

#if R256_Unroll_R(1)
			R256_8_rounds(1);
#endif
#if R256_Unroll_R(2)
			R256_8_rounds(2);
#endif
#if R256_Unroll_R(3)
			R256_8_rounds(3);
#endif
#if R256_Unroll_R(4)
			R256_8_rounds(4);
#endif
#if R256_Unroll_R(5)
			R256_8_rounds(5);
#endif
#if R256_Unroll_R(6)
			R256_8_rounds(6);
#endif
#if R256_Unroll_R(7)
			R256_8_rounds(7);
#endif
#if R256_Unroll_R(8)
			R256_8_rounds(8);
#endif
#if R256_Unroll_R(9)
			R256_8_rounds(9);
#endif
#if R256_Unroll_R(10)
			R256_8_rounds(10);
#endif
#if R256_Unroll_R(11)
			R256_8_rounds(11);
#endif
#if R256_Unroll_R(12)
			R256_8_rounds(12);
#endif
#if R256_Unroll_R(13)
			R256_8_rounds(13);
#endif
#if R256_Unroll_R(14)
			R256_8_rounds(14);
#endif
#if (SKEIN_UNROLL_256 > 14)
#error "need more unrolling in Skein_256_Process_Block"
#endif
		}
		/*
		 * do the final "feedforward" xor, update context chaining vars
		 */
		ctx->X[0] = X0 ^ w[0];
		ctx->X[1] = X1 ^ w[1];
		ctx->X[2] = X2 ^ w[2];
		ctx->X[3] = X3 ^ w[3];

		Skein_Show_Round(BLK_BITS, &ctx->h, SKEIN_RND_FEED_FWD, ctx->X);

		/* only the first block processed keeps the FIRST flag */
		ts[1] &= ~SKEIN_T1_FLAG_FIRST;
	} while (--blkCnt);
	ctx->h.T[0] = ts[0];
	ctx->h.T[1] = ts[1];
}

#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
/*
 * Returns the byte distance from the address of Skein_256_Process_Block to
 * the address of this function.
 *
 * NOTE(review): as a code-size figure this presumably relies on the toolchain
 * placing this function directly after Skein_256_Process_Block, and on
 * function pointers being convertible to uint8_t * -- neither is guaranteed
 * by ISO C; confirm for the target compiler.
 */
size_t
Skein_256_Process_Block_CodeSize(void)
{
	return ((uint8_t *)Skein_256_Process_Block_CodeSize) -
	    ((uint8_t *)Skein_256_Process_Block);
}

/*
 * Returns the compile-time unroll setting SKEIN_UNROLL_256
 * (0 means the fully unrolled code was built).
 */
uint_t
Skein_256_Unroll_Cnt(void)
{
	return (SKEIN_UNROLL_256);
}
#endif
#endif

/* Skein_512 */
#if !(SKEIN_USE_ASM & 512)
/*
 * Run the Skein_512 block function over blkCnt consecutive blocks.
 *
 *  ctx        - context; ctx->X[0..7] supplies the key schedule words for
 *               each block and receives the feed-forward result, and
 *               ctx->h.T[0..1] is loaded on entry and stored back on exit.
 *  blkPtr     - input data; advanced by SKEIN_512_BLOCK_BYTES per block and
 *               loaded into 64-bit words with Skein_Get64_LSB_First().
 *  blkCnt     - number of blocks to process. Must not be 0: the do/while
 *               loop runs its body before testing --blkCnt.
 *  byteCntAdd - added to tweak word T[0] once per block.
 *
 * After every block SKEIN_T1_FLAG_FIRST is cleared in tweak word T[1].
 */
void
Skein_512_Process_Block(Skein_512_Ctxt_t *ctx, const uint8_t *blkPtr,
    size_t blkCnt, size_t byteCntAdd)
{
	enum {
		WCNT = SKEIN_512_STATE_WORDS
	};
	/* RCNT: number of 8-round groups in the full cipher */
#undef RCNT
#define	RCNT (SKEIN_512_ROUNDS_TOTAL / 8)

#ifdef SKEIN_LOOP	/* configure how much to unroll the loop */
#define	SKEIN_UNROLL_512 (((SKEIN_LOOP) / 10) % 10)
#else
#define	SKEIN_UNROLL_512 (0)
#endif

#if SKEIN_UNROLL_512
#if (RCNT % SKEIN_UNROLL_512)
#error "Invalid SKEIN_UNROLL_512"	/* sanity check on unroll count */
#endif
	size_t r;
	/* key schedule words : chaining vars + tweak + "rotation" */
	uint64_t kw[WCNT + 4 + RCNT * 2];
#else
	uint64_t kw[WCNT + 4];	/* key schedule words : chaining vars + tweak */
#endif
	/* local copy of vars, for speed */
	uint64_t X0, X1, X2, X3, X4, X5, X6, X7;
	uint64_t w[WCNT];	/* local copy of input block */
#ifdef SKEIN_DEBUG
	/* use for debugging (help compiler put Xn in registers) */
	const uint64_t *Xptr[8];
	Xptr[0] = &X0;
	Xptr[1] = &X1;
	Xptr[2] = &X2;
	Xptr[3] = &X3;
	Xptr[4] = &X4;
	Xptr[5] = &X5;
	Xptr[6] = &X6;
	Xptr[7] = &X7;
#endif

	Skein_assert(blkCnt != 0);	/* never call with blkCnt == 0! */
	/* work on a local copy of the tweak; written back after the loop */
	ts[0] = ctx->h.T[0];
	ts[1] = ctx->h.T[1];
	do {
		/*
		 * this implementation only supports 2**64 input bytes
		 * (no carry out here)
		 */
		ts[0] += byteCntAdd;	/* update processed length */

		/* precompute the key schedule for this block */
		ks[0] = ctx->X[0];
		ks[1] = ctx->X[1];
		ks[2] = ctx->X[2];
		ks[3] = ctx->X[3];
		ks[4] = ctx->X[4];
		ks[5] = ctx->X[5];
		ks[6] = ctx->X[6];
		ks[7] = ctx->X[7];
		/* extra key word: xor of all chaining words and the parity */
		ks[8] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^
		    ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^ SKEIN_KS_PARITY;

		/* third tweak word is the xor of the first two */
		ts[2] = ts[0] ^ ts[1];

		/* get input block in little-endian format */
		Skein_Get64_LSB_First(w, blkPtr, WCNT);
		DebugSaveTweak(ctx);
		Skein_Show_Block(BLK_BITS, &ctx->h, ctx->X, blkPtr, w, ks, ts);

		X0 = w[0] + ks[0];	/* do the first full key injection */
		X1 = w[1] + ks[1];
		X2 = w[2] + ks[2];
		X3 = w[3] + ks[3];
		X4 = w[4] + ks[4];
		X5 = w[5] + ks[5] + ts[0];
		X6 = w[6] + ks[6] + ts[1];
		X7 = w[7] + ks[7];

		blkPtr += SKEIN_512_BLOCK_BYTES;

		Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INITIAL,
		    Xptr);
		/* run the rounds */
		/*
		 * One round: four add / rotate-left / xor mixes on the word
		 * pairs (p0, p1) ... (p6, p7), with rotation constants
		 * ROT_0 ... ROT_3 formed by token pasting.
		 */
#define	Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) \
	X##p0 += X##p1; X##p1 = RotL_64(X##p1, ROT##_0); X##p1 ^= X##p0;\
	X##p2 += X##p3; X##p3 = RotL_64(X##p3, ROT##_1); X##p3 ^= X##p2;\
	X##p4 += X##p5; X##p5 = RotL_64(X##p5, ROT##_2); X##p5 ^= X##p4;\
	X##p6 += X##p7; X##p7 = RotL_64(X##p7, ROT##_3); X##p7 ^= X##p6;

#if SKEIN_UNROLL_512 == 0
#define	R512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum)	/* unrolled */ \
	Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) \
	Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rNum, Xptr);

		/*
		 * Key injection number (R) + 1 for the unrolled code: R is a
		 * constant, so the ks/ts indices are reduced modulo 9 and 3
		 * and nothing in kw[] is modified.
		 */
#define	I512(R) \
	X0 += ks[((R) + 1) % 9];	/* inject the key schedule value */\
	X1 += ks[((R) + 2) % 9]; \
	X2 += ks[((R) + 3) % 9]; \
	X3 += ks[((R) + 4) % 9]; \
	X4 += ks[((R) + 5) % 9]; \
	X5 += ks[((R) + 6) % 9] + ts[((R) + 1) % 3]; \
	X6 += ks[((R) + 7) % 9] + ts[((R) + 2) % 3]; \
	X7 += ks[((R) + 8) % 9] + (R) + 1; \
	Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
#else	/* looping version */
#define	R512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) \
	Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) \
	Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rNum, Xptr);

		/*
		 * Key injection for the looping code. The key and tweak
		 * words are copied forward in kw[] so they can be indexed
		 * directly by r + (R).
		 *
		 * Statement order matters: ts[r + (R) + 2] is the same
		 * storage as ks[r + (R) - 1] (ts is kw + 0, ks is kw + 3),
		 * so the key word has to be copied to ks[r + (R) + 8] before
		 * the tweak store overwrites it.
		 */
#define	I512(R) \
	X0 += ks[r + (R) + 0];	/* inject the key schedule value */ \
	X1 += ks[r + (R) + 1]; \
	X2 += ks[r + (R) + 2]; \
	X3 += ks[r + (R) + 3]; \
	X4 += ks[r + (R) + 4]; \
	X5 += ks[r + (R) + 5] + ts[r + (R) + 0]; \
	X6 += ks[r + (R) + 6] + ts[r + (R) + 1]; \
	X7 += ks[r + (R) + 7] + r + (R); \
	ks[r + (R)+8] = ks[r + (R) - 1];	/* rotate key schedule */\
	ts[r + (R)+2] = ts[r + (R) - 1]; \
	Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);

		/* loop through it */
		for (r = 1; r < 2 * RCNT; r += 2 * SKEIN_UNROLL_512)
#endif	/* end of looped code definitions */
		{
#define	R512_8_rounds(R)	/* do 8 full rounds */ \
	R512(0, 1, 2, 3, 4, 5, 6, 7, R_512_0, 8 * (R) + 1); \
	R512(2, 1, 4, 7, 6, 5, 0, 3, R_512_1, 8 * (R) + 2); \
	R512(4, 1, 6, 3, 0, 5, 2, 7, R_512_2, 8 * (R) + 3); \
	R512(6, 1, 0, 7, 2, 5, 4, 3, R_512_3, 8 * (R) + 4); \
	I512(2 * (R)); \
	R512(0, 1, 2, 3, 4, 5, 6, 7, R_512_4, 8 * (R) + 5); \
	R512(2, 1, 4, 7, 6, 5, 0, 3, R_512_5, 8 * (R) + 6); \
	R512(4, 1, 6, 3, 0, 5, 2, 7, R_512_6, 8 * (R) + 7); \
	R512(6, 1, 0, 7, 2, 5, 4, 3, R_512_7, 8 * (R) + 8); \
	I512(2*(R) + 1);	/* and key injection */

			R512_8_rounds(0);

			/*
			 * True when group NN has to be emitted here: for
			 * every group of the cipher in the fully unrolled
			 * build, otherwise for the first SKEIN_UNROLL_512
			 * groups of one loop iteration.
			 */
#define	R512_Unroll_R(NN) \
	((SKEIN_UNROLL_512 == 0 && SKEIN_512_ROUNDS_TOTAL / 8 > (NN)) || \
	(SKEIN_UNROLL_512 > (NN)))

#if R512_Unroll_R(1)
			R512_8_rounds(1);
#endif
#if R512_Unroll_R(2)
			R512_8_rounds(2);
#endif
#if R512_Unroll_R(3)
			R512_8_rounds(3);
#endif
#if R512_Unroll_R(4)
			R512_8_rounds(4);
#endif
#if R512_Unroll_R(5)
			R512_8_rounds(5);
#endif
#if R512_Unroll_R(6)
			R512_8_rounds(6);
#endif
#if R512_Unroll_R(7)
			R512_8_rounds(7);
#endif
#if R512_Unroll_R(8)
			R512_8_rounds(8);
#endif
#if R512_Unroll_R(9)
			R512_8_rounds(9);
#endif
#if R512_Unroll_R(10)
			R512_8_rounds(10);
#endif
#if R512_Unroll_R(11)
			R512_8_rounds(11);
#endif
#if R512_Unroll_R(12)
			R512_8_rounds(12);
#endif
#if R512_Unroll_R(13)
			R512_8_rounds(13);
#endif
#if R512_Unroll_R(14)
			R512_8_rounds(14);
#endif
#if (SKEIN_UNROLL_512 > 14)
#error "need more unrolling in Skein_512_Process_Block"
#endif
		}

		/*
		 * do the final "feedforward" xor, update context chaining vars
		 */
		ctx->X[0] = X0 ^ w[0];
		ctx->X[1] = X1 ^ w[1];
		ctx->X[2] = X2 ^ w[2];
		ctx->X[3] = X3 ^ w[3];
		ctx->X[4] = X4 ^ w[4];
		ctx->X[5] = X5 ^ w[5];
		ctx->X[6] = X6 ^ w[6];
		ctx->X[7] = X7 ^ w[7];
		Skein_Show_Round(BLK_BITS, &ctx->h, SKEIN_RND_FEED_FWD, ctx->X);

		/* only the first block processed keeps the FIRST flag */
		ts[1] &= ~SKEIN_T1_FLAG_FIRST;
	} while (--blkCnt);
	ctx->h.T[0] = ts[0];
	ctx->h.T[1] = ts[1];
}

#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
/*
 * Returns the byte distance from the address of Skein_512_Process_Block to
 * the address of this function.
 *
 * NOTE(review): as a code-size figure this presumably relies on the toolchain
 * placing this function directly after Skein_512_Process_Block, and on
 * function pointers being convertible to uint8_t * -- neither is guaranteed
 * by ISO C; confirm for the target compiler.
 */
size_t
Skein_512_Process_Block_CodeSize(void)
{
	return ((uint8_t *)Skein_512_Process_Block_CodeSize) -
	    ((uint8_t *)Skein_512_Process_Block);
}

/*
 * Returns the compile-time unroll setting SKEIN_UNROLL_512
 * (0 means the fully unrolled code was built).
 */
uint_t
Skein_512_Unroll_Cnt(void)
{
	return (SKEIN_UNROLL_512);
}
#endif
#endif

/* Skein1024 */
#if !(SKEIN_USE_ASM & 1024)
/*
 * Run the Skein1024 block function over blkCnt consecutive blocks.
 *
 *  ctx        - context; ctx->X[0..15] supplies the key schedule words for
 *               each block and receives the feed-forward result, and
 *               ctx->h.T[0..1] is loaded on entry and stored back on exit.
 *  blkPtr     - input data; advanced by SKEIN1024_BLOCK_BYTES per block
 *               (at the end of each iteration) and loaded into 64-bit words
 *               with Skein_Get64_LSB_First().
 *  blkCnt     - number of blocks to process. Must not be 0: the do/while
 *               loop runs its body before testing --blkCnt.
 *  byteCntAdd - added to tweak word T[0] once per block.
 *
 * After every block SKEIN_T1_FLAG_FIRST is cleared in tweak word T[1].
 */
void
Skein1024_Process_Block(Skein1024_Ctxt_t *ctx, const uint8_t *blkPtr,
    size_t blkCnt, size_t byteCntAdd)
{
	/* do it in C, always looping (unrolled is bigger AND slower!) */
	enum {
		WCNT = SKEIN1024_STATE_WORDS
	};
	/* RCNT: number of 8-round groups in the full cipher */
#undef RCNT
#define	RCNT (SKEIN1024_ROUNDS_TOTAL/8)

#ifdef SKEIN_LOOP	/* configure how much to unroll the loop */
#define	SKEIN_UNROLL_1024 ((SKEIN_LOOP)%10)
#else
#define	SKEIN_UNROLL_1024 (0)
#endif

#if (SKEIN_UNROLL_1024 != 0)
#if (RCNT % SKEIN_UNROLL_1024)
#error "Invalid SKEIN_UNROLL_1024"	/* sanity check on unroll count */
#endif
	size_t r;
	/* key schedule words : chaining vars + tweak + "rotation" */
	uint64_t kw[WCNT + 4 + RCNT * 2];
#else
	uint64_t kw[WCNT + 4];	/* key schedule words : chaining vars + tweak */
#endif

	/* local copy of vars, for speed */
	uint64_t X00, X01, X02, X03, X04, X05, X06, X07, X08, X09, X10, X11,
	    X12, X13, X14, X15;
	uint64_t w[WCNT];	/* local copy of input block */
#ifdef SKEIN_DEBUG
	/* use for debugging (help compiler put Xn in registers) */
	const uint64_t *Xptr[16];
	Xptr[0] = &X00;
	Xptr[1] = &X01;
	Xptr[2] = &X02;
	Xptr[3] = &X03;
	Xptr[4] = &X04;
	Xptr[5] = &X05;
	Xptr[6] = &X06;
	Xptr[7] = &X07;
	Xptr[8] = &X08;
	Xptr[9] = &X09;
	Xptr[10] = &X10;
	Xptr[11] = &X11;
	Xptr[12] = &X12;
	Xptr[13] = &X13;
	Xptr[14] = &X14;
	Xptr[15] = &X15;
#endif

	Skein_assert(blkCnt != 0);	/* never call with blkCnt == 0! */
	/* work on a local copy of the tweak; written back after the loop */
	ts[0] = ctx->h.T[0];
	ts[1] = ctx->h.T[1];
	do {
		/*
		 * this implementation only supports 2**64 input bytes
		 * (no carry out here)
		 */
		ts[0] += byteCntAdd;	/* update processed length */

		/* precompute the key schedule for this block */
		ks[0] = ctx->X[0];
		ks[1] = ctx->X[1];
		ks[2] = ctx->X[2];
		ks[3] = ctx->X[3];
		ks[4] = ctx->X[4];
		ks[5] = ctx->X[5];
		ks[6] = ctx->X[6];
		ks[7] = ctx->X[7];
		ks[8] = ctx->X[8];
		ks[9] = ctx->X[9];
		ks[10] = ctx->X[10];
		ks[11] = ctx->X[11];
		ks[12] = ctx->X[12];
		ks[13] = ctx->X[13];
		ks[14] = ctx->X[14];
		ks[15] = ctx->X[15];
		/* extra key word: xor of all chaining words and the parity */
		ks[16] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^
		    ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^
		    ks[8] ^ ks[9] ^ ks[10] ^ ks[11] ^
		    ks[12] ^ ks[13] ^ ks[14] ^ ks[15] ^ SKEIN_KS_PARITY;

		/* third tweak word is the xor of the first two */
		ts[2] = ts[0] ^ ts[1];

		/* get input block in little-endian format */
		Skein_Get64_LSB_First(w, blkPtr, WCNT);
		DebugSaveTweak(ctx);
		Skein_Show_Block(BLK_BITS, &ctx->h, ctx->X, blkPtr, w, ks, ts);

		X00 = w[0] + ks[0];	/* do the first full key injection */
		X01 = w[1] + ks[1];
		X02 = w[2] + ks[2];
		X03 = w[3] + ks[3];
		X04 = w[4] + ks[4];
		X05 = w[5] + ks[5];
		X06 = w[6] + ks[6];
		X07 = w[7] + ks[7];
		X08 = w[8] + ks[8];
		X09 = w[9] + ks[9];
		X10 = w[10] + ks[10];
		X11 = w[11] + ks[11];
		X12 = w[12] + ks[12];
		X13 = w[13] + ks[13] + ts[0];
		X14 = w[14] + ks[14] + ts[1];
		X15 = w[15] + ks[15];

		Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INITIAL,
		    Xptr);

		/*
		 * One round: eight add / rotate-left / xor mixes on the word
		 * pairs (p0, p1) ... (pE, pF), with rotation constants
		 * ROT_0 ... ROT_7 formed by token pasting.
		 */
#define	Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, \
	pD, pE, pF, ROT, rNum) \
	X##p0 += X##p1; X##p1 = RotL_64(X##p1, ROT##_0); X##p1 ^= X##p0;\
	X##p2 += X##p3; X##p3 = RotL_64(X##p3, ROT##_1); X##p3 ^= X##p2;\
	X##p4 += X##p5; X##p5 = RotL_64(X##p5, ROT##_2); X##p5 ^= X##p4;\
	X##p6 += X##p7; X##p7 = RotL_64(X##p7, ROT##_3); X##p7 ^= X##p6;\
	X##p8 += X##p9; X##p9 = RotL_64(X##p9, ROT##_4); X##p9 ^= X##p8;\
	X##pA += X##pB; X##pB = RotL_64(X##pB, ROT##_5); X##pB ^= X##pA;\
	X##pC += X##pD; X##pD = RotL_64(X##pD, ROT##_6); X##pD ^= X##pC;\
	X##pE += X##pF; X##pF = RotL_64(X##pF, ROT##_7); X##pF ^= X##pE;

#if SKEIN_UNROLL_1024 == 0
#define	R1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD, \
	pE, pF, ROT, rn) \
	Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, \
	pD, pE, pF, ROT, rn) \
	Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rn, Xptr);

		/*
		 * Key injection number (R) + 1 for the unrolled code: R is a
		 * constant, so the ks/ts indices are reduced modulo 17 and 3
		 * and nothing in kw[] is modified.
		 */
#define	I1024(R) \
	X00 += ks[((R) + 1) % 17];	/* inject the key schedule value */\
	X01 += ks[((R) + 2) % 17]; \
	X02 += ks[((R) + 3) % 17]; \
	X03 += ks[((R) + 4) % 17]; \
	X04 += ks[((R) + 5) % 17]; \
	X05 += ks[((R) + 6) % 17]; \
	X06 += ks[((R) + 7) % 17]; \
	X07 += ks[((R) + 8) % 17]; \
	X08 += ks[((R) + 9) % 17]; \
	X09 += ks[((R) + 10) % 17]; \
	X10 += ks[((R) + 11) % 17]; \
	X11 += ks[((R) + 12) % 17]; \
	X12 += ks[((R) + 13) % 17]; \
	X13 += ks[((R) + 14) % 17] + ts[((R) + 1) % 3]; \
	X14 += ks[((R) + 15) % 17] + ts[((R) + 2) % 3]; \
	X15 += ks[((R) + 16) % 17] + (R) +1; \
	Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
#else	/* looping version */
#define	R1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD, \
	pE, pF, ROT, rn) \
	Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, \
	pD, pE, pF, ROT, rn) \
	Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rn, Xptr);

		/*
		 * Key injection for the looping code. The key and tweak
		 * words are copied forward in kw[] so they can be indexed
		 * directly by r + (R).
		 *
		 * Statement order matters: ts[r + (R) + 2] is the same
		 * storage as ks[r + (R) - 1] (ts is kw + 0, ks is kw + 3),
		 * so the key word has to be copied to ks[r + (R) + 16]
		 * before the tweak store overwrites it.
		 */
#define	I1024(R) \
	X00 += ks[r + (R) + 0];	/* inject the key schedule value */ \
	X01 += ks[r + (R) + 1]; \
	X02 += ks[r + (R) + 2]; \
	X03 += ks[r + (R) + 3]; \
	X04 += ks[r + (R) + 4]; \
	X05 += ks[r + (R) + 5]; \
	X06 += ks[r + (R) + 6]; \
	X07 += ks[r + (R) + 7]; \
	X08 += ks[r + (R) + 8]; \
	X09 += ks[r + (R) + 9]; \
	X10 += ks[r + (R) + 10]; \
	X11 += ks[r + (R) + 11]; \
	X12 += ks[r + (R) + 12]; \
	X13 += ks[r + (R) + 13] + ts[r + (R) + 0]; \
	X14 += ks[r + (R) + 14] + ts[r + (R) + 1]; \
	X15 += ks[r + (R) + 15] + r + (R); \
	ks[r + (R) + 16] = ks[r + (R) - 1];	/* rotate key schedule */\
	ts[r + (R) + 2] = ts[r + (R) - 1]; \
	Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);

		/*
		 * loop through it
		 *
		 * r only takes odd values here and 2 * RCNT is even, so this
		 * "<=" bound ends the loop at the same point as the "<"
		 * bound used by the 256 and 512 variants.
		 */
		for (r = 1; r <= 2 * RCNT; r += 2 * SKEIN_UNROLL_1024)
#endif
		{
#define	R1024_8_rounds(R)	/* do 8 full rounds */ \
	R1024(00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12, 13, \
	14, 15, R1024_0, 8 * (R) + 1); \
	R1024(00, 09, 02, 13, 06, 11, 04, 15, 10, 07, 12, 03, 14, 05, \
	08, 01, R1024_1, 8 * (R) + 2); \
	R1024(00, 07, 02, 05, 04, 03, 06, 01, 12, 15, 14, 13, 08, 11, \
	10, 09, R1024_2, 8 * (R) + 3); \
	R1024(00, 15, 02, 11, 06, 13, 04, 09, 14, 01, 08, 05, 10, 03, \
	12, 07, R1024_3, 8 * (R) + 4); \
	I1024(2 * (R)); \
	R1024(00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12, 13, \
	14, 15, R1024_4, 8 * (R) + 5); \
	R1024(00, 09, 02, 13, 06, 11, 04, 15, 10, 07, 12, 03, 14, 05, \
	08, 01, R1024_5, 8 * (R) + 6); \
	R1024(00, 07, 02, 05, 04, 03, 06, 01, 12, 15, 14, 13, 08, 11, \
	10, 09, R1024_6, 8 * (R) + 7); \
	R1024(00, 15, 02, 11, 06, 13, 04, 09, 14, 01, 08, 05, 10, 03, \
	12, 07, R1024_7, 8 * (R) + 8); \
	I1024(2 * (R) + 1);

			R1024_8_rounds(0);

			/*
			 * True when group NN has to be emitted here: for
			 * every group of the cipher in the fully unrolled
			 * build, otherwise for the first SKEIN_UNROLL_1024
			 * groups of one loop iteration.
			 */
#define	R1024_Unroll_R(NN) \
	((SKEIN_UNROLL_1024 == 0 && SKEIN1024_ROUNDS_TOTAL/8 > (NN)) || \
	(SKEIN_UNROLL_1024 > (NN)))

#if R1024_Unroll_R(1)
			R1024_8_rounds(1);
#endif
#if R1024_Unroll_R(2)
			R1024_8_rounds(2);
#endif
#if R1024_Unroll_R(3)
			R1024_8_rounds(3);
#endif
#if R1024_Unroll_R(4)
			R1024_8_rounds(4);
#endif
#if R1024_Unroll_R(5)
			R1024_8_rounds(5);
#endif
#if R1024_Unroll_R(6)
			R1024_8_rounds(6);
#endif
#if R1024_Unroll_R(7)
			R1024_8_rounds(7);
#endif
#if R1024_Unroll_R(8)
			R1024_8_rounds(8);
#endif
#if R1024_Unroll_R(9)
			R1024_8_rounds(9);
#endif
#if R1024_Unroll_R(10)
			R1024_8_rounds(10);
#endif
#if R1024_Unroll_R(11)
			R1024_8_rounds(11);
#endif
#if R1024_Unroll_R(12)
			R1024_8_rounds(12);
#endif
#if R1024_Unroll_R(13)
			R1024_8_rounds(13);
#endif
#if R1024_Unroll_R(14)
			R1024_8_rounds(14);
#endif
#if (SKEIN_UNROLL_1024 > 14)
#error "need more unrolling in Skein_1024_Process_Block"
#endif
		}
		/*
		 * do the final "feedforward" xor, update context chaining vars
		 */

		ctx->X[0] = X00 ^ w[0];
		ctx->X[1] = X01 ^ w[1];
		ctx->X[2] = X02 ^ w[2];
		ctx->X[3] = X03 ^ w[3];
		ctx->X[4] = X04 ^ w[4];
		ctx->X[5] = X05 ^ w[5];
		ctx->X[6] = X06 ^ w[6];
		ctx->X[7] = X07 ^ w[7];
		ctx->X[8] = X08 ^ w[8];
		ctx->X[9] = X09 ^ w[9];
		ctx->X[10] = X10 ^ w[10];
		ctx->X[11] = X11 ^ w[11];
		ctx->X[12] = X12 ^ w[12];
		ctx->X[13] = X13 ^ w[13];
		ctx->X[14] = X14 ^ w[14];
		ctx->X[15] = X15 ^ w[15];

		Skein_Show_Round(BLK_BITS, &ctx->h, SKEIN_RND_FEED_FWD, ctx->X);

		/* only the first block processed keeps the FIRST flag */
		ts[1] &= ~SKEIN_T1_FLAG_FIRST;
		blkPtr += SKEIN1024_BLOCK_BYTES;
	} while (--blkCnt);
	ctx->h.T[0] = ts[0];
	ctx->h.T[1] = ts[1];
}

#if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
/*
 * Returns the byte distance from the address of Skein1024_Process_Block to
 * the address of this function.
 *
 * NOTE(review): as a code-size figure this presumably relies on the toolchain
 * placing this function directly after Skein1024_Process_Block, and on
 * function pointers being convertible to uint8_t * -- neither is guaranteed
 * by ISO C; confirm for the target compiler.
 */
size_t
Skein1024_Process_Block_CodeSize(void)
{
	return ((uint8_t *)Skein1024_Process_Block_CodeSize) -
	    ((uint8_t *)Skein1024_Process_Block);
}

/*
 * Returns the compile-time unroll setting SKEIN_UNROLL_1024
 * (0 means the fully unrolled code was built).
 */
uint_t
Skein1024_Unroll_Cnt(void)
{
	return (SKEIN_UNROLL_1024);
}
#endif
#endif