/*
 * Implementation of the Skein block functions.
 * Source code author: Doug Whiting, 2008.
 * This algorithm and source code is released to the public domain.
 * Compile-time switches:
 *  SKEIN_USE_ASM  -- set bits (256/512/1024) to select which
 *    versions use ASM code for block processing
 *    [default: use C for all block sizes]
 */
/* Copyright 2013 Doug Whiting. This code is released to the public domain. */
11
12 #include <sys/skein.h>
13 #include "skein_impl.h"
14
/*
 * SKEIN_USE_ASM is a bit mask of block sizes (256 / 512 / 1024).  Each block
 * function in this file is compiled only when its bit is clear.
 */
#ifndef	SKEIN_USE_ASM
#define	SKEIN_USE_ASM	(0)	/* default is all C code (no ASM) */
#endif

/*
 * SKEIN_LOOP is read as three decimal digits by the SKEIN_UNROLL_* macros
 * in this file: hundreds digit -> Skein-256, tens digit -> Skein-512,
 * units digit -> Skein-1024.  A digit of 0 means "fully unrolled"; a
 * non-zero digit N means "loop, with N groups of 8 rounds per iteration".
 *
 * Note that 001 is an octal literal because of its leading zeros.  Its value
 * is still 1, but an override with more than one significant digit must be
 * written without a leading zero (e.g. 111, not 0111).
 */
#ifndef	SKEIN_LOOP
#define	SKEIN_LOOP	001	/* default: unroll 256 and 512, but not 1024 */
#endif

/*
 * some useful definitions for code here
 *
 * WCNT and kw are locals declared by each block function.  ts and ks are
 * two views into the same kw[] array: the tweak words start at kw[0] and
 * the key schedule words start at kw[3].
 */
#define	BLK_BITS	(WCNT*64)
#define	KW_TWK_BASE	(0)
#define	KW_KEY_BASE	(3)
#define	ks		(kw + KW_KEY_BASE)
#define	ts		(kw + KW_TWK_BASE)

/* no debugging in Illumos version */
#define	DebugSaveTweak(ctx)
32
33 /* Skein_256 */
34 #if !(SKEIN_USE_ASM & 256)
35 void
Skein_256_Process_Block(Skein_256_Ctxt_t * ctx,const uint8_t * blkPtr,size_t blkCnt,size_t byteCntAdd)36 Skein_256_Process_Block(Skein_256_Ctxt_t *ctx, const uint8_t *blkPtr,
37 size_t blkCnt, size_t byteCntAdd)
38 { /* do it in C */
39 enum {
40 WCNT = SKEIN_256_STATE_WORDS
41 };
42 #undef RCNT
43 #define RCNT (SKEIN_256_ROUNDS_TOTAL / 8)
44
45 #ifdef SKEIN_LOOP /* configure how much to unroll the loop */
46 #define SKEIN_UNROLL_256 (((SKEIN_LOOP) / 100) % 10)
47 #else
48 #define SKEIN_UNROLL_256 (0)
49 #endif
50
51 #if SKEIN_UNROLL_256
52 #if (RCNT % SKEIN_UNROLL_256)
53 #error "Invalid SKEIN_UNROLL_256" /* sanity check on unroll count */
54 #endif
55 size_t r;
56 /* key schedule words : chaining vars + tweak + "rotation" */
57 uint64_t kw[WCNT + 4 + RCNT * 2];
58 #else
59 uint64_t kw[WCNT + 4]; /* key schedule words : chaining vars + tweak */
60 #endif
61 /* local copy of context vars, for speed */
62 uint64_t X0, X1, X2, X3;
63 uint64_t w[WCNT]; /* local copy of input block */
64 #ifdef SKEIN_DEBUG
65 /* use for debugging (help compiler put Xn in registers) */
66 const uint64_t *Xptr[4];
67 Xptr[0] = &X0;
68 Xptr[1] = &X1;
69 Xptr[2] = &X2;
70 Xptr[3] = &X3;
71 #endif
72 Skein_assert(blkCnt != 0); /* never call with blkCnt == 0! */
73 ts[0] = ctx->h.T[0];
74 ts[1] = ctx->h.T[1];
75 do {
76 /*
77 * this implementation only supports 2**64 input bytes
78 * (no carry out here)
79 */
80 ts[0] += byteCntAdd; /* update processed length */
81
82 /* precompute the key schedule for this block */
83 ks[0] = ctx->X[0];
84 ks[1] = ctx->X[1];
85 ks[2] = ctx->X[2];
86 ks[3] = ctx->X[3];
87 ks[4] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^ SKEIN_KS_PARITY;
88
89 ts[2] = ts[0] ^ ts[1];
90
91 /* get input block in little-endian format */
92 Skein_Get64_LSB_First(w, blkPtr, WCNT);
93 DebugSaveTweak(ctx);
94 Skein_Show_Block(BLK_BITS, &ctx->h, ctx->X, blkPtr, w, ks, ts);
95
96 X0 = w[0] + ks[0]; /* do the first full key injection */
97 X1 = w[1] + ks[1] + ts[0];
98 X2 = w[2] + ks[2] + ts[1];
99 X3 = w[3] + ks[3];
100
101 Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INITIAL,
102 Xptr); /* show starting state values */
103
104 blkPtr += SKEIN_256_BLOCK_BYTES;
105
106 /* run the rounds */
107
108 #define Round256(p0, p1, p2, p3, ROT, rNum) \
109 X##p0 += X##p1; X##p1 = RotL_64(X##p1, ROT##_0); X##p1 ^= X##p0; \
110 X##p2 += X##p3; X##p3 = RotL_64(X##p3, ROT##_1); X##p3 ^= X##p2; \
111
112 #if SKEIN_UNROLL_256 == 0
113 #define R256(p0, p1, p2, p3, ROT, rNum) /* fully unrolled */ \
114 Round256(p0, p1, p2, p3, ROT, rNum) \
115 Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rNum, Xptr);
116
117 #define I256(R) \
118 X0 += ks[((R) + 1) % 5]; /* inject the key schedule value */ \
119 X1 += ks[((R) + 2) % 5] + ts[((R) + 1) % 3]; \
120 X2 += ks[((R) + 3) % 5] + ts[((R) + 2) % 3]; \
121 X3 += ks[((R) + 4) % 5] + (R) + 1; \
122 Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
123 #else /* looping version */
124 #define R256(p0, p1, p2, p3, ROT, rNum) \
125 Round256(p0, p1, p2, p3, ROT, rNum) \
126 Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rNum, Xptr);
127
128 #define I256(R) \
129 X0 += ks[r + (R) + 0]; /* inject the key schedule value */ \
130 X1 += ks[r + (R) + 1] + ts[r + (R) + 0]; \
131 X2 += ks[r + (R) + 2] + ts[r + (R) + 1]; \
132 X3 += ks[r + (R) + 3] + r + (R); \
133 ks[r + (R) + 4] = ks[r + (R) - 1]; /* rotate key schedule */ \
134 ts[r + (R) + 2] = ts[r + (R) - 1]; \
135 Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
136
137 /* loop thru it */
138 for (r = 1; r < 2 * RCNT; r += 2 * SKEIN_UNROLL_256)
139 #endif
140 {
141 #define R256_8_rounds(R) \
142 R256(0, 1, 2, 3, R_256_0, 8 * (R) + 1); \
143 R256(0, 3, 2, 1, R_256_1, 8 * (R) + 2); \
144 R256(0, 1, 2, 3, R_256_2, 8 * (R) + 3); \
145 R256(0, 3, 2, 1, R_256_3, 8 * (R) + 4); \
146 I256(2 * (R)); \
147 R256(0, 1, 2, 3, R_256_4, 8 * (R) + 5); \
148 R256(0, 3, 2, 1, R_256_5, 8 * (R) + 6); \
149 R256(0, 1, 2, 3, R_256_6, 8 * (R) + 7); \
150 R256(0, 3, 2, 1, R_256_7, 8 * (R) + 8); \
151 I256(2 * (R) + 1);
152
153 R256_8_rounds(0);
154
155 #define R256_Unroll_R(NN) \
156 ((SKEIN_UNROLL_256 == 0 && SKEIN_256_ROUNDS_TOTAL / 8 > (NN)) || \
157 (SKEIN_UNROLL_256 > (NN)))
158
159 #if R256_Unroll_R(1)
160 R256_8_rounds(1);
161 #endif
162 #if R256_Unroll_R(2)
163 R256_8_rounds(2);
164 #endif
165 #if R256_Unroll_R(3)
166 R256_8_rounds(3);
167 #endif
168 #if R256_Unroll_R(4)
169 R256_8_rounds(4);
170 #endif
171 #if R256_Unroll_R(5)
172 R256_8_rounds(5);
173 #endif
174 #if R256_Unroll_R(6)
175 R256_8_rounds(6);
176 #endif
177 #if R256_Unroll_R(7)
178 R256_8_rounds(7);
179 #endif
180 #if R256_Unroll_R(8)
181 R256_8_rounds(8);
182 #endif
183 #if R256_Unroll_R(9)
184 R256_8_rounds(9);
185 #endif
186 #if R256_Unroll_R(10)
187 R256_8_rounds(10);
188 #endif
189 #if R256_Unroll_R(11)
190 R256_8_rounds(11);
191 #endif
192 #if R256_Unroll_R(12)
193 R256_8_rounds(12);
194 #endif
195 #if R256_Unroll_R(13)
196 R256_8_rounds(13);
197 #endif
198 #if R256_Unroll_R(14)
199 R256_8_rounds(14);
200 #endif
201 #if (SKEIN_UNROLL_256 > 14)
202 #error "need more unrolling in Skein_256_Process_Block"
203 #endif
204 }
205 /*
206 * do the final "feedforward" xor, update context chaining vars
207 */
208 ctx->X[0] = X0 ^ w[0];
209 ctx->X[1] = X1 ^ w[1];
210 ctx->X[2] = X2 ^ w[2];
211 ctx->X[3] = X3 ^ w[3];
212
213 Skein_Show_Round(BLK_BITS, &ctx->h, SKEIN_RND_FEED_FWD, ctx->X);
214
215 ts[1] &= ~SKEIN_T1_FLAG_FIRST;
216 }
217 while (--blkCnt);
218 ctx->h.T[0] = ts[0];
219 ctx->h.T[1] = ts[1];
220 }
221
222 #if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
223 size_t
Skein_256_Process_Block_CodeSize(void)224 Skein_256_Process_Block_CodeSize(void)
225 {
226 return ((uint8_t *)Skein_256_Process_Block_CodeSize) -
227 ((uint8_t *)Skein_256_Process_Block);
228 }
229
230 uint_t
Skein_256_Unroll_Cnt(void)231 Skein_256_Unroll_Cnt(void)
232 {
233 return (SKEIN_UNROLL_256);
234 }
235 #endif
236 #endif
237
238 /* Skein_512 */
239 #if !(SKEIN_USE_ASM & 512)
240 void
Skein_512_Process_Block(Skein_512_Ctxt_t * ctx,const uint8_t * blkPtr,size_t blkCnt,size_t byteCntAdd)241 Skein_512_Process_Block(Skein_512_Ctxt_t *ctx, const uint8_t *blkPtr,
242 size_t blkCnt, size_t byteCntAdd)
243 { /* do it in C */
244 enum {
245 WCNT = SKEIN_512_STATE_WORDS
246 };
247 #undef RCNT
248 #define RCNT (SKEIN_512_ROUNDS_TOTAL / 8)
249
250 #ifdef SKEIN_LOOP /* configure how much to unroll the loop */
251 #define SKEIN_UNROLL_512 (((SKEIN_LOOP) / 10) % 10)
252 #else
253 #define SKEIN_UNROLL_512 (0)
254 #endif
255
256 #if SKEIN_UNROLL_512
257 #if (RCNT % SKEIN_UNROLL_512)
258 #error "Invalid SKEIN_UNROLL_512" /* sanity check on unroll count */
259 #endif
260 size_t r;
261 /* key schedule words : chaining vars + tweak + "rotation" */
262 uint64_t kw[WCNT + 4 + RCNT * 2];
263 #else
264 uint64_t kw[WCNT + 4]; /* key schedule words : chaining vars + tweak */
265 #endif
266 /* local copy of vars, for speed */
267 uint64_t X0, X1, X2, X3, X4, X5, X6, X7;
268 uint64_t w[WCNT]; /* local copy of input block */
269 #ifdef SKEIN_DEBUG
270 /* use for debugging (help compiler put Xn in registers) */
271 const uint64_t *Xptr[8];
272 Xptr[0] = &X0;
273 Xptr[1] = &X1;
274 Xptr[2] = &X2;
275 Xptr[3] = &X3;
276 Xptr[4] = &X4;
277 Xptr[5] = &X5;
278 Xptr[6] = &X6;
279 Xptr[7] = &X7;
280 #endif
281
282 Skein_assert(blkCnt != 0); /* never call with blkCnt == 0! */
283 ts[0] = ctx->h.T[0];
284 ts[1] = ctx->h.T[1];
285 do {
286 /*
287 * this implementation only supports 2**64 input bytes
288 * (no carry out here)
289 */
290 ts[0] += byteCntAdd; /* update processed length */
291
292 /* precompute the key schedule for this block */
293 ks[0] = ctx->X[0];
294 ks[1] = ctx->X[1];
295 ks[2] = ctx->X[2];
296 ks[3] = ctx->X[3];
297 ks[4] = ctx->X[4];
298 ks[5] = ctx->X[5];
299 ks[6] = ctx->X[6];
300 ks[7] = ctx->X[7];
301 ks[8] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^
302 ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^ SKEIN_KS_PARITY;
303
304 ts[2] = ts[0] ^ ts[1];
305
306 /* get input block in little-endian format */
307 Skein_Get64_LSB_First(w, blkPtr, WCNT);
308 DebugSaveTweak(ctx);
309 Skein_Show_Block(BLK_BITS, &ctx->h, ctx->X, blkPtr, w, ks, ts);
310
311 X0 = w[0] + ks[0]; /* do the first full key injection */
312 X1 = w[1] + ks[1];
313 X2 = w[2] + ks[2];
314 X3 = w[3] + ks[3];
315 X4 = w[4] + ks[4];
316 X5 = w[5] + ks[5] + ts[0];
317 X6 = w[6] + ks[6] + ts[1];
318 X7 = w[7] + ks[7];
319
320 blkPtr += SKEIN_512_BLOCK_BYTES;
321
322 Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INITIAL,
323 Xptr);
324 /* run the rounds */
325 #define Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) \
326 X##p0 += X##p1; X##p1 = RotL_64(X##p1, ROT##_0); X##p1 ^= X##p0;\
327 X##p2 += X##p3; X##p3 = RotL_64(X##p3, ROT##_1); X##p3 ^= X##p2;\
328 X##p4 += X##p5; X##p5 = RotL_64(X##p5, ROT##_2); X##p5 ^= X##p4;\
329 X##p6 += X##p7; X##p7 = RotL_64(X##p7, ROT##_3); X##p7 ^= X##p6;
330
331 #if SKEIN_UNROLL_512 == 0
332 #define R512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) /* unrolled */ \
333 Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) \
334 Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rNum, Xptr);
335
336 #define I512(R) \
337 X0 += ks[((R) + 1) % 9]; /* inject the key schedule value */\
338 X1 += ks[((R) + 2) % 9]; \
339 X2 += ks[((R) + 3) % 9]; \
340 X3 += ks[((R) + 4) % 9]; \
341 X4 += ks[((R) + 5) % 9]; \
342 X5 += ks[((R) + 6) % 9] + ts[((R) + 1) % 3]; \
343 X6 += ks[((R) + 7) % 9] + ts[((R) + 2) % 3]; \
344 X7 += ks[((R) + 8) % 9] + (R) + 1; \
345 Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
346 #else /* looping version */
347 #define R512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) \
348 Round512(p0, p1, p2, p3, p4, p5, p6, p7, ROT, rNum) \
349 Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rNum, Xptr);
350
351 #define I512(R) \
352 X0 += ks[r + (R) + 0]; /* inject the key schedule value */ \
353 X1 += ks[r + (R) + 1]; \
354 X2 += ks[r + (R) + 2]; \
355 X3 += ks[r + (R) + 3]; \
356 X4 += ks[r + (R) + 4]; \
357 X5 += ks[r + (R) + 5] + ts[r + (R) + 0]; \
358 X6 += ks[r + (R) + 6] + ts[r + (R) + 1]; \
359 X7 += ks[r + (R) + 7] + r + (R); \
360 ks[r + (R)+8] = ks[r + (R) - 1]; /* rotate key schedule */\
361 ts[r + (R)+2] = ts[r + (R) - 1]; \
362 Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
363
364 /* loop thru it */
365 for (r = 1; r < 2 * RCNT; r += 2 * SKEIN_UNROLL_512)
366 #endif /* end of looped code definitions */
367 {
368 #define R512_8_rounds(R) /* do 8 full rounds */ \
369 R512(0, 1, 2, 3, 4, 5, 6, 7, R_512_0, 8 * (R) + 1); \
370 R512(2, 1, 4, 7, 6, 5, 0, 3, R_512_1, 8 * (R) + 2); \
371 R512(4, 1, 6, 3, 0, 5, 2, 7, R_512_2, 8 * (R) + 3); \
372 R512(6, 1, 0, 7, 2, 5, 4, 3, R_512_3, 8 * (R) + 4); \
373 I512(2 * (R)); \
374 R512(0, 1, 2, 3, 4, 5, 6, 7, R_512_4, 8 * (R) + 5); \
375 R512(2, 1, 4, 7, 6, 5, 0, 3, R_512_5, 8 * (R) + 6); \
376 R512(4, 1, 6, 3, 0, 5, 2, 7, R_512_6, 8 * (R) + 7); \
377 R512(6, 1, 0, 7, 2, 5, 4, 3, R_512_7, 8 * (R) + 8); \
378 I512(2*(R) + 1); /* and key injection */
379
380 R512_8_rounds(0);
381
382 #define R512_Unroll_R(NN) \
383 ((SKEIN_UNROLL_512 == 0 && SKEIN_512_ROUNDS_TOTAL / 8 > (NN)) || \
384 (SKEIN_UNROLL_512 > (NN)))
385
386 #if R512_Unroll_R(1)
387 R512_8_rounds(1);
388 #endif
389 #if R512_Unroll_R(2)
390 R512_8_rounds(2);
391 #endif
392 #if R512_Unroll_R(3)
393 R512_8_rounds(3);
394 #endif
395 #if R512_Unroll_R(4)
396 R512_8_rounds(4);
397 #endif
398 #if R512_Unroll_R(5)
399 R512_8_rounds(5);
400 #endif
401 #if R512_Unroll_R(6)
402 R512_8_rounds(6);
403 #endif
404 #if R512_Unroll_R(7)
405 R512_8_rounds(7);
406 #endif
407 #if R512_Unroll_R(8)
408 R512_8_rounds(8);
409 #endif
410 #if R512_Unroll_R(9)
411 R512_8_rounds(9);
412 #endif
413 #if R512_Unroll_R(10)
414 R512_8_rounds(10);
415 #endif
416 #if R512_Unroll_R(11)
417 R512_8_rounds(11);
418 #endif
419 #if R512_Unroll_R(12)
420 R512_8_rounds(12);
421 #endif
422 #if R512_Unroll_R(13)
423 R512_8_rounds(13);
424 #endif
425 #if R512_Unroll_R(14)
426 R512_8_rounds(14);
427 #endif
428 #if (SKEIN_UNROLL_512 > 14)
429 #error "need more unrolling in Skein_512_Process_Block"
430 #endif
431 }
432
433 /*
434 * do the final "feedforward" xor, update context chaining vars
435 */
436 ctx->X[0] = X0 ^ w[0];
437 ctx->X[1] = X1 ^ w[1];
438 ctx->X[2] = X2 ^ w[2];
439 ctx->X[3] = X3 ^ w[3];
440 ctx->X[4] = X4 ^ w[4];
441 ctx->X[5] = X5 ^ w[5];
442 ctx->X[6] = X6 ^ w[6];
443 ctx->X[7] = X7 ^ w[7];
444 Skein_Show_Round(BLK_BITS, &ctx->h, SKEIN_RND_FEED_FWD, ctx->X);
445
446 ts[1] &= ~SKEIN_T1_FLAG_FIRST;
447 }
448 while (--blkCnt);
449 ctx->h.T[0] = ts[0];
450 ctx->h.T[1] = ts[1];
451 }
452
453 #if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
454 size_t
Skein_512_Process_Block_CodeSize(void)455 Skein_512_Process_Block_CodeSize(void)
456 {
457 return ((uint8_t *)Skein_512_Process_Block_CodeSize) -
458 ((uint8_t *)Skein_512_Process_Block);
459 }
460
461 uint_t
Skein_512_Unroll_Cnt(void)462 Skein_512_Unroll_Cnt(void)
463 {
464 return (SKEIN_UNROLL_512);
465 }
466 #endif
467 #endif
468
469 /* Skein1024 */
470 #if !(SKEIN_USE_ASM & 1024)
471 void
Skein1024_Process_Block(Skein1024_Ctxt_t * ctx,const uint8_t * blkPtr,size_t blkCnt,size_t byteCntAdd)472 Skein1024_Process_Block(Skein1024_Ctxt_t *ctx, const uint8_t *blkPtr,
473 size_t blkCnt, size_t byteCntAdd)
474 {
475 /* do it in C, always looping (unrolled is bigger AND slower!) */
476 enum {
477 WCNT = SKEIN1024_STATE_WORDS
478 };
479 #undef RCNT
480 #define RCNT (SKEIN1024_ROUNDS_TOTAL/8)
481
482 #ifdef SKEIN_LOOP /* configure how much to unroll the loop */
483 #define SKEIN_UNROLL_1024 ((SKEIN_LOOP)%10)
484 #else
485 #define SKEIN_UNROLL_1024 (0)
486 #endif
487
488 #if (SKEIN_UNROLL_1024 != 0)
489 #if (RCNT % SKEIN_UNROLL_1024)
490 #error "Invalid SKEIN_UNROLL_1024" /* sanity check on unroll count */
491 #endif
492 size_t r;
493 /* key schedule words : chaining vars + tweak + "rotation" */
494 uint64_t kw[WCNT + 4 + RCNT * 2];
495 #else
496 uint64_t kw[WCNT + 4]; /* key schedule words : chaining vars + tweak */
497 #endif
498
499 /* local copy of vars, for speed */
500 uint64_t X00, X01, X02, X03, X04, X05, X06, X07, X08, X09, X10, X11,
501 X12, X13, X14, X15;
502 uint64_t w[WCNT]; /* local copy of input block */
503 #ifdef SKEIN_DEBUG
504 /* use for debugging (help compiler put Xn in registers) */
505 const uint64_t *Xptr[16];
506 Xptr[0] = &X00;
507 Xptr[1] = &X01;
508 Xptr[2] = &X02;
509 Xptr[3] = &X03;
510 Xptr[4] = &X04;
511 Xptr[5] = &X05;
512 Xptr[6] = &X06;
513 Xptr[7] = &X07;
514 Xptr[8] = &X08;
515 Xptr[9] = &X09;
516 Xptr[10] = &X10;
517 Xptr[11] = &X11;
518 Xptr[12] = &X12;
519 Xptr[13] = &X13;
520 Xptr[14] = &X14;
521 Xptr[15] = &X15;
522 #endif
523
524 Skein_assert(blkCnt != 0); /* never call with blkCnt == 0! */
525 ts[0] = ctx->h.T[0];
526 ts[1] = ctx->h.T[1];
527 do {
528 /*
529 * this implementation only supports 2**64 input bytes
530 * (no carry out here)
531 */
532 ts[0] += byteCntAdd; /* update processed length */
533
534 /* precompute the key schedule for this block */
535 ks[0] = ctx->X[0];
536 ks[1] = ctx->X[1];
537 ks[2] = ctx->X[2];
538 ks[3] = ctx->X[3];
539 ks[4] = ctx->X[4];
540 ks[5] = ctx->X[5];
541 ks[6] = ctx->X[6];
542 ks[7] = ctx->X[7];
543 ks[8] = ctx->X[8];
544 ks[9] = ctx->X[9];
545 ks[10] = ctx->X[10];
546 ks[11] = ctx->X[11];
547 ks[12] = ctx->X[12];
548 ks[13] = ctx->X[13];
549 ks[14] = ctx->X[14];
550 ks[15] = ctx->X[15];
551 ks[16] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^
552 ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^
553 ks[8] ^ ks[9] ^ ks[10] ^ ks[11] ^
554 ks[12] ^ ks[13] ^ ks[14] ^ ks[15] ^ SKEIN_KS_PARITY;
555
556 ts[2] = ts[0] ^ ts[1];
557
558 /* get input block in little-endian format */
559 Skein_Get64_LSB_First(w, blkPtr, WCNT);
560 DebugSaveTweak(ctx);
561 Skein_Show_Block(BLK_BITS, &ctx->h, ctx->X, blkPtr, w, ks, ts);
562
563 X00 = w[0] + ks[0]; /* do the first full key injection */
564 X01 = w[1] + ks[1];
565 X02 = w[2] + ks[2];
566 X03 = w[3] + ks[3];
567 X04 = w[4] + ks[4];
568 X05 = w[5] + ks[5];
569 X06 = w[6] + ks[6];
570 X07 = w[7] + ks[7];
571 X08 = w[8] + ks[8];
572 X09 = w[9] + ks[9];
573 X10 = w[10] + ks[10];
574 X11 = w[11] + ks[11];
575 X12 = w[12] + ks[12];
576 X13 = w[13] + ks[13] + ts[0];
577 X14 = w[14] + ks[14] + ts[1];
578 X15 = w[15] + ks[15];
579
580 Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INITIAL,
581 Xptr);
582
583 #define Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, \
584 pD, pE, pF, ROT, rNum) \
585 X##p0 += X##p1; X##p1 = RotL_64(X##p1, ROT##_0); X##p1 ^= X##p0;\
586 X##p2 += X##p3; X##p3 = RotL_64(X##p3, ROT##_1); X##p3 ^= X##p2;\
587 X##p4 += X##p5; X##p5 = RotL_64(X##p5, ROT##_2); X##p5 ^= X##p4;\
588 X##p6 += X##p7; X##p7 = RotL_64(X##p7, ROT##_3); X##p7 ^= X##p6;\
589 X##p8 += X##p9; X##p9 = RotL_64(X##p9, ROT##_4); X##p9 ^= X##p8;\
590 X##pA += X##pB; X##pB = RotL_64(X##pB, ROT##_5); X##pB ^= X##pA;\
591 X##pC += X##pD; X##pD = RotL_64(X##pD, ROT##_6); X##pD ^= X##pC;\
592 X##pE += X##pF; X##pF = RotL_64(X##pF, ROT##_7); X##pF ^= X##pE;
593
594 #if SKEIN_UNROLL_1024 == 0
595 #define R1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD, \
596 pE, pF, ROT, rn) \
597 Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, \
598 pD, pE, pF, ROT, rn) \
599 Skein_Show_R_Ptr(BLK_BITS, &ctx->h, rn, Xptr);
600
601 #define I1024(R) \
602 X00 += ks[((R) + 1) % 17]; /* inject the key schedule value */\
603 X01 += ks[((R) + 2) % 17]; \
604 X02 += ks[((R) + 3) % 17]; \
605 X03 += ks[((R) + 4) % 17]; \
606 X04 += ks[((R) + 5) % 17]; \
607 X05 += ks[((R) + 6) % 17]; \
608 X06 += ks[((R) + 7) % 17]; \
609 X07 += ks[((R) + 8) % 17]; \
610 X08 += ks[((R) + 9) % 17]; \
611 X09 += ks[((R) + 10) % 17]; \
612 X10 += ks[((R) + 11) % 17]; \
613 X11 += ks[((R) + 12) % 17]; \
614 X12 += ks[((R) + 13) % 17]; \
615 X13 += ks[((R) + 14) % 17] + ts[((R) + 1) % 3]; \
616 X14 += ks[((R) + 15) % 17] + ts[((R) + 2) % 3]; \
617 X15 += ks[((R) + 16) % 17] + (R) +1; \
618 Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
619 #else /* looping version */
620 #define R1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, pD, \
621 pE, pF, ROT, rn) \
622 Round1024(p0, p1, p2, p3, p4, p5, p6, p7, p8, p9, pA, pB, pC, \
623 pD, pE, pF, ROT, rn) \
624 Skein_Show_R_Ptr(BLK_BITS, &ctx->h, 4 * (r - 1) + rn, Xptr);
625
626 #define I1024(R) \
627 X00 += ks[r + (R) + 0]; /* inject the key schedule value */ \
628 X01 += ks[r + (R) + 1]; \
629 X02 += ks[r + (R) + 2]; \
630 X03 += ks[r + (R) + 3]; \
631 X04 += ks[r + (R) + 4]; \
632 X05 += ks[r + (R) + 5]; \
633 X06 += ks[r + (R) + 6]; \
634 X07 += ks[r + (R) + 7]; \
635 X08 += ks[r + (R) + 8]; \
636 X09 += ks[r + (R) + 9]; \
637 X10 += ks[r + (R) + 10]; \
638 X11 += ks[r + (R) + 11]; \
639 X12 += ks[r + (R) + 12]; \
640 X13 += ks[r + (R) + 13] + ts[r + (R) + 0]; \
641 X14 += ks[r + (R) + 14] + ts[r + (R) + 1]; \
642 X15 += ks[r + (R) + 15] + r + (R); \
643 ks[r + (R) + 16] = ks[r + (R) - 1]; /* rotate key schedule */\
644 ts[r + (R) + 2] = ts[r + (R) - 1]; \
645 Skein_Show_R_Ptr(BLK_BITS, &ctx->h, SKEIN_RND_KEY_INJECT, Xptr);
646
647 /* loop thru it */
648 for (r = 1; r <= 2 * RCNT; r += 2 * SKEIN_UNROLL_1024)
649 #endif
650 {
651 #define R1024_8_rounds(R) /* do 8 full rounds */ \
652 R1024(00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12, 13, \
653 14, 15, R1024_0, 8 * (R) + 1); \
654 R1024(00, 09, 02, 13, 06, 11, 04, 15, 10, 07, 12, 03, 14, 05, \
655 08, 01, R1024_1, 8 * (R) + 2); \
656 R1024(00, 07, 02, 05, 04, 03, 06, 01, 12, 15, 14, 13, 08, 11, \
657 10, 09, R1024_2, 8 * (R) + 3); \
658 R1024(00, 15, 02, 11, 06, 13, 04, 09, 14, 01, 08, 05, 10, 03, \
659 12, 07, R1024_3, 8 * (R) + 4); \
660 I1024(2 * (R)); \
661 R1024(00, 01, 02, 03, 04, 05, 06, 07, 08, 09, 10, 11, 12, 13, \
662 14, 15, R1024_4, 8 * (R) + 5); \
663 R1024(00, 09, 02, 13, 06, 11, 04, 15, 10, 07, 12, 03, 14, 05, \
664 08, 01, R1024_5, 8 * (R) + 6); \
665 R1024(00, 07, 02, 05, 04, 03, 06, 01, 12, 15, 14, 13, 08, 11, \
666 10, 09, R1024_6, 8 * (R) + 7); \
667 R1024(00, 15, 02, 11, 06, 13, 04, 09, 14, 01, 08, 05, 10, 03, \
668 12, 07, R1024_7, 8 * (R) + 8); \
669 I1024(2 * (R) + 1);
670
671 R1024_8_rounds(0);
672
673 #define R1024_Unroll_R(NN) \
674 ((SKEIN_UNROLL_1024 == 0 && SKEIN1024_ROUNDS_TOTAL/8 > (NN)) || \
675 (SKEIN_UNROLL_1024 > (NN)))
676
677 #if R1024_Unroll_R(1)
678 R1024_8_rounds(1);
679 #endif
680 #if R1024_Unroll_R(2)
681 R1024_8_rounds(2);
682 #endif
683 #if R1024_Unroll_R(3)
684 R1024_8_rounds(3);
685 #endif
686 #if R1024_Unroll_R(4)
687 R1024_8_rounds(4);
688 #endif
689 #if R1024_Unroll_R(5)
690 R1024_8_rounds(5);
691 #endif
692 #if R1024_Unroll_R(6)
693 R1024_8_rounds(6);
694 #endif
695 #if R1024_Unroll_R(7)
696 R1024_8_rounds(7);
697 #endif
698 #if R1024_Unroll_R(8)
699 R1024_8_rounds(8);
700 #endif
701 #if R1024_Unroll_R(9)
702 R1024_8_rounds(9);
703 #endif
704 #if R1024_Unroll_R(10)
705 R1024_8_rounds(10);
706 #endif
707 #if R1024_Unroll_R(11)
708 R1024_8_rounds(11);
709 #endif
710 #if R1024_Unroll_R(12)
711 R1024_8_rounds(12);
712 #endif
713 #if R1024_Unroll_R(13)
714 R1024_8_rounds(13);
715 #endif
716 #if R1024_Unroll_R(14)
717 R1024_8_rounds(14);
718 #endif
719 #if (SKEIN_UNROLL_1024 > 14)
720 #error "need more unrolling in Skein_1024_Process_Block"
721 #endif
722 }
723 /*
724 * do the final "feedforward" xor, update context chaining vars
725 */
726
727 ctx->X[0] = X00 ^ w[0];
728 ctx->X[1] = X01 ^ w[1];
729 ctx->X[2] = X02 ^ w[2];
730 ctx->X[3] = X03 ^ w[3];
731 ctx->X[4] = X04 ^ w[4];
732 ctx->X[5] = X05 ^ w[5];
733 ctx->X[6] = X06 ^ w[6];
734 ctx->X[7] = X07 ^ w[7];
735 ctx->X[8] = X08 ^ w[8];
736 ctx->X[9] = X09 ^ w[9];
737 ctx->X[10] = X10 ^ w[10];
738 ctx->X[11] = X11 ^ w[11];
739 ctx->X[12] = X12 ^ w[12];
740 ctx->X[13] = X13 ^ w[13];
741 ctx->X[14] = X14 ^ w[14];
742 ctx->X[15] = X15 ^ w[15];
743
744 Skein_Show_Round(BLK_BITS, &ctx->h, SKEIN_RND_FEED_FWD, ctx->X);
745
746 ts[1] &= ~SKEIN_T1_FLAG_FIRST;
747 blkPtr += SKEIN1024_BLOCK_BYTES;
748 } while (--blkCnt);
749 ctx->h.T[0] = ts[0];
750 ctx->h.T[1] = ts[1];
751 }
752
753 #if defined(SKEIN_CODE_SIZE) || defined(SKEIN_PERF)
754 size_t
Skein1024_Process_Block_CodeSize(void)755 Skein1024_Process_Block_CodeSize(void)
756 {
757 return ((uint8_t *)Skein1024_Process_Block_CodeSize) -
758 ((uint8_t *)Skein1024_Process_Block);
759 }
760
761 uint_t
Skein1024_Unroll_Cnt(void)762 Skein1024_Unroll_Cnt(void)
763 {
764 return (SKEIN_UNROLL_1024);
765 }
766 #endif
767 #endif
768