1 /*
2 * Copyright 2016 The OpenSSL Project Authors. All Rights Reserved.
3 *
4 * Licensed under the Apache License 2.0 (the "License"). You may not use
5 * this file except in compliance with the License. You can obtain a copy
6 * in the file LICENSE in the source distribution or at
7 * https://www.openssl.org/source/license.html
8 */
9
10 #include <openssl/e_os2.h>
11 #include <string.h>
12 #include <assert.h>
13
14 size_t SHA3_absorb(uint64_t A[5][5], const unsigned char *inp, size_t len,
15 size_t r);
16 void SHA3_squeeze(uint64_t A[5][5], unsigned char *out, size_t len, size_t r);
17
18 #if !defined(KECCAK1600_ASM) || !defined(SELFTEST)
19
20 /*
21 * Choose some sensible defaults
22 */
23 #if !defined(KECCAK_REF) && !defined(KECCAK_1X) && !defined(KECCAK_1X_ALT) && \
24 !defined(KECCAK_2X) && !defined(KECCAK_INPLACE)
25 # define KECCAK_2X /* default to KECCAK_2X variant */
26 #endif
27
28 #if defined(__i386) || defined(__i386__) || defined(_M_IX86) || \
29 (defined(__x86_64) && !defined(__BMI__)) || defined(_M_X64) || \
30 defined(__mips) || defined(__riscv) || defined(__s390__) || \
31 defined(__EMSCRIPTEN__)
32 /*
33 * These don't have "and with complement" instruction, so minimize amount
34 * of "not"-s. Implemented only in the [default] KECCAK_2X variant.
35 */
36 # define KECCAK_COMPLEMENTING_TRANSFORM
37 #endif
38
39 #if defined(__x86_64__) || defined(__aarch64__) || \
40 defined(__mips64) || defined(__ia64) || \
41 (defined(__VMS) && !defined(__vax))
42 /*
43 * These are available even in ILP32 flavours, but even then they are
44 * capable of performing 64-bit operations as efficiently as in *P64.
45 * Since it's not given that we can use sizeof(void *), just shunt it.
46 */
47 # define BIT_INTERLEAVE (0)
48 #else
49 # define BIT_INTERLEAVE (sizeof(void *) < 8)
50 #endif
51
52 #define ROL32(a, offset) (((a) << (offset)) | ((a) >> ((32 - (offset)) & 31)))
53
ROL64(uint64_t val,int offset)54 static uint64_t ROL64(uint64_t val, int offset)
55 {
56 if (offset == 0) {
57 return val;
58 } else if (!BIT_INTERLEAVE) {
59 return (val << offset) | (val >> (64-offset));
60 } else {
61 uint32_t hi = (uint32_t)(val >> 32), lo = (uint32_t)val;
62
63 if (offset & 1) {
64 uint32_t tmp = hi;
65
66 offset >>= 1;
67 hi = ROL32(lo, offset);
68 lo = ROL32(tmp, offset + 1);
69 } else {
70 offset >>= 1;
71 lo = ROL32(lo, offset);
72 hi = ROL32(hi, offset);
73 }
74
75 return ((uint64_t)hi << 32) | lo;
76 }
77 }
78
79 static const unsigned char rhotates[5][5] = {
80 { 0, 1, 62, 28, 27 },
81 { 36, 44, 6, 55, 20 },
82 { 3, 10, 43, 25, 39 },
83 { 41, 45, 15, 21, 8 },
84 { 18, 2, 61, 56, 14 }
85 };
86
87 static const uint64_t iotas[] = {
88 BIT_INTERLEAVE ? 0x0000000000000001ULL : 0x0000000000000001ULL,
89 BIT_INTERLEAVE ? 0x0000008900000000ULL : 0x0000000000008082ULL,
90 BIT_INTERLEAVE ? 0x8000008b00000000ULL : 0x800000000000808aULL,
91 BIT_INTERLEAVE ? 0x8000808000000000ULL : 0x8000000080008000ULL,
92 BIT_INTERLEAVE ? 0x0000008b00000001ULL : 0x000000000000808bULL,
93 BIT_INTERLEAVE ? 0x0000800000000001ULL : 0x0000000080000001ULL,
94 BIT_INTERLEAVE ? 0x8000808800000001ULL : 0x8000000080008081ULL,
95 BIT_INTERLEAVE ? 0x8000008200000001ULL : 0x8000000000008009ULL,
96 BIT_INTERLEAVE ? 0x0000000b00000000ULL : 0x000000000000008aULL,
97 BIT_INTERLEAVE ? 0x0000000a00000000ULL : 0x0000000000000088ULL,
98 BIT_INTERLEAVE ? 0x0000808200000001ULL : 0x0000000080008009ULL,
99 BIT_INTERLEAVE ? 0x0000800300000000ULL : 0x000000008000000aULL,
100 BIT_INTERLEAVE ? 0x0000808b00000001ULL : 0x000000008000808bULL,
101 BIT_INTERLEAVE ? 0x8000000b00000001ULL : 0x800000000000008bULL,
102 BIT_INTERLEAVE ? 0x8000008a00000001ULL : 0x8000000000008089ULL,
103 BIT_INTERLEAVE ? 0x8000008100000001ULL : 0x8000000000008003ULL,
104 BIT_INTERLEAVE ? 0x8000008100000000ULL : 0x8000000000008002ULL,
105 BIT_INTERLEAVE ? 0x8000000800000000ULL : 0x8000000000000080ULL,
106 BIT_INTERLEAVE ? 0x0000008300000000ULL : 0x000000000000800aULL,
107 BIT_INTERLEAVE ? 0x8000800300000000ULL : 0x800000008000000aULL,
108 BIT_INTERLEAVE ? 0x8000808800000001ULL : 0x8000000080008081ULL,
109 BIT_INTERLEAVE ? 0x8000008800000000ULL : 0x8000000000008080ULL,
110 BIT_INTERLEAVE ? 0x0000800000000001ULL : 0x0000000080000001ULL,
111 BIT_INTERLEAVE ? 0x8000808200000000ULL : 0x8000000080008008ULL
112 };
113
114 #if defined(KECCAK_REF)
115 /*
116 * This is straightforward or "maximum clarity" implementation aiming
117 * to resemble section 3.2 of the FIPS PUB 202 "SHA-3 Standard:
118 * Permutation-Based Hash and Extendible-Output Functions" as much as
119 * possible. With one caveat. Because of the way C stores matrices,
120 * references to A[x,y] in the specification are presented as A[y][x].
121 * Implementation unrolls inner x-loops so that modulo 5 operations are
122 * explicitly pre-computed.
123 */
Theta(uint64_t A[5][5])124 static void Theta(uint64_t A[5][5])
125 {
126 uint64_t C[5], D[5];
127 size_t y;
128
129 C[0] = A[0][0];
130 C[1] = A[0][1];
131 C[2] = A[0][2];
132 C[3] = A[0][3];
133 C[4] = A[0][4];
134
135 for (y = 1; y < 5; y++) {
136 C[0] ^= A[y][0];
137 C[1] ^= A[y][1];
138 C[2] ^= A[y][2];
139 C[3] ^= A[y][3];
140 C[4] ^= A[y][4];
141 }
142
143 D[0] = ROL64(C[1], 1) ^ C[4];
144 D[1] = ROL64(C[2], 1) ^ C[0];
145 D[2] = ROL64(C[3], 1) ^ C[1];
146 D[3] = ROL64(C[4], 1) ^ C[2];
147 D[4] = ROL64(C[0], 1) ^ C[3];
148
149 for (y = 0; y < 5; y++) {
150 A[y][0] ^= D[0];
151 A[y][1] ^= D[1];
152 A[y][2] ^= D[2];
153 A[y][3] ^= D[3];
154 A[y][4] ^= D[4];
155 }
156 }
157
Rho(uint64_t A[5][5])158 static void Rho(uint64_t A[5][5])
159 {
160 size_t y;
161
162 for (y = 0; y < 5; y++) {
163 A[y][0] = ROL64(A[y][0], rhotates[y][0]);
164 A[y][1] = ROL64(A[y][1], rhotates[y][1]);
165 A[y][2] = ROL64(A[y][2], rhotates[y][2]);
166 A[y][3] = ROL64(A[y][3], rhotates[y][3]);
167 A[y][4] = ROL64(A[y][4], rhotates[y][4]);
168 }
169 }
170
Pi(uint64_t A[5][5])171 static void Pi(uint64_t A[5][5])
172 {
173 uint64_t T[5][5];
174
175 /*
176 * T = A
177 * A[y][x] = T[x][(3*y+x)%5]
178 */
179 memcpy(T, A, sizeof(T));
180
181 A[0][0] = T[0][0];
182 A[0][1] = T[1][1];
183 A[0][2] = T[2][2];
184 A[0][3] = T[3][3];
185 A[0][4] = T[4][4];
186
187 A[1][0] = T[0][3];
188 A[1][1] = T[1][4];
189 A[1][2] = T[2][0];
190 A[1][3] = T[3][1];
191 A[1][4] = T[4][2];
192
193 A[2][0] = T[0][1];
194 A[2][1] = T[1][2];
195 A[2][2] = T[2][3];
196 A[2][3] = T[3][4];
197 A[2][4] = T[4][0];
198
199 A[3][0] = T[0][4];
200 A[3][1] = T[1][0];
201 A[3][2] = T[2][1];
202 A[3][3] = T[3][2];
203 A[3][4] = T[4][3];
204
205 A[4][0] = T[0][2];
206 A[4][1] = T[1][3];
207 A[4][2] = T[2][4];
208 A[4][3] = T[3][0];
209 A[4][4] = T[4][1];
210 }
211
Chi(uint64_t A[5][5])212 static void Chi(uint64_t A[5][5])
213 {
214 uint64_t C[5];
215 size_t y;
216
217 for (y = 0; y < 5; y++) {
218 C[0] = A[y][0] ^ (~A[y][1] & A[y][2]);
219 C[1] = A[y][1] ^ (~A[y][2] & A[y][3]);
220 C[2] = A[y][2] ^ (~A[y][3] & A[y][4]);
221 C[3] = A[y][3] ^ (~A[y][4] & A[y][0]);
222 C[4] = A[y][4] ^ (~A[y][0] & A[y][1]);
223
224 A[y][0] = C[0];
225 A[y][1] = C[1];
226 A[y][2] = C[2];
227 A[y][3] = C[3];
228 A[y][4] = C[4];
229 }
230 }
231
Iota(uint64_t A[5][5],size_t i)232 static void Iota(uint64_t A[5][5], size_t i)
233 {
234 assert(i < (sizeof(iotas) / sizeof(iotas[0])));
235 A[0][0] ^= iotas[i];
236 }
237
KeccakF1600(uint64_t A[5][5])238 static void KeccakF1600(uint64_t A[5][5])
239 {
240 size_t i;
241
242 for (i = 0; i < 24; i++) {
243 Theta(A);
244 Rho(A);
245 Pi(A);
246 Chi(A);
247 Iota(A, i);
248 }
249 }
250
251 #elif defined(KECCAK_1X)
252 /*
253 * This implementation is optimization of above code featuring unroll
254 * of even y-loops, their fusion and code motion. It also minimizes
255 * temporary storage. Compiler would normally do all these things for
256 * you, purpose of manual optimization is to provide "unobscured"
257 * reference for assembly implementation [in case this approach is
258 * chosen for implementation on some platform]. In the nutshell it's
259 * equivalent of "plane-per-plane processing" approach discussed in
260 * section 2.4 of "Keccak implementation overview".
261 */
Round(uint64_t A[5][5],size_t i)262 static void Round(uint64_t A[5][5], size_t i)
263 {
264 uint64_t C[5], E[2]; /* registers */
265 uint64_t D[5], T[2][5]; /* memory */
266
267 assert(i < (sizeof(iotas) / sizeof(iotas[0])));
268
269 C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
270 C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
271 C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
272 C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
273 C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];
274
275 #if defined(__arm__)
276 D[1] = E[0] = ROL64(C[2], 1) ^ C[0];
277 D[4] = E[1] = ROL64(C[0], 1) ^ C[3];
278 D[0] = C[0] = ROL64(C[1], 1) ^ C[4];
279 D[2] = C[1] = ROL64(C[3], 1) ^ C[1];
280 D[3] = C[2] = ROL64(C[4], 1) ^ C[2];
281
282 T[0][0] = A[3][0] ^ C[0]; /* borrow T[0][0] */
283 T[0][1] = A[0][1] ^ E[0]; /* D[1] */
284 T[0][2] = A[0][2] ^ C[1]; /* D[2] */
285 T[0][3] = A[0][3] ^ C[2]; /* D[3] */
286 T[0][4] = A[0][4] ^ E[1]; /* D[4] */
287
288 C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]); /* D[3] */
289 C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]); /* D[4] */
290 C[0] = A[0][0] ^ C[0]; /* rotate by 0 */ /* D[0] */
291 C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]); /* D[2] */
292 C[1] = ROL64(A[1][1] ^ E[0], rhotates[1][1]); /* D[1] */
293 #else
294 D[0] = ROL64(C[1], 1) ^ C[4];
295 D[1] = ROL64(C[2], 1) ^ C[0];
296 D[2] = ROL64(C[3], 1) ^ C[1];
297 D[3] = ROL64(C[4], 1) ^ C[2];
298 D[4] = ROL64(C[0], 1) ^ C[3];
299
300 T[0][0] = A[3][0] ^ D[0]; /* borrow T[0][0] */
301 T[0][1] = A[0][1] ^ D[1];
302 T[0][2] = A[0][2] ^ D[2];
303 T[0][3] = A[0][3] ^ D[3];
304 T[0][4] = A[0][4] ^ D[4];
305
306 C[0] = A[0][0] ^ D[0]; /* rotate by 0 */
307 C[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]);
308 C[2] = ROL64(A[2][2] ^ D[2], rhotates[2][2]);
309 C[3] = ROL64(A[3][3] ^ D[3], rhotates[3][3]);
310 C[4] = ROL64(A[4][4] ^ D[4], rhotates[4][4]);
311 #endif
312 A[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
313 A[0][1] = C[1] ^ (~C[2] & C[3]);
314 A[0][2] = C[2] ^ (~C[3] & C[4]);
315 A[0][3] = C[3] ^ (~C[4] & C[0]);
316 A[0][4] = C[4] ^ (~C[0] & C[1]);
317
318 T[1][0] = A[1][0] ^ (C[3] = D[0]);
319 T[1][1] = A[2][1] ^ (C[4] = D[1]); /* borrow T[1][1] */
320 T[1][2] = A[1][2] ^ (E[0] = D[2]);
321 T[1][3] = A[1][3] ^ (E[1] = D[3]);
322 T[1][4] = A[2][4] ^ (C[2] = D[4]); /* borrow T[1][4] */
323
324 C[0] = ROL64(T[0][3], rhotates[0][3]);
325 C[1] = ROL64(A[1][4] ^ C[2], rhotates[1][4]); /* D[4] */
326 C[2] = ROL64(A[2][0] ^ C[3], rhotates[2][0]); /* D[0] */
327 C[3] = ROL64(A[3][1] ^ C[4], rhotates[3][1]); /* D[1] */
328 C[4] = ROL64(A[4][2] ^ E[0], rhotates[4][2]); /* D[2] */
329
330 A[1][0] = C[0] ^ (~C[1] & C[2]);
331 A[1][1] = C[1] ^ (~C[2] & C[3]);
332 A[1][2] = C[2] ^ (~C[3] & C[4]);
333 A[1][3] = C[3] ^ (~C[4] & C[0]);
334 A[1][4] = C[4] ^ (~C[0] & C[1]);
335
336 C[0] = ROL64(T[0][1], rhotates[0][1]);
337 C[1] = ROL64(T[1][2], rhotates[1][2]);
338 C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
339 C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
340 C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
341
342 A[2][0] = C[0] ^ (~C[1] & C[2]);
343 A[2][1] = C[1] ^ (~C[2] & C[3]);
344 A[2][2] = C[2] ^ (~C[3] & C[4]);
345 A[2][3] = C[3] ^ (~C[4] & C[0]);
346 A[2][4] = C[4] ^ (~C[0] & C[1]);
347
348 C[0] = ROL64(T[0][4], rhotates[0][4]);
349 C[1] = ROL64(T[1][0], rhotates[1][0]);
350 C[2] = ROL64(T[1][1], rhotates[2][1]); /* originally A[2][1] */
351 C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
352 C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
353
354 A[3][0] = C[0] ^ (~C[1] & C[2]);
355 A[3][1] = C[1] ^ (~C[2] & C[3]);
356 A[3][2] = C[2] ^ (~C[3] & C[4]);
357 A[3][3] = C[3] ^ (~C[4] & C[0]);
358 A[3][4] = C[4] ^ (~C[0] & C[1]);
359
360 C[0] = ROL64(T[0][2], rhotates[0][2]);
361 C[1] = ROL64(T[1][3], rhotates[1][3]);
362 C[2] = ROL64(T[1][4], rhotates[2][4]); /* originally A[2][4] */
363 C[3] = ROL64(T[0][0], rhotates[3][0]); /* originally A[3][0] */
364 C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
365
366 A[4][0] = C[0] ^ (~C[1] & C[2]);
367 A[4][1] = C[1] ^ (~C[2] & C[3]);
368 A[4][2] = C[2] ^ (~C[3] & C[4]);
369 A[4][3] = C[3] ^ (~C[4] & C[0]);
370 A[4][4] = C[4] ^ (~C[0] & C[1]);
371 }
372
KeccakF1600(uint64_t A[5][5])373 static void KeccakF1600(uint64_t A[5][5])
374 {
375 size_t i;
376
377 for (i = 0; i < 24; i++) {
378 Round(A, i);
379 }
380 }
381
382 #elif defined(KECCAK_1X_ALT)
383 /*
384 * This is variant of above KECCAK_1X that reduces requirement for
385 * temporary storage even further, but at cost of more updates to A[][].
386 * It's less suitable if A[][] is memory bound, but better if it's
387 * register bound.
388 */
389
Round(uint64_t A[5][5],size_t i)390 static void Round(uint64_t A[5][5], size_t i)
391 {
392 uint64_t C[5], D[5];
393
394 assert(i < (sizeof(iotas) / sizeof(iotas[0])));
395
396 C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
397 C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
398 C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
399 C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
400 C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];
401
402 D[1] = C[0] ^ ROL64(C[2], 1);
403 D[2] = C[1] ^ ROL64(C[3], 1);
404 D[3] = C[2] ^= ROL64(C[4], 1);
405 D[4] = C[3] ^= ROL64(C[0], 1);
406 D[0] = C[4] ^= ROL64(C[1], 1);
407
408 A[0][1] ^= D[1];
409 A[1][1] ^= D[1];
410 A[2][1] ^= D[1];
411 A[3][1] ^= D[1];
412 A[4][1] ^= D[1];
413
414 A[0][2] ^= D[2];
415 A[1][2] ^= D[2];
416 A[2][2] ^= D[2];
417 A[3][2] ^= D[2];
418 A[4][2] ^= D[2];
419
420 A[0][3] ^= C[2];
421 A[1][3] ^= C[2];
422 A[2][3] ^= C[2];
423 A[3][3] ^= C[2];
424 A[4][3] ^= C[2];
425
426 A[0][4] ^= C[3];
427 A[1][4] ^= C[3];
428 A[2][4] ^= C[3];
429 A[3][4] ^= C[3];
430 A[4][4] ^= C[3];
431
432 A[0][0] ^= C[4];
433 A[1][0] ^= C[4];
434 A[2][0] ^= C[4];
435 A[3][0] ^= C[4];
436 A[4][0] ^= C[4];
437
438 C[1] = A[0][1];
439 C[2] = A[0][2];
440 C[3] = A[0][3];
441 C[4] = A[0][4];
442
443 A[0][1] = ROL64(A[1][1], rhotates[1][1]);
444 A[0][2] = ROL64(A[2][2], rhotates[2][2]);
445 A[0][3] = ROL64(A[3][3], rhotates[3][3]);
446 A[0][4] = ROL64(A[4][4], rhotates[4][4]);
447
448 A[1][1] = ROL64(A[1][4], rhotates[1][4]);
449 A[2][2] = ROL64(A[2][3], rhotates[2][3]);
450 A[3][3] = ROL64(A[3][2], rhotates[3][2]);
451 A[4][4] = ROL64(A[4][1], rhotates[4][1]);
452
453 A[1][4] = ROL64(A[4][2], rhotates[4][2]);
454 A[2][3] = ROL64(A[3][4], rhotates[3][4]);
455 A[3][2] = ROL64(A[2][1], rhotates[2][1]);
456 A[4][1] = ROL64(A[1][3], rhotates[1][3]);
457
458 A[4][2] = ROL64(A[2][4], rhotates[2][4]);
459 A[3][4] = ROL64(A[4][3], rhotates[4][3]);
460 A[2][1] = ROL64(A[1][2], rhotates[1][2]);
461 A[1][3] = ROL64(A[3][1], rhotates[3][1]);
462
463 A[2][4] = ROL64(A[4][0], rhotates[4][0]);
464 A[4][3] = ROL64(A[3][0], rhotates[3][0]);
465 A[1][2] = ROL64(A[2][0], rhotates[2][0]);
466 A[3][1] = ROL64(A[1][0], rhotates[1][0]);
467
468 A[1][0] = ROL64(C[3], rhotates[0][3]);
469 A[2][0] = ROL64(C[1], rhotates[0][1]);
470 A[3][0] = ROL64(C[4], rhotates[0][4]);
471 A[4][0] = ROL64(C[2], rhotates[0][2]);
472
473 C[0] = A[0][0];
474 C[1] = A[1][0];
475 D[0] = A[0][1];
476 D[1] = A[1][1];
477
478 A[0][0] ^= (~A[0][1] & A[0][2]);
479 A[1][0] ^= (~A[1][1] & A[1][2]);
480 A[0][1] ^= (~A[0][2] & A[0][3]);
481 A[1][1] ^= (~A[1][2] & A[1][3]);
482 A[0][2] ^= (~A[0][3] & A[0][4]);
483 A[1][2] ^= (~A[1][3] & A[1][4]);
484 A[0][3] ^= (~A[0][4] & C[0]);
485 A[1][3] ^= (~A[1][4] & C[1]);
486 A[0][4] ^= (~C[0] & D[0]);
487 A[1][4] ^= (~C[1] & D[1]);
488
489 C[2] = A[2][0];
490 C[3] = A[3][0];
491 D[2] = A[2][1];
492 D[3] = A[3][1];
493
494 A[2][0] ^= (~A[2][1] & A[2][2]);
495 A[3][0] ^= (~A[3][1] & A[3][2]);
496 A[2][1] ^= (~A[2][2] & A[2][3]);
497 A[3][1] ^= (~A[3][2] & A[3][3]);
498 A[2][2] ^= (~A[2][3] & A[2][4]);
499 A[3][2] ^= (~A[3][3] & A[3][4]);
500 A[2][3] ^= (~A[2][4] & C[2]);
501 A[3][3] ^= (~A[3][4] & C[3]);
502 A[2][4] ^= (~C[2] & D[2]);
503 A[3][4] ^= (~C[3] & D[3]);
504
505 C[4] = A[4][0];
506 D[4] = A[4][1];
507
508 A[4][0] ^= (~A[4][1] & A[4][2]);
509 A[4][1] ^= (~A[4][2] & A[4][3]);
510 A[4][2] ^= (~A[4][3] & A[4][4]);
511 A[4][3] ^= (~A[4][4] & C[4]);
512 A[4][4] ^= (~C[4] & D[4]);
513 A[0][0] ^= iotas[i];
514 }
515
KeccakF1600(uint64_t A[5][5])516 static void KeccakF1600(uint64_t A[5][5])
517 {
518 size_t i;
519
520 for (i = 0; i < 24; i++) {
521 Round(A, i);
522 }
523 }
524
525 #elif defined(KECCAK_2X)
526 /*
527 * This implementation is variant of KECCAK_1X above with outer-most
528 * round loop unrolled twice. This allows to take temporary storage
529 * out of round procedure and simplify references to it by alternating
530 * it with actual data (see round loop below). Originally it was meant
531 * rather as reference for an assembly implementation, but it seems to
532 * play best with compilers [as well as provide best instruction per
533 * processed byte ratio at minimal round unroll factor]...
534 */
Round(uint64_t R[5][5],uint64_t A[5][5],size_t i)535 static void Round(uint64_t R[5][5], uint64_t A[5][5], size_t i)
536 {
537 uint64_t C[5], D[5];
538
539 assert(i < (sizeof(iotas) / sizeof(iotas[0])));
540
541 C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
542 C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
543 C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
544 C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
545 C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];
546
547 D[0] = ROL64(C[1], 1) ^ C[4];
548 D[1] = ROL64(C[2], 1) ^ C[0];
549 D[2] = ROL64(C[3], 1) ^ C[1];
550 D[3] = ROL64(C[4], 1) ^ C[2];
551 D[4] = ROL64(C[0], 1) ^ C[3];
552
553 C[0] = A[0][0] ^ D[0]; /* rotate by 0 */
554 C[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]);
555 C[2] = ROL64(A[2][2] ^ D[2], rhotates[2][2]);
556 C[3] = ROL64(A[3][3] ^ D[3], rhotates[3][3]);
557 C[4] = ROL64(A[4][4] ^ D[4], rhotates[4][4]);
558
559 #ifdef KECCAK_COMPLEMENTING_TRANSFORM
560 R[0][0] = C[0] ^ ( C[1] | C[2]) ^ iotas[i];
561 R[0][1] = C[1] ^ (~C[2] | C[3]);
562 R[0][2] = C[2] ^ ( C[3] & C[4]);
563 R[0][3] = C[3] ^ ( C[4] | C[0]);
564 R[0][4] = C[4] ^ ( C[0] & C[1]);
565 #else
566 R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
567 R[0][1] = C[1] ^ (~C[2] & C[3]);
568 R[0][2] = C[2] ^ (~C[3] & C[4]);
569 R[0][3] = C[3] ^ (~C[4] & C[0]);
570 R[0][4] = C[4] ^ (~C[0] & C[1]);
571 #endif
572
573 C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
574 C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
575 C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
576 C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
577 C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
578
579 #ifdef KECCAK_COMPLEMENTING_TRANSFORM
580 R[1][0] = C[0] ^ (C[1] | C[2]);
581 R[1][1] = C[1] ^ (C[2] & C[3]);
582 R[1][2] = C[2] ^ (C[3] | ~C[4]);
583 R[1][3] = C[3] ^ (C[4] | C[0]);
584 R[1][4] = C[4] ^ (C[0] & C[1]);
585 #else
586 R[1][0] = C[0] ^ (~C[1] & C[2]);
587 R[1][1] = C[1] ^ (~C[2] & C[3]);
588 R[1][2] = C[2] ^ (~C[3] & C[4]);
589 R[1][3] = C[3] ^ (~C[4] & C[0]);
590 R[1][4] = C[4] ^ (~C[0] & C[1]);
591 #endif
592
593 C[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);
594 C[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
595 C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
596 C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
597 C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
598
599 #ifdef KECCAK_COMPLEMENTING_TRANSFORM
600 R[2][0] = C[0] ^ ( C[1] | C[2]);
601 R[2][1] = C[1] ^ ( C[2] & C[3]);
602 R[2][2] = C[2] ^ (~C[3] & C[4]);
603 R[2][3] = ~C[3] ^ ( C[4] | C[0]);
604 R[2][4] = C[4] ^ ( C[0] & C[1]);
605 #else
606 R[2][0] = C[0] ^ (~C[1] & C[2]);
607 R[2][1] = C[1] ^ (~C[2] & C[3]);
608 R[2][2] = C[2] ^ (~C[3] & C[4]);
609 R[2][3] = C[3] ^ (~C[4] & C[0]);
610 R[2][4] = C[4] ^ (~C[0] & C[1]);
611 #endif
612
613 C[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
614 C[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
615 C[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
616 C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
617 C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
618
619 #ifdef KECCAK_COMPLEMENTING_TRANSFORM
620 R[3][0] = C[0] ^ ( C[1] & C[2]);
621 R[3][1] = C[1] ^ ( C[2] | C[3]);
622 R[3][2] = C[2] ^ (~C[3] | C[4]);
623 R[3][3] = ~C[3] ^ ( C[4] & C[0]);
624 R[3][4] = C[4] ^ ( C[0] | C[1]);
625 #else
626 R[3][0] = C[0] ^ (~C[1] & C[2]);
627 R[3][1] = C[1] ^ (~C[2] & C[3]);
628 R[3][2] = C[2] ^ (~C[3] & C[4]);
629 R[3][3] = C[3] ^ (~C[4] & C[0]);
630 R[3][4] = C[4] ^ (~C[0] & C[1]);
631 #endif
632
633 C[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
634 C[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
635 C[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
636 C[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
637 C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
638
639 #ifdef KECCAK_COMPLEMENTING_TRANSFORM
640 R[4][0] = C[0] ^ (~C[1] & C[2]);
641 R[4][1] = ~C[1] ^ ( C[2] | C[3]);
642 R[4][2] = C[2] ^ ( C[3] & C[4]);
643 R[4][3] = C[3] ^ ( C[4] | C[0]);
644 R[4][4] = C[4] ^ ( C[0] & C[1]);
645 #else
646 R[4][0] = C[0] ^ (~C[1] & C[2]);
647 R[4][1] = C[1] ^ (~C[2] & C[3]);
648 R[4][2] = C[2] ^ (~C[3] & C[4]);
649 R[4][3] = C[3] ^ (~C[4] & C[0]);
650 R[4][4] = C[4] ^ (~C[0] & C[1]);
651 #endif
652 }
653
KeccakF1600(uint64_t A[5][5])654 static void KeccakF1600(uint64_t A[5][5])
655 {
656 uint64_t T[5][5];
657 size_t i;
658
659 #ifdef KECCAK_COMPLEMENTING_TRANSFORM
660 A[0][1] = ~A[0][1];
661 A[0][2] = ~A[0][2];
662 A[1][3] = ~A[1][3];
663 A[2][2] = ~A[2][2];
664 A[3][2] = ~A[3][2];
665 A[4][0] = ~A[4][0];
666 #endif
667
668 for (i = 0; i < 24; i += 2) {
669 Round(T, A, i);
670 Round(A, T, i + 1);
671 }
672
673 #ifdef KECCAK_COMPLEMENTING_TRANSFORM
674 A[0][1] = ~A[0][1];
675 A[0][2] = ~A[0][2];
676 A[1][3] = ~A[1][3];
677 A[2][2] = ~A[2][2];
678 A[3][2] = ~A[3][2];
679 A[4][0] = ~A[4][0];
680 #endif
681 }
682
683 #else /* define KECCAK_INPLACE to compile this code path */
684 /*
685 * This implementation is KECCAK_1X from above combined 4 times with
686 * a twist that allows to omit temporary storage and perform in-place
687 * processing. It's discussed in section 2.5 of "Keccak implementation
688 * overview". It's likely to be best suited for processors with large
689 * register bank... On the other hand processor with large register
690 * bank can as well use KECCAK_1X_ALT, it would be as fast but much
691 * more compact...
692 */
FourRounds(uint64_t A[5][5],size_t i)693 static void FourRounds(uint64_t A[5][5], size_t i)
694 {
695 uint64_t B[5], C[5], D[5];
696
697 assert(i <= (sizeof(iotas) / sizeof(iotas[0]) - 4));
698
699 /* Round 4*n */
700 C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
701 C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
702 C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
703 C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
704 C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];
705
706 D[0] = ROL64(C[1], 1) ^ C[4];
707 D[1] = ROL64(C[2], 1) ^ C[0];
708 D[2] = ROL64(C[3], 1) ^ C[1];
709 D[3] = ROL64(C[4], 1) ^ C[2];
710 D[4] = ROL64(C[0], 1) ^ C[3];
711
712 B[0] = A[0][0] ^ D[0]; /* rotate by 0 */
713 B[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]);
714 B[2] = ROL64(A[2][2] ^ D[2], rhotates[2][2]);
715 B[3] = ROL64(A[3][3] ^ D[3], rhotates[3][3]);
716 B[4] = ROL64(A[4][4] ^ D[4], rhotates[4][4]);
717
718 C[0] = A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i];
719 C[1] = A[1][1] = B[1] ^ (~B[2] & B[3]);
720 C[2] = A[2][2] = B[2] ^ (~B[3] & B[4]);
721 C[3] = A[3][3] = B[3] ^ (~B[4] & B[0]);
722 C[4] = A[4][4] = B[4] ^ (~B[0] & B[1]);
723
724 B[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
725 B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
726 B[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
727 B[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
728 B[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
729
730 C[0] ^= A[2][0] = B[0] ^ (~B[1] & B[2]);
731 C[1] ^= A[3][1] = B[1] ^ (~B[2] & B[3]);
732 C[2] ^= A[4][2] = B[2] ^ (~B[3] & B[4]);
733 C[3] ^= A[0][3] = B[3] ^ (~B[4] & B[0]);
734 C[4] ^= A[1][4] = B[4] ^ (~B[0] & B[1]);
735
736 B[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);
737 B[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
738 B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
739 B[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
740 B[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
741
742 C[0] ^= A[4][0] = B[0] ^ (~B[1] & B[2]);
743 C[1] ^= A[0][1] = B[1] ^ (~B[2] & B[3]);
744 C[2] ^= A[1][2] = B[2] ^ (~B[3] & B[4]);
745 C[3] ^= A[2][3] = B[3] ^ (~B[4] & B[0]);
746 C[4] ^= A[3][4] = B[4] ^ (~B[0] & B[1]);
747
748 B[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
749 B[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
750 B[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
751 B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
752 B[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
753
754 C[0] ^= A[1][0] = B[0] ^ (~B[1] & B[2]);
755 C[1] ^= A[2][1] = B[1] ^ (~B[2] & B[3]);
756 C[2] ^= A[3][2] = B[2] ^ (~B[3] & B[4]);
757 C[3] ^= A[4][3] = B[3] ^ (~B[4] & B[0]);
758 C[4] ^= A[0][4] = B[4] ^ (~B[0] & B[1]);
759
760 B[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
761 B[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
762 B[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
763 B[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
764 B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
765
766 C[0] ^= A[3][0] = B[0] ^ (~B[1] & B[2]);
767 C[1] ^= A[4][1] = B[1] ^ (~B[2] & B[3]);
768 C[2] ^= A[0][2] = B[2] ^ (~B[3] & B[4]);
769 C[3] ^= A[1][3] = B[3] ^ (~B[4] & B[0]);
770 C[4] ^= A[2][4] = B[4] ^ (~B[0] & B[1]);
771
772 /* Round 4*n+1 */
773 D[0] = ROL64(C[1], 1) ^ C[4];
774 D[1] = ROL64(C[2], 1) ^ C[0];
775 D[2] = ROL64(C[3], 1) ^ C[1];
776 D[3] = ROL64(C[4], 1) ^ C[2];
777 D[4] = ROL64(C[0], 1) ^ C[3];
778
779 B[0] = A[0][0] ^ D[0]; /* rotate by 0 */
780 B[1] = ROL64(A[3][1] ^ D[1], rhotates[1][1]);
781 B[2] = ROL64(A[1][2] ^ D[2], rhotates[2][2]);
782 B[3] = ROL64(A[4][3] ^ D[3], rhotates[3][3]);
783 B[4] = ROL64(A[2][4] ^ D[4], rhotates[4][4]);
784
785 C[0] = A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i + 1];
786 C[1] = A[3][1] = B[1] ^ (~B[2] & B[3]);
787 C[2] = A[1][2] = B[2] ^ (~B[3] & B[4]);
788 C[3] = A[4][3] = B[3] ^ (~B[4] & B[0]);
789 C[4] = A[2][4] = B[4] ^ (~B[0] & B[1]);
790
791 B[0] = ROL64(A[3][3] ^ D[3], rhotates[0][3]);
792 B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
793 B[2] = ROL64(A[4][0] ^ D[0], rhotates[2][0]);
794 B[3] = ROL64(A[2][1] ^ D[1], rhotates[3][1]);
795 B[4] = ROL64(A[0][2] ^ D[2], rhotates[4][2]);
796
797 C[0] ^= A[4][0] = B[0] ^ (~B[1] & B[2]);
798 C[1] ^= A[2][1] = B[1] ^ (~B[2] & B[3]);
799 C[2] ^= A[0][2] = B[2] ^ (~B[3] & B[4]);
800 C[3] ^= A[3][3] = B[3] ^ (~B[4] & B[0]);
801 C[4] ^= A[1][4] = B[4] ^ (~B[0] & B[1]);
802
803 B[0] = ROL64(A[1][1] ^ D[1], rhotates[0][1]);
804 B[1] = ROL64(A[4][2] ^ D[2], rhotates[1][2]);
805 B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
806 B[3] = ROL64(A[0][4] ^ D[4], rhotates[3][4]);
807 B[4] = ROL64(A[3][0] ^ D[0], rhotates[4][0]);
808
809 C[0] ^= A[3][0] = B[0] ^ (~B[1] & B[2]);
810 C[1] ^= A[1][1] = B[1] ^ (~B[2] & B[3]);
811 C[2] ^= A[4][2] = B[2] ^ (~B[3] & B[4]);
812 C[3] ^= A[2][3] = B[3] ^ (~B[4] & B[0]);
813 C[4] ^= A[0][4] = B[4] ^ (~B[0] & B[1]);
814
815 B[0] = ROL64(A[4][4] ^ D[4], rhotates[0][4]);
816 B[1] = ROL64(A[2][0] ^ D[0], rhotates[1][0]);
817 B[2] = ROL64(A[0][1] ^ D[1], rhotates[2][1]);
818 B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
819 B[4] = ROL64(A[1][3] ^ D[3], rhotates[4][3]);
820
821 C[0] ^= A[2][0] = B[0] ^ (~B[1] & B[2]);
822 C[1] ^= A[0][1] = B[1] ^ (~B[2] & B[3]);
823 C[2] ^= A[3][2] = B[2] ^ (~B[3] & B[4]);
824 C[3] ^= A[1][3] = B[3] ^ (~B[4] & B[0]);
825 C[4] ^= A[4][4] = B[4] ^ (~B[0] & B[1]);
826
827 B[0] = ROL64(A[2][2] ^ D[2], rhotates[0][2]);
828 B[1] = ROL64(A[0][3] ^ D[3], rhotates[1][3]);
829 B[2] = ROL64(A[3][4] ^ D[4], rhotates[2][4]);
830 B[3] = ROL64(A[1][0] ^ D[0], rhotates[3][0]);
831 B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
832
833 C[0] ^= A[1][0] = B[0] ^ (~B[1] & B[2]);
834 C[1] ^= A[4][1] = B[1] ^ (~B[2] & B[3]);
835 C[2] ^= A[2][2] = B[2] ^ (~B[3] & B[4]);
836 C[3] ^= A[0][3] = B[3] ^ (~B[4] & B[0]);
837 C[4] ^= A[3][4] = B[4] ^ (~B[0] & B[1]);
838
839 /* Round 4*n+2 */
840 D[0] = ROL64(C[1], 1) ^ C[4];
841 D[1] = ROL64(C[2], 1) ^ C[0];
842 D[2] = ROL64(C[3], 1) ^ C[1];
843 D[3] = ROL64(C[4], 1) ^ C[2];
844 D[4] = ROL64(C[0], 1) ^ C[3];
845
846 B[0] = A[0][0] ^ D[0]; /* rotate by 0 */
847 B[1] = ROL64(A[2][1] ^ D[1], rhotates[1][1]);
848 B[2] = ROL64(A[4][2] ^ D[2], rhotates[2][2]);
849 B[3] = ROL64(A[1][3] ^ D[3], rhotates[3][3]);
850 B[4] = ROL64(A[3][4] ^ D[4], rhotates[4][4]);
851
852 C[0] = A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i + 2];
853 C[1] = A[2][1] = B[1] ^ (~B[2] & B[3]);
854 C[2] = A[4][2] = B[2] ^ (~B[3] & B[4]);
855 C[3] = A[1][3] = B[3] ^ (~B[4] & B[0]);
856 C[4] = A[3][4] = B[4] ^ (~B[0] & B[1]);
857
858 B[0] = ROL64(A[4][3] ^ D[3], rhotates[0][3]);
859 B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
860 B[2] = ROL64(A[3][0] ^ D[0], rhotates[2][0]);
861 B[3] = ROL64(A[0][1] ^ D[1], rhotates[3][1]);
862 B[4] = ROL64(A[2][2] ^ D[2], rhotates[4][2]);
863
864 C[0] ^= A[3][0] = B[0] ^ (~B[1] & B[2]);
865 C[1] ^= A[0][1] = B[1] ^ (~B[2] & B[3]);
866 C[2] ^= A[2][2] = B[2] ^ (~B[3] & B[4]);
867 C[3] ^= A[4][3] = B[3] ^ (~B[4] & B[0]);
868 C[4] ^= A[1][4] = B[4] ^ (~B[0] & B[1]);
869
870 B[0] = ROL64(A[3][1] ^ D[1], rhotates[0][1]);
871 B[1] = ROL64(A[0][2] ^ D[2], rhotates[1][2]);
872 B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
873 B[3] = ROL64(A[4][4] ^ D[4], rhotates[3][4]);
874 B[4] = ROL64(A[1][0] ^ D[0], rhotates[4][0]);
875
876 C[0] ^= A[1][0] = B[0] ^ (~B[1] & B[2]);
877 C[1] ^= A[3][1] = B[1] ^ (~B[2] & B[3]);
878 C[2] ^= A[0][2] = B[2] ^ (~B[3] & B[4]);
879 C[3] ^= A[2][3] = B[3] ^ (~B[4] & B[0]);
880 C[4] ^= A[4][4] = B[4] ^ (~B[0] & B[1]);
881
882 B[0] = ROL64(A[2][4] ^ D[4], rhotates[0][4]);
883 B[1] = ROL64(A[4][0] ^ D[0], rhotates[1][0]);
884 B[2] = ROL64(A[1][1] ^ D[1], rhotates[2][1]);
885 B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
886 B[4] = ROL64(A[0][3] ^ D[3], rhotates[4][3]);
887
888 C[0] ^= A[4][0] = B[0] ^ (~B[1] & B[2]);
889 C[1] ^= A[1][1] = B[1] ^ (~B[2] & B[3]);
890 C[2] ^= A[3][2] = B[2] ^ (~B[3] & B[4]);
891 C[3] ^= A[0][3] = B[3] ^ (~B[4] & B[0]);
892 C[4] ^= A[2][4] = B[4] ^ (~B[0] & B[1]);
893
894 B[0] = ROL64(A[1][2] ^ D[2], rhotates[0][2]);
895 B[1] = ROL64(A[3][3] ^ D[3], rhotates[1][3]);
896 B[2] = ROL64(A[0][4] ^ D[4], rhotates[2][4]);
897 B[3] = ROL64(A[2][0] ^ D[0], rhotates[3][0]);
898 B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
899
900 C[0] ^= A[2][0] = B[0] ^ (~B[1] & B[2]);
901 C[1] ^= A[4][1] = B[1] ^ (~B[2] & B[3]);
902 C[2] ^= A[1][2] = B[2] ^ (~B[3] & B[4]);
903 C[3] ^= A[3][3] = B[3] ^ (~B[4] & B[0]);
904 C[4] ^= A[0][4] = B[4] ^ (~B[0] & B[1]);
905
906 /* Round 4*n+3 */
907 D[0] = ROL64(C[1], 1) ^ C[4];
908 D[1] = ROL64(C[2], 1) ^ C[0];
909 D[2] = ROL64(C[3], 1) ^ C[1];
910 D[3] = ROL64(C[4], 1) ^ C[2];
911 D[4] = ROL64(C[0], 1) ^ C[3];
912
913 B[0] = A[0][0] ^ D[0]; /* rotate by 0 */
914 B[1] = ROL64(A[0][1] ^ D[1], rhotates[1][1]);
915 B[2] = ROL64(A[0][2] ^ D[2], rhotates[2][2]);
916 B[3] = ROL64(A[0][3] ^ D[3], rhotates[3][3]);
917 B[4] = ROL64(A[0][4] ^ D[4], rhotates[4][4]);
918
919 /* C[0] = */ A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i + 3];
920 /* C[1] = */ A[0][1] = B[1] ^ (~B[2] & B[3]);
921 /* C[2] = */ A[0][2] = B[2] ^ (~B[3] & B[4]);
922 /* C[3] = */ A[0][3] = B[3] ^ (~B[4] & B[0]);
923 /* C[4] = */ A[0][4] = B[4] ^ (~B[0] & B[1]);
924
925 B[0] = ROL64(A[1][3] ^ D[3], rhotates[0][3]);
926 B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
927 B[2] = ROL64(A[1][0] ^ D[0], rhotates[2][0]);
928 B[3] = ROL64(A[1][1] ^ D[1], rhotates[3][1]);
929 B[4] = ROL64(A[1][2] ^ D[2], rhotates[4][2]);
930
931 /* C[0] ^= */ A[1][0] = B[0] ^ (~B[1] & B[2]);
932 /* C[1] ^= */ A[1][1] = B[1] ^ (~B[2] & B[3]);
933 /* C[2] ^= */ A[1][2] = B[2] ^ (~B[3] & B[4]);
934 /* C[3] ^= */ A[1][3] = B[3] ^ (~B[4] & B[0]);
935 /* C[4] ^= */ A[1][4] = B[4] ^ (~B[0] & B[1]);
936
937 B[0] = ROL64(A[2][1] ^ D[1], rhotates[0][1]);
938 B[1] = ROL64(A[2][2] ^ D[2], rhotates[1][2]);
939 B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
940 B[3] = ROL64(A[2][4] ^ D[4], rhotates[3][4]);
941 B[4] = ROL64(A[2][0] ^ D[0], rhotates[4][0]);
942
943 /* C[0] ^= */ A[2][0] = B[0] ^ (~B[1] & B[2]);
944 /* C[1] ^= */ A[2][1] = B[1] ^ (~B[2] & B[3]);
945 /* C[2] ^= */ A[2][2] = B[2] ^ (~B[3] & B[4]);
946 /* C[3] ^= */ A[2][3] = B[3] ^ (~B[4] & B[0]);
947 /* C[4] ^= */ A[2][4] = B[4] ^ (~B[0] & B[1]);
948
949 B[0] = ROL64(A[3][4] ^ D[4], rhotates[0][4]);
950 B[1] = ROL64(A[3][0] ^ D[0], rhotates[1][0]);
951 B[2] = ROL64(A[3][1] ^ D[1], rhotates[2][1]);
952 B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
953 B[4] = ROL64(A[3][3] ^ D[3], rhotates[4][3]);
954
955 /* C[0] ^= */ A[3][0] = B[0] ^ (~B[1] & B[2]);
956 /* C[1] ^= */ A[3][1] = B[1] ^ (~B[2] & B[3]);
957 /* C[2] ^= */ A[3][2] = B[2] ^ (~B[3] & B[4]);
958 /* C[3] ^= */ A[3][3] = B[3] ^ (~B[4] & B[0]);
959 /* C[4] ^= */ A[3][4] = B[4] ^ (~B[0] & B[1]);
960
961 B[0] = ROL64(A[4][2] ^ D[2], rhotates[0][2]);
962 B[1] = ROL64(A[4][3] ^ D[3], rhotates[1][3]);
963 B[2] = ROL64(A[4][4] ^ D[4], rhotates[2][4]);
964 B[3] = ROL64(A[4][0] ^ D[0], rhotates[3][0]);
965 B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
966
967 /* C[0] ^= */ A[4][0] = B[0] ^ (~B[1] & B[2]);
968 /* C[1] ^= */ A[4][1] = B[1] ^ (~B[2] & B[3]);
969 /* C[2] ^= */ A[4][2] = B[2] ^ (~B[3] & B[4]);
970 /* C[3] ^= */ A[4][3] = B[3] ^ (~B[4] & B[0]);
971 /* C[4] ^= */ A[4][4] = B[4] ^ (~B[0] & B[1]);
972 }
973
KeccakF1600(uint64_t A[5][5])974 static void KeccakF1600(uint64_t A[5][5])
975 {
976 size_t i;
977
978 for (i = 0; i < 24; i += 4) {
979 FourRounds(A, i);
980 }
981 }
982
983 #endif
984
BitInterleave(uint64_t Ai)985 static uint64_t BitInterleave(uint64_t Ai)
986 {
987 if (BIT_INTERLEAVE) {
988 uint32_t hi = (uint32_t)(Ai >> 32), lo = (uint32_t)Ai;
989 uint32_t t0, t1;
990
991 t0 = lo & 0x55555555;
992 t0 |= t0 >> 1; t0 &= 0x33333333;
993 t0 |= t0 >> 2; t0 &= 0x0f0f0f0f;
994 t0 |= t0 >> 4; t0 &= 0x00ff00ff;
995 t0 |= t0 >> 8; t0 &= 0x0000ffff;
996
997 t1 = hi & 0x55555555;
998 t1 |= t1 >> 1; t1 &= 0x33333333;
999 t1 |= t1 >> 2; t1 &= 0x0f0f0f0f;
1000 t1 |= t1 >> 4; t1 &= 0x00ff00ff;
1001 t1 |= t1 >> 8; t1 <<= 16;
1002
1003 lo &= 0xaaaaaaaa;
1004 lo |= lo << 1; lo &= 0xcccccccc;
1005 lo |= lo << 2; lo &= 0xf0f0f0f0;
1006 lo |= lo << 4; lo &= 0xff00ff00;
1007 lo |= lo << 8; lo >>= 16;
1008
1009 hi &= 0xaaaaaaaa;
1010 hi |= hi << 1; hi &= 0xcccccccc;
1011 hi |= hi << 2; hi &= 0xf0f0f0f0;
1012 hi |= hi << 4; hi &= 0xff00ff00;
1013 hi |= hi << 8; hi &= 0xffff0000;
1014
1015 Ai = ((uint64_t)(hi | lo) << 32) | (t1 | t0);
1016 }
1017
1018 return Ai;
1019 }
1020
BitDeinterleave(uint64_t Ai)1021 static uint64_t BitDeinterleave(uint64_t Ai)
1022 {
1023 if (BIT_INTERLEAVE) {
1024 uint32_t hi = (uint32_t)(Ai >> 32), lo = (uint32_t)Ai;
1025 uint32_t t0, t1;
1026
1027 t0 = lo & 0x0000ffff;
1028 t0 |= t0 << 8; t0 &= 0x00ff00ff;
1029 t0 |= t0 << 4; t0 &= 0x0f0f0f0f;
1030 t0 |= t0 << 2; t0 &= 0x33333333;
1031 t0 |= t0 << 1; t0 &= 0x55555555;
1032
1033 t1 = hi << 16;
1034 t1 |= t1 >> 8; t1 &= 0xff00ff00;
1035 t1 |= t1 >> 4; t1 &= 0xf0f0f0f0;
1036 t1 |= t1 >> 2; t1 &= 0xcccccccc;
1037 t1 |= t1 >> 1; t1 &= 0xaaaaaaaa;
1038
1039 lo >>= 16;
1040 lo |= lo << 8; lo &= 0x00ff00ff;
1041 lo |= lo << 4; lo &= 0x0f0f0f0f;
1042 lo |= lo << 2; lo &= 0x33333333;
1043 lo |= lo << 1; lo &= 0x55555555;
1044
1045 hi &= 0xffff0000;
1046 hi |= hi >> 8; hi &= 0xff00ff00;
1047 hi |= hi >> 4; hi &= 0xf0f0f0f0;
1048 hi |= hi >> 2; hi &= 0xcccccccc;
1049 hi |= hi >> 1; hi &= 0xaaaaaaaa;
1050
1051 Ai = ((uint64_t)(hi | lo) << 32) | (t1 | t0);
1052 }
1053
1054 return Ai;
1055 }
1056
1057 /*
1058 * SHA3_absorb can be called multiple times, but at each invocation
1059 * largest multiple of |r| out of |len| bytes are processed. Then
1060 * remaining amount of bytes is returned. This is done to spare caller
1061 * trouble of calculating the largest multiple of |r|. |r| can be viewed
1062 * as blocksize. It is commonly (1600 - 256*n)/8, e.g. 168, 136, 104,
1063 * 72, but can also be (1600 - 448)/8 = 144. All this means that message
1064 * padding and intermediate sub-block buffering, byte- or bitwise, is
1065 * caller's responsibility.
1066 */
SHA3_absorb(uint64_t A[5][5],const unsigned char * inp,size_t len,size_t r)1067 size_t SHA3_absorb(uint64_t A[5][5], const unsigned char *inp, size_t len,
1068 size_t r)
1069 {
1070 uint64_t *A_flat = (uint64_t *)A;
1071 size_t i, w = r / 8;
1072
1073 assert(r < (25 * sizeof(A[0][0])) && (r % 8) == 0);
1074
1075 while (len >= r) {
1076 for (i = 0; i < w; i++) {
1077 uint64_t Ai = (uint64_t)inp[0] | (uint64_t)inp[1] << 8 |
1078 (uint64_t)inp[2] << 16 | (uint64_t)inp[3] << 24 |
1079 (uint64_t)inp[4] << 32 | (uint64_t)inp[5] << 40 |
1080 (uint64_t)inp[6] << 48 | (uint64_t)inp[7] << 56;
1081 inp += 8;
1082
1083 A_flat[i] ^= BitInterleave(Ai);
1084 }
1085 KeccakF1600(A);
1086 len -= r;
1087 }
1088
1089 return len;
1090 }
1091
1092 /*
1093 * sha3_squeeze is called once at the end to generate |out| hash value
1094 * of |len| bytes.
1095 */
SHA3_squeeze(uint64_t A[5][5],unsigned char * out,size_t len,size_t r)1096 void SHA3_squeeze(uint64_t A[5][5], unsigned char *out, size_t len, size_t r)
1097 {
1098 uint64_t *A_flat = (uint64_t *)A;
1099 size_t i, w = r / 8;
1100
1101 assert(r < (25 * sizeof(A[0][0])) && (r % 8) == 0);
1102
1103 while (len != 0) {
1104 for (i = 0; i < w && len != 0; i++) {
1105 uint64_t Ai = BitDeinterleave(A_flat[i]);
1106
1107 if (len < 8) {
1108 for (i = 0; i < len; i++) {
1109 *out++ = (unsigned char)Ai;
1110 Ai >>= 8;
1111 }
1112 return;
1113 }
1114
1115 out[0] = (unsigned char)(Ai);
1116 out[1] = (unsigned char)(Ai >> 8);
1117 out[2] = (unsigned char)(Ai >> 16);
1118 out[3] = (unsigned char)(Ai >> 24);
1119 out[4] = (unsigned char)(Ai >> 32);
1120 out[5] = (unsigned char)(Ai >> 40);
1121 out[6] = (unsigned char)(Ai >> 48);
1122 out[7] = (unsigned char)(Ai >> 56);
1123 out += 8;
1124 len -= 8;
1125 }
1126 if (len)
1127 KeccakF1600(A);
1128 }
1129 }
1130 #endif
1131
1132 #ifdef SELFTEST
1133 /*
1134 * Post-padding one-shot implementations would look as following:
1135 *
1136 * SHA3_224 SHA3_sponge(inp, len, out, 224/8, (1600-448)/8);
1137 * SHA3_256 SHA3_sponge(inp, len, out, 256/8, (1600-512)/8);
1138 * SHA3_384 SHA3_sponge(inp, len, out, 384/8, (1600-768)/8);
1139 * SHA3_512 SHA3_sponge(inp, len, out, 512/8, (1600-1024)/8);
1140 * SHAKE_128 SHA3_sponge(inp, len, out, d, (1600-256)/8);
1141 * SHAKE_256 SHA3_sponge(inp, len, out, d, (1600-512)/8);
1142 */
1143
SHA3_sponge(const unsigned char * inp,size_t len,unsigned char * out,size_t d,size_t r)1144 void SHA3_sponge(const unsigned char *inp, size_t len,
1145 unsigned char *out, size_t d, size_t r)
1146 {
1147 uint64_t A[5][5];
1148
1149 memset(A, 0, sizeof(A));
1150 SHA3_absorb(A, inp, len, r);
1151 SHA3_squeeze(A, out, d, r);
1152 }
1153
1154 # include <stdio.h>
1155
main()1156 int main()
1157 {
1158 /*
1159 * This is 5-bit SHAKE128 test from http://csrc.nist.gov/groups/ST/toolkit/examples.html#aHashing
1160 */
1161 unsigned char test[168] = { '\xf3', '\x3' };
1162 unsigned char out[512];
1163 size_t i;
1164 static const unsigned char result[512] = {
1165 0x2E, 0x0A, 0xBF, 0xBA, 0x83, 0xE6, 0x72, 0x0B,
1166 0xFB, 0xC2, 0x25, 0xFF, 0x6B, 0x7A, 0xB9, 0xFF,
1167 0xCE, 0x58, 0xBA, 0x02, 0x7E, 0xE3, 0xD8, 0x98,
1168 0x76, 0x4F, 0xEF, 0x28, 0x7D, 0xDE, 0xCC, 0xCA,
1169 0x3E, 0x6E, 0x59, 0x98, 0x41, 0x1E, 0x7D, 0xDB,
1170 0x32, 0xF6, 0x75, 0x38, 0xF5, 0x00, 0xB1, 0x8C,
1171 0x8C, 0x97, 0xC4, 0x52, 0xC3, 0x70, 0xEA, 0x2C,
1172 0xF0, 0xAF, 0xCA, 0x3E, 0x05, 0xDE, 0x7E, 0x4D,
1173 0xE2, 0x7F, 0xA4, 0x41, 0xA9, 0xCB, 0x34, 0xFD,
1174 0x17, 0xC9, 0x78, 0xB4, 0x2D, 0x5B, 0x7E, 0x7F,
1175 0x9A, 0xB1, 0x8F, 0xFE, 0xFF, 0xC3, 0xC5, 0xAC,
1176 0x2F, 0x3A, 0x45, 0x5E, 0xEB, 0xFD, 0xC7, 0x6C,
1177 0xEA, 0xEB, 0x0A, 0x2C, 0xCA, 0x22, 0xEE, 0xF6,
1178 0xE6, 0x37, 0xF4, 0xCA, 0xBE, 0x5C, 0x51, 0xDE,
1179 0xD2, 0xE3, 0xFA, 0xD8, 0xB9, 0x52, 0x70, 0xA3,
1180 0x21, 0x84, 0x56, 0x64, 0xF1, 0x07, 0xD1, 0x64,
1181 0x96, 0xBB, 0x7A, 0xBF, 0xBE, 0x75, 0x04, 0xB6,
1182 0xED, 0xE2, 0xE8, 0x9E, 0x4B, 0x99, 0x6F, 0xB5,
1183 0x8E, 0xFD, 0xC4, 0x18, 0x1F, 0x91, 0x63, 0x38,
1184 0x1C, 0xBE, 0x7B, 0xC0, 0x06, 0xA7, 0xA2, 0x05,
1185 0x98, 0x9C, 0x52, 0x6C, 0xD1, 0xBD, 0x68, 0x98,
1186 0x36, 0x93, 0xB4, 0xBD, 0xC5, 0x37, 0x28, 0xB2,
1187 0x41, 0xC1, 0xCF, 0xF4, 0x2B, 0xB6, 0x11, 0x50,
1188 0x2C, 0x35, 0x20, 0x5C, 0xAB, 0xB2, 0x88, 0x75,
1189 0x56, 0x55, 0xD6, 0x20, 0xC6, 0x79, 0x94, 0xF0,
1190 0x64, 0x51, 0x18, 0x7F, 0x6F, 0xD1, 0x7E, 0x04,
1191 0x66, 0x82, 0xBA, 0x12, 0x86, 0x06, 0x3F, 0xF8,
1192 0x8F, 0xE2, 0x50, 0x8D, 0x1F, 0xCA, 0xF9, 0x03,
1193 0x5A, 0x12, 0x31, 0xAD, 0x41, 0x50, 0xA9, 0xC9,
1194 0xB2, 0x4C, 0x9B, 0x2D, 0x66, 0xB2, 0xAD, 0x1B,
1195 0xDE, 0x0B, 0xD0, 0xBB, 0xCB, 0x8B, 0xE0, 0x5B,
1196 0x83, 0x52, 0x29, 0xEF, 0x79, 0x19, 0x73, 0x73,
1197 0x23, 0x42, 0x44, 0x01, 0xE1, 0xD8, 0x37, 0xB6,
1198 0x6E, 0xB4, 0xE6, 0x30, 0xFF, 0x1D, 0xE7, 0x0C,
1199 0xB3, 0x17, 0xC2, 0xBA, 0xCB, 0x08, 0x00, 0x1D,
1200 0x34, 0x77, 0xB7, 0xA7, 0x0A, 0x57, 0x6D, 0x20,
1201 0x86, 0x90, 0x33, 0x58, 0x9D, 0x85, 0xA0, 0x1D,
1202 0xDB, 0x2B, 0x66, 0x46, 0xC0, 0x43, 0xB5, 0x9F,
1203 0xC0, 0x11, 0x31, 0x1D, 0xA6, 0x66, 0xFA, 0x5A,
1204 0xD1, 0xD6, 0x38, 0x7F, 0xA9, 0xBC, 0x40, 0x15,
1205 0xA3, 0x8A, 0x51, 0xD1, 0xDA, 0x1E, 0xA6, 0x1D,
1206 0x64, 0x8D, 0xC8, 0xE3, 0x9A, 0x88, 0xB9, 0xD6,
1207 0x22, 0xBD, 0xE2, 0x07, 0xFD, 0xAB, 0xC6, 0xF2,
1208 0x82, 0x7A, 0x88, 0x0C, 0x33, 0x0B, 0xBF, 0x6D,
1209 0xF7, 0x33, 0x77, 0x4B, 0x65, 0x3E, 0x57, 0x30,
1210 0x5D, 0x78, 0xDC, 0xE1, 0x12, 0xF1, 0x0A, 0x2C,
1211 0x71, 0xF4, 0xCD, 0xAD, 0x92, 0xED, 0x11, 0x3E,
1212 0x1C, 0xEA, 0x63, 0xB9, 0x19, 0x25, 0xED, 0x28,
1213 0x19, 0x1E, 0x6D, 0xBB, 0xB5, 0xAA, 0x5A, 0x2A,
1214 0xFD, 0xA5, 0x1F, 0xC0, 0x5A, 0x3A, 0xF5, 0x25,
1215 0x8B, 0x87, 0x66, 0x52, 0x43, 0x55, 0x0F, 0x28,
1216 0x94, 0x8A, 0xE2, 0xB8, 0xBE, 0xB6, 0xBC, 0x9C,
1217 0x77, 0x0B, 0x35, 0xF0, 0x67, 0xEA, 0xA6, 0x41,
1218 0xEF, 0xE6, 0x5B, 0x1A, 0x44, 0x90, 0x9D, 0x1B,
1219 0x14, 0x9F, 0x97, 0xEE, 0xA6, 0x01, 0x39, 0x1C,
1220 0x60, 0x9E, 0xC8, 0x1D, 0x19, 0x30, 0xF5, 0x7C,
1221 0x18, 0xA4, 0xE0, 0xFA, 0xB4, 0x91, 0xD1, 0xCA,
1222 0xDF, 0xD5, 0x04, 0x83, 0x44, 0x9E, 0xDC, 0x0F,
1223 0x07, 0xFF, 0xB2, 0x4D, 0x2C, 0x6F, 0x9A, 0x9A,
1224 0x3B, 0xFF, 0x39, 0xAE, 0x3D, 0x57, 0xF5, 0x60,
1225 0x65, 0x4D, 0x7D, 0x75, 0xC9, 0x08, 0xAB, 0xE6,
1226 0x25, 0x64, 0x75, 0x3E, 0xAC, 0x39, 0xD7, 0x50,
1227 0x3D, 0xA6, 0xD3, 0x7C, 0x2E, 0x32, 0xE1, 0xAF,
1228 0x3B, 0x8A, 0xEC, 0x8A, 0xE3, 0x06, 0x9C, 0xD9
1229 };
1230
1231 test[167] = '\x80';
1232 SHA3_sponge(test, sizeof(test), out, sizeof(out), sizeof(test));
1233
1234 /*
1235 * Rationale behind keeping output [formatted as below] is that
1236 * one should be able to redirect it to a file, then copy-n-paste
1237 * final "output val" from official example to another file, and
1238 * compare the two with diff(1).
1239 */
1240 for (i = 0; i < sizeof(out);) {
1241 printf("%02X", out[i]);
1242 printf(++i % 16 && i != sizeof(out) ? " " : "\n");
1243 }
1244
1245 if (memcmp(out,result,sizeof(out))) {
1246 fprintf(stderr,"failure\n");
1247 return 1;
1248 } else {
1249 fprintf(stderr,"success\n");
1250 return 0;
1251 }
1252 }
1253 #endif
1254