1 /*
2 * Copyright 2016-2024 The OpenSSL Project Authors. All Rights Reserved.
3 *
4 * Licensed under the Apache License 2.0 (the "License"). You may not use
5 * this file except in compliance with the License. You can obtain a copy
6 * in the file LICENSE in the source distribution or at
7 * https://www.openssl.org/source/license.html
8 */
9
10 #include <openssl/e_os2.h>
11 #include <string.h>
12 #include <assert.h>
13
14 #include "internal/nelem.h"
15
16 size_t SHA3_absorb(uint64_t A[5][5], const unsigned char *inp, size_t len,
17 size_t r);
18 void SHA3_squeeze(uint64_t A[5][5], unsigned char *out, size_t len, size_t r, int next);
19
20 #if !defined(KECCAK1600_ASM) || !defined(SELFTEST)
21
22 /*
23 * Choose some sensible defaults
24 */
25 #if !defined(KECCAK_REF) && !defined(KECCAK_1X) && !defined(KECCAK_1X_ALT) && !defined(KECCAK_2X) && !defined(KECCAK_INPLACE)
26 #define KECCAK_2X /* default to KECCAK_2X variant */
27 #endif
28
29 #if defined(__i386) || defined(__i386__) || defined(_M_IX86) || (defined(__x86_64) && !defined(__BMI__)) || defined(_M_X64) || defined(__mips) || defined(__riscv) || defined(__s390__) || defined(__EMSCRIPTEN__)
30 /*
31 * These don't have "and with complement" instruction, so minimize amount
32 * of "not"-s. Implemented only in the [default] KECCAK_2X variant.
33 */
34 #define KECCAK_COMPLEMENTING_TRANSFORM
35 #endif
36
37 #if defined(__x86_64__) || defined(__aarch64__) || defined(__mips64) || defined(__ia64) || (defined(__VMS) && !defined(__vax))
38 /*
39 * These are available even in ILP32 flavours, but even then they are
40 * capable of performing 64-bit operations as efficiently as in *P64.
41 * Since it's not given that we can use sizeof(void *), just shunt it.
42 */
43 #define BIT_INTERLEAVE (0)
44 #else
45 #define BIT_INTERLEAVE (sizeof(void *) < 8)
46 #endif
47
48 #define ROL32(a, offset) (((a) << (offset)) | ((a) >> ((32 - (offset)) & 31)))
49
ROL64(uint64_t val,int offset)50 static uint64_t ROL64(uint64_t val, int offset)
51 {
52 if (offset == 0) {
53 return val;
54 } else if (!BIT_INTERLEAVE) {
55 return (val << offset) | (val >> (64 - offset));
56 } else {
57 uint32_t hi = (uint32_t)(val >> 32), lo = (uint32_t)val;
58
59 if (offset & 1) {
60 uint32_t tmp = hi;
61
62 offset >>= 1;
63 hi = ROL32(lo, offset);
64 lo = ROL32(tmp, offset + 1);
65 } else {
66 offset >>= 1;
67 lo = ROL32(lo, offset);
68 hi = ROL32(hi, offset);
69 }
70
71 return ((uint64_t)hi << 32) | lo;
72 }
73 }
74
75 static const unsigned char rhotates[5][5] = {
76 { 0, 1, 62, 28, 27 },
77 { 36, 44, 6, 55, 20 },
78 { 3, 10, 43, 25, 39 },
79 { 41, 45, 15, 21, 8 },
80 { 18, 2, 61, 56, 14 }
81 };
82
83 static const uint64_t iotas[] = {
84 BIT_INTERLEAVE ? 0x0000000000000001ULL : 0x0000000000000001ULL,
85 BIT_INTERLEAVE ? 0x0000008900000000ULL : 0x0000000000008082ULL,
86 BIT_INTERLEAVE ? 0x8000008b00000000ULL : 0x800000000000808aULL,
87 BIT_INTERLEAVE ? 0x8000808000000000ULL : 0x8000000080008000ULL,
88 BIT_INTERLEAVE ? 0x0000008b00000001ULL : 0x000000000000808bULL,
89 BIT_INTERLEAVE ? 0x0000800000000001ULL : 0x0000000080000001ULL,
90 BIT_INTERLEAVE ? 0x8000808800000001ULL : 0x8000000080008081ULL,
91 BIT_INTERLEAVE ? 0x8000008200000001ULL : 0x8000000000008009ULL,
92 BIT_INTERLEAVE ? 0x0000000b00000000ULL : 0x000000000000008aULL,
93 BIT_INTERLEAVE ? 0x0000000a00000000ULL : 0x0000000000000088ULL,
94 BIT_INTERLEAVE ? 0x0000808200000001ULL : 0x0000000080008009ULL,
95 BIT_INTERLEAVE ? 0x0000800300000000ULL : 0x000000008000000aULL,
96 BIT_INTERLEAVE ? 0x0000808b00000001ULL : 0x000000008000808bULL,
97 BIT_INTERLEAVE ? 0x8000000b00000001ULL : 0x800000000000008bULL,
98 BIT_INTERLEAVE ? 0x8000008a00000001ULL : 0x8000000000008089ULL,
99 BIT_INTERLEAVE ? 0x8000008100000001ULL : 0x8000000000008003ULL,
100 BIT_INTERLEAVE ? 0x8000008100000000ULL : 0x8000000000008002ULL,
101 BIT_INTERLEAVE ? 0x8000000800000000ULL : 0x8000000000000080ULL,
102 BIT_INTERLEAVE ? 0x0000008300000000ULL : 0x000000000000800aULL,
103 BIT_INTERLEAVE ? 0x8000800300000000ULL : 0x800000008000000aULL,
104 BIT_INTERLEAVE ? 0x8000808800000001ULL : 0x8000000080008081ULL,
105 BIT_INTERLEAVE ? 0x8000008800000000ULL : 0x8000000000008080ULL,
106 BIT_INTERLEAVE ? 0x0000800000000001ULL : 0x0000000080000001ULL,
107 BIT_INTERLEAVE ? 0x8000808200000000ULL : 0x8000000080008008ULL
108 };
109
110 #if defined(KECCAK_REF)
111 /*
112 * This is straightforward or "maximum clarity" implementation aiming
113 * to resemble section 3.2 of the FIPS PUB 202 "SHA-3 Standard:
114 * Permutation-Based Hash and Extendible-Output Functions" as much as
115 * possible. With one caveat. Because of the way C stores matrices,
116 * references to A[x,y] in the specification are presented as A[y][x].
117 * Implementation unrolls inner x-loops so that modulo 5 operations are
118 * explicitly pre-computed.
119 */
Theta(uint64_t A[5][5])120 static void Theta(uint64_t A[5][5])
121 {
122 uint64_t C[5], D[5];
123 size_t y;
124
125 C[0] = A[0][0];
126 C[1] = A[0][1];
127 C[2] = A[0][2];
128 C[3] = A[0][3];
129 C[4] = A[0][4];
130
131 for (y = 1; y < 5; y++) {
132 C[0] ^= A[y][0];
133 C[1] ^= A[y][1];
134 C[2] ^= A[y][2];
135 C[3] ^= A[y][3];
136 C[4] ^= A[y][4];
137 }
138
139 D[0] = ROL64(C[1], 1) ^ C[4];
140 D[1] = ROL64(C[2], 1) ^ C[0];
141 D[2] = ROL64(C[3], 1) ^ C[1];
142 D[3] = ROL64(C[4], 1) ^ C[2];
143 D[4] = ROL64(C[0], 1) ^ C[3];
144
145 for (y = 0; y < 5; y++) {
146 A[y][0] ^= D[0];
147 A[y][1] ^= D[1];
148 A[y][2] ^= D[2];
149 A[y][3] ^= D[3];
150 A[y][4] ^= D[4];
151 }
152 }
153
Rho(uint64_t A[5][5])154 static void Rho(uint64_t A[5][5])
155 {
156 size_t y;
157
158 for (y = 0; y < 5; y++) {
159 A[y][0] = ROL64(A[y][0], rhotates[y][0]);
160 A[y][1] = ROL64(A[y][1], rhotates[y][1]);
161 A[y][2] = ROL64(A[y][2], rhotates[y][2]);
162 A[y][3] = ROL64(A[y][3], rhotates[y][3]);
163 A[y][4] = ROL64(A[y][4], rhotates[y][4]);
164 }
165 }
166
Pi(uint64_t A[5][5])167 static void Pi(uint64_t A[5][5])
168 {
169 uint64_t T[5][5];
170
171 /*
172 * T = A
173 * A[y][x] = T[x][(3*y+x)%5]
174 */
175 memcpy(T, A, sizeof(T));
176
177 A[0][0] = T[0][0];
178 A[0][1] = T[1][1];
179 A[0][2] = T[2][2];
180 A[0][3] = T[3][3];
181 A[0][4] = T[4][4];
182
183 A[1][0] = T[0][3];
184 A[1][1] = T[1][4];
185 A[1][2] = T[2][0];
186 A[1][3] = T[3][1];
187 A[1][4] = T[4][2];
188
189 A[2][0] = T[0][1];
190 A[2][1] = T[1][2];
191 A[2][2] = T[2][3];
192 A[2][3] = T[3][4];
193 A[2][4] = T[4][0];
194
195 A[3][0] = T[0][4];
196 A[3][1] = T[1][0];
197 A[3][2] = T[2][1];
198 A[3][3] = T[3][2];
199 A[3][4] = T[4][3];
200
201 A[4][0] = T[0][2];
202 A[4][1] = T[1][3];
203 A[4][2] = T[2][4];
204 A[4][3] = T[3][0];
205 A[4][4] = T[4][1];
206 }
207
Chi(uint64_t A[5][5])208 static void Chi(uint64_t A[5][5])
209 {
210 uint64_t C[5];
211 size_t y;
212
213 for (y = 0; y < 5; y++) {
214 C[0] = A[y][0] ^ (~A[y][1] & A[y][2]);
215 C[1] = A[y][1] ^ (~A[y][2] & A[y][3]);
216 C[2] = A[y][2] ^ (~A[y][3] & A[y][4]);
217 C[3] = A[y][3] ^ (~A[y][4] & A[y][0]);
218 C[4] = A[y][4] ^ (~A[y][0] & A[y][1]);
219
220 A[y][0] = C[0];
221 A[y][1] = C[1];
222 A[y][2] = C[2];
223 A[y][3] = C[3];
224 A[y][4] = C[4];
225 }
226 }
227
Iota(uint64_t A[5][5],size_t i)228 static void Iota(uint64_t A[5][5], size_t i)
229 {
230 assert(i < OSSL_NELEM(iotas));
231 A[0][0] ^= iotas[i];
232 }
233
KeccakF1600(uint64_t A[5][5])234 static void KeccakF1600(uint64_t A[5][5])
235 {
236 size_t i;
237
238 for (i = 0; i < 24; i++) {
239 Theta(A);
240 Rho(A);
241 Pi(A);
242 Chi(A);
243 Iota(A, i);
244 }
245 }
246
247 #elif defined(KECCAK_1X)
248 /*
249 * This implementation is optimization of above code featuring unroll
250 * of even y-loops, their fusion and code motion. It also minimizes
251 * temporary storage. Compiler would normally do all these things for
252 * you, purpose of manual optimization is to provide "unobscured"
253 * reference for assembly implementation [in case this approach is
254 * chosen for implementation on some platform]. In the nutshell it's
255 * equivalent of "plane-per-plane processing" approach discussed in
256 * section 2.4 of "Keccak implementation overview".
257 */
Round(uint64_t A[5][5],size_t i)258 static void Round(uint64_t A[5][5], size_t i)
259 {
260 uint64_t C[5], E[2]; /* registers */
261 uint64_t D[5], T[2][5]; /* memory */
262
263 assert(i < OSSL_NELEM(iotas));
264
265 C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
266 C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
267 C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
268 C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
269 C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];
270
271 #if defined(__arm__)
272 D[1] = E[0] = ROL64(C[2], 1) ^ C[0];
273 D[4] = E[1] = ROL64(C[0], 1) ^ C[3];
274 D[0] = C[0] = ROL64(C[1], 1) ^ C[4];
275 D[2] = C[1] = ROL64(C[3], 1) ^ C[1];
276 D[3] = C[2] = ROL64(C[4], 1) ^ C[2];
277
278 T[0][0] = A[3][0] ^ C[0]; /* borrow T[0][0] */
279 T[0][1] = A[0][1] ^ E[0]; /* D[1] */
280 T[0][2] = A[0][2] ^ C[1]; /* D[2] */
281 T[0][3] = A[0][3] ^ C[2]; /* D[3] */
282 T[0][4] = A[0][4] ^ E[1]; /* D[4] */
283
284 C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]); /* D[3] */
285 C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]); /* D[4] */
286 C[0] = A[0][0] ^ C[0]; /* rotate by 0 */ /* D[0] */
287 C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]); /* D[2] */
288 C[1] = ROL64(A[1][1] ^ E[0], rhotates[1][1]); /* D[1] */
289 #else
290 D[0] = ROL64(C[1], 1) ^ C[4];
291 D[1] = ROL64(C[2], 1) ^ C[0];
292 D[2] = ROL64(C[3], 1) ^ C[1];
293 D[3] = ROL64(C[4], 1) ^ C[2];
294 D[4] = ROL64(C[0], 1) ^ C[3];
295
296 T[0][0] = A[3][0] ^ D[0]; /* borrow T[0][0] */
297 T[0][1] = A[0][1] ^ D[1];
298 T[0][2] = A[0][2] ^ D[2];
299 T[0][3] = A[0][3] ^ D[3];
300 T[0][4] = A[0][4] ^ D[4];
301
302 C[0] = A[0][0] ^ D[0]; /* rotate by 0 */
303 C[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]);
304 C[2] = ROL64(A[2][2] ^ D[2], rhotates[2][2]);
305 C[3] = ROL64(A[3][3] ^ D[3], rhotates[3][3]);
306 C[4] = ROL64(A[4][4] ^ D[4], rhotates[4][4]);
307 #endif
308 A[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
309 A[0][1] = C[1] ^ (~C[2] & C[3]);
310 A[0][2] = C[2] ^ (~C[3] & C[4]);
311 A[0][3] = C[3] ^ (~C[4] & C[0]);
312 A[0][4] = C[4] ^ (~C[0] & C[1]);
313
314 T[1][0] = A[1][0] ^ (C[3] = D[0]);
315 T[1][1] = A[2][1] ^ (C[4] = D[1]); /* borrow T[1][1] */
316 T[1][2] = A[1][2] ^ (E[0] = D[2]);
317 T[1][3] = A[1][3] ^ (E[1] = D[3]);
318 T[1][4] = A[2][4] ^ (C[2] = D[4]); /* borrow T[1][4] */
319
320 C[0] = ROL64(T[0][3], rhotates[0][3]);
321 C[1] = ROL64(A[1][4] ^ C[2], rhotates[1][4]); /* D[4] */
322 C[2] = ROL64(A[2][0] ^ C[3], rhotates[2][0]); /* D[0] */
323 C[3] = ROL64(A[3][1] ^ C[4], rhotates[3][1]); /* D[1] */
324 C[4] = ROL64(A[4][2] ^ E[0], rhotates[4][2]); /* D[2] */
325
326 A[1][0] = C[0] ^ (~C[1] & C[2]);
327 A[1][1] = C[1] ^ (~C[2] & C[3]);
328 A[1][2] = C[2] ^ (~C[3] & C[4]);
329 A[1][3] = C[3] ^ (~C[4] & C[0]);
330 A[1][4] = C[4] ^ (~C[0] & C[1]);
331
332 C[0] = ROL64(T[0][1], rhotates[0][1]);
333 C[1] = ROL64(T[1][2], rhotates[1][2]);
334 C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
335 C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
336 C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
337
338 A[2][0] = C[0] ^ (~C[1] & C[2]);
339 A[2][1] = C[1] ^ (~C[2] & C[3]);
340 A[2][2] = C[2] ^ (~C[3] & C[4]);
341 A[2][3] = C[3] ^ (~C[4] & C[0]);
342 A[2][4] = C[4] ^ (~C[0] & C[1]);
343
344 C[0] = ROL64(T[0][4], rhotates[0][4]);
345 C[1] = ROL64(T[1][0], rhotates[1][0]);
346 C[2] = ROL64(T[1][1], rhotates[2][1]); /* originally A[2][1] */
347 C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
348 C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
349
350 A[3][0] = C[0] ^ (~C[1] & C[2]);
351 A[3][1] = C[1] ^ (~C[2] & C[3]);
352 A[3][2] = C[2] ^ (~C[3] & C[4]);
353 A[3][3] = C[3] ^ (~C[4] & C[0]);
354 A[3][4] = C[4] ^ (~C[0] & C[1]);
355
356 C[0] = ROL64(T[0][2], rhotates[0][2]);
357 C[1] = ROL64(T[1][3], rhotates[1][3]);
358 C[2] = ROL64(T[1][4], rhotates[2][4]); /* originally A[2][4] */
359 C[3] = ROL64(T[0][0], rhotates[3][0]); /* originally A[3][0] */
360 C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
361
362 A[4][0] = C[0] ^ (~C[1] & C[2]);
363 A[4][1] = C[1] ^ (~C[2] & C[3]);
364 A[4][2] = C[2] ^ (~C[3] & C[4]);
365 A[4][3] = C[3] ^ (~C[4] & C[0]);
366 A[4][4] = C[4] ^ (~C[0] & C[1]);
367 }
368
KeccakF1600(uint64_t A[5][5])369 static void KeccakF1600(uint64_t A[5][5])
370 {
371 size_t i;
372
373 for (i = 0; i < 24; i++) {
374 Round(A, i);
375 }
376 }
377
378 #elif defined(KECCAK_1X_ALT)
379 /*
380 * This is variant of above KECCAK_1X that reduces requirement for
381 * temporary storage even further, but at cost of more updates to A[][].
382 * It's less suitable if A[][] is memory bound, but better if it's
383 * register bound.
384 */
385
Round(uint64_t A[5][5],size_t i)386 static void Round(uint64_t A[5][5], size_t i)
387 {
388 uint64_t C[5], D[5];
389
390 assert(i < OSSL_NELEM(iotas));
391
392 C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
393 C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
394 C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
395 C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
396 C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];
397
398 D[1] = C[0] ^ ROL64(C[2], 1);
399 D[2] = C[1] ^ ROL64(C[3], 1);
400 D[3] = C[2] ^= ROL64(C[4], 1);
401 D[4] = C[3] ^= ROL64(C[0], 1);
402 D[0] = C[4] ^= ROL64(C[1], 1);
403
404 A[0][1] ^= D[1];
405 A[1][1] ^= D[1];
406 A[2][1] ^= D[1];
407 A[3][1] ^= D[1];
408 A[4][1] ^= D[1];
409
410 A[0][2] ^= D[2];
411 A[1][2] ^= D[2];
412 A[2][2] ^= D[2];
413 A[3][2] ^= D[2];
414 A[4][2] ^= D[2];
415
416 A[0][3] ^= C[2];
417 A[1][3] ^= C[2];
418 A[2][3] ^= C[2];
419 A[3][3] ^= C[2];
420 A[4][3] ^= C[2];
421
422 A[0][4] ^= C[3];
423 A[1][4] ^= C[3];
424 A[2][4] ^= C[3];
425 A[3][4] ^= C[3];
426 A[4][4] ^= C[3];
427
428 A[0][0] ^= C[4];
429 A[1][0] ^= C[4];
430 A[2][0] ^= C[4];
431 A[3][0] ^= C[4];
432 A[4][0] ^= C[4];
433
434 C[1] = A[0][1];
435 C[2] = A[0][2];
436 C[3] = A[0][3];
437 C[4] = A[0][4];
438
439 A[0][1] = ROL64(A[1][1], rhotates[1][1]);
440 A[0][2] = ROL64(A[2][2], rhotates[2][2]);
441 A[0][3] = ROL64(A[3][3], rhotates[3][3]);
442 A[0][4] = ROL64(A[4][4], rhotates[4][4]);
443
444 A[1][1] = ROL64(A[1][4], rhotates[1][4]);
445 A[2][2] = ROL64(A[2][3], rhotates[2][3]);
446 A[3][3] = ROL64(A[3][2], rhotates[3][2]);
447 A[4][4] = ROL64(A[4][1], rhotates[4][1]);
448
449 A[1][4] = ROL64(A[4][2], rhotates[4][2]);
450 A[2][3] = ROL64(A[3][4], rhotates[3][4]);
451 A[3][2] = ROL64(A[2][1], rhotates[2][1]);
452 A[4][1] = ROL64(A[1][3], rhotates[1][3]);
453
454 A[4][2] = ROL64(A[2][4], rhotates[2][4]);
455 A[3][4] = ROL64(A[4][3], rhotates[4][3]);
456 A[2][1] = ROL64(A[1][2], rhotates[1][2]);
457 A[1][3] = ROL64(A[3][1], rhotates[3][1]);
458
459 A[2][4] = ROL64(A[4][0], rhotates[4][0]);
460 A[4][3] = ROL64(A[3][0], rhotates[3][0]);
461 A[1][2] = ROL64(A[2][0], rhotates[2][0]);
462 A[3][1] = ROL64(A[1][0], rhotates[1][0]);
463
464 A[1][0] = ROL64(C[3], rhotates[0][3]);
465 A[2][0] = ROL64(C[1], rhotates[0][1]);
466 A[3][0] = ROL64(C[4], rhotates[0][4]);
467 A[4][0] = ROL64(C[2], rhotates[0][2]);
468
469 C[0] = A[0][0];
470 C[1] = A[1][0];
471 D[0] = A[0][1];
472 D[1] = A[1][1];
473
474 A[0][0] ^= (~A[0][1] & A[0][2]);
475 A[1][0] ^= (~A[1][1] & A[1][2]);
476 A[0][1] ^= (~A[0][2] & A[0][3]);
477 A[1][1] ^= (~A[1][2] & A[1][3]);
478 A[0][2] ^= (~A[0][3] & A[0][4]);
479 A[1][2] ^= (~A[1][3] & A[1][4]);
480 A[0][3] ^= (~A[0][4] & C[0]);
481 A[1][3] ^= (~A[1][4] & C[1]);
482 A[0][4] ^= (~C[0] & D[0]);
483 A[1][4] ^= (~C[1] & D[1]);
484
485 C[2] = A[2][0];
486 C[3] = A[3][0];
487 D[2] = A[2][1];
488 D[3] = A[3][1];
489
490 A[2][0] ^= (~A[2][1] & A[2][2]);
491 A[3][0] ^= (~A[3][1] & A[3][2]);
492 A[2][1] ^= (~A[2][2] & A[2][3]);
493 A[3][1] ^= (~A[3][2] & A[3][3]);
494 A[2][2] ^= (~A[2][3] & A[2][4]);
495 A[3][2] ^= (~A[3][3] & A[3][4]);
496 A[2][3] ^= (~A[2][4] & C[2]);
497 A[3][3] ^= (~A[3][4] & C[3]);
498 A[2][4] ^= (~C[2] & D[2]);
499 A[3][4] ^= (~C[3] & D[3]);
500
501 C[4] = A[4][0];
502 D[4] = A[4][1];
503
504 A[4][0] ^= (~A[4][1] & A[4][2]);
505 A[4][1] ^= (~A[4][2] & A[4][3]);
506 A[4][2] ^= (~A[4][3] & A[4][4]);
507 A[4][3] ^= (~A[4][4] & C[4]);
508 A[4][4] ^= (~C[4] & D[4]);
509 A[0][0] ^= iotas[i];
510 }
511
KeccakF1600(uint64_t A[5][5])512 static void KeccakF1600(uint64_t A[5][5])
513 {
514 size_t i;
515
516 for (i = 0; i < 24; i++) {
517 Round(A, i);
518 }
519 }
520
521 #elif defined(KECCAK_2X)
522 /*
523 * This implementation is variant of KECCAK_1X above with outer-most
524 * round loop unrolled twice. This allows to take temporary storage
525 * out of round procedure and simplify references to it by alternating
526 * it with actual data (see round loop below). Originally it was meant
527 * rather as reference for an assembly implementation, but it seems to
528 * play best with compilers [as well as provide best instruction per
529 * processed byte ratio at minimal round unroll factor]...
530 */
Round(uint64_t R[5][5],uint64_t A[5][5],size_t i)531 static void Round(uint64_t R[5][5], uint64_t A[5][5], size_t i)
532 {
533 uint64_t C[5], D[5];
534
535 assert(i < OSSL_NELEM(iotas));
536
537 C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
538 C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
539 C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
540 C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
541 C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];
542
543 D[0] = ROL64(C[1], 1) ^ C[4];
544 D[1] = ROL64(C[2], 1) ^ C[0];
545 D[2] = ROL64(C[3], 1) ^ C[1];
546 D[3] = ROL64(C[4], 1) ^ C[2];
547 D[4] = ROL64(C[0], 1) ^ C[3];
548
549 C[0] = A[0][0] ^ D[0]; /* rotate by 0 */
550 C[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]);
551 C[2] = ROL64(A[2][2] ^ D[2], rhotates[2][2]);
552 C[3] = ROL64(A[3][3] ^ D[3], rhotates[3][3]);
553 C[4] = ROL64(A[4][4] ^ D[4], rhotates[4][4]);
554
555 #ifdef KECCAK_COMPLEMENTING_TRANSFORM
556 R[0][0] = C[0] ^ (C[1] | C[2]) ^ iotas[i];
557 R[0][1] = C[1] ^ (~C[2] | C[3]);
558 R[0][2] = C[2] ^ (C[3] & C[4]);
559 R[0][3] = C[3] ^ (C[4] | C[0]);
560 R[0][4] = C[4] ^ (C[0] & C[1]);
561 #else
562 R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
563 R[0][1] = C[1] ^ (~C[2] & C[3]);
564 R[0][2] = C[2] ^ (~C[3] & C[4]);
565 R[0][3] = C[3] ^ (~C[4] & C[0]);
566 R[0][4] = C[4] ^ (~C[0] & C[1]);
567 #endif
568
569 C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
570 C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
571 C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
572 C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
573 C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
574
575 #ifdef KECCAK_COMPLEMENTING_TRANSFORM
576 R[1][0] = C[0] ^ (C[1] | C[2]);
577 R[1][1] = C[1] ^ (C[2] & C[3]);
578 R[1][2] = C[2] ^ (C[3] | ~C[4]);
579 R[1][3] = C[3] ^ (C[4] | C[0]);
580 R[1][4] = C[4] ^ (C[0] & C[1]);
581 #else
582 R[1][0] = C[0] ^ (~C[1] & C[2]);
583 R[1][1] = C[1] ^ (~C[2] & C[3]);
584 R[1][2] = C[2] ^ (~C[3] & C[4]);
585 R[1][3] = C[3] ^ (~C[4] & C[0]);
586 R[1][4] = C[4] ^ (~C[0] & C[1]);
587 #endif
588
589 C[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);
590 C[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
591 C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
592 C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
593 C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
594
595 #ifdef KECCAK_COMPLEMENTING_TRANSFORM
596 R[2][0] = C[0] ^ (C[1] | C[2]);
597 R[2][1] = C[1] ^ (C[2] & C[3]);
598 R[2][2] = C[2] ^ (~C[3] & C[4]);
599 R[2][3] = ~C[3] ^ (C[4] | C[0]);
600 R[2][4] = C[4] ^ (C[0] & C[1]);
601 #else
602 R[2][0] = C[0] ^ (~C[1] & C[2]);
603 R[2][1] = C[1] ^ (~C[2] & C[3]);
604 R[2][2] = C[2] ^ (~C[3] & C[4]);
605 R[2][3] = C[3] ^ (~C[4] & C[0]);
606 R[2][4] = C[4] ^ (~C[0] & C[1]);
607 #endif
608
609 C[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
610 C[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
611 C[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
612 C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
613 C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
614
615 #ifdef KECCAK_COMPLEMENTING_TRANSFORM
616 R[3][0] = C[0] ^ (C[1] & C[2]);
617 R[3][1] = C[1] ^ (C[2] | C[3]);
618 R[3][2] = C[2] ^ (~C[3] | C[4]);
619 R[3][3] = ~C[3] ^ (C[4] & C[0]);
620 R[3][4] = C[4] ^ (C[0] | C[1]);
621 #else
622 R[3][0] = C[0] ^ (~C[1] & C[2]);
623 R[3][1] = C[1] ^ (~C[2] & C[3]);
624 R[3][2] = C[2] ^ (~C[3] & C[4]);
625 R[3][3] = C[3] ^ (~C[4] & C[0]);
626 R[3][4] = C[4] ^ (~C[0] & C[1]);
627 #endif
628
629 C[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
630 C[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
631 C[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
632 C[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
633 C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
634
635 #ifdef KECCAK_COMPLEMENTING_TRANSFORM
636 R[4][0] = C[0] ^ (~C[1] & C[2]);
637 R[4][1] = ~C[1] ^ (C[2] | C[3]);
638 R[4][2] = C[2] ^ (C[3] & C[4]);
639 R[4][3] = C[3] ^ (C[4] | C[0]);
640 R[4][4] = C[4] ^ (C[0] & C[1]);
641 #else
642 R[4][0] = C[0] ^ (~C[1] & C[2]);
643 R[4][1] = C[1] ^ (~C[2] & C[3]);
644 R[4][2] = C[2] ^ (~C[3] & C[4]);
645 R[4][3] = C[3] ^ (~C[4] & C[0]);
646 R[4][4] = C[4] ^ (~C[0] & C[1]);
647 #endif
648 }
649
KeccakF1600(uint64_t A[5][5])650 static void KeccakF1600(uint64_t A[5][5])
651 {
652 uint64_t T[5][5];
653 size_t i;
654
655 #ifdef KECCAK_COMPLEMENTING_TRANSFORM
656 A[0][1] = ~A[0][1];
657 A[0][2] = ~A[0][2];
658 A[1][3] = ~A[1][3];
659 A[2][2] = ~A[2][2];
660 A[3][2] = ~A[3][2];
661 A[4][0] = ~A[4][0];
662 #endif
663
664 for (i = 0; i < 24; i += 2) {
665 Round(T, A, i);
666 Round(A, T, i + 1);
667 }
668
669 #ifdef KECCAK_COMPLEMENTING_TRANSFORM
670 A[0][1] = ~A[0][1];
671 A[0][2] = ~A[0][2];
672 A[1][3] = ~A[1][3];
673 A[2][2] = ~A[2][2];
674 A[3][2] = ~A[3][2];
675 A[4][0] = ~A[4][0];
676 #endif
677 }
678
679 #else /* define KECCAK_INPLACE to compile this code path */
680 /*
681 * This implementation is KECCAK_1X from above combined 4 times with
682 * a twist that allows to omit temporary storage and perform in-place
683 * processing. It's discussed in section 2.5 of "Keccak implementation
684 * overview". It's likely to be best suited for processors with large
685 * register bank... On the other hand processor with large register
686 * bank can as well use KECCAK_1X_ALT, it would be as fast but much
687 * more compact...
688 */
FourRounds(uint64_t A[5][5],size_t i)689 static void FourRounds(uint64_t A[5][5], size_t i)
690 {
691 uint64_t B[5], C[5], D[5];
692
693 assert(i <= OSSL_NELEM(iotas) - 4);
694
695 /* Round 4*n */
696 C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
697 C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
698 C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
699 C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
700 C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];
701
702 D[0] = ROL64(C[1], 1) ^ C[4];
703 D[1] = ROL64(C[2], 1) ^ C[0];
704 D[2] = ROL64(C[3], 1) ^ C[1];
705 D[3] = ROL64(C[4], 1) ^ C[2];
706 D[4] = ROL64(C[0], 1) ^ C[3];
707
708 B[0] = A[0][0] ^ D[0]; /* rotate by 0 */
709 B[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]);
710 B[2] = ROL64(A[2][2] ^ D[2], rhotates[2][2]);
711 B[3] = ROL64(A[3][3] ^ D[3], rhotates[3][3]);
712 B[4] = ROL64(A[4][4] ^ D[4], rhotates[4][4]);
713
714 C[0] = A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i];
715 C[1] = A[1][1] = B[1] ^ (~B[2] & B[3]);
716 C[2] = A[2][2] = B[2] ^ (~B[3] & B[4]);
717 C[3] = A[3][3] = B[3] ^ (~B[4] & B[0]);
718 C[4] = A[4][4] = B[4] ^ (~B[0] & B[1]);
719
720 B[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
721 B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
722 B[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
723 B[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
724 B[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
725
726 C[0] ^= A[2][0] = B[0] ^ (~B[1] & B[2]);
727 C[1] ^= A[3][1] = B[1] ^ (~B[2] & B[3]);
728 C[2] ^= A[4][2] = B[2] ^ (~B[3] & B[4]);
729 C[3] ^= A[0][3] = B[3] ^ (~B[4] & B[0]);
730 C[4] ^= A[1][4] = B[4] ^ (~B[0] & B[1]);
731
732 B[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);
733 B[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
734 B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
735 B[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
736 B[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
737
738 C[0] ^= A[4][0] = B[0] ^ (~B[1] & B[2]);
739 C[1] ^= A[0][1] = B[1] ^ (~B[2] & B[3]);
740 C[2] ^= A[1][2] = B[2] ^ (~B[3] & B[4]);
741 C[3] ^= A[2][3] = B[3] ^ (~B[4] & B[0]);
742 C[4] ^= A[3][4] = B[4] ^ (~B[0] & B[1]);
743
744 B[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
745 B[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
746 B[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
747 B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
748 B[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
749
750 C[0] ^= A[1][0] = B[0] ^ (~B[1] & B[2]);
751 C[1] ^= A[2][1] = B[1] ^ (~B[2] & B[3]);
752 C[2] ^= A[3][2] = B[2] ^ (~B[3] & B[4]);
753 C[3] ^= A[4][3] = B[3] ^ (~B[4] & B[0]);
754 C[4] ^= A[0][4] = B[4] ^ (~B[0] & B[1]);
755
756 B[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
757 B[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
758 B[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
759 B[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
760 B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
761
762 C[0] ^= A[3][0] = B[0] ^ (~B[1] & B[2]);
763 C[1] ^= A[4][1] = B[1] ^ (~B[2] & B[3]);
764 C[2] ^= A[0][2] = B[2] ^ (~B[3] & B[4]);
765 C[3] ^= A[1][3] = B[3] ^ (~B[4] & B[0]);
766 C[4] ^= A[2][4] = B[4] ^ (~B[0] & B[1]);
767
768 /* Round 4*n+1 */
769 D[0] = ROL64(C[1], 1) ^ C[4];
770 D[1] = ROL64(C[2], 1) ^ C[0];
771 D[2] = ROL64(C[3], 1) ^ C[1];
772 D[3] = ROL64(C[4], 1) ^ C[2];
773 D[4] = ROL64(C[0], 1) ^ C[3];
774
775 B[0] = A[0][0] ^ D[0]; /* rotate by 0 */
776 B[1] = ROL64(A[3][1] ^ D[1], rhotates[1][1]);
777 B[2] = ROL64(A[1][2] ^ D[2], rhotates[2][2]);
778 B[3] = ROL64(A[4][3] ^ D[3], rhotates[3][3]);
779 B[4] = ROL64(A[2][4] ^ D[4], rhotates[4][4]);
780
781 C[0] = A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i + 1];
782 C[1] = A[3][1] = B[1] ^ (~B[2] & B[3]);
783 C[2] = A[1][2] = B[2] ^ (~B[3] & B[4]);
784 C[3] = A[4][3] = B[3] ^ (~B[4] & B[0]);
785 C[4] = A[2][4] = B[4] ^ (~B[0] & B[1]);
786
787 B[0] = ROL64(A[3][3] ^ D[3], rhotates[0][3]);
788 B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
789 B[2] = ROL64(A[4][0] ^ D[0], rhotates[2][0]);
790 B[3] = ROL64(A[2][1] ^ D[1], rhotates[3][1]);
791 B[4] = ROL64(A[0][2] ^ D[2], rhotates[4][2]);
792
793 C[0] ^= A[4][0] = B[0] ^ (~B[1] & B[2]);
794 C[1] ^= A[2][1] = B[1] ^ (~B[2] & B[3]);
795 C[2] ^= A[0][2] = B[2] ^ (~B[3] & B[4]);
796 C[3] ^= A[3][3] = B[3] ^ (~B[4] & B[0]);
797 C[4] ^= A[1][4] = B[4] ^ (~B[0] & B[1]);
798
799 B[0] = ROL64(A[1][1] ^ D[1], rhotates[0][1]);
800 B[1] = ROL64(A[4][2] ^ D[2], rhotates[1][2]);
801 B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
802 B[3] = ROL64(A[0][4] ^ D[4], rhotates[3][4]);
803 B[4] = ROL64(A[3][0] ^ D[0], rhotates[4][0]);
804
805 C[0] ^= A[3][0] = B[0] ^ (~B[1] & B[2]);
806 C[1] ^= A[1][1] = B[1] ^ (~B[2] & B[3]);
807 C[2] ^= A[4][2] = B[2] ^ (~B[3] & B[4]);
808 C[3] ^= A[2][3] = B[3] ^ (~B[4] & B[0]);
809 C[4] ^= A[0][4] = B[4] ^ (~B[0] & B[1]);
810
811 B[0] = ROL64(A[4][4] ^ D[4], rhotates[0][4]);
812 B[1] = ROL64(A[2][0] ^ D[0], rhotates[1][0]);
813 B[2] = ROL64(A[0][1] ^ D[1], rhotates[2][1]);
814 B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
815 B[4] = ROL64(A[1][3] ^ D[3], rhotates[4][3]);
816
817 C[0] ^= A[2][0] = B[0] ^ (~B[1] & B[2]);
818 C[1] ^= A[0][1] = B[1] ^ (~B[2] & B[3]);
819 C[2] ^= A[3][2] = B[2] ^ (~B[3] & B[4]);
820 C[3] ^= A[1][3] = B[3] ^ (~B[4] & B[0]);
821 C[4] ^= A[4][4] = B[4] ^ (~B[0] & B[1]);
822
823 B[0] = ROL64(A[2][2] ^ D[2], rhotates[0][2]);
824 B[1] = ROL64(A[0][3] ^ D[3], rhotates[1][3]);
825 B[2] = ROL64(A[3][4] ^ D[4], rhotates[2][4]);
826 B[3] = ROL64(A[1][0] ^ D[0], rhotates[3][0]);
827 B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
828
829 C[0] ^= A[1][0] = B[0] ^ (~B[1] & B[2]);
830 C[1] ^= A[4][1] = B[1] ^ (~B[2] & B[3]);
831 C[2] ^= A[2][2] = B[2] ^ (~B[3] & B[4]);
832 C[3] ^= A[0][3] = B[3] ^ (~B[4] & B[0]);
833 C[4] ^= A[3][4] = B[4] ^ (~B[0] & B[1]);
834
835 /* Round 4*n+2 */
836 D[0] = ROL64(C[1], 1) ^ C[4];
837 D[1] = ROL64(C[2], 1) ^ C[0];
838 D[2] = ROL64(C[3], 1) ^ C[1];
839 D[3] = ROL64(C[4], 1) ^ C[2];
840 D[4] = ROL64(C[0], 1) ^ C[3];
841
842 B[0] = A[0][0] ^ D[0]; /* rotate by 0 */
843 B[1] = ROL64(A[2][1] ^ D[1], rhotates[1][1]);
844 B[2] = ROL64(A[4][2] ^ D[2], rhotates[2][2]);
845 B[3] = ROL64(A[1][3] ^ D[3], rhotates[3][3]);
846 B[4] = ROL64(A[3][4] ^ D[4], rhotates[4][4]);
847
848 C[0] = A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i + 2];
849 C[1] = A[2][1] = B[1] ^ (~B[2] & B[3]);
850 C[2] = A[4][2] = B[2] ^ (~B[3] & B[4]);
851 C[3] = A[1][3] = B[3] ^ (~B[4] & B[0]);
852 C[4] = A[3][4] = B[4] ^ (~B[0] & B[1]);
853
854 B[0] = ROL64(A[4][3] ^ D[3], rhotates[0][3]);
855 B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
856 B[2] = ROL64(A[3][0] ^ D[0], rhotates[2][0]);
857 B[3] = ROL64(A[0][1] ^ D[1], rhotates[3][1]);
858 B[4] = ROL64(A[2][2] ^ D[2], rhotates[4][2]);
859
860 C[0] ^= A[3][0] = B[0] ^ (~B[1] & B[2]);
861 C[1] ^= A[0][1] = B[1] ^ (~B[2] & B[3]);
862 C[2] ^= A[2][2] = B[2] ^ (~B[3] & B[4]);
863 C[3] ^= A[4][3] = B[3] ^ (~B[4] & B[0]);
864 C[4] ^= A[1][4] = B[4] ^ (~B[0] & B[1]);
865
866 B[0] = ROL64(A[3][1] ^ D[1], rhotates[0][1]);
867 B[1] = ROL64(A[0][2] ^ D[2], rhotates[1][2]);
868 B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
869 B[3] = ROL64(A[4][4] ^ D[4], rhotates[3][4]);
870 B[4] = ROL64(A[1][0] ^ D[0], rhotates[4][0]);
871
872 C[0] ^= A[1][0] = B[0] ^ (~B[1] & B[2]);
873 C[1] ^= A[3][1] = B[1] ^ (~B[2] & B[3]);
874 C[2] ^= A[0][2] = B[2] ^ (~B[3] & B[4]);
875 C[3] ^= A[2][3] = B[3] ^ (~B[4] & B[0]);
876 C[4] ^= A[4][4] = B[4] ^ (~B[0] & B[1]);
877
878 B[0] = ROL64(A[2][4] ^ D[4], rhotates[0][4]);
879 B[1] = ROL64(A[4][0] ^ D[0], rhotates[1][0]);
880 B[2] = ROL64(A[1][1] ^ D[1], rhotates[2][1]);
881 B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
882 B[4] = ROL64(A[0][3] ^ D[3], rhotates[4][3]);
883
884 C[0] ^= A[4][0] = B[0] ^ (~B[1] & B[2]);
885 C[1] ^= A[1][1] = B[1] ^ (~B[2] & B[3]);
886 C[2] ^= A[3][2] = B[2] ^ (~B[3] & B[4]);
887 C[3] ^= A[0][3] = B[3] ^ (~B[4] & B[0]);
888 C[4] ^= A[2][4] = B[4] ^ (~B[0] & B[1]);
889
890 B[0] = ROL64(A[1][2] ^ D[2], rhotates[0][2]);
891 B[1] = ROL64(A[3][3] ^ D[3], rhotates[1][3]);
892 B[2] = ROL64(A[0][4] ^ D[4], rhotates[2][4]);
893 B[3] = ROL64(A[2][0] ^ D[0], rhotates[3][0]);
894 B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
895
896 C[0] ^= A[2][0] = B[0] ^ (~B[1] & B[2]);
897 C[1] ^= A[4][1] = B[1] ^ (~B[2] & B[3]);
898 C[2] ^= A[1][2] = B[2] ^ (~B[3] & B[4]);
899 C[3] ^= A[3][3] = B[3] ^ (~B[4] & B[0]);
900 C[4] ^= A[0][4] = B[4] ^ (~B[0] & B[1]);
901
902 /* Round 4*n+3 */
903 D[0] = ROL64(C[1], 1) ^ C[4];
904 D[1] = ROL64(C[2], 1) ^ C[0];
905 D[2] = ROL64(C[3], 1) ^ C[1];
906 D[3] = ROL64(C[4], 1) ^ C[2];
907 D[4] = ROL64(C[0], 1) ^ C[3];
908
909 B[0] = A[0][0] ^ D[0]; /* rotate by 0 */
910 B[1] = ROL64(A[0][1] ^ D[1], rhotates[1][1]);
911 B[2] = ROL64(A[0][2] ^ D[2], rhotates[2][2]);
912 B[3] = ROL64(A[0][3] ^ D[3], rhotates[3][3]);
913 B[4] = ROL64(A[0][4] ^ D[4], rhotates[4][4]);
914
915 /* C[0] = */ A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i + 3];
916 /* C[1] = */ A[0][1] = B[1] ^ (~B[2] & B[3]);
917 /* C[2] = */ A[0][2] = B[2] ^ (~B[3] & B[4]);
918 /* C[3] = */ A[0][3] = B[3] ^ (~B[4] & B[0]);
919 /* C[4] = */ A[0][4] = B[4] ^ (~B[0] & B[1]);
920
921 B[0] = ROL64(A[1][3] ^ D[3], rhotates[0][3]);
922 B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
923 B[2] = ROL64(A[1][0] ^ D[0], rhotates[2][0]);
924 B[3] = ROL64(A[1][1] ^ D[1], rhotates[3][1]);
925 B[4] = ROL64(A[1][2] ^ D[2], rhotates[4][2]);
926
927 /* C[0] ^= */ A[1][0] = B[0] ^ (~B[1] & B[2]);
928 /* C[1] ^= */ A[1][1] = B[1] ^ (~B[2] & B[3]);
929 /* C[2] ^= */ A[1][2] = B[2] ^ (~B[3] & B[4]);
930 /* C[3] ^= */ A[1][3] = B[3] ^ (~B[4] & B[0]);
931 /* C[4] ^= */ A[1][4] = B[4] ^ (~B[0] & B[1]);
932
933 B[0] = ROL64(A[2][1] ^ D[1], rhotates[0][1]);
934 B[1] = ROL64(A[2][2] ^ D[2], rhotates[1][2]);
935 B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
936 B[3] = ROL64(A[2][4] ^ D[4], rhotates[3][4]);
937 B[4] = ROL64(A[2][0] ^ D[0], rhotates[4][0]);
938
939 /* C[0] ^= */ A[2][0] = B[0] ^ (~B[1] & B[2]);
940 /* C[1] ^= */ A[2][1] = B[1] ^ (~B[2] & B[3]);
941 /* C[2] ^= */ A[2][2] = B[2] ^ (~B[3] & B[4]);
942 /* C[3] ^= */ A[2][3] = B[3] ^ (~B[4] & B[0]);
943 /* C[4] ^= */ A[2][4] = B[4] ^ (~B[0] & B[1]);
944
945 B[0] = ROL64(A[3][4] ^ D[4], rhotates[0][4]);
946 B[1] = ROL64(A[3][0] ^ D[0], rhotates[1][0]);
947 B[2] = ROL64(A[3][1] ^ D[1], rhotates[2][1]);
948 B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
949 B[4] = ROL64(A[3][3] ^ D[3], rhotates[4][3]);
950
951 /* C[0] ^= */ A[3][0] = B[0] ^ (~B[1] & B[2]);
952 /* C[1] ^= */ A[3][1] = B[1] ^ (~B[2] & B[3]);
953 /* C[2] ^= */ A[3][2] = B[2] ^ (~B[3] & B[4]);
954 /* C[3] ^= */ A[3][3] = B[3] ^ (~B[4] & B[0]);
955 /* C[4] ^= */ A[3][4] = B[4] ^ (~B[0] & B[1]);
956
957 B[0] = ROL64(A[4][2] ^ D[2], rhotates[0][2]);
958 B[1] = ROL64(A[4][3] ^ D[3], rhotates[1][3]);
959 B[2] = ROL64(A[4][4] ^ D[4], rhotates[2][4]);
960 B[3] = ROL64(A[4][0] ^ D[0], rhotates[3][0]);
961 B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
962
963 /* C[0] ^= */ A[4][0] = B[0] ^ (~B[1] & B[2]);
964 /* C[1] ^= */ A[4][1] = B[1] ^ (~B[2] & B[3]);
965 /* C[2] ^= */ A[4][2] = B[2] ^ (~B[3] & B[4]);
966 /* C[3] ^= */ A[4][3] = B[3] ^ (~B[4] & B[0]);
967 /* C[4] ^= */ A[4][4] = B[4] ^ (~B[0] & B[1]);
968 }
969
KeccakF1600(uint64_t A[5][5])970 static void KeccakF1600(uint64_t A[5][5])
971 {
972 size_t i;
973
974 for (i = 0; i < 24; i += 4) {
975 FourRounds(A, i);
976 }
977 }
978
979 #endif
980
BitInterleave(uint64_t Ai)981 static uint64_t BitInterleave(uint64_t Ai)
982 {
983 if (BIT_INTERLEAVE) {
984 uint32_t hi = (uint32_t)(Ai >> 32), lo = (uint32_t)Ai;
985 uint32_t t0, t1;
986
987 t0 = lo & 0x55555555;
988 t0 |= t0 >> 1;
989 t0 &= 0x33333333;
990 t0 |= t0 >> 2;
991 t0 &= 0x0f0f0f0f;
992 t0 |= t0 >> 4;
993 t0 &= 0x00ff00ff;
994 t0 |= t0 >> 8;
995 t0 &= 0x0000ffff;
996
997 t1 = hi & 0x55555555;
998 t1 |= t1 >> 1;
999 t1 &= 0x33333333;
1000 t1 |= t1 >> 2;
1001 t1 &= 0x0f0f0f0f;
1002 t1 |= t1 >> 4;
1003 t1 &= 0x00ff00ff;
1004 t1 |= t1 >> 8;
1005 t1 <<= 16;
1006
1007 lo &= 0xaaaaaaaa;
1008 lo |= lo << 1;
1009 lo &= 0xcccccccc;
1010 lo |= lo << 2;
1011 lo &= 0xf0f0f0f0;
1012 lo |= lo << 4;
1013 lo &= 0xff00ff00;
1014 lo |= lo << 8;
1015 lo >>= 16;
1016
1017 hi &= 0xaaaaaaaa;
1018 hi |= hi << 1;
1019 hi &= 0xcccccccc;
1020 hi |= hi << 2;
1021 hi &= 0xf0f0f0f0;
1022 hi |= hi << 4;
1023 hi &= 0xff00ff00;
1024 hi |= hi << 8;
1025 hi &= 0xffff0000;
1026
1027 Ai = ((uint64_t)(hi | lo) << 32) | (t1 | t0);
1028 }
1029
1030 return Ai;
1031 }
1032
BitDeinterleave(uint64_t Ai)1033 static uint64_t BitDeinterleave(uint64_t Ai)
1034 {
1035 if (BIT_INTERLEAVE) {
1036 uint32_t hi = (uint32_t)(Ai >> 32), lo = (uint32_t)Ai;
1037 uint32_t t0, t1;
1038
1039 t0 = lo & 0x0000ffff;
1040 t0 |= t0 << 8;
1041 t0 &= 0x00ff00ff;
1042 t0 |= t0 << 4;
1043 t0 &= 0x0f0f0f0f;
1044 t0 |= t0 << 2;
1045 t0 &= 0x33333333;
1046 t0 |= t0 << 1;
1047 t0 &= 0x55555555;
1048
1049 t1 = hi << 16;
1050 t1 |= t1 >> 8;
1051 t1 &= 0xff00ff00;
1052 t1 |= t1 >> 4;
1053 t1 &= 0xf0f0f0f0;
1054 t1 |= t1 >> 2;
1055 t1 &= 0xcccccccc;
1056 t1 |= t1 >> 1;
1057 t1 &= 0xaaaaaaaa;
1058
1059 lo >>= 16;
1060 lo |= lo << 8;
1061 lo &= 0x00ff00ff;
1062 lo |= lo << 4;
1063 lo &= 0x0f0f0f0f;
1064 lo |= lo << 2;
1065 lo &= 0x33333333;
1066 lo |= lo << 1;
1067 lo &= 0x55555555;
1068
1069 hi &= 0xffff0000;
1070 hi |= hi >> 8;
1071 hi &= 0xff00ff00;
1072 hi |= hi >> 4;
1073 hi &= 0xf0f0f0f0;
1074 hi |= hi >> 2;
1075 hi &= 0xcccccccc;
1076 hi |= hi >> 1;
1077 hi &= 0xaaaaaaaa;
1078
1079 Ai = ((uint64_t)(hi | lo) << 32) | (t1 | t0);
1080 }
1081
1082 return Ai;
1083 }
1084
1085 /*
1086 * SHA3_absorb can be called multiple times, but at each invocation
1087 * largest multiple of |r| out of |len| bytes are processed. Then
1088 * remaining amount of bytes is returned. This is done to spare caller
1089 * trouble of calculating the largest multiple of |r|. |r| can be viewed
1090 * as blocksize. It is commonly (1600 - 256*n)/8, e.g. 168, 136, 104,
1091 * 72, but can also be (1600 - 448)/8 = 144. All this means that message
1092 * padding and intermediate sub-block buffering, byte- or bitwise, is
1093 * caller's responsibility.
1094 */
SHA3_absorb(uint64_t A[5][5],const unsigned char * inp,size_t len,size_t r)1095 size_t SHA3_absorb(uint64_t A[5][5], const unsigned char *inp, size_t len,
1096 size_t r)
1097 {
1098 uint64_t *A_flat = (uint64_t *)A;
1099 size_t i, w = r / 8;
1100
1101 assert(r < (25 * sizeof(A[0][0])) && (r % 8) == 0);
1102
1103 while (len >= r) {
1104 for (i = 0; i < w; i++) {
1105 uint64_t Ai = (uint64_t)inp[0] | (uint64_t)inp[1] << 8 | (uint64_t)inp[2] << 16 | (uint64_t)inp[3] << 24 | (uint64_t)inp[4] << 32 | (uint64_t)inp[5] << 40 | (uint64_t)inp[6] << 48 | (uint64_t)inp[7] << 56;
1106 inp += 8;
1107
1108 A_flat[i] ^= BitInterleave(Ai);
1109 }
1110 KeccakF1600(A);
1111 len -= r;
1112 }
1113
1114 return len;
1115 }
1116
1117 /*
1118 * SHA3_squeeze may be called after SHA3_absorb to generate |out| hash value of
1119 * |len| bytes.
1120 * If multiple SHA3_squeeze calls are required the output length |len| must be a
1121 * multiple of the blocksize, with |next| being 0 on the first call and 1 on
1122 * subsequent calls. It is the callers responsibility to buffer the results.
1123 * When only a single call to SHA3_squeeze is required, |len| can be any size
1124 * and |next| must be 0.
1125 */
SHA3_squeeze(uint64_t A[5][5],unsigned char * out,size_t len,size_t r,int next)1126 void SHA3_squeeze(uint64_t A[5][5], unsigned char *out, size_t len, size_t r,
1127 int next)
1128 {
1129 uint64_t *A_flat = (uint64_t *)A;
1130 size_t i, w = r / 8;
1131
1132 assert(r < (25 * sizeof(A[0][0])) && (r % 8) == 0);
1133
1134 while (len != 0) {
1135 if (next)
1136 KeccakF1600(A);
1137 next = 1;
1138 for (i = 0; i < w && len != 0; i++) {
1139 uint64_t Ai = BitDeinterleave(A_flat[i]);
1140
1141 if (len < 8) {
1142 for (i = 0; i < len; i++) {
1143 *out++ = (unsigned char)Ai;
1144 Ai >>= 8;
1145 }
1146 return;
1147 }
1148
1149 out[0] = (unsigned char)(Ai);
1150 out[1] = (unsigned char)(Ai >> 8);
1151 out[2] = (unsigned char)(Ai >> 16);
1152 out[3] = (unsigned char)(Ai >> 24);
1153 out[4] = (unsigned char)(Ai >> 32);
1154 out[5] = (unsigned char)(Ai >> 40);
1155 out[6] = (unsigned char)(Ai >> 48);
1156 out[7] = (unsigned char)(Ai >> 56);
1157 out += 8;
1158 len -= 8;
1159 }
1160 }
1161 }
1162 #endif
1163
1164 #ifdef SELFTEST
1165 /*
1166 * Post-padding one-shot implementations would look as following:
1167 *
1168 * SHA3_224 SHA3_sponge(inp, len, out, 224/8, (1600-448)/8);
1169 * SHA3_256 SHA3_sponge(inp, len, out, 256/8, (1600-512)/8);
1170 * SHA3_384 SHA3_sponge(inp, len, out, 384/8, (1600-768)/8);
1171 * SHA3_512 SHA3_sponge(inp, len, out, 512/8, (1600-1024)/8);
1172 * SHAKE_128 SHA3_sponge(inp, len, out, d, (1600-256)/8);
1173 * SHAKE_256 SHA3_sponge(inp, len, out, d, (1600-512)/8);
1174 */
1175
SHA3_sponge(const unsigned char * inp,size_t len,unsigned char * out,size_t d,size_t r)1176 void SHA3_sponge(const unsigned char *inp, size_t len,
1177 unsigned char *out, size_t d, size_t r)
1178 {
1179 uint64_t A[5][5];
1180
1181 memset(A, 0, sizeof(A));
1182 SHA3_absorb(A, inp, len, r);
1183 SHA3_squeeze(A, out, d, r);
1184 }
1185
1186 #include <stdio.h>
1187
main(void)1188 int main(void)
1189 {
1190 /*
1191 * This is 5-bit SHAKE128 test from http://csrc.nist.gov/groups/ST/toolkit/examples.html#aHashing
1192 */
1193 unsigned char test[168] = { '\xf3', '\x3' };
1194 unsigned char out[512];
1195 size_t i;
1196 static const unsigned char result[512] = {
1197 0x2E, 0x0A, 0xBF, 0xBA, 0x83, 0xE6, 0x72, 0x0B,
1198 0xFB, 0xC2, 0x25, 0xFF, 0x6B, 0x7A, 0xB9, 0xFF,
1199 0xCE, 0x58, 0xBA, 0x02, 0x7E, 0xE3, 0xD8, 0x98,
1200 0x76, 0x4F, 0xEF, 0x28, 0x7D, 0xDE, 0xCC, 0xCA,
1201 0x3E, 0x6E, 0x59, 0x98, 0x41, 0x1E, 0x7D, 0xDB,
1202 0x32, 0xF6, 0x75, 0x38, 0xF5, 0x00, 0xB1, 0x8C,
1203 0x8C, 0x97, 0xC4, 0x52, 0xC3, 0x70, 0xEA, 0x2C,
1204 0xF0, 0xAF, 0xCA, 0x3E, 0x05, 0xDE, 0x7E, 0x4D,
1205 0xE2, 0x7F, 0xA4, 0x41, 0xA9, 0xCB, 0x34, 0xFD,
1206 0x17, 0xC9, 0x78, 0xB4, 0x2D, 0x5B, 0x7E, 0x7F,
1207 0x9A, 0xB1, 0x8F, 0xFE, 0xFF, 0xC3, 0xC5, 0xAC,
1208 0x2F, 0x3A, 0x45, 0x5E, 0xEB, 0xFD, 0xC7, 0x6C,
1209 0xEA, 0xEB, 0x0A, 0x2C, 0xCA, 0x22, 0xEE, 0xF6,
1210 0xE6, 0x37, 0xF4, 0xCA, 0xBE, 0x5C, 0x51, 0xDE,
1211 0xD2, 0xE3, 0xFA, 0xD8, 0xB9, 0x52, 0x70, 0xA3,
1212 0x21, 0x84, 0x56, 0x64, 0xF1, 0x07, 0xD1, 0x64,
1213 0x96, 0xBB, 0x7A, 0xBF, 0xBE, 0x75, 0x04, 0xB6,
1214 0xED, 0xE2, 0xE8, 0x9E, 0x4B, 0x99, 0x6F, 0xB5,
1215 0x8E, 0xFD, 0xC4, 0x18, 0x1F, 0x91, 0x63, 0x38,
1216 0x1C, 0xBE, 0x7B, 0xC0, 0x06, 0xA7, 0xA2, 0x05,
1217 0x98, 0x9C, 0x52, 0x6C, 0xD1, 0xBD, 0x68, 0x98,
1218 0x36, 0x93, 0xB4, 0xBD, 0xC5, 0x37, 0x28, 0xB2,
1219 0x41, 0xC1, 0xCF, 0xF4, 0x2B, 0xB6, 0x11, 0x50,
1220 0x2C, 0x35, 0x20, 0x5C, 0xAB, 0xB2, 0x88, 0x75,
1221 0x56, 0x55, 0xD6, 0x20, 0xC6, 0x79, 0x94, 0xF0,
1222 0x64, 0x51, 0x18, 0x7F, 0x6F, 0xD1, 0x7E, 0x04,
1223 0x66, 0x82, 0xBA, 0x12, 0x86, 0x06, 0x3F, 0xF8,
1224 0x8F, 0xE2, 0x50, 0x8D, 0x1F, 0xCA, 0xF9, 0x03,
1225 0x5A, 0x12, 0x31, 0xAD, 0x41, 0x50, 0xA9, 0xC9,
1226 0xB2, 0x4C, 0x9B, 0x2D, 0x66, 0xB2, 0xAD, 0x1B,
1227 0xDE, 0x0B, 0xD0, 0xBB, 0xCB, 0x8B, 0xE0, 0x5B,
1228 0x83, 0x52, 0x29, 0xEF, 0x79, 0x19, 0x73, 0x73,
1229 0x23, 0x42, 0x44, 0x01, 0xE1, 0xD8, 0x37, 0xB6,
1230 0x6E, 0xB4, 0xE6, 0x30, 0xFF, 0x1D, 0xE7, 0x0C,
1231 0xB3, 0x17, 0xC2, 0xBA, 0xCB, 0x08, 0x00, 0x1D,
1232 0x34, 0x77, 0xB7, 0xA7, 0x0A, 0x57, 0x6D, 0x20,
1233 0x86, 0x90, 0x33, 0x58, 0x9D, 0x85, 0xA0, 0x1D,
1234 0xDB, 0x2B, 0x66, 0x46, 0xC0, 0x43, 0xB5, 0x9F,
1235 0xC0, 0x11, 0x31, 0x1D, 0xA6, 0x66, 0xFA, 0x5A,
1236 0xD1, 0xD6, 0x38, 0x7F, 0xA9, 0xBC, 0x40, 0x15,
1237 0xA3, 0x8A, 0x51, 0xD1, 0xDA, 0x1E, 0xA6, 0x1D,
1238 0x64, 0x8D, 0xC8, 0xE3, 0x9A, 0x88, 0xB9, 0xD6,
1239 0x22, 0xBD, 0xE2, 0x07, 0xFD, 0xAB, 0xC6, 0xF2,
1240 0x82, 0x7A, 0x88, 0x0C, 0x33, 0x0B, 0xBF, 0x6D,
1241 0xF7, 0x33, 0x77, 0x4B, 0x65, 0x3E, 0x57, 0x30,
1242 0x5D, 0x78, 0xDC, 0xE1, 0x12, 0xF1, 0x0A, 0x2C,
1243 0x71, 0xF4, 0xCD, 0xAD, 0x92, 0xED, 0x11, 0x3E,
1244 0x1C, 0xEA, 0x63, 0xB9, 0x19, 0x25, 0xED, 0x28,
1245 0x19, 0x1E, 0x6D, 0xBB, 0xB5, 0xAA, 0x5A, 0x2A,
1246 0xFD, 0xA5, 0x1F, 0xC0, 0x5A, 0x3A, 0xF5, 0x25,
1247 0x8B, 0x87, 0x66, 0x52, 0x43, 0x55, 0x0F, 0x28,
1248 0x94, 0x8A, 0xE2, 0xB8, 0xBE, 0xB6, 0xBC, 0x9C,
1249 0x77, 0x0B, 0x35, 0xF0, 0x67, 0xEA, 0xA6, 0x41,
1250 0xEF, 0xE6, 0x5B, 0x1A, 0x44, 0x90, 0x9D, 0x1B,
1251 0x14, 0x9F, 0x97, 0xEE, 0xA6, 0x01, 0x39, 0x1C,
1252 0x60, 0x9E, 0xC8, 0x1D, 0x19, 0x30, 0xF5, 0x7C,
1253 0x18, 0xA4, 0xE0, 0xFA, 0xB4, 0x91, 0xD1, 0xCA,
1254 0xDF, 0xD5, 0x04, 0x83, 0x44, 0x9E, 0xDC, 0x0F,
1255 0x07, 0xFF, 0xB2, 0x4D, 0x2C, 0x6F, 0x9A, 0x9A,
1256 0x3B, 0xFF, 0x39, 0xAE, 0x3D, 0x57, 0xF5, 0x60,
1257 0x65, 0x4D, 0x7D, 0x75, 0xC9, 0x08, 0xAB, 0xE6,
1258 0x25, 0x64, 0x75, 0x3E, 0xAC, 0x39, 0xD7, 0x50,
1259 0x3D, 0xA6, 0xD3, 0x7C, 0x2E, 0x32, 0xE1, 0xAF,
1260 0x3B, 0x8A, 0xEC, 0x8A, 0xE3, 0x06, 0x9C, 0xD9
1261 };
1262
1263 test[167] = '\x80';
1264 SHA3_sponge(test, sizeof(test), out, sizeof(out), sizeof(test));
1265
1266 /*
1267 * Rationale behind keeping output [formatted as below] is that
1268 * one should be able to redirect it to a file, then copy-n-paste
1269 * final "output val" from official example to another file, and
1270 * compare the two with diff(1).
1271 */
1272 for (i = 0; i < sizeof(out);) {
1273 printf("%02X", out[i]);
1274 printf(++i % 16 && i != sizeof(out) ? " " : "\n");
1275 }
1276
1277 if (memcmp(out, result, sizeof(out))) {
1278 fprintf(stderr, "failure\n");
1279 return 1;
1280 } else {
1281 fprintf(stderr, "success\n");
1282 return 0;
1283 }
1284 }
1285 #endif
1286