xref: /freebsd/crypto/openssl/crypto/sha/keccak1600.c (revision f25b8c9fb4f58cf61adb47d7570abe7caa6d385d)
1 /*
2  * Copyright 2016-2024 The OpenSSL Project Authors. All Rights Reserved.
3  *
4  * Licensed under the Apache License 2.0 (the "License").  You may not use
5  * this file except in compliance with the License.  You can obtain a copy
6  * in the file LICENSE in the source distribution or at
7  * https://www.openssl.org/source/license.html
8  */
9 
10 #include <openssl/e_os2.h>
11 #include <string.h>
12 #include <assert.h>
13 
14 #include "internal/nelem.h"
15 
16 size_t SHA3_absorb(uint64_t A[5][5], const unsigned char *inp, size_t len,
17     size_t r);
18 void SHA3_squeeze(uint64_t A[5][5], unsigned char *out, size_t len, size_t r, int next);
19 
20 #if !defined(KECCAK1600_ASM) || !defined(SELFTEST)
21 
22 /*
23  * Choose some sensible defaults
24  */
25 #if !defined(KECCAK_REF) && !defined(KECCAK_1X) && !defined(KECCAK_1X_ALT) && !defined(KECCAK_2X) && !defined(KECCAK_INPLACE)
26 #define KECCAK_2X /* default to KECCAK_2X variant */
27 #endif
28 
29 #if defined(__i386) || defined(__i386__) || defined(_M_IX86) || (defined(__x86_64) && !defined(__BMI__)) || defined(_M_X64) || defined(__mips) || defined(__riscv) || defined(__s390__) || defined(__EMSCRIPTEN__)
30 /*
31  * These don't have "and with complement" instruction, so minimize amount
32  * of "not"-s. Implemented only in the [default] KECCAK_2X variant.
33  */
34 #define KECCAK_COMPLEMENTING_TRANSFORM
35 #endif
36 
37 #if defined(__x86_64__) || defined(__aarch64__) || defined(__mips64) || defined(__ia64) || (defined(__VMS) && !defined(__vax))
38 /*
39  * These are available even in ILP32 flavours, but even then they are
40  * capable of performing 64-bit operations as efficiently as in *P64.
41  * Since it's not given that we can use sizeof(void *), just shunt it.
42  */
43 #define BIT_INTERLEAVE (0)
44 #else
45 #define BIT_INTERLEAVE (sizeof(void *) < 8)
46 #endif
47 
48 #define ROL32(a, offset) (((a) << (offset)) | ((a) >> ((32 - (offset)) & 31)))
49 
ROL64(uint64_t val,int offset)50 static uint64_t ROL64(uint64_t val, int offset)
51 {
52     if (offset == 0) {
53         return val;
54     } else if (!BIT_INTERLEAVE) {
55         return (val << offset) | (val >> (64 - offset));
56     } else {
57         uint32_t hi = (uint32_t)(val >> 32), lo = (uint32_t)val;
58 
59         if (offset & 1) {
60             uint32_t tmp = hi;
61 
62             offset >>= 1;
63             hi = ROL32(lo, offset);
64             lo = ROL32(tmp, offset + 1);
65         } else {
66             offset >>= 1;
67             lo = ROL32(lo, offset);
68             hi = ROL32(hi, offset);
69         }
70 
71         return ((uint64_t)hi << 32) | lo;
72     }
73 }
74 
75 static const unsigned char rhotates[5][5] = {
76     { 0, 1, 62, 28, 27 },
77     { 36, 44, 6, 55, 20 },
78     { 3, 10, 43, 25, 39 },
79     { 41, 45, 15, 21, 8 },
80     { 18, 2, 61, 56, 14 }
81 };
82 
83 static const uint64_t iotas[] = {
84     BIT_INTERLEAVE ? 0x0000000000000001ULL : 0x0000000000000001ULL,
85     BIT_INTERLEAVE ? 0x0000008900000000ULL : 0x0000000000008082ULL,
86     BIT_INTERLEAVE ? 0x8000008b00000000ULL : 0x800000000000808aULL,
87     BIT_INTERLEAVE ? 0x8000808000000000ULL : 0x8000000080008000ULL,
88     BIT_INTERLEAVE ? 0x0000008b00000001ULL : 0x000000000000808bULL,
89     BIT_INTERLEAVE ? 0x0000800000000001ULL : 0x0000000080000001ULL,
90     BIT_INTERLEAVE ? 0x8000808800000001ULL : 0x8000000080008081ULL,
91     BIT_INTERLEAVE ? 0x8000008200000001ULL : 0x8000000000008009ULL,
92     BIT_INTERLEAVE ? 0x0000000b00000000ULL : 0x000000000000008aULL,
93     BIT_INTERLEAVE ? 0x0000000a00000000ULL : 0x0000000000000088ULL,
94     BIT_INTERLEAVE ? 0x0000808200000001ULL : 0x0000000080008009ULL,
95     BIT_INTERLEAVE ? 0x0000800300000000ULL : 0x000000008000000aULL,
96     BIT_INTERLEAVE ? 0x0000808b00000001ULL : 0x000000008000808bULL,
97     BIT_INTERLEAVE ? 0x8000000b00000001ULL : 0x800000000000008bULL,
98     BIT_INTERLEAVE ? 0x8000008a00000001ULL : 0x8000000000008089ULL,
99     BIT_INTERLEAVE ? 0x8000008100000001ULL : 0x8000000000008003ULL,
100     BIT_INTERLEAVE ? 0x8000008100000000ULL : 0x8000000000008002ULL,
101     BIT_INTERLEAVE ? 0x8000000800000000ULL : 0x8000000000000080ULL,
102     BIT_INTERLEAVE ? 0x0000008300000000ULL : 0x000000000000800aULL,
103     BIT_INTERLEAVE ? 0x8000800300000000ULL : 0x800000008000000aULL,
104     BIT_INTERLEAVE ? 0x8000808800000001ULL : 0x8000000080008081ULL,
105     BIT_INTERLEAVE ? 0x8000008800000000ULL : 0x8000000000008080ULL,
106     BIT_INTERLEAVE ? 0x0000800000000001ULL : 0x0000000080000001ULL,
107     BIT_INTERLEAVE ? 0x8000808200000000ULL : 0x8000000080008008ULL
108 };
109 
110 #if defined(KECCAK_REF)
111 /*
112  * This is straightforward or "maximum clarity" implementation aiming
113  * to resemble section 3.2 of the FIPS PUB 202 "SHA-3 Standard:
114  * Permutation-Based Hash and Extendible-Output Functions" as much as
115  * possible. With one caveat. Because of the way C stores matrices,
116  * references to A[x,y] in the specification are presented as A[y][x].
117  * Implementation unrolls inner x-loops so that modulo 5 operations are
118  * explicitly pre-computed.
119  */
Theta(uint64_t A[5][5])120 static void Theta(uint64_t A[5][5])
121 {
122     uint64_t C[5], D[5];
123     size_t y;
124 
125     C[0] = A[0][0];
126     C[1] = A[0][1];
127     C[2] = A[0][2];
128     C[3] = A[0][3];
129     C[4] = A[0][4];
130 
131     for (y = 1; y < 5; y++) {
132         C[0] ^= A[y][0];
133         C[1] ^= A[y][1];
134         C[2] ^= A[y][2];
135         C[3] ^= A[y][3];
136         C[4] ^= A[y][4];
137     }
138 
139     D[0] = ROL64(C[1], 1) ^ C[4];
140     D[1] = ROL64(C[2], 1) ^ C[0];
141     D[2] = ROL64(C[3], 1) ^ C[1];
142     D[3] = ROL64(C[4], 1) ^ C[2];
143     D[4] = ROL64(C[0], 1) ^ C[3];
144 
145     for (y = 0; y < 5; y++) {
146         A[y][0] ^= D[0];
147         A[y][1] ^= D[1];
148         A[y][2] ^= D[2];
149         A[y][3] ^= D[3];
150         A[y][4] ^= D[4];
151     }
152 }
153 
Rho(uint64_t A[5][5])154 static void Rho(uint64_t A[5][5])
155 {
156     size_t y;
157 
158     for (y = 0; y < 5; y++) {
159         A[y][0] = ROL64(A[y][0], rhotates[y][0]);
160         A[y][1] = ROL64(A[y][1], rhotates[y][1]);
161         A[y][2] = ROL64(A[y][2], rhotates[y][2]);
162         A[y][3] = ROL64(A[y][3], rhotates[y][3]);
163         A[y][4] = ROL64(A[y][4], rhotates[y][4]);
164     }
165 }
166 
Pi(uint64_t A[5][5])167 static void Pi(uint64_t A[5][5])
168 {
169     uint64_t T[5][5];
170 
171     /*
172      * T = A
173      * A[y][x] = T[x][(3*y+x)%5]
174      */
175     memcpy(T, A, sizeof(T));
176 
177     A[0][0] = T[0][0];
178     A[0][1] = T[1][1];
179     A[0][2] = T[2][2];
180     A[0][3] = T[3][3];
181     A[0][4] = T[4][4];
182 
183     A[1][0] = T[0][3];
184     A[1][1] = T[1][4];
185     A[1][2] = T[2][0];
186     A[1][3] = T[3][1];
187     A[1][4] = T[4][2];
188 
189     A[2][0] = T[0][1];
190     A[2][1] = T[1][2];
191     A[2][2] = T[2][3];
192     A[2][3] = T[3][4];
193     A[2][4] = T[4][0];
194 
195     A[3][0] = T[0][4];
196     A[3][1] = T[1][0];
197     A[3][2] = T[2][1];
198     A[3][3] = T[3][2];
199     A[3][4] = T[4][3];
200 
201     A[4][0] = T[0][2];
202     A[4][1] = T[1][3];
203     A[4][2] = T[2][4];
204     A[4][3] = T[3][0];
205     A[4][4] = T[4][1];
206 }
207 
Chi(uint64_t A[5][5])208 static void Chi(uint64_t A[5][5])
209 {
210     uint64_t C[5];
211     size_t y;
212 
213     for (y = 0; y < 5; y++) {
214         C[0] = A[y][0] ^ (~A[y][1] & A[y][2]);
215         C[1] = A[y][1] ^ (~A[y][2] & A[y][3]);
216         C[2] = A[y][2] ^ (~A[y][3] & A[y][4]);
217         C[3] = A[y][3] ^ (~A[y][4] & A[y][0]);
218         C[4] = A[y][4] ^ (~A[y][0] & A[y][1]);
219 
220         A[y][0] = C[0];
221         A[y][1] = C[1];
222         A[y][2] = C[2];
223         A[y][3] = C[3];
224         A[y][4] = C[4];
225     }
226 }
227 
Iota(uint64_t A[5][5],size_t i)228 static void Iota(uint64_t A[5][5], size_t i)
229 {
230     assert(i < OSSL_NELEM(iotas));
231     A[0][0] ^= iotas[i];
232 }
233 
KeccakF1600(uint64_t A[5][5])234 static void KeccakF1600(uint64_t A[5][5])
235 {
236     size_t i;
237 
238     for (i = 0; i < 24; i++) {
239         Theta(A);
240         Rho(A);
241         Pi(A);
242         Chi(A);
243         Iota(A, i);
244     }
245 }
246 
247 #elif defined(KECCAK_1X)
248 /*
249  * This implementation is optimization of above code featuring unroll
250  * of even y-loops, their fusion and code motion. It also minimizes
251  * temporary storage. Compiler would normally do all these things for
252  * you, purpose of manual optimization is to provide "unobscured"
253  * reference for assembly implementation [in case this approach is
254  * chosen for implementation on some platform]. In the nutshell it's
255  * equivalent of "plane-per-plane processing" approach discussed in
256  * section 2.4 of "Keccak implementation overview".
257  */
Round(uint64_t A[5][5],size_t i)258 static void Round(uint64_t A[5][5], size_t i)
259 {
260     uint64_t C[5], E[2]; /* registers */
261     uint64_t D[5], T[2][5]; /* memory    */
262 
263     assert(i < OSSL_NELEM(iotas));
264 
265     C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
266     C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
267     C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
268     C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
269     C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];
270 
271 #if defined(__arm__)
272     D[1] = E[0] = ROL64(C[2], 1) ^ C[0];
273     D[4] = E[1] = ROL64(C[0], 1) ^ C[3];
274     D[0] = C[0] = ROL64(C[1], 1) ^ C[4];
275     D[2] = C[1] = ROL64(C[3], 1) ^ C[1];
276     D[3] = C[2] = ROL64(C[4], 1) ^ C[2];
277 
278     T[0][0] = A[3][0] ^ C[0]; /* borrow T[0][0] */
279     T[0][1] = A[0][1] ^ E[0]; /* D[1] */
280     T[0][2] = A[0][2] ^ C[1]; /* D[2] */
281     T[0][3] = A[0][3] ^ C[2]; /* D[3] */
282     T[0][4] = A[0][4] ^ E[1]; /* D[4] */
283 
284     C[3] = ROL64(A[3][3] ^ C[2], rhotates[3][3]); /* D[3] */
285     C[4] = ROL64(A[4][4] ^ E[1], rhotates[4][4]); /* D[4] */
286     C[0] = A[0][0] ^ C[0]; /* rotate by 0 */ /* D[0] */
287     C[2] = ROL64(A[2][2] ^ C[1], rhotates[2][2]); /* D[2] */
288     C[1] = ROL64(A[1][1] ^ E[0], rhotates[1][1]); /* D[1] */
289 #else
290     D[0] = ROL64(C[1], 1) ^ C[4];
291     D[1] = ROL64(C[2], 1) ^ C[0];
292     D[2] = ROL64(C[3], 1) ^ C[1];
293     D[3] = ROL64(C[4], 1) ^ C[2];
294     D[4] = ROL64(C[0], 1) ^ C[3];
295 
296     T[0][0] = A[3][0] ^ D[0]; /* borrow T[0][0] */
297     T[0][1] = A[0][1] ^ D[1];
298     T[0][2] = A[0][2] ^ D[2];
299     T[0][3] = A[0][3] ^ D[3];
300     T[0][4] = A[0][4] ^ D[4];
301 
302     C[0] = A[0][0] ^ D[0]; /* rotate by 0 */
303     C[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]);
304     C[2] = ROL64(A[2][2] ^ D[2], rhotates[2][2]);
305     C[3] = ROL64(A[3][3] ^ D[3], rhotates[3][3]);
306     C[4] = ROL64(A[4][4] ^ D[4], rhotates[4][4]);
307 #endif
308     A[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
309     A[0][1] = C[1] ^ (~C[2] & C[3]);
310     A[0][2] = C[2] ^ (~C[3] & C[4]);
311     A[0][3] = C[3] ^ (~C[4] & C[0]);
312     A[0][4] = C[4] ^ (~C[0] & C[1]);
313 
314     T[1][0] = A[1][0] ^ (C[3] = D[0]);
315     T[1][1] = A[2][1] ^ (C[4] = D[1]); /* borrow T[1][1] */
316     T[1][2] = A[1][2] ^ (E[0] = D[2]);
317     T[1][3] = A[1][3] ^ (E[1] = D[3]);
318     T[1][4] = A[2][4] ^ (C[2] = D[4]); /* borrow T[1][4] */
319 
320     C[0] = ROL64(T[0][3], rhotates[0][3]);
321     C[1] = ROL64(A[1][4] ^ C[2], rhotates[1][4]); /* D[4] */
322     C[2] = ROL64(A[2][0] ^ C[3], rhotates[2][0]); /* D[0] */
323     C[3] = ROL64(A[3][1] ^ C[4], rhotates[3][1]); /* D[1] */
324     C[4] = ROL64(A[4][2] ^ E[0], rhotates[4][2]); /* D[2] */
325 
326     A[1][0] = C[0] ^ (~C[1] & C[2]);
327     A[1][1] = C[1] ^ (~C[2] & C[3]);
328     A[1][2] = C[2] ^ (~C[3] & C[4]);
329     A[1][3] = C[3] ^ (~C[4] & C[0]);
330     A[1][4] = C[4] ^ (~C[0] & C[1]);
331 
332     C[0] = ROL64(T[0][1], rhotates[0][1]);
333     C[1] = ROL64(T[1][2], rhotates[1][2]);
334     C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
335     C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
336     C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
337 
338     A[2][0] = C[0] ^ (~C[1] & C[2]);
339     A[2][1] = C[1] ^ (~C[2] & C[3]);
340     A[2][2] = C[2] ^ (~C[3] & C[4]);
341     A[2][3] = C[3] ^ (~C[4] & C[0]);
342     A[2][4] = C[4] ^ (~C[0] & C[1]);
343 
344     C[0] = ROL64(T[0][4], rhotates[0][4]);
345     C[1] = ROL64(T[1][0], rhotates[1][0]);
346     C[2] = ROL64(T[1][1], rhotates[2][1]); /* originally A[2][1] */
347     C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
348     C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
349 
350     A[3][0] = C[0] ^ (~C[1] & C[2]);
351     A[3][1] = C[1] ^ (~C[2] & C[3]);
352     A[3][2] = C[2] ^ (~C[3] & C[4]);
353     A[3][3] = C[3] ^ (~C[4] & C[0]);
354     A[3][4] = C[4] ^ (~C[0] & C[1]);
355 
356     C[0] = ROL64(T[0][2], rhotates[0][2]);
357     C[1] = ROL64(T[1][3], rhotates[1][3]);
358     C[2] = ROL64(T[1][4], rhotates[2][4]); /* originally A[2][4] */
359     C[3] = ROL64(T[0][0], rhotates[3][0]); /* originally A[3][0] */
360     C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
361 
362     A[4][0] = C[0] ^ (~C[1] & C[2]);
363     A[4][1] = C[1] ^ (~C[2] & C[3]);
364     A[4][2] = C[2] ^ (~C[3] & C[4]);
365     A[4][3] = C[3] ^ (~C[4] & C[0]);
366     A[4][4] = C[4] ^ (~C[0] & C[1]);
367 }
368 
KeccakF1600(uint64_t A[5][5])369 static void KeccakF1600(uint64_t A[5][5])
370 {
371     size_t i;
372 
373     for (i = 0; i < 24; i++) {
374         Round(A, i);
375     }
376 }
377 
378 #elif defined(KECCAK_1X_ALT)
379 /*
380  * This is variant of above KECCAK_1X that reduces requirement for
381  * temporary storage even further, but at cost of more updates to A[][].
382  * It's less suitable if A[][] is memory bound, but better if it's
383  * register bound.
384  */
385 
Round(uint64_t A[5][5],size_t i)386 static void Round(uint64_t A[5][5], size_t i)
387 {
388     uint64_t C[5], D[5];
389 
390     assert(i < OSSL_NELEM(iotas));
391 
392     C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
393     C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
394     C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
395     C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
396     C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];
397 
398     D[1] = C[0] ^ ROL64(C[2], 1);
399     D[2] = C[1] ^ ROL64(C[3], 1);
400     D[3] = C[2] ^= ROL64(C[4], 1);
401     D[4] = C[3] ^= ROL64(C[0], 1);
402     D[0] = C[4] ^= ROL64(C[1], 1);
403 
404     A[0][1] ^= D[1];
405     A[1][1] ^= D[1];
406     A[2][1] ^= D[1];
407     A[3][1] ^= D[1];
408     A[4][1] ^= D[1];
409 
410     A[0][2] ^= D[2];
411     A[1][2] ^= D[2];
412     A[2][2] ^= D[2];
413     A[3][2] ^= D[2];
414     A[4][2] ^= D[2];
415 
416     A[0][3] ^= C[2];
417     A[1][3] ^= C[2];
418     A[2][3] ^= C[2];
419     A[3][3] ^= C[2];
420     A[4][3] ^= C[2];
421 
422     A[0][4] ^= C[3];
423     A[1][4] ^= C[3];
424     A[2][4] ^= C[3];
425     A[3][4] ^= C[3];
426     A[4][4] ^= C[3];
427 
428     A[0][0] ^= C[4];
429     A[1][0] ^= C[4];
430     A[2][0] ^= C[4];
431     A[3][0] ^= C[4];
432     A[4][0] ^= C[4];
433 
434     C[1] = A[0][1];
435     C[2] = A[0][2];
436     C[3] = A[0][3];
437     C[4] = A[0][4];
438 
439     A[0][1] = ROL64(A[1][1], rhotates[1][1]);
440     A[0][2] = ROL64(A[2][2], rhotates[2][2]);
441     A[0][3] = ROL64(A[3][3], rhotates[3][3]);
442     A[0][4] = ROL64(A[4][4], rhotates[4][4]);
443 
444     A[1][1] = ROL64(A[1][4], rhotates[1][4]);
445     A[2][2] = ROL64(A[2][3], rhotates[2][3]);
446     A[3][3] = ROL64(A[3][2], rhotates[3][2]);
447     A[4][4] = ROL64(A[4][1], rhotates[4][1]);
448 
449     A[1][4] = ROL64(A[4][2], rhotates[4][2]);
450     A[2][3] = ROL64(A[3][4], rhotates[3][4]);
451     A[3][2] = ROL64(A[2][1], rhotates[2][1]);
452     A[4][1] = ROL64(A[1][3], rhotates[1][3]);
453 
454     A[4][2] = ROL64(A[2][4], rhotates[2][4]);
455     A[3][4] = ROL64(A[4][3], rhotates[4][3]);
456     A[2][1] = ROL64(A[1][2], rhotates[1][2]);
457     A[1][3] = ROL64(A[3][1], rhotates[3][1]);
458 
459     A[2][4] = ROL64(A[4][0], rhotates[4][0]);
460     A[4][3] = ROL64(A[3][0], rhotates[3][0]);
461     A[1][2] = ROL64(A[2][0], rhotates[2][0]);
462     A[3][1] = ROL64(A[1][0], rhotates[1][0]);
463 
464     A[1][0] = ROL64(C[3], rhotates[0][3]);
465     A[2][0] = ROL64(C[1], rhotates[0][1]);
466     A[3][0] = ROL64(C[4], rhotates[0][4]);
467     A[4][0] = ROL64(C[2], rhotates[0][2]);
468 
469     C[0] = A[0][0];
470     C[1] = A[1][0];
471     D[0] = A[0][1];
472     D[1] = A[1][1];
473 
474     A[0][0] ^= (~A[0][1] & A[0][2]);
475     A[1][0] ^= (~A[1][1] & A[1][2]);
476     A[0][1] ^= (~A[0][2] & A[0][3]);
477     A[1][1] ^= (~A[1][2] & A[1][3]);
478     A[0][2] ^= (~A[0][3] & A[0][4]);
479     A[1][2] ^= (~A[1][3] & A[1][4]);
480     A[0][3] ^= (~A[0][4] & C[0]);
481     A[1][3] ^= (~A[1][4] & C[1]);
482     A[0][4] ^= (~C[0] & D[0]);
483     A[1][4] ^= (~C[1] & D[1]);
484 
485     C[2] = A[2][0];
486     C[3] = A[3][0];
487     D[2] = A[2][1];
488     D[3] = A[3][1];
489 
490     A[2][0] ^= (~A[2][1] & A[2][2]);
491     A[3][0] ^= (~A[3][1] & A[3][2]);
492     A[2][1] ^= (~A[2][2] & A[2][3]);
493     A[3][1] ^= (~A[3][2] & A[3][3]);
494     A[2][2] ^= (~A[2][3] & A[2][4]);
495     A[3][2] ^= (~A[3][3] & A[3][4]);
496     A[2][3] ^= (~A[2][4] & C[2]);
497     A[3][3] ^= (~A[3][4] & C[3]);
498     A[2][4] ^= (~C[2] & D[2]);
499     A[3][4] ^= (~C[3] & D[3]);
500 
501     C[4] = A[4][0];
502     D[4] = A[4][1];
503 
504     A[4][0] ^= (~A[4][1] & A[4][2]);
505     A[4][1] ^= (~A[4][2] & A[4][3]);
506     A[4][2] ^= (~A[4][3] & A[4][4]);
507     A[4][3] ^= (~A[4][4] & C[4]);
508     A[4][4] ^= (~C[4] & D[4]);
509     A[0][0] ^= iotas[i];
510 }
511 
KeccakF1600(uint64_t A[5][5])512 static void KeccakF1600(uint64_t A[5][5])
513 {
514     size_t i;
515 
516     for (i = 0; i < 24; i++) {
517         Round(A, i);
518     }
519 }
520 
521 #elif defined(KECCAK_2X)
522 /*
523  * This implementation is variant of KECCAK_1X above with outer-most
524  * round loop unrolled twice. This allows to take temporary storage
525  * out of round procedure and simplify references to it by alternating
526  * it with actual data (see round loop below). Originally it was meant
527  * rather as reference for an assembly implementation, but it seems to
528  * play best with compilers [as well as provide best instruction per
529  * processed byte ratio at minimal round unroll factor]...
530  */
Round(uint64_t R[5][5],uint64_t A[5][5],size_t i)531 static void Round(uint64_t R[5][5], uint64_t A[5][5], size_t i)
532 {
533     uint64_t C[5], D[5];
534 
535     assert(i < OSSL_NELEM(iotas));
536 
537     C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
538     C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
539     C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
540     C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
541     C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];
542 
543     D[0] = ROL64(C[1], 1) ^ C[4];
544     D[1] = ROL64(C[2], 1) ^ C[0];
545     D[2] = ROL64(C[3], 1) ^ C[1];
546     D[3] = ROL64(C[4], 1) ^ C[2];
547     D[4] = ROL64(C[0], 1) ^ C[3];
548 
549     C[0] = A[0][0] ^ D[0]; /* rotate by 0 */
550     C[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]);
551     C[2] = ROL64(A[2][2] ^ D[2], rhotates[2][2]);
552     C[3] = ROL64(A[3][3] ^ D[3], rhotates[3][3]);
553     C[4] = ROL64(A[4][4] ^ D[4], rhotates[4][4]);
554 
555 #ifdef KECCAK_COMPLEMENTING_TRANSFORM
556     R[0][0] = C[0] ^ (C[1] | C[2]) ^ iotas[i];
557     R[0][1] = C[1] ^ (~C[2] | C[3]);
558     R[0][2] = C[2] ^ (C[3] & C[4]);
559     R[0][3] = C[3] ^ (C[4] | C[0]);
560     R[0][4] = C[4] ^ (C[0] & C[1]);
561 #else
562     R[0][0] = C[0] ^ (~C[1] & C[2]) ^ iotas[i];
563     R[0][1] = C[1] ^ (~C[2] & C[3]);
564     R[0][2] = C[2] ^ (~C[3] & C[4]);
565     R[0][3] = C[3] ^ (~C[4] & C[0]);
566     R[0][4] = C[4] ^ (~C[0] & C[1]);
567 #endif
568 
569     C[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
570     C[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
571     C[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
572     C[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
573     C[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
574 
575 #ifdef KECCAK_COMPLEMENTING_TRANSFORM
576     R[1][0] = C[0] ^ (C[1] | C[2]);
577     R[1][1] = C[1] ^ (C[2] & C[3]);
578     R[1][2] = C[2] ^ (C[3] | ~C[4]);
579     R[1][3] = C[3] ^ (C[4] | C[0]);
580     R[1][4] = C[4] ^ (C[0] & C[1]);
581 #else
582     R[1][0] = C[0] ^ (~C[1] & C[2]);
583     R[1][1] = C[1] ^ (~C[2] & C[3]);
584     R[1][2] = C[2] ^ (~C[3] & C[4]);
585     R[1][3] = C[3] ^ (~C[4] & C[0]);
586     R[1][4] = C[4] ^ (~C[0] & C[1]);
587 #endif
588 
589     C[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);
590     C[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
591     C[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
592     C[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
593     C[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
594 
595 #ifdef KECCAK_COMPLEMENTING_TRANSFORM
596     R[2][0] = C[0] ^ (C[1] | C[2]);
597     R[2][1] = C[1] ^ (C[2] & C[3]);
598     R[2][2] = C[2] ^ (~C[3] & C[4]);
599     R[2][3] = ~C[3] ^ (C[4] | C[0]);
600     R[2][4] = C[4] ^ (C[0] & C[1]);
601 #else
602     R[2][0] = C[0] ^ (~C[1] & C[2]);
603     R[2][1] = C[1] ^ (~C[2] & C[3]);
604     R[2][2] = C[2] ^ (~C[3] & C[4]);
605     R[2][3] = C[3] ^ (~C[4] & C[0]);
606     R[2][4] = C[4] ^ (~C[0] & C[1]);
607 #endif
608 
609     C[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
610     C[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
611     C[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
612     C[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
613     C[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
614 
615 #ifdef KECCAK_COMPLEMENTING_TRANSFORM
616     R[3][0] = C[0] ^ (C[1] & C[2]);
617     R[3][1] = C[1] ^ (C[2] | C[3]);
618     R[3][2] = C[2] ^ (~C[3] | C[4]);
619     R[3][3] = ~C[3] ^ (C[4] & C[0]);
620     R[3][4] = C[4] ^ (C[0] | C[1]);
621 #else
622     R[3][0] = C[0] ^ (~C[1] & C[2]);
623     R[3][1] = C[1] ^ (~C[2] & C[3]);
624     R[3][2] = C[2] ^ (~C[3] & C[4]);
625     R[3][3] = C[3] ^ (~C[4] & C[0]);
626     R[3][4] = C[4] ^ (~C[0] & C[1]);
627 #endif
628 
629     C[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
630     C[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
631     C[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
632     C[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
633     C[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
634 
635 #ifdef KECCAK_COMPLEMENTING_TRANSFORM
636     R[4][0] = C[0] ^ (~C[1] & C[2]);
637     R[4][1] = ~C[1] ^ (C[2] | C[3]);
638     R[4][2] = C[2] ^ (C[3] & C[4]);
639     R[4][3] = C[3] ^ (C[4] | C[0]);
640     R[4][4] = C[4] ^ (C[0] & C[1]);
641 #else
642     R[4][0] = C[0] ^ (~C[1] & C[2]);
643     R[4][1] = C[1] ^ (~C[2] & C[3]);
644     R[4][2] = C[2] ^ (~C[3] & C[4]);
645     R[4][3] = C[3] ^ (~C[4] & C[0]);
646     R[4][4] = C[4] ^ (~C[0] & C[1]);
647 #endif
648 }
649 
KeccakF1600(uint64_t A[5][5])650 static void KeccakF1600(uint64_t A[5][5])
651 {
652     uint64_t T[5][5];
653     size_t i;
654 
655 #ifdef KECCAK_COMPLEMENTING_TRANSFORM
656     A[0][1] = ~A[0][1];
657     A[0][2] = ~A[0][2];
658     A[1][3] = ~A[1][3];
659     A[2][2] = ~A[2][2];
660     A[3][2] = ~A[3][2];
661     A[4][0] = ~A[4][0];
662 #endif
663 
664     for (i = 0; i < 24; i += 2) {
665         Round(T, A, i);
666         Round(A, T, i + 1);
667     }
668 
669 #ifdef KECCAK_COMPLEMENTING_TRANSFORM
670     A[0][1] = ~A[0][1];
671     A[0][2] = ~A[0][2];
672     A[1][3] = ~A[1][3];
673     A[2][2] = ~A[2][2];
674     A[3][2] = ~A[3][2];
675     A[4][0] = ~A[4][0];
676 #endif
677 }
678 
679 #else /* define KECCAK_INPLACE to compile this code path */
680 /*
681  * This implementation is KECCAK_1X from above combined 4 times with
682  * a twist that allows to omit temporary storage and perform in-place
683  * processing. It's discussed in section 2.5 of "Keccak implementation
684  * overview". It's likely to be best suited for processors with large
685  * register bank... On the other hand processor with large register
686  * bank can as well use KECCAK_1X_ALT, it would be as fast but much
687  * more compact...
688  */
FourRounds(uint64_t A[5][5],size_t i)689 static void FourRounds(uint64_t A[5][5], size_t i)
690 {
691     uint64_t B[5], C[5], D[5];
692 
693     assert(i <= OSSL_NELEM(iotas) - 4);
694 
695     /* Round 4*n */
696     C[0] = A[0][0] ^ A[1][0] ^ A[2][0] ^ A[3][0] ^ A[4][0];
697     C[1] = A[0][1] ^ A[1][1] ^ A[2][1] ^ A[3][1] ^ A[4][1];
698     C[2] = A[0][2] ^ A[1][2] ^ A[2][2] ^ A[3][2] ^ A[4][2];
699     C[3] = A[0][3] ^ A[1][3] ^ A[2][3] ^ A[3][3] ^ A[4][3];
700     C[4] = A[0][4] ^ A[1][4] ^ A[2][4] ^ A[3][4] ^ A[4][4];
701 
702     D[0] = ROL64(C[1], 1) ^ C[4];
703     D[1] = ROL64(C[2], 1) ^ C[0];
704     D[2] = ROL64(C[3], 1) ^ C[1];
705     D[3] = ROL64(C[4], 1) ^ C[2];
706     D[4] = ROL64(C[0], 1) ^ C[3];
707 
708     B[0] = A[0][0] ^ D[0]; /* rotate by 0 */
709     B[1] = ROL64(A[1][1] ^ D[1], rhotates[1][1]);
710     B[2] = ROL64(A[2][2] ^ D[2], rhotates[2][2]);
711     B[3] = ROL64(A[3][3] ^ D[3], rhotates[3][3]);
712     B[4] = ROL64(A[4][4] ^ D[4], rhotates[4][4]);
713 
714     C[0] = A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i];
715     C[1] = A[1][1] = B[1] ^ (~B[2] & B[3]);
716     C[2] = A[2][2] = B[2] ^ (~B[3] & B[4]);
717     C[3] = A[3][3] = B[3] ^ (~B[4] & B[0]);
718     C[4] = A[4][4] = B[4] ^ (~B[0] & B[1]);
719 
720     B[0] = ROL64(A[0][3] ^ D[3], rhotates[0][3]);
721     B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
722     B[2] = ROL64(A[2][0] ^ D[0], rhotates[2][0]);
723     B[3] = ROL64(A[3][1] ^ D[1], rhotates[3][1]);
724     B[4] = ROL64(A[4][2] ^ D[2], rhotates[4][2]);
725 
726     C[0] ^= A[2][0] = B[0] ^ (~B[1] & B[2]);
727     C[1] ^= A[3][1] = B[1] ^ (~B[2] & B[3]);
728     C[2] ^= A[4][2] = B[2] ^ (~B[3] & B[4]);
729     C[3] ^= A[0][3] = B[3] ^ (~B[4] & B[0]);
730     C[4] ^= A[1][4] = B[4] ^ (~B[0] & B[1]);
731 
732     B[0] = ROL64(A[0][1] ^ D[1], rhotates[0][1]);
733     B[1] = ROL64(A[1][2] ^ D[2], rhotates[1][2]);
734     B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
735     B[3] = ROL64(A[3][4] ^ D[4], rhotates[3][4]);
736     B[4] = ROL64(A[4][0] ^ D[0], rhotates[4][0]);
737 
738     C[0] ^= A[4][0] = B[0] ^ (~B[1] & B[2]);
739     C[1] ^= A[0][1] = B[1] ^ (~B[2] & B[3]);
740     C[2] ^= A[1][2] = B[2] ^ (~B[3] & B[4]);
741     C[3] ^= A[2][3] = B[3] ^ (~B[4] & B[0]);
742     C[4] ^= A[3][4] = B[4] ^ (~B[0] & B[1]);
743 
744     B[0] = ROL64(A[0][4] ^ D[4], rhotates[0][4]);
745     B[1] = ROL64(A[1][0] ^ D[0], rhotates[1][0]);
746     B[2] = ROL64(A[2][1] ^ D[1], rhotates[2][1]);
747     B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
748     B[4] = ROL64(A[4][3] ^ D[3], rhotates[4][3]);
749 
750     C[0] ^= A[1][0] = B[0] ^ (~B[1] & B[2]);
751     C[1] ^= A[2][1] = B[1] ^ (~B[2] & B[3]);
752     C[2] ^= A[3][2] = B[2] ^ (~B[3] & B[4]);
753     C[3] ^= A[4][3] = B[3] ^ (~B[4] & B[0]);
754     C[4] ^= A[0][4] = B[4] ^ (~B[0] & B[1]);
755 
756     B[0] = ROL64(A[0][2] ^ D[2], rhotates[0][2]);
757     B[1] = ROL64(A[1][3] ^ D[3], rhotates[1][3]);
758     B[2] = ROL64(A[2][4] ^ D[4], rhotates[2][4]);
759     B[3] = ROL64(A[3][0] ^ D[0], rhotates[3][0]);
760     B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
761 
762     C[0] ^= A[3][0] = B[0] ^ (~B[1] & B[2]);
763     C[1] ^= A[4][1] = B[1] ^ (~B[2] & B[3]);
764     C[2] ^= A[0][2] = B[2] ^ (~B[3] & B[4]);
765     C[3] ^= A[1][3] = B[3] ^ (~B[4] & B[0]);
766     C[4] ^= A[2][4] = B[4] ^ (~B[0] & B[1]);
767 
768     /* Round 4*n+1 */
769     D[0] = ROL64(C[1], 1) ^ C[4];
770     D[1] = ROL64(C[2], 1) ^ C[0];
771     D[2] = ROL64(C[3], 1) ^ C[1];
772     D[3] = ROL64(C[4], 1) ^ C[2];
773     D[4] = ROL64(C[0], 1) ^ C[3];
774 
775     B[0] = A[0][0] ^ D[0]; /* rotate by 0 */
776     B[1] = ROL64(A[3][1] ^ D[1], rhotates[1][1]);
777     B[2] = ROL64(A[1][2] ^ D[2], rhotates[2][2]);
778     B[3] = ROL64(A[4][3] ^ D[3], rhotates[3][3]);
779     B[4] = ROL64(A[2][4] ^ D[4], rhotates[4][4]);
780 
781     C[0] = A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i + 1];
782     C[1] = A[3][1] = B[1] ^ (~B[2] & B[3]);
783     C[2] = A[1][2] = B[2] ^ (~B[3] & B[4]);
784     C[3] = A[4][3] = B[3] ^ (~B[4] & B[0]);
785     C[4] = A[2][4] = B[4] ^ (~B[0] & B[1]);
786 
787     B[0] = ROL64(A[3][3] ^ D[3], rhotates[0][3]);
788     B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
789     B[2] = ROL64(A[4][0] ^ D[0], rhotates[2][0]);
790     B[3] = ROL64(A[2][1] ^ D[1], rhotates[3][1]);
791     B[4] = ROL64(A[0][2] ^ D[2], rhotates[4][2]);
792 
793     C[0] ^= A[4][0] = B[0] ^ (~B[1] & B[2]);
794     C[1] ^= A[2][1] = B[1] ^ (~B[2] & B[3]);
795     C[2] ^= A[0][2] = B[2] ^ (~B[3] & B[4]);
796     C[3] ^= A[3][3] = B[3] ^ (~B[4] & B[0]);
797     C[4] ^= A[1][4] = B[4] ^ (~B[0] & B[1]);
798 
799     B[0] = ROL64(A[1][1] ^ D[1], rhotates[0][1]);
800     B[1] = ROL64(A[4][2] ^ D[2], rhotates[1][2]);
801     B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
802     B[3] = ROL64(A[0][4] ^ D[4], rhotates[3][4]);
803     B[4] = ROL64(A[3][0] ^ D[0], rhotates[4][0]);
804 
805     C[0] ^= A[3][0] = B[0] ^ (~B[1] & B[2]);
806     C[1] ^= A[1][1] = B[1] ^ (~B[2] & B[3]);
807     C[2] ^= A[4][2] = B[2] ^ (~B[3] & B[4]);
808     C[3] ^= A[2][3] = B[3] ^ (~B[4] & B[0]);
809     C[4] ^= A[0][4] = B[4] ^ (~B[0] & B[1]);
810 
811     B[0] = ROL64(A[4][4] ^ D[4], rhotates[0][4]);
812     B[1] = ROL64(A[2][0] ^ D[0], rhotates[1][0]);
813     B[2] = ROL64(A[0][1] ^ D[1], rhotates[2][1]);
814     B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
815     B[4] = ROL64(A[1][3] ^ D[3], rhotates[4][3]);
816 
817     C[0] ^= A[2][0] = B[0] ^ (~B[1] & B[2]);
818     C[1] ^= A[0][1] = B[1] ^ (~B[2] & B[3]);
819     C[2] ^= A[3][2] = B[2] ^ (~B[3] & B[4]);
820     C[3] ^= A[1][3] = B[3] ^ (~B[4] & B[0]);
821     C[4] ^= A[4][4] = B[4] ^ (~B[0] & B[1]);
822 
823     B[0] = ROL64(A[2][2] ^ D[2], rhotates[0][2]);
824     B[1] = ROL64(A[0][3] ^ D[3], rhotates[1][3]);
825     B[2] = ROL64(A[3][4] ^ D[4], rhotates[2][4]);
826     B[3] = ROL64(A[1][0] ^ D[0], rhotates[3][0]);
827     B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
828 
829     C[0] ^= A[1][0] = B[0] ^ (~B[1] & B[2]);
830     C[1] ^= A[4][1] = B[1] ^ (~B[2] & B[3]);
831     C[2] ^= A[2][2] = B[2] ^ (~B[3] & B[4]);
832     C[3] ^= A[0][3] = B[3] ^ (~B[4] & B[0]);
833     C[4] ^= A[3][4] = B[4] ^ (~B[0] & B[1]);
834 
835     /* Round 4*n+2 */
836     D[0] = ROL64(C[1], 1) ^ C[4];
837     D[1] = ROL64(C[2], 1) ^ C[0];
838     D[2] = ROL64(C[3], 1) ^ C[1];
839     D[3] = ROL64(C[4], 1) ^ C[2];
840     D[4] = ROL64(C[0], 1) ^ C[3];
841 
842     B[0] = A[0][0] ^ D[0]; /* rotate by 0 */
843     B[1] = ROL64(A[2][1] ^ D[1], rhotates[1][1]);
844     B[2] = ROL64(A[4][2] ^ D[2], rhotates[2][2]);
845     B[3] = ROL64(A[1][3] ^ D[3], rhotates[3][3]);
846     B[4] = ROL64(A[3][4] ^ D[4], rhotates[4][4]);
847 
848     C[0] = A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i + 2];
849     C[1] = A[2][1] = B[1] ^ (~B[2] & B[3]);
850     C[2] = A[4][2] = B[2] ^ (~B[3] & B[4]);
851     C[3] = A[1][3] = B[3] ^ (~B[4] & B[0]);
852     C[4] = A[3][4] = B[4] ^ (~B[0] & B[1]);
853 
854     B[0] = ROL64(A[4][3] ^ D[3], rhotates[0][3]);
855     B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
856     B[2] = ROL64(A[3][0] ^ D[0], rhotates[2][0]);
857     B[3] = ROL64(A[0][1] ^ D[1], rhotates[3][1]);
858     B[4] = ROL64(A[2][2] ^ D[2], rhotates[4][2]);
859 
860     C[0] ^= A[3][0] = B[0] ^ (~B[1] & B[2]);
861     C[1] ^= A[0][1] = B[1] ^ (~B[2] & B[3]);
862     C[2] ^= A[2][2] = B[2] ^ (~B[3] & B[4]);
863     C[3] ^= A[4][3] = B[3] ^ (~B[4] & B[0]);
864     C[4] ^= A[1][4] = B[4] ^ (~B[0] & B[1]);
865 
866     B[0] = ROL64(A[3][1] ^ D[1], rhotates[0][1]);
867     B[1] = ROL64(A[0][2] ^ D[2], rhotates[1][2]);
868     B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
869     B[3] = ROL64(A[4][4] ^ D[4], rhotates[3][4]);
870     B[4] = ROL64(A[1][0] ^ D[0], rhotates[4][0]);
871 
872     C[0] ^= A[1][0] = B[0] ^ (~B[1] & B[2]);
873     C[1] ^= A[3][1] = B[1] ^ (~B[2] & B[3]);
874     C[2] ^= A[0][2] = B[2] ^ (~B[3] & B[4]);
875     C[3] ^= A[2][3] = B[3] ^ (~B[4] & B[0]);
876     C[4] ^= A[4][4] = B[4] ^ (~B[0] & B[1]);
877 
878     B[0] = ROL64(A[2][4] ^ D[4], rhotates[0][4]);
879     B[1] = ROL64(A[4][0] ^ D[0], rhotates[1][0]);
880     B[2] = ROL64(A[1][1] ^ D[1], rhotates[2][1]);
881     B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
882     B[4] = ROL64(A[0][3] ^ D[3], rhotates[4][3]);
883 
884     C[0] ^= A[4][0] = B[0] ^ (~B[1] & B[2]);
885     C[1] ^= A[1][1] = B[1] ^ (~B[2] & B[3]);
886     C[2] ^= A[3][2] = B[2] ^ (~B[3] & B[4]);
887     C[3] ^= A[0][3] = B[3] ^ (~B[4] & B[0]);
888     C[4] ^= A[2][4] = B[4] ^ (~B[0] & B[1]);
889 
890     B[0] = ROL64(A[1][2] ^ D[2], rhotates[0][2]);
891     B[1] = ROL64(A[3][3] ^ D[3], rhotates[1][3]);
892     B[2] = ROL64(A[0][4] ^ D[4], rhotates[2][4]);
893     B[3] = ROL64(A[2][0] ^ D[0], rhotates[3][0]);
894     B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
895 
896     C[0] ^= A[2][0] = B[0] ^ (~B[1] & B[2]);
897     C[1] ^= A[4][1] = B[1] ^ (~B[2] & B[3]);
898     C[2] ^= A[1][2] = B[2] ^ (~B[3] & B[4]);
899     C[3] ^= A[3][3] = B[3] ^ (~B[4] & B[0]);
900     C[4] ^= A[0][4] = B[4] ^ (~B[0] & B[1]);
901 
902     /* Round 4*n+3 */
903     D[0] = ROL64(C[1], 1) ^ C[4];
904     D[1] = ROL64(C[2], 1) ^ C[0];
905     D[2] = ROL64(C[3], 1) ^ C[1];
906     D[3] = ROL64(C[4], 1) ^ C[2];
907     D[4] = ROL64(C[0], 1) ^ C[3];
908 
909     B[0] = A[0][0] ^ D[0]; /* rotate by 0 */
910     B[1] = ROL64(A[0][1] ^ D[1], rhotates[1][1]);
911     B[2] = ROL64(A[0][2] ^ D[2], rhotates[2][2]);
912     B[3] = ROL64(A[0][3] ^ D[3], rhotates[3][3]);
913     B[4] = ROL64(A[0][4] ^ D[4], rhotates[4][4]);
914 
915     /* C[0] = */ A[0][0] = B[0] ^ (~B[1] & B[2]) ^ iotas[i + 3];
916     /* C[1] = */ A[0][1] = B[1] ^ (~B[2] & B[3]);
917     /* C[2] = */ A[0][2] = B[2] ^ (~B[3] & B[4]);
918     /* C[3] = */ A[0][3] = B[3] ^ (~B[4] & B[0]);
919     /* C[4] = */ A[0][4] = B[4] ^ (~B[0] & B[1]);
920 
921     B[0] = ROL64(A[1][3] ^ D[3], rhotates[0][3]);
922     B[1] = ROL64(A[1][4] ^ D[4], rhotates[1][4]);
923     B[2] = ROL64(A[1][0] ^ D[0], rhotates[2][0]);
924     B[3] = ROL64(A[1][1] ^ D[1], rhotates[3][1]);
925     B[4] = ROL64(A[1][2] ^ D[2], rhotates[4][2]);
926 
927     /* C[0] ^= */ A[1][0] = B[0] ^ (~B[1] & B[2]);
928     /* C[1] ^= */ A[1][1] = B[1] ^ (~B[2] & B[3]);
929     /* C[2] ^= */ A[1][2] = B[2] ^ (~B[3] & B[4]);
930     /* C[3] ^= */ A[1][3] = B[3] ^ (~B[4] & B[0]);
931     /* C[4] ^= */ A[1][4] = B[4] ^ (~B[0] & B[1]);
932 
933     B[0] = ROL64(A[2][1] ^ D[1], rhotates[0][1]);
934     B[1] = ROL64(A[2][2] ^ D[2], rhotates[1][2]);
935     B[2] = ROL64(A[2][3] ^ D[3], rhotates[2][3]);
936     B[3] = ROL64(A[2][4] ^ D[4], rhotates[3][4]);
937     B[4] = ROL64(A[2][0] ^ D[0], rhotates[4][0]);
938 
939     /* C[0] ^= */ A[2][0] = B[0] ^ (~B[1] & B[2]);
940     /* C[1] ^= */ A[2][1] = B[1] ^ (~B[2] & B[3]);
941     /* C[2] ^= */ A[2][2] = B[2] ^ (~B[3] & B[4]);
942     /* C[3] ^= */ A[2][3] = B[3] ^ (~B[4] & B[0]);
943     /* C[4] ^= */ A[2][4] = B[4] ^ (~B[0] & B[1]);
944 
945     B[0] = ROL64(A[3][4] ^ D[4], rhotates[0][4]);
946     B[1] = ROL64(A[3][0] ^ D[0], rhotates[1][0]);
947     B[2] = ROL64(A[3][1] ^ D[1], rhotates[2][1]);
948     B[3] = ROL64(A[3][2] ^ D[2], rhotates[3][2]);
949     B[4] = ROL64(A[3][3] ^ D[3], rhotates[4][3]);
950 
951     /* C[0] ^= */ A[3][0] = B[0] ^ (~B[1] & B[2]);
952     /* C[1] ^= */ A[3][1] = B[1] ^ (~B[2] & B[3]);
953     /* C[2] ^= */ A[3][2] = B[2] ^ (~B[3] & B[4]);
954     /* C[3] ^= */ A[3][3] = B[3] ^ (~B[4] & B[0]);
955     /* C[4] ^= */ A[3][4] = B[4] ^ (~B[0] & B[1]);
956 
957     B[0] = ROL64(A[4][2] ^ D[2], rhotates[0][2]);
958     B[1] = ROL64(A[4][3] ^ D[3], rhotates[1][3]);
959     B[2] = ROL64(A[4][4] ^ D[4], rhotates[2][4]);
960     B[3] = ROL64(A[4][0] ^ D[0], rhotates[3][0]);
961     B[4] = ROL64(A[4][1] ^ D[1], rhotates[4][1]);
962 
963     /* C[0] ^= */ A[4][0] = B[0] ^ (~B[1] & B[2]);
964     /* C[1] ^= */ A[4][1] = B[1] ^ (~B[2] & B[3]);
965     /* C[2] ^= */ A[4][2] = B[2] ^ (~B[3] & B[4]);
966     /* C[3] ^= */ A[4][3] = B[3] ^ (~B[4] & B[0]);
967     /* C[4] ^= */ A[4][4] = B[4] ^ (~B[0] & B[1]);
968 }
969 
KeccakF1600(uint64_t A[5][5])970 static void KeccakF1600(uint64_t A[5][5])
971 {
972     size_t i;
973 
974     for (i = 0; i < 24; i += 4) {
975         FourRounds(A, i);
976     }
977 }
978 
979 #endif
980 
BitInterleave(uint64_t Ai)981 static uint64_t BitInterleave(uint64_t Ai)
982 {
983     if (BIT_INTERLEAVE) {
984         uint32_t hi = (uint32_t)(Ai >> 32), lo = (uint32_t)Ai;
985         uint32_t t0, t1;
986 
987         t0 = lo & 0x55555555;
988         t0 |= t0 >> 1;
989         t0 &= 0x33333333;
990         t0 |= t0 >> 2;
991         t0 &= 0x0f0f0f0f;
992         t0 |= t0 >> 4;
993         t0 &= 0x00ff00ff;
994         t0 |= t0 >> 8;
995         t0 &= 0x0000ffff;
996 
997         t1 = hi & 0x55555555;
998         t1 |= t1 >> 1;
999         t1 &= 0x33333333;
1000         t1 |= t1 >> 2;
1001         t1 &= 0x0f0f0f0f;
1002         t1 |= t1 >> 4;
1003         t1 &= 0x00ff00ff;
1004         t1 |= t1 >> 8;
1005         t1 <<= 16;
1006 
1007         lo &= 0xaaaaaaaa;
1008         lo |= lo << 1;
1009         lo &= 0xcccccccc;
1010         lo |= lo << 2;
1011         lo &= 0xf0f0f0f0;
1012         lo |= lo << 4;
1013         lo &= 0xff00ff00;
1014         lo |= lo << 8;
1015         lo >>= 16;
1016 
1017         hi &= 0xaaaaaaaa;
1018         hi |= hi << 1;
1019         hi &= 0xcccccccc;
1020         hi |= hi << 2;
1021         hi &= 0xf0f0f0f0;
1022         hi |= hi << 4;
1023         hi &= 0xff00ff00;
1024         hi |= hi << 8;
1025         hi &= 0xffff0000;
1026 
1027         Ai = ((uint64_t)(hi | lo) << 32) | (t1 | t0);
1028     }
1029 
1030     return Ai;
1031 }
1032 
BitDeinterleave(uint64_t Ai)1033 static uint64_t BitDeinterleave(uint64_t Ai)
1034 {
1035     if (BIT_INTERLEAVE) {
1036         uint32_t hi = (uint32_t)(Ai >> 32), lo = (uint32_t)Ai;
1037         uint32_t t0, t1;
1038 
1039         t0 = lo & 0x0000ffff;
1040         t0 |= t0 << 8;
1041         t0 &= 0x00ff00ff;
1042         t0 |= t0 << 4;
1043         t0 &= 0x0f0f0f0f;
1044         t0 |= t0 << 2;
1045         t0 &= 0x33333333;
1046         t0 |= t0 << 1;
1047         t0 &= 0x55555555;
1048 
1049         t1 = hi << 16;
1050         t1 |= t1 >> 8;
1051         t1 &= 0xff00ff00;
1052         t1 |= t1 >> 4;
1053         t1 &= 0xf0f0f0f0;
1054         t1 |= t1 >> 2;
1055         t1 &= 0xcccccccc;
1056         t1 |= t1 >> 1;
1057         t1 &= 0xaaaaaaaa;
1058 
1059         lo >>= 16;
1060         lo |= lo << 8;
1061         lo &= 0x00ff00ff;
1062         lo |= lo << 4;
1063         lo &= 0x0f0f0f0f;
1064         lo |= lo << 2;
1065         lo &= 0x33333333;
1066         lo |= lo << 1;
1067         lo &= 0x55555555;
1068 
1069         hi &= 0xffff0000;
1070         hi |= hi >> 8;
1071         hi &= 0xff00ff00;
1072         hi |= hi >> 4;
1073         hi &= 0xf0f0f0f0;
1074         hi |= hi >> 2;
1075         hi &= 0xcccccccc;
1076         hi |= hi >> 1;
1077         hi &= 0xaaaaaaaa;
1078 
1079         Ai = ((uint64_t)(hi | lo) << 32) | (t1 | t0);
1080     }
1081 
1082     return Ai;
1083 }
1084 
1085 /*
1086  * SHA3_absorb can be called multiple times, but at each invocation
1087  * largest multiple of |r| out of |len| bytes are processed. Then
1088  * remaining amount of bytes is returned. This is done to spare caller
1089  * trouble of calculating the largest multiple of |r|. |r| can be viewed
1090  * as blocksize. It is commonly (1600 - 256*n)/8, e.g. 168, 136, 104,
1091  * 72, but can also be (1600 - 448)/8 = 144. All this means that message
1092  * padding and intermediate sub-block buffering, byte- or bitwise, is
1093  * caller's responsibility.
1094  */
SHA3_absorb(uint64_t A[5][5],const unsigned char * inp,size_t len,size_t r)1095 size_t SHA3_absorb(uint64_t A[5][5], const unsigned char *inp, size_t len,
1096     size_t r)
1097 {
1098     uint64_t *A_flat = (uint64_t *)A;
1099     size_t i, w = r / 8;
1100 
1101     assert(r < (25 * sizeof(A[0][0])) && (r % 8) == 0);
1102 
1103     while (len >= r) {
1104         for (i = 0; i < w; i++) {
1105             uint64_t Ai = (uint64_t)inp[0] | (uint64_t)inp[1] << 8 | (uint64_t)inp[2] << 16 | (uint64_t)inp[3] << 24 | (uint64_t)inp[4] << 32 | (uint64_t)inp[5] << 40 | (uint64_t)inp[6] << 48 | (uint64_t)inp[7] << 56;
1106             inp += 8;
1107 
1108             A_flat[i] ^= BitInterleave(Ai);
1109         }
1110         KeccakF1600(A);
1111         len -= r;
1112     }
1113 
1114     return len;
1115 }
1116 
1117 /*
1118  * SHA3_squeeze may be called after SHA3_absorb to generate |out| hash value of
1119  * |len| bytes.
1120  * If multiple SHA3_squeeze calls are required the output length |len| must be a
1121  * multiple of the blocksize, with |next| being 0 on the first call and 1 on
1122  * subsequent calls. It is the callers responsibility to buffer the results.
1123  * When only a single call to SHA3_squeeze is required, |len| can be any size
1124  * and |next| must be 0.
1125  */
SHA3_squeeze(uint64_t A[5][5],unsigned char * out,size_t len,size_t r,int next)1126 void SHA3_squeeze(uint64_t A[5][5], unsigned char *out, size_t len, size_t r,
1127     int next)
1128 {
1129     uint64_t *A_flat = (uint64_t *)A;
1130     size_t i, w = r / 8;
1131 
1132     assert(r < (25 * sizeof(A[0][0])) && (r % 8) == 0);
1133 
1134     while (len != 0) {
1135         if (next)
1136             KeccakF1600(A);
1137         next = 1;
1138         for (i = 0; i < w && len != 0; i++) {
1139             uint64_t Ai = BitDeinterleave(A_flat[i]);
1140 
1141             if (len < 8) {
1142                 for (i = 0; i < len; i++) {
1143                     *out++ = (unsigned char)Ai;
1144                     Ai >>= 8;
1145                 }
1146                 return;
1147             }
1148 
1149             out[0] = (unsigned char)(Ai);
1150             out[1] = (unsigned char)(Ai >> 8);
1151             out[2] = (unsigned char)(Ai >> 16);
1152             out[3] = (unsigned char)(Ai >> 24);
1153             out[4] = (unsigned char)(Ai >> 32);
1154             out[5] = (unsigned char)(Ai >> 40);
1155             out[6] = (unsigned char)(Ai >> 48);
1156             out[7] = (unsigned char)(Ai >> 56);
1157             out += 8;
1158             len -= 8;
1159         }
1160     }
1161 }
1162 #endif
1163 
1164 #ifdef SELFTEST
1165 /*
1166  * Post-padding one-shot implementations would look as following:
1167  *
1168  * SHA3_224     SHA3_sponge(inp, len, out, 224/8, (1600-448)/8);
1169  * SHA3_256     SHA3_sponge(inp, len, out, 256/8, (1600-512)/8);
1170  * SHA3_384     SHA3_sponge(inp, len, out, 384/8, (1600-768)/8);
1171  * SHA3_512     SHA3_sponge(inp, len, out, 512/8, (1600-1024)/8);
1172  * SHAKE_128    SHA3_sponge(inp, len, out, d, (1600-256)/8);
1173  * SHAKE_256    SHA3_sponge(inp, len, out, d, (1600-512)/8);
1174  */
1175 
SHA3_sponge(const unsigned char * inp,size_t len,unsigned char * out,size_t d,size_t r)1176 void SHA3_sponge(const unsigned char *inp, size_t len,
1177     unsigned char *out, size_t d, size_t r)
1178 {
1179     uint64_t A[5][5];
1180 
1181     memset(A, 0, sizeof(A));
1182     SHA3_absorb(A, inp, len, r);
1183     SHA3_squeeze(A, out, d, r);
1184 }
1185 
1186 #include <stdio.h>
1187 
main(void)1188 int main(void)
1189 {
1190     /*
1191      * This is 5-bit SHAKE128 test from http://csrc.nist.gov/groups/ST/toolkit/examples.html#aHashing
1192      */
1193     unsigned char test[168] = { '\xf3', '\x3' };
1194     unsigned char out[512];
1195     size_t i;
1196     static const unsigned char result[512] = {
1197         0x2E, 0x0A, 0xBF, 0xBA, 0x83, 0xE6, 0x72, 0x0B,
1198         0xFB, 0xC2, 0x25, 0xFF, 0x6B, 0x7A, 0xB9, 0xFF,
1199         0xCE, 0x58, 0xBA, 0x02, 0x7E, 0xE3, 0xD8, 0x98,
1200         0x76, 0x4F, 0xEF, 0x28, 0x7D, 0xDE, 0xCC, 0xCA,
1201         0x3E, 0x6E, 0x59, 0x98, 0x41, 0x1E, 0x7D, 0xDB,
1202         0x32, 0xF6, 0x75, 0x38, 0xF5, 0x00, 0xB1, 0x8C,
1203         0x8C, 0x97, 0xC4, 0x52, 0xC3, 0x70, 0xEA, 0x2C,
1204         0xF0, 0xAF, 0xCA, 0x3E, 0x05, 0xDE, 0x7E, 0x4D,
1205         0xE2, 0x7F, 0xA4, 0x41, 0xA9, 0xCB, 0x34, 0xFD,
1206         0x17, 0xC9, 0x78, 0xB4, 0x2D, 0x5B, 0x7E, 0x7F,
1207         0x9A, 0xB1, 0x8F, 0xFE, 0xFF, 0xC3, 0xC5, 0xAC,
1208         0x2F, 0x3A, 0x45, 0x5E, 0xEB, 0xFD, 0xC7, 0x6C,
1209         0xEA, 0xEB, 0x0A, 0x2C, 0xCA, 0x22, 0xEE, 0xF6,
1210         0xE6, 0x37, 0xF4, 0xCA, 0xBE, 0x5C, 0x51, 0xDE,
1211         0xD2, 0xE3, 0xFA, 0xD8, 0xB9, 0x52, 0x70, 0xA3,
1212         0x21, 0x84, 0x56, 0x64, 0xF1, 0x07, 0xD1, 0x64,
1213         0x96, 0xBB, 0x7A, 0xBF, 0xBE, 0x75, 0x04, 0xB6,
1214         0xED, 0xE2, 0xE8, 0x9E, 0x4B, 0x99, 0x6F, 0xB5,
1215         0x8E, 0xFD, 0xC4, 0x18, 0x1F, 0x91, 0x63, 0x38,
1216         0x1C, 0xBE, 0x7B, 0xC0, 0x06, 0xA7, 0xA2, 0x05,
1217         0x98, 0x9C, 0x52, 0x6C, 0xD1, 0xBD, 0x68, 0x98,
1218         0x36, 0x93, 0xB4, 0xBD, 0xC5, 0x37, 0x28, 0xB2,
1219         0x41, 0xC1, 0xCF, 0xF4, 0x2B, 0xB6, 0x11, 0x50,
1220         0x2C, 0x35, 0x20, 0x5C, 0xAB, 0xB2, 0x88, 0x75,
1221         0x56, 0x55, 0xD6, 0x20, 0xC6, 0x79, 0x94, 0xF0,
1222         0x64, 0x51, 0x18, 0x7F, 0x6F, 0xD1, 0x7E, 0x04,
1223         0x66, 0x82, 0xBA, 0x12, 0x86, 0x06, 0x3F, 0xF8,
1224         0x8F, 0xE2, 0x50, 0x8D, 0x1F, 0xCA, 0xF9, 0x03,
1225         0x5A, 0x12, 0x31, 0xAD, 0x41, 0x50, 0xA9, 0xC9,
1226         0xB2, 0x4C, 0x9B, 0x2D, 0x66, 0xB2, 0xAD, 0x1B,
1227         0xDE, 0x0B, 0xD0, 0xBB, 0xCB, 0x8B, 0xE0, 0x5B,
1228         0x83, 0x52, 0x29, 0xEF, 0x79, 0x19, 0x73, 0x73,
1229         0x23, 0x42, 0x44, 0x01, 0xE1, 0xD8, 0x37, 0xB6,
1230         0x6E, 0xB4, 0xE6, 0x30, 0xFF, 0x1D, 0xE7, 0x0C,
1231         0xB3, 0x17, 0xC2, 0xBA, 0xCB, 0x08, 0x00, 0x1D,
1232         0x34, 0x77, 0xB7, 0xA7, 0x0A, 0x57, 0x6D, 0x20,
1233         0x86, 0x90, 0x33, 0x58, 0x9D, 0x85, 0xA0, 0x1D,
1234         0xDB, 0x2B, 0x66, 0x46, 0xC0, 0x43, 0xB5, 0x9F,
1235         0xC0, 0x11, 0x31, 0x1D, 0xA6, 0x66, 0xFA, 0x5A,
1236         0xD1, 0xD6, 0x38, 0x7F, 0xA9, 0xBC, 0x40, 0x15,
1237         0xA3, 0x8A, 0x51, 0xD1, 0xDA, 0x1E, 0xA6, 0x1D,
1238         0x64, 0x8D, 0xC8, 0xE3, 0x9A, 0x88, 0xB9, 0xD6,
1239         0x22, 0xBD, 0xE2, 0x07, 0xFD, 0xAB, 0xC6, 0xF2,
1240         0x82, 0x7A, 0x88, 0x0C, 0x33, 0x0B, 0xBF, 0x6D,
1241         0xF7, 0x33, 0x77, 0x4B, 0x65, 0x3E, 0x57, 0x30,
1242         0x5D, 0x78, 0xDC, 0xE1, 0x12, 0xF1, 0x0A, 0x2C,
1243         0x71, 0xF4, 0xCD, 0xAD, 0x92, 0xED, 0x11, 0x3E,
1244         0x1C, 0xEA, 0x63, 0xB9, 0x19, 0x25, 0xED, 0x28,
1245         0x19, 0x1E, 0x6D, 0xBB, 0xB5, 0xAA, 0x5A, 0x2A,
1246         0xFD, 0xA5, 0x1F, 0xC0, 0x5A, 0x3A, 0xF5, 0x25,
1247         0x8B, 0x87, 0x66, 0x52, 0x43, 0x55, 0x0F, 0x28,
1248         0x94, 0x8A, 0xE2, 0xB8, 0xBE, 0xB6, 0xBC, 0x9C,
1249         0x77, 0x0B, 0x35, 0xF0, 0x67, 0xEA, 0xA6, 0x41,
1250         0xEF, 0xE6, 0x5B, 0x1A, 0x44, 0x90, 0x9D, 0x1B,
1251         0x14, 0x9F, 0x97, 0xEE, 0xA6, 0x01, 0x39, 0x1C,
1252         0x60, 0x9E, 0xC8, 0x1D, 0x19, 0x30, 0xF5, 0x7C,
1253         0x18, 0xA4, 0xE0, 0xFA, 0xB4, 0x91, 0xD1, 0xCA,
1254         0xDF, 0xD5, 0x04, 0x83, 0x44, 0x9E, 0xDC, 0x0F,
1255         0x07, 0xFF, 0xB2, 0x4D, 0x2C, 0x6F, 0x9A, 0x9A,
1256         0x3B, 0xFF, 0x39, 0xAE, 0x3D, 0x57, 0xF5, 0x60,
1257         0x65, 0x4D, 0x7D, 0x75, 0xC9, 0x08, 0xAB, 0xE6,
1258         0x25, 0x64, 0x75, 0x3E, 0xAC, 0x39, 0xD7, 0x50,
1259         0x3D, 0xA6, 0xD3, 0x7C, 0x2E, 0x32, 0xE1, 0xAF,
1260         0x3B, 0x8A, 0xEC, 0x8A, 0xE3, 0x06, 0x9C, 0xD9
1261     };
1262 
1263     test[167] = '\x80';
1264     SHA3_sponge(test, sizeof(test), out, sizeof(out), sizeof(test));
1265 
1266     /*
1267      * Rationale behind keeping output [formatted as below] is that
1268      * one should be able to redirect it to a file, then copy-n-paste
1269      * final "output val" from official example to another file, and
1270      * compare the two with diff(1).
1271      */
1272     for (i = 0; i < sizeof(out);) {
1273         printf("%02X", out[i]);
1274         printf(++i % 16 && i != sizeof(out) ? " " : "\n");
1275     }
1276 
1277     if (memcmp(out, result, sizeof(out))) {
1278         fprintf(stderr, "failure\n");
1279         return 1;
1280     } else {
1281         fprintf(stderr, "success\n");
1282         return 0;
1283     }
1284 }
1285 #endif
1286