10957b409SSimon J. Gerraty /*
20957b409SSimon J. Gerraty * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
30957b409SSimon J. Gerraty *
40957b409SSimon J. Gerraty * Permission is hereby granted, free of charge, to any person obtaining
50957b409SSimon J. Gerraty * a copy of this software and associated documentation files (the
60957b409SSimon J. Gerraty * "Software"), to deal in the Software without restriction, including
70957b409SSimon J. Gerraty * without limitation the rights to use, copy, modify, merge, publish,
80957b409SSimon J. Gerraty * distribute, sublicense, and/or sell copies of the Software, and to
90957b409SSimon J. Gerraty * permit persons to whom the Software is furnished to do so, subject to
100957b409SSimon J. Gerraty * the following conditions:
110957b409SSimon J. Gerraty *
120957b409SSimon J. Gerraty * The above copyright notice and this permission notice shall be
130957b409SSimon J. Gerraty * included in all copies or substantial portions of the Software.
140957b409SSimon J. Gerraty *
150957b409SSimon J. Gerraty * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
160957b409SSimon J. Gerraty * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
170957b409SSimon J. Gerraty * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
180957b409SSimon J. Gerraty * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
190957b409SSimon J. Gerraty * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
200957b409SSimon J. Gerraty * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
210957b409SSimon J. Gerraty * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
220957b409SSimon J. Gerraty * SOFTWARE.
230957b409SSimon J. Gerraty */
240957b409SSimon J. Gerraty
250957b409SSimon J. Gerraty #include "inner.h"
260957b409SSimon J. Gerraty
270957b409SSimon J. Gerraty /*
280957b409SSimon J. Gerraty * If BR_NO_ARITH_SHIFT is undefined, or defined to 0, then we _assume_
290957b409SSimon J. Gerraty * that right-shifting a signed negative integer copies the sign bit
300957b409SSimon J. Gerraty * (arithmetic right-shift). This is "implementation-defined behaviour",
310957b409SSimon J. Gerraty * i.e. it is not undefined, but it may differ between compilers. Each
320957b409SSimon J. Gerraty * compiler is supposed to document its behaviour in that respect. GCC
330957b409SSimon J. Gerraty * explicitly defines that an arithmetic right shift is used. We expect
340957b409SSimon J. Gerraty * all other compilers to do the same, because underlying CPU offer an
350957b409SSimon J. Gerraty * arithmetic right shift opcode that could not be used otherwise.
360957b409SSimon J. Gerraty */
370957b409SSimon J. Gerraty #if BR_NO_ARITH_SHIFT
380957b409SSimon J. Gerraty #define ARSH(x, n) (((uint32_t)(x) >> (n)) \
390957b409SSimon J. Gerraty | ((-((uint32_t)(x) >> 31)) << (32 - (n))))
400957b409SSimon J. Gerraty #else
410957b409SSimon J. Gerraty #define ARSH(x, n) ((*(int32_t *)&(x)) >> (n))
420957b409SSimon J. Gerraty #endif
430957b409SSimon J. Gerraty
440957b409SSimon J. Gerraty /*
450957b409SSimon J. Gerraty * Convert an integer from unsigned big-endian encoding to a sequence of
460957b409SSimon J. Gerraty * 13-bit words in little-endian order. The final "partial" word is
470957b409SSimon J. Gerraty * returned.
480957b409SSimon J. Gerraty */
490957b409SSimon J. Gerraty static uint32_t
be8_to_le13(uint32_t * dst,const unsigned char * src,size_t len)500957b409SSimon J. Gerraty be8_to_le13(uint32_t *dst, const unsigned char *src, size_t len)
510957b409SSimon J. Gerraty {
520957b409SSimon J. Gerraty uint32_t acc;
530957b409SSimon J. Gerraty int acc_len;
540957b409SSimon J. Gerraty
550957b409SSimon J. Gerraty acc = 0;
560957b409SSimon J. Gerraty acc_len = 0;
570957b409SSimon J. Gerraty while (len -- > 0) {
580957b409SSimon J. Gerraty acc |= (uint32_t)src[len] << acc_len;
590957b409SSimon J. Gerraty acc_len += 8;
600957b409SSimon J. Gerraty if (acc_len >= 13) {
610957b409SSimon J. Gerraty *dst ++ = acc & 0x1FFF;
620957b409SSimon J. Gerraty acc >>= 13;
630957b409SSimon J. Gerraty acc_len -= 13;
640957b409SSimon J. Gerraty }
650957b409SSimon J. Gerraty }
660957b409SSimon J. Gerraty return acc;
670957b409SSimon J. Gerraty }
680957b409SSimon J. Gerraty
690957b409SSimon J. Gerraty /*
700957b409SSimon J. Gerraty * Convert an integer (13-bit words, little-endian) to unsigned
710957b409SSimon J. Gerraty * big-endian encoding. The total encoding length is provided; all
720957b409SSimon J. Gerraty * the destination bytes will be filled.
730957b409SSimon J. Gerraty */
740957b409SSimon J. Gerraty static void
le13_to_be8(unsigned char * dst,size_t len,const uint32_t * src)750957b409SSimon J. Gerraty le13_to_be8(unsigned char *dst, size_t len, const uint32_t *src)
760957b409SSimon J. Gerraty {
770957b409SSimon J. Gerraty uint32_t acc;
780957b409SSimon J. Gerraty int acc_len;
790957b409SSimon J. Gerraty
800957b409SSimon J. Gerraty acc = 0;
810957b409SSimon J. Gerraty acc_len = 0;
820957b409SSimon J. Gerraty while (len -- > 0) {
830957b409SSimon J. Gerraty if (acc_len < 8) {
840957b409SSimon J. Gerraty acc |= (*src ++) << acc_len;
850957b409SSimon J. Gerraty acc_len += 13;
860957b409SSimon J. Gerraty }
870957b409SSimon J. Gerraty dst[len] = (unsigned char)acc;
880957b409SSimon J. Gerraty acc >>= 8;
890957b409SSimon J. Gerraty acc_len -= 8;
900957b409SSimon J. Gerraty }
910957b409SSimon J. Gerraty }
920957b409SSimon J. Gerraty
930957b409SSimon J. Gerraty /*
940957b409SSimon J. Gerraty * Normalise an array of words to a strict 13 bits per word. Returned
950957b409SSimon J. Gerraty * value is the resulting carry. The source (w) and destination (d)
960957b409SSimon J. Gerraty * arrays may be identical, but shall not overlap partially.
970957b409SSimon J. Gerraty */
980957b409SSimon J. Gerraty static inline uint32_t
norm13(uint32_t * d,const uint32_t * w,size_t len)990957b409SSimon J. Gerraty norm13(uint32_t *d, const uint32_t *w, size_t len)
1000957b409SSimon J. Gerraty {
1010957b409SSimon J. Gerraty size_t u;
1020957b409SSimon J. Gerraty uint32_t cc;
1030957b409SSimon J. Gerraty
1040957b409SSimon J. Gerraty cc = 0;
1050957b409SSimon J. Gerraty for (u = 0; u < len; u ++) {
1060957b409SSimon J. Gerraty int32_t z;
1070957b409SSimon J. Gerraty
1080957b409SSimon J. Gerraty z = w[u] + cc;
1090957b409SSimon J. Gerraty d[u] = z & 0x1FFF;
1100957b409SSimon J. Gerraty cc = ARSH(z, 13);
1110957b409SSimon J. Gerraty }
1120957b409SSimon J. Gerraty return cc;
1130957b409SSimon J. Gerraty }
1140957b409SSimon J. Gerraty
1150957b409SSimon J. Gerraty /*
1160957b409SSimon J. Gerraty * mul20() multiplies two 260-bit integers together. Each word must fit
1170957b409SSimon J. Gerraty * on 13 bits; source operands use 20 words, destination operand
1180957b409SSimon J. Gerraty * receives 40 words. All overlaps allowed.
1190957b409SSimon J. Gerraty *
1200957b409SSimon J. Gerraty * square20() computes the square of a 260-bit integer. Each word must
1210957b409SSimon J. Gerraty * fit on 13 bits; source operand uses 20 words, destination operand
1220957b409SSimon J. Gerraty * receives 40 words. All overlaps allowed.
1230957b409SSimon J. Gerraty */
1240957b409SSimon J. Gerraty
1250957b409SSimon J. Gerraty #if BR_SLOW_MUL15
1260957b409SSimon J. Gerraty
1270957b409SSimon J. Gerraty static void
mul20(uint32_t * d,const uint32_t * a,const uint32_t * b)1280957b409SSimon J. Gerraty mul20(uint32_t *d, const uint32_t *a, const uint32_t *b)
1290957b409SSimon J. Gerraty {
1300957b409SSimon J. Gerraty /*
1310957b409SSimon J. Gerraty * Two-level Karatsuba: turns a 20x20 multiplication into
1320957b409SSimon J. Gerraty * nine 5x5 multiplications. We use 13-bit words but do not
1330957b409SSimon J. Gerraty * propagate carries immediately, so words may expand:
1340957b409SSimon J. Gerraty *
1350957b409SSimon J. Gerraty * - First Karatsuba decomposition turns the 20x20 mul on
1360957b409SSimon J. Gerraty * 13-bit words into three 10x10 muls, two on 13-bit words
1370957b409SSimon J. Gerraty * and one on 14-bit words.
1380957b409SSimon J. Gerraty *
1390957b409SSimon J. Gerraty * - Second Karatsuba decomposition further splits these into:
1400957b409SSimon J. Gerraty *
1410957b409SSimon J. Gerraty * * four 5x5 muls on 13-bit words
1420957b409SSimon J. Gerraty * * four 5x5 muls on 14-bit words
1430957b409SSimon J. Gerraty * * one 5x5 mul on 15-bit words
1440957b409SSimon J. Gerraty *
1450957b409SSimon J. Gerraty * Highest word value is 8191, 16382 or 32764, for 13-bit, 14-bit
1460957b409SSimon J. Gerraty * or 15-bit words, respectively.
1470957b409SSimon J. Gerraty */
1480957b409SSimon J. Gerraty uint32_t u[45], v[45], w[90];
1490957b409SSimon J. Gerraty uint32_t cc;
1500957b409SSimon J. Gerraty int i;
1510957b409SSimon J. Gerraty
1520957b409SSimon J. Gerraty #define ZADD(dw, d_off, s1w, s1_off, s2w, s2_off) do { \
1530957b409SSimon J. Gerraty (dw)[5 * (d_off) + 0] = (s1w)[5 * (s1_off) + 0] \
1540957b409SSimon J. Gerraty + (s2w)[5 * (s2_off) + 0]; \
1550957b409SSimon J. Gerraty (dw)[5 * (d_off) + 1] = (s1w)[5 * (s1_off) + 1] \
1560957b409SSimon J. Gerraty + (s2w)[5 * (s2_off) + 1]; \
1570957b409SSimon J. Gerraty (dw)[5 * (d_off) + 2] = (s1w)[5 * (s1_off) + 2] \
1580957b409SSimon J. Gerraty + (s2w)[5 * (s2_off) + 2]; \
1590957b409SSimon J. Gerraty (dw)[5 * (d_off) + 3] = (s1w)[5 * (s1_off) + 3] \
1600957b409SSimon J. Gerraty + (s2w)[5 * (s2_off) + 3]; \
1610957b409SSimon J. Gerraty (dw)[5 * (d_off) + 4] = (s1w)[5 * (s1_off) + 4] \
1620957b409SSimon J. Gerraty + (s2w)[5 * (s2_off) + 4]; \
1630957b409SSimon J. Gerraty } while (0)
1640957b409SSimon J. Gerraty
1650957b409SSimon J. Gerraty #define ZADDT(dw, d_off, sw, s_off) do { \
1660957b409SSimon J. Gerraty (dw)[5 * (d_off) + 0] += (sw)[5 * (s_off) + 0]; \
1670957b409SSimon J. Gerraty (dw)[5 * (d_off) + 1] += (sw)[5 * (s_off) + 1]; \
1680957b409SSimon J. Gerraty (dw)[5 * (d_off) + 2] += (sw)[5 * (s_off) + 2]; \
1690957b409SSimon J. Gerraty (dw)[5 * (d_off) + 3] += (sw)[5 * (s_off) + 3]; \
1700957b409SSimon J. Gerraty (dw)[5 * (d_off) + 4] += (sw)[5 * (s_off) + 4]; \
1710957b409SSimon J. Gerraty } while (0)
1720957b409SSimon J. Gerraty
1730957b409SSimon J. Gerraty #define ZSUB2F(dw, d_off, s1w, s1_off, s2w, s2_off) do { \
1740957b409SSimon J. Gerraty (dw)[5 * (d_off) + 0] -= (s1w)[5 * (s1_off) + 0] \
1750957b409SSimon J. Gerraty + (s2w)[5 * (s2_off) + 0]; \
1760957b409SSimon J. Gerraty (dw)[5 * (d_off) + 1] -= (s1w)[5 * (s1_off) + 1] \
1770957b409SSimon J. Gerraty + (s2w)[5 * (s2_off) + 1]; \
1780957b409SSimon J. Gerraty (dw)[5 * (d_off) + 2] -= (s1w)[5 * (s1_off) + 2] \
1790957b409SSimon J. Gerraty + (s2w)[5 * (s2_off) + 2]; \
1800957b409SSimon J. Gerraty (dw)[5 * (d_off) + 3] -= (s1w)[5 * (s1_off) + 3] \
1810957b409SSimon J. Gerraty + (s2w)[5 * (s2_off) + 3]; \
1820957b409SSimon J. Gerraty (dw)[5 * (d_off) + 4] -= (s1w)[5 * (s1_off) + 4] \
1830957b409SSimon J. Gerraty + (s2w)[5 * (s2_off) + 4]; \
1840957b409SSimon J. Gerraty } while (0)
1850957b409SSimon J. Gerraty
1860957b409SSimon J. Gerraty #define CPR1(w, cprcc) do { \
1870957b409SSimon J. Gerraty uint32_t cprz = (w) + cprcc; \
1880957b409SSimon J. Gerraty (w) = cprz & 0x1FFF; \
1890957b409SSimon J. Gerraty cprcc = cprz >> 13; \
1900957b409SSimon J. Gerraty } while (0)
1910957b409SSimon J. Gerraty
1920957b409SSimon J. Gerraty #define CPR(dw, d_off) do { \
1930957b409SSimon J. Gerraty uint32_t cprcc; \
1940957b409SSimon J. Gerraty cprcc = 0; \
1950957b409SSimon J. Gerraty CPR1((dw)[(d_off) + 0], cprcc); \
1960957b409SSimon J. Gerraty CPR1((dw)[(d_off) + 1], cprcc); \
1970957b409SSimon J. Gerraty CPR1((dw)[(d_off) + 2], cprcc); \
1980957b409SSimon J. Gerraty CPR1((dw)[(d_off) + 3], cprcc); \
1990957b409SSimon J. Gerraty CPR1((dw)[(d_off) + 4], cprcc); \
2000957b409SSimon J. Gerraty CPR1((dw)[(d_off) + 5], cprcc); \
2010957b409SSimon J. Gerraty CPR1((dw)[(d_off) + 6], cprcc); \
2020957b409SSimon J. Gerraty CPR1((dw)[(d_off) + 7], cprcc); \
2030957b409SSimon J. Gerraty CPR1((dw)[(d_off) + 8], cprcc); \
2040957b409SSimon J. Gerraty (dw)[(d_off) + 9] = cprcc; \
2050957b409SSimon J. Gerraty } while (0)
2060957b409SSimon J. Gerraty
2070957b409SSimon J. Gerraty memcpy(u, a, 20 * sizeof *a);
2080957b409SSimon J. Gerraty ZADD(u, 4, a, 0, a, 1);
2090957b409SSimon J. Gerraty ZADD(u, 5, a, 2, a, 3);
2100957b409SSimon J. Gerraty ZADD(u, 6, a, 0, a, 2);
2110957b409SSimon J. Gerraty ZADD(u, 7, a, 1, a, 3);
2120957b409SSimon J. Gerraty ZADD(u, 8, u, 6, u, 7);
2130957b409SSimon J. Gerraty
2140957b409SSimon J. Gerraty memcpy(v, b, 20 * sizeof *b);
2150957b409SSimon J. Gerraty ZADD(v, 4, b, 0, b, 1);
2160957b409SSimon J. Gerraty ZADD(v, 5, b, 2, b, 3);
2170957b409SSimon J. Gerraty ZADD(v, 6, b, 0, b, 2);
2180957b409SSimon J. Gerraty ZADD(v, 7, b, 1, b, 3);
2190957b409SSimon J. Gerraty ZADD(v, 8, v, 6, v, 7);
2200957b409SSimon J. Gerraty
2210957b409SSimon J. Gerraty /*
2220957b409SSimon J. Gerraty * Do the eight first 8x8 muls. Source words are at most 16382
2230957b409SSimon J. Gerraty * each, so we can add product results together "as is" in 32-bit
2240957b409SSimon J. Gerraty * words.
2250957b409SSimon J. Gerraty */
2260957b409SSimon J. Gerraty for (i = 0; i < 40; i += 5) {
2270957b409SSimon J. Gerraty w[(i << 1) + 0] = MUL15(u[i + 0], v[i + 0]);
2280957b409SSimon J. Gerraty w[(i << 1) + 1] = MUL15(u[i + 0], v[i + 1])
2290957b409SSimon J. Gerraty + MUL15(u[i + 1], v[i + 0]);
2300957b409SSimon J. Gerraty w[(i << 1) + 2] = MUL15(u[i + 0], v[i + 2])
2310957b409SSimon J. Gerraty + MUL15(u[i + 1], v[i + 1])
2320957b409SSimon J. Gerraty + MUL15(u[i + 2], v[i + 0]);
2330957b409SSimon J. Gerraty w[(i << 1) + 3] = MUL15(u[i + 0], v[i + 3])
2340957b409SSimon J. Gerraty + MUL15(u[i + 1], v[i + 2])
2350957b409SSimon J. Gerraty + MUL15(u[i + 2], v[i + 1])
2360957b409SSimon J. Gerraty + MUL15(u[i + 3], v[i + 0]);
2370957b409SSimon J. Gerraty w[(i << 1) + 4] = MUL15(u[i + 0], v[i + 4])
2380957b409SSimon J. Gerraty + MUL15(u[i + 1], v[i + 3])
2390957b409SSimon J. Gerraty + MUL15(u[i + 2], v[i + 2])
2400957b409SSimon J. Gerraty + MUL15(u[i + 3], v[i + 1])
2410957b409SSimon J. Gerraty + MUL15(u[i + 4], v[i + 0]);
2420957b409SSimon J. Gerraty w[(i << 1) + 5] = MUL15(u[i + 1], v[i + 4])
2430957b409SSimon J. Gerraty + MUL15(u[i + 2], v[i + 3])
2440957b409SSimon J. Gerraty + MUL15(u[i + 3], v[i + 2])
2450957b409SSimon J. Gerraty + MUL15(u[i + 4], v[i + 1]);
2460957b409SSimon J. Gerraty w[(i << 1) + 6] = MUL15(u[i + 2], v[i + 4])
2470957b409SSimon J. Gerraty + MUL15(u[i + 3], v[i + 3])
2480957b409SSimon J. Gerraty + MUL15(u[i + 4], v[i + 2]);
2490957b409SSimon J. Gerraty w[(i << 1) + 7] = MUL15(u[i + 3], v[i + 4])
2500957b409SSimon J. Gerraty + MUL15(u[i + 4], v[i + 3]);
2510957b409SSimon J. Gerraty w[(i << 1) + 8] = MUL15(u[i + 4], v[i + 4]);
2520957b409SSimon J. Gerraty w[(i << 1) + 9] = 0;
2530957b409SSimon J. Gerraty }
2540957b409SSimon J. Gerraty
2550957b409SSimon J. Gerraty /*
2560957b409SSimon J. Gerraty * For the 9th multiplication, source words are up to 32764,
2570957b409SSimon J. Gerraty * so we must do some carry propagation. If we add up to
2580957b409SSimon J. Gerraty * 4 products and the carry is no more than 524224, then the
2590957b409SSimon J. Gerraty * result fits in 32 bits, and the next carry will be no more
2600957b409SSimon J. Gerraty * than 524224 (because 4*(32764^2)+524224 < 8192*524225).
2610957b409SSimon J. Gerraty *
2620957b409SSimon J. Gerraty * We thus just skip one of the products in the middle word,
2630957b409SSimon J. Gerraty * then do a carry propagation (this reduces words to 13 bits
2640957b409SSimon J. Gerraty * each, except possibly the last, which may use up to 17 bits
2650957b409SSimon J. Gerraty * or so), then add the missing product.
2660957b409SSimon J. Gerraty */
2670957b409SSimon J. Gerraty w[80 + 0] = MUL15(u[40 + 0], v[40 + 0]);
2680957b409SSimon J. Gerraty w[80 + 1] = MUL15(u[40 + 0], v[40 + 1])
2690957b409SSimon J. Gerraty + MUL15(u[40 + 1], v[40 + 0]);
2700957b409SSimon J. Gerraty w[80 + 2] = MUL15(u[40 + 0], v[40 + 2])
2710957b409SSimon J. Gerraty + MUL15(u[40 + 1], v[40 + 1])
2720957b409SSimon J. Gerraty + MUL15(u[40 + 2], v[40 + 0]);
2730957b409SSimon J. Gerraty w[80 + 3] = MUL15(u[40 + 0], v[40 + 3])
2740957b409SSimon J. Gerraty + MUL15(u[40 + 1], v[40 + 2])
2750957b409SSimon J. Gerraty + MUL15(u[40 + 2], v[40 + 1])
2760957b409SSimon J. Gerraty + MUL15(u[40 + 3], v[40 + 0]);
2770957b409SSimon J. Gerraty w[80 + 4] = MUL15(u[40 + 0], v[40 + 4])
2780957b409SSimon J. Gerraty + MUL15(u[40 + 1], v[40 + 3])
2790957b409SSimon J. Gerraty + MUL15(u[40 + 2], v[40 + 2])
2800957b409SSimon J. Gerraty + MUL15(u[40 + 3], v[40 + 1]);
2810957b409SSimon J. Gerraty /* + MUL15(u[40 + 4], v[40 + 0]) */
2820957b409SSimon J. Gerraty w[80 + 5] = MUL15(u[40 + 1], v[40 + 4])
2830957b409SSimon J. Gerraty + MUL15(u[40 + 2], v[40 + 3])
2840957b409SSimon J. Gerraty + MUL15(u[40 + 3], v[40 + 2])
2850957b409SSimon J. Gerraty + MUL15(u[40 + 4], v[40 + 1]);
2860957b409SSimon J. Gerraty w[80 + 6] = MUL15(u[40 + 2], v[40 + 4])
2870957b409SSimon J. Gerraty + MUL15(u[40 + 3], v[40 + 3])
2880957b409SSimon J. Gerraty + MUL15(u[40 + 4], v[40 + 2]);
2890957b409SSimon J. Gerraty w[80 + 7] = MUL15(u[40 + 3], v[40 + 4])
2900957b409SSimon J. Gerraty + MUL15(u[40 + 4], v[40 + 3]);
2910957b409SSimon J. Gerraty w[80 + 8] = MUL15(u[40 + 4], v[40 + 4]);
2920957b409SSimon J. Gerraty
2930957b409SSimon J. Gerraty CPR(w, 80);
2940957b409SSimon J. Gerraty
2950957b409SSimon J. Gerraty w[80 + 4] += MUL15(u[40 + 4], v[40 + 0]);
2960957b409SSimon J. Gerraty
2970957b409SSimon J. Gerraty /*
2980957b409SSimon J. Gerraty * The products on 14-bit words in slots 6 and 7 yield values
2990957b409SSimon J. Gerraty * up to 5*(16382^2) each, and we need to subtract two such
3000957b409SSimon J. Gerraty * values from the higher word. We need the subtraction to fit
3010957b409SSimon J. Gerraty * in a _signed_ 32-bit integer, i.e. 31 bits + a sign bit.
3020957b409SSimon J. Gerraty * However, 10*(16382^2) does not fit. So we must perform a
3030957b409SSimon J. Gerraty * bit of reduction here.
3040957b409SSimon J. Gerraty */
3050957b409SSimon J. Gerraty CPR(w, 60);
3060957b409SSimon J. Gerraty CPR(w, 70);
3070957b409SSimon J. Gerraty
3080957b409SSimon J. Gerraty /*
3090957b409SSimon J. Gerraty * Recompose results.
3100957b409SSimon J. Gerraty */
3110957b409SSimon J. Gerraty
3120957b409SSimon J. Gerraty /* 0..1*0..1 into 0..3 */
3130957b409SSimon J. Gerraty ZSUB2F(w, 8, w, 0, w, 2);
3140957b409SSimon J. Gerraty ZSUB2F(w, 9, w, 1, w, 3);
3150957b409SSimon J. Gerraty ZADDT(w, 1, w, 8);
3160957b409SSimon J. Gerraty ZADDT(w, 2, w, 9);
3170957b409SSimon J. Gerraty
3180957b409SSimon J. Gerraty /* 2..3*2..3 into 4..7 */
3190957b409SSimon J. Gerraty ZSUB2F(w, 10, w, 4, w, 6);
3200957b409SSimon J. Gerraty ZSUB2F(w, 11, w, 5, w, 7);
3210957b409SSimon J. Gerraty ZADDT(w, 5, w, 10);
3220957b409SSimon J. Gerraty ZADDT(w, 6, w, 11);
3230957b409SSimon J. Gerraty
3240957b409SSimon J. Gerraty /* (0..1+2..3)*(0..1+2..3) into 12..15 */
3250957b409SSimon J. Gerraty ZSUB2F(w, 16, w, 12, w, 14);
3260957b409SSimon J. Gerraty ZSUB2F(w, 17, w, 13, w, 15);
3270957b409SSimon J. Gerraty ZADDT(w, 13, w, 16);
3280957b409SSimon J. Gerraty ZADDT(w, 14, w, 17);
3290957b409SSimon J. Gerraty
3300957b409SSimon J. Gerraty /* first-level recomposition */
3310957b409SSimon J. Gerraty ZSUB2F(w, 12, w, 0, w, 4);
3320957b409SSimon J. Gerraty ZSUB2F(w, 13, w, 1, w, 5);
3330957b409SSimon J. Gerraty ZSUB2F(w, 14, w, 2, w, 6);
3340957b409SSimon J. Gerraty ZSUB2F(w, 15, w, 3, w, 7);
3350957b409SSimon J. Gerraty ZADDT(w, 2, w, 12);
3360957b409SSimon J. Gerraty ZADDT(w, 3, w, 13);
3370957b409SSimon J. Gerraty ZADDT(w, 4, w, 14);
3380957b409SSimon J. Gerraty ZADDT(w, 5, w, 15);
3390957b409SSimon J. Gerraty
3400957b409SSimon J. Gerraty /*
3410957b409SSimon J. Gerraty * Perform carry propagation to bring all words down to 13 bits.
3420957b409SSimon J. Gerraty */
3430957b409SSimon J. Gerraty cc = norm13(d, w, 40);
3440957b409SSimon J. Gerraty d[39] += (cc << 13);
3450957b409SSimon J. Gerraty
3460957b409SSimon J. Gerraty #undef ZADD
3470957b409SSimon J. Gerraty #undef ZADDT
3480957b409SSimon J. Gerraty #undef ZSUB2F
3490957b409SSimon J. Gerraty #undef CPR1
3500957b409SSimon J. Gerraty #undef CPR
3510957b409SSimon J. Gerraty }
3520957b409SSimon J. Gerraty
3530957b409SSimon J. Gerraty static inline void
square20(uint32_t * d,const uint32_t * a)3540957b409SSimon J. Gerraty square20(uint32_t *d, const uint32_t *a)
3550957b409SSimon J. Gerraty {
3560957b409SSimon J. Gerraty mul20(d, a, a);
3570957b409SSimon J. Gerraty }
3580957b409SSimon J. Gerraty
3590957b409SSimon J. Gerraty #else
3600957b409SSimon J. Gerraty
3610957b409SSimon J. Gerraty static void
mul20(uint32_t * d,const uint32_t * a,const uint32_t * b)3620957b409SSimon J. Gerraty mul20(uint32_t *d, const uint32_t *a, const uint32_t *b)
3630957b409SSimon J. Gerraty {
3640957b409SSimon J. Gerraty uint32_t t[39];
3650957b409SSimon J. Gerraty
3660957b409SSimon J. Gerraty t[ 0] = MUL15(a[ 0], b[ 0]);
3670957b409SSimon J. Gerraty t[ 1] = MUL15(a[ 0], b[ 1])
3680957b409SSimon J. Gerraty + MUL15(a[ 1], b[ 0]);
3690957b409SSimon J. Gerraty t[ 2] = MUL15(a[ 0], b[ 2])
3700957b409SSimon J. Gerraty + MUL15(a[ 1], b[ 1])
3710957b409SSimon J. Gerraty + MUL15(a[ 2], b[ 0]);
3720957b409SSimon J. Gerraty t[ 3] = MUL15(a[ 0], b[ 3])
3730957b409SSimon J. Gerraty + MUL15(a[ 1], b[ 2])
3740957b409SSimon J. Gerraty + MUL15(a[ 2], b[ 1])
3750957b409SSimon J. Gerraty + MUL15(a[ 3], b[ 0]);
3760957b409SSimon J. Gerraty t[ 4] = MUL15(a[ 0], b[ 4])
3770957b409SSimon J. Gerraty + MUL15(a[ 1], b[ 3])
3780957b409SSimon J. Gerraty + MUL15(a[ 2], b[ 2])
3790957b409SSimon J. Gerraty + MUL15(a[ 3], b[ 1])
3800957b409SSimon J. Gerraty + MUL15(a[ 4], b[ 0]);
3810957b409SSimon J. Gerraty t[ 5] = MUL15(a[ 0], b[ 5])
3820957b409SSimon J. Gerraty + MUL15(a[ 1], b[ 4])
3830957b409SSimon J. Gerraty + MUL15(a[ 2], b[ 3])
3840957b409SSimon J. Gerraty + MUL15(a[ 3], b[ 2])
3850957b409SSimon J. Gerraty + MUL15(a[ 4], b[ 1])
3860957b409SSimon J. Gerraty + MUL15(a[ 5], b[ 0]);
3870957b409SSimon J. Gerraty t[ 6] = MUL15(a[ 0], b[ 6])
3880957b409SSimon J. Gerraty + MUL15(a[ 1], b[ 5])
3890957b409SSimon J. Gerraty + MUL15(a[ 2], b[ 4])
3900957b409SSimon J. Gerraty + MUL15(a[ 3], b[ 3])
3910957b409SSimon J. Gerraty + MUL15(a[ 4], b[ 2])
3920957b409SSimon J. Gerraty + MUL15(a[ 5], b[ 1])
3930957b409SSimon J. Gerraty + MUL15(a[ 6], b[ 0]);
3940957b409SSimon J. Gerraty t[ 7] = MUL15(a[ 0], b[ 7])
3950957b409SSimon J. Gerraty + MUL15(a[ 1], b[ 6])
3960957b409SSimon J. Gerraty + MUL15(a[ 2], b[ 5])
3970957b409SSimon J. Gerraty + MUL15(a[ 3], b[ 4])
3980957b409SSimon J. Gerraty + MUL15(a[ 4], b[ 3])
3990957b409SSimon J. Gerraty + MUL15(a[ 5], b[ 2])
4000957b409SSimon J. Gerraty + MUL15(a[ 6], b[ 1])
4010957b409SSimon J. Gerraty + MUL15(a[ 7], b[ 0]);
4020957b409SSimon J. Gerraty t[ 8] = MUL15(a[ 0], b[ 8])
4030957b409SSimon J. Gerraty + MUL15(a[ 1], b[ 7])
4040957b409SSimon J. Gerraty + MUL15(a[ 2], b[ 6])
4050957b409SSimon J. Gerraty + MUL15(a[ 3], b[ 5])
4060957b409SSimon J. Gerraty + MUL15(a[ 4], b[ 4])
4070957b409SSimon J. Gerraty + MUL15(a[ 5], b[ 3])
4080957b409SSimon J. Gerraty + MUL15(a[ 6], b[ 2])
4090957b409SSimon J. Gerraty + MUL15(a[ 7], b[ 1])
4100957b409SSimon J. Gerraty + MUL15(a[ 8], b[ 0]);
4110957b409SSimon J. Gerraty t[ 9] = MUL15(a[ 0], b[ 9])
4120957b409SSimon J. Gerraty + MUL15(a[ 1], b[ 8])
4130957b409SSimon J. Gerraty + MUL15(a[ 2], b[ 7])
4140957b409SSimon J. Gerraty + MUL15(a[ 3], b[ 6])
4150957b409SSimon J. Gerraty + MUL15(a[ 4], b[ 5])
4160957b409SSimon J. Gerraty + MUL15(a[ 5], b[ 4])
4170957b409SSimon J. Gerraty + MUL15(a[ 6], b[ 3])
4180957b409SSimon J. Gerraty + MUL15(a[ 7], b[ 2])
4190957b409SSimon J. Gerraty + MUL15(a[ 8], b[ 1])
4200957b409SSimon J. Gerraty + MUL15(a[ 9], b[ 0]);
4210957b409SSimon J. Gerraty t[10] = MUL15(a[ 0], b[10])
4220957b409SSimon J. Gerraty + MUL15(a[ 1], b[ 9])
4230957b409SSimon J. Gerraty + MUL15(a[ 2], b[ 8])
4240957b409SSimon J. Gerraty + MUL15(a[ 3], b[ 7])
4250957b409SSimon J. Gerraty + MUL15(a[ 4], b[ 6])
4260957b409SSimon J. Gerraty + MUL15(a[ 5], b[ 5])
4270957b409SSimon J. Gerraty + MUL15(a[ 6], b[ 4])
4280957b409SSimon J. Gerraty + MUL15(a[ 7], b[ 3])
4290957b409SSimon J. Gerraty + MUL15(a[ 8], b[ 2])
4300957b409SSimon J. Gerraty + MUL15(a[ 9], b[ 1])
4310957b409SSimon J. Gerraty + MUL15(a[10], b[ 0]);
4320957b409SSimon J. Gerraty t[11] = MUL15(a[ 0], b[11])
4330957b409SSimon J. Gerraty + MUL15(a[ 1], b[10])
4340957b409SSimon J. Gerraty + MUL15(a[ 2], b[ 9])
4350957b409SSimon J. Gerraty + MUL15(a[ 3], b[ 8])
4360957b409SSimon J. Gerraty + MUL15(a[ 4], b[ 7])
4370957b409SSimon J. Gerraty + MUL15(a[ 5], b[ 6])
4380957b409SSimon J. Gerraty + MUL15(a[ 6], b[ 5])
4390957b409SSimon J. Gerraty + MUL15(a[ 7], b[ 4])
4400957b409SSimon J. Gerraty + MUL15(a[ 8], b[ 3])
4410957b409SSimon J. Gerraty + MUL15(a[ 9], b[ 2])
4420957b409SSimon J. Gerraty + MUL15(a[10], b[ 1])
4430957b409SSimon J. Gerraty + MUL15(a[11], b[ 0]);
4440957b409SSimon J. Gerraty t[12] = MUL15(a[ 0], b[12])
4450957b409SSimon J. Gerraty + MUL15(a[ 1], b[11])
4460957b409SSimon J. Gerraty + MUL15(a[ 2], b[10])
4470957b409SSimon J. Gerraty + MUL15(a[ 3], b[ 9])
4480957b409SSimon J. Gerraty + MUL15(a[ 4], b[ 8])
4490957b409SSimon J. Gerraty + MUL15(a[ 5], b[ 7])
4500957b409SSimon J. Gerraty + MUL15(a[ 6], b[ 6])
4510957b409SSimon J. Gerraty + MUL15(a[ 7], b[ 5])
4520957b409SSimon J. Gerraty + MUL15(a[ 8], b[ 4])
4530957b409SSimon J. Gerraty + MUL15(a[ 9], b[ 3])
4540957b409SSimon J. Gerraty + MUL15(a[10], b[ 2])
4550957b409SSimon J. Gerraty + MUL15(a[11], b[ 1])
4560957b409SSimon J. Gerraty + MUL15(a[12], b[ 0]);
4570957b409SSimon J. Gerraty t[13] = MUL15(a[ 0], b[13])
4580957b409SSimon J. Gerraty + MUL15(a[ 1], b[12])
4590957b409SSimon J. Gerraty + MUL15(a[ 2], b[11])
4600957b409SSimon J. Gerraty + MUL15(a[ 3], b[10])
4610957b409SSimon J. Gerraty + MUL15(a[ 4], b[ 9])
4620957b409SSimon J. Gerraty + MUL15(a[ 5], b[ 8])
4630957b409SSimon J. Gerraty + MUL15(a[ 6], b[ 7])
4640957b409SSimon J. Gerraty + MUL15(a[ 7], b[ 6])
4650957b409SSimon J. Gerraty + MUL15(a[ 8], b[ 5])
4660957b409SSimon J. Gerraty + MUL15(a[ 9], b[ 4])
4670957b409SSimon J. Gerraty + MUL15(a[10], b[ 3])
4680957b409SSimon J. Gerraty + MUL15(a[11], b[ 2])
4690957b409SSimon J. Gerraty + MUL15(a[12], b[ 1])
4700957b409SSimon J. Gerraty + MUL15(a[13], b[ 0]);
4710957b409SSimon J. Gerraty t[14] = MUL15(a[ 0], b[14])
4720957b409SSimon J. Gerraty + MUL15(a[ 1], b[13])
4730957b409SSimon J. Gerraty + MUL15(a[ 2], b[12])
4740957b409SSimon J. Gerraty + MUL15(a[ 3], b[11])
4750957b409SSimon J. Gerraty + MUL15(a[ 4], b[10])
4760957b409SSimon J. Gerraty + MUL15(a[ 5], b[ 9])
4770957b409SSimon J. Gerraty + MUL15(a[ 6], b[ 8])
4780957b409SSimon J. Gerraty + MUL15(a[ 7], b[ 7])
4790957b409SSimon J. Gerraty + MUL15(a[ 8], b[ 6])
4800957b409SSimon J. Gerraty + MUL15(a[ 9], b[ 5])
4810957b409SSimon J. Gerraty + MUL15(a[10], b[ 4])
4820957b409SSimon J. Gerraty + MUL15(a[11], b[ 3])
4830957b409SSimon J. Gerraty + MUL15(a[12], b[ 2])
4840957b409SSimon J. Gerraty + MUL15(a[13], b[ 1])
4850957b409SSimon J. Gerraty + MUL15(a[14], b[ 0]);
4860957b409SSimon J. Gerraty t[15] = MUL15(a[ 0], b[15])
4870957b409SSimon J. Gerraty + MUL15(a[ 1], b[14])
4880957b409SSimon J. Gerraty + MUL15(a[ 2], b[13])
4890957b409SSimon J. Gerraty + MUL15(a[ 3], b[12])
4900957b409SSimon J. Gerraty + MUL15(a[ 4], b[11])
4910957b409SSimon J. Gerraty + MUL15(a[ 5], b[10])
4920957b409SSimon J. Gerraty + MUL15(a[ 6], b[ 9])
4930957b409SSimon J. Gerraty + MUL15(a[ 7], b[ 8])
4940957b409SSimon J. Gerraty + MUL15(a[ 8], b[ 7])
4950957b409SSimon J. Gerraty + MUL15(a[ 9], b[ 6])
4960957b409SSimon J. Gerraty + MUL15(a[10], b[ 5])
4970957b409SSimon J. Gerraty + MUL15(a[11], b[ 4])
4980957b409SSimon J. Gerraty + MUL15(a[12], b[ 3])
4990957b409SSimon J. Gerraty + MUL15(a[13], b[ 2])
5000957b409SSimon J. Gerraty + MUL15(a[14], b[ 1])
5010957b409SSimon J. Gerraty + MUL15(a[15], b[ 0]);
5020957b409SSimon J. Gerraty t[16] = MUL15(a[ 0], b[16])
5030957b409SSimon J. Gerraty + MUL15(a[ 1], b[15])
5040957b409SSimon J. Gerraty + MUL15(a[ 2], b[14])
5050957b409SSimon J. Gerraty + MUL15(a[ 3], b[13])
5060957b409SSimon J. Gerraty + MUL15(a[ 4], b[12])
5070957b409SSimon J. Gerraty + MUL15(a[ 5], b[11])
5080957b409SSimon J. Gerraty + MUL15(a[ 6], b[10])
5090957b409SSimon J. Gerraty + MUL15(a[ 7], b[ 9])
5100957b409SSimon J. Gerraty + MUL15(a[ 8], b[ 8])
5110957b409SSimon J. Gerraty + MUL15(a[ 9], b[ 7])
5120957b409SSimon J. Gerraty + MUL15(a[10], b[ 6])
5130957b409SSimon J. Gerraty + MUL15(a[11], b[ 5])
5140957b409SSimon J. Gerraty + MUL15(a[12], b[ 4])
5150957b409SSimon J. Gerraty + MUL15(a[13], b[ 3])
5160957b409SSimon J. Gerraty + MUL15(a[14], b[ 2])
5170957b409SSimon J. Gerraty + MUL15(a[15], b[ 1])
5180957b409SSimon J. Gerraty + MUL15(a[16], b[ 0]);
5190957b409SSimon J. Gerraty t[17] = MUL15(a[ 0], b[17])
5200957b409SSimon J. Gerraty + MUL15(a[ 1], b[16])
5210957b409SSimon J. Gerraty + MUL15(a[ 2], b[15])
5220957b409SSimon J. Gerraty + MUL15(a[ 3], b[14])
5230957b409SSimon J. Gerraty + MUL15(a[ 4], b[13])
5240957b409SSimon J. Gerraty + MUL15(a[ 5], b[12])
5250957b409SSimon J. Gerraty + MUL15(a[ 6], b[11])
5260957b409SSimon J. Gerraty + MUL15(a[ 7], b[10])
5270957b409SSimon J. Gerraty + MUL15(a[ 8], b[ 9])
5280957b409SSimon J. Gerraty + MUL15(a[ 9], b[ 8])
5290957b409SSimon J. Gerraty + MUL15(a[10], b[ 7])
5300957b409SSimon J. Gerraty + MUL15(a[11], b[ 6])
5310957b409SSimon J. Gerraty + MUL15(a[12], b[ 5])
5320957b409SSimon J. Gerraty + MUL15(a[13], b[ 4])
5330957b409SSimon J. Gerraty + MUL15(a[14], b[ 3])
5340957b409SSimon J. Gerraty + MUL15(a[15], b[ 2])
5350957b409SSimon J. Gerraty + MUL15(a[16], b[ 1])
5360957b409SSimon J. Gerraty + MUL15(a[17], b[ 0]);
5370957b409SSimon J. Gerraty t[18] = MUL15(a[ 0], b[18])
5380957b409SSimon J. Gerraty + MUL15(a[ 1], b[17])
5390957b409SSimon J. Gerraty + MUL15(a[ 2], b[16])
5400957b409SSimon J. Gerraty + MUL15(a[ 3], b[15])
5410957b409SSimon J. Gerraty + MUL15(a[ 4], b[14])
5420957b409SSimon J. Gerraty + MUL15(a[ 5], b[13])
5430957b409SSimon J. Gerraty + MUL15(a[ 6], b[12])
5440957b409SSimon J. Gerraty + MUL15(a[ 7], b[11])
5450957b409SSimon J. Gerraty + MUL15(a[ 8], b[10])
5460957b409SSimon J. Gerraty + MUL15(a[ 9], b[ 9])
5470957b409SSimon J. Gerraty + MUL15(a[10], b[ 8])
5480957b409SSimon J. Gerraty + MUL15(a[11], b[ 7])
5490957b409SSimon J. Gerraty + MUL15(a[12], b[ 6])
5500957b409SSimon J. Gerraty + MUL15(a[13], b[ 5])
5510957b409SSimon J. Gerraty + MUL15(a[14], b[ 4])
5520957b409SSimon J. Gerraty + MUL15(a[15], b[ 3])
5530957b409SSimon J. Gerraty + MUL15(a[16], b[ 2])
5540957b409SSimon J. Gerraty + MUL15(a[17], b[ 1])
5550957b409SSimon J. Gerraty + MUL15(a[18], b[ 0]);
5560957b409SSimon J. Gerraty t[19] = MUL15(a[ 0], b[19])
5570957b409SSimon J. Gerraty + MUL15(a[ 1], b[18])
5580957b409SSimon J. Gerraty + MUL15(a[ 2], b[17])
5590957b409SSimon J. Gerraty + MUL15(a[ 3], b[16])
5600957b409SSimon J. Gerraty + MUL15(a[ 4], b[15])
5610957b409SSimon J. Gerraty + MUL15(a[ 5], b[14])
5620957b409SSimon J. Gerraty + MUL15(a[ 6], b[13])
5630957b409SSimon J. Gerraty + MUL15(a[ 7], b[12])
5640957b409SSimon J. Gerraty + MUL15(a[ 8], b[11])
5650957b409SSimon J. Gerraty + MUL15(a[ 9], b[10])
5660957b409SSimon J. Gerraty + MUL15(a[10], b[ 9])
5670957b409SSimon J. Gerraty + MUL15(a[11], b[ 8])
5680957b409SSimon J. Gerraty + MUL15(a[12], b[ 7])
5690957b409SSimon J. Gerraty + MUL15(a[13], b[ 6])
5700957b409SSimon J. Gerraty + MUL15(a[14], b[ 5])
5710957b409SSimon J. Gerraty + MUL15(a[15], b[ 4])
5720957b409SSimon J. Gerraty + MUL15(a[16], b[ 3])
5730957b409SSimon J. Gerraty + MUL15(a[17], b[ 2])
5740957b409SSimon J. Gerraty + MUL15(a[18], b[ 1])
5750957b409SSimon J. Gerraty + MUL15(a[19], b[ 0]);
5760957b409SSimon J. Gerraty t[20] = MUL15(a[ 1], b[19])
5770957b409SSimon J. Gerraty + MUL15(a[ 2], b[18])
5780957b409SSimon J. Gerraty + MUL15(a[ 3], b[17])
5790957b409SSimon J. Gerraty + MUL15(a[ 4], b[16])
5800957b409SSimon J. Gerraty + MUL15(a[ 5], b[15])
5810957b409SSimon J. Gerraty + MUL15(a[ 6], b[14])
5820957b409SSimon J. Gerraty + MUL15(a[ 7], b[13])
5830957b409SSimon J. Gerraty + MUL15(a[ 8], b[12])
5840957b409SSimon J. Gerraty + MUL15(a[ 9], b[11])
5850957b409SSimon J. Gerraty + MUL15(a[10], b[10])
5860957b409SSimon J. Gerraty + MUL15(a[11], b[ 9])
5870957b409SSimon J. Gerraty + MUL15(a[12], b[ 8])
5880957b409SSimon J. Gerraty + MUL15(a[13], b[ 7])
5890957b409SSimon J. Gerraty + MUL15(a[14], b[ 6])
5900957b409SSimon J. Gerraty + MUL15(a[15], b[ 5])
5910957b409SSimon J. Gerraty + MUL15(a[16], b[ 4])
5920957b409SSimon J. Gerraty + MUL15(a[17], b[ 3])
5930957b409SSimon J. Gerraty + MUL15(a[18], b[ 2])
5940957b409SSimon J. Gerraty + MUL15(a[19], b[ 1]);
5950957b409SSimon J. Gerraty t[21] = MUL15(a[ 2], b[19])
5960957b409SSimon J. Gerraty + MUL15(a[ 3], b[18])
5970957b409SSimon J. Gerraty + MUL15(a[ 4], b[17])
5980957b409SSimon J. Gerraty + MUL15(a[ 5], b[16])
5990957b409SSimon J. Gerraty + MUL15(a[ 6], b[15])
6000957b409SSimon J. Gerraty + MUL15(a[ 7], b[14])
6010957b409SSimon J. Gerraty + MUL15(a[ 8], b[13])
6020957b409SSimon J. Gerraty + MUL15(a[ 9], b[12])
6030957b409SSimon J. Gerraty + MUL15(a[10], b[11])
6040957b409SSimon J. Gerraty + MUL15(a[11], b[10])
6050957b409SSimon J. Gerraty + MUL15(a[12], b[ 9])
6060957b409SSimon J. Gerraty + MUL15(a[13], b[ 8])
6070957b409SSimon J. Gerraty + MUL15(a[14], b[ 7])
6080957b409SSimon J. Gerraty + MUL15(a[15], b[ 6])
6090957b409SSimon J. Gerraty + MUL15(a[16], b[ 5])
6100957b409SSimon J. Gerraty + MUL15(a[17], b[ 4])
6110957b409SSimon J. Gerraty + MUL15(a[18], b[ 3])
6120957b409SSimon J. Gerraty + MUL15(a[19], b[ 2]);
6130957b409SSimon J. Gerraty t[22] = MUL15(a[ 3], b[19])
6140957b409SSimon J. Gerraty + MUL15(a[ 4], b[18])
6150957b409SSimon J. Gerraty + MUL15(a[ 5], b[17])
6160957b409SSimon J. Gerraty + MUL15(a[ 6], b[16])
6170957b409SSimon J. Gerraty + MUL15(a[ 7], b[15])
6180957b409SSimon J. Gerraty + MUL15(a[ 8], b[14])
6190957b409SSimon J. Gerraty + MUL15(a[ 9], b[13])
6200957b409SSimon J. Gerraty + MUL15(a[10], b[12])
6210957b409SSimon J. Gerraty + MUL15(a[11], b[11])
6220957b409SSimon J. Gerraty + MUL15(a[12], b[10])
6230957b409SSimon J. Gerraty + MUL15(a[13], b[ 9])
6240957b409SSimon J. Gerraty + MUL15(a[14], b[ 8])
6250957b409SSimon J. Gerraty + MUL15(a[15], b[ 7])
6260957b409SSimon J. Gerraty + MUL15(a[16], b[ 6])
6270957b409SSimon J. Gerraty + MUL15(a[17], b[ 5])
6280957b409SSimon J. Gerraty + MUL15(a[18], b[ 4])
6290957b409SSimon J. Gerraty + MUL15(a[19], b[ 3]);
6300957b409SSimon J. Gerraty t[23] = MUL15(a[ 4], b[19])
6310957b409SSimon J. Gerraty + MUL15(a[ 5], b[18])
6320957b409SSimon J. Gerraty + MUL15(a[ 6], b[17])
6330957b409SSimon J. Gerraty + MUL15(a[ 7], b[16])
6340957b409SSimon J. Gerraty + MUL15(a[ 8], b[15])
6350957b409SSimon J. Gerraty + MUL15(a[ 9], b[14])
6360957b409SSimon J. Gerraty + MUL15(a[10], b[13])
6370957b409SSimon J. Gerraty + MUL15(a[11], b[12])
6380957b409SSimon J. Gerraty + MUL15(a[12], b[11])
6390957b409SSimon J. Gerraty + MUL15(a[13], b[10])
6400957b409SSimon J. Gerraty + MUL15(a[14], b[ 9])
6410957b409SSimon J. Gerraty + MUL15(a[15], b[ 8])
6420957b409SSimon J. Gerraty + MUL15(a[16], b[ 7])
6430957b409SSimon J. Gerraty + MUL15(a[17], b[ 6])
6440957b409SSimon J. Gerraty + MUL15(a[18], b[ 5])
6450957b409SSimon J. Gerraty + MUL15(a[19], b[ 4]);
6460957b409SSimon J. Gerraty t[24] = MUL15(a[ 5], b[19])
6470957b409SSimon J. Gerraty + MUL15(a[ 6], b[18])
6480957b409SSimon J. Gerraty + MUL15(a[ 7], b[17])
6490957b409SSimon J. Gerraty + MUL15(a[ 8], b[16])
6500957b409SSimon J. Gerraty + MUL15(a[ 9], b[15])
6510957b409SSimon J. Gerraty + MUL15(a[10], b[14])
6520957b409SSimon J. Gerraty + MUL15(a[11], b[13])
6530957b409SSimon J. Gerraty + MUL15(a[12], b[12])
6540957b409SSimon J. Gerraty + MUL15(a[13], b[11])
6550957b409SSimon J. Gerraty + MUL15(a[14], b[10])
6560957b409SSimon J. Gerraty + MUL15(a[15], b[ 9])
6570957b409SSimon J. Gerraty + MUL15(a[16], b[ 8])
6580957b409SSimon J. Gerraty + MUL15(a[17], b[ 7])
6590957b409SSimon J. Gerraty + MUL15(a[18], b[ 6])
6600957b409SSimon J. Gerraty + MUL15(a[19], b[ 5]);
6610957b409SSimon J. Gerraty t[25] = MUL15(a[ 6], b[19])
6620957b409SSimon J. Gerraty + MUL15(a[ 7], b[18])
6630957b409SSimon J. Gerraty + MUL15(a[ 8], b[17])
6640957b409SSimon J. Gerraty + MUL15(a[ 9], b[16])
6650957b409SSimon J. Gerraty + MUL15(a[10], b[15])
6660957b409SSimon J. Gerraty + MUL15(a[11], b[14])
6670957b409SSimon J. Gerraty + MUL15(a[12], b[13])
6680957b409SSimon J. Gerraty + MUL15(a[13], b[12])
6690957b409SSimon J. Gerraty + MUL15(a[14], b[11])
6700957b409SSimon J. Gerraty + MUL15(a[15], b[10])
6710957b409SSimon J. Gerraty + MUL15(a[16], b[ 9])
6720957b409SSimon J. Gerraty + MUL15(a[17], b[ 8])
6730957b409SSimon J. Gerraty + MUL15(a[18], b[ 7])
6740957b409SSimon J. Gerraty + MUL15(a[19], b[ 6]);
6750957b409SSimon J. Gerraty t[26] = MUL15(a[ 7], b[19])
6760957b409SSimon J. Gerraty + MUL15(a[ 8], b[18])
6770957b409SSimon J. Gerraty + MUL15(a[ 9], b[17])
6780957b409SSimon J. Gerraty + MUL15(a[10], b[16])
6790957b409SSimon J. Gerraty + MUL15(a[11], b[15])
6800957b409SSimon J. Gerraty + MUL15(a[12], b[14])
6810957b409SSimon J. Gerraty + MUL15(a[13], b[13])
6820957b409SSimon J. Gerraty + MUL15(a[14], b[12])
6830957b409SSimon J. Gerraty + MUL15(a[15], b[11])
6840957b409SSimon J. Gerraty + MUL15(a[16], b[10])
6850957b409SSimon J. Gerraty + MUL15(a[17], b[ 9])
6860957b409SSimon J. Gerraty + MUL15(a[18], b[ 8])
6870957b409SSimon J. Gerraty + MUL15(a[19], b[ 7]);
6880957b409SSimon J. Gerraty t[27] = MUL15(a[ 8], b[19])
6890957b409SSimon J. Gerraty + MUL15(a[ 9], b[18])
6900957b409SSimon J. Gerraty + MUL15(a[10], b[17])
6910957b409SSimon J. Gerraty + MUL15(a[11], b[16])
6920957b409SSimon J. Gerraty + MUL15(a[12], b[15])
6930957b409SSimon J. Gerraty + MUL15(a[13], b[14])
6940957b409SSimon J. Gerraty + MUL15(a[14], b[13])
6950957b409SSimon J. Gerraty + MUL15(a[15], b[12])
6960957b409SSimon J. Gerraty + MUL15(a[16], b[11])
6970957b409SSimon J. Gerraty + MUL15(a[17], b[10])
6980957b409SSimon J. Gerraty + MUL15(a[18], b[ 9])
6990957b409SSimon J. Gerraty + MUL15(a[19], b[ 8]);
7000957b409SSimon J. Gerraty t[28] = MUL15(a[ 9], b[19])
7010957b409SSimon J. Gerraty + MUL15(a[10], b[18])
7020957b409SSimon J. Gerraty + MUL15(a[11], b[17])
7030957b409SSimon J. Gerraty + MUL15(a[12], b[16])
7040957b409SSimon J. Gerraty + MUL15(a[13], b[15])
7050957b409SSimon J. Gerraty + MUL15(a[14], b[14])
7060957b409SSimon J. Gerraty + MUL15(a[15], b[13])
7070957b409SSimon J. Gerraty + MUL15(a[16], b[12])
7080957b409SSimon J. Gerraty + MUL15(a[17], b[11])
7090957b409SSimon J. Gerraty + MUL15(a[18], b[10])
7100957b409SSimon J. Gerraty + MUL15(a[19], b[ 9]);
7110957b409SSimon J. Gerraty t[29] = MUL15(a[10], b[19])
7120957b409SSimon J. Gerraty + MUL15(a[11], b[18])
7130957b409SSimon J. Gerraty + MUL15(a[12], b[17])
7140957b409SSimon J. Gerraty + MUL15(a[13], b[16])
7150957b409SSimon J. Gerraty + MUL15(a[14], b[15])
7160957b409SSimon J. Gerraty + MUL15(a[15], b[14])
7170957b409SSimon J. Gerraty + MUL15(a[16], b[13])
7180957b409SSimon J. Gerraty + MUL15(a[17], b[12])
7190957b409SSimon J. Gerraty + MUL15(a[18], b[11])
7200957b409SSimon J. Gerraty + MUL15(a[19], b[10]);
7210957b409SSimon J. Gerraty t[30] = MUL15(a[11], b[19])
7220957b409SSimon J. Gerraty + MUL15(a[12], b[18])
7230957b409SSimon J. Gerraty + MUL15(a[13], b[17])
7240957b409SSimon J. Gerraty + MUL15(a[14], b[16])
7250957b409SSimon J. Gerraty + MUL15(a[15], b[15])
7260957b409SSimon J. Gerraty + MUL15(a[16], b[14])
7270957b409SSimon J. Gerraty + MUL15(a[17], b[13])
7280957b409SSimon J. Gerraty + MUL15(a[18], b[12])
7290957b409SSimon J. Gerraty + MUL15(a[19], b[11]);
7300957b409SSimon J. Gerraty t[31] = MUL15(a[12], b[19])
7310957b409SSimon J. Gerraty + MUL15(a[13], b[18])
7320957b409SSimon J. Gerraty + MUL15(a[14], b[17])
7330957b409SSimon J. Gerraty + MUL15(a[15], b[16])
7340957b409SSimon J. Gerraty + MUL15(a[16], b[15])
7350957b409SSimon J. Gerraty + MUL15(a[17], b[14])
7360957b409SSimon J. Gerraty + MUL15(a[18], b[13])
7370957b409SSimon J. Gerraty + MUL15(a[19], b[12]);
7380957b409SSimon J. Gerraty t[32] = MUL15(a[13], b[19])
7390957b409SSimon J. Gerraty + MUL15(a[14], b[18])
7400957b409SSimon J. Gerraty + MUL15(a[15], b[17])
7410957b409SSimon J. Gerraty + MUL15(a[16], b[16])
7420957b409SSimon J. Gerraty + MUL15(a[17], b[15])
7430957b409SSimon J. Gerraty + MUL15(a[18], b[14])
7440957b409SSimon J. Gerraty + MUL15(a[19], b[13]);
7450957b409SSimon J. Gerraty t[33] = MUL15(a[14], b[19])
7460957b409SSimon J. Gerraty + MUL15(a[15], b[18])
7470957b409SSimon J. Gerraty + MUL15(a[16], b[17])
7480957b409SSimon J. Gerraty + MUL15(a[17], b[16])
7490957b409SSimon J. Gerraty + MUL15(a[18], b[15])
7500957b409SSimon J. Gerraty + MUL15(a[19], b[14]);
7510957b409SSimon J. Gerraty t[34] = MUL15(a[15], b[19])
7520957b409SSimon J. Gerraty + MUL15(a[16], b[18])
7530957b409SSimon J. Gerraty + MUL15(a[17], b[17])
7540957b409SSimon J. Gerraty + MUL15(a[18], b[16])
7550957b409SSimon J. Gerraty + MUL15(a[19], b[15]);
7560957b409SSimon J. Gerraty t[35] = MUL15(a[16], b[19])
7570957b409SSimon J. Gerraty + MUL15(a[17], b[18])
7580957b409SSimon J. Gerraty + MUL15(a[18], b[17])
7590957b409SSimon J. Gerraty + MUL15(a[19], b[16]);
7600957b409SSimon J. Gerraty t[36] = MUL15(a[17], b[19])
7610957b409SSimon J. Gerraty + MUL15(a[18], b[18])
7620957b409SSimon J. Gerraty + MUL15(a[19], b[17]);
7630957b409SSimon J. Gerraty t[37] = MUL15(a[18], b[19])
7640957b409SSimon J. Gerraty + MUL15(a[19], b[18]);
7650957b409SSimon J. Gerraty t[38] = MUL15(a[19], b[19]);
7660957b409SSimon J. Gerraty d[39] = norm13(d, t, 39);
7670957b409SSimon J. Gerraty }
7680957b409SSimon J. Gerraty
7690957b409SSimon J. Gerraty static void
square20(uint32_t * d,const uint32_t * a)7700957b409SSimon J. Gerraty square20(uint32_t *d, const uint32_t *a)
7710957b409SSimon J. Gerraty {
7720957b409SSimon J. Gerraty uint32_t t[39];
7730957b409SSimon J. Gerraty
7740957b409SSimon J. Gerraty t[ 0] = MUL15(a[ 0], a[ 0]);
7750957b409SSimon J. Gerraty t[ 1] = ((MUL15(a[ 0], a[ 1])) << 1);
7760957b409SSimon J. Gerraty t[ 2] = MUL15(a[ 1], a[ 1])
7770957b409SSimon J. Gerraty + ((MUL15(a[ 0], a[ 2])) << 1);
7780957b409SSimon J. Gerraty t[ 3] = ((MUL15(a[ 0], a[ 3])
7790957b409SSimon J. Gerraty + MUL15(a[ 1], a[ 2])) << 1);
7800957b409SSimon J. Gerraty t[ 4] = MUL15(a[ 2], a[ 2])
7810957b409SSimon J. Gerraty + ((MUL15(a[ 0], a[ 4])
7820957b409SSimon J. Gerraty + MUL15(a[ 1], a[ 3])) << 1);
7830957b409SSimon J. Gerraty t[ 5] = ((MUL15(a[ 0], a[ 5])
7840957b409SSimon J. Gerraty + MUL15(a[ 1], a[ 4])
7850957b409SSimon J. Gerraty + MUL15(a[ 2], a[ 3])) << 1);
7860957b409SSimon J. Gerraty t[ 6] = MUL15(a[ 3], a[ 3])
7870957b409SSimon J. Gerraty + ((MUL15(a[ 0], a[ 6])
7880957b409SSimon J. Gerraty + MUL15(a[ 1], a[ 5])
7890957b409SSimon J. Gerraty + MUL15(a[ 2], a[ 4])) << 1);
7900957b409SSimon J. Gerraty t[ 7] = ((MUL15(a[ 0], a[ 7])
7910957b409SSimon J. Gerraty + MUL15(a[ 1], a[ 6])
7920957b409SSimon J. Gerraty + MUL15(a[ 2], a[ 5])
7930957b409SSimon J. Gerraty + MUL15(a[ 3], a[ 4])) << 1);
7940957b409SSimon J. Gerraty t[ 8] = MUL15(a[ 4], a[ 4])
7950957b409SSimon J. Gerraty + ((MUL15(a[ 0], a[ 8])
7960957b409SSimon J. Gerraty + MUL15(a[ 1], a[ 7])
7970957b409SSimon J. Gerraty + MUL15(a[ 2], a[ 6])
7980957b409SSimon J. Gerraty + MUL15(a[ 3], a[ 5])) << 1);
7990957b409SSimon J. Gerraty t[ 9] = ((MUL15(a[ 0], a[ 9])
8000957b409SSimon J. Gerraty + MUL15(a[ 1], a[ 8])
8010957b409SSimon J. Gerraty + MUL15(a[ 2], a[ 7])
8020957b409SSimon J. Gerraty + MUL15(a[ 3], a[ 6])
8030957b409SSimon J. Gerraty + MUL15(a[ 4], a[ 5])) << 1);
8040957b409SSimon J. Gerraty t[10] = MUL15(a[ 5], a[ 5])
8050957b409SSimon J. Gerraty + ((MUL15(a[ 0], a[10])
8060957b409SSimon J. Gerraty + MUL15(a[ 1], a[ 9])
8070957b409SSimon J. Gerraty + MUL15(a[ 2], a[ 8])
8080957b409SSimon J. Gerraty + MUL15(a[ 3], a[ 7])
8090957b409SSimon J. Gerraty + MUL15(a[ 4], a[ 6])) << 1);
8100957b409SSimon J. Gerraty t[11] = ((MUL15(a[ 0], a[11])
8110957b409SSimon J. Gerraty + MUL15(a[ 1], a[10])
8120957b409SSimon J. Gerraty + MUL15(a[ 2], a[ 9])
8130957b409SSimon J. Gerraty + MUL15(a[ 3], a[ 8])
8140957b409SSimon J. Gerraty + MUL15(a[ 4], a[ 7])
8150957b409SSimon J. Gerraty + MUL15(a[ 5], a[ 6])) << 1);
8160957b409SSimon J. Gerraty t[12] = MUL15(a[ 6], a[ 6])
8170957b409SSimon J. Gerraty + ((MUL15(a[ 0], a[12])
8180957b409SSimon J. Gerraty + MUL15(a[ 1], a[11])
8190957b409SSimon J. Gerraty + MUL15(a[ 2], a[10])
8200957b409SSimon J. Gerraty + MUL15(a[ 3], a[ 9])
8210957b409SSimon J. Gerraty + MUL15(a[ 4], a[ 8])
8220957b409SSimon J. Gerraty + MUL15(a[ 5], a[ 7])) << 1);
8230957b409SSimon J. Gerraty t[13] = ((MUL15(a[ 0], a[13])
8240957b409SSimon J. Gerraty + MUL15(a[ 1], a[12])
8250957b409SSimon J. Gerraty + MUL15(a[ 2], a[11])
8260957b409SSimon J. Gerraty + MUL15(a[ 3], a[10])
8270957b409SSimon J. Gerraty + MUL15(a[ 4], a[ 9])
8280957b409SSimon J. Gerraty + MUL15(a[ 5], a[ 8])
8290957b409SSimon J. Gerraty + MUL15(a[ 6], a[ 7])) << 1);
8300957b409SSimon J. Gerraty t[14] = MUL15(a[ 7], a[ 7])
8310957b409SSimon J. Gerraty + ((MUL15(a[ 0], a[14])
8320957b409SSimon J. Gerraty + MUL15(a[ 1], a[13])
8330957b409SSimon J. Gerraty + MUL15(a[ 2], a[12])
8340957b409SSimon J. Gerraty + MUL15(a[ 3], a[11])
8350957b409SSimon J. Gerraty + MUL15(a[ 4], a[10])
8360957b409SSimon J. Gerraty + MUL15(a[ 5], a[ 9])
8370957b409SSimon J. Gerraty + MUL15(a[ 6], a[ 8])) << 1);
8380957b409SSimon J. Gerraty t[15] = ((MUL15(a[ 0], a[15])
8390957b409SSimon J. Gerraty + MUL15(a[ 1], a[14])
8400957b409SSimon J. Gerraty + MUL15(a[ 2], a[13])
8410957b409SSimon J. Gerraty + MUL15(a[ 3], a[12])
8420957b409SSimon J. Gerraty + MUL15(a[ 4], a[11])
8430957b409SSimon J. Gerraty + MUL15(a[ 5], a[10])
8440957b409SSimon J. Gerraty + MUL15(a[ 6], a[ 9])
8450957b409SSimon J. Gerraty + MUL15(a[ 7], a[ 8])) << 1);
8460957b409SSimon J. Gerraty t[16] = MUL15(a[ 8], a[ 8])
8470957b409SSimon J. Gerraty + ((MUL15(a[ 0], a[16])
8480957b409SSimon J. Gerraty + MUL15(a[ 1], a[15])
8490957b409SSimon J. Gerraty + MUL15(a[ 2], a[14])
8500957b409SSimon J. Gerraty + MUL15(a[ 3], a[13])
8510957b409SSimon J. Gerraty + MUL15(a[ 4], a[12])
8520957b409SSimon J. Gerraty + MUL15(a[ 5], a[11])
8530957b409SSimon J. Gerraty + MUL15(a[ 6], a[10])
8540957b409SSimon J. Gerraty + MUL15(a[ 7], a[ 9])) << 1);
8550957b409SSimon J. Gerraty t[17] = ((MUL15(a[ 0], a[17])
8560957b409SSimon J. Gerraty + MUL15(a[ 1], a[16])
8570957b409SSimon J. Gerraty + MUL15(a[ 2], a[15])
8580957b409SSimon J. Gerraty + MUL15(a[ 3], a[14])
8590957b409SSimon J. Gerraty + MUL15(a[ 4], a[13])
8600957b409SSimon J. Gerraty + MUL15(a[ 5], a[12])
8610957b409SSimon J. Gerraty + MUL15(a[ 6], a[11])
8620957b409SSimon J. Gerraty + MUL15(a[ 7], a[10])
8630957b409SSimon J. Gerraty + MUL15(a[ 8], a[ 9])) << 1);
8640957b409SSimon J. Gerraty t[18] = MUL15(a[ 9], a[ 9])
8650957b409SSimon J. Gerraty + ((MUL15(a[ 0], a[18])
8660957b409SSimon J. Gerraty + MUL15(a[ 1], a[17])
8670957b409SSimon J. Gerraty + MUL15(a[ 2], a[16])
8680957b409SSimon J. Gerraty + MUL15(a[ 3], a[15])
8690957b409SSimon J. Gerraty + MUL15(a[ 4], a[14])
8700957b409SSimon J. Gerraty + MUL15(a[ 5], a[13])
8710957b409SSimon J. Gerraty + MUL15(a[ 6], a[12])
8720957b409SSimon J. Gerraty + MUL15(a[ 7], a[11])
8730957b409SSimon J. Gerraty + MUL15(a[ 8], a[10])) << 1);
8740957b409SSimon J. Gerraty t[19] = ((MUL15(a[ 0], a[19])
8750957b409SSimon J. Gerraty + MUL15(a[ 1], a[18])
8760957b409SSimon J. Gerraty + MUL15(a[ 2], a[17])
8770957b409SSimon J. Gerraty + MUL15(a[ 3], a[16])
8780957b409SSimon J. Gerraty + MUL15(a[ 4], a[15])
8790957b409SSimon J. Gerraty + MUL15(a[ 5], a[14])
8800957b409SSimon J. Gerraty + MUL15(a[ 6], a[13])
8810957b409SSimon J. Gerraty + MUL15(a[ 7], a[12])
8820957b409SSimon J. Gerraty + MUL15(a[ 8], a[11])
8830957b409SSimon J. Gerraty + MUL15(a[ 9], a[10])) << 1);
8840957b409SSimon J. Gerraty t[20] = MUL15(a[10], a[10])
8850957b409SSimon J. Gerraty + ((MUL15(a[ 1], a[19])
8860957b409SSimon J. Gerraty + MUL15(a[ 2], a[18])
8870957b409SSimon J. Gerraty + MUL15(a[ 3], a[17])
8880957b409SSimon J. Gerraty + MUL15(a[ 4], a[16])
8890957b409SSimon J. Gerraty + MUL15(a[ 5], a[15])
8900957b409SSimon J. Gerraty + MUL15(a[ 6], a[14])
8910957b409SSimon J. Gerraty + MUL15(a[ 7], a[13])
8920957b409SSimon J. Gerraty + MUL15(a[ 8], a[12])
8930957b409SSimon J. Gerraty + MUL15(a[ 9], a[11])) << 1);
8940957b409SSimon J. Gerraty t[21] = ((MUL15(a[ 2], a[19])
8950957b409SSimon J. Gerraty + MUL15(a[ 3], a[18])
8960957b409SSimon J. Gerraty + MUL15(a[ 4], a[17])
8970957b409SSimon J. Gerraty + MUL15(a[ 5], a[16])
8980957b409SSimon J. Gerraty + MUL15(a[ 6], a[15])
8990957b409SSimon J. Gerraty + MUL15(a[ 7], a[14])
9000957b409SSimon J. Gerraty + MUL15(a[ 8], a[13])
9010957b409SSimon J. Gerraty + MUL15(a[ 9], a[12])
9020957b409SSimon J. Gerraty + MUL15(a[10], a[11])) << 1);
9030957b409SSimon J. Gerraty t[22] = MUL15(a[11], a[11])
9040957b409SSimon J. Gerraty + ((MUL15(a[ 3], a[19])
9050957b409SSimon J. Gerraty + MUL15(a[ 4], a[18])
9060957b409SSimon J. Gerraty + MUL15(a[ 5], a[17])
9070957b409SSimon J. Gerraty + MUL15(a[ 6], a[16])
9080957b409SSimon J. Gerraty + MUL15(a[ 7], a[15])
9090957b409SSimon J. Gerraty + MUL15(a[ 8], a[14])
9100957b409SSimon J. Gerraty + MUL15(a[ 9], a[13])
9110957b409SSimon J. Gerraty + MUL15(a[10], a[12])) << 1);
9120957b409SSimon J. Gerraty t[23] = ((MUL15(a[ 4], a[19])
9130957b409SSimon J. Gerraty + MUL15(a[ 5], a[18])
9140957b409SSimon J. Gerraty + MUL15(a[ 6], a[17])
9150957b409SSimon J. Gerraty + MUL15(a[ 7], a[16])
9160957b409SSimon J. Gerraty + MUL15(a[ 8], a[15])
9170957b409SSimon J. Gerraty + MUL15(a[ 9], a[14])
9180957b409SSimon J. Gerraty + MUL15(a[10], a[13])
9190957b409SSimon J. Gerraty + MUL15(a[11], a[12])) << 1);
9200957b409SSimon J. Gerraty t[24] = MUL15(a[12], a[12])
9210957b409SSimon J. Gerraty + ((MUL15(a[ 5], a[19])
9220957b409SSimon J. Gerraty + MUL15(a[ 6], a[18])
9230957b409SSimon J. Gerraty + MUL15(a[ 7], a[17])
9240957b409SSimon J. Gerraty + MUL15(a[ 8], a[16])
9250957b409SSimon J. Gerraty + MUL15(a[ 9], a[15])
9260957b409SSimon J. Gerraty + MUL15(a[10], a[14])
9270957b409SSimon J. Gerraty + MUL15(a[11], a[13])) << 1);
9280957b409SSimon J. Gerraty t[25] = ((MUL15(a[ 6], a[19])
9290957b409SSimon J. Gerraty + MUL15(a[ 7], a[18])
9300957b409SSimon J. Gerraty + MUL15(a[ 8], a[17])
9310957b409SSimon J. Gerraty + MUL15(a[ 9], a[16])
9320957b409SSimon J. Gerraty + MUL15(a[10], a[15])
9330957b409SSimon J. Gerraty + MUL15(a[11], a[14])
9340957b409SSimon J. Gerraty + MUL15(a[12], a[13])) << 1);
9350957b409SSimon J. Gerraty t[26] = MUL15(a[13], a[13])
9360957b409SSimon J. Gerraty + ((MUL15(a[ 7], a[19])
9370957b409SSimon J. Gerraty + MUL15(a[ 8], a[18])
9380957b409SSimon J. Gerraty + MUL15(a[ 9], a[17])
9390957b409SSimon J. Gerraty + MUL15(a[10], a[16])
9400957b409SSimon J. Gerraty + MUL15(a[11], a[15])
9410957b409SSimon J. Gerraty + MUL15(a[12], a[14])) << 1);
9420957b409SSimon J. Gerraty t[27] = ((MUL15(a[ 8], a[19])
9430957b409SSimon J. Gerraty + MUL15(a[ 9], a[18])
9440957b409SSimon J. Gerraty + MUL15(a[10], a[17])
9450957b409SSimon J. Gerraty + MUL15(a[11], a[16])
9460957b409SSimon J. Gerraty + MUL15(a[12], a[15])
9470957b409SSimon J. Gerraty + MUL15(a[13], a[14])) << 1);
9480957b409SSimon J. Gerraty t[28] = MUL15(a[14], a[14])
9490957b409SSimon J. Gerraty + ((MUL15(a[ 9], a[19])
9500957b409SSimon J. Gerraty + MUL15(a[10], a[18])
9510957b409SSimon J. Gerraty + MUL15(a[11], a[17])
9520957b409SSimon J. Gerraty + MUL15(a[12], a[16])
9530957b409SSimon J. Gerraty + MUL15(a[13], a[15])) << 1);
9540957b409SSimon J. Gerraty t[29] = ((MUL15(a[10], a[19])
9550957b409SSimon J. Gerraty + MUL15(a[11], a[18])
9560957b409SSimon J. Gerraty + MUL15(a[12], a[17])
9570957b409SSimon J. Gerraty + MUL15(a[13], a[16])
9580957b409SSimon J. Gerraty + MUL15(a[14], a[15])) << 1);
9590957b409SSimon J. Gerraty t[30] = MUL15(a[15], a[15])
9600957b409SSimon J. Gerraty + ((MUL15(a[11], a[19])
9610957b409SSimon J. Gerraty + MUL15(a[12], a[18])
9620957b409SSimon J. Gerraty + MUL15(a[13], a[17])
9630957b409SSimon J. Gerraty + MUL15(a[14], a[16])) << 1);
9640957b409SSimon J. Gerraty t[31] = ((MUL15(a[12], a[19])
9650957b409SSimon J. Gerraty + MUL15(a[13], a[18])
9660957b409SSimon J. Gerraty + MUL15(a[14], a[17])
9670957b409SSimon J. Gerraty + MUL15(a[15], a[16])) << 1);
9680957b409SSimon J. Gerraty t[32] = MUL15(a[16], a[16])
9690957b409SSimon J. Gerraty + ((MUL15(a[13], a[19])
9700957b409SSimon J. Gerraty + MUL15(a[14], a[18])
9710957b409SSimon J. Gerraty + MUL15(a[15], a[17])) << 1);
9720957b409SSimon J. Gerraty t[33] = ((MUL15(a[14], a[19])
9730957b409SSimon J. Gerraty + MUL15(a[15], a[18])
9740957b409SSimon J. Gerraty + MUL15(a[16], a[17])) << 1);
9750957b409SSimon J. Gerraty t[34] = MUL15(a[17], a[17])
9760957b409SSimon J. Gerraty + ((MUL15(a[15], a[19])
9770957b409SSimon J. Gerraty + MUL15(a[16], a[18])) << 1);
9780957b409SSimon J. Gerraty t[35] = ((MUL15(a[16], a[19])
9790957b409SSimon J. Gerraty + MUL15(a[17], a[18])) << 1);
9800957b409SSimon J. Gerraty t[36] = MUL15(a[18], a[18])
9810957b409SSimon J. Gerraty + ((MUL15(a[17], a[19])) << 1);
9820957b409SSimon J. Gerraty t[37] = ((MUL15(a[18], a[19])) << 1);
9830957b409SSimon J. Gerraty t[38] = MUL15(a[19], a[19]);
9840957b409SSimon J. Gerraty d[39] = norm13(d, t, 39);
9850957b409SSimon J. Gerraty }
9860957b409SSimon J. Gerraty
9870957b409SSimon J. Gerraty #endif
9880957b409SSimon J. Gerraty
9890957b409SSimon J. Gerraty /*
9900957b409SSimon J. Gerraty * Modulus for field F256 (field for point coordinates in curve P-256).
9910957b409SSimon J. Gerraty */
9920957b409SSimon J. Gerraty static const uint32_t F256[] = {
9930957b409SSimon J. Gerraty 0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x001F,
9940957b409SSimon J. Gerraty 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0400, 0x0000,
9950957b409SSimon J. Gerraty 0x0000, 0x1FF8, 0x1FFF, 0x01FF
9960957b409SSimon J. Gerraty };
9970957b409SSimon J. Gerraty
9980957b409SSimon J. Gerraty /*
9990957b409SSimon J. Gerraty * The 'b' curve equation coefficient for P-256.
10000957b409SSimon J. Gerraty */
10010957b409SSimon J. Gerraty static const uint32_t P256_B[] = {
10020957b409SSimon J. Gerraty 0x004B, 0x1E93, 0x0F89, 0x1C78, 0x03BC, 0x187B, 0x114E, 0x1619,
10030957b409SSimon J. Gerraty 0x1D06, 0x0328, 0x01AF, 0x0D31, 0x1557, 0x15DE, 0x1ECF, 0x127C,
10040957b409SSimon J. Gerraty 0x0A3A, 0x0EC5, 0x118D, 0x00B5
10050957b409SSimon J. Gerraty };
10060957b409SSimon J. Gerraty
10070957b409SSimon J. Gerraty /*
10080957b409SSimon J. Gerraty * Perform a "short reduction" in field F256 (field for curve P-256).
10090957b409SSimon J. Gerraty * The source value should be less than 262 bits; on output, it will
10100957b409SSimon J. Gerraty * be at most 257 bits, and less than twice the modulus.
10110957b409SSimon J. Gerraty */
10120957b409SSimon J. Gerraty static void
reduce_f256(uint32_t * d)10130957b409SSimon J. Gerraty reduce_f256(uint32_t *d)
10140957b409SSimon J. Gerraty {
10150957b409SSimon J. Gerraty uint32_t x;
10160957b409SSimon J. Gerraty
10170957b409SSimon J. Gerraty x = d[19] >> 9;
10180957b409SSimon J. Gerraty d[19] &= 0x01FF;
10190957b409SSimon J. Gerraty d[17] += x << 3;
10200957b409SSimon J. Gerraty d[14] -= x << 10;
10210957b409SSimon J. Gerraty d[7] -= x << 5;
10220957b409SSimon J. Gerraty d[0] += x;
10230957b409SSimon J. Gerraty norm13(d, d, 20);
10240957b409SSimon J. Gerraty }
10250957b409SSimon J. Gerraty
10260957b409SSimon J. Gerraty /*
10270957b409SSimon J. Gerraty * Perform a "final reduction" in field F256 (field for curve P-256).
10280957b409SSimon J. Gerraty * The source value must be less than twice the modulus. If the value
10290957b409SSimon J. Gerraty * is not lower than the modulus, then the modulus is subtracted and
10300957b409SSimon J. Gerraty * this function returns 1; otherwise, it leaves it untouched and it
10310957b409SSimon J. Gerraty * returns 0.
10320957b409SSimon J. Gerraty */
10330957b409SSimon J. Gerraty static uint32_t
reduce_final_f256(uint32_t * d)10340957b409SSimon J. Gerraty reduce_final_f256(uint32_t *d)
10350957b409SSimon J. Gerraty {
10360957b409SSimon J. Gerraty uint32_t t[20];
10370957b409SSimon J. Gerraty uint32_t cc;
10380957b409SSimon J. Gerraty int i;
10390957b409SSimon J. Gerraty
10400957b409SSimon J. Gerraty memcpy(t, d, sizeof t);
10410957b409SSimon J. Gerraty cc = 0;
10420957b409SSimon J. Gerraty for (i = 0; i < 20; i ++) {
10430957b409SSimon J. Gerraty uint32_t w;
10440957b409SSimon J. Gerraty
10450957b409SSimon J. Gerraty w = t[i] - F256[i] - cc;
10460957b409SSimon J. Gerraty cc = w >> 31;
10470957b409SSimon J. Gerraty t[i] = w & 0x1FFF;
10480957b409SSimon J. Gerraty }
10490957b409SSimon J. Gerraty cc ^= 1;
10500957b409SSimon J. Gerraty CCOPY(cc, d, t, sizeof t);
10510957b409SSimon J. Gerraty return cc;
10520957b409SSimon J. Gerraty }
10530957b409SSimon J. Gerraty
10540957b409SSimon J. Gerraty /*
10550957b409SSimon J. Gerraty * Perform a multiplication of two integers modulo
10560957b409SSimon J. Gerraty * 2^256-2^224+2^192+2^96-1 (for NIST curve P-256). Operands are arrays
10570957b409SSimon J. Gerraty * of 20 words, each containing 13 bits of data, in little-endian order.
10580957b409SSimon J. Gerraty * On input, upper word may be up to 13 bits (hence value up to 2^260-1);
10590957b409SSimon J. Gerraty * on output, value fits on 257 bits and is lower than twice the modulus.
10600957b409SSimon J. Gerraty */
10610957b409SSimon J. Gerraty static void
mul_f256(uint32_t * d,const uint32_t * a,const uint32_t * b)10620957b409SSimon J. Gerraty mul_f256(uint32_t *d, const uint32_t *a, const uint32_t *b)
10630957b409SSimon J. Gerraty {
10640957b409SSimon J. Gerraty uint32_t t[40], cc;
10650957b409SSimon J. Gerraty int i;
10660957b409SSimon J. Gerraty
10670957b409SSimon J. Gerraty /*
10680957b409SSimon J. Gerraty * Compute raw multiplication. All result words fit in 13 bits
10690957b409SSimon J. Gerraty * each.
10700957b409SSimon J. Gerraty */
10710957b409SSimon J. Gerraty mul20(t, a, b);
10720957b409SSimon J. Gerraty
10730957b409SSimon J. Gerraty /*
10740957b409SSimon J. Gerraty * Modular reduction: each high word in added/subtracted where
10750957b409SSimon J. Gerraty * necessary.
10760957b409SSimon J. Gerraty *
10770957b409SSimon J. Gerraty * The modulus is:
10780957b409SSimon J. Gerraty * p = 2^256 - 2^224 + 2^192 + 2^96 - 1
10790957b409SSimon J. Gerraty * Therefore:
10800957b409SSimon J. Gerraty * 2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
10810957b409SSimon J. Gerraty *
10820957b409SSimon J. Gerraty * For a word x at bit offset n (n >= 256), we have:
10830957b409SSimon J. Gerraty * x*2^n = x*2^(n-32) - x*2^(n-64)
10840957b409SSimon J. Gerraty * - x*2^(n - 160) + x*2^(n-256) mod p
10850957b409SSimon J. Gerraty *
10860957b409SSimon J. Gerraty * Thus, we can nullify the high word if we reinject it at some
10870957b409SSimon J. Gerraty * proper emplacements.
10880957b409SSimon J. Gerraty */
10890957b409SSimon J. Gerraty for (i = 39; i >= 20; i --) {
10900957b409SSimon J. Gerraty uint32_t x;
10910957b409SSimon J. Gerraty
10920957b409SSimon J. Gerraty x = t[i];
10930957b409SSimon J. Gerraty t[i - 2] += ARSH(x, 6);
10940957b409SSimon J. Gerraty t[i - 3] += (x << 7) & 0x1FFF;
10950957b409SSimon J. Gerraty t[i - 4] -= ARSH(x, 12);
10960957b409SSimon J. Gerraty t[i - 5] -= (x << 1) & 0x1FFF;
10970957b409SSimon J. Gerraty t[i - 12] -= ARSH(x, 4);
10980957b409SSimon J. Gerraty t[i - 13] -= (x << 9) & 0x1FFF;
10990957b409SSimon J. Gerraty t[i - 19] += ARSH(x, 9);
11000957b409SSimon J. Gerraty t[i - 20] += (x << 4) & 0x1FFF;
11010957b409SSimon J. Gerraty }
11020957b409SSimon J. Gerraty
11030957b409SSimon J. Gerraty /*
11040957b409SSimon J. Gerraty * Propagate carries. This is a signed propagation, and the
11050957b409SSimon J. Gerraty * result may be negative. The loop above may enlarge values,
11060957b409SSimon J. Gerraty * but not two much: worst case is the chain involving t[i - 3],
11070957b409SSimon J. Gerraty * in which a value may be added to itself up to 7 times. Since
11080957b409SSimon J. Gerraty * starting values are 13-bit each, all words fit on 20 bits
11090957b409SSimon J. Gerraty * (21 to account for the sign bit).
11100957b409SSimon J. Gerraty */
11110957b409SSimon J. Gerraty cc = norm13(t, t, 20);
11120957b409SSimon J. Gerraty
11130957b409SSimon J. Gerraty /*
11140957b409SSimon J. Gerraty * Perform modular reduction again for the bits beyond 256 (the carry
11150957b409SSimon J. Gerraty * and the bits 256..259). Since the largest shift below is by 10
11160957b409SSimon J. Gerraty * bits, and the values fit on 21 bits, values fit in 32-bit words,
11170957b409SSimon J. Gerraty * thereby allowing injecting full word values.
11180957b409SSimon J. Gerraty */
11190957b409SSimon J. Gerraty cc = (cc << 4) | (t[19] >> 9);
11200957b409SSimon J. Gerraty t[19] &= 0x01FF;
11210957b409SSimon J. Gerraty t[17] += cc << 3;
11220957b409SSimon J. Gerraty t[14] -= cc << 10;
11230957b409SSimon J. Gerraty t[7] -= cc << 5;
11240957b409SSimon J. Gerraty t[0] += cc;
11250957b409SSimon J. Gerraty
11260957b409SSimon J. Gerraty /*
11270957b409SSimon J. Gerraty * If the carry is negative, then after carry propagation, we may
11280957b409SSimon J. Gerraty * end up with a value which is negative, and we don't want that.
11290957b409SSimon J. Gerraty * Thus, in that case, we add the modulus. Note that the subtraction
11300957b409SSimon J. Gerraty * result, when the carry is negative, is always smaller than the
11310957b409SSimon J. Gerraty * modulus, so the extra addition will not make the value exceed
11320957b409SSimon J. Gerraty * twice the modulus.
11330957b409SSimon J. Gerraty */
11340957b409SSimon J. Gerraty cc >>= 31;
11350957b409SSimon J. Gerraty t[0] -= cc;
11360957b409SSimon J. Gerraty t[7] += cc << 5;
11370957b409SSimon J. Gerraty t[14] += cc << 10;
11380957b409SSimon J. Gerraty t[17] -= cc << 3;
11390957b409SSimon J. Gerraty t[19] += cc << 9;
11400957b409SSimon J. Gerraty
11410957b409SSimon J. Gerraty norm13(d, t, 20);
11420957b409SSimon J. Gerraty }
11430957b409SSimon J. Gerraty
11440957b409SSimon J. Gerraty /*
11450957b409SSimon J. Gerraty * Square an integer modulo 2^256-2^224+2^192+2^96-1 (for NIST curve
11460957b409SSimon J. Gerraty * P-256). Operand is an array of 20 words, each containing 13 bits of
11470957b409SSimon J. Gerraty * data, in little-endian order. On input, upper word may be up to 13
11480957b409SSimon J. Gerraty * bits (hence value up to 2^260-1); on output, value fits on 257 bits
11490957b409SSimon J. Gerraty * and is lower than twice the modulus.
11500957b409SSimon J. Gerraty */
11510957b409SSimon J. Gerraty static void
square_f256(uint32_t * d,const uint32_t * a)11520957b409SSimon J. Gerraty square_f256(uint32_t *d, const uint32_t *a)
11530957b409SSimon J. Gerraty {
11540957b409SSimon J. Gerraty uint32_t t[40], cc;
11550957b409SSimon J. Gerraty int i;
11560957b409SSimon J. Gerraty
11570957b409SSimon J. Gerraty /*
11580957b409SSimon J. Gerraty * Compute raw square. All result words fit in 13 bits each.
11590957b409SSimon J. Gerraty */
11600957b409SSimon J. Gerraty square20(t, a);
11610957b409SSimon J. Gerraty
11620957b409SSimon J. Gerraty /*
11630957b409SSimon J. Gerraty * Modular reduction: each high word in added/subtracted where
11640957b409SSimon J. Gerraty * necessary.
11650957b409SSimon J. Gerraty *
11660957b409SSimon J. Gerraty * The modulus is:
11670957b409SSimon J. Gerraty * p = 2^256 - 2^224 + 2^192 + 2^96 - 1
11680957b409SSimon J. Gerraty * Therefore:
11690957b409SSimon J. Gerraty * 2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
11700957b409SSimon J. Gerraty *
11710957b409SSimon J. Gerraty * For a word x at bit offset n (n >= 256), we have:
11720957b409SSimon J. Gerraty * x*2^n = x*2^(n-32) - x*2^(n-64)
11730957b409SSimon J. Gerraty * - x*2^(n - 160) + x*2^(n-256) mod p
11740957b409SSimon J. Gerraty *
11750957b409SSimon J. Gerraty * Thus, we can nullify the high word if we reinject it at some
11760957b409SSimon J. Gerraty * proper emplacements.
11770957b409SSimon J. Gerraty */
11780957b409SSimon J. Gerraty for (i = 39; i >= 20; i --) {
11790957b409SSimon J. Gerraty uint32_t x;
11800957b409SSimon J. Gerraty
11810957b409SSimon J. Gerraty x = t[i];
11820957b409SSimon J. Gerraty t[i - 2] += ARSH(x, 6);
11830957b409SSimon J. Gerraty t[i - 3] += (x << 7) & 0x1FFF;
11840957b409SSimon J. Gerraty t[i - 4] -= ARSH(x, 12);
11850957b409SSimon J. Gerraty t[i - 5] -= (x << 1) & 0x1FFF;
11860957b409SSimon J. Gerraty t[i - 12] -= ARSH(x, 4);
11870957b409SSimon J. Gerraty t[i - 13] -= (x << 9) & 0x1FFF;
11880957b409SSimon J. Gerraty t[i - 19] += ARSH(x, 9);
11890957b409SSimon J. Gerraty t[i - 20] += (x << 4) & 0x1FFF;
11900957b409SSimon J. Gerraty }
11910957b409SSimon J. Gerraty
11920957b409SSimon J. Gerraty /*
11930957b409SSimon J. Gerraty * Propagate carries. This is a signed propagation, and the
11940957b409SSimon J. Gerraty * result may be negative. The loop above may enlarge values,
11950957b409SSimon J. Gerraty * but not two much: worst case is the chain involving t[i - 3],
11960957b409SSimon J. Gerraty * in which a value may be added to itself up to 7 times. Since
11970957b409SSimon J. Gerraty * starting values are 13-bit each, all words fit on 20 bits
11980957b409SSimon J. Gerraty * (21 to account for the sign bit).
11990957b409SSimon J. Gerraty */
12000957b409SSimon J. Gerraty cc = norm13(t, t, 20);
12010957b409SSimon J. Gerraty
12020957b409SSimon J. Gerraty /*
12030957b409SSimon J. Gerraty * Perform modular reduction again for the bits beyond 256 (the carry
12040957b409SSimon J. Gerraty * and the bits 256..259). Since the largest shift below is by 10
12050957b409SSimon J. Gerraty * bits, and the values fit on 21 bits, values fit in 32-bit words,
12060957b409SSimon J. Gerraty * thereby allowing injecting full word values.
12070957b409SSimon J. Gerraty */
12080957b409SSimon J. Gerraty cc = (cc << 4) | (t[19] >> 9);
12090957b409SSimon J. Gerraty t[19] &= 0x01FF;
12100957b409SSimon J. Gerraty t[17] += cc << 3;
12110957b409SSimon J. Gerraty t[14] -= cc << 10;
12120957b409SSimon J. Gerraty t[7] -= cc << 5;
12130957b409SSimon J. Gerraty t[0] += cc;
12140957b409SSimon J. Gerraty
12150957b409SSimon J. Gerraty /*
12160957b409SSimon J. Gerraty * If the carry is negative, then after carry propagation, we may
12170957b409SSimon J. Gerraty * end up with a value which is negative, and we don't want that.
12180957b409SSimon J. Gerraty * Thus, in that case, we add the modulus. Note that the subtraction
12190957b409SSimon J. Gerraty * result, when the carry is negative, is always smaller than the
12200957b409SSimon J. Gerraty * modulus, so the extra addition will not make the value exceed
12210957b409SSimon J. Gerraty * twice the modulus.
12220957b409SSimon J. Gerraty */
12230957b409SSimon J. Gerraty cc >>= 31;
12240957b409SSimon J. Gerraty t[0] -= cc;
12250957b409SSimon J. Gerraty t[7] += cc << 5;
12260957b409SSimon J. Gerraty t[14] += cc << 10;
12270957b409SSimon J. Gerraty t[17] -= cc << 3;
12280957b409SSimon J. Gerraty t[19] += cc << 9;
12290957b409SSimon J. Gerraty
12300957b409SSimon J. Gerraty norm13(d, t, 20);
12310957b409SSimon J. Gerraty }
12320957b409SSimon J. Gerraty
12330957b409SSimon J. Gerraty /*
12340957b409SSimon J. Gerraty * Jacobian coordinates for a point in P-256: affine coordinates (X,Y)
12350957b409SSimon J. Gerraty * are such that:
12360957b409SSimon J. Gerraty * X = x / z^2
12370957b409SSimon J. Gerraty * Y = y / z^3
12380957b409SSimon J. Gerraty * For the point at infinity, z = 0.
12390957b409SSimon J. Gerraty * Each point thus admits many possible representations.
12400957b409SSimon J. Gerraty *
12410957b409SSimon J. Gerraty * Coordinates are represented in arrays of 32-bit integers, each holding
12420957b409SSimon J. Gerraty * 13 bits of data. Values may also be slightly greater than the modulus,
12430957b409SSimon J. Gerraty * but they will always be lower than twice the modulus.
12440957b409SSimon J. Gerraty */
12450957b409SSimon J. Gerraty typedef struct {
12460957b409SSimon J. Gerraty uint32_t x[20];
12470957b409SSimon J. Gerraty uint32_t y[20];
12480957b409SSimon J. Gerraty uint32_t z[20];
12490957b409SSimon J. Gerraty } p256_jacobian;
12500957b409SSimon J. Gerraty
12510957b409SSimon J. Gerraty /*
12520957b409SSimon J. Gerraty * Convert a point to affine coordinates:
12530957b409SSimon J. Gerraty * - If the point is the point at infinity, then all three coordinates
12540957b409SSimon J. Gerraty * are set to 0.
12550957b409SSimon J. Gerraty * - Otherwise, the 'z' coordinate is set to 1, and the 'x' and 'y'
12560957b409SSimon J. Gerraty * coordinates are the 'X' and 'Y' affine coordinates.
12570957b409SSimon J. Gerraty * The coordinates are guaranteed to be lower than the modulus.
12580957b409SSimon J. Gerraty */
12590957b409SSimon J. Gerraty static void
p256_to_affine(p256_jacobian * P)12600957b409SSimon J. Gerraty p256_to_affine(p256_jacobian *P)
12610957b409SSimon J. Gerraty {
12620957b409SSimon J. Gerraty uint32_t t1[20], t2[20];
12630957b409SSimon J. Gerraty int i;
12640957b409SSimon J. Gerraty
12650957b409SSimon J. Gerraty /*
12660957b409SSimon J. Gerraty * Invert z with a modular exponentiation: the modulus is
12670957b409SSimon J. Gerraty * p = 2^256 - 2^224 + 2^192 + 2^96 - 1, and the exponent is
12680957b409SSimon J. Gerraty * p-2. Exponent bit pattern (from high to low) is:
12690957b409SSimon J. Gerraty * - 32 bits of value 1
12700957b409SSimon J. Gerraty * - 31 bits of value 0
12710957b409SSimon J. Gerraty * - 1 bit of value 1
12720957b409SSimon J. Gerraty * - 96 bits of value 0
12730957b409SSimon J. Gerraty * - 94 bits of value 1
12740957b409SSimon J. Gerraty * - 1 bit of value 0
12750957b409SSimon J. Gerraty * - 1 bit of value 1
12760957b409SSimon J. Gerraty * Thus, we precompute z^(2^31-1) to speed things up.
12770957b409SSimon J. Gerraty *
12780957b409SSimon J. Gerraty * If z = 0 (point at infinity) then the modular exponentiation
12790957b409SSimon J. Gerraty * will yield 0, which leads to the expected result (all three
12800957b409SSimon J. Gerraty * coordinates set to 0).
12810957b409SSimon J. Gerraty */
12820957b409SSimon J. Gerraty
12830957b409SSimon J. Gerraty /*
12840957b409SSimon J. Gerraty * A simple square-and-multiply for z^(2^31-1). We could save about
12850957b409SSimon J. Gerraty * two dozen multiplications here with an addition chain, but
12860957b409SSimon J. Gerraty * this would require a bit more code, and extra stack buffers.
12870957b409SSimon J. Gerraty */
12880957b409SSimon J. Gerraty memcpy(t1, P->z, sizeof P->z);
12890957b409SSimon J. Gerraty for (i = 0; i < 30; i ++) {
12900957b409SSimon J. Gerraty square_f256(t1, t1);
12910957b409SSimon J. Gerraty mul_f256(t1, t1, P->z);
12920957b409SSimon J. Gerraty }
12930957b409SSimon J. Gerraty
12940957b409SSimon J. Gerraty /*
12950957b409SSimon J. Gerraty * Square-and-multiply. Apart from the squarings, we have a few
12960957b409SSimon J. Gerraty * multiplications to set bits to 1; we multiply by the original z
12970957b409SSimon J. Gerraty * for setting 1 bit, and by t1 for setting 31 bits.
12980957b409SSimon J. Gerraty */
12990957b409SSimon J. Gerraty memcpy(t2, P->z, sizeof P->z);
13000957b409SSimon J. Gerraty for (i = 1; i < 256; i ++) {
13010957b409SSimon J. Gerraty square_f256(t2, t2);
13020957b409SSimon J. Gerraty switch (i) {
13030957b409SSimon J. Gerraty case 31:
13040957b409SSimon J. Gerraty case 190:
13050957b409SSimon J. Gerraty case 221:
13060957b409SSimon J. Gerraty case 252:
13070957b409SSimon J. Gerraty mul_f256(t2, t2, t1);
13080957b409SSimon J. Gerraty break;
13090957b409SSimon J. Gerraty case 63:
13100957b409SSimon J. Gerraty case 253:
13110957b409SSimon J. Gerraty case 255:
13120957b409SSimon J. Gerraty mul_f256(t2, t2, P->z);
13130957b409SSimon J. Gerraty break;
13140957b409SSimon J. Gerraty }
13150957b409SSimon J. Gerraty }
13160957b409SSimon J. Gerraty
13170957b409SSimon J. Gerraty /*
13180957b409SSimon J. Gerraty * Now that we have 1/z, multiply x by 1/z^2 and y by 1/z^3.
13190957b409SSimon J. Gerraty */
13200957b409SSimon J. Gerraty mul_f256(t1, t2, t2);
13210957b409SSimon J. Gerraty mul_f256(P->x, t1, P->x);
13220957b409SSimon J. Gerraty mul_f256(t1, t1, t2);
13230957b409SSimon J. Gerraty mul_f256(P->y, t1, P->y);
13240957b409SSimon J. Gerraty reduce_final_f256(P->x);
13250957b409SSimon J. Gerraty reduce_final_f256(P->y);
13260957b409SSimon J. Gerraty
13270957b409SSimon J. Gerraty /*
13280957b409SSimon J. Gerraty * Multiply z by 1/z. If z = 0, then this will yield 0, otherwise
13290957b409SSimon J. Gerraty * this will set z to 1.
13300957b409SSimon J. Gerraty */
13310957b409SSimon J. Gerraty mul_f256(P->z, P->z, t2);
13320957b409SSimon J. Gerraty reduce_final_f256(P->z);
13330957b409SSimon J. Gerraty }
13340957b409SSimon J. Gerraty
13350957b409SSimon J. Gerraty /*
13360957b409SSimon J. Gerraty * Double a point in P-256. This function works for all valid points,
13370957b409SSimon J. Gerraty * including the point at infinity.
13380957b409SSimon J. Gerraty */
13390957b409SSimon J. Gerraty static void
p256_double(p256_jacobian * Q)13400957b409SSimon J. Gerraty p256_double(p256_jacobian *Q)
13410957b409SSimon J. Gerraty {
13420957b409SSimon J. Gerraty /*
13430957b409SSimon J. Gerraty * Doubling formulas are:
13440957b409SSimon J. Gerraty *
13450957b409SSimon J. Gerraty * s = 4*x*y^2
13460957b409SSimon J. Gerraty * m = 3*(x + z^2)*(x - z^2)
13470957b409SSimon J. Gerraty * x' = m^2 - 2*s
13480957b409SSimon J. Gerraty * y' = m*(s - x') - 8*y^4
13490957b409SSimon J. Gerraty * z' = 2*y*z
13500957b409SSimon J. Gerraty *
13510957b409SSimon J. Gerraty * These formulas work for all points, including points of order 2
13520957b409SSimon J. Gerraty * and points at infinity:
13530957b409SSimon J. Gerraty * - If y = 0 then z' = 0. But there is no such point in P-256
13540957b409SSimon J. Gerraty * anyway.
13550957b409SSimon J. Gerraty * - If z = 0 then z' = 0.
13560957b409SSimon J. Gerraty */
13570957b409SSimon J. Gerraty uint32_t t1[20], t2[20], t3[20], t4[20];
13580957b409SSimon J. Gerraty int i;
13590957b409SSimon J. Gerraty
13600957b409SSimon J. Gerraty /*
13610957b409SSimon J. Gerraty * Compute z^2 in t1.
13620957b409SSimon J. Gerraty */
13630957b409SSimon J. Gerraty square_f256(t1, Q->z);
13640957b409SSimon J. Gerraty
13650957b409SSimon J. Gerraty /*
13660957b409SSimon J. Gerraty * Compute x-z^2 in t2 and x+z^2 in t1.
13670957b409SSimon J. Gerraty */
13680957b409SSimon J. Gerraty for (i = 0; i < 20; i ++) {
13690957b409SSimon J. Gerraty t2[i] = (F256[i] << 1) + Q->x[i] - t1[i];
13700957b409SSimon J. Gerraty t1[i] += Q->x[i];
13710957b409SSimon J. Gerraty }
13720957b409SSimon J. Gerraty norm13(t1, t1, 20);
13730957b409SSimon J. Gerraty norm13(t2, t2, 20);
13740957b409SSimon J. Gerraty
13750957b409SSimon J. Gerraty /*
13760957b409SSimon J. Gerraty * Compute 3*(x+z^2)*(x-z^2) in t1.
13770957b409SSimon J. Gerraty */
13780957b409SSimon J. Gerraty mul_f256(t3, t1, t2);
13790957b409SSimon J. Gerraty for (i = 0; i < 20; i ++) {
13800957b409SSimon J. Gerraty t1[i] = MUL15(3, t3[i]);
13810957b409SSimon J. Gerraty }
13820957b409SSimon J. Gerraty norm13(t1, t1, 20);
13830957b409SSimon J. Gerraty
13840957b409SSimon J. Gerraty /*
13850957b409SSimon J. Gerraty * Compute 4*x*y^2 (in t2) and 2*y^2 (in t3).
13860957b409SSimon J. Gerraty */
13870957b409SSimon J. Gerraty square_f256(t3, Q->y);
13880957b409SSimon J. Gerraty for (i = 0; i < 20; i ++) {
13890957b409SSimon J. Gerraty t3[i] <<= 1;
13900957b409SSimon J. Gerraty }
13910957b409SSimon J. Gerraty norm13(t3, t3, 20);
13920957b409SSimon J. Gerraty mul_f256(t2, Q->x, t3);
13930957b409SSimon J. Gerraty for (i = 0; i < 20; i ++) {
13940957b409SSimon J. Gerraty t2[i] <<= 1;
13950957b409SSimon J. Gerraty }
13960957b409SSimon J. Gerraty norm13(t2, t2, 20);
13970957b409SSimon J. Gerraty reduce_f256(t2);
13980957b409SSimon J. Gerraty
13990957b409SSimon J. Gerraty /*
14000957b409SSimon J. Gerraty * Compute x' = m^2 - 2*s.
14010957b409SSimon J. Gerraty */
14020957b409SSimon J. Gerraty square_f256(Q->x, t1);
14030957b409SSimon J. Gerraty for (i = 0; i < 20; i ++) {
14040957b409SSimon J. Gerraty Q->x[i] += (F256[i] << 2) - (t2[i] << 1);
14050957b409SSimon J. Gerraty }
14060957b409SSimon J. Gerraty norm13(Q->x, Q->x, 20);
14070957b409SSimon J. Gerraty reduce_f256(Q->x);
14080957b409SSimon J. Gerraty
14090957b409SSimon J. Gerraty /*
14100957b409SSimon J. Gerraty * Compute z' = 2*y*z.
14110957b409SSimon J. Gerraty */
14120957b409SSimon J. Gerraty mul_f256(t4, Q->y, Q->z);
14130957b409SSimon J. Gerraty for (i = 0; i < 20; i ++) {
14140957b409SSimon J. Gerraty Q->z[i] = t4[i] << 1;
14150957b409SSimon J. Gerraty }
14160957b409SSimon J. Gerraty norm13(Q->z, Q->z, 20);
14170957b409SSimon J. Gerraty reduce_f256(Q->z);
14180957b409SSimon J. Gerraty
14190957b409SSimon J. Gerraty /*
14200957b409SSimon J. Gerraty * Compute y' = m*(s - x') - 8*y^4. Note that we already have
14210957b409SSimon J. Gerraty * 2*y^2 in t3.
14220957b409SSimon J. Gerraty */
14230957b409SSimon J. Gerraty for (i = 0; i < 20; i ++) {
14240957b409SSimon J. Gerraty t2[i] += (F256[i] << 1) - Q->x[i];
14250957b409SSimon J. Gerraty }
14260957b409SSimon J. Gerraty norm13(t2, t2, 20);
14270957b409SSimon J. Gerraty mul_f256(Q->y, t1, t2);
14280957b409SSimon J. Gerraty square_f256(t4, t3);
14290957b409SSimon J. Gerraty for (i = 0; i < 20; i ++) {
14300957b409SSimon J. Gerraty Q->y[i] += (F256[i] << 2) - (t4[i] << 1);
14310957b409SSimon J. Gerraty }
14320957b409SSimon J. Gerraty norm13(Q->y, Q->y, 20);
14330957b409SSimon J. Gerraty reduce_f256(Q->y);
14340957b409SSimon J. Gerraty }
14350957b409SSimon J. Gerraty
14360957b409SSimon J. Gerraty /*
14370957b409SSimon J. Gerraty * Add point P2 to point P1.
14380957b409SSimon J. Gerraty *
14390957b409SSimon J. Gerraty * This function computes the wrong result in the following cases:
14400957b409SSimon J. Gerraty *
14410957b409SSimon J. Gerraty * - If P1 == 0 but P2 != 0
14420957b409SSimon J. Gerraty * - If P1 != 0 but P2 == 0
14430957b409SSimon J. Gerraty * - If P1 == P2
14440957b409SSimon J. Gerraty *
14450957b409SSimon J. Gerraty * In all three cases, P1 is set to the point at infinity.
14460957b409SSimon J. Gerraty *
14470957b409SSimon J. Gerraty * Returned value is 0 if one of the following occurs:
14480957b409SSimon J. Gerraty *
14490957b409SSimon J. Gerraty * - P1 and P2 have the same Y coordinate
14500957b409SSimon J. Gerraty * - P1 == 0 and P2 == 0
14510957b409SSimon J. Gerraty * - The Y coordinate of one of the points is 0 and the other point is
14520957b409SSimon J. Gerraty * the point at infinity.
14530957b409SSimon J. Gerraty *
14540957b409SSimon J. Gerraty * The third case cannot actually happen with valid points, since a point
14550957b409SSimon J. Gerraty * with Y == 0 is a point of order 2, and there is no point of order 2 on
14560957b409SSimon J. Gerraty * curve P-256.
14570957b409SSimon J. Gerraty *
14580957b409SSimon J. Gerraty * Therefore, assuming that P1 != 0 and P2 != 0 on input, then the caller
14590957b409SSimon J. Gerraty * can apply the following:
14600957b409SSimon J. Gerraty *
14610957b409SSimon J. Gerraty * - If the result is not the point at infinity, then it is correct.
14620957b409SSimon J. Gerraty * - Otherwise, if the returned value is 1, then this is a case of
14630957b409SSimon J. Gerraty * P1+P2 == 0, so the result is indeed the point at infinity.
14640957b409SSimon J. Gerraty * - Otherwise, P1 == P2, so a "double" operation should have been
14650957b409SSimon J. Gerraty * performed.
14660957b409SSimon J. Gerraty */
14670957b409SSimon J. Gerraty static uint32_t
p256_add(p256_jacobian * P1,const p256_jacobian * P2)14680957b409SSimon J. Gerraty p256_add(p256_jacobian *P1, const p256_jacobian *P2)
14690957b409SSimon J. Gerraty {
14700957b409SSimon J. Gerraty /*
14710957b409SSimon J. Gerraty * Addtions formulas are:
14720957b409SSimon J. Gerraty *
14730957b409SSimon J. Gerraty * u1 = x1 * z2^2
14740957b409SSimon J. Gerraty * u2 = x2 * z1^2
14750957b409SSimon J. Gerraty * s1 = y1 * z2^3
14760957b409SSimon J. Gerraty * s2 = y2 * z1^3
14770957b409SSimon J. Gerraty * h = u2 - u1
14780957b409SSimon J. Gerraty * r = s2 - s1
14790957b409SSimon J. Gerraty * x3 = r^2 - h^3 - 2 * u1 * h^2
14800957b409SSimon J. Gerraty * y3 = r * (u1 * h^2 - x3) - s1 * h^3
14810957b409SSimon J. Gerraty * z3 = h * z1 * z2
14820957b409SSimon J. Gerraty */
14830957b409SSimon J. Gerraty uint32_t t1[20], t2[20], t3[20], t4[20], t5[20], t6[20], t7[20];
14840957b409SSimon J. Gerraty uint32_t ret;
14850957b409SSimon J. Gerraty int i;
14860957b409SSimon J. Gerraty
14870957b409SSimon J. Gerraty /*
14880957b409SSimon J. Gerraty * Compute u1 = x1*z2^2 (in t1) and s1 = y1*z2^3 (in t3).
14890957b409SSimon J. Gerraty */
14900957b409SSimon J. Gerraty square_f256(t3, P2->z);
14910957b409SSimon J. Gerraty mul_f256(t1, P1->x, t3);
14920957b409SSimon J. Gerraty mul_f256(t4, P2->z, t3);
14930957b409SSimon J. Gerraty mul_f256(t3, P1->y, t4);
14940957b409SSimon J. Gerraty
14950957b409SSimon J. Gerraty /*
14960957b409SSimon J. Gerraty * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
14970957b409SSimon J. Gerraty */
14980957b409SSimon J. Gerraty square_f256(t4, P1->z);
14990957b409SSimon J. Gerraty mul_f256(t2, P2->x, t4);
15000957b409SSimon J. Gerraty mul_f256(t5, P1->z, t4);
15010957b409SSimon J. Gerraty mul_f256(t4, P2->y, t5);
15020957b409SSimon J. Gerraty
15030957b409SSimon J. Gerraty /*
15040957b409SSimon J. Gerraty * Compute h = h2 - u1 (in t2) and r = s2 - s1 (in t4).
15050957b409SSimon J. Gerraty * We need to test whether r is zero, so we will do some extra
15060957b409SSimon J. Gerraty * reduce.
15070957b409SSimon J. Gerraty */
15080957b409SSimon J. Gerraty for (i = 0; i < 20; i ++) {
15090957b409SSimon J. Gerraty t2[i] += (F256[i] << 1) - t1[i];
15100957b409SSimon J. Gerraty t4[i] += (F256[i] << 1) - t3[i];
15110957b409SSimon J. Gerraty }
15120957b409SSimon J. Gerraty norm13(t2, t2, 20);
15130957b409SSimon J. Gerraty norm13(t4, t4, 20);
15140957b409SSimon J. Gerraty reduce_f256(t4);
15150957b409SSimon J. Gerraty reduce_final_f256(t4);
15160957b409SSimon J. Gerraty ret = 0;
15170957b409SSimon J. Gerraty for (i = 0; i < 20; i ++) {
15180957b409SSimon J. Gerraty ret |= t4[i];
15190957b409SSimon J. Gerraty }
15200957b409SSimon J. Gerraty ret = (ret | -ret) >> 31;
15210957b409SSimon J. Gerraty
15220957b409SSimon J. Gerraty /*
15230957b409SSimon J. Gerraty * Compute u1*h^2 (in t6) and h^3 (in t5);
15240957b409SSimon J. Gerraty */
15250957b409SSimon J. Gerraty square_f256(t7, t2);
15260957b409SSimon J. Gerraty mul_f256(t6, t1, t7);
15270957b409SSimon J. Gerraty mul_f256(t5, t7, t2);
15280957b409SSimon J. Gerraty
15290957b409SSimon J. Gerraty /*
15300957b409SSimon J. Gerraty * Compute x3 = r^2 - h^3 - 2*u1*h^2.
15310957b409SSimon J. Gerraty */
15320957b409SSimon J. Gerraty square_f256(P1->x, t4);
15330957b409SSimon J. Gerraty for (i = 0; i < 20; i ++) {
15340957b409SSimon J. Gerraty P1->x[i] += (F256[i] << 3) - t5[i] - (t6[i] << 1);
15350957b409SSimon J. Gerraty }
15360957b409SSimon J. Gerraty norm13(P1->x, P1->x, 20);
15370957b409SSimon J. Gerraty reduce_f256(P1->x);
15380957b409SSimon J. Gerraty
15390957b409SSimon J. Gerraty /*
15400957b409SSimon J. Gerraty * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
15410957b409SSimon J. Gerraty */
15420957b409SSimon J. Gerraty for (i = 0; i < 20; i ++) {
15430957b409SSimon J. Gerraty t6[i] += (F256[i] << 1) - P1->x[i];
15440957b409SSimon J. Gerraty }
15450957b409SSimon J. Gerraty norm13(t6, t6, 20);
15460957b409SSimon J. Gerraty mul_f256(P1->y, t4, t6);
15470957b409SSimon J. Gerraty mul_f256(t1, t5, t3);
15480957b409SSimon J. Gerraty for (i = 0; i < 20; i ++) {
15490957b409SSimon J. Gerraty P1->y[i] += (F256[i] << 1) - t1[i];
15500957b409SSimon J. Gerraty }
15510957b409SSimon J. Gerraty norm13(P1->y, P1->y, 20);
15520957b409SSimon J. Gerraty reduce_f256(P1->y);
15530957b409SSimon J. Gerraty
15540957b409SSimon J. Gerraty /*
15550957b409SSimon J. Gerraty * Compute z3 = h*z1*z2.
15560957b409SSimon J. Gerraty */
15570957b409SSimon J. Gerraty mul_f256(t1, P1->z, P2->z);
15580957b409SSimon J. Gerraty mul_f256(P1->z, t1, t2);
15590957b409SSimon J. Gerraty
15600957b409SSimon J. Gerraty return ret;
15610957b409SSimon J. Gerraty }
15620957b409SSimon J. Gerraty
15630957b409SSimon J. Gerraty /*
15640957b409SSimon J. Gerraty * Add point P2 to point P1. This is a specialised function for the
15650957b409SSimon J. Gerraty * case when P2 is a non-zero point in affine coordinate.
15660957b409SSimon J. Gerraty *
15670957b409SSimon J. Gerraty * This function computes the wrong result in the following cases:
15680957b409SSimon J. Gerraty *
15690957b409SSimon J. Gerraty * - If P1 == 0
15700957b409SSimon J. Gerraty * - If P1 == P2
15710957b409SSimon J. Gerraty *
15720957b409SSimon J. Gerraty * In both cases, P1 is set to the point at infinity.
15730957b409SSimon J. Gerraty *
15740957b409SSimon J. Gerraty * Returned value is 0 if one of the following occurs:
15750957b409SSimon J. Gerraty *
15760957b409SSimon J. Gerraty * - P1 and P2 have the same Y coordinate
15770957b409SSimon J. Gerraty * - The Y coordinate of P2 is 0 and P1 is the point at infinity.
15780957b409SSimon J. Gerraty *
15790957b409SSimon J. Gerraty * The second case cannot actually happen with valid points, since a point
15800957b409SSimon J. Gerraty * with Y == 0 is a point of order 2, and there is no point of order 2 on
15810957b409SSimon J. Gerraty * curve P-256.
15820957b409SSimon J. Gerraty *
15830957b409SSimon J. Gerraty * Therefore, assuming that P1 != 0 on input, then the caller
15840957b409SSimon J. Gerraty * can apply the following:
15850957b409SSimon J. Gerraty *
15860957b409SSimon J. Gerraty * - If the result is not the point at infinity, then it is correct.
15870957b409SSimon J. Gerraty * - Otherwise, if the returned value is 1, then this is a case of
15880957b409SSimon J. Gerraty * P1+P2 == 0, so the result is indeed the point at infinity.
15890957b409SSimon J. Gerraty * - Otherwise, P1 == P2, so a "double" operation should have been
15900957b409SSimon J. Gerraty * performed.
15910957b409SSimon J. Gerraty */
15920957b409SSimon J. Gerraty static uint32_t
p256_add_mixed(p256_jacobian * P1,const p256_jacobian * P2)15930957b409SSimon J. Gerraty p256_add_mixed(p256_jacobian *P1, const p256_jacobian *P2)
15940957b409SSimon J. Gerraty {
15950957b409SSimon J. Gerraty /*
15960957b409SSimon J. Gerraty * Addtions formulas are:
15970957b409SSimon J. Gerraty *
15980957b409SSimon J. Gerraty * u1 = x1
15990957b409SSimon J. Gerraty * u2 = x2 * z1^2
16000957b409SSimon J. Gerraty * s1 = y1
16010957b409SSimon J. Gerraty * s2 = y2 * z1^3
16020957b409SSimon J. Gerraty * h = u2 - u1
16030957b409SSimon J. Gerraty * r = s2 - s1
16040957b409SSimon J. Gerraty * x3 = r^2 - h^3 - 2 * u1 * h^2
16050957b409SSimon J. Gerraty * y3 = r * (u1 * h^2 - x3) - s1 * h^3
16060957b409SSimon J. Gerraty * z3 = h * z1
16070957b409SSimon J. Gerraty */
16080957b409SSimon J. Gerraty uint32_t t1[20], t2[20], t3[20], t4[20], t5[20], t6[20], t7[20];
16090957b409SSimon J. Gerraty uint32_t ret;
16100957b409SSimon J. Gerraty int i;
16110957b409SSimon J. Gerraty
16120957b409SSimon J. Gerraty /*
16130957b409SSimon J. Gerraty * Compute u1 = x1 (in t1) and s1 = y1 (in t3).
16140957b409SSimon J. Gerraty */
16150957b409SSimon J. Gerraty memcpy(t1, P1->x, sizeof t1);
16160957b409SSimon J. Gerraty memcpy(t3, P1->y, sizeof t3);
16170957b409SSimon J. Gerraty
16180957b409SSimon J. Gerraty /*
16190957b409SSimon J. Gerraty * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
16200957b409SSimon J. Gerraty */
16210957b409SSimon J. Gerraty square_f256(t4, P1->z);
16220957b409SSimon J. Gerraty mul_f256(t2, P2->x, t4);
16230957b409SSimon J. Gerraty mul_f256(t5, P1->z, t4);
16240957b409SSimon J. Gerraty mul_f256(t4, P2->y, t5);
16250957b409SSimon J. Gerraty
16260957b409SSimon J. Gerraty /*
16270957b409SSimon J. Gerraty * Compute h = h2 - u1 (in t2) and r = s2 - s1 (in t4).
16280957b409SSimon J. Gerraty * We need to test whether r is zero, so we will do some extra
16290957b409SSimon J. Gerraty * reduce.
16300957b409SSimon J. Gerraty */
16310957b409SSimon J. Gerraty for (i = 0; i < 20; i ++) {
16320957b409SSimon J. Gerraty t2[i] += (F256[i] << 1) - t1[i];
16330957b409SSimon J. Gerraty t4[i] += (F256[i] << 1) - t3[i];
16340957b409SSimon J. Gerraty }
16350957b409SSimon J. Gerraty norm13(t2, t2, 20);
16360957b409SSimon J. Gerraty norm13(t4, t4, 20);
16370957b409SSimon J. Gerraty reduce_f256(t4);
16380957b409SSimon J. Gerraty reduce_final_f256(t4);
16390957b409SSimon J. Gerraty ret = 0;
16400957b409SSimon J. Gerraty for (i = 0; i < 20; i ++) {
16410957b409SSimon J. Gerraty ret |= t4[i];
16420957b409SSimon J. Gerraty }
16430957b409SSimon J. Gerraty ret = (ret | -ret) >> 31;
16440957b409SSimon J. Gerraty
16450957b409SSimon J. Gerraty /*
16460957b409SSimon J. Gerraty * Compute u1*h^2 (in t6) and h^3 (in t5);
16470957b409SSimon J. Gerraty */
16480957b409SSimon J. Gerraty square_f256(t7, t2);
16490957b409SSimon J. Gerraty mul_f256(t6, t1, t7);
16500957b409SSimon J. Gerraty mul_f256(t5, t7, t2);
16510957b409SSimon J. Gerraty
16520957b409SSimon J. Gerraty /*
16530957b409SSimon J. Gerraty * Compute x3 = r^2 - h^3 - 2*u1*h^2.
16540957b409SSimon J. Gerraty */
16550957b409SSimon J. Gerraty square_f256(P1->x, t4);
16560957b409SSimon J. Gerraty for (i = 0; i < 20; i ++) {
16570957b409SSimon J. Gerraty P1->x[i] += (F256[i] << 3) - t5[i] - (t6[i] << 1);
16580957b409SSimon J. Gerraty }
16590957b409SSimon J. Gerraty norm13(P1->x, P1->x, 20);
16600957b409SSimon J. Gerraty reduce_f256(P1->x);
16610957b409SSimon J. Gerraty
16620957b409SSimon J. Gerraty /*
16630957b409SSimon J. Gerraty * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
16640957b409SSimon J. Gerraty */
16650957b409SSimon J. Gerraty for (i = 0; i < 20; i ++) {
16660957b409SSimon J. Gerraty t6[i] += (F256[i] << 1) - P1->x[i];
16670957b409SSimon J. Gerraty }
16680957b409SSimon J. Gerraty norm13(t6, t6, 20);
16690957b409SSimon J. Gerraty mul_f256(P1->y, t4, t6);
16700957b409SSimon J. Gerraty mul_f256(t1, t5, t3);
16710957b409SSimon J. Gerraty for (i = 0; i < 20; i ++) {
16720957b409SSimon J. Gerraty P1->y[i] += (F256[i] << 1) - t1[i];
16730957b409SSimon J. Gerraty }
16740957b409SSimon J. Gerraty norm13(P1->y, P1->y, 20);
16750957b409SSimon J. Gerraty reduce_f256(P1->y);
16760957b409SSimon J. Gerraty
16770957b409SSimon J. Gerraty /*
16780957b409SSimon J. Gerraty * Compute z3 = h*z1*z2.
16790957b409SSimon J. Gerraty */
16800957b409SSimon J. Gerraty mul_f256(P1->z, P1->z, t2);
16810957b409SSimon J. Gerraty
16820957b409SSimon J. Gerraty return ret;
16830957b409SSimon J. Gerraty }
16840957b409SSimon J. Gerraty
16850957b409SSimon J. Gerraty /*
16860957b409SSimon J. Gerraty * Decode a P-256 point. This function does not support the point at
16870957b409SSimon J. Gerraty * infinity. Returned value is 0 if the point is invalid, 1 otherwise.
16880957b409SSimon J. Gerraty */
16890957b409SSimon J. Gerraty static uint32_t
p256_decode(p256_jacobian * P,const void * src,size_t len)16900957b409SSimon J. Gerraty p256_decode(p256_jacobian *P, const void *src, size_t len)
16910957b409SSimon J. Gerraty {
16920957b409SSimon J. Gerraty const unsigned char *buf;
16930957b409SSimon J. Gerraty uint32_t tx[20], ty[20], t1[20], t2[20];
16940957b409SSimon J. Gerraty uint32_t bad;
16950957b409SSimon J. Gerraty int i;
16960957b409SSimon J. Gerraty
16970957b409SSimon J. Gerraty if (len != 65) {
16980957b409SSimon J. Gerraty return 0;
16990957b409SSimon J. Gerraty }
17000957b409SSimon J. Gerraty buf = src;
17010957b409SSimon J. Gerraty
17020957b409SSimon J. Gerraty /*
17030957b409SSimon J. Gerraty * First byte must be 0x04 (uncompressed format). We could support
17040957b409SSimon J. Gerraty * "hybrid format" (first byte is 0x06 or 0x07, and encodes the
17050957b409SSimon J. Gerraty * least significant bit of the Y coordinate), but it is explicitly
17060957b409SSimon J. Gerraty * forbidden by RFC 5480 (section 2.2).
17070957b409SSimon J. Gerraty */
17080957b409SSimon J. Gerraty bad = NEQ(buf[0], 0x04);
17090957b409SSimon J. Gerraty
17100957b409SSimon J. Gerraty /*
17110957b409SSimon J. Gerraty * Decode the coordinates, and check that they are both lower
17120957b409SSimon J. Gerraty * than the modulus.
17130957b409SSimon J. Gerraty */
17140957b409SSimon J. Gerraty tx[19] = be8_to_le13(tx, buf + 1, 32);
17150957b409SSimon J. Gerraty ty[19] = be8_to_le13(ty, buf + 33, 32);
17160957b409SSimon J. Gerraty bad |= reduce_final_f256(tx);
17170957b409SSimon J. Gerraty bad |= reduce_final_f256(ty);
17180957b409SSimon J. Gerraty
17190957b409SSimon J. Gerraty /*
17200957b409SSimon J. Gerraty * Check curve equation.
17210957b409SSimon J. Gerraty */
17220957b409SSimon J. Gerraty square_f256(t1, tx);
17230957b409SSimon J. Gerraty mul_f256(t1, tx, t1);
17240957b409SSimon J. Gerraty square_f256(t2, ty);
17250957b409SSimon J. Gerraty for (i = 0; i < 20; i ++) {
17260957b409SSimon J. Gerraty t1[i] += (F256[i] << 3) - MUL15(3, tx[i]) + P256_B[i] - t2[i];
17270957b409SSimon J. Gerraty }
17280957b409SSimon J. Gerraty norm13(t1, t1, 20);
17290957b409SSimon J. Gerraty reduce_f256(t1);
17300957b409SSimon J. Gerraty reduce_final_f256(t1);
17310957b409SSimon J. Gerraty for (i = 0; i < 20; i ++) {
17320957b409SSimon J. Gerraty bad |= t1[i];
17330957b409SSimon J. Gerraty }
17340957b409SSimon J. Gerraty
17350957b409SSimon J. Gerraty /*
17360957b409SSimon J. Gerraty * Copy coordinates to the point structure.
17370957b409SSimon J. Gerraty */
17380957b409SSimon J. Gerraty memcpy(P->x, tx, sizeof tx);
17390957b409SSimon J. Gerraty memcpy(P->y, ty, sizeof ty);
17400957b409SSimon J. Gerraty memset(P->z, 0, sizeof P->z);
17410957b409SSimon J. Gerraty P->z[0] = 1;
17420957b409SSimon J. Gerraty return EQ(bad, 0);
17430957b409SSimon J. Gerraty }
17440957b409SSimon J. Gerraty
17450957b409SSimon J. Gerraty /*
17460957b409SSimon J. Gerraty * Encode a point into a buffer. This function assumes that the point is
17470957b409SSimon J. Gerraty * valid, in affine coordinates, and not the point at infinity.
17480957b409SSimon J. Gerraty */
17490957b409SSimon J. Gerraty static void
p256_encode(void * dst,const p256_jacobian * P)17500957b409SSimon J. Gerraty p256_encode(void *dst, const p256_jacobian *P)
17510957b409SSimon J. Gerraty {
17520957b409SSimon J. Gerraty unsigned char *buf;
17530957b409SSimon J. Gerraty
17540957b409SSimon J. Gerraty buf = dst;
17550957b409SSimon J. Gerraty buf[0] = 0x04;
17560957b409SSimon J. Gerraty le13_to_be8(buf + 1, 32, P->x);
17570957b409SSimon J. Gerraty le13_to_be8(buf + 33, 32, P->y);
17580957b409SSimon J. Gerraty }
17590957b409SSimon J. Gerraty
17600957b409SSimon J. Gerraty /*
17610957b409SSimon J. Gerraty * Multiply a curve point by an integer. The integer is assumed to be
17620957b409SSimon J. Gerraty * lower than the curve order, and the base point must not be the point
17630957b409SSimon J. Gerraty * at infinity.
17640957b409SSimon J. Gerraty */
17650957b409SSimon J. Gerraty static void
p256_mul(p256_jacobian * P,const unsigned char * x,size_t xlen)17660957b409SSimon J. Gerraty p256_mul(p256_jacobian *P, const unsigned char *x, size_t xlen)
17670957b409SSimon J. Gerraty {
17680957b409SSimon J. Gerraty /*
17690957b409SSimon J. Gerraty * qz is a flag that is initially 1, and remains equal to 1
17700957b409SSimon J. Gerraty * as long as the point is the point at infinity.
17710957b409SSimon J. Gerraty *
17720957b409SSimon J. Gerraty * We use a 2-bit window to handle multiplier bits by pairs.
17730957b409SSimon J. Gerraty * The precomputed window really is the points P2 and P3.
17740957b409SSimon J. Gerraty */
17750957b409SSimon J. Gerraty uint32_t qz;
17760957b409SSimon J. Gerraty p256_jacobian P2, P3, Q, T, U;
17770957b409SSimon J. Gerraty
17780957b409SSimon J. Gerraty /*
17790957b409SSimon J. Gerraty * Compute window values.
17800957b409SSimon J. Gerraty */
17810957b409SSimon J. Gerraty P2 = *P;
17820957b409SSimon J. Gerraty p256_double(&P2);
17830957b409SSimon J. Gerraty P3 = *P;
17840957b409SSimon J. Gerraty p256_add(&P3, &P2);
17850957b409SSimon J. Gerraty
17860957b409SSimon J. Gerraty /*
17870957b409SSimon J. Gerraty * We start with Q = 0. We process multiplier bits 2 by 2.
17880957b409SSimon J. Gerraty */
17890957b409SSimon J. Gerraty memset(&Q, 0, sizeof Q);
17900957b409SSimon J. Gerraty qz = 1;
17910957b409SSimon J. Gerraty while (xlen -- > 0) {
17920957b409SSimon J. Gerraty int k;
17930957b409SSimon J. Gerraty
17940957b409SSimon J. Gerraty for (k = 6; k >= 0; k -= 2) {
17950957b409SSimon J. Gerraty uint32_t bits;
17960957b409SSimon J. Gerraty uint32_t bnz;
17970957b409SSimon J. Gerraty
17980957b409SSimon J. Gerraty p256_double(&Q);
17990957b409SSimon J. Gerraty p256_double(&Q);
18000957b409SSimon J. Gerraty T = *P;
18010957b409SSimon J. Gerraty U = Q;
18020957b409SSimon J. Gerraty bits = (*x >> k) & (uint32_t)3;
18030957b409SSimon J. Gerraty bnz = NEQ(bits, 0);
18040957b409SSimon J. Gerraty CCOPY(EQ(bits, 2), &T, &P2, sizeof T);
18050957b409SSimon J. Gerraty CCOPY(EQ(bits, 3), &T, &P3, sizeof T);
18060957b409SSimon J. Gerraty p256_add(&U, &T);
18070957b409SSimon J. Gerraty CCOPY(bnz & qz, &Q, &T, sizeof Q);
18080957b409SSimon J. Gerraty CCOPY(bnz & ~qz, &Q, &U, sizeof Q);
18090957b409SSimon J. Gerraty qz &= ~bnz;
18100957b409SSimon J. Gerraty }
18110957b409SSimon J. Gerraty x ++;
18120957b409SSimon J. Gerraty }
18130957b409SSimon J. Gerraty *P = Q;
18140957b409SSimon J. Gerraty }
18150957b409SSimon J. Gerraty
18160957b409SSimon J. Gerraty /*
18170957b409SSimon J. Gerraty * Precomputed window: k*G points, where G is the curve generator, and k
18180957b409SSimon J. Gerraty * is an integer from 1 to 15 (inclusive). The X and Y coordinates of
18190957b409SSimon J. Gerraty * the point are encoded as 20 words of 13 bits each (little-endian
18200957b409SSimon J. Gerraty * order); 13-bit words are then grouped 2-by-2 into 32-bit words
18210957b409SSimon J. Gerraty * (little-endian order within each word).
18220957b409SSimon J. Gerraty */
18230957b409SSimon J. Gerraty static const uint32_t Gwin[15][20] = {
18240957b409SSimon J. Gerraty
18250957b409SSimon J. Gerraty { 0x04C60296, 0x02721176, 0x19D00F4A, 0x102517AC,
18260957b409SSimon J. Gerraty 0x13B8037D, 0x0748103C, 0x1E730E56, 0x08481FE2,
18270957b409SSimon J. Gerraty 0x0F97012C, 0x00D605F4, 0x1DFA11F5, 0x0C801A0D,
18280957b409SSimon J. Gerraty 0x0F670CBB, 0x0AED0CC5, 0x115E0E33, 0x181F0785,
18290957b409SSimon J. Gerraty 0x13F514A7, 0x0FF30E3B, 0x17171E1A, 0x009F18D0 },
18300957b409SSimon J. Gerraty
18310957b409SSimon J. Gerraty { 0x1B341978, 0x16911F11, 0x0D9A1A60, 0x1C4E1FC8,
18320957b409SSimon J. Gerraty 0x1E040969, 0x096A06B0, 0x091C0030, 0x09EF1A29,
18330957b409SSimon J. Gerraty 0x18C40D03, 0x00F91C9E, 0x13C313D1, 0x096F0748,
18340957b409SSimon J. Gerraty 0x011419E0, 0x1CC713A6, 0x1DD31DAD, 0x1EE80C36,
18350957b409SSimon J. Gerraty 0x1ECD0C69, 0x1A0800A4, 0x08861B8E, 0x000E1DD5 },
18360957b409SSimon J. Gerraty
18370957b409SSimon J. Gerraty { 0x173F1D6C, 0x02CC06F1, 0x14C21FB4, 0x043D1EB6,
18380957b409SSimon J. Gerraty 0x0F3606B7, 0x1A971C59, 0x1BF71951, 0x01481323,
18390957b409SSimon J. Gerraty 0x068D0633, 0x00BD12F9, 0x13EA1032, 0x136209E8,
18400957b409SSimon J. Gerraty 0x1C1E19A7, 0x06C7013E, 0x06C10AB0, 0x14C908BB,
18410957b409SSimon J. Gerraty 0x05830CE1, 0x1FEF18DD, 0x00620998, 0x010E0D19 },
18420957b409SSimon J. Gerraty
18430957b409SSimon J. Gerraty { 0x18180852, 0x0604111A, 0x0B771509, 0x1B6F0156,
18440957b409SSimon J. Gerraty 0x00181FE2, 0x1DCC0AF4, 0x16EF0659, 0x11F70E80,
18450957b409SSimon J. Gerraty 0x11A912D0, 0x01C414D2, 0x027618C6, 0x05840FC6,
18460957b409SSimon J. Gerraty 0x100215C4, 0x187E0C3B, 0x12771C96, 0x150C0B5D,
18470957b409SSimon J. Gerraty 0x0FF705FD, 0x07981C67, 0x1AD20C63, 0x01C11C55 },
18480957b409SSimon J. Gerraty
18490957b409SSimon J. Gerraty { 0x1E8113ED, 0x0A940370, 0x12920215, 0x1FA31D6F,
18500957b409SSimon J. Gerraty 0x1F7C0C82, 0x10CD03F7, 0x02640560, 0x081A0B5E,
18510957b409SSimon J. Gerraty 0x1BD21151, 0x00A21642, 0x0D0B0DA4, 0x0176113F,
18520957b409SSimon J. Gerraty 0x04440D1D, 0x001A1360, 0x1068012F, 0x1F141E49,
18530957b409SSimon J. Gerraty 0x10DF136B, 0x0E4F162B, 0x0D44104A, 0x01C1105F },
18540957b409SSimon J. Gerraty
18550957b409SSimon J. Gerraty { 0x011411A9, 0x01551A4F, 0x0ADA0C6B, 0x01BD0EC8,
18560957b409SSimon J. Gerraty 0x18120C74, 0x112F1778, 0x099202CB, 0x0C05124B,
18570957b409SSimon J. Gerraty 0x195316A4, 0x01600685, 0x1E3B1FE2, 0x189014E3,
18580957b409SSimon J. Gerraty 0x0B5E1FD7, 0x0E0311F8, 0x08E000F7, 0x174E00DE,
18590957b409SSimon J. Gerraty 0x160702DF, 0x1B5A15BF, 0x03A11237, 0x01D01704 },
18600957b409SSimon J. Gerraty
18610957b409SSimon J. Gerraty { 0x0C3D12A3, 0x0C501C0C, 0x17AD1300, 0x1715003F,
18620957b409SSimon J. Gerraty 0x03F719F8, 0x18031ED8, 0x1D980667, 0x0F681896,
18630957b409SSimon J. Gerraty 0x1B7D00BF, 0x011C14CE, 0x0FA000B4, 0x1C3501B0,
18640957b409SSimon J. Gerraty 0x0D901C55, 0x06790C10, 0x029E0736, 0x0DEB0400,
18650957b409SSimon J. Gerraty 0x034F183A, 0x030619B4, 0x0DEF0033, 0x00E71AC7 },
18660957b409SSimon J. Gerraty
18670957b409SSimon J. Gerraty { 0x1B7D1393, 0x1B3B1076, 0x0BED1B4D, 0x13011F3A,
18680957b409SSimon J. Gerraty 0x0E0E1238, 0x156A132B, 0x013A02D3, 0x160A0D01,
18690957b409SSimon J. Gerraty 0x1CED1EE9, 0x00C5165D, 0x184C157E, 0x08141A83,
18700957b409SSimon J. Gerraty 0x153C0DA5, 0x1ED70F9D, 0x05170D51, 0x02CF13B8,
18710957b409SSimon J. Gerraty 0x18AE1771, 0x1B04113F, 0x05EC11E9, 0x015A16B3 },
18720957b409SSimon J. Gerraty
18730957b409SSimon J. Gerraty { 0x04A41EE0, 0x1D1412E4, 0x1C591D79, 0x118511B7,
18740957b409SSimon J. Gerraty 0x14F00ACB, 0x1AE31E1C, 0x049C0D51, 0x016E061E,
18750957b409SSimon J. Gerraty 0x1DB71EDF, 0x01D41A35, 0x0E8208FA, 0x14441293,
18760957b409SSimon J. Gerraty 0x011F1E85, 0x1D54137A, 0x026B114F, 0x151D0832,
18770957b409SSimon J. Gerraty 0x00A50964, 0x1F9C1E1C, 0x064B12C9, 0x005409D1 },
18780957b409SSimon J. Gerraty
18790957b409SSimon J. Gerraty { 0x062B123F, 0x0C0D0501, 0x183704C3, 0x08E31120,
18800957b409SSimon J. Gerraty 0x0A2E0A6C, 0x14440FED, 0x090A0D1E, 0x13271964,
18810957b409SSimon J. Gerraty 0x0B590A3A, 0x019D1D9B, 0x05780773, 0x09770A91,
18820957b409SSimon J. Gerraty 0x0F770CA3, 0x053F19D4, 0x02C80DED, 0x1A761304,
18830957b409SSimon J. Gerraty 0x091E0DD9, 0x15D201B8, 0x151109AA, 0x010F0198 },
18840957b409SSimon J. Gerraty
18850957b409SSimon J. Gerraty { 0x05E101D1, 0x072314DD, 0x045F1433, 0x1A041541,
18860957b409SSimon J. Gerraty 0x10B3142E, 0x01840736, 0x1C1B19DB, 0x098B0418,
18870957b409SSimon J. Gerraty 0x1DBC083B, 0x007D1444, 0x01511740, 0x11DD1F3A,
18880957b409SSimon J. Gerraty 0x04ED0E2F, 0x1B4B1A62, 0x10480D04, 0x09E911A2,
18890957b409SSimon J. Gerraty 0x04211AFA, 0x19140893, 0x04D60CC4, 0x01210648 },
18900957b409SSimon J. Gerraty
18910957b409SSimon J. Gerraty { 0x112703C4, 0x018B1BA1, 0x164C1D50, 0x05160BE0,
18920957b409SSimon J. Gerraty 0x0BCC1830, 0x01CB1554, 0x13291732, 0x1B2B1918,
18930957b409SSimon J. Gerraty 0x0DED0817, 0x00E80775, 0x0A2401D3, 0x0BFE08B3,
18940957b409SSimon J. Gerraty 0x0E531199, 0x058616E9, 0x04770B91, 0x110F0C55,
18950957b409SSimon J. Gerraty 0x19C11554, 0x0BFB1159, 0x03541C38, 0x000E1C2D },
18960957b409SSimon J. Gerraty
18970957b409SSimon J. Gerraty { 0x10390C01, 0x02BB0751, 0x0AC5098E, 0x096C17AB,
18980957b409SSimon J. Gerraty 0x03C90E28, 0x10BD18BF, 0x002E1F2D, 0x092B0986,
18990957b409SSimon J. Gerraty 0x1BD700AC, 0x002E1F20, 0x1E3D1FD8, 0x077718BB,
19000957b409SSimon J. Gerraty 0x06F919C4, 0x187407ED, 0x11370E14, 0x081E139C,
19010957b409SSimon J. Gerraty 0x00481ADB, 0x14AB0289, 0x066A0EBE, 0x00C70ED6 },
19020957b409SSimon J. Gerraty
19030957b409SSimon J. Gerraty { 0x0694120B, 0x124E1CC9, 0x0E2F0570, 0x17CF081A,
19040957b409SSimon J. Gerraty 0x078906AC, 0x066D17CF, 0x1B3207F4, 0x0C5705E9,
19050957b409SSimon J. Gerraty 0x10001C38, 0x00A919DE, 0x06851375, 0x0F900BD8,
19060957b409SSimon J. Gerraty 0x080401BA, 0x0EEE0D42, 0x1B8B11EA, 0x0B4519F0,
19070957b409SSimon J. Gerraty 0x090F18C0, 0x062E1508, 0x0DD909F4, 0x01EB067C },
19080957b409SSimon J. Gerraty
19090957b409SSimon J. Gerraty { 0x0CDC1D5F, 0x0D1818F9, 0x07781636, 0x125B18E8,
19100957b409SSimon J. Gerraty 0x0D7003AF, 0x13110099, 0x1D9B1899, 0x175C1EB7,
19110957b409SSimon J. Gerraty 0x0E34171A, 0x01E01153, 0x081A0F36, 0x0B391783,
19120957b409SSimon J. Gerraty 0x1D1F147E, 0x19CE16D7, 0x11511B21, 0x1F2C10F9,
19130957b409SSimon J. Gerraty 0x12CA0E51, 0x05A31D39, 0x171A192E, 0x016B0E4F }
19140957b409SSimon J. Gerraty };
19150957b409SSimon J. Gerraty
19160957b409SSimon J. Gerraty /*
19170957b409SSimon J. Gerraty * Lookup one of the Gwin[] values, by index. This is constant-time.
19180957b409SSimon J. Gerraty */
19190957b409SSimon J. Gerraty static void
lookup_Gwin(p256_jacobian * T,uint32_t idx)19200957b409SSimon J. Gerraty lookup_Gwin(p256_jacobian *T, uint32_t idx)
19210957b409SSimon J. Gerraty {
19220957b409SSimon J. Gerraty uint32_t xy[20];
19230957b409SSimon J. Gerraty uint32_t k;
19240957b409SSimon J. Gerraty size_t u;
19250957b409SSimon J. Gerraty
19260957b409SSimon J. Gerraty memset(xy, 0, sizeof xy);
19270957b409SSimon J. Gerraty for (k = 0; k < 15; k ++) {
19280957b409SSimon J. Gerraty uint32_t m;
19290957b409SSimon J. Gerraty
19300957b409SSimon J. Gerraty m = -EQ(idx, k + 1);
19310957b409SSimon J. Gerraty for (u = 0; u < 20; u ++) {
19320957b409SSimon J. Gerraty xy[u] |= m & Gwin[k][u];
19330957b409SSimon J. Gerraty }
19340957b409SSimon J. Gerraty }
19350957b409SSimon J. Gerraty for (u = 0; u < 10; u ++) {
19360957b409SSimon J. Gerraty T->x[(u << 1) + 0] = xy[u] & 0xFFFF;
19370957b409SSimon J. Gerraty T->x[(u << 1) + 1] = xy[u] >> 16;
19380957b409SSimon J. Gerraty T->y[(u << 1) + 0] = xy[u + 10] & 0xFFFF;
19390957b409SSimon J. Gerraty T->y[(u << 1) + 1] = xy[u + 10] >> 16;
19400957b409SSimon J. Gerraty }
19410957b409SSimon J. Gerraty memset(T->z, 0, sizeof T->z);
19420957b409SSimon J. Gerraty T->z[0] = 1;
19430957b409SSimon J. Gerraty }
19440957b409SSimon J. Gerraty
19450957b409SSimon J. Gerraty /*
19460957b409SSimon J. Gerraty * Multiply the generator by an integer. The integer is assumed non-zero
19470957b409SSimon J. Gerraty * and lower than the curve order.
19480957b409SSimon J. Gerraty */
19490957b409SSimon J. Gerraty static void
p256_mulgen(p256_jacobian * P,const unsigned char * x,size_t xlen)19500957b409SSimon J. Gerraty p256_mulgen(p256_jacobian *P, const unsigned char *x, size_t xlen)
19510957b409SSimon J. Gerraty {
19520957b409SSimon J. Gerraty /*
19530957b409SSimon J. Gerraty * qz is a flag that is initially 1, and remains equal to 1
19540957b409SSimon J. Gerraty * as long as the point is the point at infinity.
19550957b409SSimon J. Gerraty *
19560957b409SSimon J. Gerraty * We use a 4-bit window to handle multiplier bits by groups
19570957b409SSimon J. Gerraty * of 4. The precomputed window is constant static data, with
19580957b409SSimon J. Gerraty * points in affine coordinates; we use a constant-time lookup.
19590957b409SSimon J. Gerraty */
19600957b409SSimon J. Gerraty p256_jacobian Q;
19610957b409SSimon J. Gerraty uint32_t qz;
19620957b409SSimon J. Gerraty
19630957b409SSimon J. Gerraty memset(&Q, 0, sizeof Q);
19640957b409SSimon J. Gerraty qz = 1;
19650957b409SSimon J. Gerraty while (xlen -- > 0) {
19660957b409SSimon J. Gerraty int k;
19670957b409SSimon J. Gerraty unsigned bx;
19680957b409SSimon J. Gerraty
19690957b409SSimon J. Gerraty bx = *x ++;
19700957b409SSimon J. Gerraty for (k = 0; k < 2; k ++) {
19710957b409SSimon J. Gerraty uint32_t bits;
19720957b409SSimon J. Gerraty uint32_t bnz;
19730957b409SSimon J. Gerraty p256_jacobian T, U;
19740957b409SSimon J. Gerraty
19750957b409SSimon J. Gerraty p256_double(&Q);
19760957b409SSimon J. Gerraty p256_double(&Q);
19770957b409SSimon J. Gerraty p256_double(&Q);
19780957b409SSimon J. Gerraty p256_double(&Q);
19790957b409SSimon J. Gerraty bits = (bx >> 4) & 0x0F;
19800957b409SSimon J. Gerraty bnz = NEQ(bits, 0);
19810957b409SSimon J. Gerraty lookup_Gwin(&T, bits);
19820957b409SSimon J. Gerraty U = Q;
19830957b409SSimon J. Gerraty p256_add_mixed(&U, &T);
19840957b409SSimon J. Gerraty CCOPY(bnz & qz, &Q, &T, sizeof Q);
19850957b409SSimon J. Gerraty CCOPY(bnz & ~qz, &Q, &U, sizeof Q);
19860957b409SSimon J. Gerraty qz &= ~bnz;
19870957b409SSimon J. Gerraty bx <<= 4;
19880957b409SSimon J. Gerraty }
19890957b409SSimon J. Gerraty }
19900957b409SSimon J. Gerraty *P = Q;
19910957b409SSimon J. Gerraty }
19920957b409SSimon J. Gerraty
19930957b409SSimon J. Gerraty static const unsigned char P256_G[] = {
19940957b409SSimon J. Gerraty 0x04, 0x6B, 0x17, 0xD1, 0xF2, 0xE1, 0x2C, 0x42, 0x47, 0xF8,
19950957b409SSimon J. Gerraty 0xBC, 0xE6, 0xE5, 0x63, 0xA4, 0x40, 0xF2, 0x77, 0x03, 0x7D,
19960957b409SSimon J. Gerraty 0x81, 0x2D, 0xEB, 0x33, 0xA0, 0xF4, 0xA1, 0x39, 0x45, 0xD8,
19970957b409SSimon J. Gerraty 0x98, 0xC2, 0x96, 0x4F, 0xE3, 0x42, 0xE2, 0xFE, 0x1A, 0x7F,
19980957b409SSimon J. Gerraty 0x9B, 0x8E, 0xE7, 0xEB, 0x4A, 0x7C, 0x0F, 0x9E, 0x16, 0x2B,
19990957b409SSimon J. Gerraty 0xCE, 0x33, 0x57, 0x6B, 0x31, 0x5E, 0xCE, 0xCB, 0xB6, 0x40,
20000957b409SSimon J. Gerraty 0x68, 0x37, 0xBF, 0x51, 0xF5
20010957b409SSimon J. Gerraty };
20020957b409SSimon J. Gerraty
20030957b409SSimon J. Gerraty static const unsigned char P256_N[] = {
20040957b409SSimon J. Gerraty 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF,
20050957b409SSimon J. Gerraty 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xBC, 0xE6, 0xFA, 0xAD,
20060957b409SSimon J. Gerraty 0xA7, 0x17, 0x9E, 0x84, 0xF3, 0xB9, 0xCA, 0xC2, 0xFC, 0x63,
20070957b409SSimon J. Gerraty 0x25, 0x51
20080957b409SSimon J. Gerraty };
20090957b409SSimon J. Gerraty
20100957b409SSimon J. Gerraty static const unsigned char *
api_generator(int curve,size_t * len)20110957b409SSimon J. Gerraty api_generator(int curve, size_t *len)
20120957b409SSimon J. Gerraty {
20130957b409SSimon J. Gerraty (void)curve;
20140957b409SSimon J. Gerraty *len = sizeof P256_G;
20150957b409SSimon J. Gerraty return P256_G;
20160957b409SSimon J. Gerraty }
20170957b409SSimon J. Gerraty
20180957b409SSimon J. Gerraty static const unsigned char *
api_order(int curve,size_t * len)20190957b409SSimon J. Gerraty api_order(int curve, size_t *len)
20200957b409SSimon J. Gerraty {
20210957b409SSimon J. Gerraty (void)curve;
20220957b409SSimon J. Gerraty *len = sizeof P256_N;
20230957b409SSimon J. Gerraty return P256_N;
20240957b409SSimon J. Gerraty }
20250957b409SSimon J. Gerraty
20260957b409SSimon J. Gerraty static size_t
api_xoff(int curve,size_t * len)20270957b409SSimon J. Gerraty api_xoff(int curve, size_t *len)
20280957b409SSimon J. Gerraty {
20290957b409SSimon J. Gerraty (void)curve;
20300957b409SSimon J. Gerraty *len = 32;
20310957b409SSimon J. Gerraty return 1;
20320957b409SSimon J. Gerraty }
20330957b409SSimon J. Gerraty
20340957b409SSimon J. Gerraty static uint32_t
api_mul(unsigned char * G,size_t Glen,const unsigned char * x,size_t xlen,int curve)20350957b409SSimon J. Gerraty api_mul(unsigned char *G, size_t Glen,
20360957b409SSimon J. Gerraty const unsigned char *x, size_t xlen, int curve)
20370957b409SSimon J. Gerraty {
20380957b409SSimon J. Gerraty uint32_t r;
20390957b409SSimon J. Gerraty p256_jacobian P;
20400957b409SSimon J. Gerraty
20410957b409SSimon J. Gerraty (void)curve;
2042*cc9e6590SSimon J. Gerraty if (Glen != 65) {
2043*cc9e6590SSimon J. Gerraty return 0;
2044*cc9e6590SSimon J. Gerraty }
20450957b409SSimon J. Gerraty r = p256_decode(&P, G, Glen);
20460957b409SSimon J. Gerraty p256_mul(&P, x, xlen);
20470957b409SSimon J. Gerraty p256_to_affine(&P);
20480957b409SSimon J. Gerraty p256_encode(G, &P);
20490957b409SSimon J. Gerraty return r;
20500957b409SSimon J. Gerraty }
20510957b409SSimon J. Gerraty
20520957b409SSimon J. Gerraty static size_t
api_mulgen(unsigned char * R,const unsigned char * x,size_t xlen,int curve)20530957b409SSimon J. Gerraty api_mulgen(unsigned char *R,
20540957b409SSimon J. Gerraty const unsigned char *x, size_t xlen, int curve)
20550957b409SSimon J. Gerraty {
20560957b409SSimon J. Gerraty p256_jacobian P;
20570957b409SSimon J. Gerraty
20580957b409SSimon J. Gerraty (void)curve;
20590957b409SSimon J. Gerraty p256_mulgen(&P, x, xlen);
20600957b409SSimon J. Gerraty p256_to_affine(&P);
20610957b409SSimon J. Gerraty p256_encode(R, &P);
20620957b409SSimon J. Gerraty return 65;
20630957b409SSimon J. Gerraty }
20640957b409SSimon J. Gerraty
20650957b409SSimon J. Gerraty static uint32_t
api_muladd(unsigned char * A,const unsigned char * B,size_t len,const unsigned char * x,size_t xlen,const unsigned char * y,size_t ylen,int curve)20660957b409SSimon J. Gerraty api_muladd(unsigned char *A, const unsigned char *B, size_t len,
20670957b409SSimon J. Gerraty const unsigned char *x, size_t xlen,
20680957b409SSimon J. Gerraty const unsigned char *y, size_t ylen, int curve)
20690957b409SSimon J. Gerraty {
20700957b409SSimon J. Gerraty p256_jacobian P, Q;
20710957b409SSimon J. Gerraty uint32_t r, t, z;
20720957b409SSimon J. Gerraty int i;
20730957b409SSimon J. Gerraty
20740957b409SSimon J. Gerraty (void)curve;
2075*cc9e6590SSimon J. Gerraty if (len != 65) {
2076*cc9e6590SSimon J. Gerraty return 0;
2077*cc9e6590SSimon J. Gerraty }
20780957b409SSimon J. Gerraty r = p256_decode(&P, A, len);
20790957b409SSimon J. Gerraty p256_mul(&P, x, xlen);
20800957b409SSimon J. Gerraty if (B == NULL) {
20810957b409SSimon J. Gerraty p256_mulgen(&Q, y, ylen);
20820957b409SSimon J. Gerraty } else {
20830957b409SSimon J. Gerraty r &= p256_decode(&Q, B, len);
20840957b409SSimon J. Gerraty p256_mul(&Q, y, ylen);
20850957b409SSimon J. Gerraty }
20860957b409SSimon J. Gerraty
20870957b409SSimon J. Gerraty /*
20880957b409SSimon J. Gerraty * The final addition may fail in case both points are equal.
20890957b409SSimon J. Gerraty */
20900957b409SSimon J. Gerraty t = p256_add(&P, &Q);
20910957b409SSimon J. Gerraty reduce_final_f256(P.z);
20920957b409SSimon J. Gerraty z = 0;
20930957b409SSimon J. Gerraty for (i = 0; i < 20; i ++) {
20940957b409SSimon J. Gerraty z |= P.z[i];
20950957b409SSimon J. Gerraty }
20960957b409SSimon J. Gerraty z = EQ(z, 0);
20970957b409SSimon J. Gerraty p256_double(&Q);
20980957b409SSimon J. Gerraty
20990957b409SSimon J. Gerraty /*
21000957b409SSimon J. Gerraty * If z is 1 then either P+Q = 0 (t = 1) or P = Q (t = 0). So we
21010957b409SSimon J. Gerraty * have the following:
21020957b409SSimon J. Gerraty *
21030957b409SSimon J. Gerraty * z = 0, t = 0 return P (normal addition)
21040957b409SSimon J. Gerraty * z = 0, t = 1 return P (normal addition)
21050957b409SSimon J. Gerraty * z = 1, t = 0 return Q (a 'double' case)
21060957b409SSimon J. Gerraty * z = 1, t = 1 report an error (P+Q = 0)
21070957b409SSimon J. Gerraty */
21080957b409SSimon J. Gerraty CCOPY(z & ~t, &P, &Q, sizeof Q);
21090957b409SSimon J. Gerraty p256_to_affine(&P);
21100957b409SSimon J. Gerraty p256_encode(A, &P);
21110957b409SSimon J. Gerraty r &= ~(z & t);
21120957b409SSimon J. Gerraty return r;
21130957b409SSimon J. Gerraty }
21140957b409SSimon J. Gerraty
21150957b409SSimon J. Gerraty /* see bearssl_ec.h */
21160957b409SSimon J. Gerraty const br_ec_impl br_ec_p256_m15 = {
21170957b409SSimon J. Gerraty (uint32_t)0x00800000,
21180957b409SSimon J. Gerraty &api_generator,
21190957b409SSimon J. Gerraty &api_order,
21200957b409SSimon J. Gerraty &api_xoff,
21210957b409SSimon J. Gerraty &api_mul,
21220957b409SSimon J. Gerraty &api_mulgen,
21230957b409SSimon J. Gerraty &api_muladd
21240957b409SSimon J. Gerraty };
2125