/*
 * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "inner.h"

#if BR_INT128 || BR_UMUL128

#if BR_UMUL128
#include <intrin.h>
#endif

static const unsigned char P256_G[] = {
	0x04, 0x6B, 0x17, 0xD1, 0xF2, 0xE1, 0x2C, 0x42, 0x47, 0xF8,
	0xBC, 0xE6, 0xE5, 0x63, 0xA4, 0x40, 0xF2, 0x77, 0x03, 0x7D,
	0x81, 0x2D, 0xEB, 0x33, 0xA0, 0xF4, 0xA1, 0x39, 0x45, 0xD8,
	0x98, 0xC2, 0x96, 0x4F, 0xE3, 0x42, 0xE2, 0xFE, 0x1A, 0x7F,
	0x9B, 0x8E, 0xE7, 0xEB, 0x4A, 0x7C, 0x0F, 0x9E, 0x16, 0x2B,
	0xCE, 0x33, 0x57, 0x6B, 0x31, 0x5E, 0xCE, 0xCB, 0xB6, 0x40,
	0x68, 0x37, 0xBF, 0x51, 0xF5
};

static const unsigned char P256_N[] = {
	0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF,
	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xBC, 0xE6, 0xFA, 0xAD,
	0xA7, 0x17, 0x9E, 0x84, 0xF3, 0xB9, 0xCA, 0xC2, 0xFC, 0x63,
	0x25, 0x51
};

static const unsigned char *
api_generator(int curve, size_t *len)
{
	(void)curve;
	*len = sizeof P256_G;
	return P256_G;
}

static const unsigned char *
api_order(int curve, size_t *len)
{
	(void)curve;
	*len = sizeof P256_N;
	return P256_N;
}

static size_t
api_xoff(int curve, size_t *len)
{
	(void)curve;
	*len = 32;
	return 1;
}

/*
 * A field element is encoded as five 64-bit integers, in base 2^52.
 * Limbs may occasionally exceed 2^52.
 *
 * A _partially reduced_ value is such that the following hold:
 *   - top limb is less than 2^48 + 2^30
 *   - the other limbs fit on 53 bits each
 * In particular, such a value is less than twice the modulus p.
 */
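
/*
 * For reference, the five limbs cover the following bit ranges of the
 * 256-bit value: bits 0..51, 52..103, 104..155, 156..207 and 208..255
 * (52+52+52+52+48 = 256). With that split, the field modulus
 * p = 2^256 - 2^224 + 2^192 + 2^96 - 1 decomposes into limbs as:
 *
 *   p[0] = 0xFFFFFFFFFFFFF   (bits 0..51 all set)
 *   p[1] = 0x00FFFFFFFFFFF   (bits 52..95 set, bits 96..103 clear)
 *   p[2] = 0x0000000000000
 *   p[3] = 0x0001000000000   (only bit 192, i.e. bit 36 of this limb)
 *   p[4] = 0x0FFFFFFFF0000   (bits 224..255, i.e. bits 16..47 of this limb)
 */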

#define BIT(n)   ((uint64_t)1 << (n))
#define MASK48   (BIT(48) - BIT(0))
#define MASK52   (BIT(52) - BIT(0))

/* R = 2^260 mod p */
static const uint64_t F256_R[] = {
	0x0000000000010, 0xF000000000000, 0xFFFFFFFFFFFFF,
	0xFFEFFFFFFFFFF, 0x00000000FFFFF
};

/* Curve equation is y^2 = x^3 - 3*x + B. This constant is B*R mod p
   (Montgomery representation of B). */
static const uint64_t P256_B_MONTY[] = {
	0xDF6229C4BDDFD, 0xCA8843090D89C, 0x212ED6ACF005C,
	0x83415A220ABF7, 0x0C30061DD4874
};

/*
 * Addition in the field. Carry propagation is not performed.
 * On input, limbs may be up to 63 bits each; on output, they will
 * be up to one bit more than on input.
 */
static inline void
f256_add(uint64_t *d, const uint64_t *a, const uint64_t *b)
{
	d[0] = a[0] + b[0];
	d[1] = a[1] + b[1];
	d[2] = a[2] + b[2];
	d[3] = a[3] + b[3];
	d[4] = a[4] + b[4];
}

/*
 * Partially reduce the provided value.
 * Input: limbs can go up to 61 bits each.
 * Output: partially reduced.
 */
static inline void
f256_partial_reduce(uint64_t *a)
{
	uint64_t w, cc, s;

	/*
	 * Propagate carries.
	 */
	w = a[0];
	a[0] = w & MASK52;
	cc = w >> 52;
	w = a[1] + cc;
	a[1] = w & MASK52;
	cc = w >> 52;
	w = a[2] + cc;
	a[2] = w & MASK52;
	cc = w >> 52;
	w = a[3] + cc;
	a[3] = w & MASK52;
	cc = w >> 52;
	a[4] += cc;

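	/*
	 * Fold the bits above 2^256 back in, using
	 * 2^256 = 2^224 - 2^192 - 2^96 + 1 mod p. In this limb layout,
	 * bit 224 is bit 16 of limb 4, bit 192 is bit 36 of limb 3 and
	 * bit 96 is bit 44 of limb 1, which is why the excess s is added
	 * as (s << 16) into a[4] and as s into a[0], and subtracted as
	 * (s << 44) from a[1] and (s << 36) from a[3].
	 */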
	s = a[4] >> 48;             /* s < 2^14 */
	a[0] += s;                  /* a[0] < 2^52 + 2^14 */
	w = a[1] - (s << 44);
	a[1] = w & MASK52;          /* a[1] < 2^52 */
	cc = -(w >> 52) & 0xFFF;    /* cc < 16 */
	w = a[2] - cc;
	a[2] = w & MASK52;          /* a[2] < 2^52 */
	cc = w >> 63;               /* cc = 0 or 1 */
	w = a[3] - cc - (s << 36);
	a[3] = w & MASK52;          /* a[3] < 2^52 */
	cc = w >> 63;               /* cc = 0 or 1 */
	w = a[4] & MASK48;
	a[4] = w + (s << 16) - cc;  /* a[4] < 2^48 + 2^30 */
}

/*
 * Subtraction in the field.
 * Input: limbs must fit on 60 bits each; in particular, the complete
 * integer will be less than 2^268 + 2^217.
 * Output: partially reduced.
 */
static inline void
f256_sub(uint64_t *d, const uint64_t *a, const uint64_t *b)
{
	uint64_t t[5], w, s, cc;

	/*
	 * We compute d = 2^13*p + a - b; this ensures a positive
	 * intermediate value.
	 *
	 * Each individual addition/subtraction may yield a positive or
	 * negative result; thus, we need to handle a signed carry, with
	 * sign extension. We prefer not to use signed types (int64_t)
	 * because conversion from unsigned to signed is cumbersome (a
	 * direct cast with the top bit set is undefined behavior; instead,
	 * we have to use pointer aliasing, using the guaranteed properties
	 * of exact-width types, but this requires the compiler to optimize
	 * away the writes and reads from RAM), and right-shifting a
	 * signed negative value is implementation-defined. Therefore,
	 * we use a custom sign extension.
	 */
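	/*
	 * Concretely, 2^13*p = 2^269 - 2^237 + 2^205 + 2^109 - 2^13,
	 * which in this limb layout is -BIT(13) in limb 0, +BIT(5) in
	 * limb 2 (bit 109 = 104+5), +BIT(49) in limb 3 (bit 205 = 156+49)
	 * and +(BIT(61) - BIT(29)) in limb 4 (bits 269 and 237); these
	 * are exactly the constants folded into the expressions below.
	 */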

	w = a[0] - b[0] - BIT(13);
	t[0] = w & MASK52;
	cc = w >> 52;
	cc |= -(cc & BIT(11));
	w = a[1] - b[1] + cc;
	t[1] = w & MASK52;
	cc = w >> 52;
	cc |= -(cc & BIT(11));
	w = a[2] - b[2] + cc;
	t[2] = (w & MASK52) + BIT(5);
	cc = w >> 52;
	cc |= -(cc & BIT(11));
	w = a[3] - b[3] + cc;
	t[3] = (w & MASK52) + BIT(49);
	cc = w >> 52;
	cc |= -(cc & BIT(11));
	t[4] = (BIT(61) - BIT(29)) + a[4] - b[4] + cc;

	/*
	 * Perform partial reduction. Rule is:
	 *   2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
	 *
	 * At that point:
	 *      0 <= t[0] <= 2^52 - 1
	 *      0 <= t[1] <= 2^52 - 1
	 *    2^5 <= t[2] <= 2^52 + 2^5 - 1
	 *   2^49 <= t[3] <= 2^52 + 2^49 - 1
	 *   2^59 <  t[4] <= 2^61 + 2^60 - 2^29
	 *
	 * Thus, the value 's' (t[4] / 2^48) will be necessarily
	 * greater than 2048, and less than 12288.
	 */
	s = t[4] >> 48;

	d[0] = t[0] + s;             /* d[0] <= 2^52 + 12287 */
	w = t[1] - (s << 44);
	d[1] = w & MASK52;           /* d[1] <= 2^52 - 1 */
	cc = -(w >> 52) & 0xFFF;     /* cc <= 48 */
	w = t[2] - cc;
	cc = w >> 63;                /* cc = 0 or 1 */
	d[2] = w + (cc << 52);       /* d[2] <= 2^52 + 31 */
	w = t[3] - cc - (s << 36);
	cc = w >> 63;                /* cc = 0 or 1 */
	d[3] = w + (cc << 52);       /* d[3] <= 2^52 + 2^49 - 1 */
	d[4] = (t[4] & MASK48) + (s << 16) - cc;  /* d[4] < 2^48 + 2^30 */

	/*
	 * If s = 0, then none of the limbs is modified, and there cannot
	 * be an overflow; if s != 0, then (s << 16) > cc, and there is
	 * no overflow either.
	 */
}

/*
 * Montgomery multiplication in the field.
 * Input: limbs must fit on 56 bits each.
 * Output: partially reduced.
 */
static void
f256_montymul(uint64_t *d, const uint64_t *a, const uint64_t *b)
{
#if BR_INT128

	int i;
	uint64_t t[5];

	t[0] = 0;
	t[1] = 0;
	t[2] = 0;
	t[3] = 0;
	t[4] = 0;
	for (i = 0; i < 5; i ++) {
		uint64_t x, f, cc, w, s;
		unsigned __int128 z;

		/*
		 * Since limbs of a[] and b[] fit on 56 bits each,
		 * each individual product fits on 112 bits. Also,
		 * the factor f fits on 52 bits, so f<<48 fits on
		 * 112 bits too. This guarantees that carries (cc)
		 * will fit on 62 bits, thus no overflow.
		 *
		 * The operations below compute:
		 *   t <- (t + x*b + f*p) / 2^52
		 */
		x = a[i];
		z = (unsigned __int128)b[0] * (unsigned __int128)x
			+ (unsigned __int128)t[0];
		f = (uint64_t)z & MASK52;
		cc = (uint64_t)(z >> 52);
		z = (unsigned __int128)b[1] * (unsigned __int128)x
			+ (unsigned __int128)t[1] + cc
			+ ((unsigned __int128)f << 44);
		t[0] = (uint64_t)z & MASK52;
		cc = (uint64_t)(z >> 52);
		z = (unsigned __int128)b[2] * (unsigned __int128)x
			+ (unsigned __int128)t[2] + cc;
		t[1] = (uint64_t)z & MASK52;
		cc = (uint64_t)(z >> 52);
		z = (unsigned __int128)b[3] * (unsigned __int128)x
			+ (unsigned __int128)t[3] + cc
			+ ((unsigned __int128)f << 36);
		t[2] = (uint64_t)z & MASK52;
		cc = (uint64_t)(z >> 52);
		z = (unsigned __int128)b[4] * (unsigned __int128)x
			+ (unsigned __int128)t[4] + cc
			+ ((unsigned __int128)f << 48)
			- ((unsigned __int128)f << 16);
		t[3] = (uint64_t)z & MASK52;
		t[4] = (uint64_t)(z >> 52);

		/*
		 * t[4] may be up to 62 bits here; we need to do a
		 * partial reduction. Note that limbs t[0] to t[3]
		 * fit on 52 bits each.
		 */
		s = t[4] >> 48;             /* s < 2^14 */
		t[0] += s;                  /* t[0] < 2^52 + 2^14 */
		w = t[1] - (s << 44);
		t[1] = w & MASK52;          /* t[1] < 2^52 */
		cc = -(w >> 52) & 0xFFF;    /* cc < 16 */
		w = t[2] - cc;
		t[2] = w & MASK52;          /* t[2] < 2^52 */
		cc = w >> 63;               /* cc = 0 or 1 */
		w = t[3] - cc - (s << 36);
		t[3] = w & MASK52;          /* t[3] < 2^52 */
		cc = w >> 63;               /* cc = 0 or 1 */
		w = t[4] & MASK48;
		t[4] = w + (s << 16) - cc;  /* t[4] < 2^48 + 2^30 */

		/*
		 * The final t[4] cannot overflow because cc is 0 or 1,
		 * and cc can be 1 only if s != 0.
		 */
	}

	d[0] = t[0];
	d[1] = t[1];
	d[2] = t[2];
	d[3] = t[3];
	d[4] = t[4];

#elif BR_UMUL128

	int i;
	uint64_t t[5];

	t[0] = 0;
	t[1] = 0;
	t[2] = 0;
	t[3] = 0;
	t[4] = 0;
	for (i = 0; i < 5; i ++) {
		uint64_t x, f, cc, w, s, zh, zl;
		unsigned char k;

		/*
		 * Since limbs of a[] and b[] fit on 56 bits each,
		 * each individual product fits on 112 bits. Also,
		 * the factor f fits on 52 bits, so f<<48 fits on
		 * 112 bits too. This guarantees that carries (cc)
		 * will fit on 62 bits, thus no overflow.
		 *
		 * The operations below compute:
		 *   t <- (t + x*b + f*p) / 2^52
		 */
		x = a[i];
		zl = _umul128(b[0], x, &zh);
		k = _addcarry_u64(0, t[0], zl, &zl);
		(void)_addcarry_u64(k, 0, zh, &zh);
		f = zl & MASK52;
		cc = (zl >> 52) | (zh << 12);

		zl = _umul128(b[1], x, &zh);
		k = _addcarry_u64(0, t[1], zl, &zl);
		(void)_addcarry_u64(k, 0, zh, &zh);
		k = _addcarry_u64(0, cc, zl, &zl);
		(void)_addcarry_u64(k, 0, zh, &zh);
		k = _addcarry_u64(0, f << 44, zl, &zl);
		(void)_addcarry_u64(k, f >> 20, zh, &zh);
		t[0] = zl & MASK52;
		cc = (zl >> 52) | (zh << 12);

		zl = _umul128(b[2], x, &zh);
		k = _addcarry_u64(0, t[2], zl, &zl);
		(void)_addcarry_u64(k, 0, zh, &zh);
		k = _addcarry_u64(0, cc, zl, &zl);
		(void)_addcarry_u64(k, 0, zh, &zh);
		t[1] = zl & MASK52;
		cc = (zl >> 52) | (zh << 12);

		zl = _umul128(b[3], x, &zh);
		k = _addcarry_u64(0, t[3], zl, &zl);
		(void)_addcarry_u64(k, 0, zh, &zh);
		k = _addcarry_u64(0, cc, zl, &zl);
		(void)_addcarry_u64(k, 0, zh, &zh);
		k = _addcarry_u64(0, f << 36, zl, &zl);
		(void)_addcarry_u64(k, f >> 28, zh, &zh);
		t[2] = zl & MASK52;
		cc = (zl >> 52) | (zh << 12);

		zl = _umul128(b[4], x, &zh);
		k = _addcarry_u64(0, t[4], zl, &zl);
		(void)_addcarry_u64(k, 0, zh, &zh);
		k = _addcarry_u64(0, cc, zl, &zl);
		(void)_addcarry_u64(k, 0, zh, &zh);
		k = _addcarry_u64(0, f << 48, zl, &zl);
		(void)_addcarry_u64(k, f >> 16, zh, &zh);
		k = _subborrow_u64(0, zl, f << 16, &zl);
		(void)_subborrow_u64(k, zh, f >> 48, &zh);
		t[3] = zl & MASK52;
		t[4] = (zl >> 52) | (zh << 12);

		/*
		 * t[4] may be up to 62 bits here; we need to do a
		 * partial reduction. Note that limbs t[0] to t[3]
		 * fit on 52 bits each.
		 */
		s = t[4] >> 48;             /* s < 2^14 */
		t[0] += s;                  /* t[0] < 2^52 + 2^14 */
		w = t[1] - (s << 44);
		t[1] = w & MASK52;          /* t[1] < 2^52 */
		cc = -(w >> 52) & 0xFFF;    /* cc < 16 */
		w = t[2] - cc;
		t[2] = w & MASK52;          /* t[2] < 2^52 */
		cc = w >> 63;               /* cc = 0 or 1 */
		w = t[3] - cc - (s << 36);
		t[3] = w & MASK52;          /* t[3] < 2^52 */
		cc = w >> 63;               /* cc = 0 or 1 */
		w = t[4] & MASK48;
		t[4] = w + (s << 16) - cc;  /* t[4] < 2^48 + 2^30 */

		/*
		 * The final t[4] cannot overflow because cc is 0 or 1,
		 * and cc can be 1 only if s != 0.
		 */
	}

	d[0] = t[0];
	d[1] = t[1];
	d[2] = t[2];
	d[3] = t[3];
	d[4] = t[4];

#endif
}

/*
 * Montgomery squaring in the field; currently a basic wrapper around
 * multiplication (inline, should be optimized away).
 * TODO: see if some extra speed can be gained here.
 */
static inline void
f256_montysquare(uint64_t *d, const uint64_t *a)
{
	f256_montymul(d, a, a);
}

/*
 * Convert to Montgomery representation.
 */
static void
f256_tomonty(uint64_t *d, const uint64_t *a)
{
	/*
	 * R2 = 2^520 mod p.
	 * If R = 2^260 mod p, then R2 = R^2 mod p; and the Montgomery
	 * multiplication of a by R2 is: a*R2/R = a*R mod p, i.e. the
	 * conversion to Montgomery representation.
	 */
	static const uint64_t R2[] = {
		0x0000000000300, 0xFFFFFFFF00000, 0xFFFFEFFFFFFFB,
		0xFDFFFFFFFFFFF, 0x0000004FFFFFF
	};

	f256_montymul(d, a, R2);
}

/*
 * Convert from Montgomery representation.
 */
static void
f256_frommonty(uint64_t *d, const uint64_t *a)
{
	/*
	 * Montgomery multiplication by 1 is division by 2^260 modulo p.
	 */
	static const uint64_t one[] = { 1, 0, 0, 0, 0 };

	f256_montymul(d, a, one);
}

/*
 * Inversion in the field. If the source value is 0 modulo p, then this
 * returns 0 or p. This function uses Montgomery representation.
 */
static void
f256_invert(uint64_t *d, const uint64_t *a)
{
	/*
	 * We compute a^(p-2) mod p. The exponent pattern (from high to
	 * low) is:
	 *  - 32 bits of value 1
	 *  - 31 bits of value 0
	 *  - 1 bit of value 1
	 *  - 96 bits of value 0
	 *  - 94 bits of value 1
	 *  - 1 bit of value 0
	 *  - 1 bit of value 1
	 * To speed up the square-and-multiply algorithm, we precompute
	 * a^(2^31-1).
	 */
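	/*
	 * In the loop below, the multiplications by a (at i = 224, 192,
	 * 2 and 0) each set a single exponent bit, while the
	 * multiplications by t = a^(2^31-1) (at i = 65, 34 and 3) each
	 * set a run of 31 consecutive bits (bits i to i+30); together
	 * with the initial value r = t (the top 31 bits), this yields
	 * exactly the p-2 bit pattern described above.
	 */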

	uint64_t r[5], t[5];
	int i;

	memcpy(t, a, sizeof t);
	for (i = 0; i < 30; i ++) {
		f256_montysquare(t, t);
		f256_montymul(t, t, a);
	}

	memcpy(r, t, sizeof t);
	for (i = 224; i >= 0; i --) {
		f256_montysquare(r, r);
		switch (i) {
		case 0:
		case 2:
		case 192:
		case 224:
			f256_montymul(r, r, a);
			break;
		case 3:
		case 34:
		case 65:
			f256_montymul(r, r, t);
			break;
		}
	}
	memcpy(d, r, sizeof r);
}

/*
 * Finalize reduction.
 * Input value should be partially reduced.
 * On output, limbs a[0] to a[3] fit on 52 bits each, limb a[4] fits
 * on 48 bits, and the integer is less than p.
 */
static inline void
f256_final_reduce(uint64_t *a)
{
	uint64_t r[5], t[5], w, cc;
	int i;

	/*
	 * Propagate carries to ensure that limbs 0 to 3 fit on 52 bits.
	 */
	cc = 0;
	for (i = 0; i < 5; i ++) {
		w = a[i] + cc;
		r[i] = w & MASK52;
		cc = w >> 52;
	}

	/*
	 * We compute t = r + (2^256 - p) = r + 2^224 - 2^192 - 2^96 + 1.
	 * If t < 2^256, then r < p, and we return r. Otherwise, we
	 * want to return r - p = t - 2^256.
	 */

	/*
	 * Add 2^224 + 1, and propagate carries to ensure that limbs
	 * t[0] to t[3] fit in 52 bits each.
	 */
	w = r[0] + 1;
	t[0] = w & MASK52;
	cc = w >> 52;
	w = r[1] + cc;
	t[1] = w & MASK52;
	cc = w >> 52;
	w = r[2] + cc;
	t[2] = w & MASK52;
	cc = w >> 52;
	w = r[3] + cc;
	t[3] = w & MASK52;
	cc = w >> 52;
	t[4] = r[4] + cc + BIT(16);

	/*
	 * Subtract 2^192 + 2^96. Since we just added 2^224 + 1, the
	 * result cannot be negative.
	 */
	w = t[1] - BIT(44);
	t[1] = w & MASK52;
	cc = w >> 63;
	w = t[2] - cc;
	t[2] = w & MASK52;
	cc = w >> 63;
	w = t[3] - BIT(36) - cc;
	t[3] = w & MASK52;
	cc = w >> 63;
	t[4] -= cc;

	/*
	 * If the top limb t[4] fits on 48 bits, then r[] is already
	 * in the proper range. Otherwise, t[] is the value to return
	 * (truncated to 256 bits).
	 */
	cc = -(t[4] >> 48);
	t[4] &= MASK48;
	for (i = 0; i < 5; i ++) {
		a[i] = r[i] ^ (cc & (r[i] ^ t[i]));
	}
}
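
/*
 * Illustrative sketch only (kept under #if 0; the rest of this file
 * works directly on Montgomery-domain values): a plain modular product
 * r = x*y mod p would be obtained by converting to Montgomery form,
 * multiplying, converting back, and finalizing the reduction. The
 * function name f256_mul is hypothetical and not used elsewhere.
 */
#if 0
static void
f256_mul(uint64_t *r, const uint64_t *x, const uint64_t *y)
{
	uint64_t xm[5], ym[5];

	f256_tomonty(xm, x);        /* xm = x*R mod p */
	f256_tomonty(ym, y);        /* ym = y*R mod p */
	f256_montymul(r, xm, ym);   /* r = x*y*R mod p (partially reduced) */
	f256_frommonty(r, r);       /* r = x*y mod p (partially reduced) */
	f256_final_reduce(r);       /* canonical representative, less than p */
}
#endif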

/*
 * Points in affine and Jacobian coordinates.
 *
 *  - In affine coordinates, the point-at-infinity cannot be encoded.
 *  - Jacobian coordinates (X,Y,Z) correspond to affine (X/Z^2,Y/Z^3);
 *    if Z = 0 then this is the point-at-infinity.
 */
typedef struct {
	uint64_t x[5];
	uint64_t y[5];
} p256_affine;

typedef struct {
	uint64_t x[5];
	uint64_t y[5];
	uint64_t z[5];
} p256_jacobian;

/*
 * Decode a field element (unsigned big endian notation).
 */
static void
f256_decode(uint64_t *a, const unsigned char *buf)
{
	uint64_t w0, w1, w2, w3;

	w3 = br_dec64be(buf + 0);
	w2 = br_dec64be(buf + 8);
	w1 = br_dec64be(buf + 16);
	w0 = br_dec64be(buf + 24);
	a[0] = w0 & MASK52;
	a[1] = ((w0 >> 52) | (w1 << 12)) & MASK52;
	a[2] = ((w1 >> 40) | (w2 << 24)) & MASK52;
	a[3] = ((w2 >> 28) | (w3 << 36)) & MASK52;
	a[4] = w3 >> 16;
}
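
/*
 * In the decoding above, w0..w3 are the four 64-bit words of the value
 * in increasing order of significance (w0 = least significant); each
 * 52-bit limb is then assembled from the words that straddle it, e.g.
 * a[1] takes the top 12 bits of w0 and the low 40 bits of w1.
 * f256_encode() below performs the exact inverse packing.
 */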

/*
 * Encode a field element (unsigned big endian notation). The field
 * element MUST be fully reduced.
 */
static void
f256_encode(unsigned char *buf, const uint64_t *a)
{
	uint64_t w0, w1, w2, w3;

	w0 = a[0] | (a[1] << 52);
	w1 = (a[1] >> 12) | (a[2] << 40);
	w2 = (a[2] >> 24) | (a[3] << 28);
	w3 = (a[3] >> 36) | (a[4] << 16);
	br_enc64be(buf + 0, w3);
	br_enc64be(buf + 8, w2);
	br_enc64be(buf + 16, w1);
	br_enc64be(buf + 24, w0);
}

/*
 * Decode a point. The returned point is in Jacobian coordinates, but
 * with z = 1. If the encoding is invalid, or encodes a point which is
 * not on the curve, or encodes the point at infinity, then this function
 * returns 0. Otherwise, 1 is returned.
 *
 * The buffer is assumed to have length exactly 65 bytes.
 */
static uint32_t
point_decode(p256_jacobian *P, const unsigned char *buf)
{
	uint64_t x[5], y[5], t[5], x3[5], tt;
	uint32_t r;

	/*
	 * Header byte shall be 0x04.
	 */
	r = EQ(buf[0], 0x04);

	/*
	 * Decode X and Y coordinates, and convert them into
	 * Montgomery representation.
	 */
	f256_decode(x, buf + 1);
	f256_decode(y, buf + 33);
	f256_tomonty(x, x);
	f256_tomonty(y, y);

	/*
	 * Verify y^2 = x^3 + A*x + B. In curve P-256, A = -3.
	 * Note that the Montgomery representation of 0 is 0. We must
	 * take care to apply the final reduction to make sure we have
	 * 0 and not p.
	 */
	f256_montysquare(t, y);
	f256_montysquare(x3, x);
	f256_montymul(x3, x3, x);
	f256_sub(t, t, x3);
	f256_add(t, t, x);
	f256_add(t, t, x);
	f256_add(t, t, x);
	f256_sub(t, t, P256_B_MONTY);
	f256_final_reduce(t);
	tt = t[0] | t[1] | t[2] | t[3] | t[4];
	r &= EQ((uint32_t)(tt | (tt >> 32)), 0);

	/*
	 * Return the point in Jacobian coordinates (and Montgomery
	 * representation).
	 */
	memcpy(P->x, x, sizeof x);
	memcpy(P->y, y, sizeof y);
	memcpy(P->z, F256_R, sizeof F256_R);
	return r;
}

/*
 * Final conversion for a point:
 *  - The point is converted back to affine coordinates.
 *  - Final reduction is performed.
 *  - The point is encoded into the provided buffer.
 *
 * If the point is the point-at-infinity, all operations are performed,
 * but the buffer contents are indeterminate, and 0 is returned. Otherwise,
 * the encoded point is written in the buffer, and 1 is returned.
 */
static uint32_t
point_encode(unsigned char *buf, const p256_jacobian *P)
{
	uint64_t t1[5], t2[5], z;

	/* Set t1 = 1/z^2 and t2 = 1/z^3. */
	f256_invert(t2, P->z);
	f256_montysquare(t1, t2);
	f256_montymul(t2, t2, t1);

	/* Compute affine coordinates x (in t1) and y (in t2). */
	f256_montymul(t1, P->x, t1);
	f256_montymul(t2, P->y, t2);

	/* Convert back from Montgomery representation, and finalize
	   reductions. */
	f256_frommonty(t1, t1);
	f256_frommonty(t2, t2);
	f256_final_reduce(t1);
	f256_final_reduce(t2);

	/* Encode. */
	buf[0] = 0x04;
	f256_encode(buf + 1, t1);
	f256_encode(buf + 33, t2);

	/* Return success if and only if P->z != 0. */
	z = P->z[0] | P->z[1] | P->z[2] | P->z[3] | P->z[4];
	return NEQ((uint32_t)(z | z >> 32), 0);
}

/*
 * Point doubling in Jacobian coordinates: point P is doubled.
 * Note: if the source point is the point-at-infinity, then the result is
 * still the point-at-infinity, which is correct. Moreover, if the three
 * coordinates were zero, then they still are zero in the returned value.
 */
static void
p256_double(p256_jacobian *P)
{
	/*
	 * Doubling formulas are:
	 *
	 *   s = 4*x*y^2
	 *   m = 3*(x + z^2)*(x - z^2)
	 *   x' = m^2 - 2*s
	 *   y' = m*(s - x') - 8*y^4
	 *   z' = 2*y*z
	 *
	 * These formulas work for all points, including points of order 2
	 * and points at infinity:
	 *   - If y = 0 then z' = 0. But there is no such point in P-256
	 *     anyway.
	 *   - If z = 0 then z' = 0.
	 */
	uint64_t t1[5], t2[5], t3[5], t4[5];

	/*
	 * Compute z^2 in t1.
	 */
	f256_montysquare(t1, P->z);

	/*
	 * Compute x+z^2 in t2 and x-z^2 in t1.
	 */
	f256_add(t2, P->x, t1);
	f256_sub(t1, P->x, t1);

	/*
	 * Compute 3*(x+z^2)*(x-z^2) in t1.
	 */
	f256_montymul(t3, t1, t2);
	f256_add(t1, t3, t3);
	f256_add(t1, t3, t1);

	/*
	 * Compute 4*x*y^2 (in t2) and 2*y^2 (in t3).
	 */
	f256_montysquare(t3, P->y);
	f256_add(t3, t3, t3);
	f256_montymul(t2, P->x, t3);
	f256_add(t2, t2, t2);

	/*
	 * Compute x' = m^2 - 2*s.
	 */
	f256_montysquare(P->x, t1);
	f256_sub(P->x, P->x, t2);
	f256_sub(P->x, P->x, t2);

	/*
	 * Compute z' = 2*y*z.
	 */
	f256_montymul(t4, P->y, P->z);
	f256_add(P->z, t4, t4);
	f256_partial_reduce(P->z);

	/*
	 * Compute y' = m*(s - x') - 8*y^4. Note that we already have
	 * 2*y^2 in t3.
	 */
	f256_sub(t2, t2, P->x);
	f256_montymul(P->y, t1, t2);
	f256_montysquare(t4, t3);
	f256_add(t4, t4, t4);
	f256_sub(P->y, P->y, t4);
}

/*
 * Point addition (Jacobian coordinates): P1 is replaced with P1+P2.
 * This function computes the wrong result in the following cases:
 *
 *   - If P1 == 0 but P2 != 0
 *   - If P1 != 0 but P2 == 0
 *   - If P1 == P2
 *
 * In all three cases, P1 is set to the point at infinity.
 *
 * Returned value is 0 if one of the following occurs:
 *
 *   - P1 and P2 have the same Y coordinate.
 *   - P1 == 0 and P2 == 0.
 *   - The Y coordinate of one of the points is 0 and the other point is
 *     the point at infinity.
 *
 * The third case cannot actually happen with valid points, since a point
 * with Y == 0 is a point of order 2, and there is no point of order 2 on
 * curve P-256.
 *
 * Therefore, assuming that P1 != 0 and P2 != 0 on input, then the caller
 * can apply the following:
 *
 *   - If the result is not the point at infinity, then it is correct.
 *   - Otherwise, if the returned value is 1, then this is a case of
 *     P1+P2 == 0, so the result is indeed the point at infinity.
 *   - Otherwise, P1 == P2, so a "double" operation should have been
 *     performed.
 *
 * Note that you can get a returned value of 0 with a correct result,
 * e.g. if P1 and P2 have the same Y coordinate, but distinct X coordinates.
 */
static uint32_t
p256_add(p256_jacobian *P1, const p256_jacobian *P2)
{
	/*
	 * Addition formulas are:
	 *
	 *   u1 = x1 * z2^2
	 *   u2 = x2 * z1^2
	 *   s1 = y1 * z2^3
	 *   s2 = y2 * z1^3
	 *   h = u2 - u1
	 *   r = s2 - s1
	 *   x3 = r^2 - h^3 - 2 * u1 * h^2
	 *   y3 = r * (u1 * h^2 - x3) - s1 * h^3
	 *   z3 = h * z1 * z2
	 */
	uint64_t t1[5], t2[5], t3[5], t4[5], t5[5], t6[5], t7[5], tt;
	uint32_t ret;

	/*
	 * Compute u1 = x1*z2^2 (in t1) and s1 = y1*z2^3 (in t3).
	 */
	f256_montysquare(t3, P2->z);
	f256_montymul(t1, P1->x, t3);
	f256_montymul(t4, P2->z, t3);
	f256_montymul(t3, P1->y, t4);

	/*
	 * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
	 */
	f256_montysquare(t4, P1->z);
	f256_montymul(t2, P2->x, t4);
	f256_montymul(t5, P1->z, t4);
	f256_montymul(t4, P2->y, t5);

	/*
	 * Compute h = u2 - u1 (in t2) and r = s2 - s1 (in t4).
	 * We need to test whether r is zero, so we will do some extra
	 * reduction.
	 */
	f256_sub(t2, t2, t1);
	f256_sub(t4, t4, t3);
	f256_final_reduce(t4);
	tt = t4[0] | t4[1] | t4[2] | t4[3] | t4[4];
	ret = (uint32_t)(tt | (tt >> 32));
	ret = (ret | -ret) >> 31;

	/*
	 * Compute u1*h^2 (in t6) and h^3 (in t5).
	 */
	f256_montysquare(t7, t2);
	f256_montymul(t6, t1, t7);
	f256_montymul(t5, t7, t2);

	/*
	 * Compute x3 = r^2 - h^3 - 2*u1*h^2.
	 */
	f256_montysquare(P1->x, t4);
	f256_sub(P1->x, P1->x, t5);
	f256_sub(P1->x, P1->x, t6);
	f256_sub(P1->x, P1->x, t6);

	/*
	 * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
	 */
	f256_sub(t6, t6, P1->x);
	f256_montymul(P1->y, t4, t6);
	f256_montymul(t1, t5, t3);
	f256_sub(P1->y, P1->y, t1);

	/*
	 * Compute z3 = h*z1*z2.
	 */
	f256_montymul(t1, P1->z, P2->z);
	f256_montymul(P1->z, t1, t2);

	return ret;
}
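
/*
 * Illustrative caller pattern (a sketch, not code from this file; the
 * real scalar multiplication routines handle this situation with
 * constant-time selection rather than a branch): when both inputs are
 * known to be non-zero, a zero return value together with a zero Z
 * coordinate means P1 == P2, so the sum is obtained by doubling instead.
 */
#if 0
static void
p256_add_or_double(p256_jacobian *P1, const p256_jacobian *P2)
{
	p256_jacobian Q;
	uint64_t zz;
	uint32_t ret;

	Q = *P1;
	ret = p256_add(P1, P2);
	zz = P1->z[0] | P1->z[1] | P1->z[2] | P1->z[3] | P1->z[4];
	if (ret == 0 && zz == 0) {
		/* Ambiguous case with a zero result: it was P1 == P2. */
		*P1 = Q;
		p256_double(P1);
	}
}
#endif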

/*
 * Point addition (mixed coordinates): P1 is replaced with P1+P2.
 * This is a specialised function for the case when P2 is a non-zero point
 * in affine coordinates.
 *
 * This function computes the wrong result in the following cases:
 *
 *   - If P1 == 0
 *   - If P1 == P2
 *
 * In both cases, P1 is set to the point at infinity.
 *
 * Returned value is 0 if one of the following occurs:
 *
 *   - P1 and P2 have the same Y (affine) coordinate.
 *   - The Y coordinate of P2 is 0 and P1 is the point at infinity.
 *
 * The second case cannot actually happen with valid points, since a point
 * with Y == 0 is a point of order 2, and there is no point of order 2 on
 * curve P-256.
 *
 * Therefore, assuming that P1 != 0 on input, then the caller
 * can apply the following:
 *
 *   - If the result is not the point at infinity, then it is correct.
 *   - Otherwise, if the returned value is 1, then this is a case of
 *     P1+P2 == 0, so the result is indeed the point at infinity.
 *   - Otherwise, P1 == P2, so a "double" operation should have been
 *     performed.
 *
 * Again, a value of 0 may be returned in some cases where the addition
 * result is correct.
 */
static uint32_t
p256_add_mixed(p256_jacobian *P1, const p256_affine *P2)
{
	/*
	 * Addition formulas are:
	 *
	 *   u1 = x1
	 *   u2 = x2 * z1^2
	 *   s1 = y1
	 *   s2 = y2 * z1^3
	 *   h = u2 - u1
	 *   r = s2 - s1
	 *   x3 = r^2 - h^3 - 2 * u1 * h^2
	 *   y3 = r * (u1 * h^2 - x3) - s1 * h^3
	 *   z3 = h * z1
	 */
	uint64_t t1[5], t2[5], t3[5], t4[5], t5[5], t6[5], t7[5], tt;
	uint32_t ret;

	/*
	 * Compute u1 = x1 (in t1) and s1 = y1 (in t3).
	 */
	memcpy(t1, P1->x, sizeof t1);
	memcpy(t3, P1->y, sizeof t3);

	/*
	 * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
	 */
	f256_montysquare(t4, P1->z);
	f256_montymul(t2, P2->x, t4);
	f256_montymul(t5, P1->z, t4);
	f256_montymul(t4, P2->y, t5);

	/*
	 * Compute h = u2 - u1 (in t2) and r = s2 - s1 (in t4).
	 * We need to test whether r is zero, so we will do some extra
	 * reduction.
	 */
	f256_sub(t2, t2, t1);
	f256_sub(t4, t4, t3);
	f256_final_reduce(t4);
	tt = t4[0] | t4[1] | t4[2] | t4[3] | t4[4];
	ret = (uint32_t)(tt | (tt >> 32));
	ret = (ret | -ret) >> 31;

	/*
	 * Compute u1*h^2 (in t6) and h^3 (in t5).
	 */
	f256_montysquare(t7, t2);
	f256_montymul(t6, t1, t7);
	f256_montymul(t5, t7, t2);

	/*
	 * Compute x3 = r^2 - h^3 - 2*u1*h^2.
	 */
	f256_montysquare(P1->x, t4);
	f256_sub(P1->x, P1->x, t5);
	f256_sub(P1->x, P1->x, t6);
	f256_sub(P1->x, P1->x, t6);

	/*
	 * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
	 */
	f256_sub(t6, t6, P1->x);
	f256_montymul(P1->y, t4, t6);
	f256_montymul(t1, t5, t3);
	f256_sub(P1->y, P1->y, t1);

	/*
	 * Compute z3 = h*z1.
	 */
	f256_montymul(P1->z, P1->z, t2);

	return ret;
}
10500957b409SSimon J. Gerraty
10510957b409SSimon J. Gerraty #if 0
10520957b409SSimon J. Gerraty /* unused */
10530957b409SSimon J. Gerraty /*
10540957b409SSimon J. Gerraty * Point addition (mixed coordinates, complete): P1 is replaced with P1+P2.
10550957b409SSimon J. Gerraty * This is a specialised function for the case when P2 is a non-zero point
10560957b409SSimon J. Gerraty * in affine coordinates.
10570957b409SSimon J. Gerraty *
10580957b409SSimon J. Gerraty * This function returns the correct result in all cases.
10590957b409SSimon J. Gerraty */
10600957b409SSimon J. Gerraty static uint32_t
10610957b409SSimon J. Gerraty p256_add_complete_mixed(p256_jacobian *P1, const p256_affine *P2)
10620957b409SSimon J. Gerraty {
10630957b409SSimon J. Gerraty /*
10640957b409SSimon J. Gerraty * Addition formulas, in the general case, are:
10650957b409SSimon J. Gerraty *
10660957b409SSimon J. Gerraty * u1 = x1
10670957b409SSimon J. Gerraty * u2 = x2 * z1^2
10680957b409SSimon J. Gerraty * s1 = y1
10690957b409SSimon J. Gerraty * s2 = y2 * z1^3
10700957b409SSimon J. Gerraty * h = u2 - u1
10710957b409SSimon J. Gerraty * r = s2 - s1
10720957b409SSimon J. Gerraty * x3 = r^2 - h^3 - 2 * u1 * h^2
10730957b409SSimon J. Gerraty * y3 = r * (u1 * h^2 - x3) - s1 * h^3
10740957b409SSimon J. Gerraty * z3 = h * z1
10750957b409SSimon J. Gerraty *
10760957b409SSimon J. Gerraty * These formulas mishandle the two following cases:
10770957b409SSimon J. Gerraty *
10780957b409SSimon J. Gerraty * - If P1 is the point-at-infinity (z1 = 0), then z3 is
10790957b409SSimon J. Gerraty * incorrectly set to 0.
10800957b409SSimon J. Gerraty *
10810957b409SSimon J. Gerraty * - If P1 = P2, then u1 = u2 and s1 = s2, and x3, y3 and z3
10820957b409SSimon J. Gerraty * are all set to 0.
10830957b409SSimon J. Gerraty *
10840957b409SSimon J. Gerraty * However, if P1 + P2 = 0, then u1 = u2 but s1 != s2, and then
10850957b409SSimon J. Gerraty * we correctly get z3 = 0 (the point-at-infinity).
10860957b409SSimon J. Gerraty *
10870957b409SSimon J. Gerraty * To fix the case P1 = 0, we copy P2 over P1 at the end,
10880957b409SSimon J. Gerraty * conditionally on z1 = 0.
10890957b409SSimon J. Gerraty *
10900957b409SSimon J. Gerraty * For P1 = P2: in that case, both h and r are set to 0, and
10910957b409SSimon J. Gerraty * we get x3, y3 and z3 equal to 0. We can test for that
10920957b409SSimon J. Gerraty * occurrence to make a mask which will be all-one if P1 = P2,
10930957b409SSimon J. Gerraty * or all-zero otherwise; then we can compute the double of P2
10940957b409SSimon J. Gerraty * and add it, combined with the mask, to (x3,y3,z3).
10950957b409SSimon J. Gerraty *
10960957b409SSimon J. Gerraty * Using the doubling formulas in p256_double() on (x2,y2),
10970957b409SSimon J. Gerraty * simplifying since P2 is affine (i.e. z2 = 1, implicitly),
10980957b409SSimon J. Gerraty * we get:
10990957b409SSimon J. Gerraty * s = 4*x2*y2^2
11000957b409SSimon J. Gerraty * m = 3*(x2 + 1)*(x2 - 1)
11010957b409SSimon J. Gerraty * x' = m^2 - 2*s
11020957b409SSimon J. Gerraty * y' = m*(s - x') - 8*y2^4
11030957b409SSimon J. Gerraty * z' = 2*y2
11040957b409SSimon J. Gerraty * which requires only 6 multiplications. Added to the 11
11050957b409SSimon J. Gerraty * multiplications of the normal mixed addition in Jacobian
11060957b409SSimon J. Gerraty * coordinates, we get a cost of 17 multiplications in total.
11070957b409SSimon J. Gerraty */
11080957b409SSimon J. Gerraty uint64_t t1[5], t2[5], t3[5], t4[5], t5[5], t6[5], t7[5], tt, zz;
11090957b409SSimon J. Gerraty int i;
11100957b409SSimon J. Gerraty
11110957b409SSimon J. Gerraty /*
11120957b409SSimon J. Gerraty * Set zz to -1 if P1 is the point at infinity, 0 otherwise.
11130957b409SSimon J. Gerraty */
11140957b409SSimon J. Gerraty zz = P1->z[0] | P1->z[1] | P1->z[2] | P1->z[3] | P1->z[4];
11150957b409SSimon J. Gerraty zz = ((zz | -zz) >> 63) - (uint64_t)1;
11160957b409SSimon J. Gerraty
11170957b409SSimon J. Gerraty /*
11180957b409SSimon J. Gerraty * Compute u1 = x1 (in t1) and s1 = y1 (in t3).
11190957b409SSimon J. Gerraty */
11200957b409SSimon J. Gerraty memcpy(t1, P1->x, sizeof t1);
11210957b409SSimon J. Gerraty memcpy(t3, P1->y, sizeof t3);
11220957b409SSimon J. Gerraty
11230957b409SSimon J. Gerraty /*
11240957b409SSimon J. Gerraty * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
11250957b409SSimon J. Gerraty */
11260957b409SSimon J. Gerraty f256_montysquare(t4, P1->z);
11270957b409SSimon J. Gerraty f256_montymul(t2, P2->x, t4);
11280957b409SSimon J. Gerraty f256_montymul(t5, P1->z, t4);
11290957b409SSimon J. Gerraty f256_montymul(t4, P2->y, t5);
11300957b409SSimon J. Gerraty
11310957b409SSimon J. Gerraty /*
11320957b409SSimon J. Gerraty * Compute h = u2 - u1 (in t2) and r = s2 - s1 (in t4).
11340957b409SSimon J. Gerraty */
11350957b409SSimon J. Gerraty f256_sub(t2, t2, t1);
11360957b409SSimon J. Gerraty f256_sub(t4, t4, t3);
11370957b409SSimon J. Gerraty
11380957b409SSimon J. Gerraty /*
11390957b409SSimon J. Gerraty * If both h = 0 and r = 0, then P1 = P2, and we want to set
11400957b409SSimon J. Gerraty * the mask tt to -1; otherwise, the mask will be 0.
11410957b409SSimon J. Gerraty */
11420957b409SSimon J. Gerraty f256_final_reduce(t2);
11430957b409SSimon J. Gerraty f256_final_reduce(t4);
11440957b409SSimon J. Gerraty tt = t2[0] | t2[1] | t2[2] | t2[3] | t2[4]
11450957b409SSimon J. Gerraty | t4[0] | t4[1] | t4[2] | t4[3] | t4[4];
11460957b409SSimon J. Gerraty tt = ((tt | -tt) >> 63) - (uint64_t)1;
11470957b409SSimon J. Gerraty
11480957b409SSimon J. Gerraty /*
11490957b409SSimon J. Gerraty * Compute u1*h^2 (in t6) and h^3 (in t5).
11500957b409SSimon J. Gerraty */
11510957b409SSimon J. Gerraty f256_montysquare(t7, t2);
11520957b409SSimon J. Gerraty f256_montymul(t6, t1, t7);
11530957b409SSimon J. Gerraty f256_montymul(t5, t7, t2);
11540957b409SSimon J. Gerraty
11550957b409SSimon J. Gerraty /*
11560957b409SSimon J. Gerraty * Compute x3 = r^2 - h^3 - 2*u1*h^2.
11570957b409SSimon J. Gerraty */
11580957b409SSimon J. Gerraty f256_montysquare(P1->x, t4);
11590957b409SSimon J. Gerraty f256_sub(P1->x, P1->x, t5);
11600957b409SSimon J. Gerraty f256_sub(P1->x, P1->x, t6);
11610957b409SSimon J. Gerraty f256_sub(P1->x, P1->x, t6);
11620957b409SSimon J. Gerraty
11630957b409SSimon J. Gerraty /*
11640957b409SSimon J. Gerraty * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
11650957b409SSimon J. Gerraty */
11660957b409SSimon J. Gerraty f256_sub(t6, t6, P1->x);
11670957b409SSimon J. Gerraty f256_montymul(P1->y, t4, t6);
11680957b409SSimon J. Gerraty f256_montymul(t1, t5, t3);
11690957b409SSimon J. Gerraty f256_sub(P1->y, P1->y, t1);
11700957b409SSimon J. Gerraty
11710957b409SSimon J. Gerraty /*
11720957b409SSimon J. Gerraty * Compute z3 = h*z1.
11730957b409SSimon J. Gerraty */
11740957b409SSimon J. Gerraty f256_montymul(P1->z, P1->z, t2);
11750957b409SSimon J. Gerraty
11760957b409SSimon J. Gerraty /*
11770957b409SSimon J. Gerraty * The "double" result, in case P1 = P2.
11780957b409SSimon J. Gerraty */
11790957b409SSimon J. Gerraty
11800957b409SSimon J. Gerraty /*
11810957b409SSimon J. Gerraty * Compute z' = 2*y2 (in t1).
11820957b409SSimon J. Gerraty */
11830957b409SSimon J. Gerraty f256_add(t1, P2->y, P2->y);
11840957b409SSimon J. Gerraty f256_partial_reduce(t1);
11850957b409SSimon J. Gerraty
11860957b409SSimon J. Gerraty /*
11870957b409SSimon J. Gerraty * Compute 2*(y2^2) (in t2) and s = 4*x2*(y2^2) (in t3).
11880957b409SSimon J. Gerraty */
11890957b409SSimon J. Gerraty f256_montysquare(t2, P2->y);
11900957b409SSimon J. Gerraty f256_add(t2, t2, t2);
11910957b409SSimon J. Gerraty f256_add(t3, t2, t2);
11920957b409SSimon J. Gerraty f256_montymul(t3, P2->x, t3);
11930957b409SSimon J. Gerraty
11940957b409SSimon J. Gerraty /*
11950957b409SSimon J. Gerraty * Compute m = 3*(x2^2 - 1) (in t4).
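	 * Since the curve parameter is a = -3 and z2 = 1 here, the
	 * doubling slope numerator 3*x2^2 + a*z2^4 simplifies to
	 * 3*(x2^2 - 1) = 3*(x2 + 1)*(x2 - 1).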
11960957b409SSimon J. Gerraty */
11970957b409SSimon J. Gerraty f256_montysquare(t4, P2->x);
11980957b409SSimon J. Gerraty f256_sub(t4, t4, F256_R);
11990957b409SSimon J. Gerraty f256_add(t5, t4, t4);
12000957b409SSimon J. Gerraty f256_add(t4, t4, t5);
12010957b409SSimon J. Gerraty
12020957b409SSimon J. Gerraty /*
12030957b409SSimon J. Gerraty * Compute x' = m^2 - 2*s (in t5).
12040957b409SSimon J. Gerraty */
12050957b409SSimon J. Gerraty f256_montysquare(t5, t4);
12060957b409SSimon J. Gerraty f256_sub(t5, t5, t3);
12070957b409SSimon J. Gerraty f256_sub(t5, t5, t3);
12080957b409SSimon J. Gerraty
12090957b409SSimon J. Gerraty /*
12100957b409SSimon J. Gerraty * Compute y' = m*(s - x') - 8*y2^4 (in t6).
12110957b409SSimon J. Gerraty */
12120957b409SSimon J. Gerraty f256_sub(t6, t3, t5);
12130957b409SSimon J. Gerraty f256_montymul(t6, t6, t4);
12140957b409SSimon J. Gerraty f256_montysquare(t7, t2);
12150957b409SSimon J. Gerraty f256_sub(t6, t6, t7);
12160957b409SSimon J. Gerraty f256_sub(t6, t6, t7);
12170957b409SSimon J. Gerraty
12180957b409SSimon J. Gerraty /*
12190957b409SSimon J. Gerraty * We now have the alternate (doubling) coordinates in (t5,t6,t1).
12200957b409SSimon J. Gerraty * We combine them with (x3,y3,z3).
12210957b409SSimon J. Gerraty */
12220957b409SSimon J. Gerraty for (i = 0; i < 5; i ++) {
12230957b409SSimon J. Gerraty P1->x[i] |= tt & t5[i];
12240957b409SSimon J. Gerraty P1->y[i] |= tt & t6[i];
12250957b409SSimon J. Gerraty P1->z[i] |= tt & t1[i];
12260957b409SSimon J. Gerraty }
12270957b409SSimon J. Gerraty
12280957b409SSimon J. Gerraty /*
12290957b409SSimon J. Gerraty * If P1 = 0 (i.e. z1 = 0), then the computed z3 is 0, which
12300957b409SSimon J. Gerraty * is invalid; in that case we replace the result with a copy
12310957b409SSimon J. Gerraty * of P2. The test on z1 was done at the start, in the zz mask.
12320957b409SSimon J. Gerraty */
12330957b409SSimon J. Gerraty for (i = 0; i < 5; i ++) {
12340957b409SSimon J. Gerraty P1->x[i] ^= zz & (P1->x[i] ^ P2->x[i]);
12350957b409SSimon J. Gerraty P1->y[i] ^= zz & (P1->y[i] ^ P2->y[i]);
12360957b409SSimon J. Gerraty P1->z[i] ^= zz & (P1->z[i] ^ F256_R[i]);
12370957b409SSimon J. Gerraty }
12380957b409SSimon J. Gerraty }
12390957b409SSimon J. Gerraty #endif
12400957b409SSimon J. Gerraty
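/*
 * Illustrative sketch (not used; the helper name is arbitrary): the
 * constant-time conditional selection idiom used by
 * p256_add_complete_mixed() above and by point_mul_inner() below. The
 * mask must be either all-zero or all-one; when it is all-one the
 * destination limbs take the source values, otherwise they are left
 * unchanged, with no data-dependent branch or memory access.
 */
#if 0
static void
ct_select_sketch(uint64_t d[5], const uint64_t a[5], uint64_t mask)
{
	int i;

	for (i = 0; i < 5; i ++) {
		/*
		 * If mask == (uint64_t)-1, this sets d[i] to a[i];
		 * if mask == 0, d[i] is XORed with 0 and is unchanged.
		 */
		d[i] ^= mask & (d[i] ^ a[i]);
	}
}
#endif
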
12410957b409SSimon J. Gerraty /*
12420957b409SSimon J. Gerraty * Inner function for computing a point multiplication. A window is
12430957b409SSimon J. Gerraty * provided, with points 1*P to 15*P in affine coordinates.
12440957b409SSimon J. Gerraty *
12450957b409SSimon J. Gerraty * Assumptions:
12460957b409SSimon J. Gerraty * - All provided points are valid points on the curve.
12470957b409SSimon J. Gerraty * - Multiplier is non-zero, and smaller than the curve order.
12480957b409SSimon J. Gerraty * - Everything is in Montgomery representation.
12490957b409SSimon J. Gerraty */
12500957b409SSimon J. Gerraty static void
point_mul_inner(p256_jacobian * R,const p256_affine * W,const unsigned char * k,size_t klen)12510957b409SSimon J. Gerraty point_mul_inner(p256_jacobian *R, const p256_affine *W,
12520957b409SSimon J. Gerraty const unsigned char *k, size_t klen)
12530957b409SSimon J. Gerraty {
12540957b409SSimon J. Gerraty p256_jacobian Q;
12550957b409SSimon J. Gerraty uint32_t qz;
12560957b409SSimon J. Gerraty
12570957b409SSimon J. Gerraty memset(&Q, 0, sizeof Q);
12580957b409SSimon J. Gerraty qz = 1;
12590957b409SSimon J. Gerraty while (klen -- > 0) {
12600957b409SSimon J. Gerraty int i;
12610957b409SSimon J. Gerraty unsigned bk;
12620957b409SSimon J. Gerraty
12630957b409SSimon J. Gerraty bk = *k ++;
12640957b409SSimon J. Gerraty for (i = 0; i < 2; i ++) {
12650957b409SSimon J. Gerraty uint32_t bits;
12660957b409SSimon J. Gerraty uint32_t bnz;
12670957b409SSimon J. Gerraty p256_affine T;
12680957b409SSimon J. Gerraty p256_jacobian U;
12690957b409SSimon J. Gerraty uint32_t n;
12700957b409SSimon J. Gerraty int j;
12710957b409SSimon J. Gerraty uint64_t m;
12720957b409SSimon J. Gerraty
12730957b409SSimon J. Gerraty p256_double(&Q);
12740957b409SSimon J. Gerraty p256_double(&Q);
12750957b409SSimon J. Gerraty p256_double(&Q);
12760957b409SSimon J. Gerraty p256_double(&Q);
12770957b409SSimon J. Gerraty bits = (bk >> 4) & 0x0F;
12780957b409SSimon J. Gerraty bnz = NEQ(bits, 0);
12790957b409SSimon J. Gerraty
12800957b409SSimon J. Gerraty /*
12810957b409SSimon J. Gerraty * Lookup point in window. If the bits are 0,
12820957b409SSimon J. Gerraty * we get something invalid, which is not a
12830957b409SSimon J. Gerraty * problem because we will use it only if the
12840957b409SSimon J. Gerraty * bits are non-zero.
12850957b409SSimon J. Gerraty */
12860957b409SSimon J. Gerraty memset(&T, 0, sizeof T);
12870957b409SSimon J. Gerraty for (n = 0; n < 15; n ++) {
12880957b409SSimon J. Gerraty m = -(uint64_t)EQ(bits, n + 1);
12890957b409SSimon J. Gerraty T.x[0] |= m & W[n].x[0];
12900957b409SSimon J. Gerraty T.x[1] |= m & W[n].x[1];
12910957b409SSimon J. Gerraty T.x[2] |= m & W[n].x[2];
12920957b409SSimon J. Gerraty T.x[3] |= m & W[n].x[3];
12930957b409SSimon J. Gerraty T.x[4] |= m & W[n].x[4];
12940957b409SSimon J. Gerraty T.y[0] |= m & W[n].y[0];
12950957b409SSimon J. Gerraty T.y[1] |= m & W[n].y[1];
12960957b409SSimon J. Gerraty T.y[2] |= m & W[n].y[2];
12970957b409SSimon J. Gerraty T.y[3] |= m & W[n].y[3];
12980957b409SSimon J. Gerraty T.y[4] |= m & W[n].y[4];
12990957b409SSimon J. Gerraty }
13000957b409SSimon J. Gerraty
13010957b409SSimon J. Gerraty U = Q;
13020957b409SSimon J. Gerraty p256_add_mixed(&U, &T);
13030957b409SSimon J. Gerraty
13040957b409SSimon J. Gerraty /*
13050957b409SSimon J. Gerraty * If qz is still 1, then Q was all-zeros, which is
13060957b409SSimon J. Gerraty * preserved through p256_double(); in that case, Q+T = T.
13070957b409SSimon J. Gerraty */
13080957b409SSimon J. Gerraty m = -(uint64_t)(bnz & qz);
13090957b409SSimon J. Gerraty for (j = 0; j < 5; j ++) {
13100957b409SSimon J. Gerraty Q.x[j] ^= m & (Q.x[j] ^ T.x[j]);
13110957b409SSimon J. Gerraty Q.y[j] ^= m & (Q.y[j] ^ T.y[j]);
13120957b409SSimon J. Gerraty Q.z[j] ^= m & (Q.z[j] ^ F256_R[j]);
13130957b409SSimon J. Gerraty }
13140957b409SSimon J. Gerraty CCOPY(bnz & ~qz, &Q, &U, sizeof Q);
13150957b409SSimon J. Gerraty qz &= ~bnz;
13160957b409SSimon J. Gerraty bk <<= 4;
13170957b409SSimon J. Gerraty }
13180957b409SSimon J. Gerraty }
13190957b409SSimon J. Gerraty *R = Q;
13200957b409SSimon J. Gerraty }
13210957b409SSimon J. Gerraty
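/*
 * Illustrative sketch (not used; the helper name is arbitrary): a
 * rolled-up version of the constant-time window lookup performed by
 * point_mul_inner() above. All 15 window entries are read and combined
 * under masks, so the memory access pattern does not depend on the
 * secret index. If bits is 0, the output point is all-zero (and is
 * then ignored by the caller).
 */
#if 0
static void
window_lookup_sketch(p256_affine *T, const p256_affine *W, uint32_t bits)
{
	uint32_t n;
	int j;

	memset(T, 0, sizeof *T);
	for (n = 0; n < 15; n ++) {
		uint64_t m;

		/* m is all-one if and only if bits == n + 1. */
		m = -(uint64_t)EQ(bits, n + 1);
		for (j = 0; j < 5; j ++) {
			T->x[j] |= m & W[n].x[j];
			T->y[j] |= m & W[n].y[j];
		}
	}
}
#endif
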
13220957b409SSimon J. Gerraty /*
13230957b409SSimon J. Gerraty * Convert a window from Jacobian to affine coordinates. A single
13240957b409SSimon J. Gerraty * field inversion is used. This function works for windows up to
13250957b409SSimon J. Gerraty * 32 elements.
13260957b409SSimon J. Gerraty *
13270957b409SSimon J. Gerraty * The destination array (aff[]) and the source array (jac[]) may
13280957b409SSimon J. Gerraty * overlap, provided that the start of aff[] is not after the start of
13290957b409SSimon J. Gerraty * jac[]. Even if the arrays do _not_ overlap, the source array is
13300957b409SSimon J. Gerraty * modified.
13310957b409SSimon J. Gerraty */
13320957b409SSimon J. Gerraty static void
window_to_affine(p256_affine * aff,p256_jacobian * jac,int num)13330957b409SSimon J. Gerraty window_to_affine(p256_affine *aff, p256_jacobian *jac, int num)
13340957b409SSimon J. Gerraty {
13350957b409SSimon J. Gerraty /*
13360957b409SSimon J. Gerraty * Convert the window points to affine coordinates. We use the
13370957b409SSimon J. Gerraty * following trick to mutualize the inversion computation: if
13380957b409SSimon J. Gerraty * we have z1, z2, z3, and z4, and want to invert all of them,
13390957b409SSimon J. Gerraty * we compute u = 1/(z1*z2*z3*z4), and then we have:
13400957b409SSimon J. Gerraty * 1/z1 = u*z2*z3*z4
13410957b409SSimon J. Gerraty * 1/z2 = u*z1*z3*z4
13420957b409SSimon J. Gerraty * 1/z3 = u*z1*z2*z4
13430957b409SSimon J. Gerraty * 1/z4 = u*z1*z2*z3
13440957b409SSimon J. Gerraty *
13450957b409SSimon J. Gerraty * The partial products are computed recursively:
13460957b409SSimon J. Gerraty *
13470957b409SSimon J. Gerraty * - on input (z_1,z_2), return (z_2,z_1) and z_1*z_2
13480957b409SSimon J. Gerraty * - on input (z_1,z_2,... z_n):
13490957b409SSimon J. Gerraty * recurse on (z_1,z_2,... z_(n/2)) -> r1 and m1
13500957b409SSimon J. Gerraty * recurse on (z_(n/2+1),z_(n/2+2)... z_n) -> r2 and m2
13510957b409SSimon J. Gerraty * multiply elements of r1 by m2 -> s1
13520957b409SSimon J. Gerraty * multiply elements of r2 by m1 -> s2
13530957b409SSimon J. Gerraty * return r1||r2 and m1*m2
13540957b409SSimon J. Gerraty *
13550957b409SSimon J. Gerraty * In the example below, we suppose that we have 14 elements.
13560957b409SSimon J. Gerraty * Let z1, z2,... zE be the 14 values to invert (index noted in
13570957b409SSimon J. Gerraty * hexadecimal, starting at 1).
13580957b409SSimon J. Gerraty *
13590957b409SSimon J. Gerraty * - Depth 1:
13600957b409SSimon J. Gerraty * swap(z1, z2); z12 = z1*z2
13610957b409SSimon J. Gerraty * swap(z3, z4); z34 = z3*z4
13620957b409SSimon J. Gerraty * swap(z5, z6); z56 = z5*z6
13630957b409SSimon J. Gerraty * swap(z7, z8); z78 = z7*z8
13640957b409SSimon J. Gerraty * swap(z9, zA); z9A = z9*zA
13650957b409SSimon J. Gerraty * swap(zB, zC); zBC = zB*zC
13660957b409SSimon J. Gerraty * swap(zD, zE); zDE = zD*zE
13670957b409SSimon J. Gerraty *
13680957b409SSimon J. Gerraty * - Depth 2:
13690957b409SSimon J. Gerraty * z1 <- z1*z34, z2 <- z2*z34, z3 <- z3*z12, z4 <- z4*z12
13700957b409SSimon J. Gerraty * z1234 = z12*z34
13710957b409SSimon J. Gerraty * z5 <- z5*z78, z6 <- z6*z78, z7 <- z7*z56, z8 <- z8*z56
13720957b409SSimon J. Gerraty * z5678 = z56*z78
13730957b409SSimon J. Gerraty * z9 <- z9*zBC, zA <- zA*zBC, zB <- zB*z9A, zC <- zC*z9A
13740957b409SSimon J. Gerraty * z9ABC = z9A*zBC
13750957b409SSimon J. Gerraty *
13760957b409SSimon J. Gerraty * - Depth 3:
13770957b409SSimon J. Gerraty * z1 <- z1*z5678, z2 <- z2*z5678, z3 <- z3*z5678, z4 <- z4*z5678
13780957b409SSimon J. Gerraty * z5 <- z5*z1234, z6 <- z6*z1234, z7 <- z7*z1234, z8 <- z8*z1234
13790957b409SSimon J. Gerraty * z12345678 = z1234*z5678
13800957b409SSimon J. Gerraty * z9 <- z9*zDE, zA <- zA*zDE, zB <- zB*zDE, zC <- zC*zDE
13810957b409SSimon J. Gerraty * zD <- zD*z9ABC, zE <- zE*z9ABC
13820957b409SSimon J. Gerraty * z9ABCDE = z9ABC*zDE
13830957b409SSimon J. Gerraty *
13840957b409SSimon J. Gerraty * - Depth 4:
13850957b409SSimon J. Gerraty * multiply z1..z8 by z9ABCDE
13860957b409SSimon J. Gerraty * multiply z9..zE by z12345678
13870957b409SSimon J. Gerraty * final z = z12345678*z9ABCDE
13880957b409SSimon J. Gerraty */
13890957b409SSimon J. Gerraty
13900957b409SSimon J. Gerraty uint64_t z[16][5];
13910957b409SSimon J. Gerraty int i, k, s;
13920957b409SSimon J. Gerraty #define zt (z[15])
13930957b409SSimon J. Gerraty #define zu (z[14])
13940957b409SSimon J. Gerraty #define zv (z[13])
13950957b409SSimon J. Gerraty
13960957b409SSimon J. Gerraty /*
13970957b409SSimon J. Gerraty * First recursion step (pairwise swapping and multiplication).
13980957b409SSimon J. Gerraty * If there is an odd number of elements, then we "invent" an
13990957b409SSimon J. Gerraty * extra one with coordinate Z = 1 (in Montgomery representation).
14000957b409SSimon J. Gerraty */
14010957b409SSimon J. Gerraty for (i = 0; (i + 1) < num; i += 2) {
14020957b409SSimon J. Gerraty memcpy(zt, jac[i].z, sizeof zt);
14030957b409SSimon J. Gerraty memcpy(jac[i].z, jac[i + 1].z, sizeof zt);
14040957b409SSimon J. Gerraty memcpy(jac[i + 1].z, zt, sizeof zt);
14050957b409SSimon J. Gerraty f256_montymul(z[i >> 1], jac[i].z, jac[i + 1].z);
14060957b409SSimon J. Gerraty }
14070957b409SSimon J. Gerraty if ((num & 1) != 0) {
14080957b409SSimon J. Gerraty memcpy(z[num >> 1], jac[num - 1].z, sizeof zt);
14090957b409SSimon J. Gerraty memcpy(jac[num - 1].z, F256_R, sizeof F256_R);
14100957b409SSimon J. Gerraty }
14110957b409SSimon J. Gerraty
14120957b409SSimon J. Gerraty /*
14130957b409SSimon J. Gerraty * Perform further recursion steps. At the entry of each step,
14140957b409SSimon J. Gerraty * the process has been done for groups of 's' points. The
14150957b409SSimon J. Gerraty * integer k is the log2 of s.
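	 * In the first inner loop below, point i belongs to group
	 * i >> k, and (i >> k) ^ 1 is the index of the sibling group,
	 * whose accumulated product is multiplied into that point's Z
	 * coordinate.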
14160957b409SSimon J. Gerraty */
14170957b409SSimon J. Gerraty for (k = 1, s = 2; s < num; k ++, s <<= 1) {
14180957b409SSimon J. Gerraty int n;
14190957b409SSimon J. Gerraty
14200957b409SSimon J. Gerraty for (i = 0; i < num; i ++) {
14210957b409SSimon J. Gerraty f256_montymul(jac[i].z, jac[i].z, z[(i >> k) ^ 1]);
14220957b409SSimon J. Gerraty }
14230957b409SSimon J. Gerraty n = (num + s - 1) >> k;
14240957b409SSimon J. Gerraty for (i = 0; i < (n >> 1); i ++) {
14250957b409SSimon J. Gerraty f256_montymul(z[i], z[i << 1], z[(i << 1) + 1]);
14260957b409SSimon J. Gerraty }
14270957b409SSimon J. Gerraty if ((n & 1) != 0) {
14280957b409SSimon J. Gerraty memmove(z[n >> 1], z[n - 1], sizeof zt);
14290957b409SSimon J. Gerraty }
14300957b409SSimon J. Gerraty }
14310957b409SSimon J. Gerraty
14320957b409SSimon J. Gerraty /*
14330957b409SSimon J. Gerraty * Invert the final result, and convert all points.
14340957b409SSimon J. Gerraty */
14350957b409SSimon J. Gerraty f256_invert(zt, z[0]);
14360957b409SSimon J. Gerraty for (i = 0; i < num; i ++) {
14370957b409SSimon J. Gerraty f256_montymul(zv, jac[i].z, zt);
14380957b409SSimon J. Gerraty f256_montysquare(zu, zv);
14390957b409SSimon J. Gerraty f256_montymul(zv, zv, zu);
14400957b409SSimon J. Gerraty f256_montymul(aff[i].x, jac[i].x, zu);
14410957b409SSimon J. Gerraty f256_montymul(aff[i].y, jac[i].y, zv);
14420957b409SSimon J. Gerraty }
14430957b409SSimon J. Gerraty }
14440957b409SSimon J. Gerraty
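/*
 * Illustrative sketch (not used; the helper name is arbitrary): the
 * batch inversion trick applied by window_to_affine() above, written
 * out for exactly four field elements. A single f256_invert() call
 * yields the inverses of all four values, at the cost of a few extra
 * multiplications. All values are in Montgomery representation; output
 * arrays must not alias the input arrays.
 */
#if 0
static void
batch_invert4_sketch(uint64_t i1[5], uint64_t i2[5],
	uint64_t i3[5], uint64_t i4[5],
	uint64_t z1[5], uint64_t z2[5], uint64_t z3[5], uint64_t z4[5])
{
	uint64_t z12[5], z34[5], u[5], t[5];

	f256_montymul(z12, z1, z2);    /* z12 = z1*z2 */
	f256_montymul(z34, z3, z4);    /* z34 = z3*z4 */
	f256_montymul(t, z12, z34);    /* t = z1*z2*z3*z4 */
	f256_invert(u, t);             /* u = 1/(z1*z2*z3*z4) */

	f256_montymul(t, u, z34);      /* t = 1/(z1*z2) */
	f256_montymul(i1, t, z2);      /* 1/z1 = z2/(z1*z2) */
	f256_montymul(i2, t, z1);      /* 1/z2 = z1/(z1*z2) */
	f256_montymul(t, u, z12);      /* t = 1/(z3*z4) */
	f256_montymul(i3, t, z4);      /* 1/z3 = z4/(z3*z4) */
	f256_montymul(i4, t, z3);      /* 1/z4 = z3/(z3*z4) */
}
#endif
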
14450957b409SSimon J. Gerraty /*
14460957b409SSimon J. Gerraty * Multiply the provided point by an integer.
14470957b409SSimon J. Gerraty * Assumptions:
14480957b409SSimon J. Gerraty * - Source point is a valid curve point.
14490957b409SSimon J. Gerraty * - Source point is not the point-at-infinity.
14500957b409SSimon J. Gerraty * - Integer is not 0, and is lower than the curve order.
14510957b409SSimon J. Gerraty * If these conditions are not met, then the result is indeterminate
14520957b409SSimon J. Gerraty * (but the process is still constant-time).
14530957b409SSimon J. Gerraty */
14540957b409SSimon J. Gerraty static void
p256_mul(p256_jacobian * P,const unsigned char * k,size_t klen)14550957b409SSimon J. Gerraty p256_mul(p256_jacobian *P, const unsigned char *k, size_t klen)
14560957b409SSimon J. Gerraty {
14570957b409SSimon J. Gerraty union {
14580957b409SSimon J. Gerraty p256_affine aff[15];
14590957b409SSimon J. Gerraty p256_jacobian jac[15];
14600957b409SSimon J. Gerraty } window;
14610957b409SSimon J. Gerraty int i;
14620957b409SSimon J. Gerraty
14630957b409SSimon J. Gerraty /*
14640957b409SSimon J. Gerraty * Compute window, in Jacobian coordinates.
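	 * Slot i-1 receives i*P: for even i, this is the double of
	 * (i/2)*P; for odd i, this is the sum ((i-1)/2)*P + ((i+1)/2)*P,
	 * both of which have already been computed.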
14650957b409SSimon J. Gerraty */
14660957b409SSimon J. Gerraty window.jac[0] = *P;
14670957b409SSimon J. Gerraty for (i = 2; i < 16; i ++) {
14680957b409SSimon J. Gerraty window.jac[i - 1] = window.jac[(i >> 1) - 1];
14690957b409SSimon J. Gerraty if ((i & 1) == 0) {
14700957b409SSimon J. Gerraty p256_double(&window.jac[i - 1]);
14710957b409SSimon J. Gerraty } else {
14720957b409SSimon J. Gerraty p256_add(&window.jac[i - 1], &window.jac[i >> 1]);
14730957b409SSimon J. Gerraty }
14740957b409SSimon J. Gerraty }
14750957b409SSimon J. Gerraty
14760957b409SSimon J. Gerraty /*
14770957b409SSimon J. Gerraty * Convert the window points to affine coordinates. Point
14780957b409SSimon J. Gerraty * window[0] is the source point, already in affine coordinates.
14790957b409SSimon J. Gerraty */
14800957b409SSimon J. Gerraty window_to_affine(window.aff, window.jac, 15);
14810957b409SSimon J. Gerraty
14820957b409SSimon J. Gerraty /*
14830957b409SSimon J. Gerraty * Perform point multiplication.
14840957b409SSimon J. Gerraty */
14850957b409SSimon J. Gerraty point_mul_inner(P, window.aff, k, klen);
14860957b409SSimon J. Gerraty }
14870957b409SSimon J. Gerraty
14880957b409SSimon J. Gerraty /*
14890957b409SSimon J. Gerraty * Precomputed window for the conventional generator: P256_Gwin[n]
14900957b409SSimon J. Gerraty * contains (n+1)*G (affine coordinates, in Montgomery representation).
14910957b409SSimon J. Gerraty */
14920957b409SSimon J. Gerraty static const p256_affine P256_Gwin[] = {
14930957b409SSimon J. Gerraty {
14940957b409SSimon J. Gerraty { 0x30D418A9143C1, 0xC4FEDB60179E7, 0x62251075BA95F,
14950957b409SSimon J. Gerraty 0x5C669FB732B77, 0x08905F76B5375 },
14960957b409SSimon J. Gerraty { 0x5357CE95560A8, 0x43A19E45CDDF2, 0x21F3258B4AB8E,
14970957b409SSimon J. Gerraty 0xD8552E88688DD, 0x0571FF18A5885 }
14980957b409SSimon J. Gerraty },
14990957b409SSimon J. Gerraty {
15000957b409SSimon J. Gerraty { 0x46D410DDD64DF, 0x0B433827D8500, 0x1490D9AA6AE3C,
15010957b409SSimon J. Gerraty 0xA3A832205038D, 0x06BB32E52DCF3 },
15020957b409SSimon J. Gerraty { 0x48D361BEE1A57, 0xB7B236FF82F36, 0x042DBE152CD7C,
15030957b409SSimon J. Gerraty 0xA3AA9A8FB0E92, 0x08C577517A5B8 }
15040957b409SSimon J. Gerraty },
15050957b409SSimon J. Gerraty {
15060957b409SSimon J. Gerraty { 0x3F904EEBC1272, 0x9E87D81FBFFAC, 0xCBBC98B027F84,
15070957b409SSimon J. Gerraty 0x47E46AD77DD87, 0x06936A3FD6FF7 },
15080957b409SSimon J. Gerraty { 0x5C1FC983A7EBD, 0xC3861FE1AB04C, 0x2EE98E583E47A,
15090957b409SSimon J. Gerraty 0xC06A88208311A, 0x05F06A2AB587C }
15100957b409SSimon J. Gerraty },
15110957b409SSimon J. Gerraty {
15120957b409SSimon J. Gerraty { 0xB50D46918DCC5, 0xD7623C17374B0, 0x100AF24650A6E,
15130957b409SSimon J. Gerraty 0x76ABCDAACACE8, 0x077362F591B01 },
15140957b409SSimon J. Gerraty { 0xF24CE4CBABA68, 0x17AD6F4472D96, 0xDDD22E1762847,
15150957b409SSimon J. Gerraty 0x862EB6C36DEE5, 0x04B14C39CC5AB }
15160957b409SSimon J. Gerraty },
15170957b409SSimon J. Gerraty {
15180957b409SSimon J. Gerraty { 0x8AAEC45C61F5C, 0x9D4B9537DBE1B, 0x76C20C90EC649,
15190957b409SSimon J. Gerraty 0x3C7D41CB5AAD0, 0x0907960649052 },
15200957b409SSimon J. Gerraty { 0x9B4AE7BA4F107, 0xF75EB882BEB30, 0x7A1F6873C568E,
15210957b409SSimon J. Gerraty 0x915C540A9877E, 0x03A076BB9DD1E }
15220957b409SSimon J. Gerraty },
15230957b409SSimon J. Gerraty {
15240957b409SSimon J. Gerraty { 0x47373E77664A1, 0xF246CEE3E4039, 0x17A3AD55AE744,
15250957b409SSimon J. Gerraty 0x673C50A961A5B, 0x03074B5964213 },
15260957b409SSimon J. Gerraty { 0x6220D377E44BA, 0x30DFF14B593D3, 0x639F11299C2B5,
15270957b409SSimon J. Gerraty 0x75F5424D44CEF, 0x04C9916DEA07F }
15280957b409SSimon J. Gerraty },
15290957b409SSimon J. Gerraty {
15300957b409SSimon J. Gerraty { 0x354EA0173B4F1, 0x3C23C00F70746, 0x23BB082BD2021,
15310957b409SSimon J. Gerraty 0xE03E43EAAB50C, 0x03BA5119D3123 },
15320957b409SSimon J. Gerraty { 0xD0303F5B9D4DE, 0x17DA67BDD2847, 0xC941956742F2F,
15330957b409SSimon J. Gerraty 0x8670F933BDC77, 0x0AEDD9164E240 }
15340957b409SSimon J. Gerraty },
15350957b409SSimon J. Gerraty {
15360957b409SSimon J. Gerraty { 0x4CD19499A78FB, 0x4BF9B345527F1, 0x2CFC6B462AB5C,
15370957b409SSimon J. Gerraty 0x30CDF90F02AF0, 0x0763891F62652 },
15380957b409SSimon J. Gerraty { 0xA3A9532D49775, 0xD7F9EBA15F59D, 0x60BBF021E3327,
15390957b409SSimon J. Gerraty 0xF75C23C7B84BE, 0x06EC12F2C706D }
15400957b409SSimon J. Gerraty },
15410957b409SSimon J. Gerraty {
15420957b409SSimon J. Gerraty { 0x6E8F264E20E8E, 0xC79A7A84175C9, 0xC8EB00ABE6BFE,
15430957b409SSimon J. Gerraty 0x16A4CC09C0444, 0x005B3081D0C4E },
15440957b409SSimon J. Gerraty { 0x777AA45F33140, 0xDCE5D45E31EB7, 0xB12F1A56AF7BE,
15450957b409SSimon J. Gerraty 0xF9B2B6E019A88, 0x086659CDFD835 }
15460957b409SSimon J. Gerraty },
15470957b409SSimon J. Gerraty {
15480957b409SSimon J. Gerraty { 0xDBD19DC21EC8C, 0x94FCF81392C18, 0x250B4998F9868,
15490957b409SSimon J. Gerraty 0x28EB37D2CD648, 0x0C61C947E4B34 },
15500957b409SSimon J. Gerraty { 0x407880DD9E767, 0x0C83FBE080C2B, 0x9BE5D2C43A899,
15510957b409SSimon J. Gerraty 0xAB4EF7D2D6577, 0x08719A555B3B4 }
15520957b409SSimon J. Gerraty },
15530957b409SSimon J. Gerraty {
15540957b409SSimon J. Gerraty { 0x260A6245E4043, 0x53E7FDFE0EA7D, 0xAC1AB59DE4079,
15550957b409SSimon J. Gerraty 0x072EFF3A4158D, 0x0E7090F1949C9 },
15560957b409SSimon J. Gerraty { 0x85612B944E886, 0xE857F61C81A76, 0xAD643D250F939,
15570957b409SSimon J. Gerraty 0x88DAC0DAA891E, 0x089300244125B }
15580957b409SSimon J. Gerraty },
15590957b409SSimon J. Gerraty {
15600957b409SSimon J. Gerraty { 0x1AA7D26977684, 0x58A345A3304B7, 0x37385EABDEDEF,
15610957b409SSimon J. Gerraty 0x155E409D29DEE, 0x0EE1DF780B83E },
15620957b409SSimon J. Gerraty { 0x12D91CBB5B437, 0x65A8956370CAC, 0xDE6D66170ED2F,
15630957b409SSimon J. Gerraty 0xAC9B8228CFA8A, 0x0FF57C95C3238 }
15640957b409SSimon J. Gerraty },
15650957b409SSimon J. Gerraty {
15660957b409SSimon J. Gerraty { 0x25634B2ED7097, 0x9156FD30DCCC4, 0x9E98110E35676,
15670957b409SSimon J. Gerraty 0x7594CBCD43F55, 0x038477ACC395B },
15680957b409SSimon J. Gerraty { 0x2B90C00EE17FF, 0xF842ED2E33575, 0x1F5BC16874838,
15690957b409SSimon J. Gerraty 0x7968CD06422BD, 0x0BC0876AB9E7B }
15700957b409SSimon J. Gerraty },
15710957b409SSimon J. Gerraty {
15720957b409SSimon J. Gerraty { 0xA35BB0CF664AF, 0x68F9707E3A242, 0x832660126E48F,
15730957b409SSimon J. Gerraty 0x72D2717BF54C6, 0x0AAE7333ED12C },
15740957b409SSimon J. Gerraty { 0x2DB7995D586B1, 0xE732237C227B5, 0x65E7DBBE29569,
15750957b409SSimon J. Gerraty 0xBBBD8E4193E2A, 0x052706DC3EAA1 }
15760957b409SSimon J. Gerraty },
15770957b409SSimon J. Gerraty {
15780957b409SSimon J. Gerraty { 0xD8B7BC60055BE, 0xD76E27E4B72BC, 0x81937003CC23E,
15790957b409SSimon J. Gerraty 0xA090E337424E4, 0x02AA0E43EAD3D },
15800957b409SSimon J. Gerraty { 0x524F6383C45D2, 0x422A41B2540B8, 0x8A4797D766355,
15810957b409SSimon J. Gerraty 0xDF444EFA6DE77, 0x0042170A9079A }
15820957b409SSimon J. Gerraty },
15830957b409SSimon J. Gerraty };
15840957b409SSimon J. Gerraty
15850957b409SSimon J. Gerraty /*
15860957b409SSimon J. Gerraty * Multiply the conventional generator of the curve by the provided
15870957b409SSimon J. Gerraty * integer. The result is written in *P.
15880957b409SSimon J. Gerraty *
15890957b409SSimon J. Gerraty * Assumptions:
15900957b409SSimon J. Gerraty * - Integer is not 0, and is lower than the curve order.
15910957b409SSimon J. Gerraty * If this condition is not met, then the result is indeterminate
15920957b409SSimon J. Gerraty * (but the process is still constant-time).
15930957b409SSimon J. Gerraty */
15940957b409SSimon J. Gerraty static void
p256_mulgen(p256_jacobian * P,const unsigned char * k,size_t klen)15950957b409SSimon J. Gerraty p256_mulgen(p256_jacobian *P, const unsigned char *k, size_t klen)
15960957b409SSimon J. Gerraty {
15970957b409SSimon J. Gerraty point_mul_inner(P, P256_Gwin, k, klen);
15980957b409SSimon J. Gerraty }
15990957b409SSimon J. Gerraty
16000957b409SSimon J. Gerraty /*
16010957b409SSimon J. Gerraty * Return 1 if all of the following hold:
16020957b409SSimon J. Gerraty * - klen <= 32
16030957b409SSimon J. Gerraty * - k != 0
16040957b409SSimon J. Gerraty * - k is lower than the curve order
16050957b409SSimon J. Gerraty * Otherwise, return 0.
16060957b409SSimon J. Gerraty *
16070957b409SSimon J. Gerraty * Constant-time behaviour: only klen may be observable.
16080957b409SSimon J. Gerraty */
16090957b409SSimon J. Gerraty static uint32_t
check_scalar(const unsigned char * k,size_t klen)16100957b409SSimon J. Gerraty check_scalar(const unsigned char *k, size_t klen)
16110957b409SSimon J. Gerraty {
16120957b409SSimon J. Gerraty uint32_t z;
16130957b409SSimon J. Gerraty int32_t c;
16140957b409SSimon J. Gerraty size_t u;
16150957b409SSimon J. Gerraty
16160957b409SSimon J. Gerraty if (klen > 32) {
16170957b409SSimon J. Gerraty return 0;
16180957b409SSimon J. Gerraty }
16190957b409SSimon J. Gerraty z = 0;
16200957b409SSimon J. Gerraty for (u = 0; u < klen; u ++) {
16210957b409SSimon J. Gerraty z |= k[u];
16220957b409SSimon J. Gerraty }
16230957b409SSimon J. Gerraty if (klen == 32) {
16240957b409SSimon J. Gerraty c = 0;
16250957b409SSimon J. Gerraty for (u = 0; u < klen; u ++) {
16260957b409SSimon J. Gerraty c |= -(int32_t)EQ0(c) & CMP(k[u], P256_N[u]);
16270957b409SSimon J. Gerraty }
16280957b409SSimon J. Gerraty } else {
16290957b409SSimon J. Gerraty c = -1;
16300957b409SSimon J. Gerraty }
16310957b409SSimon J. Gerraty return NEQ(z, 0) & LT0(c);
16320957b409SSimon J. Gerraty }
16330957b409SSimon J. Gerraty
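/*
 * Illustrative sketch (not used; the helper name is arbitrary): the
 * constant-time lexicographic comparison idiom used by check_scalar()
 * above, written as a generic helper. The operands are big-endian byte
 * strings of the same length; the result is -1, 0 or 1, and only the
 * first differing byte contributes to it.
 */
#if 0
static int32_t
ct_compare_be_sketch(const unsigned char *a, const unsigned char *b,
	size_t len)
{
	int32_t c;
	size_t u;

	c = 0;
	for (u = 0; u < len; u ++) {
		/*
		 * Once c is non-zero, EQ0(c) is 0 and later bytes are
		 * ignored; otherwise the current byte comparison is kept.
		 */
		c |= -(int32_t)EQ0(c) & CMP(a[u], b[u]);
	}
	return c;
}
#endif
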
16340957b409SSimon J. Gerraty static uint32_t
api_mul(unsigned char * G,size_t Glen,const unsigned char * k,size_t klen,int curve)16350957b409SSimon J. Gerraty api_mul(unsigned char *G, size_t Glen,
16360957b409SSimon J. Gerraty const unsigned char *k, size_t klen, int curve)
16370957b409SSimon J. Gerraty {
16380957b409SSimon J. Gerraty uint32_t r;
16390957b409SSimon J. Gerraty p256_jacobian P;
16400957b409SSimon J. Gerraty
16410957b409SSimon J. Gerraty (void)curve;
16420957b409SSimon J. Gerraty if (Glen != 65) {
16430957b409SSimon J. Gerraty return 0;
16440957b409SSimon J. Gerraty }
16450957b409SSimon J. Gerraty r = check_scalar(k, klen);
16460957b409SSimon J. Gerraty r &= point_decode(&P, G);
16470957b409SSimon J. Gerraty p256_mul(&P, k, klen);
16480957b409SSimon J. Gerraty r &= point_encode(G, &P);
16490957b409SSimon J. Gerraty return r;
16500957b409SSimon J. Gerraty }
16510957b409SSimon J. Gerraty
16520957b409SSimon J. Gerraty static size_t
api_mulgen(unsigned char * R,const unsigned char * k,size_t klen,int curve)16530957b409SSimon J. Gerraty api_mulgen(unsigned char *R,
16540957b409SSimon J. Gerraty const unsigned char *k, size_t klen, int curve)
16550957b409SSimon J. Gerraty {
16560957b409SSimon J. Gerraty p256_jacobian P;
16570957b409SSimon J. Gerraty
16580957b409SSimon J. Gerraty (void)curve;
16590957b409SSimon J. Gerraty p256_mulgen(&P, k, klen);
16600957b409SSimon J. Gerraty point_encode(R, &P);
16610957b409SSimon J. Gerraty return 65;
16620957b409SSimon J. Gerraty }
16630957b409SSimon J. Gerraty
16640957b409SSimon J. Gerraty static uint32_t
api_muladd(unsigned char * A,const unsigned char * B,size_t len,const unsigned char * x,size_t xlen,const unsigned char * y,size_t ylen,int curve)16650957b409SSimon J. Gerraty api_muladd(unsigned char *A, const unsigned char *B, size_t len,
16660957b409SSimon J. Gerraty const unsigned char *x, size_t xlen,
16670957b409SSimon J. Gerraty const unsigned char *y, size_t ylen, int curve)
16680957b409SSimon J. Gerraty {
16690957b409SSimon J. Gerraty /*
16700957b409SSimon J. Gerraty * We might want to use Shamir's trick here: make a composite
16710957b409SSimon J. Gerraty * window of u*P+v*Q points, to merge the two doubling-ladders
16720957b409SSimon J. Gerraty * into one. This, however, has some complications:
16730957b409SSimon J. Gerraty *
16740957b409SSimon J. Gerraty * - During the computation, we may hit the point-at-infinity.
16750957b409SSimon J. Gerraty * Thus, we would need p256_add_complete_mixed() (complete
16760957b409SSimon J. Gerraty * formulas for point addition), with a higher cost (17 muls
16770957b409SSimon J. Gerraty * instead of 11).
16780957b409SSimon J. Gerraty *
16790957b409SSimon J. Gerraty * - A 4-bit window would be too large, since it would involve
16800957b409SSimon J. Gerraty * 16*16-1 = 255 points. For the same window size as in the
16810957b409SSimon J. Gerraty * p256_mul() case, we would need to reduce the window size
16820957b409SSimon J. Gerraty * to 2 bits, and thus perform twice as many non-doubling
16830957b409SSimon J. Gerraty * point additions.
16840957b409SSimon J. Gerraty *
16850957b409SSimon J. Gerraty * - The window may itself contain the point-at-infinity, and
16860957b409SSimon J. Gerraty * thus cannot, in all generality, be made of affine points.
16870957b409SSimon J. Gerraty * Instead, we would need to make it a window of points in
16880957b409SSimon J. Gerraty * Jacobian coordinates. Even p256_add_complete_mixed() would
16890957b409SSimon J. Gerraty * be inappropriate.
16900957b409SSimon J. Gerraty *
16910957b409SSimon J. Gerraty * For these reasons, the code below performs two separate
16920957b409SSimon J. Gerraty * point multiplications, then computes the final point addition
16930957b409SSimon J. Gerraty * (which is both a "normal" addition, and a doubling, to handle
16940957b409SSimon J. Gerraty * all cases).
16950957b409SSimon J. Gerraty */
16960957b409SSimon J. Gerraty
16970957b409SSimon J. Gerraty p256_jacobian P, Q;
16980957b409SSimon J. Gerraty uint32_t r, t, s;
16990957b409SSimon J. Gerraty uint64_t z;
17000957b409SSimon J. Gerraty
17010957b409SSimon J. Gerraty (void)curve;
17020957b409SSimon J. Gerraty if (len != 65) {
17030957b409SSimon J. Gerraty return 0;
17040957b409SSimon J. Gerraty }
17050957b409SSimon J. Gerraty r = point_decode(&P, A);
17060957b409SSimon J. Gerraty p256_mul(&P, x, xlen);
17070957b409SSimon J. Gerraty if (B == NULL) {
17080957b409SSimon J. Gerraty p256_mulgen(&Q, y, ylen);
17090957b409SSimon J. Gerraty } else {
17100957b409SSimon J. Gerraty r &= point_decode(&Q, B);
17110957b409SSimon J. Gerraty p256_mul(&Q, y, ylen);
17120957b409SSimon J. Gerraty }
17130957b409SSimon J. Gerraty
17140957b409SSimon J. Gerraty /*
17150957b409SSimon J. Gerraty * The final addition may fail in case both points are equal.
17160957b409SSimon J. Gerraty */
17170957b409SSimon J. Gerraty t = p256_add(&P, &Q);
17180957b409SSimon J. Gerraty f256_final_reduce(P.z);
17190957b409SSimon J. Gerraty z = P.z[0] | P.z[1] | P.z[2] | P.z[3] | P.z[4];
17200957b409SSimon J. Gerraty s = EQ((uint32_t)(z | (z >> 32)), 0);
17210957b409SSimon J. Gerraty p256_double(&Q);
17220957b409SSimon J. Gerraty
17230957b409SSimon J. Gerraty /*
17240957b409SSimon J. Gerraty * If s is 1 then either P+Q = 0 (t = 1) or P = Q (t = 0). So we
17250957b409SSimon J. Gerraty * have the following:
17260957b409SSimon J. Gerraty *
17270957b409SSimon J. Gerraty * s = 0, t = 0 return P (normal addition)
17280957b409SSimon J. Gerraty * s = 0, t = 1 return P (normal addition)
17290957b409SSimon J. Gerraty * s = 1, t = 0 return Q (a 'double' case)
17300957b409SSimon J. Gerraty * s = 1, t = 1 report an error (P+Q = 0)
17310957b409SSimon J. Gerraty */
17320957b409SSimon J. Gerraty CCOPY(s & ~t, &P, &Q, sizeof Q);
17330957b409SSimon J. Gerraty point_encode(A, &P);
17340957b409SSimon J. Gerraty r &= ~(s & t);
17350957b409SSimon J. Gerraty return r;
17360957b409SSimon J. Gerraty }
17370957b409SSimon J. Gerraty
17380957b409SSimon J. Gerraty /* see bearssl_ec.h */
17390957b409SSimon J. Gerraty const br_ec_impl br_ec_p256_m62 = {
17400957b409SSimon J. Gerraty (uint32_t)0x00800000, /* support mask: bit 23 = BR_EC_secp256r1 */
17410957b409SSimon J. Gerraty &api_generator,
17420957b409SSimon J. Gerraty &api_order,
17430957b409SSimon J. Gerraty &api_xoff,
17440957b409SSimon J. Gerraty &api_mul,
17450957b409SSimon J. Gerraty &api_mulgen,
17460957b409SSimon J. Gerraty &api_muladd
17470957b409SSimon J. Gerraty };
17480957b409SSimon J. Gerraty
17490957b409SSimon J. Gerraty /* see bearssl_ec.h */
17500957b409SSimon J. Gerraty const br_ec_impl *
br_ec_p256_m62_get(void)17510957b409SSimon J. Gerraty br_ec_p256_m62_get(void)
17520957b409SSimon J. Gerraty {
17530957b409SSimon J. Gerraty return &br_ec_p256_m62;
17540957b409SSimon J. Gerraty }
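
/*
 * Usage sketch (not part of this implementation; the helper name and
 * buffers are hypothetical): an ECDSA-verification style combination
 * u1*G + u2*Q computed through the br_ec_impl interface. The muladd()
 * contract relied upon here (result written over the first point, NULL
 * second point meaning the conventional generator) is the one
 * implemented by api_muladd() above.
 */
#if 0
static uint32_t
combine_sketch(unsigned char Q[65],
	const unsigned char *u1, size_t u1_len,
	const unsigned char *u2, size_t u2_len)
{
	const br_ec_impl *impl;

	impl = br_ec_p256_m62_get();
	if (impl == NULL) {
		/* No 64-bit multiplier available on this platform. */
		return 0;
	}
	/* Compute Q <- u2*Q + u1*G (B == NULL selects the generator). */
	return impl->muladd(Q, NULL, 65, u2, u2_len, u1, u1_len,
		BR_EC_secp256r1);
}
#endif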
17550957b409SSimon J. Gerraty
17560957b409SSimon J. Gerraty #else
17570957b409SSimon J. Gerraty
17580957b409SSimon J. Gerraty /* see bearssl_ec.h */
17590957b409SSimon J. Gerraty const br_ec_impl *
br_ec_p256_m62_get(void)17600957b409SSimon J. Gerraty br_ec_p256_m62_get(void)
17610957b409SSimon J. Gerraty {
17620957b409SSimon J. Gerraty return 0;
17630957b409SSimon J. Gerraty }
17640957b409SSimon J. Gerraty
17650957b409SSimon J. Gerraty #endif
1766