xref: /freebsd/contrib/bearssl/src/ec/ec_p256_m64.c (revision cc9e6590773dba57440750c124173ed531349a06)
10957b409SSimon J. Gerraty /*
20957b409SSimon J. Gerraty  * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org>
30957b409SSimon J. Gerraty  *
40957b409SSimon J. Gerraty  * Permission is hereby granted, free of charge, to any person obtaining
50957b409SSimon J. Gerraty  * a copy of this software and associated documentation files (the
60957b409SSimon J. Gerraty  * "Software"), to deal in the Software without restriction, including
70957b409SSimon J. Gerraty  * without limitation the rights to use, copy, modify, merge, publish,
80957b409SSimon J. Gerraty  * distribute, sublicense, and/or sell copies of the Software, and to
90957b409SSimon J. Gerraty  * permit persons to whom the Software is furnished to do so, subject to
100957b409SSimon J. Gerraty  * the following conditions:
110957b409SSimon J. Gerraty  *
120957b409SSimon J. Gerraty  * The above copyright notice and this permission notice shall be
130957b409SSimon J. Gerraty  * included in all copies or substantial portions of the Software.
140957b409SSimon J. Gerraty  *
150957b409SSimon J. Gerraty  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
160957b409SSimon J. Gerraty  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
170957b409SSimon J. Gerraty  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
180957b409SSimon J. Gerraty  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
190957b409SSimon J. Gerraty  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
200957b409SSimon J. Gerraty  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
210957b409SSimon J. Gerraty  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
220957b409SSimon J. Gerraty  * SOFTWARE.
230957b409SSimon J. Gerraty  */
240957b409SSimon J. Gerraty 
250957b409SSimon J. Gerraty #include "inner.h"
260957b409SSimon J. Gerraty 
270957b409SSimon J. Gerraty #if BR_INT128 || BR_UMUL128
280957b409SSimon J. Gerraty 
290957b409SSimon J. Gerraty #if BR_UMUL128
300957b409SSimon J. Gerraty #include <intrin.h>
310957b409SSimon J. Gerraty #endif
320957b409SSimon J. Gerraty 
330957b409SSimon J. Gerraty static const unsigned char P256_G[] = {
340957b409SSimon J. Gerraty 	0x04, 0x6B, 0x17, 0xD1, 0xF2, 0xE1, 0x2C, 0x42, 0x47, 0xF8,
350957b409SSimon J. Gerraty 	0xBC, 0xE6, 0xE5, 0x63, 0xA4, 0x40, 0xF2, 0x77, 0x03, 0x7D,
360957b409SSimon J. Gerraty 	0x81, 0x2D, 0xEB, 0x33, 0xA0, 0xF4, 0xA1, 0x39, 0x45, 0xD8,
370957b409SSimon J. Gerraty 	0x98, 0xC2, 0x96, 0x4F, 0xE3, 0x42, 0xE2, 0xFE, 0x1A, 0x7F,
380957b409SSimon J. Gerraty 	0x9B, 0x8E, 0xE7, 0xEB, 0x4A, 0x7C, 0x0F, 0x9E, 0x16, 0x2B,
390957b409SSimon J. Gerraty 	0xCE, 0x33, 0x57, 0x6B, 0x31, 0x5E, 0xCE, 0xCB, 0xB6, 0x40,
400957b409SSimon J. Gerraty 	0x68, 0x37, 0xBF, 0x51, 0xF5
410957b409SSimon J. Gerraty };
420957b409SSimon J. Gerraty 
430957b409SSimon J. Gerraty static const unsigned char P256_N[] = {
440957b409SSimon J. Gerraty 	0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF,
450957b409SSimon J. Gerraty 	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xBC, 0xE6, 0xFA, 0xAD,
460957b409SSimon J. Gerraty 	0xA7, 0x17, 0x9E, 0x84, 0xF3, 0xB9, 0xCA, 0xC2, 0xFC, 0x63,
470957b409SSimon J. Gerraty 	0x25, 0x51
480957b409SSimon J. Gerraty };
490957b409SSimon J. Gerraty 
500957b409SSimon J. Gerraty static const unsigned char *
api_generator(int curve,size_t * len)510957b409SSimon J. Gerraty api_generator(int curve, size_t *len)
520957b409SSimon J. Gerraty {
530957b409SSimon J. Gerraty 	(void)curve;
540957b409SSimon J. Gerraty 	*len = sizeof P256_G;
550957b409SSimon J. Gerraty 	return P256_G;
560957b409SSimon J. Gerraty }
570957b409SSimon J. Gerraty 
580957b409SSimon J. Gerraty static const unsigned char *
api_order(int curve,size_t * len)590957b409SSimon J. Gerraty api_order(int curve, size_t *len)
600957b409SSimon J. Gerraty {
610957b409SSimon J. Gerraty 	(void)curve;
620957b409SSimon J. Gerraty 	*len = sizeof P256_N;
630957b409SSimon J. Gerraty 	return P256_N;
640957b409SSimon J. Gerraty }
650957b409SSimon J. Gerraty 
660957b409SSimon J. Gerraty static size_t
api_xoff(int curve,size_t * len)670957b409SSimon J. Gerraty api_xoff(int curve, size_t *len)
680957b409SSimon J. Gerraty {
690957b409SSimon J. Gerraty 	(void)curve;
700957b409SSimon J. Gerraty 	*len = 32;
710957b409SSimon J. Gerraty 	return 1;
720957b409SSimon J. Gerraty }
730957b409SSimon J. Gerraty 
740957b409SSimon J. Gerraty /*
750957b409SSimon J. Gerraty  * A field element is encoded as four 64-bit integers, in basis 2^64.
760957b409SSimon J. Gerraty  * Values may reach up to 2^256-1. Montgomery multiplication is used.
770957b409SSimon J. Gerraty  */
780957b409SSimon J. Gerraty 
790957b409SSimon J. Gerraty /* R = 2^256 mod p */
800957b409SSimon J. Gerraty static const uint64_t F256_R[] = {
810957b409SSimon J. Gerraty 	0x0000000000000001, 0xFFFFFFFF00000000,
820957b409SSimon J. Gerraty 	0xFFFFFFFFFFFFFFFF, 0x00000000FFFFFFFE
830957b409SSimon J. Gerraty };
840957b409SSimon J. Gerraty 
850957b409SSimon J. Gerraty /* Curve equation is y^2 = x^3 - 3*x + B. This constant is B*R mod p
860957b409SSimon J. Gerraty    (Montgomery representation of B). */
870957b409SSimon J. Gerraty static const uint64_t P256_B_MONTY[] = {
880957b409SSimon J. Gerraty 	0xD89CDF6229C4BDDF, 0xACF005CD78843090,
890957b409SSimon J. Gerraty 	0xE5A220ABF7212ED6, 0xDC30061D04874834
900957b409SSimon J. Gerraty };
910957b409SSimon J. Gerraty 
920957b409SSimon J. Gerraty /*
930957b409SSimon J. Gerraty  * Addition in the field.
940957b409SSimon J. Gerraty  */
950957b409SSimon J. Gerraty static inline void
f256_add(uint64_t * d,const uint64_t * a,const uint64_t * b)960957b409SSimon J. Gerraty f256_add(uint64_t *d, const uint64_t *a, const uint64_t *b)
970957b409SSimon J. Gerraty {
980957b409SSimon J. Gerraty #if BR_INT128
990957b409SSimon J. Gerraty 	unsigned __int128 w;
1000957b409SSimon J. Gerraty 	uint64_t t;
1010957b409SSimon J. Gerraty 
102*cc9e6590SSimon J. Gerraty 	/*
103*cc9e6590SSimon J. Gerraty 	 * Do the addition, with an extra carry in t.
104*cc9e6590SSimon J. Gerraty 	 */
1050957b409SSimon J. Gerraty 	w = (unsigned __int128)a[0] + b[0];
1060957b409SSimon J. Gerraty 	d[0] = (uint64_t)w;
1070957b409SSimon J. Gerraty 	w = (unsigned __int128)a[1] + b[1] + (w >> 64);
1080957b409SSimon J. Gerraty 	d[1] = (uint64_t)w;
1090957b409SSimon J. Gerraty 	w = (unsigned __int128)a[2] + b[2] + (w >> 64);
1100957b409SSimon J. Gerraty 	d[2] = (uint64_t)w;
1110957b409SSimon J. Gerraty 	w = (unsigned __int128)a[3] + b[3] + (w >> 64);
1120957b409SSimon J. Gerraty 	d[3] = (uint64_t)w;
1130957b409SSimon J. Gerraty 	t = (uint64_t)(w >> 64);
1140957b409SSimon J. Gerraty 
1150957b409SSimon J. Gerraty 	/*
116*cc9e6590SSimon J. Gerraty 	 * Fold carry t, using: 2^256 = 2^224 - 2^192 - 2^96 + 1 mod p.
1170957b409SSimon J. Gerraty 	 */
1180957b409SSimon J. Gerraty 	w = (unsigned __int128)d[0] + t;
1190957b409SSimon J. Gerraty 	d[0] = (uint64_t)w;
1200957b409SSimon J. Gerraty 	w = (unsigned __int128)d[1] + (w >> 64) - (t << 32);
1210957b409SSimon J. Gerraty 	d[1] = (uint64_t)w;
1220957b409SSimon J. Gerraty 	/* Here, carry "w >> 64" can only be 0 or -1 */
1230957b409SSimon J. Gerraty 	w = (unsigned __int128)d[2] - ((w >> 64) & 1);
1240957b409SSimon J. Gerraty 	d[2] = (uint64_t)w;
125*cc9e6590SSimon J. Gerraty 	/* Again, carry is 0 or -1. But there can be carry only if t = 1,
126*cc9e6590SSimon J. Gerraty 	   in which case the addition of (t << 32) - t is positive. */
127*cc9e6590SSimon J. Gerraty 	w = (unsigned __int128)d[3] - ((w >> 64) & 1) + (t << 32) - t;
128*cc9e6590SSimon J. Gerraty 	d[3] = (uint64_t)w;
129*cc9e6590SSimon J. Gerraty 	t = (uint64_t)(w >> 64);
130*cc9e6590SSimon J. Gerraty 
131*cc9e6590SSimon J. Gerraty 	/*
132*cc9e6590SSimon J. Gerraty 	 * There can be an extra carry here, which we must fold again.
133*cc9e6590SSimon J. Gerraty 	 */
134*cc9e6590SSimon J. Gerraty 	w = (unsigned __int128)d[0] + t;
135*cc9e6590SSimon J. Gerraty 	d[0] = (uint64_t)w;
136*cc9e6590SSimon J. Gerraty 	w = (unsigned __int128)d[1] + (w >> 64) - (t << 32);
137*cc9e6590SSimon J. Gerraty 	d[1] = (uint64_t)w;
138*cc9e6590SSimon J. Gerraty 	w = (unsigned __int128)d[2] - ((w >> 64) & 1);
139*cc9e6590SSimon J. Gerraty 	d[2] = (uint64_t)w;
140*cc9e6590SSimon J. Gerraty 	d[3] += (t << 32) - t - (uint64_t)((w >> 64) & 1);
1410957b409SSimon J. Gerraty 
1420957b409SSimon J. Gerraty #elif BR_UMUL128
1430957b409SSimon J. Gerraty 
1440957b409SSimon J. Gerraty 	unsigned char cc;
1450957b409SSimon J. Gerraty 	uint64_t t;
1460957b409SSimon J. Gerraty 
1470957b409SSimon J. Gerraty 	cc = _addcarry_u64(0, a[0], b[0], &d[0]);
1480957b409SSimon J. Gerraty 	cc = _addcarry_u64(cc, a[1], b[1], &d[1]);
1490957b409SSimon J. Gerraty 	cc = _addcarry_u64(cc, a[2], b[2], &d[2]);
1500957b409SSimon J. Gerraty 	cc = _addcarry_u64(cc, a[3], b[3], &d[3]);
1510957b409SSimon J. Gerraty 
1520957b409SSimon J. Gerraty 	/*
1530957b409SSimon J. Gerraty 	 * If there is a carry, then we want to subtract p, which we
1540957b409SSimon J. Gerraty 	 * do by adding 2^256 - p.
1550957b409SSimon J. Gerraty 	 */
1560957b409SSimon J. Gerraty 	t = cc;
1570957b409SSimon J. Gerraty 	cc = _addcarry_u64(cc, d[0], 0, &d[0]);
1580957b409SSimon J. Gerraty 	cc = _addcarry_u64(cc, d[1], -(t << 32), &d[1]);
1590957b409SSimon J. Gerraty 	cc = _addcarry_u64(cc, d[2], -t, &d[2]);
160*cc9e6590SSimon J. Gerraty 	cc = _addcarry_u64(cc, d[3], (t << 32) - (t << 1), &d[3]);
161*cc9e6590SSimon J. Gerraty 
162*cc9e6590SSimon J. Gerraty 	/*
163*cc9e6590SSimon J. Gerraty 	 * We have to do it again if there still is a carry.
164*cc9e6590SSimon J. Gerraty 	 */
165*cc9e6590SSimon J. Gerraty 	t = cc;
166*cc9e6590SSimon J. Gerraty 	cc = _addcarry_u64(cc, d[0], 0, &d[0]);
167*cc9e6590SSimon J. Gerraty 	cc = _addcarry_u64(cc, d[1], -(t << 32), &d[1]);
168*cc9e6590SSimon J. Gerraty 	cc = _addcarry_u64(cc, d[2], -t, &d[2]);
1690957b409SSimon J. Gerraty 	(void)_addcarry_u64(cc, d[3], (t << 32) - (t << 1), &d[3]);
1700957b409SSimon J. Gerraty 
1710957b409SSimon J. Gerraty #endif
1720957b409SSimon J. Gerraty }
1730957b409SSimon J. Gerraty 
1740957b409SSimon J. Gerraty /*
1750957b409SSimon J. Gerraty  * Subtraction in the field.
1760957b409SSimon J. Gerraty  */
1770957b409SSimon J. Gerraty static inline void
f256_sub(uint64_t * d,const uint64_t * a,const uint64_t * b)1780957b409SSimon J. Gerraty f256_sub(uint64_t *d, const uint64_t *a, const uint64_t *b)
1790957b409SSimon J. Gerraty {
1800957b409SSimon J. Gerraty #if BR_INT128
1810957b409SSimon J. Gerraty 
1820957b409SSimon J. Gerraty 	unsigned __int128 w;
1830957b409SSimon J. Gerraty 	uint64_t t;
1840957b409SSimon J. Gerraty 
1850957b409SSimon J. Gerraty 	w = (unsigned __int128)a[0] - b[0];
1860957b409SSimon J. Gerraty 	d[0] = (uint64_t)w;
1870957b409SSimon J. Gerraty 	w = (unsigned __int128)a[1] - b[1] - ((w >> 64) & 1);
1880957b409SSimon J. Gerraty 	d[1] = (uint64_t)w;
1890957b409SSimon J. Gerraty 	w = (unsigned __int128)a[2] - b[2] - ((w >> 64) & 1);
1900957b409SSimon J. Gerraty 	d[2] = (uint64_t)w;
1910957b409SSimon J. Gerraty 	w = (unsigned __int128)a[3] - b[3] - ((w >> 64) & 1);
1920957b409SSimon J. Gerraty 	d[3] = (uint64_t)w;
1930957b409SSimon J. Gerraty 	t = (uint64_t)(w >> 64) & 1;
1940957b409SSimon J. Gerraty 
1950957b409SSimon J. Gerraty 	/*
196*cc9e6590SSimon J. Gerraty 	 * If there is a borrow (t = 1), then we must add the modulus
1970957b409SSimon J. Gerraty 	 * p = 2^256 - 2^224 + 2^192 + 2^96 - 1.
1980957b409SSimon J. Gerraty 	 */
1990957b409SSimon J. Gerraty 	w = (unsigned __int128)d[0] - t;
2000957b409SSimon J. Gerraty 	d[0] = (uint64_t)w;
2010957b409SSimon J. Gerraty 	w = (unsigned __int128)d[1] + (t << 32) - ((w >> 64) & 1);
2020957b409SSimon J. Gerraty 	d[1] = (uint64_t)w;
2030957b409SSimon J. Gerraty 	/* Here, carry "w >> 64" can only be 0 or +1 */
2040957b409SSimon J. Gerraty 	w = (unsigned __int128)d[2] + (w >> 64);
2050957b409SSimon J. Gerraty 	d[2] = (uint64_t)w;
2060957b409SSimon J. Gerraty 	/* Again, carry is 0 or +1 */
207*cc9e6590SSimon J. Gerraty 	w = (unsigned __int128)d[3] + (w >> 64) - (t << 32) + t;
208*cc9e6590SSimon J. Gerraty 	d[3] = (uint64_t)w;
209*cc9e6590SSimon J. Gerraty 	t = (uint64_t)(w >> 64) & 1;
210*cc9e6590SSimon J. Gerraty 
211*cc9e6590SSimon J. Gerraty 	/*
212*cc9e6590SSimon J. Gerraty 	 * There may be again a borrow, in which case we must add the
213*cc9e6590SSimon J. Gerraty 	 * modulus again.
214*cc9e6590SSimon J. Gerraty 	 */
215*cc9e6590SSimon J. Gerraty 	w = (unsigned __int128)d[0] - t;
216*cc9e6590SSimon J. Gerraty 	d[0] = (uint64_t)w;
217*cc9e6590SSimon J. Gerraty 	w = (unsigned __int128)d[1] + (t << 32) - ((w >> 64) & 1);
218*cc9e6590SSimon J. Gerraty 	d[1] = (uint64_t)w;
219*cc9e6590SSimon J. Gerraty 	w = (unsigned __int128)d[2] + (w >> 64);
220*cc9e6590SSimon J. Gerraty 	d[2] = (uint64_t)w;
2210957b409SSimon J. Gerraty 	d[3] += (uint64_t)(w >> 64) - (t << 32) + t;
2220957b409SSimon J. Gerraty 
2230957b409SSimon J. Gerraty #elif BR_UMUL128
2240957b409SSimon J. Gerraty 
2250957b409SSimon J. Gerraty 	unsigned char cc;
2260957b409SSimon J. Gerraty 	uint64_t t;
2270957b409SSimon J. Gerraty 
2280957b409SSimon J. Gerraty 	cc = _subborrow_u64(0, a[0], b[0], &d[0]);
2290957b409SSimon J. Gerraty 	cc = _subborrow_u64(cc, a[1], b[1], &d[1]);
2300957b409SSimon J. Gerraty 	cc = _subborrow_u64(cc, a[2], b[2], &d[2]);
2310957b409SSimon J. Gerraty 	cc = _subborrow_u64(cc, a[3], b[3], &d[3]);
2320957b409SSimon J. Gerraty 
2330957b409SSimon J. Gerraty 	/*
234*cc9e6590SSimon J. Gerraty 	 * If there is a borrow, then we need to add p. We (virtually)
235*cc9e6590SSimon J. Gerraty 	 * add 2^256, then subtract 2^256 - p.
2360957b409SSimon J. Gerraty 	 */
2370957b409SSimon J. Gerraty 	t = cc;
238*cc9e6590SSimon J. Gerraty 	cc = _subborrow_u64(0, d[0], t, &d[0]);
239*cc9e6590SSimon J. Gerraty 	cc = _subborrow_u64(cc, d[1], -(t << 32), &d[1]);
240*cc9e6590SSimon J. Gerraty 	cc = _subborrow_u64(cc, d[2], -t, &d[2]);
241*cc9e6590SSimon J. Gerraty 	cc = _subborrow_u64(cc, d[3], (t << 32) - (t << 1), &d[3]);
242*cc9e6590SSimon J. Gerraty 
243*cc9e6590SSimon J. Gerraty 	/*
244*cc9e6590SSimon J. Gerraty 	 * If there still is a borrow, then we need to add p again.
245*cc9e6590SSimon J. Gerraty 	 */
246*cc9e6590SSimon J. Gerraty 	t = cc;
247*cc9e6590SSimon J. Gerraty 	cc = _subborrow_u64(0, d[0], t, &d[0]);
248*cc9e6590SSimon J. Gerraty 	cc = _subborrow_u64(cc, d[1], -(t << 32), &d[1]);
249*cc9e6590SSimon J. Gerraty 	cc = _subborrow_u64(cc, d[2], -t, &d[2]);
250*cc9e6590SSimon J. Gerraty 	(void)_subborrow_u64(cc, d[3], (t << 32) - (t << 1), &d[3]);
2510957b409SSimon J. Gerraty 
2520957b409SSimon J. Gerraty #endif
2530957b409SSimon J. Gerraty }
2540957b409SSimon J. Gerraty 
2550957b409SSimon J. Gerraty /*
2560957b409SSimon J. Gerraty  * Montgomery multiplication in the field.
2570957b409SSimon J. Gerraty  */
2580957b409SSimon J. Gerraty static void
f256_montymul(uint64_t * d,const uint64_t * a,const uint64_t * b)2590957b409SSimon J. Gerraty f256_montymul(uint64_t *d, const uint64_t *a, const uint64_t *b)
2600957b409SSimon J. Gerraty {
2610957b409SSimon J. Gerraty #if BR_INT128
2620957b409SSimon J. Gerraty 
2630957b409SSimon J. Gerraty 	uint64_t x, f, t0, t1, t2, t3, t4;
2640957b409SSimon J. Gerraty 	unsigned __int128 z, ff;
2650957b409SSimon J. Gerraty 	int i;
2660957b409SSimon J. Gerraty 
2670957b409SSimon J. Gerraty 	/*
2680957b409SSimon J. Gerraty 	 * When computing d <- d + a[u]*b, we also add f*p such
2690957b409SSimon J. Gerraty 	 * that d + a[u]*b + f*p is a multiple of 2^64. Since
2700957b409SSimon J. Gerraty 	 * p = -1 mod 2^64, we can compute f = d[0] + a[u]*b[0] mod 2^64.
2710957b409SSimon J. Gerraty 	 */
2720957b409SSimon J. Gerraty 
2730957b409SSimon J. Gerraty 	/*
2740957b409SSimon J. Gerraty 	 * Step 1: t <- (a[0]*b + f*p) / 2^64
2750957b409SSimon J. Gerraty 	 * We have f = a[0]*b[0] mod 2^64. Since p = -1 mod 2^64, this
2760957b409SSimon J. Gerraty 	 * ensures that (a[0]*b + f*p) is a multiple of 2^64.
2770957b409SSimon J. Gerraty 	 *
2780957b409SSimon J. Gerraty 	 * We also have: f*p = f*2^256 - f*2^224 + f*2^192 + f*2^96 - f.
2790957b409SSimon J. Gerraty 	 */
2800957b409SSimon J. Gerraty 	x = a[0];
2810957b409SSimon J. Gerraty 	z = (unsigned __int128)b[0] * x;
2820957b409SSimon J. Gerraty 	f = (uint64_t)z;
2830957b409SSimon J. Gerraty 	z = (unsigned __int128)b[1] * x + (z >> 64) + (uint64_t)(f << 32);
2840957b409SSimon J. Gerraty 	t0 = (uint64_t)z;
2850957b409SSimon J. Gerraty 	z = (unsigned __int128)b[2] * x + (z >> 64) + (uint64_t)(f >> 32);
2860957b409SSimon J. Gerraty 	t1 = (uint64_t)z;
2870957b409SSimon J. Gerraty 	z = (unsigned __int128)b[3] * x + (z >> 64) + f;
2880957b409SSimon J. Gerraty 	t2 = (uint64_t)z;
2890957b409SSimon J. Gerraty 	t3 = (uint64_t)(z >> 64);
2900957b409SSimon J. Gerraty 	ff = ((unsigned __int128)f << 64) - ((unsigned __int128)f << 32);
2910957b409SSimon J. Gerraty 	z = (unsigned __int128)t2 + (uint64_t)ff;
2920957b409SSimon J. Gerraty 	t2 = (uint64_t)z;
2930957b409SSimon J. Gerraty 	z = (unsigned __int128)t3 + (z >> 64) + (ff >> 64);
2940957b409SSimon J. Gerraty 	t3 = (uint64_t)z;
2950957b409SSimon J. Gerraty 	t4 = (uint64_t)(z >> 64);
2960957b409SSimon J. Gerraty 
2970957b409SSimon J. Gerraty 	/*
2980957b409SSimon J. Gerraty 	 * Steps 2 to 4: t <- (t + a[i]*b + f*p) / 2^64
2990957b409SSimon J. Gerraty 	 */
3000957b409SSimon J. Gerraty 	for (i = 1; i < 4; i ++) {
3010957b409SSimon J. Gerraty 		x = a[i];
3020957b409SSimon J. Gerraty 
3030957b409SSimon J. Gerraty 		/* t <- (t + x*b - f) / 2^64 */
3040957b409SSimon J. Gerraty 		z = (unsigned __int128)b[0] * x + t0;
3050957b409SSimon J. Gerraty 		f = (uint64_t)z;
3060957b409SSimon J. Gerraty 		z = (unsigned __int128)b[1] * x + t1 + (z >> 64);
3070957b409SSimon J. Gerraty 		t0 = (uint64_t)z;
3080957b409SSimon J. Gerraty 		z = (unsigned __int128)b[2] * x + t2 + (z >> 64);
3090957b409SSimon J. Gerraty 		t1 = (uint64_t)z;
3100957b409SSimon J. Gerraty 		z = (unsigned __int128)b[3] * x + t3 + (z >> 64);
3110957b409SSimon J. Gerraty 		t2 = (uint64_t)z;
3120957b409SSimon J. Gerraty 		z = t4 + (z >> 64);
3130957b409SSimon J. Gerraty 		t3 = (uint64_t)z;
3140957b409SSimon J. Gerraty 		t4 = (uint64_t)(z >> 64);
3150957b409SSimon J. Gerraty 
3160957b409SSimon J. Gerraty 		/* t <- t + f*2^32, carry in the upper half of z */
3170957b409SSimon J. Gerraty 		z = (unsigned __int128)t0 + (uint64_t)(f << 32);
3180957b409SSimon J. Gerraty 		t0 = (uint64_t)z;
3190957b409SSimon J. Gerraty 		z = (z >> 64) + (unsigned __int128)t1 + (uint64_t)(f >> 32);
3200957b409SSimon J. Gerraty 		t1 = (uint64_t)z;
3210957b409SSimon J. Gerraty 
3220957b409SSimon J. Gerraty 		/* t <- t + f*2^192 - f*2^160 + f*2^128 */
3230957b409SSimon J. Gerraty 		ff = ((unsigned __int128)f << 64)
3240957b409SSimon J. Gerraty 			- ((unsigned __int128)f << 32) + f;
3250957b409SSimon J. Gerraty 		z = (z >> 64) + (unsigned __int128)t2 + (uint64_t)ff;
3260957b409SSimon J. Gerraty 		t2 = (uint64_t)z;
3270957b409SSimon J. Gerraty 		z = (unsigned __int128)t3 + (z >> 64) + (ff >> 64);
3280957b409SSimon J. Gerraty 		t3 = (uint64_t)z;
3290957b409SSimon J. Gerraty 		t4 += (uint64_t)(z >> 64);
3300957b409SSimon J. Gerraty 	}
3310957b409SSimon J. Gerraty 
3320957b409SSimon J. Gerraty 	/*
3330957b409SSimon J. Gerraty 	 * At that point, we have computed t = (a*b + F*p) / 2^256, where
3340957b409SSimon J. Gerraty 	 * F is a 256-bit integer whose limbs are the "f" coefficients
3350957b409SSimon J. Gerraty 	 * in the steps above. We have:
3360957b409SSimon J. Gerraty 	 *   a <= 2^256-1
3370957b409SSimon J. Gerraty 	 *   b <= 2^256-1
3380957b409SSimon J. Gerraty 	 *   F <= 2^256-1
3390957b409SSimon J. Gerraty 	 * Hence:
3400957b409SSimon J. Gerraty 	 *   a*b + F*p <= (2^256-1)*(2^256-1) + p*(2^256-1)
3410957b409SSimon J. Gerraty 	 *   a*b + F*p <= 2^256*(2^256 - 2 + p) + 1 - p
3420957b409SSimon J. Gerraty 	 * Therefore:
3430957b409SSimon J. Gerraty 	 *   t < 2^256 + p - 2
3440957b409SSimon J. Gerraty 	 * Since p < 2^256, it follows that:
3450957b409SSimon J. Gerraty 	 *   t4 can be only 0 or 1
3460957b409SSimon J. Gerraty 	 *   t - p < 2^256
3470957b409SSimon J. Gerraty 	 * We can therefore subtract p from t, conditionally on t4, to
3480957b409SSimon J. Gerraty 	 * get a nonnegative result that fits on 256 bits.
3490957b409SSimon J. Gerraty 	 */
3500957b409SSimon J. Gerraty 	z = (unsigned __int128)t0 + t4;
3510957b409SSimon J. Gerraty 	t0 = (uint64_t)z;
3520957b409SSimon J. Gerraty 	z = (unsigned __int128)t1 - (t4 << 32) + (z >> 64);
3530957b409SSimon J. Gerraty 	t1 = (uint64_t)z;
3540957b409SSimon J. Gerraty 	z = (unsigned __int128)t2 - (z >> 127);
3550957b409SSimon J. Gerraty 	t2 = (uint64_t)z;
3560957b409SSimon J. Gerraty 	t3 = t3 - (uint64_t)(z >> 127) - t4 + (t4 << 32);
3570957b409SSimon J. Gerraty 
3580957b409SSimon J. Gerraty 	d[0] = t0;
3590957b409SSimon J. Gerraty 	d[1] = t1;
3600957b409SSimon J. Gerraty 	d[2] = t2;
3610957b409SSimon J. Gerraty 	d[3] = t3;
3620957b409SSimon J. Gerraty 
3630957b409SSimon J. Gerraty #elif BR_UMUL128
3640957b409SSimon J. Gerraty 
3650957b409SSimon J. Gerraty 	uint64_t x, f, t0, t1, t2, t3, t4;
3660957b409SSimon J. Gerraty 	uint64_t zl, zh, ffl, ffh;
3670957b409SSimon J. Gerraty 	unsigned char k, m;
3680957b409SSimon J. Gerraty 	int i;
3690957b409SSimon J. Gerraty 
3700957b409SSimon J. Gerraty 	/*
3710957b409SSimon J. Gerraty 	 * When computing d <- d + a[u]*b, we also add f*p such
3720957b409SSimon J. Gerraty 	 * that d + a[u]*b + f*p is a multiple of 2^64. Since
3730957b409SSimon J. Gerraty 	 * p = -1 mod 2^64, we can compute f = d[0] + a[u]*b[0] mod 2^64.
3740957b409SSimon J. Gerraty 	 */
3750957b409SSimon J. Gerraty 
3760957b409SSimon J. Gerraty 	/*
3770957b409SSimon J. Gerraty 	 * Step 1: t <- (a[0]*b + f*p) / 2^64
3780957b409SSimon J. Gerraty 	 * We have f = a[0]*b[0] mod 2^64. Since p = -1 mod 2^64, this
3790957b409SSimon J. Gerraty 	 * ensures that (a[0]*b + f*p) is a multiple of 2^64.
3800957b409SSimon J. Gerraty 	 *
3810957b409SSimon J. Gerraty 	 * We also have: f*p = f*2^256 - f*2^224 + f*2^192 + f*2^96 - f.
3820957b409SSimon J. Gerraty 	 */
3830957b409SSimon J. Gerraty 	x = a[0];
3840957b409SSimon J. Gerraty 
3850957b409SSimon J. Gerraty 	zl = _umul128(b[0], x, &zh);
3860957b409SSimon J. Gerraty 	f = zl;
3870957b409SSimon J. Gerraty 	t0 = zh;
3880957b409SSimon J. Gerraty 
3890957b409SSimon J. Gerraty 	zl = _umul128(b[1], x, &zh);
3900957b409SSimon J. Gerraty 	k = _addcarry_u64(0, zl, t0, &zl);
3910957b409SSimon J. Gerraty 	(void)_addcarry_u64(k, zh, 0, &zh);
3920957b409SSimon J. Gerraty 	k = _addcarry_u64(0, zl, f << 32, &zl);
3930957b409SSimon J. Gerraty 	(void)_addcarry_u64(k, zh, 0, &zh);
3940957b409SSimon J. Gerraty 	t0 = zl;
3950957b409SSimon J. Gerraty 	t1 = zh;
3960957b409SSimon J. Gerraty 
3970957b409SSimon J. Gerraty 	zl = _umul128(b[2], x, &zh);
3980957b409SSimon J. Gerraty 	k = _addcarry_u64(0, zl, t1, &zl);
3990957b409SSimon J. Gerraty 	(void)_addcarry_u64(k, zh, 0, &zh);
4000957b409SSimon J. Gerraty 	k = _addcarry_u64(0, zl, f >> 32, &zl);
4010957b409SSimon J. Gerraty 	(void)_addcarry_u64(k, zh, 0, &zh);
4020957b409SSimon J. Gerraty 	t1 = zl;
4030957b409SSimon J. Gerraty 	t2 = zh;
4040957b409SSimon J. Gerraty 
4050957b409SSimon J. Gerraty 	zl = _umul128(b[3], x, &zh);
4060957b409SSimon J. Gerraty 	k = _addcarry_u64(0, zl, t2, &zl);
4070957b409SSimon J. Gerraty 	(void)_addcarry_u64(k, zh, 0, &zh);
4080957b409SSimon J. Gerraty 	k = _addcarry_u64(0, zl, f, &zl);
4090957b409SSimon J. Gerraty 	(void)_addcarry_u64(k, zh, 0, &zh);
4100957b409SSimon J. Gerraty 	t2 = zl;
4110957b409SSimon J. Gerraty 	t3 = zh;
4120957b409SSimon J. Gerraty 
4130957b409SSimon J. Gerraty 	t4 = _addcarry_u64(0, t3, f, &t3);
4140957b409SSimon J. Gerraty 	k = _subborrow_u64(0, t2, f << 32, &t2);
4150957b409SSimon J. Gerraty 	k = _subborrow_u64(k, t3, f >> 32, &t3);
4160957b409SSimon J. Gerraty 	(void)_subborrow_u64(k, t4, 0, &t4);
4170957b409SSimon J. Gerraty 
4180957b409SSimon J. Gerraty 	/*
4190957b409SSimon J. Gerraty 	 * Steps 2 to 4: t <- (t + a[i]*b + f*p) / 2^64
4200957b409SSimon J. Gerraty 	 */
4210957b409SSimon J. Gerraty 	for (i = 1; i < 4; i ++) {
4220957b409SSimon J. Gerraty 		x = a[i];
4230957b409SSimon J. Gerraty 		/* f = t0 + x * b[0]; -- computed below */
4240957b409SSimon J. Gerraty 
4250957b409SSimon J. Gerraty 		/* t <- (t + x*b - f) / 2^64 */
4260957b409SSimon J. Gerraty 		zl = _umul128(b[0], x, &zh);
4270957b409SSimon J. Gerraty 		k = _addcarry_u64(0, zl, t0, &f);
4280957b409SSimon J. Gerraty 		(void)_addcarry_u64(k, zh, 0, &t0);
4290957b409SSimon J. Gerraty 
4300957b409SSimon J. Gerraty 		zl = _umul128(b[1], x, &zh);
4310957b409SSimon J. Gerraty 		k = _addcarry_u64(0, zl, t0, &zl);
4320957b409SSimon J. Gerraty 		(void)_addcarry_u64(k, zh, 0, &zh);
4330957b409SSimon J. Gerraty 		k = _addcarry_u64(0, zl, t1, &t0);
4340957b409SSimon J. Gerraty 		(void)_addcarry_u64(k, zh, 0, &t1);
4350957b409SSimon J. Gerraty 
4360957b409SSimon J. Gerraty 		zl = _umul128(b[2], x, &zh);
4370957b409SSimon J. Gerraty 		k = _addcarry_u64(0, zl, t1, &zl);
4380957b409SSimon J. Gerraty 		(void)_addcarry_u64(k, zh, 0, &zh);
4390957b409SSimon J. Gerraty 		k = _addcarry_u64(0, zl, t2, &t1);
4400957b409SSimon J. Gerraty 		(void)_addcarry_u64(k, zh, 0, &t2);
4410957b409SSimon J. Gerraty 
4420957b409SSimon J. Gerraty 		zl = _umul128(b[3], x, &zh);
4430957b409SSimon J. Gerraty 		k = _addcarry_u64(0, zl, t2, &zl);
4440957b409SSimon J. Gerraty 		(void)_addcarry_u64(k, zh, 0, &zh);
4450957b409SSimon J. Gerraty 		k = _addcarry_u64(0, zl, t3, &t2);
4460957b409SSimon J. Gerraty 		(void)_addcarry_u64(k, zh, 0, &t3);
4470957b409SSimon J. Gerraty 
4480957b409SSimon J. Gerraty 		t4 = _addcarry_u64(0, t3, t4, &t3);
4490957b409SSimon J. Gerraty 
4500957b409SSimon J. Gerraty 		/* t <- t + f*2^32, carry in k */
4510957b409SSimon J. Gerraty 		k = _addcarry_u64(0, t0, f << 32, &t0);
4520957b409SSimon J. Gerraty 		k = _addcarry_u64(k, t1, f >> 32, &t1);
4530957b409SSimon J. Gerraty 
4540957b409SSimon J. Gerraty 		/* t <- t + f*2^192 - f*2^160 + f*2^128 */
4550957b409SSimon J. Gerraty 		m = _subborrow_u64(0, f, f << 32, &ffl);
4560957b409SSimon J. Gerraty 		(void)_subborrow_u64(m, f, f >> 32, &ffh);
4570957b409SSimon J. Gerraty 		k = _addcarry_u64(k, t2, ffl, &t2);
4580957b409SSimon J. Gerraty 		k = _addcarry_u64(k, t3, ffh, &t3);
4590957b409SSimon J. Gerraty 		(void)_addcarry_u64(k, t4, 0, &t4);
4600957b409SSimon J. Gerraty 	}
4610957b409SSimon J. Gerraty 
4620957b409SSimon J. Gerraty 	/*
4630957b409SSimon J. Gerraty 	 * At that point, we have computed t = (a*b + F*p) / 2^256, where
4640957b409SSimon J. Gerraty 	 * F is a 256-bit integer whose limbs are the "f" coefficients
4650957b409SSimon J. Gerraty 	 * in the steps above. We have:
4660957b409SSimon J. Gerraty 	 *   a <= 2^256-1
4670957b409SSimon J. Gerraty 	 *   b <= 2^256-1
4680957b409SSimon J. Gerraty 	 *   F <= 2^256-1
4690957b409SSimon J. Gerraty 	 * Hence:
4700957b409SSimon J. Gerraty 	 *   a*b + F*p <= (2^256-1)*(2^256-1) + p*(2^256-1)
4710957b409SSimon J. Gerraty 	 *   a*b + F*p <= 2^256*(2^256 - 2 + p) + 1 - p
4720957b409SSimon J. Gerraty 	 * Therefore:
4730957b409SSimon J. Gerraty 	 *   t < 2^256 + p - 2
4740957b409SSimon J. Gerraty 	 * Since p < 2^256, it follows that:
4750957b409SSimon J. Gerraty 	 *   t4 can be only 0 or 1
4760957b409SSimon J. Gerraty 	 *   t - p < 2^256
4770957b409SSimon J. Gerraty 	 * We can therefore subtract p from t, conditionally on t4, to
4780957b409SSimon J. Gerraty 	 * get a nonnegative result that fits on 256 bits.
4790957b409SSimon J. Gerraty 	 */
4800957b409SSimon J. Gerraty 	k = _addcarry_u64(0, t0, t4, &t0);
4810957b409SSimon J. Gerraty 	k = _addcarry_u64(k, t1, -(t4 << 32), &t1);
4820957b409SSimon J. Gerraty 	k = _addcarry_u64(k, t2, -t4, &t2);
4830957b409SSimon J. Gerraty 	(void)_addcarry_u64(k, t3, (t4 << 32) - (t4 << 1), &t3);
4840957b409SSimon J. Gerraty 
4850957b409SSimon J. Gerraty 	d[0] = t0;
4860957b409SSimon J. Gerraty 	d[1] = t1;
4870957b409SSimon J. Gerraty 	d[2] = t2;
4880957b409SSimon J. Gerraty 	d[3] = t3;
4890957b409SSimon J. Gerraty 
4900957b409SSimon J. Gerraty #endif
4910957b409SSimon J. Gerraty }
4920957b409SSimon J. Gerraty 
4930957b409SSimon J. Gerraty /*
4940957b409SSimon J. Gerraty  * Montgomery squaring in the field; currently a basic wrapper around
4950957b409SSimon J. Gerraty  * multiplication (inline, should be optimized away).
4960957b409SSimon J. Gerraty  * TODO: see if some extra speed can be gained here.
4970957b409SSimon J. Gerraty  */
4980957b409SSimon J. Gerraty static inline void
f256_montysquare(uint64_t * d,const uint64_t * a)4990957b409SSimon J. Gerraty f256_montysquare(uint64_t *d, const uint64_t *a)
5000957b409SSimon J. Gerraty {
5010957b409SSimon J. Gerraty 	f256_montymul(d, a, a);
5020957b409SSimon J. Gerraty }
5030957b409SSimon J. Gerraty 
5040957b409SSimon J. Gerraty /*
5050957b409SSimon J. Gerraty  * Convert to Montgomery representation.
5060957b409SSimon J. Gerraty  */
5070957b409SSimon J. Gerraty static void
f256_tomonty(uint64_t * d,const uint64_t * a)5080957b409SSimon J. Gerraty f256_tomonty(uint64_t *d, const uint64_t *a)
5090957b409SSimon J. Gerraty {
5100957b409SSimon J. Gerraty 	/*
5110957b409SSimon J. Gerraty 	 * R2 = 2^512 mod p.
5120957b409SSimon J. Gerraty 	 * If R = 2^256 mod p, then R2 = R^2 mod p; and the Montgomery
5130957b409SSimon J. Gerraty 	 * multiplication of a by R2 is: a*R2/R = a*R mod p, i.e. the
5140957b409SSimon J. Gerraty 	 * conversion to Montgomery representation.
5150957b409SSimon J. Gerraty 	 */
5160957b409SSimon J. Gerraty 	static const uint64_t R2[] = {
5170957b409SSimon J. Gerraty 		0x0000000000000003,
5180957b409SSimon J. Gerraty 		0xFFFFFFFBFFFFFFFF,
5190957b409SSimon J. Gerraty 		0xFFFFFFFFFFFFFFFE,
5200957b409SSimon J. Gerraty 		0x00000004FFFFFFFD
5210957b409SSimon J. Gerraty 	};
5220957b409SSimon J. Gerraty 
5230957b409SSimon J. Gerraty 	f256_montymul(d, a, R2);
5240957b409SSimon J. Gerraty }
5250957b409SSimon J. Gerraty 
5260957b409SSimon J. Gerraty /*
5270957b409SSimon J. Gerraty  * Convert from Montgomery representation.
5280957b409SSimon J. Gerraty  */
5290957b409SSimon J. Gerraty static void
f256_frommonty(uint64_t * d,const uint64_t * a)5300957b409SSimon J. Gerraty f256_frommonty(uint64_t *d, const uint64_t *a)
5310957b409SSimon J. Gerraty {
5320957b409SSimon J. Gerraty 	/*
5330957b409SSimon J. Gerraty 	 * Montgomery multiplication by 1 is division by 2^256 modulo p.
5340957b409SSimon J. Gerraty 	 */
5350957b409SSimon J. Gerraty 	static const uint64_t one[] = { 1, 0, 0, 0 };
5360957b409SSimon J. Gerraty 
5370957b409SSimon J. Gerraty 	f256_montymul(d, a, one);
5380957b409SSimon J. Gerraty }
5390957b409SSimon J. Gerraty 
5400957b409SSimon J. Gerraty /*
5410957b409SSimon J. Gerraty  * Inversion in the field. If the source value is 0 modulo p, then this
5420957b409SSimon J. Gerraty  * returns 0 or p. This function uses Montgomery representation.
5430957b409SSimon J. Gerraty  */
5440957b409SSimon J. Gerraty static void
f256_invert(uint64_t * d,const uint64_t * a)5450957b409SSimon J. Gerraty f256_invert(uint64_t *d, const uint64_t *a)
5460957b409SSimon J. Gerraty {
5470957b409SSimon J. Gerraty 	/*
5480957b409SSimon J. Gerraty 	 * We compute a^(p-2) mod p. The exponent pattern (from high to
5490957b409SSimon J. Gerraty 	 * low) is:
5500957b409SSimon J. Gerraty 	 *  - 32 bits of value 1
5510957b409SSimon J. Gerraty 	 *  - 31 bits of value 0
5520957b409SSimon J. Gerraty 	 *  - 1 bit of value 1
5530957b409SSimon J. Gerraty 	 *  - 96 bits of value 0
5540957b409SSimon J. Gerraty 	 *  - 94 bits of value 1
5550957b409SSimon J. Gerraty 	 *  - 1 bit of value 0
5560957b409SSimon J. Gerraty 	 *  - 1 bit of value 1
5570957b409SSimon J. Gerraty 	 * To speed up the square-and-multiply algorithm, we precompute
5580957b409SSimon J. Gerraty 	 * a^(2^31-1).
5590957b409SSimon J. Gerraty 	 */
5600957b409SSimon J. Gerraty 
5610957b409SSimon J. Gerraty 	uint64_t r[4], t[4];
5620957b409SSimon J. Gerraty 	int i;
5630957b409SSimon J. Gerraty 
5640957b409SSimon J. Gerraty 	memcpy(t, a, sizeof t);
5650957b409SSimon J. Gerraty 	for (i = 0; i < 30; i ++) {
5660957b409SSimon J. Gerraty 		f256_montysquare(t, t);
5670957b409SSimon J. Gerraty 		f256_montymul(t, t, a);
5680957b409SSimon J. Gerraty 	}
5690957b409SSimon J. Gerraty 
5700957b409SSimon J. Gerraty 	memcpy(r, t, sizeof t);
5710957b409SSimon J. Gerraty 	for (i = 224; i >= 0; i --) {
5720957b409SSimon J. Gerraty 		f256_montysquare(r, r);
5730957b409SSimon J. Gerraty 		switch (i) {
5740957b409SSimon J. Gerraty 		case 0:
5750957b409SSimon J. Gerraty 		case 2:
5760957b409SSimon J. Gerraty 		case 192:
5770957b409SSimon J. Gerraty 		case 224:
5780957b409SSimon J. Gerraty 			f256_montymul(r, r, a);
5790957b409SSimon J. Gerraty 			break;
5800957b409SSimon J. Gerraty 		case 3:
5810957b409SSimon J. Gerraty 		case 34:
5820957b409SSimon J. Gerraty 		case 65:
5830957b409SSimon J. Gerraty 			f256_montymul(r, r, t);
5840957b409SSimon J. Gerraty 			break;
5850957b409SSimon J. Gerraty 		}
5860957b409SSimon J. Gerraty 	}
5870957b409SSimon J. Gerraty 	memcpy(d, r, sizeof r);
5880957b409SSimon J. Gerraty }
5890957b409SSimon J. Gerraty 
5900957b409SSimon J. Gerraty /*
5910957b409SSimon J. Gerraty  * Finalize reduction.
5920957b409SSimon J. Gerraty  * Input value fits on 256 bits. This function subtracts p if and only
5930957b409SSimon J. Gerraty  * if the input is greater than or equal to p.
5940957b409SSimon J. Gerraty  */
5950957b409SSimon J. Gerraty static inline void
f256_final_reduce(uint64_t * a)5960957b409SSimon J. Gerraty f256_final_reduce(uint64_t *a)
5970957b409SSimon J. Gerraty {
5980957b409SSimon J. Gerraty #if BR_INT128
5990957b409SSimon J. Gerraty 
6000957b409SSimon J. Gerraty 	uint64_t t0, t1, t2, t3, cc;
6010957b409SSimon J. Gerraty 	unsigned __int128 z;
6020957b409SSimon J. Gerraty 
6030957b409SSimon J. Gerraty 	/*
6040957b409SSimon J. Gerraty 	 * We add 2^224 - 2^192 - 2^96 + 1 to a. If there is no carry,
6050957b409SSimon J. Gerraty 	 * then a < p; otherwise, the addition result we computed is
6060957b409SSimon J. Gerraty 	 * the value we must return.
6070957b409SSimon J. Gerraty 	 */
6080957b409SSimon J. Gerraty 	z = (unsigned __int128)a[0] + 1;
6090957b409SSimon J. Gerraty 	t0 = (uint64_t)z;
6100957b409SSimon J. Gerraty 	z = (unsigned __int128)a[1] + (z >> 64) - ((uint64_t)1 << 32);
6110957b409SSimon J. Gerraty 	t1 = (uint64_t)z;
6120957b409SSimon J. Gerraty 	z = (unsigned __int128)a[2] - (z >> 127);
6130957b409SSimon J. Gerraty 	t2 = (uint64_t)z;
6140957b409SSimon J. Gerraty 	z = (unsigned __int128)a[3] - (z >> 127) + 0xFFFFFFFF;
6150957b409SSimon J. Gerraty 	t3 = (uint64_t)z;
6160957b409SSimon J. Gerraty 	cc = -(uint64_t)(z >> 64);
6170957b409SSimon J. Gerraty 
6180957b409SSimon J. Gerraty 	a[0] ^= cc & (a[0] ^ t0);
6190957b409SSimon J. Gerraty 	a[1] ^= cc & (a[1] ^ t1);
6200957b409SSimon J. Gerraty 	a[2] ^= cc & (a[2] ^ t2);
6210957b409SSimon J. Gerraty 	a[3] ^= cc & (a[3] ^ t3);
6220957b409SSimon J. Gerraty 
6230957b409SSimon J. Gerraty #elif BR_UMUL128
6240957b409SSimon J. Gerraty 
6250957b409SSimon J. Gerraty 	uint64_t t0, t1, t2, t3, m;
6260957b409SSimon J. Gerraty 	unsigned char k;
6270957b409SSimon J. Gerraty 
6280957b409SSimon J. Gerraty 	k = _addcarry_u64(0, a[0], (uint64_t)1, &t0);
6290957b409SSimon J. Gerraty 	k = _addcarry_u64(k, a[1], -((uint64_t)1 << 32), &t1);
6300957b409SSimon J. Gerraty 	k = _addcarry_u64(k, a[2], -(uint64_t)1, &t2);
6310957b409SSimon J. Gerraty 	k = _addcarry_u64(k, a[3], ((uint64_t)1 << 32) - 2, &t3);
6320957b409SSimon J. Gerraty 	m = -(uint64_t)k;
6330957b409SSimon J. Gerraty 
6340957b409SSimon J. Gerraty 	a[0] ^= m & (a[0] ^ t0);
6350957b409SSimon J. Gerraty 	a[1] ^= m & (a[1] ^ t1);
6360957b409SSimon J. Gerraty 	a[2] ^= m & (a[2] ^ t2);
6370957b409SSimon J. Gerraty 	a[3] ^= m & (a[3] ^ t3);
6380957b409SSimon J. Gerraty 
6390957b409SSimon J. Gerraty #endif
6400957b409SSimon J. Gerraty }
6410957b409SSimon J. Gerraty 
6420957b409SSimon J. Gerraty /*
6430957b409SSimon J. Gerraty  * Points in affine and Jacobian coordinates.
6440957b409SSimon J. Gerraty  *
6450957b409SSimon J. Gerraty  *  - In affine coordinates, the point-at-infinity cannot be encoded.
6460957b409SSimon J. Gerraty  *  - Jacobian coordinates (X,Y,Z) correspond to affine (X/Z^2,Y/Z^3);
6470957b409SSimon J. Gerraty  *    if Z = 0 then this is the point-at-infinity.
6480957b409SSimon J. Gerraty  */
6490957b409SSimon J. Gerraty typedef struct {
6500957b409SSimon J. Gerraty 	uint64_t x[4];
6510957b409SSimon J. Gerraty 	uint64_t y[4];
6520957b409SSimon J. Gerraty } p256_affine;
6530957b409SSimon J. Gerraty 
6540957b409SSimon J. Gerraty typedef struct {
6550957b409SSimon J. Gerraty 	uint64_t x[4];
6560957b409SSimon J. Gerraty 	uint64_t y[4];
6570957b409SSimon J. Gerraty 	uint64_t z[4];
6580957b409SSimon J. Gerraty } p256_jacobian;
6590957b409SSimon J. Gerraty 
6600957b409SSimon J. Gerraty /*
6610957b409SSimon J. Gerraty  * Decode a point. The returned point is in Jacobian coordinates, but
6620957b409SSimon J. Gerraty  * with z = 1. If the encoding is invalid, or encodes a point which is
6630957b409SSimon J. Gerraty  * not on the curve, or encodes the point at infinity, then this function
6640957b409SSimon J. Gerraty  * returns 0. Otherwise, 1 is returned.
6650957b409SSimon J. Gerraty  *
6660957b409SSimon J. Gerraty  * The buffer is assumed to have length exactly 65 bytes.
6670957b409SSimon J. Gerraty  */
6680957b409SSimon J. Gerraty static uint32_t
point_decode(p256_jacobian * P,const unsigned char * buf)6690957b409SSimon J. Gerraty point_decode(p256_jacobian *P, const unsigned char *buf)
6700957b409SSimon J. Gerraty {
6710957b409SSimon J. Gerraty 	uint64_t x[4], y[4], t[4], x3[4], tt;
6720957b409SSimon J. Gerraty 	uint32_t r;
6730957b409SSimon J. Gerraty 
6740957b409SSimon J. Gerraty 	/*
6750957b409SSimon J. Gerraty 	 * Header byte shall be 0x04.
6760957b409SSimon J. Gerraty 	 */
6770957b409SSimon J. Gerraty 	r = EQ(buf[0], 0x04);
6780957b409SSimon J. Gerraty 
6790957b409SSimon J. Gerraty 	/*
6800957b409SSimon J. Gerraty 	 * Decode X and Y coordinates, and convert them into
6810957b409SSimon J. Gerraty 	 * Montgomery representation.
6820957b409SSimon J. Gerraty 	 */
6830957b409SSimon J. Gerraty 	x[3] = br_dec64be(buf +  1);
6840957b409SSimon J. Gerraty 	x[2] = br_dec64be(buf +  9);
6850957b409SSimon J. Gerraty 	x[1] = br_dec64be(buf + 17);
6860957b409SSimon J. Gerraty 	x[0] = br_dec64be(buf + 25);
6870957b409SSimon J. Gerraty 	y[3] = br_dec64be(buf + 33);
6880957b409SSimon J. Gerraty 	y[2] = br_dec64be(buf + 41);
6890957b409SSimon J. Gerraty 	y[1] = br_dec64be(buf + 49);
6900957b409SSimon J. Gerraty 	y[0] = br_dec64be(buf + 57);
6910957b409SSimon J. Gerraty 	f256_tomonty(x, x);
6920957b409SSimon J. Gerraty 	f256_tomonty(y, y);
6930957b409SSimon J. Gerraty 
6940957b409SSimon J. Gerraty 	/*
6950957b409SSimon J. Gerraty 	 * Verify y^2 = x^3 + A*x + B. In curve P-256, A = -3.
6960957b409SSimon J. Gerraty 	 * Note that the Montgomery representation of 0 is 0. We must
6970957b409SSimon J. Gerraty 	 * take care to apply the final reduction to make sure we have
6980957b409SSimon J. Gerraty 	 * 0 and not p.
6990957b409SSimon J. Gerraty 	 */
7000957b409SSimon J. Gerraty 	f256_montysquare(t, y);
7010957b409SSimon J. Gerraty 	f256_montysquare(x3, x);
7020957b409SSimon J. Gerraty 	f256_montymul(x3, x3, x);
7030957b409SSimon J. Gerraty 	f256_sub(t, t, x3);
7040957b409SSimon J. Gerraty 	f256_add(t, t, x);
7050957b409SSimon J. Gerraty 	f256_add(t, t, x);
7060957b409SSimon J. Gerraty 	f256_add(t, t, x);
7070957b409SSimon J. Gerraty 	f256_sub(t, t, P256_B_MONTY);
7080957b409SSimon J. Gerraty 	f256_final_reduce(t);
7090957b409SSimon J. Gerraty 	tt = t[0] | t[1] | t[2] | t[3];
7100957b409SSimon J. Gerraty 	r &= EQ((uint32_t)(tt | (tt >> 32)), 0);
7110957b409SSimon J. Gerraty 
7120957b409SSimon J. Gerraty 	/*
7130957b409SSimon J. Gerraty 	 * Return the point in Jacobian coordinates (and Montgomery
7140957b409SSimon J. Gerraty 	 * representation).
7150957b409SSimon J. Gerraty 	 */
7160957b409SSimon J. Gerraty 	memcpy(P->x, x, sizeof x);
7170957b409SSimon J. Gerraty 	memcpy(P->y, y, sizeof y);
7180957b409SSimon J. Gerraty 	memcpy(P->z, F256_R, sizeof F256_R);
7190957b409SSimon J. Gerraty 	return r;
7200957b409SSimon J. Gerraty }
7210957b409SSimon J. Gerraty 
7220957b409SSimon J. Gerraty /*
7230957b409SSimon J. Gerraty  * Final conversion for a point:
7240957b409SSimon J. Gerraty  *  - The point is converted back to affine coordinates.
7250957b409SSimon J. Gerraty  *  - Final reduction is performed.
7260957b409SSimon J. Gerraty  *  - The point is encoded into the provided buffer.
7270957b409SSimon J. Gerraty  *
7280957b409SSimon J. Gerraty  * If the point is the point-at-infinity, all operations are performed,
7290957b409SSimon J. Gerraty  * but the buffer contents are indeterminate, and 0 is returned. Otherwise,
7300957b409SSimon J. Gerraty  * the encoded point is written in the buffer, and 1 is returned.
7310957b409SSimon J. Gerraty  */
7320957b409SSimon J. Gerraty static uint32_t
point_encode(unsigned char * buf,const p256_jacobian * P)7330957b409SSimon J. Gerraty point_encode(unsigned char *buf, const p256_jacobian *P)
7340957b409SSimon J. Gerraty {
7350957b409SSimon J. Gerraty 	uint64_t t1[4], t2[4], z;
7360957b409SSimon J. Gerraty 
7370957b409SSimon J. Gerraty 	/* Set t1 = 1/z^2 and t2 = 1/z^3. */
7380957b409SSimon J. Gerraty 	f256_invert(t2, P->z);
7390957b409SSimon J. Gerraty 	f256_montysquare(t1, t2);
7400957b409SSimon J. Gerraty 	f256_montymul(t2, t2, t1);
7410957b409SSimon J. Gerraty 
7420957b409SSimon J. Gerraty 	/* Compute affine coordinates x (in t1) and y (in t2). */
7430957b409SSimon J. Gerraty 	f256_montymul(t1, P->x, t1);
7440957b409SSimon J. Gerraty 	f256_montymul(t2, P->y, t2);
7450957b409SSimon J. Gerraty 
7460957b409SSimon J. Gerraty 	/* Convert back from Montgomery representation, and finalize
7470957b409SSimon J. Gerraty 	   reductions. */
7480957b409SSimon J. Gerraty 	f256_frommonty(t1, t1);
7490957b409SSimon J. Gerraty 	f256_frommonty(t2, t2);
7500957b409SSimon J. Gerraty 	f256_final_reduce(t1);
7510957b409SSimon J. Gerraty 	f256_final_reduce(t2);
7520957b409SSimon J. Gerraty 
7530957b409SSimon J. Gerraty 	/* Encode. */
7540957b409SSimon J. Gerraty 	buf[0] = 0x04;
7550957b409SSimon J. Gerraty 	br_enc64be(buf +  1, t1[3]);
7560957b409SSimon J. Gerraty 	br_enc64be(buf +  9, t1[2]);
7570957b409SSimon J. Gerraty 	br_enc64be(buf + 17, t1[1]);
7580957b409SSimon J. Gerraty 	br_enc64be(buf + 25, t1[0]);
7590957b409SSimon J. Gerraty 	br_enc64be(buf + 33, t2[3]);
7600957b409SSimon J. Gerraty 	br_enc64be(buf + 41, t2[2]);
7610957b409SSimon J. Gerraty 	br_enc64be(buf + 49, t2[1]);
7620957b409SSimon J. Gerraty 	br_enc64be(buf + 57, t2[0]);
7630957b409SSimon J. Gerraty 
7640957b409SSimon J. Gerraty 	/* Return success if and only if P->z != 0. */
7650957b409SSimon J. Gerraty 	z = P->z[0] | P->z[1] | P->z[2] | P->z[3];
7660957b409SSimon J. Gerraty 	return NEQ((uint32_t)(z | z >> 32), 0);
7670957b409SSimon J. Gerraty }
7680957b409SSimon J. Gerraty 
7690957b409SSimon J. Gerraty /*
7700957b409SSimon J. Gerraty  * Point doubling in Jacobian coordinates: point P is doubled.
7710957b409SSimon J. Gerraty  * Note: if the source point is the point-at-infinity, then the result is
7720957b409SSimon J. Gerraty  * still the point-at-infinity, which is correct. Moreover, if the three
7730957b409SSimon J. Gerraty  * coordinates were zero, then they still are zero in the returned value.
7740957b409SSimon J. Gerraty  *
7750957b409SSimon J. Gerraty  * (Note: this is true even without the final reduction: if the three
7760957b409SSimon J. Gerraty  * coordinates are encoded as four words of value zero each, then the
7770957b409SSimon J. Gerraty  * result will also have all-zero coordinate encodings, not the alternate
7780957b409SSimon J. Gerraty  * encoding as the integer p.)
7790957b409SSimon J. Gerraty  */
7800957b409SSimon J. Gerraty static void
p256_double(p256_jacobian * P)7810957b409SSimon J. Gerraty p256_double(p256_jacobian *P)
7820957b409SSimon J. Gerraty {
7830957b409SSimon J. Gerraty 	/*
7840957b409SSimon J. Gerraty 	 * Doubling formulas are:
7850957b409SSimon J. Gerraty 	 *
7860957b409SSimon J. Gerraty 	 *   s = 4*x*y^2
7870957b409SSimon J. Gerraty 	 *   m = 3*(x + z^2)*(x - z^2)
7880957b409SSimon J. Gerraty 	 *   x' = m^2 - 2*s
7890957b409SSimon J. Gerraty 	 *   y' = m*(s - x') - 8*y^4
7900957b409SSimon J. Gerraty 	 *   z' = 2*y*z
7910957b409SSimon J. Gerraty 	 *
7920957b409SSimon J. Gerraty 	 * These formulas work for all points, including points of order 2
7930957b409SSimon J. Gerraty 	 * and points at infinity:
7940957b409SSimon J. Gerraty 	 *   - If y = 0 then z' = 0. But there is no such point in P-256
7950957b409SSimon J. Gerraty 	 *     anyway.
7960957b409SSimon J. Gerraty 	 *   - If z = 0 then z' = 0.
7970957b409SSimon J. Gerraty 	 */
7980957b409SSimon J. Gerraty 	uint64_t t1[4], t2[4], t3[4], t4[4];
7990957b409SSimon J. Gerraty 
8000957b409SSimon J. Gerraty 	/*
8010957b409SSimon J. Gerraty 	 * Compute z^2 in t1.
8020957b409SSimon J. Gerraty 	 */
8030957b409SSimon J. Gerraty 	f256_montysquare(t1, P->z);
8040957b409SSimon J. Gerraty 
8050957b409SSimon J. Gerraty 	/*
8060957b409SSimon J. Gerraty 	 * Compute x-z^2 in t2 and x+z^2 in t1.
8070957b409SSimon J. Gerraty 	 */
8080957b409SSimon J. Gerraty 	f256_add(t2, P->x, t1);
8090957b409SSimon J. Gerraty 	f256_sub(t1, P->x, t1);
8100957b409SSimon J. Gerraty 
8110957b409SSimon J. Gerraty 	/*
8120957b409SSimon J. Gerraty 	 * Compute 3*(x+z^2)*(x-z^2) in t1.
8130957b409SSimon J. Gerraty 	 */
8140957b409SSimon J. Gerraty 	f256_montymul(t3, t1, t2);
8150957b409SSimon J. Gerraty 	f256_add(t1, t3, t3);
8160957b409SSimon J. Gerraty 	f256_add(t1, t3, t1);
8170957b409SSimon J. Gerraty 
8180957b409SSimon J. Gerraty 	/*
8190957b409SSimon J. Gerraty 	 * Compute 4*x*y^2 (in t2) and 2*y^2 (in t3).
8200957b409SSimon J. Gerraty 	 */
8210957b409SSimon J. Gerraty 	f256_montysquare(t3, P->y);
8220957b409SSimon J. Gerraty 	f256_add(t3, t3, t3);
8230957b409SSimon J. Gerraty 	f256_montymul(t2, P->x, t3);
8240957b409SSimon J. Gerraty 	f256_add(t2, t2, t2);
8250957b409SSimon J. Gerraty 
8260957b409SSimon J. Gerraty 	/*
8270957b409SSimon J. Gerraty 	 * Compute x' = m^2 - 2*s.
8280957b409SSimon J. Gerraty 	 */
8290957b409SSimon J. Gerraty 	f256_montysquare(P->x, t1);
8300957b409SSimon J. Gerraty 	f256_sub(P->x, P->x, t2);
8310957b409SSimon J. Gerraty 	f256_sub(P->x, P->x, t2);
8320957b409SSimon J. Gerraty 
8330957b409SSimon J. Gerraty 	/*
8340957b409SSimon J. Gerraty 	 * Compute z' = 2*y*z.
8350957b409SSimon J. Gerraty 	 */
8360957b409SSimon J. Gerraty 	f256_montymul(t4, P->y, P->z);
8370957b409SSimon J. Gerraty 	f256_add(P->z, t4, t4);
8380957b409SSimon J. Gerraty 
8390957b409SSimon J. Gerraty 	/*
8400957b409SSimon J. Gerraty 	 * Compute y' = m*(s - x') - 8*y^4. Note that we already have
8410957b409SSimon J. Gerraty 	 * 2*y^2 in t3.
8420957b409SSimon J. Gerraty 	 */
8430957b409SSimon J. Gerraty 	f256_sub(t2, t2, P->x);
8440957b409SSimon J. Gerraty 	f256_montymul(P->y, t1, t2);
8450957b409SSimon J. Gerraty 	f256_montysquare(t4, t3);
8460957b409SSimon J. Gerraty 	f256_add(t4, t4, t4);
8470957b409SSimon J. Gerraty 	f256_sub(P->y, P->y, t4);
8480957b409SSimon J. Gerraty }
8490957b409SSimon J. Gerraty 
8500957b409SSimon J. Gerraty /*
8510957b409SSimon J. Gerraty  * Point addition (Jacobian coordinates): P1 is replaced with P1+P2.
8520957b409SSimon J. Gerraty  * This function computes the wrong result in the following cases:
8530957b409SSimon J. Gerraty  *
8540957b409SSimon J. Gerraty  *   - If P1 == 0 but P2 != 0
8550957b409SSimon J. Gerraty  *   - If P1 != 0 but P2 == 0
8560957b409SSimon J. Gerraty  *   - If P1 == P2
8570957b409SSimon J. Gerraty  *
8580957b409SSimon J. Gerraty  * In all three cases, P1 is set to the point at infinity.
8590957b409SSimon J. Gerraty  *
8600957b409SSimon J. Gerraty  * Returned value is 0 if one of the following occurs:
8610957b409SSimon J. Gerraty  *
8620957b409SSimon J. Gerraty  *   - P1 and P2 have the same Y coordinate.
8630957b409SSimon J. Gerraty  *   - P1 == 0 and P2 == 0.
8640957b409SSimon J. Gerraty  *   - The Y coordinate of one of the points is 0 and the other point is
8650957b409SSimon J. Gerraty  *     the point at infinity.
8660957b409SSimon J. Gerraty  *
8670957b409SSimon J. Gerraty  * The third case cannot actually happen with valid points, since a point
8680957b409SSimon J. Gerraty  * with Y == 0 is a point of order 2, and there is no point of order 2 on
8690957b409SSimon J. Gerraty  * curve P-256.
8700957b409SSimon J. Gerraty  *
8710957b409SSimon J. Gerraty  * Therefore, assuming that P1 != 0 and P2 != 0 on input, then the caller
8720957b409SSimon J. Gerraty  * can apply the following:
8730957b409SSimon J. Gerraty  *
8740957b409SSimon J. Gerraty  *   - If the result is not the point at infinity, then it is correct.
8750957b409SSimon J. Gerraty  *   - Otherwise, if the returned value is 1, then this is a case of
8760957b409SSimon J. Gerraty  *     P1+P2 == 0, so the result is indeed the point at infinity.
8770957b409SSimon J. Gerraty  *   - Otherwise, P1 == P2, so a "double" operation should have been
8780957b409SSimon J. Gerraty  *     performed.
8790957b409SSimon J. Gerraty  *
8800957b409SSimon J. Gerraty  * Note that you can get a returned value of 0 with a correct result,
8810957b409SSimon J. Gerraty  * e.g. if P1 and P2 have the same Y coordinate, but distinct X coordinates.
8820957b409SSimon J. Gerraty  */
8830957b409SSimon J. Gerraty static uint32_t
p256_add(p256_jacobian * P1,const p256_jacobian * P2)8840957b409SSimon J. Gerraty p256_add(p256_jacobian *P1, const p256_jacobian *P2)
8850957b409SSimon J. Gerraty {
8860957b409SSimon J. Gerraty 	/*
8870957b409SSimon J. Gerraty 	 * Addtions formulas are:
8880957b409SSimon J. Gerraty 	 *
8890957b409SSimon J. Gerraty 	 *   u1 = x1 * z2^2
8900957b409SSimon J. Gerraty 	 *   u2 = x2 * z1^2
8910957b409SSimon J. Gerraty 	 *   s1 = y1 * z2^3
8920957b409SSimon J. Gerraty 	 *   s2 = y2 * z1^3
8930957b409SSimon J. Gerraty 	 *   h = u2 - u1
8940957b409SSimon J. Gerraty 	 *   r = s2 - s1
8950957b409SSimon J. Gerraty 	 *   x3 = r^2 - h^3 - 2 * u1 * h^2
8960957b409SSimon J. Gerraty 	 *   y3 = r * (u1 * h^2 - x3) - s1 * h^3
8970957b409SSimon J. Gerraty 	 *   z3 = h * z1 * z2
8980957b409SSimon J. Gerraty 	 */
8990957b409SSimon J. Gerraty 	uint64_t t1[4], t2[4], t3[4], t4[4], t5[4], t6[4], t7[4], tt;
9000957b409SSimon J. Gerraty 	uint32_t ret;
9010957b409SSimon J. Gerraty 
9020957b409SSimon J. Gerraty 	/*
9030957b409SSimon J. Gerraty 	 * Compute u1 = x1*z2^2 (in t1) and s1 = y1*z2^3 (in t3).
9040957b409SSimon J. Gerraty 	 */
9050957b409SSimon J. Gerraty 	f256_montysquare(t3, P2->z);
9060957b409SSimon J. Gerraty 	f256_montymul(t1, P1->x, t3);
9070957b409SSimon J. Gerraty 	f256_montymul(t4, P2->z, t3);
9080957b409SSimon J. Gerraty 	f256_montymul(t3, P1->y, t4);
9090957b409SSimon J. Gerraty 
9100957b409SSimon J. Gerraty 	/*
9110957b409SSimon J. Gerraty 	 * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
9120957b409SSimon J. Gerraty 	 */
9130957b409SSimon J. Gerraty 	f256_montysquare(t4, P1->z);
9140957b409SSimon J. Gerraty 	f256_montymul(t2, P2->x, t4);
9150957b409SSimon J. Gerraty 	f256_montymul(t5, P1->z, t4);
9160957b409SSimon J. Gerraty 	f256_montymul(t4, P2->y, t5);
9170957b409SSimon J. Gerraty 
9180957b409SSimon J. Gerraty 	/*
9190957b409SSimon J. Gerraty 	 * Compute h = h2 - u1 (in t2) and r = s2 - s1 (in t4).
9200957b409SSimon J. Gerraty 	 * We need to test whether r is zero, so we will do some extra
9210957b409SSimon J. Gerraty 	 * reduce.
9220957b409SSimon J. Gerraty 	 */
9230957b409SSimon J. Gerraty 	f256_sub(t2, t2, t1);
9240957b409SSimon J. Gerraty 	f256_sub(t4, t4, t3);
9250957b409SSimon J. Gerraty 	f256_final_reduce(t4);
9260957b409SSimon J. Gerraty 	tt = t4[0] | t4[1] | t4[2] | t4[3];
9270957b409SSimon J. Gerraty 	ret = (uint32_t)(tt | (tt >> 32));
9280957b409SSimon J. Gerraty 	ret = (ret | -ret) >> 31;
9290957b409SSimon J. Gerraty 
9300957b409SSimon J. Gerraty 	/*
9310957b409SSimon J. Gerraty 	 * Compute u1*h^2 (in t6) and h^3 (in t5);
9320957b409SSimon J. Gerraty 	 */
9330957b409SSimon J. Gerraty 	f256_montysquare(t7, t2);
9340957b409SSimon J. Gerraty 	f256_montymul(t6, t1, t7);
9350957b409SSimon J. Gerraty 	f256_montymul(t5, t7, t2);
9360957b409SSimon J. Gerraty 
9370957b409SSimon J. Gerraty 	/*
9380957b409SSimon J. Gerraty 	 * Compute x3 = r^2 - h^3 - 2*u1*h^2.
9390957b409SSimon J. Gerraty 	 */
9400957b409SSimon J. Gerraty 	f256_montysquare(P1->x, t4);
9410957b409SSimon J. Gerraty 	f256_sub(P1->x, P1->x, t5);
9420957b409SSimon J. Gerraty 	f256_sub(P1->x, P1->x, t6);
9430957b409SSimon J. Gerraty 	f256_sub(P1->x, P1->x, t6);
9440957b409SSimon J. Gerraty 
9450957b409SSimon J. Gerraty 	/*
9460957b409SSimon J. Gerraty 	 * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
9470957b409SSimon J. Gerraty 	 */
9480957b409SSimon J. Gerraty 	f256_sub(t6, t6, P1->x);
9490957b409SSimon J. Gerraty 	f256_montymul(P1->y, t4, t6);
9500957b409SSimon J. Gerraty 	f256_montymul(t1, t5, t3);
9510957b409SSimon J. Gerraty 	f256_sub(P1->y, P1->y, t1);
9520957b409SSimon J. Gerraty 
9530957b409SSimon J. Gerraty 	/*
9540957b409SSimon J. Gerraty 	 * Compute z3 = h*z1*z2.
9550957b409SSimon J. Gerraty 	 */
9560957b409SSimon J. Gerraty 	f256_montymul(t1, P1->z, P2->z);
9570957b409SSimon J. Gerraty 	f256_montymul(P1->z, t1, t2);
9580957b409SSimon J. Gerraty 
9590957b409SSimon J. Gerraty 	return ret;
9600957b409SSimon J. Gerraty }
9610957b409SSimon J. Gerraty 
9620957b409SSimon J. Gerraty /*
9630957b409SSimon J. Gerraty  * Point addition (mixed coordinates): P1 is replaced with P1+P2.
9640957b409SSimon J. Gerraty  * This is a specialised function for the case when P2 is a non-zero point
9650957b409SSimon J. Gerraty  * in affine coordinates.
9660957b409SSimon J. Gerraty  *
9670957b409SSimon J. Gerraty  * This function computes the wrong result in the following cases:
9680957b409SSimon J. Gerraty  *
9690957b409SSimon J. Gerraty  *   - If P1 == 0
9700957b409SSimon J. Gerraty  *   - If P1 == P2
9710957b409SSimon J. Gerraty  *
9720957b409SSimon J. Gerraty  * In both cases, P1 is set to the point at infinity.
9730957b409SSimon J. Gerraty  *
9740957b409SSimon J. Gerraty  * Returned value is 0 if one of the following occurs:
9750957b409SSimon J. Gerraty  *
9760957b409SSimon J. Gerraty  *   - P1 and P2 have the same Y (affine) coordinate.
9770957b409SSimon J. Gerraty  *   - The Y coordinate of P2 is 0 and P1 is the point at infinity.
9780957b409SSimon J. Gerraty  *
9790957b409SSimon J. Gerraty  * The second case cannot actually happen with valid points, since a point
9800957b409SSimon J. Gerraty  * with Y == 0 is a point of order 2, and there is no point of order 2 on
9810957b409SSimon J. Gerraty  * curve P-256.
9820957b409SSimon J. Gerraty  *
9830957b409SSimon J. Gerraty  * Therefore, assuming that P1 != 0 on input, then the caller
9840957b409SSimon J. Gerraty  * can apply the following:
9850957b409SSimon J. Gerraty  *
9860957b409SSimon J. Gerraty  *   - If the result is not the point at infinity, then it is correct.
9870957b409SSimon J. Gerraty  *   - Otherwise, if the returned value is 1, then this is a case of
9880957b409SSimon J. Gerraty  *     P1+P2 == 0, so the result is indeed the point at infinity.
9890957b409SSimon J. Gerraty  *   - Otherwise, P1 == P2, so a "double" operation should have been
9900957b409SSimon J. Gerraty  *     performed.
9910957b409SSimon J. Gerraty  *
9920957b409SSimon J. Gerraty  * Again, a value of 0 may be returned in some cases where the addition
9930957b409SSimon J. Gerraty  * result is correct.
9940957b409SSimon J. Gerraty  */
9950957b409SSimon J. Gerraty static uint32_t
p256_add_mixed(p256_jacobian * P1,const p256_affine * P2)9960957b409SSimon J. Gerraty p256_add_mixed(p256_jacobian *P1, const p256_affine *P2)
9970957b409SSimon J. Gerraty {
9980957b409SSimon J. Gerraty 	/*
9990957b409SSimon J. Gerraty 	 * Addtions formulas are:
10000957b409SSimon J. Gerraty 	 *
10010957b409SSimon J. Gerraty 	 *   u1 = x1
10020957b409SSimon J. Gerraty 	 *   u2 = x2 * z1^2
10030957b409SSimon J. Gerraty 	 *   s1 = y1
10040957b409SSimon J. Gerraty 	 *   s2 = y2 * z1^3
10050957b409SSimon J. Gerraty 	 *   h = u2 - u1
10060957b409SSimon J. Gerraty 	 *   r = s2 - s1
10070957b409SSimon J. Gerraty 	 *   x3 = r^2 - h^3 - 2 * u1 * h^2
10080957b409SSimon J. Gerraty 	 *   y3 = r * (u1 * h^2 - x3) - s1 * h^3
10090957b409SSimon J. Gerraty 	 *   z3 = h * z1
10100957b409SSimon J. Gerraty 	 */
10110957b409SSimon J. Gerraty 	uint64_t t1[4], t2[4], t3[4], t4[4], t5[4], t6[4], t7[4], tt;
10120957b409SSimon J. Gerraty 	uint32_t ret;
10130957b409SSimon J. Gerraty 
10140957b409SSimon J. Gerraty 	/*
10150957b409SSimon J. Gerraty 	 * Compute u1 = x1 (in t1) and s1 = y1 (in t3).
10160957b409SSimon J. Gerraty 	 */
10170957b409SSimon J. Gerraty 	memcpy(t1, P1->x, sizeof t1);
10180957b409SSimon J. Gerraty 	memcpy(t3, P1->y, sizeof t3);
10190957b409SSimon J. Gerraty 
10200957b409SSimon J. Gerraty 	/*
10210957b409SSimon J. Gerraty 	 * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
10220957b409SSimon J. Gerraty 	 */
10230957b409SSimon J. Gerraty 	f256_montysquare(t4, P1->z);
10240957b409SSimon J. Gerraty 	f256_montymul(t2, P2->x, t4);
10250957b409SSimon J. Gerraty 	f256_montymul(t5, P1->z, t4);
10260957b409SSimon J. Gerraty 	f256_montymul(t4, P2->y, t5);
10270957b409SSimon J. Gerraty 
10280957b409SSimon J. Gerraty 	/*
10290957b409SSimon J. Gerraty 	 * Compute h = h2 - u1 (in t2) and r = s2 - s1 (in t4).
10300957b409SSimon J. Gerraty 	 * We need to test whether r is zero, so we will do some extra
10310957b409SSimon J. Gerraty 	 * reduce.
10320957b409SSimon J. Gerraty 	 */
10330957b409SSimon J. Gerraty 	f256_sub(t2, t2, t1);
10340957b409SSimon J. Gerraty 	f256_sub(t4, t4, t3);
10350957b409SSimon J. Gerraty 	f256_final_reduce(t4);
10360957b409SSimon J. Gerraty 	tt = t4[0] | t4[1] | t4[2] | t4[3];
10370957b409SSimon J. Gerraty 	ret = (uint32_t)(tt | (tt >> 32));
10380957b409SSimon J. Gerraty 	ret = (ret | -ret) >> 31;
10390957b409SSimon J. Gerraty 
10400957b409SSimon J. Gerraty 	/*
10410957b409SSimon J. Gerraty 	 * Compute u1*h^2 (in t6) and h^3 (in t5);
10420957b409SSimon J. Gerraty 	 */
10430957b409SSimon J. Gerraty 	f256_montysquare(t7, t2);
10440957b409SSimon J. Gerraty 	f256_montymul(t6, t1, t7);
10450957b409SSimon J. Gerraty 	f256_montymul(t5, t7, t2);
10460957b409SSimon J. Gerraty 
10470957b409SSimon J. Gerraty 	/*
10480957b409SSimon J. Gerraty 	 * Compute x3 = r^2 - h^3 - 2*u1*h^2.
10490957b409SSimon J. Gerraty 	 */
10500957b409SSimon J. Gerraty 	f256_montysquare(P1->x, t4);
10510957b409SSimon J. Gerraty 	f256_sub(P1->x, P1->x, t5);
10520957b409SSimon J. Gerraty 	f256_sub(P1->x, P1->x, t6);
10530957b409SSimon J. Gerraty 	f256_sub(P1->x, P1->x, t6);
10540957b409SSimon J. Gerraty 
10550957b409SSimon J. Gerraty 	/*
10560957b409SSimon J. Gerraty 	 * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
10570957b409SSimon J. Gerraty 	 */
10580957b409SSimon J. Gerraty 	f256_sub(t6, t6, P1->x);
10590957b409SSimon J. Gerraty 	f256_montymul(P1->y, t4, t6);
10600957b409SSimon J. Gerraty 	f256_montymul(t1, t5, t3);
10610957b409SSimon J. Gerraty 	f256_sub(P1->y, P1->y, t1);
10620957b409SSimon J. Gerraty 
10630957b409SSimon J. Gerraty 	/*
10640957b409SSimon J. Gerraty 	 * Compute z3 = h*z1*z2.
10650957b409SSimon J. Gerraty 	 */
10660957b409SSimon J. Gerraty 	f256_montymul(P1->z, P1->z, t2);
10670957b409SSimon J. Gerraty 
10680957b409SSimon J. Gerraty 	return ret;
10690957b409SSimon J. Gerraty }
10700957b409SSimon J. Gerraty 
10710957b409SSimon J. Gerraty #if 0
10720957b409SSimon J. Gerraty /* unused */
10730957b409SSimon J. Gerraty /*
10740957b409SSimon J. Gerraty  * Point addition (mixed coordinates, complete): P1 is replaced with P1+P2.
10750957b409SSimon J. Gerraty  * This is a specialised function for the case when P2 is a non-zero point
10760957b409SSimon J. Gerraty  * in affine coordinates.
10770957b409SSimon J. Gerraty  *
10780957b409SSimon J. Gerraty  * This function returns the correct result in all cases.
10790957b409SSimon J. Gerraty  */
10800957b409SSimon J. Gerraty static uint32_t
10810957b409SSimon J. Gerraty p256_add_complete_mixed(p256_jacobian *P1, const p256_affine *P2)
10820957b409SSimon J. Gerraty {
10830957b409SSimon J. Gerraty 	/*
10840957b409SSimon J. Gerraty 	 * Addtions formulas, in the general case, are:
10850957b409SSimon J. Gerraty 	 *
10860957b409SSimon J. Gerraty 	 *   u1 = x1
10870957b409SSimon J. Gerraty 	 *   u2 = x2 * z1^2
10880957b409SSimon J. Gerraty 	 *   s1 = y1
10890957b409SSimon J. Gerraty 	 *   s2 = y2 * z1^3
10900957b409SSimon J. Gerraty 	 *   h = u2 - u1
10910957b409SSimon J. Gerraty 	 *   r = s2 - s1
10920957b409SSimon J. Gerraty 	 *   x3 = r^2 - h^3 - 2 * u1 * h^2
10930957b409SSimon J. Gerraty 	 *   y3 = r * (u1 * h^2 - x3) - s1 * h^3
10940957b409SSimon J. Gerraty 	 *   z3 = h * z1
10950957b409SSimon J. Gerraty 	 *
10960957b409SSimon J. Gerraty 	 * These formulas mishandle the two following cases:
10970957b409SSimon J. Gerraty 	 *
10980957b409SSimon J. Gerraty 	 *  - If P1 is the point-at-infinity (z1 = 0), then z3 is
10990957b409SSimon J. Gerraty 	 *    incorrectly set to 0.
11000957b409SSimon J. Gerraty 	 *
11010957b409SSimon J. Gerraty 	 *  - If P1 = P2, then u1 = u2 and s1 = s2, and x3, y3 and z3
11020957b409SSimon J. Gerraty 	 *    are all set to 0.
11030957b409SSimon J. Gerraty 	 *
11040957b409SSimon J. Gerraty 	 * However, if P1 + P2 = 0, then u1 = u2 but s1 != s2, and then
11050957b409SSimon J. Gerraty 	 * we correctly get z3 = 0 (the point-at-infinity).
11060957b409SSimon J. Gerraty 	 *
11070957b409SSimon J. Gerraty 	 * To fix the case P1 = 0, we perform at the end a copy of P2
11080957b409SSimon J. Gerraty 	 * over P1, conditional to z1 = 0.
11090957b409SSimon J. Gerraty 	 *
11100957b409SSimon J. Gerraty 	 * For P1 = P2: in that case, both h and r are set to 0, and
11110957b409SSimon J. Gerraty 	 * we get x3, y3 and z3 equal to 0. We can test for that
11120957b409SSimon J. Gerraty 	 * occurrence to make a mask which will be all-one if P1 = P2,
11130957b409SSimon J. Gerraty 	 * or all-zero otherwise; then we can compute the double of P2
11140957b409SSimon J. Gerraty 	 * and add it, combined with the mask, to (x3,y3,z3).
11150957b409SSimon J. Gerraty 	 *
11160957b409SSimon J. Gerraty 	 * Using the doubling formulas in p256_double() on (x2,y2),
11170957b409SSimon J. Gerraty 	 * simplifying since P2 is affine (i.e. z2 = 1, implicitly),
11180957b409SSimon J. Gerraty 	 * we get:
11190957b409SSimon J. Gerraty 	 *   s = 4*x2*y2^2
11200957b409SSimon J. Gerraty 	 *   m = 3*(x2 + 1)*(x2 - 1)
11210957b409SSimon J. Gerraty 	 *   x' = m^2 - 2*s
11220957b409SSimon J. Gerraty 	 *   y' = m*(s - x') - 8*y2^4
11230957b409SSimon J. Gerraty 	 *   z' = 2*y2
11240957b409SSimon J. Gerraty 	 * which requires only 6 multiplications. Added to the 11
11250957b409SSimon J. Gerraty 	 * multiplications of the normal mixed addition in Jacobian
11260957b409SSimon J. Gerraty 	 * coordinates, we get a cost of 17 multiplications in total.
11270957b409SSimon J. Gerraty 	 */
11280957b409SSimon J. Gerraty 	uint64_t t1[4], t2[4], t3[4], t4[4], t5[4], t6[4], t7[4], tt, zz;
11290957b409SSimon J. Gerraty 	int i;
11300957b409SSimon J. Gerraty 
11310957b409SSimon J. Gerraty 	/*
11320957b409SSimon J. Gerraty 	 * Set zz to -1 if P1 is the point at infinity, 0 otherwise.
11330957b409SSimon J. Gerraty 	 */
11340957b409SSimon J. Gerraty 	zz = P1->z[0] | P1->z[1] | P1->z[2] | P1->z[3];
11350957b409SSimon J. Gerraty 	zz = ((zz | -zz) >> 63) - (uint64_t)1;
11360957b409SSimon J. Gerraty 
11370957b409SSimon J. Gerraty 	/*
11380957b409SSimon J. Gerraty 	 * Compute u1 = x1 (in t1) and s1 = y1 (in t3).
11390957b409SSimon J. Gerraty 	 */
11400957b409SSimon J. Gerraty 	memcpy(t1, P1->x, sizeof t1);
11410957b409SSimon J. Gerraty 	memcpy(t3, P1->y, sizeof t3);
11420957b409SSimon J. Gerraty 
11430957b409SSimon J. Gerraty 	/*
11440957b409SSimon J. Gerraty 	 * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
11450957b409SSimon J. Gerraty 	 */
11460957b409SSimon J. Gerraty 	f256_montysquare(t4, P1->z);
11470957b409SSimon J. Gerraty 	f256_montymul(t2, P2->x, t4);
11480957b409SSimon J. Gerraty 	f256_montymul(t5, P1->z, t4);
11490957b409SSimon J. Gerraty 	f256_montymul(t4, P2->y, t5);
11500957b409SSimon J. Gerraty 
11510957b409SSimon J. Gerraty 	/*
11520957b409SSimon J. Gerraty 	 * Compute h = h2 - u1 (in t2) and r = s2 - s1 (in t4).
11530957b409SSimon J. Gerraty 	 * reduce.
11540957b409SSimon J. Gerraty 	 */
11550957b409SSimon J. Gerraty 	f256_sub(t2, t2, t1);
11560957b409SSimon J. Gerraty 	f256_sub(t4, t4, t3);
11570957b409SSimon J. Gerraty 
11580957b409SSimon J. Gerraty 	/*
11590957b409SSimon J. Gerraty 	 * If both h = 0 and r = 0, then P1 = P2, and we want to set
11600957b409SSimon J. Gerraty 	 * the mask tt to -1; otherwise, the mask will be 0.
11610957b409SSimon J. Gerraty 	 */
11620957b409SSimon J. Gerraty 	f256_final_reduce(t2);
11630957b409SSimon J. Gerraty 	f256_final_reduce(t4);
11640957b409SSimon J. Gerraty 	tt = t2[0] | t2[1] | t2[2] | t2[3] | t4[0] | t4[1] | t4[2] | t4[3];
11650957b409SSimon J. Gerraty 	tt = ((tt | -tt) >> 63) - (uint64_t)1;
11660957b409SSimon J. Gerraty 
11670957b409SSimon J. Gerraty 	/*
11680957b409SSimon J. Gerraty 	 * Compute u1*h^2 (in t6) and h^3 (in t5);
11690957b409SSimon J. Gerraty 	 */
11700957b409SSimon J. Gerraty 	f256_montysquare(t7, t2);
11710957b409SSimon J. Gerraty 	f256_montymul(t6, t1, t7);
11720957b409SSimon J. Gerraty 	f256_montymul(t5, t7, t2);
11730957b409SSimon J. Gerraty 
11740957b409SSimon J. Gerraty 	/*
11750957b409SSimon J. Gerraty 	 * Compute x3 = r^2 - h^3 - 2*u1*h^2.
11760957b409SSimon J. Gerraty 	 */
11770957b409SSimon J. Gerraty 	f256_montysquare(P1->x, t4);
11780957b409SSimon J. Gerraty 	f256_sub(P1->x, P1->x, t5);
11790957b409SSimon J. Gerraty 	f256_sub(P1->x, P1->x, t6);
11800957b409SSimon J. Gerraty 	f256_sub(P1->x, P1->x, t6);
11810957b409SSimon J. Gerraty 
11820957b409SSimon J. Gerraty 	/*
11830957b409SSimon J. Gerraty 	 * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
11840957b409SSimon J. Gerraty 	 */
11850957b409SSimon J. Gerraty 	f256_sub(t6, t6, P1->x);
11860957b409SSimon J. Gerraty 	f256_montymul(P1->y, t4, t6);
11870957b409SSimon J. Gerraty 	f256_montymul(t1, t5, t3);
11880957b409SSimon J. Gerraty 	f256_sub(P1->y, P1->y, t1);
11890957b409SSimon J. Gerraty 
11900957b409SSimon J. Gerraty 	/*
11910957b409SSimon J. Gerraty 	 * Compute z3 = h*z1.
11920957b409SSimon J. Gerraty 	 */
11930957b409SSimon J. Gerraty 	f256_montymul(P1->z, P1->z, t2);
11940957b409SSimon J. Gerraty 
11950957b409SSimon J. Gerraty 	/*
11960957b409SSimon J. Gerraty 	 * The "double" result, in case P1 = P2.
11970957b409SSimon J. Gerraty 	 */
11980957b409SSimon J. Gerraty 
11990957b409SSimon J. Gerraty 	/*
12000957b409SSimon J. Gerraty 	 * Compute z' = 2*y2 (in t1).
12010957b409SSimon J. Gerraty 	 */
12020957b409SSimon J. Gerraty 	f256_add(t1, P2->y, P2->y);
12030957b409SSimon J. Gerraty 
12040957b409SSimon J. Gerraty 	/*
12050957b409SSimon J. Gerraty 	 * Compute 2*(y2^2) (in t2) and s = 4*x2*(y2^2) (in t3).
12060957b409SSimon J. Gerraty 	 */
12070957b409SSimon J. Gerraty 	f256_montysquare(t2, P2->y);
12080957b409SSimon J. Gerraty 	f256_add(t2, t2, t2);
12090957b409SSimon J. Gerraty 	f256_add(t3, t2, t2);
12100957b409SSimon J. Gerraty 	f256_montymul(t3, P2->x, t3);
12110957b409SSimon J. Gerraty 
12120957b409SSimon J. Gerraty 	/*
12130957b409SSimon J. Gerraty 	 * Compute m = 3*(x2^2 - 1) (in t4).
12140957b409SSimon J. Gerraty 	 */
12150957b409SSimon J. Gerraty 	f256_montysquare(t4, P2->x);
12160957b409SSimon J. Gerraty 	f256_sub(t4, t4, F256_R);
12170957b409SSimon J. Gerraty 	f256_add(t5, t4, t4);
12180957b409SSimon J. Gerraty 	f256_add(t4, t4, t5);
12190957b409SSimon J. Gerraty 
12200957b409SSimon J. Gerraty 	/*
12210957b409SSimon J. Gerraty 	 * Compute x' = m^2 - 2*s (in t5).
12220957b409SSimon J. Gerraty 	 */
12230957b409SSimon J. Gerraty 	f256_montysquare(t5, t4);
12240957b409SSimon J. Gerraty 	f256_sub(t5, t3);
12250957b409SSimon J. Gerraty 	f256_sub(t5, t3);
12260957b409SSimon J. Gerraty 
12270957b409SSimon J. Gerraty 	/*
12280957b409SSimon J. Gerraty 	 * Compute y' = m*(s - x') - 8*y2^4 (in t6).
12290957b409SSimon J. Gerraty 	 */
12300957b409SSimon J. Gerraty 	f256_sub(t6, t3, t5);
12310957b409SSimon J. Gerraty 	f256_montymul(t6, t6, t4);
12320957b409SSimon J. Gerraty 	f256_montysquare(t7, t2);
12330957b409SSimon J. Gerraty 	f256_sub(t6, t6, t7);
12340957b409SSimon J. Gerraty 	f256_sub(t6, t6, t7);
12350957b409SSimon J. Gerraty 
12360957b409SSimon J. Gerraty 	/*
12370957b409SSimon J. Gerraty 	 * We now have the alternate (doubling) coordinates in (t5,t6,t1).
12380957b409SSimon J. Gerraty 	 * We combine them with (x3,y3,z3).
12390957b409SSimon J. Gerraty 	 */
12400957b409SSimon J. Gerraty 	for (i = 0; i < 4; i ++) {
12410957b409SSimon J. Gerraty 		P1->x[i] |= tt & t5[i];
12420957b409SSimon J. Gerraty 		P1->y[i] |= tt & t6[i];
12430957b409SSimon J. Gerraty 		P1->z[i] |= tt & t1[i];
12440957b409SSimon J. Gerraty 	}
12450957b409SSimon J. Gerraty 
12460957b409SSimon J. Gerraty 	/*
12470957b409SSimon J. Gerraty 	 * If P1 = 0, then we get z3 = 0 (which is invalid); if z1 is 0,
12480957b409SSimon J. Gerraty 	 * then we want to replace the result with a copy of P2. The
12490957b409SSimon J. Gerraty 	 * test on z1 was done at the start, in the zz mask.
12500957b409SSimon J. Gerraty 	 */
12510957b409SSimon J. Gerraty 	for (i = 0; i < 4; i ++) {
12520957b409SSimon J. Gerraty 		P1->x[i] ^= zz & (P1->x[i] ^ P2->x[i]);
12530957b409SSimon J. Gerraty 		P1->y[i] ^= zz & (P1->y[i] ^ P2->y[i]);
12540957b409SSimon J. Gerraty 		P1->z[i] ^= zz & (P1->z[i] ^ F256_R[i]);
12550957b409SSimon J. Gerraty 	}
12560957b409SSimon J. Gerraty }
12570957b409SSimon J. Gerraty #endif
12580957b409SSimon J. Gerraty 
12590957b409SSimon J. Gerraty /*
12600957b409SSimon J. Gerraty  * Inner function for computing a point multiplication. A window is
12610957b409SSimon J. Gerraty  * provided, with points 1*P to 15*P in affine coordinates.
12620957b409SSimon J. Gerraty  *
12630957b409SSimon J. Gerraty  * Assumptions:
12640957b409SSimon J. Gerraty  *  - All provided points are valid points on the curve.
12650957b409SSimon J. Gerraty  *  - Multiplier is non-zero, and smaller than the curve order.
12660957b409SSimon J. Gerraty  *  - Everything is in Montgomery representation.
12670957b409SSimon J. Gerraty  */
12680957b409SSimon J. Gerraty static void
point_mul_inner(p256_jacobian * R,const p256_affine * W,const unsigned char * k,size_t klen)12690957b409SSimon J. Gerraty point_mul_inner(p256_jacobian *R, const p256_affine *W,
12700957b409SSimon J. Gerraty 	const unsigned char *k, size_t klen)
12710957b409SSimon J. Gerraty {
12720957b409SSimon J. Gerraty 	p256_jacobian Q;
12730957b409SSimon J. Gerraty 	uint32_t qz;
12740957b409SSimon J. Gerraty 
12750957b409SSimon J. Gerraty 	memset(&Q, 0, sizeof Q);
12760957b409SSimon J. Gerraty 	qz = 1;
12770957b409SSimon J. Gerraty 	while (klen -- > 0) {
12780957b409SSimon J. Gerraty 		int i;
12790957b409SSimon J. Gerraty 		unsigned bk;
12800957b409SSimon J. Gerraty 
12810957b409SSimon J. Gerraty 		bk = *k ++;
12820957b409SSimon J. Gerraty 		for (i = 0; i < 2; i ++) {
12830957b409SSimon J. Gerraty 			uint32_t bits;
12840957b409SSimon J. Gerraty 			uint32_t bnz;
12850957b409SSimon J. Gerraty 			p256_affine T;
12860957b409SSimon J. Gerraty 			p256_jacobian U;
12870957b409SSimon J. Gerraty 			uint32_t n;
12880957b409SSimon J. Gerraty 			int j;
12890957b409SSimon J. Gerraty 			uint64_t m;
12900957b409SSimon J. Gerraty 
12910957b409SSimon J. Gerraty 			p256_double(&Q);
12920957b409SSimon J. Gerraty 			p256_double(&Q);
12930957b409SSimon J. Gerraty 			p256_double(&Q);
12940957b409SSimon J. Gerraty 			p256_double(&Q);
12950957b409SSimon J. Gerraty 			bits = (bk >> 4) & 0x0F;
12960957b409SSimon J. Gerraty 			bnz = NEQ(bits, 0);
12970957b409SSimon J. Gerraty 
12980957b409SSimon J. Gerraty 			/*
12990957b409SSimon J. Gerraty 			 * Lookup point in window. If the bits are 0,
13000957b409SSimon J. Gerraty 			 * we get something invalid, which is not a
13010957b409SSimon J. Gerraty 			 * problem because we will use it only if the
13020957b409SSimon J. Gerraty 			 * bits are non-zero.
13030957b409SSimon J. Gerraty 			 */
13040957b409SSimon J. Gerraty 			memset(&T, 0, sizeof T);
13050957b409SSimon J. Gerraty 			for (n = 0; n < 15; n ++) {
13060957b409SSimon J. Gerraty 				m = -(uint64_t)EQ(bits, n + 1);
13070957b409SSimon J. Gerraty 				T.x[0] |= m & W[n].x[0];
13080957b409SSimon J. Gerraty 				T.x[1] |= m & W[n].x[1];
13090957b409SSimon J. Gerraty 				T.x[2] |= m & W[n].x[2];
13100957b409SSimon J. Gerraty 				T.x[3] |= m & W[n].x[3];
13110957b409SSimon J. Gerraty 				T.y[0] |= m & W[n].y[0];
13120957b409SSimon J. Gerraty 				T.y[1] |= m & W[n].y[1];
13130957b409SSimon J. Gerraty 				T.y[2] |= m & W[n].y[2];
13140957b409SSimon J. Gerraty 				T.y[3] |= m & W[n].y[3];
13150957b409SSimon J. Gerraty 			}
13160957b409SSimon J. Gerraty 
13170957b409SSimon J. Gerraty 			U = Q;
13180957b409SSimon J. Gerraty 			p256_add_mixed(&U, &T);
13190957b409SSimon J. Gerraty 
13200957b409SSimon J. Gerraty 			/*
13210957b409SSimon J. Gerraty 			 * If qz is still 1, then Q was all-zeros, and this
13220957b409SSimon J. Gerraty 			 * is conserved through p256_double().
13230957b409SSimon J. Gerraty 			 */
13240957b409SSimon J. Gerraty 			m = -(uint64_t)(bnz & qz);
13250957b409SSimon J. Gerraty 			for (j = 0; j < 4; j ++) {
13260957b409SSimon J. Gerraty 				Q.x[j] |= m & T.x[j];
13270957b409SSimon J. Gerraty 				Q.y[j] |= m & T.y[j];
13280957b409SSimon J. Gerraty 				Q.z[j] |= m & F256_R[j];
13290957b409SSimon J. Gerraty 			}
13300957b409SSimon J. Gerraty 			CCOPY(bnz & ~qz, &Q, &U, sizeof Q);
13310957b409SSimon J. Gerraty 			qz &= ~bnz;
13320957b409SSimon J. Gerraty 			bk <<= 4;
13330957b409SSimon J. Gerraty 		}
13340957b409SSimon J. Gerraty 	}
13350957b409SSimon J. Gerraty 	*R = Q;
13360957b409SSimon J. Gerraty }
13370957b409SSimon J. Gerraty 
13380957b409SSimon J. Gerraty /*
13390957b409SSimon J. Gerraty  * Convert a window from Jacobian to affine coordinates. A single
13400957b409SSimon J. Gerraty  * field inversion is used. This function works for windows up to
13410957b409SSimon J. Gerraty  * 32 elements.
13420957b409SSimon J. Gerraty  *
13430957b409SSimon J. Gerraty  * The destination array (aff[]) and the source array (jac[]) may
13440957b409SSimon J. Gerraty  * overlap, provided that the start of aff[] is not after the start of
13450957b409SSimon J. Gerraty  * jac[]. Even if the arrays do _not_ overlap, the source array is
13460957b409SSimon J. Gerraty  * modified.
13470957b409SSimon J. Gerraty  */
13480957b409SSimon J. Gerraty static void
window_to_affine(p256_affine * aff,p256_jacobian * jac,int num)13490957b409SSimon J. Gerraty window_to_affine(p256_affine *aff, p256_jacobian *jac, int num)
13500957b409SSimon J. Gerraty {
13510957b409SSimon J. Gerraty 	/*
13520957b409SSimon J. Gerraty 	 * Convert the window points to affine coordinates. We use the
13530957b409SSimon J. Gerraty 	 * following trick to mutualize the inversion computation: if
13540957b409SSimon J. Gerraty 	 * we have z1, z2, z3, and z4, and want to inverse all of them,
13550957b409SSimon J. Gerraty 	 * we compute u = 1/(z1*z2*z3*z4), and then we have:
13560957b409SSimon J. Gerraty 	 *   1/z1 = u*z2*z3*z4
13570957b409SSimon J. Gerraty 	 *   1/z2 = u*z1*z3*z4
13580957b409SSimon J. Gerraty 	 *   1/z3 = u*z1*z2*z4
13590957b409SSimon J. Gerraty 	 *   1/z4 = u*z1*z2*z3
13600957b409SSimon J. Gerraty 	 *
13610957b409SSimon J. Gerraty 	 * The partial products are computed recursively:
13620957b409SSimon J. Gerraty 	 *
13630957b409SSimon J. Gerraty 	 *  - on input (z_1,z_2), return (z_2,z_1) and z_1*z_2
13640957b409SSimon J. Gerraty 	 *  - on input (z_1,z_2,... z_n):
13650957b409SSimon J. Gerraty 	 *       recurse on (z_1,z_2,... z_(n/2)) -> r1 and m1
13660957b409SSimon J. Gerraty 	 *       recurse on (z_(n/2+1),z_(n/2+2)... z_n) -> r2 and m2
13670957b409SSimon J. Gerraty 	 *       multiply elements of r1 by m2 -> s1
13680957b409SSimon J. Gerraty 	 *       multiply elements of r2 by m1 -> s2
13690957b409SSimon J. Gerraty 	 *       return r1||r2 and m1*m2
13700957b409SSimon J. Gerraty 	 *
13710957b409SSimon J. Gerraty 	 * In the example below, we suppose that we have 14 elements.
13720957b409SSimon J. Gerraty 	 * Let z1, z2,... zE be the 14 values to invert (index noted in
13730957b409SSimon J. Gerraty 	 * hexadecimal, starting at 1).
13740957b409SSimon J. Gerraty 	 *
13750957b409SSimon J. Gerraty 	 *  - Depth 1:
13760957b409SSimon J. Gerraty 	 *      swap(z1, z2); z12 = z1*z2
13770957b409SSimon J. Gerraty 	 *      swap(z3, z4); z34 = z3*z4
13780957b409SSimon J. Gerraty 	 *      swap(z5, z6); z56 = z5*z6
13790957b409SSimon J. Gerraty 	 *      swap(z7, z8); z78 = z7*z8
13800957b409SSimon J. Gerraty 	 *      swap(z9, zA); z9A = z9*zA
13810957b409SSimon J. Gerraty 	 *      swap(zB, zC); zBC = zB*zC
13820957b409SSimon J. Gerraty 	 *      swap(zD, zE); zDE = zD*zE
13830957b409SSimon J. Gerraty 	 *
13840957b409SSimon J. Gerraty 	 *  - Depth 2:
13850957b409SSimon J. Gerraty 	 *      z1 <- z1*z34, z2 <- z2*z34, z3 <- z3*z12, z4 <- z4*z12
13860957b409SSimon J. Gerraty 	 *      z1234 = z12*z34
13870957b409SSimon J. Gerraty 	 *      z5 <- z5*z78, z6 <- z6*z78, z7 <- z7*z56, z8 <- z8*z56
13880957b409SSimon J. Gerraty 	 *      z5678 = z56*z78
13890957b409SSimon J. Gerraty 	 *      z9 <- z9*zBC, zA <- zA*zBC, zB <- zB*z9A, zC <- zC*z9A
13900957b409SSimon J. Gerraty 	 *      z9ABC = z9A*zBC
13910957b409SSimon J. Gerraty 	 *
13920957b409SSimon J. Gerraty 	 *  - Depth 3:
13930957b409SSimon J. Gerraty 	 *      z1 <- z1*z5678, z2 <- z2*z5678, z3 <- z3*z5678, z4 <- z4*z5678
13940957b409SSimon J. Gerraty 	 *      z5 <- z5*z1234, z6 <- z6*z1234, z7 <- z7*z1234, z8 <- z8*z1234
13950957b409SSimon J. Gerraty 	 *      z12345678 = z1234*z5678
13960957b409SSimon J. Gerraty 	 *      z9 <- z9*zDE, zA <- zA*zDE, zB <- zB*zDE, zC <- zC*zDE
13970957b409SSimon J. Gerraty 	 *      zD <- zD*z9ABC, zE*z9ABC
13980957b409SSimon J. Gerraty 	 *      z9ABCDE = z9ABC*zDE
13990957b409SSimon J. Gerraty 	 *
14000957b409SSimon J. Gerraty 	 *  - Depth 4:
14010957b409SSimon J. Gerraty 	 *      multiply z1..z8 by z9ABCDE
14020957b409SSimon J. Gerraty 	 *      multiply z9..zE by z12345678
14030957b409SSimon J. Gerraty 	 *      final z = z12345678*z9ABCDE
14040957b409SSimon J. Gerraty 	 */
14050957b409SSimon J. Gerraty 
14060957b409SSimon J. Gerraty 	uint64_t z[16][4];
14070957b409SSimon J. Gerraty 	int i, k, s;
14080957b409SSimon J. Gerraty #define zt   (z[15])
14090957b409SSimon J. Gerraty #define zu   (z[14])
14100957b409SSimon J. Gerraty #define zv   (z[13])
14110957b409SSimon J. Gerraty 
14120957b409SSimon J. Gerraty 	/*
14130957b409SSimon J. Gerraty 	 * First recursion step (pairwise swapping and multiplication).
14140957b409SSimon J. Gerraty 	 * If there is an odd number of elements, then we "invent" an
14150957b409SSimon J. Gerraty 	 * extra one with coordinate Z = 1 (in Montgomery representation).
14160957b409SSimon J. Gerraty 	 */
14170957b409SSimon J. Gerraty 	for (i = 0; (i + 1) < num; i += 2) {
14180957b409SSimon J. Gerraty 		memcpy(zt, jac[i].z, sizeof zt);
14190957b409SSimon J. Gerraty 		memcpy(jac[i].z, jac[i + 1].z, sizeof zt);
14200957b409SSimon J. Gerraty 		memcpy(jac[i + 1].z, zt, sizeof zt);
14210957b409SSimon J. Gerraty 		f256_montymul(z[i >> 1], jac[i].z, jac[i + 1].z);
14220957b409SSimon J. Gerraty 	}
14230957b409SSimon J. Gerraty 	if ((num & 1) != 0) {
14240957b409SSimon J. Gerraty 		memcpy(z[num >> 1], jac[num - 1].z, sizeof zt);
14250957b409SSimon J. Gerraty 		memcpy(jac[num - 1].z, F256_R, sizeof F256_R);
14260957b409SSimon J. Gerraty 	}
14270957b409SSimon J. Gerraty 
14280957b409SSimon J. Gerraty 	/*
14290957b409SSimon J. Gerraty 	 * Perform further recursion steps. At the entry of each step,
14300957b409SSimon J. Gerraty 	 * the process has been done for groups of 's' points. The
14310957b409SSimon J. Gerraty 	 * integer k is the log2 of s.
14320957b409SSimon J. Gerraty 	 */
14330957b409SSimon J. Gerraty 	for (k = 1, s = 2; s < num; k ++, s <<= 1) {
14340957b409SSimon J. Gerraty 		int n;
14350957b409SSimon J. Gerraty 
14360957b409SSimon J. Gerraty 		for (i = 0; i < num; i ++) {
14370957b409SSimon J. Gerraty 			f256_montymul(jac[i].z, jac[i].z, z[(i >> k) ^ 1]);
14380957b409SSimon J. Gerraty 		}
14390957b409SSimon J. Gerraty 		n = (num + s - 1) >> k;
14400957b409SSimon J. Gerraty 		for (i = 0; i < (n >> 1); i ++) {
14410957b409SSimon J. Gerraty 			f256_montymul(z[i], z[i << 1], z[(i << 1) + 1]);
14420957b409SSimon J. Gerraty 		}
14430957b409SSimon J. Gerraty 		if ((n & 1) != 0) {
14440957b409SSimon J. Gerraty 			memmove(z[n >> 1], z[n], sizeof zt);
14450957b409SSimon J. Gerraty 		}
14460957b409SSimon J. Gerraty 	}
14470957b409SSimon J. Gerraty 
14480957b409SSimon J. Gerraty 	/*
14490957b409SSimon J. Gerraty 	 * Invert the final result, and convert all points.
14500957b409SSimon J. Gerraty 	 */
14510957b409SSimon J. Gerraty 	f256_invert(zt, z[0]);
14520957b409SSimon J. Gerraty 	for (i = 0; i < num; i ++) {
14530957b409SSimon J. Gerraty 		f256_montymul(zv, jac[i].z, zt);
14540957b409SSimon J. Gerraty 		f256_montysquare(zu, zv);
14550957b409SSimon J. Gerraty 		f256_montymul(zv, zv, zu);
14560957b409SSimon J. Gerraty 		f256_montymul(aff[i].x, jac[i].x, zu);
14570957b409SSimon J. Gerraty 		f256_montymul(aff[i].y, jac[i].y, zv);
14580957b409SSimon J. Gerraty 	}
14590957b409SSimon J. Gerraty }
14600957b409SSimon J. Gerraty 
14610957b409SSimon J. Gerraty /*
14620957b409SSimon J. Gerraty  * Multiply the provided point by an integer.
14630957b409SSimon J. Gerraty  * Assumptions:
14640957b409SSimon J. Gerraty  *  - Source point is a valid curve point.
14650957b409SSimon J. Gerraty  *  - Source point is not the point-at-infinity.
14660957b409SSimon J. Gerraty  *  - Integer is not 0, and is lower than the curve order.
14670957b409SSimon J. Gerraty  * If these conditions are not met, then the result is indeterminate
14680957b409SSimon J. Gerraty  * (but the process is still constant-time).
14690957b409SSimon J. Gerraty  */
14700957b409SSimon J. Gerraty static void
p256_mul(p256_jacobian * P,const unsigned char * k,size_t klen)14710957b409SSimon J. Gerraty p256_mul(p256_jacobian *P, const unsigned char *k, size_t klen)
14720957b409SSimon J. Gerraty {
14730957b409SSimon J. Gerraty 	union {
14740957b409SSimon J. Gerraty 		p256_affine aff[15];
14750957b409SSimon J. Gerraty 		p256_jacobian jac[15];
14760957b409SSimon J. Gerraty 	} window;
14770957b409SSimon J. Gerraty 	int i;
14780957b409SSimon J. Gerraty 
14790957b409SSimon J. Gerraty 	/*
14800957b409SSimon J. Gerraty 	 * Compute window, in Jacobian coordinates.
14810957b409SSimon J. Gerraty 	 */
14820957b409SSimon J. Gerraty 	window.jac[0] = *P;
14830957b409SSimon J. Gerraty 	for (i = 2; i < 16; i ++) {
14840957b409SSimon J. Gerraty 		window.jac[i - 1] = window.jac[(i >> 1) - 1];
14850957b409SSimon J. Gerraty 		if ((i & 1) == 0) {
14860957b409SSimon J. Gerraty 			p256_double(&window.jac[i - 1]);
14870957b409SSimon J. Gerraty 		} else {
14880957b409SSimon J. Gerraty 			p256_add(&window.jac[i - 1], &window.jac[i >> 1]);
14890957b409SSimon J. Gerraty 		}
14900957b409SSimon J. Gerraty 	}
14910957b409SSimon J. Gerraty 
14920957b409SSimon J. Gerraty 	/*
14930957b409SSimon J. Gerraty 	 * Convert the window points to affine coordinates. Point
14940957b409SSimon J. Gerraty 	 * window[0] is the source point, already in affine coordinates.
14950957b409SSimon J. Gerraty 	 */
14960957b409SSimon J. Gerraty 	window_to_affine(window.aff, window.jac, 15);
14970957b409SSimon J. Gerraty 
14980957b409SSimon J. Gerraty 	/*
14990957b409SSimon J. Gerraty 	 * Perform point multiplication.
15000957b409SSimon J. Gerraty 	 */
15010957b409SSimon J. Gerraty 	point_mul_inner(P, window.aff, k, klen);
15020957b409SSimon J. Gerraty }
15030957b409SSimon J. Gerraty 
15040957b409SSimon J. Gerraty /*
15050957b409SSimon J. Gerraty  * Precomputed window for the conventional generator: P256_Gwin[n]
15060957b409SSimon J. Gerraty  * contains (n+1)*G (affine coordinates, in Montgomery representation).
15070957b409SSimon J. Gerraty  */
15080957b409SSimon J. Gerraty static const p256_affine P256_Gwin[] = {
15090957b409SSimon J. Gerraty 	{
15100957b409SSimon J. Gerraty 		{ 0x79E730D418A9143C, 0x75BA95FC5FEDB601,
15110957b409SSimon J. Gerraty 		  0x79FB732B77622510, 0x18905F76A53755C6 },
15120957b409SSimon J. Gerraty 		{ 0xDDF25357CE95560A, 0x8B4AB8E4BA19E45C,
15130957b409SSimon J. Gerraty 		  0xD2E88688DD21F325, 0x8571FF1825885D85 }
15140957b409SSimon J. Gerraty 	},
15150957b409SSimon J. Gerraty 	{
15160957b409SSimon J. Gerraty 		{ 0x850046D410DDD64D, 0xAA6AE3C1A433827D,
15170957b409SSimon J. Gerraty 		  0x732205038D1490D9, 0xF6BB32E43DCF3A3B },
15180957b409SSimon J. Gerraty 		{ 0x2F3648D361BEE1A5, 0x152CD7CBEB236FF8,
15190957b409SSimon J. Gerraty 		  0x19A8FB0E92042DBE, 0x78C577510A5B8A3B }
15200957b409SSimon J. Gerraty 	},
15210957b409SSimon J. Gerraty 	{
15220957b409SSimon J. Gerraty 		{ 0xFFAC3F904EEBC127, 0xB027F84A087D81FB,
15230957b409SSimon J. Gerraty 		  0x66AD77DD87CBBC98, 0x26936A3FB6FF747E },
15240957b409SSimon J. Gerraty 		{ 0xB04C5C1FC983A7EB, 0x583E47AD0861FE1A,
15250957b409SSimon J. Gerraty 		  0x788208311A2EE98E, 0xD5F06A29E587CC07 }
15260957b409SSimon J. Gerraty 	},
15270957b409SSimon J. Gerraty 	{
15280957b409SSimon J. Gerraty 		{ 0x74B0B50D46918DCC, 0x4650A6EDC623C173,
15290957b409SSimon J. Gerraty 		  0x0CDAACACE8100AF2, 0x577362F541B0176B },
15300957b409SSimon J. Gerraty 		{ 0x2D96F24CE4CBABA6, 0x17628471FAD6F447,
15310957b409SSimon J. Gerraty 		  0x6B6C36DEE5DDD22E, 0x84B14C394C5AB863 }
15320957b409SSimon J. Gerraty 	},
15330957b409SSimon J. Gerraty 	{
15340957b409SSimon J. Gerraty 		{ 0xBE1B8AAEC45C61F5, 0x90EC649A94B9537D,
15350957b409SSimon J. Gerraty 		  0x941CB5AAD076C20C, 0xC9079605890523C8 },
15360957b409SSimon J. Gerraty 		{ 0xEB309B4AE7BA4F10, 0x73C568EFE5EB882B,
15370957b409SSimon J. Gerraty 		  0x3540A9877E7A1F68, 0x73A076BB2DD1E916 }
15380957b409SSimon J. Gerraty 	},
15390957b409SSimon J. Gerraty 	{
15400957b409SSimon J. Gerraty 		{ 0x403947373E77664A, 0x55AE744F346CEE3E,
15410957b409SSimon J. Gerraty 		  0xD50A961A5B17A3AD, 0x13074B5954213673 },
15420957b409SSimon J. Gerraty 		{ 0x93D36220D377E44B, 0x299C2B53ADFF14B5,
15430957b409SSimon J. Gerraty 		  0xF424D44CEF639F11, 0xA4C9916D4A07F75F }
15440957b409SSimon J. Gerraty 	},
15450957b409SSimon J. Gerraty 	{
15460957b409SSimon J. Gerraty 		{ 0x0746354EA0173B4F, 0x2BD20213D23C00F7,
15470957b409SSimon J. Gerraty 		  0xF43EAAB50C23BB08, 0x13BA5119C3123E03 },
15480957b409SSimon J. Gerraty 		{ 0x2847D0303F5B9D4D, 0x6742F2F25DA67BDD,
15490957b409SSimon J. Gerraty 		  0xEF933BDC77C94195, 0xEAEDD9156E240867 }
15500957b409SSimon J. Gerraty 	},
15510957b409SSimon J. Gerraty 	{
15520957b409SSimon J. Gerraty 		{ 0x27F14CD19499A78F, 0x462AB5C56F9B3455,
15530957b409SSimon J. Gerraty 		  0x8F90F02AF02CFC6B, 0xB763891EB265230D },
15540957b409SSimon J. Gerraty 		{ 0xF59DA3A9532D4977, 0x21E3327DCF9EBA15,
15550957b409SSimon J. Gerraty 		  0x123C7B84BE60BBF0, 0x56EC12F27706DF76 }
15560957b409SSimon J. Gerraty 	},
15570957b409SSimon J. Gerraty 	{
15580957b409SSimon J. Gerraty 		{ 0x75C96E8F264E20E8, 0xABE6BFED59A7A841,
15590957b409SSimon J. Gerraty 		  0x2CC09C0444C8EB00, 0xE05B3080F0C4E16B },
15600957b409SSimon J. Gerraty 		{ 0x1EB7777AA45F3314, 0x56AF7BEDCE5D45E3,
15610957b409SSimon J. Gerraty 		  0x2B6E019A88B12F1A, 0x086659CDFD835F9B }
15620957b409SSimon J. Gerraty 	},
15630957b409SSimon J. Gerraty 	{
15640957b409SSimon J. Gerraty 		{ 0x2C18DBD19DC21EC8, 0x98F9868A0FCF8139,
15650957b409SSimon J. Gerraty 		  0x737D2CD648250B49, 0xCC61C94724B3428F },
15660957b409SSimon J. Gerraty 		{ 0x0C2B407880DD9E76, 0xC43A8991383FBE08,
15670957b409SSimon J. Gerraty 		  0x5F7D2D65779BE5D2, 0x78719A54EB3B4AB5 }
15680957b409SSimon J. Gerraty 	},
15690957b409SSimon J. Gerraty 	{
15700957b409SSimon J. Gerraty 		{ 0xEA7D260A6245E404, 0x9DE407956E7FDFE0,
15710957b409SSimon J. Gerraty 		  0x1FF3A4158DAC1AB5, 0x3E7090F1649C9073 },
15720957b409SSimon J. Gerraty 		{ 0x1A7685612B944E88, 0x250F939EE57F61C8,
15730957b409SSimon J. Gerraty 		  0x0C0DAA891EAD643D, 0x68930023E125B88E }
15740957b409SSimon J. Gerraty 	},
15750957b409SSimon J. Gerraty 	{
15760957b409SSimon J. Gerraty 		{ 0x04B71AA7D2697768, 0xABDEDEF5CA345A33,
15770957b409SSimon J. Gerraty 		  0x2409D29DEE37385E, 0x4EE1DF77CB83E156 },
15780957b409SSimon J. Gerraty 		{ 0x0CAC12D91CBB5B43, 0x170ED2F6CA895637,
15790957b409SSimon J. Gerraty 		  0x28228CFA8ADE6D66, 0x7FF57C9553238ACA }
15800957b409SSimon J. Gerraty 	},
15810957b409SSimon J. Gerraty 	{
15820957b409SSimon J. Gerraty 		{ 0xCCC425634B2ED709, 0x0E356769856FD30D,
15830957b409SSimon J. Gerraty 		  0xBCBCD43F559E9811, 0x738477AC5395B759 },
15840957b409SSimon J. Gerraty 		{ 0x35752B90C00EE17F, 0x68748390742ED2E3,
15850957b409SSimon J. Gerraty 		  0x7CD06422BD1F5BC1, 0xFBC08769C9E7B797 }
15860957b409SSimon J. Gerraty 	},
15870957b409SSimon J. Gerraty 	{
15880957b409SSimon J. Gerraty 		{ 0xA242A35BB0CF664A, 0x126E48F77F9707E3,
15890957b409SSimon J. Gerraty 		  0x1717BF54C6832660, 0xFAAE7332FD12C72E },
15900957b409SSimon J. Gerraty 		{ 0x27B52DB7995D586B, 0xBE29569E832237C2,
15910957b409SSimon J. Gerraty 		  0xE8E4193E2A65E7DB, 0x152706DC2EAA1BBB }
15920957b409SSimon J. Gerraty 	},
15930957b409SSimon J. Gerraty 	{
15940957b409SSimon J. Gerraty 		{ 0x72BCD8B7BC60055B, 0x03CC23EE56E27E4B,
15950957b409SSimon J. Gerraty 		  0xEE337424E4819370, 0xE2AA0E430AD3DA09 },
15960957b409SSimon J. Gerraty 		{ 0x40B8524F6383C45D, 0xD766355442A41B25,
15970957b409SSimon J. Gerraty 		  0x64EFA6DE778A4797, 0x2042170A7079ADF4 }
15980957b409SSimon J. Gerraty 	}
15990957b409SSimon J. Gerraty };
16000957b409SSimon J. Gerraty 
16010957b409SSimon J. Gerraty /*
16020957b409SSimon J. Gerraty  * Multiply the conventional generator of the curve by the provided
16030957b409SSimon J. Gerraty  * integer. Return is written in *P.
16040957b409SSimon J. Gerraty  *
16050957b409SSimon J. Gerraty  * Assumptions:
16060957b409SSimon J. Gerraty  *  - Integer is not 0, and is lower than the curve order.
16070957b409SSimon J. Gerraty  * If this conditions is not met, then the result is indeterminate
16080957b409SSimon J. Gerraty  * (but the process is still constant-time).
16090957b409SSimon J. Gerraty  */
16100957b409SSimon J. Gerraty static void
p256_mulgen(p256_jacobian * P,const unsigned char * k,size_t klen)16110957b409SSimon J. Gerraty p256_mulgen(p256_jacobian *P, const unsigned char *k, size_t klen)
16120957b409SSimon J. Gerraty {
16130957b409SSimon J. Gerraty 	point_mul_inner(P, P256_Gwin, k, klen);
16140957b409SSimon J. Gerraty }
16150957b409SSimon J. Gerraty 
16160957b409SSimon J. Gerraty /*
16170957b409SSimon J. Gerraty  * Return 1 if all of the following hold:
16180957b409SSimon J. Gerraty  *  - klen <= 32
16190957b409SSimon J. Gerraty  *  - k != 0
16200957b409SSimon J. Gerraty  *  - k is lower than the curve order
16210957b409SSimon J. Gerraty  * Otherwise, return 0.
16220957b409SSimon J. Gerraty  *
16230957b409SSimon J. Gerraty  * Constant-time behaviour: only klen may be observable.
16240957b409SSimon J. Gerraty  */
16250957b409SSimon J. Gerraty static uint32_t
check_scalar(const unsigned char * k,size_t klen)16260957b409SSimon J. Gerraty check_scalar(const unsigned char *k, size_t klen)
16270957b409SSimon J. Gerraty {
16280957b409SSimon J. Gerraty 	uint32_t z;
16290957b409SSimon J. Gerraty 	int32_t c;
16300957b409SSimon J. Gerraty 	size_t u;
16310957b409SSimon J. Gerraty 
16320957b409SSimon J. Gerraty 	if (klen > 32) {
16330957b409SSimon J. Gerraty 		return 0;
16340957b409SSimon J. Gerraty 	}
16350957b409SSimon J. Gerraty 	z = 0;
16360957b409SSimon J. Gerraty 	for (u = 0; u < klen; u ++) {
16370957b409SSimon J. Gerraty 		z |= k[u];
16380957b409SSimon J. Gerraty 	}
16390957b409SSimon J. Gerraty 	if (klen == 32) {
16400957b409SSimon J. Gerraty 		c = 0;
16410957b409SSimon J. Gerraty 		for (u = 0; u < klen; u ++) {
16420957b409SSimon J. Gerraty 			c |= -(int32_t)EQ0(c) & CMP(k[u], P256_N[u]);
16430957b409SSimon J. Gerraty 		}
16440957b409SSimon J. Gerraty 	} else {
16450957b409SSimon J. Gerraty 		c = -1;
16460957b409SSimon J. Gerraty 	}
16470957b409SSimon J. Gerraty 	return NEQ(z, 0) & LT0(c);
16480957b409SSimon J. Gerraty }
16490957b409SSimon J. Gerraty 
16500957b409SSimon J. Gerraty static uint32_t
api_mul(unsigned char * G,size_t Glen,const unsigned char * k,size_t klen,int curve)16510957b409SSimon J. Gerraty api_mul(unsigned char *G, size_t Glen,
16520957b409SSimon J. Gerraty 	const unsigned char *k, size_t klen, int curve)
16530957b409SSimon J. Gerraty {
16540957b409SSimon J. Gerraty 	uint32_t r;
16550957b409SSimon J. Gerraty 	p256_jacobian P;
16560957b409SSimon J. Gerraty 
16570957b409SSimon J. Gerraty 	(void)curve;
16580957b409SSimon J. Gerraty 	if (Glen != 65) {
16590957b409SSimon J. Gerraty 		return 0;
16600957b409SSimon J. Gerraty 	}
16610957b409SSimon J. Gerraty 	r = check_scalar(k, klen);
16620957b409SSimon J. Gerraty 	r &= point_decode(&P, G);
16630957b409SSimon J. Gerraty 	p256_mul(&P, k, klen);
16640957b409SSimon J. Gerraty 	r &= point_encode(G, &P);
16650957b409SSimon J. Gerraty 	return r;
16660957b409SSimon J. Gerraty }
16670957b409SSimon J. Gerraty 
16680957b409SSimon J. Gerraty static size_t
api_mulgen(unsigned char * R,const unsigned char * k,size_t klen,int curve)16690957b409SSimon J. Gerraty api_mulgen(unsigned char *R,
16700957b409SSimon J. Gerraty 	const unsigned char *k, size_t klen, int curve)
16710957b409SSimon J. Gerraty {
16720957b409SSimon J. Gerraty 	p256_jacobian P;
16730957b409SSimon J. Gerraty 
16740957b409SSimon J. Gerraty 	(void)curve;
16750957b409SSimon J. Gerraty 	p256_mulgen(&P, k, klen);
16760957b409SSimon J. Gerraty 	point_encode(R, &P);
16770957b409SSimon J. Gerraty 	return 65;
16780957b409SSimon J. Gerraty }
16790957b409SSimon J. Gerraty 
16800957b409SSimon J. Gerraty static uint32_t
api_muladd(unsigned char * A,const unsigned char * B,size_t len,const unsigned char * x,size_t xlen,const unsigned char * y,size_t ylen,int curve)16810957b409SSimon J. Gerraty api_muladd(unsigned char *A, const unsigned char *B, size_t len,
16820957b409SSimon J. Gerraty 	const unsigned char *x, size_t xlen,
16830957b409SSimon J. Gerraty 	const unsigned char *y, size_t ylen, int curve)
16840957b409SSimon J. Gerraty {
16850957b409SSimon J. Gerraty 	/*
16860957b409SSimon J. Gerraty 	 * We might want to use Shamir's trick here: make a composite
16870957b409SSimon J. Gerraty 	 * window of u*P+v*Q points, to merge the two doubling-ladders
16880957b409SSimon J. Gerraty 	 * into one. This, however, has some complications:
16890957b409SSimon J. Gerraty 	 *
16900957b409SSimon J. Gerraty 	 *  - During the computation, we may hit the point-at-infinity.
16910957b409SSimon J. Gerraty 	 *    Thus, we would need p256_add_complete_mixed() (complete
16920957b409SSimon J. Gerraty 	 *    formulas for point addition), with a higher cost (17 muls
16930957b409SSimon J. Gerraty 	 *    instead of 11).
16940957b409SSimon J. Gerraty 	 *
16950957b409SSimon J. Gerraty 	 *  - A 4-bit window would be too large, since it would involve
16960957b409SSimon J. Gerraty 	 *    16*16-1 = 255 points. For the same window size as in the
16970957b409SSimon J. Gerraty 	 *    p256_mul() case, we would need to reduce the window size
16980957b409SSimon J. Gerraty 	 *    to 2 bits, and thus perform twice as many non-doubling
16990957b409SSimon J. Gerraty 	 *    point additions.
17000957b409SSimon J. Gerraty 	 *
17010957b409SSimon J. Gerraty 	 *  - The window may itself contain the point-at-infinity, and
17020957b409SSimon J. Gerraty 	 *    thus cannot be in all generality be made of affine points.
17030957b409SSimon J. Gerraty 	 *    Instead, we would need to make it a window of points in
17040957b409SSimon J. Gerraty 	 *    Jacobian coordinates. Even p256_add_complete_mixed() would
17050957b409SSimon J. Gerraty 	 *    be inappropriate.
17060957b409SSimon J. Gerraty 	 *
17070957b409SSimon J. Gerraty 	 * For these reasons, the code below performs two separate
17080957b409SSimon J. Gerraty 	 * point multiplications, then computes the final point addition
17090957b409SSimon J. Gerraty 	 * (which is both a "normal" addition, and a doubling, to handle
17100957b409SSimon J. Gerraty 	 * all cases).
17110957b409SSimon J. Gerraty 	 */
17120957b409SSimon J. Gerraty 
17130957b409SSimon J. Gerraty 	p256_jacobian P, Q;
17140957b409SSimon J. Gerraty 	uint32_t r, t, s;
17150957b409SSimon J. Gerraty 	uint64_t z;
17160957b409SSimon J. Gerraty 
17170957b409SSimon J. Gerraty 	(void)curve;
17180957b409SSimon J. Gerraty 	if (len != 65) {
17190957b409SSimon J. Gerraty 		return 0;
17200957b409SSimon J. Gerraty 	}
17210957b409SSimon J. Gerraty 	r = point_decode(&P, A);
17220957b409SSimon J. Gerraty 	p256_mul(&P, x, xlen);
17230957b409SSimon J. Gerraty 	if (B == NULL) {
17240957b409SSimon J. Gerraty 		p256_mulgen(&Q, y, ylen);
17250957b409SSimon J. Gerraty 	} else {
17260957b409SSimon J. Gerraty 		r &= point_decode(&Q, B);
17270957b409SSimon J. Gerraty 		p256_mul(&Q, y, ylen);
17280957b409SSimon J. Gerraty 	}
17290957b409SSimon J. Gerraty 
17300957b409SSimon J. Gerraty 	/*
17310957b409SSimon J. Gerraty 	 * The final addition may fail in case both points are equal.
17320957b409SSimon J. Gerraty 	 */
17330957b409SSimon J. Gerraty 	t = p256_add(&P, &Q);
17340957b409SSimon J. Gerraty 	f256_final_reduce(P.z);
17350957b409SSimon J. Gerraty 	z = P.z[0] | P.z[1] | P.z[2] | P.z[3];
17360957b409SSimon J. Gerraty 	s = EQ((uint32_t)(z | (z >> 32)), 0);
17370957b409SSimon J. Gerraty 	p256_double(&Q);
17380957b409SSimon J. Gerraty 
17390957b409SSimon J. Gerraty 	/*
17400957b409SSimon J. Gerraty 	 * If s is 1 then either P+Q = 0 (t = 1) or P = Q (t = 0). So we
17410957b409SSimon J. Gerraty 	 * have the following:
17420957b409SSimon J. Gerraty 	 *
17430957b409SSimon J. Gerraty 	 *   s = 0, t = 0   return P (normal addition)
17440957b409SSimon J. Gerraty 	 *   s = 0, t = 1   return P (normal addition)
17450957b409SSimon J. Gerraty 	 *   s = 1, t = 0   return Q (a 'double' case)
17460957b409SSimon J. Gerraty 	 *   s = 1, t = 1   report an error (P+Q = 0)
17470957b409SSimon J. Gerraty 	 */
17480957b409SSimon J. Gerraty 	CCOPY(s & ~t, &P, &Q, sizeof Q);
17490957b409SSimon J. Gerraty 	point_encode(A, &P);
17500957b409SSimon J. Gerraty 	r &= ~(s & t);
17510957b409SSimon J. Gerraty 	return r;
17520957b409SSimon J. Gerraty }
17530957b409SSimon J. Gerraty 
17540957b409SSimon J. Gerraty /* see bearssl_ec.h */
17550957b409SSimon J. Gerraty const br_ec_impl br_ec_p256_m64 = {
17560957b409SSimon J. Gerraty 	(uint32_t)0x00800000,
17570957b409SSimon J. Gerraty 	&api_generator,
17580957b409SSimon J. Gerraty 	&api_order,
17590957b409SSimon J. Gerraty 	&api_xoff,
17600957b409SSimon J. Gerraty 	&api_mul,
17610957b409SSimon J. Gerraty 	&api_mulgen,
17620957b409SSimon J. Gerraty 	&api_muladd
17630957b409SSimon J. Gerraty };
17640957b409SSimon J. Gerraty 
17650957b409SSimon J. Gerraty /* see bearssl_ec.h */
17660957b409SSimon J. Gerraty const br_ec_impl *
br_ec_p256_m64_get(void)17670957b409SSimon J. Gerraty br_ec_p256_m64_get(void)
17680957b409SSimon J. Gerraty {
17690957b409SSimon J. Gerraty 	return &br_ec_p256_m64;
17700957b409SSimon J. Gerraty }
17710957b409SSimon J. Gerraty 
17720957b409SSimon J. Gerraty #else
17730957b409SSimon J. Gerraty 
17740957b409SSimon J. Gerraty /* see bearssl_ec.h */
17750957b409SSimon J. Gerraty const br_ec_impl *
br_ec_p256_m64_get(void)17760957b409SSimon J. Gerraty br_ec_p256_m64_get(void)
17770957b409SSimon J. Gerraty {
17780957b409SSimon J. Gerraty 	return 0;
17790957b409SSimon J. Gerraty }
17800957b409SSimon J. Gerraty 
17810957b409SSimon J. Gerraty #endif
1782