xref: /freebsd/contrib/bearssl/src/ec/ec_p256_m31.c (revision cc9e6590773dba57440750c124173ed531349a06)
10957b409SSimon J. Gerraty /*
20957b409SSimon J. Gerraty  * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
30957b409SSimon J. Gerraty  *
40957b409SSimon J. Gerraty  * Permission is hereby granted, free of charge, to any person obtaining
50957b409SSimon J. Gerraty  * a copy of this software and associated documentation files (the
60957b409SSimon J. Gerraty  * "Software"), to deal in the Software without restriction, including
70957b409SSimon J. Gerraty  * without limitation the rights to use, copy, modify, merge, publish,
80957b409SSimon J. Gerraty  * distribute, sublicense, and/or sell copies of the Software, and to
90957b409SSimon J. Gerraty  * permit persons to whom the Software is furnished to do so, subject to
100957b409SSimon J. Gerraty  * the following conditions:
110957b409SSimon J. Gerraty  *
120957b409SSimon J. Gerraty  * The above copyright notice and this permission notice shall be
130957b409SSimon J. Gerraty  * included in all copies or substantial portions of the Software.
140957b409SSimon J. Gerraty  *
150957b409SSimon J. Gerraty  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
160957b409SSimon J. Gerraty  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
170957b409SSimon J. Gerraty  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
180957b409SSimon J. Gerraty  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
190957b409SSimon J. Gerraty  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
200957b409SSimon J. Gerraty  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
210957b409SSimon J. Gerraty  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
220957b409SSimon J. Gerraty  * SOFTWARE.
230957b409SSimon J. Gerraty  */
240957b409SSimon J. Gerraty 
250957b409SSimon J. Gerraty #include "inner.h"
260957b409SSimon J. Gerraty 
270957b409SSimon J. Gerraty /*
280957b409SSimon J. Gerraty  * If BR_NO_ARITH_SHIFT is undefined, or defined to 0, then we _assume_
290957b409SSimon J. Gerraty  * that right-shifting a signed negative integer copies the sign bit
300957b409SSimon J. Gerraty  * (arithmetic right-shift). This is "implementation-defined behaviour",
310957b409SSimon J. Gerraty  * i.e. it is not undefined, but it may differ between compilers. Each
320957b409SSimon J. Gerraty  * compiler is supposed to document its behaviour in that respect. GCC
330957b409SSimon J. Gerraty  * explicitly defines that an arithmetic right shift is used. We expect
340957b409SSimon J. Gerraty  * all other compilers to do the same, because underlying CPU offer an
350957b409SSimon J. Gerraty  * arithmetic right shift opcode that could not be used otherwise.
360957b409SSimon J. Gerraty  */
370957b409SSimon J. Gerraty #if BR_NO_ARITH_SHIFT
380957b409SSimon J. Gerraty #define ARSH(x, n)    (((uint32_t)(x) >> (n)) \
390957b409SSimon J. Gerraty                       | ((-((uint32_t)(x) >> 31)) << (32 - (n))))
400957b409SSimon J. Gerraty #define ARSHW(x, n)   (((uint64_t)(x) >> (n)) \
410957b409SSimon J. Gerraty                       | ((-((uint64_t)(x) >> 63)) << (64 - (n))))
420957b409SSimon J. Gerraty #else
430957b409SSimon J. Gerraty #define ARSH(x, n)    ((*(int32_t *)&(x)) >> (n))
440957b409SSimon J. Gerraty #define ARSHW(x, n)   ((*(int64_t *)&(x)) >> (n))
450957b409SSimon J. Gerraty #endif
460957b409SSimon J. Gerraty 
470957b409SSimon J. Gerraty /*
480957b409SSimon J. Gerraty  * Convert an integer from unsigned big-endian encoding to a sequence of
490957b409SSimon J. Gerraty  * 30-bit words in little-endian order. The final "partial" word is
500957b409SSimon J. Gerraty  * returned.
510957b409SSimon J. Gerraty  */
520957b409SSimon J. Gerraty static uint32_t
be8_to_le30(uint32_t * dst,const unsigned char * src,size_t len)530957b409SSimon J. Gerraty be8_to_le30(uint32_t *dst, const unsigned char *src, size_t len)
540957b409SSimon J. Gerraty {
550957b409SSimon J. Gerraty 	uint32_t acc;
560957b409SSimon J. Gerraty 	int acc_len;
570957b409SSimon J. Gerraty 
580957b409SSimon J. Gerraty 	acc = 0;
590957b409SSimon J. Gerraty 	acc_len = 0;
600957b409SSimon J. Gerraty 	while (len -- > 0) {
610957b409SSimon J. Gerraty 		uint32_t b;
620957b409SSimon J. Gerraty 
630957b409SSimon J. Gerraty 		b = src[len];
640957b409SSimon J. Gerraty 		if (acc_len < 22) {
650957b409SSimon J. Gerraty 			acc |= b << acc_len;
660957b409SSimon J. Gerraty 			acc_len += 8;
670957b409SSimon J. Gerraty 		} else {
680957b409SSimon J. Gerraty 			*dst ++ = (acc | (b << acc_len)) & 0x3FFFFFFF;
690957b409SSimon J. Gerraty 			acc = b >> (30 - acc_len);
700957b409SSimon J. Gerraty 			acc_len -= 22;
710957b409SSimon J. Gerraty 		}
720957b409SSimon J. Gerraty 	}
730957b409SSimon J. Gerraty 	return acc;
740957b409SSimon J. Gerraty }
750957b409SSimon J. Gerraty 
760957b409SSimon J. Gerraty /*
770957b409SSimon J. Gerraty  * Convert an integer (30-bit words, little-endian) to unsigned
780957b409SSimon J. Gerraty  * big-endian encoding. The total encoding length is provided; all
790957b409SSimon J. Gerraty  * the destination bytes will be filled.
800957b409SSimon J. Gerraty  */
810957b409SSimon J. Gerraty static void
le30_to_be8(unsigned char * dst,size_t len,const uint32_t * src)820957b409SSimon J. Gerraty le30_to_be8(unsigned char *dst, size_t len, const uint32_t *src)
830957b409SSimon J. Gerraty {
840957b409SSimon J. Gerraty 	uint32_t acc;
850957b409SSimon J. Gerraty 	int acc_len;
860957b409SSimon J. Gerraty 
870957b409SSimon J. Gerraty 	acc = 0;
880957b409SSimon J. Gerraty 	acc_len = 0;
890957b409SSimon J. Gerraty 	while (len -- > 0) {
900957b409SSimon J. Gerraty 		if (acc_len < 8) {
910957b409SSimon J. Gerraty 			uint32_t w;
920957b409SSimon J. Gerraty 
930957b409SSimon J. Gerraty 			w = *src ++;
940957b409SSimon J. Gerraty 			dst[len] = (unsigned char)(acc | (w << acc_len));
950957b409SSimon J. Gerraty 			acc = w >> (8 - acc_len);
960957b409SSimon J. Gerraty 			acc_len += 22;
970957b409SSimon J. Gerraty 		} else {
980957b409SSimon J. Gerraty 			dst[len] = (unsigned char)acc;
990957b409SSimon J. Gerraty 			acc >>= 8;
1000957b409SSimon J. Gerraty 			acc_len -= 8;
1010957b409SSimon J. Gerraty 		}
1020957b409SSimon J. Gerraty 	}
1030957b409SSimon J. Gerraty }
1040957b409SSimon J. Gerraty 
1050957b409SSimon J. Gerraty /*
1060957b409SSimon J. Gerraty  * Multiply two integers. Source integers are represented as arrays of
1070957b409SSimon J. Gerraty  * nine 30-bit words, for values up to 2^270-1. Result is encoded over
1080957b409SSimon J. Gerraty  * 18 words of 30 bits each.
1090957b409SSimon J. Gerraty  */
1100957b409SSimon J. Gerraty static void
mul9(uint32_t * d,const uint32_t * a,const uint32_t * b)1110957b409SSimon J. Gerraty mul9(uint32_t *d, const uint32_t *a, const uint32_t *b)
1120957b409SSimon J. Gerraty {
1130957b409SSimon J. Gerraty 	/*
1140957b409SSimon J. Gerraty 	 * Maximum intermediate result is no more than
1150957b409SSimon J. Gerraty 	 * 10376293531797946367, which fits in 64 bits. Reason:
1160957b409SSimon J. Gerraty 	 *
1170957b409SSimon J. Gerraty 	 *   10376293531797946367 = 9 * (2^30-1)^2 + 9663676406
1180957b409SSimon J. Gerraty 	 *   10376293531797946367 < 9663676407 * 2^30
1190957b409SSimon J. Gerraty 	 *
1200957b409SSimon J. Gerraty 	 * Thus, adding together 9 products of 30-bit integers, with
1210957b409SSimon J. Gerraty 	 * a carry of at most 9663676406, yields an integer that fits
1220957b409SSimon J. Gerraty 	 * on 64 bits and generates a carry of at most 9663676406.
1230957b409SSimon J. Gerraty 	 */
1240957b409SSimon J. Gerraty 	uint64_t t[17];
1250957b409SSimon J. Gerraty 	uint64_t cc;
1260957b409SSimon J. Gerraty 	int i;
1270957b409SSimon J. Gerraty 
1280957b409SSimon J. Gerraty 	t[ 0] = MUL31(a[0], b[0]);
1290957b409SSimon J. Gerraty 	t[ 1] = MUL31(a[0], b[1])
1300957b409SSimon J. Gerraty 		+ MUL31(a[1], b[0]);
1310957b409SSimon J. Gerraty 	t[ 2] = MUL31(a[0], b[2])
1320957b409SSimon J. Gerraty 		+ MUL31(a[1], b[1])
1330957b409SSimon J. Gerraty 		+ MUL31(a[2], b[0]);
1340957b409SSimon J. Gerraty 	t[ 3] = MUL31(a[0], b[3])
1350957b409SSimon J. Gerraty 		+ MUL31(a[1], b[2])
1360957b409SSimon J. Gerraty 		+ MUL31(a[2], b[1])
1370957b409SSimon J. Gerraty 		+ MUL31(a[3], b[0]);
1380957b409SSimon J. Gerraty 	t[ 4] = MUL31(a[0], b[4])
1390957b409SSimon J. Gerraty 		+ MUL31(a[1], b[3])
1400957b409SSimon J. Gerraty 		+ MUL31(a[2], b[2])
1410957b409SSimon J. Gerraty 		+ MUL31(a[3], b[1])
1420957b409SSimon J. Gerraty 		+ MUL31(a[4], b[0]);
1430957b409SSimon J. Gerraty 	t[ 5] = MUL31(a[0], b[5])
1440957b409SSimon J. Gerraty 		+ MUL31(a[1], b[4])
1450957b409SSimon J. Gerraty 		+ MUL31(a[2], b[3])
1460957b409SSimon J. Gerraty 		+ MUL31(a[3], b[2])
1470957b409SSimon J. Gerraty 		+ MUL31(a[4], b[1])
1480957b409SSimon J. Gerraty 		+ MUL31(a[5], b[0]);
1490957b409SSimon J. Gerraty 	t[ 6] = MUL31(a[0], b[6])
1500957b409SSimon J. Gerraty 		+ MUL31(a[1], b[5])
1510957b409SSimon J. Gerraty 		+ MUL31(a[2], b[4])
1520957b409SSimon J. Gerraty 		+ MUL31(a[3], b[3])
1530957b409SSimon J. Gerraty 		+ MUL31(a[4], b[2])
1540957b409SSimon J. Gerraty 		+ MUL31(a[5], b[1])
1550957b409SSimon J. Gerraty 		+ MUL31(a[6], b[0]);
1560957b409SSimon J. Gerraty 	t[ 7] = MUL31(a[0], b[7])
1570957b409SSimon J. Gerraty 		+ MUL31(a[1], b[6])
1580957b409SSimon J. Gerraty 		+ MUL31(a[2], b[5])
1590957b409SSimon J. Gerraty 		+ MUL31(a[3], b[4])
1600957b409SSimon J. Gerraty 		+ MUL31(a[4], b[3])
1610957b409SSimon J. Gerraty 		+ MUL31(a[5], b[2])
1620957b409SSimon J. Gerraty 		+ MUL31(a[6], b[1])
1630957b409SSimon J. Gerraty 		+ MUL31(a[7], b[0]);
1640957b409SSimon J. Gerraty 	t[ 8] = MUL31(a[0], b[8])
1650957b409SSimon J. Gerraty 		+ MUL31(a[1], b[7])
1660957b409SSimon J. Gerraty 		+ MUL31(a[2], b[6])
1670957b409SSimon J. Gerraty 		+ MUL31(a[3], b[5])
1680957b409SSimon J. Gerraty 		+ MUL31(a[4], b[4])
1690957b409SSimon J. Gerraty 		+ MUL31(a[5], b[3])
1700957b409SSimon J. Gerraty 		+ MUL31(a[6], b[2])
1710957b409SSimon J. Gerraty 		+ MUL31(a[7], b[1])
1720957b409SSimon J. Gerraty 		+ MUL31(a[8], b[0]);
1730957b409SSimon J. Gerraty 	t[ 9] = MUL31(a[1], b[8])
1740957b409SSimon J. Gerraty 		+ MUL31(a[2], b[7])
1750957b409SSimon J. Gerraty 		+ MUL31(a[3], b[6])
1760957b409SSimon J. Gerraty 		+ MUL31(a[4], b[5])
1770957b409SSimon J. Gerraty 		+ MUL31(a[5], b[4])
1780957b409SSimon J. Gerraty 		+ MUL31(a[6], b[3])
1790957b409SSimon J. Gerraty 		+ MUL31(a[7], b[2])
1800957b409SSimon J. Gerraty 		+ MUL31(a[8], b[1]);
1810957b409SSimon J. Gerraty 	t[10] = MUL31(a[2], b[8])
1820957b409SSimon J. Gerraty 		+ MUL31(a[3], b[7])
1830957b409SSimon J. Gerraty 		+ MUL31(a[4], b[6])
1840957b409SSimon J. Gerraty 		+ MUL31(a[5], b[5])
1850957b409SSimon J. Gerraty 		+ MUL31(a[6], b[4])
1860957b409SSimon J. Gerraty 		+ MUL31(a[7], b[3])
1870957b409SSimon J. Gerraty 		+ MUL31(a[8], b[2]);
1880957b409SSimon J. Gerraty 	t[11] = MUL31(a[3], b[8])
1890957b409SSimon J. Gerraty 		+ MUL31(a[4], b[7])
1900957b409SSimon J. Gerraty 		+ MUL31(a[5], b[6])
1910957b409SSimon J. Gerraty 		+ MUL31(a[6], b[5])
1920957b409SSimon J. Gerraty 		+ MUL31(a[7], b[4])
1930957b409SSimon J. Gerraty 		+ MUL31(a[8], b[3]);
1940957b409SSimon J. Gerraty 	t[12] = MUL31(a[4], b[8])
1950957b409SSimon J. Gerraty 		+ MUL31(a[5], b[7])
1960957b409SSimon J. Gerraty 		+ MUL31(a[6], b[6])
1970957b409SSimon J. Gerraty 		+ MUL31(a[7], b[5])
1980957b409SSimon J. Gerraty 		+ MUL31(a[8], b[4]);
1990957b409SSimon J. Gerraty 	t[13] = MUL31(a[5], b[8])
2000957b409SSimon J. Gerraty 		+ MUL31(a[6], b[7])
2010957b409SSimon J. Gerraty 		+ MUL31(a[7], b[6])
2020957b409SSimon J. Gerraty 		+ MUL31(a[8], b[5]);
2030957b409SSimon J. Gerraty 	t[14] = MUL31(a[6], b[8])
2040957b409SSimon J. Gerraty 		+ MUL31(a[7], b[7])
2050957b409SSimon J. Gerraty 		+ MUL31(a[8], b[6]);
2060957b409SSimon J. Gerraty 	t[15] = MUL31(a[7], b[8])
2070957b409SSimon J. Gerraty 		+ MUL31(a[8], b[7]);
2080957b409SSimon J. Gerraty 	t[16] = MUL31(a[8], b[8]);
2090957b409SSimon J. Gerraty 
2100957b409SSimon J. Gerraty 	/*
2110957b409SSimon J. Gerraty 	 * Propagate carries.
2120957b409SSimon J. Gerraty 	 */
2130957b409SSimon J. Gerraty 	cc = 0;
2140957b409SSimon J. Gerraty 	for (i = 0; i < 17; i ++) {
2150957b409SSimon J. Gerraty 		uint64_t w;
2160957b409SSimon J. Gerraty 
2170957b409SSimon J. Gerraty 		w = t[i] + cc;
2180957b409SSimon J. Gerraty 		d[i] = (uint32_t)w & 0x3FFFFFFF;
2190957b409SSimon J. Gerraty 		cc = w >> 30;
2200957b409SSimon J. Gerraty 	}
2210957b409SSimon J. Gerraty 	d[17] = (uint32_t)cc;
2220957b409SSimon J. Gerraty }
2230957b409SSimon J. Gerraty 
2240957b409SSimon J. Gerraty /*
2250957b409SSimon J. Gerraty  * Square a 270-bit integer, represented as an array of nine 30-bit words.
2260957b409SSimon J. Gerraty  * Result uses 18 words of 30 bits each.
2270957b409SSimon J. Gerraty  */
2280957b409SSimon J. Gerraty static void
square9(uint32_t * d,const uint32_t * a)2290957b409SSimon J. Gerraty square9(uint32_t *d, const uint32_t *a)
2300957b409SSimon J. Gerraty {
2310957b409SSimon J. Gerraty 	uint64_t t[17];
2320957b409SSimon J. Gerraty 	uint64_t cc;
2330957b409SSimon J. Gerraty 	int i;
2340957b409SSimon J. Gerraty 
2350957b409SSimon J. Gerraty 	t[ 0] = MUL31(a[0], a[0]);
2360957b409SSimon J. Gerraty 	t[ 1] = ((MUL31(a[0], a[1])) << 1);
2370957b409SSimon J. Gerraty 	t[ 2] = MUL31(a[1], a[1])
2380957b409SSimon J. Gerraty 		+ ((MUL31(a[0], a[2])) << 1);
2390957b409SSimon J. Gerraty 	t[ 3] = ((MUL31(a[0], a[3])
2400957b409SSimon J. Gerraty 		+ MUL31(a[1], a[2])) << 1);
2410957b409SSimon J. Gerraty 	t[ 4] = MUL31(a[2], a[2])
2420957b409SSimon J. Gerraty 		+ ((MUL31(a[0], a[4])
2430957b409SSimon J. Gerraty 		+ MUL31(a[1], a[3])) << 1);
2440957b409SSimon J. Gerraty 	t[ 5] = ((MUL31(a[0], a[5])
2450957b409SSimon J. Gerraty 		+ MUL31(a[1], a[4])
2460957b409SSimon J. Gerraty 		+ MUL31(a[2], a[3])) << 1);
2470957b409SSimon J. Gerraty 	t[ 6] = MUL31(a[3], a[3])
2480957b409SSimon J. Gerraty 		+ ((MUL31(a[0], a[6])
2490957b409SSimon J. Gerraty 		+ MUL31(a[1], a[5])
2500957b409SSimon J. Gerraty 		+ MUL31(a[2], a[4])) << 1);
2510957b409SSimon J. Gerraty 	t[ 7] = ((MUL31(a[0], a[7])
2520957b409SSimon J. Gerraty 		+ MUL31(a[1], a[6])
2530957b409SSimon J. Gerraty 		+ MUL31(a[2], a[5])
2540957b409SSimon J. Gerraty 		+ MUL31(a[3], a[4])) << 1);
2550957b409SSimon J. Gerraty 	t[ 8] = MUL31(a[4], a[4])
2560957b409SSimon J. Gerraty 		+ ((MUL31(a[0], a[8])
2570957b409SSimon J. Gerraty 		+ MUL31(a[1], a[7])
2580957b409SSimon J. Gerraty 		+ MUL31(a[2], a[6])
2590957b409SSimon J. Gerraty 		+ MUL31(a[3], a[5])) << 1);
2600957b409SSimon J. Gerraty 	t[ 9] = ((MUL31(a[1], a[8])
2610957b409SSimon J. Gerraty 		+ MUL31(a[2], a[7])
2620957b409SSimon J. Gerraty 		+ MUL31(a[3], a[6])
2630957b409SSimon J. Gerraty 		+ MUL31(a[4], a[5])) << 1);
2640957b409SSimon J. Gerraty 	t[10] = MUL31(a[5], a[5])
2650957b409SSimon J. Gerraty 		+ ((MUL31(a[2], a[8])
2660957b409SSimon J. Gerraty 		+ MUL31(a[3], a[7])
2670957b409SSimon J. Gerraty 		+ MUL31(a[4], a[6])) << 1);
2680957b409SSimon J. Gerraty 	t[11] = ((MUL31(a[3], a[8])
2690957b409SSimon J. Gerraty 		+ MUL31(a[4], a[7])
2700957b409SSimon J. Gerraty 		+ MUL31(a[5], a[6])) << 1);
2710957b409SSimon J. Gerraty 	t[12] = MUL31(a[6], a[6])
2720957b409SSimon J. Gerraty 		+ ((MUL31(a[4], a[8])
2730957b409SSimon J. Gerraty 		+ MUL31(a[5], a[7])) << 1);
2740957b409SSimon J. Gerraty 	t[13] = ((MUL31(a[5], a[8])
2750957b409SSimon J. Gerraty 		+ MUL31(a[6], a[7])) << 1);
2760957b409SSimon J. Gerraty 	t[14] = MUL31(a[7], a[7])
2770957b409SSimon J. Gerraty 		+ ((MUL31(a[6], a[8])) << 1);
2780957b409SSimon J. Gerraty 	t[15] = ((MUL31(a[7], a[8])) << 1);
2790957b409SSimon J. Gerraty 	t[16] = MUL31(a[8], a[8]);
2800957b409SSimon J. Gerraty 
2810957b409SSimon J. Gerraty 	/*
2820957b409SSimon J. Gerraty 	 * Propagate carries.
2830957b409SSimon J. Gerraty 	 */
2840957b409SSimon J. Gerraty 	cc = 0;
2850957b409SSimon J. Gerraty 	for (i = 0; i < 17; i ++) {
2860957b409SSimon J. Gerraty 		uint64_t w;
2870957b409SSimon J. Gerraty 
2880957b409SSimon J. Gerraty 		w = t[i] + cc;
2890957b409SSimon J. Gerraty 		d[i] = (uint32_t)w & 0x3FFFFFFF;
2900957b409SSimon J. Gerraty 		cc = w >> 30;
2910957b409SSimon J. Gerraty 	}
2920957b409SSimon J. Gerraty 	d[17] = (uint32_t)cc;
2930957b409SSimon J. Gerraty }
2940957b409SSimon J. Gerraty 
2950957b409SSimon J. Gerraty /*
2960957b409SSimon J. Gerraty  * Base field modulus for P-256.
2970957b409SSimon J. Gerraty  */
2980957b409SSimon J. Gerraty static const uint32_t F256[] = {
2990957b409SSimon J. Gerraty 
3000957b409SSimon J. Gerraty 	0x3FFFFFFF, 0x3FFFFFFF, 0x3FFFFFFF, 0x0000003F, 0x00000000,
3010957b409SSimon J. Gerraty 	0x00000000, 0x00001000, 0x3FFFC000, 0x0000FFFF
3020957b409SSimon J. Gerraty };
3030957b409SSimon J. Gerraty 
3040957b409SSimon J. Gerraty /*
3050957b409SSimon J. Gerraty  * The 'b' curve equation coefficient for P-256.
3060957b409SSimon J. Gerraty  */
3070957b409SSimon J. Gerraty static const uint32_t P256_B[] = {
3080957b409SSimon J. Gerraty 
3090957b409SSimon J. Gerraty 	0x27D2604B, 0x2F38F0F8, 0x053B0F63, 0x0741AC33, 0x1886BC65,
3100957b409SSimon J. Gerraty 	0x2EF555DA, 0x293E7B3E, 0x0D762A8E, 0x00005AC6
3110957b409SSimon J. Gerraty };
3120957b409SSimon J. Gerraty 
3130957b409SSimon J. Gerraty /*
3140957b409SSimon J. Gerraty  * Addition in the field. Source operands shall fit on 257 bits; output
3150957b409SSimon J. Gerraty  * will be lower than twice the modulus.
3160957b409SSimon J. Gerraty  */
3170957b409SSimon J. Gerraty static void
add_f256(uint32_t * d,const uint32_t * a,const uint32_t * b)3180957b409SSimon J. Gerraty add_f256(uint32_t *d, const uint32_t *a, const uint32_t *b)
3190957b409SSimon J. Gerraty {
3200957b409SSimon J. Gerraty 	uint32_t w, cc;
3210957b409SSimon J. Gerraty 	int i;
3220957b409SSimon J. Gerraty 
3230957b409SSimon J. Gerraty 	cc = 0;
3240957b409SSimon J. Gerraty 	for (i = 0; i < 9; i ++) {
3250957b409SSimon J. Gerraty 		w = a[i] + b[i] + cc;
3260957b409SSimon J. Gerraty 		d[i] = w & 0x3FFFFFFF;
3270957b409SSimon J. Gerraty 		cc = w >> 30;
3280957b409SSimon J. Gerraty 	}
3290957b409SSimon J. Gerraty 	w >>= 16;
3300957b409SSimon J. Gerraty 	d[8] &= 0xFFFF;
3310957b409SSimon J. Gerraty 	d[3] -= w << 6;
3320957b409SSimon J. Gerraty 	d[6] -= w << 12;
3330957b409SSimon J. Gerraty 	d[7] += w << 14;
3340957b409SSimon J. Gerraty 	cc = w;
3350957b409SSimon J. Gerraty 	for (i = 0; i < 9; i ++) {
3360957b409SSimon J. Gerraty 		w = d[i] + cc;
3370957b409SSimon J. Gerraty 		d[i] = w & 0x3FFFFFFF;
3380957b409SSimon J. Gerraty 		cc = ARSH(w, 30);
3390957b409SSimon J. Gerraty 	}
3400957b409SSimon J. Gerraty }
3410957b409SSimon J. Gerraty 
3420957b409SSimon J. Gerraty /*
3430957b409SSimon J. Gerraty  * Subtraction in the field. Source operands shall be smaller than twice
3440957b409SSimon J. Gerraty  * the modulus; the result will fulfil the same property.
3450957b409SSimon J. Gerraty  */
3460957b409SSimon J. Gerraty static void
sub_f256(uint32_t * d,const uint32_t * a,const uint32_t * b)3470957b409SSimon J. Gerraty sub_f256(uint32_t *d, const uint32_t *a, const uint32_t *b)
3480957b409SSimon J. Gerraty {
3490957b409SSimon J. Gerraty 	uint32_t w, cc;
3500957b409SSimon J. Gerraty 	int i;
3510957b409SSimon J. Gerraty 
3520957b409SSimon J. Gerraty 	/*
3530957b409SSimon J. Gerraty 	 * We really compute a - b + 2*p to make sure that the result is
3540957b409SSimon J. Gerraty 	 * positive.
3550957b409SSimon J. Gerraty 	 */
3560957b409SSimon J. Gerraty 	w = a[0] - b[0] - 0x00002;
3570957b409SSimon J. Gerraty 	d[0] = w & 0x3FFFFFFF;
3580957b409SSimon J. Gerraty 	w = a[1] - b[1] + ARSH(w, 30);
3590957b409SSimon J. Gerraty 	d[1] = w & 0x3FFFFFFF;
3600957b409SSimon J. Gerraty 	w = a[2] - b[2] + ARSH(w, 30);
3610957b409SSimon J. Gerraty 	d[2] = w & 0x3FFFFFFF;
3620957b409SSimon J. Gerraty 	w = a[3] - b[3] + ARSH(w, 30) + 0x00080;
3630957b409SSimon J. Gerraty 	d[3] = w & 0x3FFFFFFF;
3640957b409SSimon J. Gerraty 	w = a[4] - b[4] + ARSH(w, 30);
3650957b409SSimon J. Gerraty 	d[4] = w & 0x3FFFFFFF;
3660957b409SSimon J. Gerraty 	w = a[5] - b[5] + ARSH(w, 30);
3670957b409SSimon J. Gerraty 	d[5] = w & 0x3FFFFFFF;
3680957b409SSimon J. Gerraty 	w = a[6] - b[6] + ARSH(w, 30) + 0x02000;
3690957b409SSimon J. Gerraty 	d[6] = w & 0x3FFFFFFF;
3700957b409SSimon J. Gerraty 	w = a[7] - b[7] + ARSH(w, 30) - 0x08000;
3710957b409SSimon J. Gerraty 	d[7] = w & 0x3FFFFFFF;
3720957b409SSimon J. Gerraty 	w = a[8] - b[8] + ARSH(w, 30) + 0x20000;
3730957b409SSimon J. Gerraty 	d[8] = w & 0xFFFF;
3740957b409SSimon J. Gerraty 	w >>= 16;
3750957b409SSimon J. Gerraty 	d[8] &= 0xFFFF;
3760957b409SSimon J. Gerraty 	d[3] -= w << 6;
3770957b409SSimon J. Gerraty 	d[6] -= w << 12;
3780957b409SSimon J. Gerraty 	d[7] += w << 14;
3790957b409SSimon J. Gerraty 	cc = w;
3800957b409SSimon J. Gerraty 	for (i = 0; i < 9; i ++) {
3810957b409SSimon J. Gerraty 		w = d[i] + cc;
3820957b409SSimon J. Gerraty 		d[i] = w & 0x3FFFFFFF;
3830957b409SSimon J. Gerraty 		cc = ARSH(w, 30);
3840957b409SSimon J. Gerraty 	}
3850957b409SSimon J. Gerraty }
3860957b409SSimon J. Gerraty 
3870957b409SSimon J. Gerraty /*
3880957b409SSimon J. Gerraty  * Compute a multiplication in F256. Source operands shall be less than
3890957b409SSimon J. Gerraty  * twice the modulus.
3900957b409SSimon J. Gerraty  */
3910957b409SSimon J. Gerraty static void
mul_f256(uint32_t * d,const uint32_t * a,const uint32_t * b)3920957b409SSimon J. Gerraty mul_f256(uint32_t *d, const uint32_t *a, const uint32_t *b)
3930957b409SSimon J. Gerraty {
3940957b409SSimon J. Gerraty 	uint32_t t[18];
3950957b409SSimon J. Gerraty 	uint64_t s[18];
3960957b409SSimon J. Gerraty 	uint64_t cc, x;
3970957b409SSimon J. Gerraty 	uint32_t z, c;
3980957b409SSimon J. Gerraty 	int i;
3990957b409SSimon J. Gerraty 
4000957b409SSimon J. Gerraty 	mul9(t, a, b);
4010957b409SSimon J. Gerraty 
4020957b409SSimon J. Gerraty 	/*
4030957b409SSimon J. Gerraty 	 * Modular reduction: each high word in added/subtracted where
4040957b409SSimon J. Gerraty 	 * necessary.
4050957b409SSimon J. Gerraty 	 *
4060957b409SSimon J. Gerraty 	 * The modulus is:
4070957b409SSimon J. Gerraty 	 *    p = 2^256 - 2^224 + 2^192 + 2^96 - 1
4080957b409SSimon J. Gerraty 	 * Therefore:
4090957b409SSimon J. Gerraty 	 *    2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
4100957b409SSimon J. Gerraty 	 *
4110957b409SSimon J. Gerraty 	 * For a word x at bit offset n (n >= 256), we have:
4120957b409SSimon J. Gerraty 	 *    x*2^n = x*2^(n-32) - x*2^(n-64)
4130957b409SSimon J. Gerraty 	 *            - x*2^(n - 160) + x*2^(n-256) mod p
4140957b409SSimon J. Gerraty 	 *
4150957b409SSimon J. Gerraty 	 * Thus, we can nullify the high word if we reinject it at some
4160957b409SSimon J. Gerraty 	 * proper emplacements.
4170957b409SSimon J. Gerraty 	 *
4180957b409SSimon J. Gerraty 	 * We use 64-bit intermediate words to allow for carries to
4190957b409SSimon J. Gerraty 	 * accumulate easily, before performing the final propagation.
4200957b409SSimon J. Gerraty 	 */
4210957b409SSimon J. Gerraty 	for (i = 0; i < 18; i ++) {
4220957b409SSimon J. Gerraty 		s[i] = t[i];
4230957b409SSimon J. Gerraty 	}
4240957b409SSimon J. Gerraty 
4250957b409SSimon J. Gerraty 	for (i = 17; i >= 9; i --) {
4260957b409SSimon J. Gerraty 		uint64_t y;
4270957b409SSimon J. Gerraty 
4280957b409SSimon J. Gerraty 		y = s[i];
4290957b409SSimon J. Gerraty 		s[i - 1] += ARSHW(y, 2);
4300957b409SSimon J. Gerraty 		s[i - 2] += (y << 28) & 0x3FFFFFFF;
4310957b409SSimon J. Gerraty 		s[i - 2] -= ARSHW(y, 4);
4320957b409SSimon J. Gerraty 		s[i - 3] -= (y << 26) & 0x3FFFFFFF;
4330957b409SSimon J. Gerraty 		s[i - 5] -= ARSHW(y, 10);
4340957b409SSimon J. Gerraty 		s[i - 6] -= (y << 20) & 0x3FFFFFFF;
4350957b409SSimon J. Gerraty 		s[i - 8] += ARSHW(y, 16);
4360957b409SSimon J. Gerraty 		s[i - 9] += (y << 14) & 0x3FFFFFFF;
4370957b409SSimon J. Gerraty 	}
4380957b409SSimon J. Gerraty 
4390957b409SSimon J. Gerraty 	/*
4400957b409SSimon J. Gerraty 	 * Carry propagation must be signed. Moreover, we may have overdone
4410957b409SSimon J. Gerraty 	 * it a bit, and obtain a negative result.
4420957b409SSimon J. Gerraty 	 *
4430957b409SSimon J. Gerraty 	 * The loop above ran 9 times; each time, each word was augmented
4440957b409SSimon J. Gerraty 	 * by at most one extra word (in absolute value). Thus, the top
4450957b409SSimon J. Gerraty 	 * word must in fine fit in 39 bits, so the carry below will fit
4460957b409SSimon J. Gerraty 	 * on 9 bits.
4470957b409SSimon J. Gerraty 	 */
4480957b409SSimon J. Gerraty 	cc = 0;
4490957b409SSimon J. Gerraty 	for (i = 0; i < 9; i ++) {
4500957b409SSimon J. Gerraty 		x = s[i] + cc;
4510957b409SSimon J. Gerraty 		d[i] = (uint32_t)x & 0x3FFFFFFF;
4520957b409SSimon J. Gerraty 		cc = ARSHW(x, 30);
4530957b409SSimon J. Gerraty 	}
4540957b409SSimon J. Gerraty 
4550957b409SSimon J. Gerraty 	/*
4560957b409SSimon J. Gerraty 	 * All nine words fit on 30 bits, but there may be an extra
4570957b409SSimon J. Gerraty 	 * carry for a few bits (at most 9), and that carry may be
4580957b409SSimon J. Gerraty 	 * negative. Moreover, we want the result to fit on 257 bits.
4590957b409SSimon J. Gerraty 	 * The two lines below ensure that the word in d[] has length
4600957b409SSimon J. Gerraty 	 * 256 bits, and the (signed) carry (beyond 2^256) is in cc. The
4610957b409SSimon J. Gerraty 	 * significant length of cc is less than 24 bits, so we will be
4620957b409SSimon J. Gerraty 	 * able to switch to 32-bit operations.
4630957b409SSimon J. Gerraty 	 */
4640957b409SSimon J. Gerraty 	cc = ARSHW(x, 16);
4650957b409SSimon J. Gerraty 	d[8] &= 0xFFFF;
4660957b409SSimon J. Gerraty 
4670957b409SSimon J. Gerraty 	/*
4680957b409SSimon J. Gerraty 	 * One extra round of reduction, for cc*2^256, which means
4690957b409SSimon J. Gerraty 	 * adding cc*(2^224-2^192-2^96+1) to a 256-bit (nonnegative)
4700957b409SSimon J. Gerraty 	 * value. If cc is negative, then it may happen (rarely, but
4710957b409SSimon J. Gerraty 	 * not neglectibly so) that the result would be negative. In
4720957b409SSimon J. Gerraty 	 * order to avoid that, if cc is negative, then we add the
4730957b409SSimon J. Gerraty 	 * modulus once. Note that if cc is negative, then propagating
4740957b409SSimon J. Gerraty 	 * that carry must yield a value lower than the modulus, so
4750957b409SSimon J. Gerraty 	 * adding the modulus once will keep the final result under
4760957b409SSimon J. Gerraty 	 * twice the modulus.
4770957b409SSimon J. Gerraty 	 */
4780957b409SSimon J. Gerraty 	z = (uint32_t)cc;
4790957b409SSimon J. Gerraty 	d[3] -= z << 6;
4800957b409SSimon J. Gerraty 	d[6] -= (z << 12) & 0x3FFFFFFF;
4810957b409SSimon J. Gerraty 	d[7] -= ARSH(z, 18);
4820957b409SSimon J. Gerraty 	d[7] += (z << 14) & 0x3FFFFFFF;
4830957b409SSimon J. Gerraty 	d[8] += ARSH(z, 16);
4840957b409SSimon J. Gerraty 	c = z >> 31;
4850957b409SSimon J. Gerraty 	d[0] -= c;
4860957b409SSimon J. Gerraty 	d[3] += c << 6;
4870957b409SSimon J. Gerraty 	d[6] += c << 12;
4880957b409SSimon J. Gerraty 	d[7] -= c << 14;
4890957b409SSimon J. Gerraty 	d[8] += c << 16;
4900957b409SSimon J. Gerraty 	for (i = 0; i < 9; i ++) {
4910957b409SSimon J. Gerraty 		uint32_t w;
4920957b409SSimon J. Gerraty 
4930957b409SSimon J. Gerraty 		w = d[i] + z;
4940957b409SSimon J. Gerraty 		d[i] = w & 0x3FFFFFFF;
4950957b409SSimon J. Gerraty 		z = ARSH(w, 30);
4960957b409SSimon J. Gerraty 	}
4970957b409SSimon J. Gerraty }
4980957b409SSimon J. Gerraty 
4990957b409SSimon J. Gerraty /*
5000957b409SSimon J. Gerraty  * Compute a square in F256. Source operand shall be less than
5010957b409SSimon J. Gerraty  * twice the modulus.
5020957b409SSimon J. Gerraty  */
5030957b409SSimon J. Gerraty static void
square_f256(uint32_t * d,const uint32_t * a)5040957b409SSimon J. Gerraty square_f256(uint32_t *d, const uint32_t *a)
5050957b409SSimon J. Gerraty {
5060957b409SSimon J. Gerraty 	uint32_t t[18];
5070957b409SSimon J. Gerraty 	uint64_t s[18];
5080957b409SSimon J. Gerraty 	uint64_t cc, x;
5090957b409SSimon J. Gerraty 	uint32_t z, c;
5100957b409SSimon J. Gerraty 	int i;
5110957b409SSimon J. Gerraty 
5120957b409SSimon J. Gerraty 	square9(t, a);
5130957b409SSimon J. Gerraty 
5140957b409SSimon J. Gerraty 	/*
5150957b409SSimon J. Gerraty 	 * Modular reduction: each high word in added/subtracted where
5160957b409SSimon J. Gerraty 	 * necessary.
5170957b409SSimon J. Gerraty 	 *
5180957b409SSimon J. Gerraty 	 * The modulus is:
5190957b409SSimon J. Gerraty 	 *    p = 2^256 - 2^224 + 2^192 + 2^96 - 1
5200957b409SSimon J. Gerraty 	 * Therefore:
5210957b409SSimon J. Gerraty 	 *    2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
5220957b409SSimon J. Gerraty 	 *
5230957b409SSimon J. Gerraty 	 * For a word x at bit offset n (n >= 256), we have:
5240957b409SSimon J. Gerraty 	 *    x*2^n = x*2^(n-32) - x*2^(n-64)
5250957b409SSimon J. Gerraty 	 *            - x*2^(n - 160) + x*2^(n-256) mod p
5260957b409SSimon J. Gerraty 	 *
5270957b409SSimon J. Gerraty 	 * Thus, we can nullify the high word if we reinject it at some
5280957b409SSimon J. Gerraty 	 * proper emplacements.
5290957b409SSimon J. Gerraty 	 *
5300957b409SSimon J. Gerraty 	 * We use 64-bit intermediate words to allow for carries to
5310957b409SSimon J. Gerraty 	 * accumulate easily, before performing the final propagation.
5320957b409SSimon J. Gerraty 	 */
5330957b409SSimon J. Gerraty 	for (i = 0; i < 18; i ++) {
5340957b409SSimon J. Gerraty 		s[i] = t[i];
5350957b409SSimon J. Gerraty 	}
5360957b409SSimon J. Gerraty 
5370957b409SSimon J. Gerraty 	for (i = 17; i >= 9; i --) {
5380957b409SSimon J. Gerraty 		uint64_t y;
5390957b409SSimon J. Gerraty 
5400957b409SSimon J. Gerraty 		y = s[i];
5410957b409SSimon J. Gerraty 		s[i - 1] += ARSHW(y, 2);
5420957b409SSimon J. Gerraty 		s[i - 2] += (y << 28) & 0x3FFFFFFF;
5430957b409SSimon J. Gerraty 		s[i - 2] -= ARSHW(y, 4);
5440957b409SSimon J. Gerraty 		s[i - 3] -= (y << 26) & 0x3FFFFFFF;
5450957b409SSimon J. Gerraty 		s[i - 5] -= ARSHW(y, 10);
5460957b409SSimon J. Gerraty 		s[i - 6] -= (y << 20) & 0x3FFFFFFF;
5470957b409SSimon J. Gerraty 		s[i - 8] += ARSHW(y, 16);
5480957b409SSimon J. Gerraty 		s[i - 9] += (y << 14) & 0x3FFFFFFF;
5490957b409SSimon J. Gerraty 	}
5500957b409SSimon J. Gerraty 
5510957b409SSimon J. Gerraty 	/*
5520957b409SSimon J. Gerraty 	 * Carry propagation must be signed. Moreover, we may have overdone
5530957b409SSimon J. Gerraty 	 * it a bit, and obtain a negative result.
5540957b409SSimon J. Gerraty 	 *
5550957b409SSimon J. Gerraty 	 * The loop above ran 9 times; each time, each word was augmented
5560957b409SSimon J. Gerraty 	 * by at most one extra word (in absolute value). Thus, the top
5570957b409SSimon J. Gerraty 	 * word must in fine fit in 39 bits, so the carry below will fit
5580957b409SSimon J. Gerraty 	 * on 9 bits.
5590957b409SSimon J. Gerraty 	 */
5600957b409SSimon J. Gerraty 	cc = 0;
5610957b409SSimon J. Gerraty 	for (i = 0; i < 9; i ++) {
5620957b409SSimon J. Gerraty 		x = s[i] + cc;
5630957b409SSimon J. Gerraty 		d[i] = (uint32_t)x & 0x3FFFFFFF;
5640957b409SSimon J. Gerraty 		cc = ARSHW(x, 30);
5650957b409SSimon J. Gerraty 	}
5660957b409SSimon J. Gerraty 
5670957b409SSimon J. Gerraty 	/*
5680957b409SSimon J. Gerraty 	 * All nine words fit on 30 bits, but there may be an extra
5690957b409SSimon J. Gerraty 	 * carry for a few bits (at most 9), and that carry may be
5700957b409SSimon J. Gerraty 	 * negative. Moreover, we want the result to fit on 257 bits.
5710957b409SSimon J. Gerraty 	 * The two lines below ensure that the word in d[] has length
5720957b409SSimon J. Gerraty 	 * 256 bits, and the (signed) carry (beyond 2^256) is in cc. The
5730957b409SSimon J. Gerraty 	 * significant length of cc is less than 24 bits, so we will be
5740957b409SSimon J. Gerraty 	 * able to switch to 32-bit operations.
5750957b409SSimon J. Gerraty 	 */
5760957b409SSimon J. Gerraty 	cc = ARSHW(x, 16);
5770957b409SSimon J. Gerraty 	d[8] &= 0xFFFF;
5780957b409SSimon J. Gerraty 
5790957b409SSimon J. Gerraty 	/*
5800957b409SSimon J. Gerraty 	 * One extra round of reduction, for cc*2^256, which means
5810957b409SSimon J. Gerraty 	 * adding cc*(2^224-2^192-2^96+1) to a 256-bit (nonnegative)
5820957b409SSimon J. Gerraty 	 * value. If cc is negative, then it may happen (rarely, but
5830957b409SSimon J. Gerraty 	 * not neglectibly so) that the result would be negative. In
5840957b409SSimon J. Gerraty 	 * order to avoid that, if cc is negative, then we add the
5850957b409SSimon J. Gerraty 	 * modulus once. Note that if cc is negative, then propagating
5860957b409SSimon J. Gerraty 	 * that carry must yield a value lower than the modulus, so
5870957b409SSimon J. Gerraty 	 * adding the modulus once will keep the final result under
5880957b409SSimon J. Gerraty 	 * twice the modulus.
5890957b409SSimon J. Gerraty 	 */
5900957b409SSimon J. Gerraty 	z = (uint32_t)cc;
5910957b409SSimon J. Gerraty 	d[3] -= z << 6;
5920957b409SSimon J. Gerraty 	d[6] -= (z << 12) & 0x3FFFFFFF;
5930957b409SSimon J. Gerraty 	d[7] -= ARSH(z, 18);
5940957b409SSimon J. Gerraty 	d[7] += (z << 14) & 0x3FFFFFFF;
5950957b409SSimon J. Gerraty 	d[8] += ARSH(z, 16);
5960957b409SSimon J. Gerraty 	c = z >> 31;
5970957b409SSimon J. Gerraty 	d[0] -= c;
5980957b409SSimon J. Gerraty 	d[3] += c << 6;
5990957b409SSimon J. Gerraty 	d[6] += c << 12;
6000957b409SSimon J. Gerraty 	d[7] -= c << 14;
6010957b409SSimon J. Gerraty 	d[8] += c << 16;
6020957b409SSimon J. Gerraty 	for (i = 0; i < 9; i ++) {
6030957b409SSimon J. Gerraty 		uint32_t w;
6040957b409SSimon J. Gerraty 
6050957b409SSimon J. Gerraty 		w = d[i] + z;
6060957b409SSimon J. Gerraty 		d[i] = w & 0x3FFFFFFF;
6070957b409SSimon J. Gerraty 		z = ARSH(w, 30);
6080957b409SSimon J. Gerraty 	}
6090957b409SSimon J. Gerraty }
6100957b409SSimon J. Gerraty 
6110957b409SSimon J. Gerraty /*
6120957b409SSimon J. Gerraty  * Perform a "final reduction" in field F256 (field for curve P-256).
6130957b409SSimon J. Gerraty  * The source value must be less than twice the modulus. If the value
6140957b409SSimon J. Gerraty  * is not lower than the modulus, then the modulus is subtracted and
6150957b409SSimon J. Gerraty  * this function returns 1; otherwise, it leaves it untouched and it
6160957b409SSimon J. Gerraty  * returns 0.
6170957b409SSimon J. Gerraty  */
6180957b409SSimon J. Gerraty static uint32_t
reduce_final_f256(uint32_t * d)6190957b409SSimon J. Gerraty reduce_final_f256(uint32_t *d)
6200957b409SSimon J. Gerraty {
6210957b409SSimon J. Gerraty 	uint32_t t[9];
6220957b409SSimon J. Gerraty 	uint32_t cc;
6230957b409SSimon J. Gerraty 	int i;
6240957b409SSimon J. Gerraty 
6250957b409SSimon J. Gerraty 	cc = 0;
6260957b409SSimon J. Gerraty 	for (i = 0; i < 9; i ++) {
6270957b409SSimon J. Gerraty 		uint32_t w;
6280957b409SSimon J. Gerraty 
6290957b409SSimon J. Gerraty 		w = d[i] - F256[i] - cc;
6300957b409SSimon J. Gerraty 		cc = w >> 31;
6310957b409SSimon J. Gerraty 		t[i] = w & 0x3FFFFFFF;
6320957b409SSimon J. Gerraty 	}
6330957b409SSimon J. Gerraty 	cc ^= 1;
6340957b409SSimon J. Gerraty 	CCOPY(cc, d, t, sizeof t);
6350957b409SSimon J. Gerraty 	return cc;
6360957b409SSimon J. Gerraty }
6370957b409SSimon J. Gerraty 
6380957b409SSimon J. Gerraty /*
6390957b409SSimon J. Gerraty  * Jacobian coordinates for a point in P-256: affine coordinates (X,Y)
6400957b409SSimon J. Gerraty  * are such that:
6410957b409SSimon J. Gerraty  *   X = x / z^2
6420957b409SSimon J. Gerraty  *   Y = y / z^3
6430957b409SSimon J. Gerraty  * For the point at infinity, z = 0.
6440957b409SSimon J. Gerraty  * Each point thus admits many possible representations.
6450957b409SSimon J. Gerraty  *
6460957b409SSimon J. Gerraty  * Coordinates are represented in arrays of 32-bit integers, each holding
6470957b409SSimon J. Gerraty  * 30 bits of data. Values may also be slightly greater than the modulus,
6480957b409SSimon J. Gerraty  * but they will always be lower than twice the modulus.
6490957b409SSimon J. Gerraty  */
6500957b409SSimon J. Gerraty typedef struct {
6510957b409SSimon J. Gerraty 	uint32_t x[9];
6520957b409SSimon J. Gerraty 	uint32_t y[9];
6530957b409SSimon J. Gerraty 	uint32_t z[9];
6540957b409SSimon J. Gerraty } p256_jacobian;
6550957b409SSimon J. Gerraty 
6560957b409SSimon J. Gerraty /*
6570957b409SSimon J. Gerraty  * Convert a point to affine coordinates:
6580957b409SSimon J. Gerraty  *  - If the point is the point at infinity, then all three coordinates
6590957b409SSimon J. Gerraty  *    are set to 0.
6600957b409SSimon J. Gerraty  *  - Otherwise, the 'z' coordinate is set to 1, and the 'x' and 'y'
6610957b409SSimon J. Gerraty  *    coordinates are the 'X' and 'Y' affine coordinates.
6620957b409SSimon J. Gerraty  * The coordinates are guaranteed to be lower than the modulus.
6630957b409SSimon J. Gerraty  */
6640957b409SSimon J. Gerraty static void
p256_to_affine(p256_jacobian * P)6650957b409SSimon J. Gerraty p256_to_affine(p256_jacobian *P)
6660957b409SSimon J. Gerraty {
6670957b409SSimon J. Gerraty 	uint32_t t1[9], t2[9];
6680957b409SSimon J. Gerraty 	int i;
6690957b409SSimon J. Gerraty 
6700957b409SSimon J. Gerraty 	/*
6710957b409SSimon J. Gerraty 	 * Invert z with a modular exponentiation: the modulus is
6720957b409SSimon J. Gerraty 	 * p = 2^256 - 2^224 + 2^192 + 2^96 - 1, and the exponent is
6730957b409SSimon J. Gerraty 	 * p-2. Exponent bit pattern (from high to low) is:
6740957b409SSimon J. Gerraty 	 *  - 32 bits of value 1
6750957b409SSimon J. Gerraty 	 *  - 31 bits of value 0
6760957b409SSimon J. Gerraty 	 *  - 1 bit of value 1
6770957b409SSimon J. Gerraty 	 *  - 96 bits of value 0
6780957b409SSimon J. Gerraty 	 *  - 94 bits of value 1
6790957b409SSimon J. Gerraty 	 *  - 1 bit of value 0
6800957b409SSimon J. Gerraty 	 *  - 1 bit of value 1
6810957b409SSimon J. Gerraty 	 * Thus, we precompute z^(2^31-1) to speed things up.
6820957b409SSimon J. Gerraty 	 *
6830957b409SSimon J. Gerraty 	 * If z = 0 (point at infinity) then the modular exponentiation
6840957b409SSimon J. Gerraty 	 * will yield 0, which leads to the expected result (all three
6850957b409SSimon J. Gerraty 	 * coordinates set to 0).
6860957b409SSimon J. Gerraty 	 */
6870957b409SSimon J. Gerraty 
6880957b409SSimon J. Gerraty 	/*
6890957b409SSimon J. Gerraty 	 * A simple square-and-multiply for z^(2^31-1). We could save about
6900957b409SSimon J. Gerraty 	 * two dozen multiplications here with an addition chain, but
6910957b409SSimon J. Gerraty 	 * this would require a bit more code, and extra stack buffers.
6920957b409SSimon J. Gerraty 	 */
6930957b409SSimon J. Gerraty 	memcpy(t1, P->z, sizeof P->z);
6940957b409SSimon J. Gerraty 	for (i = 0; i < 30; i ++) {
6950957b409SSimon J. Gerraty 		square_f256(t1, t1);
6960957b409SSimon J. Gerraty 		mul_f256(t1, t1, P->z);
6970957b409SSimon J. Gerraty 	}
6980957b409SSimon J. Gerraty 
6990957b409SSimon J. Gerraty 	/*
7000957b409SSimon J. Gerraty 	 * Square-and-multiply. Apart from the squarings, we have a few
7010957b409SSimon J. Gerraty 	 * multiplications to set bits to 1; we multiply by the original z
7020957b409SSimon J. Gerraty 	 * for setting 1 bit, and by t1 for setting 31 bits.
7030957b409SSimon J. Gerraty 	 */
7040957b409SSimon J. Gerraty 	memcpy(t2, P->z, sizeof P->z);
7050957b409SSimon J. Gerraty 	for (i = 1; i < 256; i ++) {
7060957b409SSimon J. Gerraty 		square_f256(t2, t2);
7070957b409SSimon J. Gerraty 		switch (i) {
7080957b409SSimon J. Gerraty 		case 31:
7090957b409SSimon J. Gerraty 		case 190:
7100957b409SSimon J. Gerraty 		case 221:
7110957b409SSimon J. Gerraty 		case 252:
7120957b409SSimon J. Gerraty 			mul_f256(t2, t2, t1);
7130957b409SSimon J. Gerraty 			break;
7140957b409SSimon J. Gerraty 		case 63:
7150957b409SSimon J. Gerraty 		case 253:
7160957b409SSimon J. Gerraty 		case 255:
7170957b409SSimon J. Gerraty 			mul_f256(t2, t2, P->z);
7180957b409SSimon J. Gerraty 			break;
7190957b409SSimon J. Gerraty 		}
7200957b409SSimon J. Gerraty 	}
7210957b409SSimon J. Gerraty 
7220957b409SSimon J. Gerraty 	/*
7230957b409SSimon J. Gerraty 	 * Now that we have 1/z, multiply x by 1/z^2 and y by 1/z^3.
7240957b409SSimon J. Gerraty 	 */
7250957b409SSimon J. Gerraty 	mul_f256(t1, t2, t2);
7260957b409SSimon J. Gerraty 	mul_f256(P->x, t1, P->x);
7270957b409SSimon J. Gerraty 	mul_f256(t1, t1, t2);
7280957b409SSimon J. Gerraty 	mul_f256(P->y, t1, P->y);
7290957b409SSimon J. Gerraty 	reduce_final_f256(P->x);
7300957b409SSimon J. Gerraty 	reduce_final_f256(P->y);
7310957b409SSimon J. Gerraty 
7320957b409SSimon J. Gerraty 	/*
7330957b409SSimon J. Gerraty 	 * Multiply z by 1/z. If z = 0, then this will yield 0, otherwise
7340957b409SSimon J. Gerraty 	 * this will set z to 1.
7350957b409SSimon J. Gerraty 	 */
7360957b409SSimon J. Gerraty 	mul_f256(P->z, P->z, t2);
7370957b409SSimon J. Gerraty 	reduce_final_f256(P->z);
7380957b409SSimon J. Gerraty }
7390957b409SSimon J. Gerraty 
7400957b409SSimon J. Gerraty /*
7410957b409SSimon J. Gerraty  * Double a point in P-256. This function works for all valid points,
7420957b409SSimon J. Gerraty  * including the point at infinity.
7430957b409SSimon J. Gerraty  */
7440957b409SSimon J. Gerraty static void
p256_double(p256_jacobian * Q)7450957b409SSimon J. Gerraty p256_double(p256_jacobian *Q)
7460957b409SSimon J. Gerraty {
7470957b409SSimon J. Gerraty 	/*
7480957b409SSimon J. Gerraty 	 * Doubling formulas are:
7490957b409SSimon J. Gerraty 	 *
7500957b409SSimon J. Gerraty 	 *   s = 4*x*y^2
7510957b409SSimon J. Gerraty 	 *   m = 3*(x + z^2)*(x - z^2)
7520957b409SSimon J. Gerraty 	 *   x' = m^2 - 2*s
7530957b409SSimon J. Gerraty 	 *   y' = m*(s - x') - 8*y^4
7540957b409SSimon J. Gerraty 	 *   z' = 2*y*z
7550957b409SSimon J. Gerraty 	 *
7560957b409SSimon J. Gerraty 	 * These formulas work for all points, including points of order 2
7570957b409SSimon J. Gerraty 	 * and points at infinity:
7580957b409SSimon J. Gerraty 	 *   - If y = 0 then z' = 0. But there is no such point in P-256
7590957b409SSimon J. Gerraty 	 *     anyway.
7600957b409SSimon J. Gerraty 	 *   - If z = 0 then z' = 0.
7610957b409SSimon J. Gerraty 	 */
7620957b409SSimon J. Gerraty 	uint32_t t1[9], t2[9], t3[9], t4[9];
7630957b409SSimon J. Gerraty 
7640957b409SSimon J. Gerraty 	/*
7650957b409SSimon J. Gerraty 	 * Compute z^2 in t1.
7660957b409SSimon J. Gerraty 	 */
7670957b409SSimon J. Gerraty 	square_f256(t1, Q->z);
7680957b409SSimon J. Gerraty 
7690957b409SSimon J. Gerraty 	/*
7700957b409SSimon J. Gerraty 	 * Compute x-z^2 in t2 and x+z^2 in t1.
7710957b409SSimon J. Gerraty 	 */
7720957b409SSimon J. Gerraty 	add_f256(t2, Q->x, t1);
7730957b409SSimon J. Gerraty 	sub_f256(t1, Q->x, t1);
7740957b409SSimon J. Gerraty 
7750957b409SSimon J. Gerraty 	/*
7760957b409SSimon J. Gerraty 	 * Compute 3*(x+z^2)*(x-z^2) in t1.
7770957b409SSimon J. Gerraty 	 */
7780957b409SSimon J. Gerraty 	mul_f256(t3, t1, t2);
7790957b409SSimon J. Gerraty 	add_f256(t1, t3, t3);
7800957b409SSimon J. Gerraty 	add_f256(t1, t3, t1);
7810957b409SSimon J. Gerraty 
7820957b409SSimon J. Gerraty 	/*
7830957b409SSimon J. Gerraty 	 * Compute 4*x*y^2 (in t2) and 2*y^2 (in t3).
7840957b409SSimon J. Gerraty 	 */
7850957b409SSimon J. Gerraty 	square_f256(t3, Q->y);
7860957b409SSimon J. Gerraty 	add_f256(t3, t3, t3);
7870957b409SSimon J. Gerraty 	mul_f256(t2, Q->x, t3);
7880957b409SSimon J. Gerraty 	add_f256(t2, t2, t2);
7890957b409SSimon J. Gerraty 
7900957b409SSimon J. Gerraty 	/*
7910957b409SSimon J. Gerraty 	 * Compute x' = m^2 - 2*s.
7920957b409SSimon J. Gerraty 	 */
7930957b409SSimon J. Gerraty 	square_f256(Q->x, t1);
7940957b409SSimon J. Gerraty 	sub_f256(Q->x, Q->x, t2);
7950957b409SSimon J. Gerraty 	sub_f256(Q->x, Q->x, t2);
7960957b409SSimon J. Gerraty 
7970957b409SSimon J. Gerraty 	/*
7980957b409SSimon J. Gerraty 	 * Compute z' = 2*y*z.
7990957b409SSimon J. Gerraty 	 */
8000957b409SSimon J. Gerraty 	mul_f256(t4, Q->y, Q->z);
8010957b409SSimon J. Gerraty 	add_f256(Q->z, t4, t4);
8020957b409SSimon J. Gerraty 
8030957b409SSimon J. Gerraty 	/*
8040957b409SSimon J. Gerraty 	 * Compute y' = m*(s - x') - 8*y^4. Note that we already have
8050957b409SSimon J. Gerraty 	 * 2*y^2 in t3.
8060957b409SSimon J. Gerraty 	 */
8070957b409SSimon J. Gerraty 	sub_f256(t2, t2, Q->x);
8080957b409SSimon J. Gerraty 	mul_f256(Q->y, t1, t2);
8090957b409SSimon J. Gerraty 	square_f256(t4, t3);
8100957b409SSimon J. Gerraty 	add_f256(t4, t4, t4);
8110957b409SSimon J. Gerraty 	sub_f256(Q->y, Q->y, t4);
8120957b409SSimon J. Gerraty }
8130957b409SSimon J. Gerraty 
8140957b409SSimon J. Gerraty /*
8150957b409SSimon J. Gerraty  * Add point P2 to point P1.
8160957b409SSimon J. Gerraty  *
8170957b409SSimon J. Gerraty  * This function computes the wrong result in the following cases:
8180957b409SSimon J. Gerraty  *
8190957b409SSimon J. Gerraty  *   - If P1 == 0 but P2 != 0
8200957b409SSimon J. Gerraty  *   - If P1 != 0 but P2 == 0
8210957b409SSimon J. Gerraty  *   - If P1 == P2
8220957b409SSimon J. Gerraty  *
8230957b409SSimon J. Gerraty  * In all three cases, P1 is set to the point at infinity.
8240957b409SSimon J. Gerraty  *
8250957b409SSimon J. Gerraty  * Returned value is 0 if one of the following occurs:
8260957b409SSimon J. Gerraty  *
8270957b409SSimon J. Gerraty  *   - P1 and P2 have the same Y coordinate
8280957b409SSimon J. Gerraty  *   - P1 == 0 and P2 == 0
8290957b409SSimon J. Gerraty  *   - The Y coordinate of one of the points is 0 and the other point is
8300957b409SSimon J. Gerraty  *     the point at infinity.
8310957b409SSimon J. Gerraty  *
8320957b409SSimon J. Gerraty  * The third case cannot actually happen with valid points, since a point
8330957b409SSimon J. Gerraty  * with Y == 0 is a point of order 2, and there is no point of order 2 on
8340957b409SSimon J. Gerraty  * curve P-256.
8350957b409SSimon J. Gerraty  *
8360957b409SSimon J. Gerraty  * Therefore, assuming that P1 != 0 and P2 != 0 on input, then the caller
8370957b409SSimon J. Gerraty  * can apply the following:
8380957b409SSimon J. Gerraty  *
8390957b409SSimon J. Gerraty  *   - If the result is not the point at infinity, then it is correct.
8400957b409SSimon J. Gerraty  *   - Otherwise, if the returned value is 1, then this is a case of
8410957b409SSimon J. Gerraty  *     P1+P2 == 0, so the result is indeed the point at infinity.
8420957b409SSimon J. Gerraty  *   - Otherwise, P1 == P2, so a "double" operation should have been
8430957b409SSimon J. Gerraty  *     performed.
8440957b409SSimon J. Gerraty  */
8450957b409SSimon J. Gerraty static uint32_t
p256_add(p256_jacobian * P1,const p256_jacobian * P2)8460957b409SSimon J. Gerraty p256_add(p256_jacobian *P1, const p256_jacobian *P2)
8470957b409SSimon J. Gerraty {
8480957b409SSimon J. Gerraty 	/*
8490957b409SSimon J. Gerraty 	 * Addtions formulas are:
8500957b409SSimon J. Gerraty 	 *
8510957b409SSimon J. Gerraty 	 *   u1 = x1 * z2^2
8520957b409SSimon J. Gerraty 	 *   u2 = x2 * z1^2
8530957b409SSimon J. Gerraty 	 *   s1 = y1 * z2^3
8540957b409SSimon J. Gerraty 	 *   s2 = y2 * z1^3
8550957b409SSimon J. Gerraty 	 *   h = u2 - u1
8560957b409SSimon J. Gerraty 	 *   r = s2 - s1
8570957b409SSimon J. Gerraty 	 *   x3 = r^2 - h^3 - 2 * u1 * h^2
8580957b409SSimon J. Gerraty 	 *   y3 = r * (u1 * h^2 - x3) - s1 * h^3
8590957b409SSimon J. Gerraty 	 *   z3 = h * z1 * z2
8600957b409SSimon J. Gerraty 	 */
8610957b409SSimon J. Gerraty 	uint32_t t1[9], t2[9], t3[9], t4[9], t5[9], t6[9], t7[9];
8620957b409SSimon J. Gerraty 	uint32_t ret;
8630957b409SSimon J. Gerraty 	int i;
8640957b409SSimon J. Gerraty 
8650957b409SSimon J. Gerraty 	/*
8660957b409SSimon J. Gerraty 	 * Compute u1 = x1*z2^2 (in t1) and s1 = y1*z2^3 (in t3).
8670957b409SSimon J. Gerraty 	 */
8680957b409SSimon J. Gerraty 	square_f256(t3, P2->z);
8690957b409SSimon J. Gerraty 	mul_f256(t1, P1->x, t3);
8700957b409SSimon J. Gerraty 	mul_f256(t4, P2->z, t3);
8710957b409SSimon J. Gerraty 	mul_f256(t3, P1->y, t4);
8720957b409SSimon J. Gerraty 
8730957b409SSimon J. Gerraty 	/*
8740957b409SSimon J. Gerraty 	 * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
8750957b409SSimon J. Gerraty 	 */
8760957b409SSimon J. Gerraty 	square_f256(t4, P1->z);
8770957b409SSimon J. Gerraty 	mul_f256(t2, P2->x, t4);
8780957b409SSimon J. Gerraty 	mul_f256(t5, P1->z, t4);
8790957b409SSimon J. Gerraty 	mul_f256(t4, P2->y, t5);
8800957b409SSimon J. Gerraty 
8810957b409SSimon J. Gerraty 	/*
8820957b409SSimon J. Gerraty 	 * Compute h = h2 - u1 (in t2) and r = s2 - s1 (in t4).
8830957b409SSimon J. Gerraty 	 * We need to test whether r is zero, so we will do some extra
8840957b409SSimon J. Gerraty 	 * reduce.
8850957b409SSimon J. Gerraty 	 */
8860957b409SSimon J. Gerraty 	sub_f256(t2, t2, t1);
8870957b409SSimon J. Gerraty 	sub_f256(t4, t4, t3);
8880957b409SSimon J. Gerraty 	reduce_final_f256(t4);
8890957b409SSimon J. Gerraty 	ret = 0;
8900957b409SSimon J. Gerraty 	for (i = 0; i < 9; i ++) {
8910957b409SSimon J. Gerraty 		ret |= t4[i];
8920957b409SSimon J. Gerraty 	}
8930957b409SSimon J. Gerraty 	ret = (ret | -ret) >> 31;
8940957b409SSimon J. Gerraty 
8950957b409SSimon J. Gerraty 	/*
8960957b409SSimon J. Gerraty 	 * Compute u1*h^2 (in t6) and h^3 (in t5);
8970957b409SSimon J. Gerraty 	 */
8980957b409SSimon J. Gerraty 	square_f256(t7, t2);
8990957b409SSimon J. Gerraty 	mul_f256(t6, t1, t7);
9000957b409SSimon J. Gerraty 	mul_f256(t5, t7, t2);
9010957b409SSimon J. Gerraty 
9020957b409SSimon J. Gerraty 	/*
9030957b409SSimon J. Gerraty 	 * Compute x3 = r^2 - h^3 - 2*u1*h^2.
9040957b409SSimon J. Gerraty 	 */
9050957b409SSimon J. Gerraty 	square_f256(P1->x, t4);
9060957b409SSimon J. Gerraty 	sub_f256(P1->x, P1->x, t5);
9070957b409SSimon J. Gerraty 	sub_f256(P1->x, P1->x, t6);
9080957b409SSimon J. Gerraty 	sub_f256(P1->x, P1->x, t6);
9090957b409SSimon J. Gerraty 
9100957b409SSimon J. Gerraty 	/*
9110957b409SSimon J. Gerraty 	 * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
9120957b409SSimon J. Gerraty 	 */
9130957b409SSimon J. Gerraty 	sub_f256(t6, t6, P1->x);
9140957b409SSimon J. Gerraty 	mul_f256(P1->y, t4, t6);
9150957b409SSimon J. Gerraty 	mul_f256(t1, t5, t3);
9160957b409SSimon J. Gerraty 	sub_f256(P1->y, P1->y, t1);
9170957b409SSimon J. Gerraty 
9180957b409SSimon J. Gerraty 	/*
9190957b409SSimon J. Gerraty 	 * Compute z3 = h*z1*z2.
9200957b409SSimon J. Gerraty 	 */
9210957b409SSimon J. Gerraty 	mul_f256(t1, P1->z, P2->z);
9220957b409SSimon J. Gerraty 	mul_f256(P1->z, t1, t2);
9230957b409SSimon J. Gerraty 
9240957b409SSimon J. Gerraty 	return ret;
9250957b409SSimon J. Gerraty }
9260957b409SSimon J. Gerraty 
9270957b409SSimon J. Gerraty /*
9280957b409SSimon J. Gerraty  * Add point P2 to point P1. This is a specialised function for the
9290957b409SSimon J. Gerraty  * case when P2 is a non-zero point in affine coordinate.
9300957b409SSimon J. Gerraty  *
9310957b409SSimon J. Gerraty  * This function computes the wrong result in the following cases:
9320957b409SSimon J. Gerraty  *
9330957b409SSimon J. Gerraty  *   - If P1 == 0
9340957b409SSimon J. Gerraty  *   - If P1 == P2
9350957b409SSimon J. Gerraty  *
9360957b409SSimon J. Gerraty  * In both cases, P1 is set to the point at infinity.
9370957b409SSimon J. Gerraty  *
9380957b409SSimon J. Gerraty  * Returned value is 0 if one of the following occurs:
9390957b409SSimon J. Gerraty  *
9400957b409SSimon J. Gerraty  *   - P1 and P2 have the same Y coordinate
9410957b409SSimon J. Gerraty  *   - The Y coordinate of P2 is 0 and P1 is the point at infinity.
9420957b409SSimon J. Gerraty  *
9430957b409SSimon J. Gerraty  * The second case cannot actually happen with valid points, since a point
9440957b409SSimon J. Gerraty  * with Y == 0 is a point of order 2, and there is no point of order 2 on
9450957b409SSimon J. Gerraty  * curve P-256.
9460957b409SSimon J. Gerraty  *
9470957b409SSimon J. Gerraty  * Therefore, assuming that P1 != 0 on input, then the caller
9480957b409SSimon J. Gerraty  * can apply the following:
9490957b409SSimon J. Gerraty  *
9500957b409SSimon J. Gerraty  *   - If the result is not the point at infinity, then it is correct.
9510957b409SSimon J. Gerraty  *   - Otherwise, if the returned value is 1, then this is a case of
9520957b409SSimon J. Gerraty  *     P1+P2 == 0, so the result is indeed the point at infinity.
9530957b409SSimon J. Gerraty  *   - Otherwise, P1 == P2, so a "double" operation should have been
9540957b409SSimon J. Gerraty  *     performed.
9550957b409SSimon J. Gerraty  */
9560957b409SSimon J. Gerraty static uint32_t
p256_add_mixed(p256_jacobian * P1,const p256_jacobian * P2)9570957b409SSimon J. Gerraty p256_add_mixed(p256_jacobian *P1, const p256_jacobian *P2)
9580957b409SSimon J. Gerraty {
9590957b409SSimon J. Gerraty 	/*
9600957b409SSimon J. Gerraty 	 * Addtions formulas are:
9610957b409SSimon J. Gerraty 	 *
9620957b409SSimon J. Gerraty 	 *   u1 = x1
9630957b409SSimon J. Gerraty 	 *   u2 = x2 * z1^2
9640957b409SSimon J. Gerraty 	 *   s1 = y1
9650957b409SSimon J. Gerraty 	 *   s2 = y2 * z1^3
9660957b409SSimon J. Gerraty 	 *   h = u2 - u1
9670957b409SSimon J. Gerraty 	 *   r = s2 - s1
9680957b409SSimon J. Gerraty 	 *   x3 = r^2 - h^3 - 2 * u1 * h^2
9690957b409SSimon J. Gerraty 	 *   y3 = r * (u1 * h^2 - x3) - s1 * h^3
9700957b409SSimon J. Gerraty 	 *   z3 = h * z1
9710957b409SSimon J. Gerraty 	 */
9720957b409SSimon J. Gerraty 	uint32_t t1[9], t2[9], t3[9], t4[9], t5[9], t6[9], t7[9];
9730957b409SSimon J. Gerraty 	uint32_t ret;
9740957b409SSimon J. Gerraty 	int i;
9750957b409SSimon J. Gerraty 
9760957b409SSimon J. Gerraty 	/*
9770957b409SSimon J. Gerraty 	 * Compute u1 = x1 (in t1) and s1 = y1 (in t3).
9780957b409SSimon J. Gerraty 	 */
9790957b409SSimon J. Gerraty 	memcpy(t1, P1->x, sizeof t1);
9800957b409SSimon J. Gerraty 	memcpy(t3, P1->y, sizeof t3);
9810957b409SSimon J. Gerraty 
9820957b409SSimon J. Gerraty 	/*
9830957b409SSimon J. Gerraty 	 * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
9840957b409SSimon J. Gerraty 	 */
9850957b409SSimon J. Gerraty 	square_f256(t4, P1->z);
9860957b409SSimon J. Gerraty 	mul_f256(t2, P2->x, t4);
9870957b409SSimon J. Gerraty 	mul_f256(t5, P1->z, t4);
9880957b409SSimon J. Gerraty 	mul_f256(t4, P2->y, t5);
9890957b409SSimon J. Gerraty 
9900957b409SSimon J. Gerraty 	/*
9910957b409SSimon J. Gerraty 	 * Compute h = h2 - u1 (in t2) and r = s2 - s1 (in t4).
9920957b409SSimon J. Gerraty 	 * We need to test whether r is zero, so we will do some extra
9930957b409SSimon J. Gerraty 	 * reduce.
9940957b409SSimon J. Gerraty 	 */
9950957b409SSimon J. Gerraty 	sub_f256(t2, t2, t1);
9960957b409SSimon J. Gerraty 	sub_f256(t4, t4, t3);
9970957b409SSimon J. Gerraty 	reduce_final_f256(t4);
9980957b409SSimon J. Gerraty 	ret = 0;
9990957b409SSimon J. Gerraty 	for (i = 0; i < 9; i ++) {
10000957b409SSimon J. Gerraty 		ret |= t4[i];
10010957b409SSimon J. Gerraty 	}
10020957b409SSimon J. Gerraty 	ret = (ret | -ret) >> 31;
10030957b409SSimon J. Gerraty 
10040957b409SSimon J. Gerraty 	/*
10050957b409SSimon J. Gerraty 	 * Compute u1*h^2 (in t6) and h^3 (in t5);
10060957b409SSimon J. Gerraty 	 */
10070957b409SSimon J. Gerraty 	square_f256(t7, t2);
10080957b409SSimon J. Gerraty 	mul_f256(t6, t1, t7);
10090957b409SSimon J. Gerraty 	mul_f256(t5, t7, t2);
10100957b409SSimon J. Gerraty 
10110957b409SSimon J. Gerraty 	/*
10120957b409SSimon J. Gerraty 	 * Compute x3 = r^2 - h^3 - 2*u1*h^2.
10130957b409SSimon J. Gerraty 	 */
10140957b409SSimon J. Gerraty 	square_f256(P1->x, t4);
10150957b409SSimon J. Gerraty 	sub_f256(P1->x, P1->x, t5);
10160957b409SSimon J. Gerraty 	sub_f256(P1->x, P1->x, t6);
10170957b409SSimon J. Gerraty 	sub_f256(P1->x, P1->x, t6);
10180957b409SSimon J. Gerraty 
10190957b409SSimon J. Gerraty 	/*
10200957b409SSimon J. Gerraty 	 * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
10210957b409SSimon J. Gerraty 	 */
10220957b409SSimon J. Gerraty 	sub_f256(t6, t6, P1->x);
10230957b409SSimon J. Gerraty 	mul_f256(P1->y, t4, t6);
10240957b409SSimon J. Gerraty 	mul_f256(t1, t5, t3);
10250957b409SSimon J. Gerraty 	sub_f256(P1->y, P1->y, t1);
10260957b409SSimon J. Gerraty 
10270957b409SSimon J. Gerraty 	/*
10280957b409SSimon J. Gerraty 	 * Compute z3 = h*z1*z2.
10290957b409SSimon J. Gerraty 	 */
10300957b409SSimon J. Gerraty 	mul_f256(P1->z, P1->z, t2);
10310957b409SSimon J. Gerraty 
10320957b409SSimon J. Gerraty 	return ret;
10330957b409SSimon J. Gerraty }
10340957b409SSimon J. Gerraty 
10350957b409SSimon J. Gerraty /*
10360957b409SSimon J. Gerraty  * Decode a P-256 point. This function does not support the point at
10370957b409SSimon J. Gerraty  * infinity. Returned value is 0 if the point is invalid, 1 otherwise.
10380957b409SSimon J. Gerraty  */
10390957b409SSimon J. Gerraty static uint32_t
p256_decode(p256_jacobian * P,const void * src,size_t len)10400957b409SSimon J. Gerraty p256_decode(p256_jacobian *P, const void *src, size_t len)
10410957b409SSimon J. Gerraty {
10420957b409SSimon J. Gerraty 	const unsigned char *buf;
10430957b409SSimon J. Gerraty 	uint32_t tx[9], ty[9], t1[9], t2[9];
10440957b409SSimon J. Gerraty 	uint32_t bad;
10450957b409SSimon J. Gerraty 	int i;
10460957b409SSimon J. Gerraty 
10470957b409SSimon J. Gerraty 	if (len != 65) {
10480957b409SSimon J. Gerraty 		return 0;
10490957b409SSimon J. Gerraty 	}
10500957b409SSimon J. Gerraty 	buf = src;
10510957b409SSimon J. Gerraty 
10520957b409SSimon J. Gerraty 	/*
10530957b409SSimon J. Gerraty 	 * First byte must be 0x04 (uncompressed format). We could support
10540957b409SSimon J. Gerraty 	 * "hybrid format" (first byte is 0x06 or 0x07, and encodes the
10550957b409SSimon J. Gerraty 	 * least significant bit of the Y coordinate), but it is explicitly
10560957b409SSimon J. Gerraty 	 * forbidden by RFC 5480 (section 2.2).
10570957b409SSimon J. Gerraty 	 */
10580957b409SSimon J. Gerraty 	bad = NEQ(buf[0], 0x04);
10590957b409SSimon J. Gerraty 
10600957b409SSimon J. Gerraty 	/*
10610957b409SSimon J. Gerraty 	 * Decode the coordinates, and check that they are both lower
10620957b409SSimon J. Gerraty 	 * than the modulus.
10630957b409SSimon J. Gerraty 	 */
10640957b409SSimon J. Gerraty 	tx[8] = be8_to_le30(tx, buf + 1, 32);
10650957b409SSimon J. Gerraty 	ty[8] = be8_to_le30(ty, buf + 33, 32);
10660957b409SSimon J. Gerraty 	bad |= reduce_final_f256(tx);
10670957b409SSimon J. Gerraty 	bad |= reduce_final_f256(ty);
10680957b409SSimon J. Gerraty 
10690957b409SSimon J. Gerraty 	/*
10700957b409SSimon J. Gerraty 	 * Check curve equation.
10710957b409SSimon J. Gerraty 	 */
10720957b409SSimon J. Gerraty 	square_f256(t1, tx);
10730957b409SSimon J. Gerraty 	mul_f256(t1, tx, t1);
10740957b409SSimon J. Gerraty 	square_f256(t2, ty);
10750957b409SSimon J. Gerraty 	sub_f256(t1, t1, tx);
10760957b409SSimon J. Gerraty 	sub_f256(t1, t1, tx);
10770957b409SSimon J. Gerraty 	sub_f256(t1, t1, tx);
10780957b409SSimon J. Gerraty 	add_f256(t1, t1, P256_B);
10790957b409SSimon J. Gerraty 	sub_f256(t1, t1, t2);
10800957b409SSimon J. Gerraty 	reduce_final_f256(t1);
10810957b409SSimon J. Gerraty 	for (i = 0; i < 9; i ++) {
10820957b409SSimon J. Gerraty 		bad |= t1[i];
10830957b409SSimon J. Gerraty 	}
10840957b409SSimon J. Gerraty 
10850957b409SSimon J. Gerraty 	/*
10860957b409SSimon J. Gerraty 	 * Copy coordinates to the point structure.
10870957b409SSimon J. Gerraty 	 */
10880957b409SSimon J. Gerraty 	memcpy(P->x, tx, sizeof tx);
10890957b409SSimon J. Gerraty 	memcpy(P->y, ty, sizeof ty);
10900957b409SSimon J. Gerraty 	memset(P->z, 0, sizeof P->z);
10910957b409SSimon J. Gerraty 	P->z[0] = 1;
10920957b409SSimon J. Gerraty 	return EQ(bad, 0);
10930957b409SSimon J. Gerraty }
10940957b409SSimon J. Gerraty 
10950957b409SSimon J. Gerraty /*
10960957b409SSimon J. Gerraty  * Encode a point into a buffer. This function assumes that the point is
10970957b409SSimon J. Gerraty  * valid, in affine coordinates, and not the point at infinity.
10980957b409SSimon J. Gerraty  */
10990957b409SSimon J. Gerraty static void
p256_encode(void * dst,const p256_jacobian * P)11000957b409SSimon J. Gerraty p256_encode(void *dst, const p256_jacobian *P)
11010957b409SSimon J. Gerraty {
11020957b409SSimon J. Gerraty 	unsigned char *buf;
11030957b409SSimon J. Gerraty 
11040957b409SSimon J. Gerraty 	buf = dst;
11050957b409SSimon J. Gerraty 	buf[0] = 0x04;
11060957b409SSimon J. Gerraty 	le30_to_be8(buf + 1, 32, P->x);
11070957b409SSimon J. Gerraty 	le30_to_be8(buf + 33, 32, P->y);
11080957b409SSimon J. Gerraty }
11090957b409SSimon J. Gerraty 
11100957b409SSimon J. Gerraty /*
11110957b409SSimon J. Gerraty  * Multiply a curve point by an integer. The integer is assumed to be
11120957b409SSimon J. Gerraty  * lower than the curve order, and the base point must not be the point
11130957b409SSimon J. Gerraty  * at infinity.
11140957b409SSimon J. Gerraty  */
11150957b409SSimon J. Gerraty static void
p256_mul(p256_jacobian * P,const unsigned char * x,size_t xlen)11160957b409SSimon J. Gerraty p256_mul(p256_jacobian *P, const unsigned char *x, size_t xlen)
11170957b409SSimon J. Gerraty {
11180957b409SSimon J. Gerraty 	/*
11190957b409SSimon J. Gerraty 	 * qz is a flag that is initially 1, and remains equal to 1
11200957b409SSimon J. Gerraty 	 * as long as the point is the point at infinity.
11210957b409SSimon J. Gerraty 	 *
11220957b409SSimon J. Gerraty 	 * We use a 2-bit window to handle multiplier bits by pairs.
11230957b409SSimon J. Gerraty 	 * The precomputed window really is the points P2 and P3.
11240957b409SSimon J. Gerraty 	 */
11250957b409SSimon J. Gerraty 	uint32_t qz;
11260957b409SSimon J. Gerraty 	p256_jacobian P2, P3, Q, T, U;
11270957b409SSimon J. Gerraty 
11280957b409SSimon J. Gerraty 	/*
11290957b409SSimon J. Gerraty 	 * Compute window values.
11300957b409SSimon J. Gerraty 	 */
11310957b409SSimon J. Gerraty 	P2 = *P;
11320957b409SSimon J. Gerraty 	p256_double(&P2);
11330957b409SSimon J. Gerraty 	P3 = *P;
11340957b409SSimon J. Gerraty 	p256_add(&P3, &P2);
11350957b409SSimon J. Gerraty 
11360957b409SSimon J. Gerraty 	/*
11370957b409SSimon J. Gerraty 	 * We start with Q = 0. We process multiplier bits 2 by 2.
11380957b409SSimon J. Gerraty 	 */
11390957b409SSimon J. Gerraty 	memset(&Q, 0, sizeof Q);
11400957b409SSimon J. Gerraty 	qz = 1;
11410957b409SSimon J. Gerraty 	while (xlen -- > 0) {
11420957b409SSimon J. Gerraty 		int k;
11430957b409SSimon J. Gerraty 
11440957b409SSimon J. Gerraty 		for (k = 6; k >= 0; k -= 2) {
11450957b409SSimon J. Gerraty 			uint32_t bits;
11460957b409SSimon J. Gerraty 			uint32_t bnz;
11470957b409SSimon J. Gerraty 
11480957b409SSimon J. Gerraty 			p256_double(&Q);
11490957b409SSimon J. Gerraty 			p256_double(&Q);
11500957b409SSimon J. Gerraty 			T = *P;
11510957b409SSimon J. Gerraty 			U = Q;
11520957b409SSimon J. Gerraty 			bits = (*x >> k) & (uint32_t)3;
11530957b409SSimon J. Gerraty 			bnz = NEQ(bits, 0);
11540957b409SSimon J. Gerraty 			CCOPY(EQ(bits, 2), &T, &P2, sizeof T);
11550957b409SSimon J. Gerraty 			CCOPY(EQ(bits, 3), &T, &P3, sizeof T);
11560957b409SSimon J. Gerraty 			p256_add(&U, &T);
11570957b409SSimon J. Gerraty 			CCOPY(bnz & qz, &Q, &T, sizeof Q);
11580957b409SSimon J. Gerraty 			CCOPY(bnz & ~qz, &Q, &U, sizeof Q);
11590957b409SSimon J. Gerraty 			qz &= ~bnz;
11600957b409SSimon J. Gerraty 		}
11610957b409SSimon J. Gerraty 		x ++;
11620957b409SSimon J. Gerraty 	}
11630957b409SSimon J. Gerraty 	*P = Q;
11640957b409SSimon J. Gerraty }
11650957b409SSimon J. Gerraty 
11660957b409SSimon J. Gerraty /*
11670957b409SSimon J. Gerraty  * Precomputed window: k*G points, where G is the curve generator, and k
11680957b409SSimon J. Gerraty  * is an integer from 1 to 15 (inclusive). The X and Y coordinates of
11690957b409SSimon J. Gerraty  * the point are encoded as 9 words of 30 bits each (little-endian
11700957b409SSimon J. Gerraty  * order).
11710957b409SSimon J. Gerraty  */
11720957b409SSimon J. Gerraty static const uint32_t Gwin[15][18] = {
11730957b409SSimon J. Gerraty 
11740957b409SSimon J. Gerraty 	{ 0x1898C296, 0x1284E517, 0x1EB33A0F, 0x00DF604B,
11750957b409SSimon J. Gerraty 	  0x2440F277, 0x339B958E, 0x04247F8B, 0x347CB84B,
11760957b409SSimon J. Gerraty 	  0x00006B17, 0x37BF51F5, 0x2ED901A0, 0x3315ECEC,
11770957b409SSimon J. Gerraty 	  0x338CD5DA, 0x0F9E162B, 0x1FAD29F0, 0x27F9B8EE,
11780957b409SSimon J. Gerraty 	  0x10B8BF86, 0x00004FE3 },
11790957b409SSimon J. Gerraty 
11800957b409SSimon J. Gerraty 	{ 0x07669978, 0x182D23F1, 0x3F21B35A, 0x225A789D,
11810957b409SSimon J. Gerraty 	  0x351AC3C0, 0x08E00C12, 0x34F7E8A5, 0x1EC62340,
11820957b409SSimon J. Gerraty 	  0x00007CF2, 0x227873D1, 0x3812DE74, 0x0E982299,
11830957b409SSimon J. Gerraty 	  0x1F6B798F, 0x3430DBBA, 0x366B1A7D, 0x2D040293,
11840957b409SSimon J. Gerraty 	  0x154436E3, 0x00000777 },
11850957b409SSimon J. Gerraty 
11860957b409SSimon J. Gerraty 	{ 0x06E7FD6C, 0x2D05986F, 0x3ADA985F, 0x31ADC87B,
11870957b409SSimon J. Gerraty 	  0x0BF165E6, 0x1FBE5475, 0x30A44C8F, 0x3934698C,
11880957b409SSimon J. Gerraty 	  0x00005ECB, 0x227D5032, 0x29E6C49E, 0x04FB83D9,
11890957b409SSimon J. Gerraty 	  0x0AAC0D8E, 0x24A2ECD8, 0x2C1B3869, 0x0FF7E374,
11900957b409SSimon J. Gerraty 	  0x19031266, 0x00008734 },
11910957b409SSimon J. Gerraty 
11920957b409SSimon J. Gerraty 	{ 0x2B030852, 0x024C0911, 0x05596EF5, 0x07F8B6DE,
11930957b409SSimon J. Gerraty 	  0x262BD003, 0x3779967B, 0x08FBBA02, 0x128D4CB4,
11940957b409SSimon J. Gerraty 	  0x0000E253, 0x184ED8C6, 0x310B08FC, 0x30EE0055,
11950957b409SSimon J. Gerraty 	  0x3F25B0FC, 0x062D764E, 0x3FB97F6A, 0x33CC719D,
11960957b409SSimon J. Gerraty 	  0x15D69318, 0x0000E0F1 },
11970957b409SSimon J. Gerraty 
11980957b409SSimon J. Gerraty 	{ 0x03D033ED, 0x05552837, 0x35BE5242, 0x2320BF47,
11990957b409SSimon J. Gerraty 	  0x268FDFEF, 0x13215821, 0x140D2D78, 0x02DE9454,
12000957b409SSimon J. Gerraty 	  0x00005159, 0x3DA16DA4, 0x0742ED13, 0x0D80888D,
12010957b409SSimon J. Gerraty 	  0x004BC035, 0x0A79260D, 0x06FCDAFE, 0x2727D8AE,
12020957b409SSimon J. Gerraty 	  0x1F6A2412, 0x0000E0C1 },
12030957b409SSimon J. Gerraty 
12040957b409SSimon J. Gerraty 	{ 0x3C2291A9, 0x1AC2ABA4, 0x3B215B4C, 0x131D037A,
12050957b409SSimon J. Gerraty 	  0x17DDE302, 0x0C90B2E2, 0x0602C92D, 0x05CA9DA9,
12060957b409SSimon J. Gerraty 	  0x0000B01A, 0x0FC77FE2, 0x35F1214E, 0x07E16BDF,
12070957b409SSimon J. Gerraty 	  0x003DDC07, 0x2703791C, 0x3038B7EE, 0x3DAD56FE,
12080957b409SSimon J. Gerraty 	  0x041D0C8D, 0x0000E85C },
12090957b409SSimon J. Gerraty 
12100957b409SSimon J. Gerraty 	{ 0x3187B2A3, 0x0018A1C0, 0x00FEF5B3, 0x3E7E2E2A,
12110957b409SSimon J. Gerraty 	  0x01FB607E, 0x2CC199F0, 0x37B4625B, 0x0EDBE82F,
12120957b409SSimon J. Gerraty 	  0x00008E53, 0x01F400B4, 0x15786A1B, 0x3041B21C,
12130957b409SSimon J. Gerraty 	  0x31CD8CF2, 0x35900053, 0x1A7E0E9B, 0x318366D0,
12140957b409SSimon J. Gerraty 	  0x076F780C, 0x000073EB },
12150957b409SSimon J. Gerraty 
12160957b409SSimon J. Gerraty 	{ 0x1B6FB393, 0x13767707, 0x3CE97DBB, 0x348E2603,
12170957b409SSimon J. Gerraty 	  0x354CADC1, 0x09D0B4EA, 0x1B053404, 0x1DE76FBA,
12180957b409SSimon J. Gerraty 	  0x000062D9, 0x0F09957E, 0x295029A8, 0x3E76A78D,
12190957b409SSimon J. Gerraty 	  0x3B547DAE, 0x27CEE0A2, 0x0575DC45, 0x1D8244FF,
12200957b409SSimon J. Gerraty 	  0x332F647A, 0x0000AD5A },
12210957b409SSimon J. Gerraty 
12220957b409SSimon J. Gerraty 	{ 0x10949EE0, 0x1E7A292E, 0x06DF8B3D, 0x02B2E30B,
12230957b409SSimon J. Gerraty 	  0x31F8729E, 0x24E35475, 0x30B71878, 0x35EDBFB7,
12240957b409SSimon J. Gerraty 	  0x0000EA68, 0x0DD048FA, 0x21688929, 0x0DE823FE,
12250957b409SSimon J. Gerraty 	  0x1C53FAA9, 0x0EA0C84D, 0x052A592A, 0x1FCE7870,
12260957b409SSimon J. Gerraty 	  0x11325CB2, 0x00002A27 },
12270957b409SSimon J. Gerraty 
12280957b409SSimon J. Gerraty 	{ 0x04C5723F, 0x30D81A50, 0x048306E4, 0x329B11C7,
12290957b409SSimon J. Gerraty 	  0x223FB545, 0x085347A8, 0x2993E591, 0x1B5ACA8E,
12300957b409SSimon J. Gerraty 	  0x0000CEF6, 0x04AF0773, 0x28D2EEA9, 0x2751EEEC,
12310957b409SSimon J. Gerraty 	  0x037B4A7F, 0x3B4C1059, 0x08F37674, 0x2AE906E1,
12320957b409SSimon J. Gerraty 	  0x18A88A6A, 0x00008786 },
12330957b409SSimon J. Gerraty 
12340957b409SSimon J. Gerraty 	{ 0x34BC21D1, 0x0CCE474D, 0x15048BF4, 0x1D0BB409,
12350957b409SSimon J. Gerraty 	  0x021CDA16, 0x20DE76C3, 0x34C59063, 0x04EDE20E,
12360957b409SSimon J. Gerraty 	  0x00003ED1, 0x282A3740, 0x0BE3BBF3, 0x29889DAE,
12370957b409SSimon J. Gerraty 	  0x03413697, 0x34C68A09, 0x210EBE93, 0x0C8A224C,
12380957b409SSimon J. Gerraty 	  0x0826B331, 0x00009099 },
12390957b409SSimon J. Gerraty 
12400957b409SSimon J. Gerraty 	{ 0x0624E3C4, 0x140317BA, 0x2F82C99D, 0x260C0A2C,
12410957b409SSimon J. Gerraty 	  0x25D55179, 0x194DCC83, 0x3D95E462, 0x356F6A05,
12420957b409SSimon J. Gerraty 	  0x0000741D, 0x0D4481D3, 0x2657FC8B, 0x1BA5CA71,
12430957b409SSimon J. Gerraty 	  0x3AE44B0D, 0x07B1548E, 0x0E0D5522, 0x05FDC567,
12440957b409SSimon J. Gerraty 	  0x2D1AA70E, 0x00000770 },
12450957b409SSimon J. Gerraty 
12460957b409SSimon J. Gerraty 	{ 0x06072C01, 0x23857675, 0x1EAD58A9, 0x0B8A12D9,
12470957b409SSimon J. Gerraty 	  0x1EE2FC79, 0x0177CB61, 0x0495A618, 0x20DEB82B,
12480957b409SSimon J. Gerraty 	  0x0000177C, 0x2FC7BFD8, 0x310EEF8B, 0x1FB4DF39,
12490957b409SSimon J. Gerraty 	  0x3B8530E8, 0x0F4E7226, 0x0246B6D0, 0x2A558A24,
12500957b409SSimon J. Gerraty 	  0x163353AF, 0x000063BB },
12510957b409SSimon J. Gerraty 
12520957b409SSimon J. Gerraty 	{ 0x24D2920B, 0x1C249DCC, 0x2069C5E5, 0x09AB2F9E,
12530957b409SSimon J. Gerraty 	  0x36DF3CF1, 0x1991FD0C, 0x062B97A7, 0x1E80070E,
12540957b409SSimon J. Gerraty 	  0x000054E7, 0x20D0B375, 0x2E9F20BD, 0x35090081,
12550957b409SSimon J. Gerraty 	  0x1C7A9DDC, 0x22E7C371, 0x087E3016, 0x03175421,
12560957b409SSimon J. Gerraty 	  0x3C6ECA7D, 0x0000F599 },
12570957b409SSimon J. Gerraty 
12580957b409SSimon J. Gerraty 	{ 0x259B9D5F, 0x0D9A318F, 0x23A0EF16, 0x00EBE4B7,
12590957b409SSimon J. Gerraty 	  0x088265AE, 0x2CDE2666, 0x2BAE7ADF, 0x1371A5C6,
12600957b409SSimon J. Gerraty 	  0x0000F045, 0x0D034F36, 0x1F967378, 0x1B5FA3F4,
12610957b409SSimon J. Gerraty 	  0x0EC8739D, 0x1643E62A, 0x1653947E, 0x22D1F4E6,
12620957b409SSimon J. Gerraty 	  0x0FB8D64B, 0x0000B5B9 }
12630957b409SSimon J. Gerraty };
12640957b409SSimon J. Gerraty 
12650957b409SSimon J. Gerraty /*
12660957b409SSimon J. Gerraty  * Lookup one of the Gwin[] values, by index. This is constant-time.
12670957b409SSimon J. Gerraty  */
12680957b409SSimon J. Gerraty static void
lookup_Gwin(p256_jacobian * T,uint32_t idx)12690957b409SSimon J. Gerraty lookup_Gwin(p256_jacobian *T, uint32_t idx)
12700957b409SSimon J. Gerraty {
12710957b409SSimon J. Gerraty 	uint32_t xy[18];
12720957b409SSimon J. Gerraty 	uint32_t k;
12730957b409SSimon J. Gerraty 	size_t u;
12740957b409SSimon J. Gerraty 
12750957b409SSimon J. Gerraty 	memset(xy, 0, sizeof xy);
12760957b409SSimon J. Gerraty 	for (k = 0; k < 15; k ++) {
12770957b409SSimon J. Gerraty 		uint32_t m;
12780957b409SSimon J. Gerraty 
12790957b409SSimon J. Gerraty 		m = -EQ(idx, k + 1);
12800957b409SSimon J. Gerraty 		for (u = 0; u < 18; u ++) {
12810957b409SSimon J. Gerraty 			xy[u] |= m & Gwin[k][u];
12820957b409SSimon J. Gerraty 		}
12830957b409SSimon J. Gerraty 	}
12840957b409SSimon J. Gerraty 	memcpy(T->x, &xy[0], sizeof T->x);
12850957b409SSimon J. Gerraty 	memcpy(T->y, &xy[9], sizeof T->y);
12860957b409SSimon J. Gerraty 	memset(T->z, 0, sizeof T->z);
12870957b409SSimon J. Gerraty 	T->z[0] = 1;
12880957b409SSimon J. Gerraty }
12890957b409SSimon J. Gerraty 
12900957b409SSimon J. Gerraty /*
12910957b409SSimon J. Gerraty  * Multiply the generator by an integer. The integer is assumed non-zero
12920957b409SSimon J. Gerraty  * and lower than the curve order.
12930957b409SSimon J. Gerraty  */
12940957b409SSimon J. Gerraty static void
p256_mulgen(p256_jacobian * P,const unsigned char * x,size_t xlen)12950957b409SSimon J. Gerraty p256_mulgen(p256_jacobian *P, const unsigned char *x, size_t xlen)
12960957b409SSimon J. Gerraty {
12970957b409SSimon J. Gerraty 	/*
12980957b409SSimon J. Gerraty 	 * qz is a flag that is initially 1, and remains equal to 1
12990957b409SSimon J. Gerraty 	 * as long as the point is the point at infinity.
13000957b409SSimon J. Gerraty 	 *
13010957b409SSimon J. Gerraty 	 * We use a 4-bit window to handle multiplier bits by groups
13020957b409SSimon J. Gerraty 	 * of 4. The precomputed window is constant static data, with
13030957b409SSimon J. Gerraty 	 * points in affine coordinates; we use a constant-time lookup.
13040957b409SSimon J. Gerraty 	 */
13050957b409SSimon J. Gerraty 	p256_jacobian Q;
13060957b409SSimon J. Gerraty 	uint32_t qz;
13070957b409SSimon J. Gerraty 
13080957b409SSimon J. Gerraty 	memset(&Q, 0, sizeof Q);
13090957b409SSimon J. Gerraty 	qz = 1;
13100957b409SSimon J. Gerraty 	while (xlen -- > 0) {
13110957b409SSimon J. Gerraty 		int k;
13120957b409SSimon J. Gerraty 		unsigned bx;
13130957b409SSimon J. Gerraty 
13140957b409SSimon J. Gerraty 		bx = *x ++;
13150957b409SSimon J. Gerraty 		for (k = 0; k < 2; k ++) {
13160957b409SSimon J. Gerraty 			uint32_t bits;
13170957b409SSimon J. Gerraty 			uint32_t bnz;
13180957b409SSimon J. Gerraty 			p256_jacobian T, U;
13190957b409SSimon J. Gerraty 
13200957b409SSimon J. Gerraty 			p256_double(&Q);
13210957b409SSimon J. Gerraty 			p256_double(&Q);
13220957b409SSimon J. Gerraty 			p256_double(&Q);
13230957b409SSimon J. Gerraty 			p256_double(&Q);
13240957b409SSimon J. Gerraty 			bits = (bx >> 4) & 0x0F;
13250957b409SSimon J. Gerraty 			bnz = NEQ(bits, 0);
13260957b409SSimon J. Gerraty 			lookup_Gwin(&T, bits);
13270957b409SSimon J. Gerraty 			U = Q;
13280957b409SSimon J. Gerraty 			p256_add_mixed(&U, &T);
13290957b409SSimon J. Gerraty 			CCOPY(bnz & qz, &Q, &T, sizeof Q);
13300957b409SSimon J. Gerraty 			CCOPY(bnz & ~qz, &Q, &U, sizeof Q);
13310957b409SSimon J. Gerraty 			qz &= ~bnz;
13320957b409SSimon J. Gerraty 			bx <<= 4;
13330957b409SSimon J. Gerraty 		}
13340957b409SSimon J. Gerraty 	}
13350957b409SSimon J. Gerraty 	*P = Q;
13360957b409SSimon J. Gerraty }
13370957b409SSimon J. Gerraty 
13380957b409SSimon J. Gerraty static const unsigned char P256_G[] = {
13390957b409SSimon J. Gerraty 	0x04, 0x6B, 0x17, 0xD1, 0xF2, 0xE1, 0x2C, 0x42, 0x47, 0xF8,
13400957b409SSimon J. Gerraty 	0xBC, 0xE6, 0xE5, 0x63, 0xA4, 0x40, 0xF2, 0x77, 0x03, 0x7D,
13410957b409SSimon J. Gerraty 	0x81, 0x2D, 0xEB, 0x33, 0xA0, 0xF4, 0xA1, 0x39, 0x45, 0xD8,
13420957b409SSimon J. Gerraty 	0x98, 0xC2, 0x96, 0x4F, 0xE3, 0x42, 0xE2, 0xFE, 0x1A, 0x7F,
13430957b409SSimon J. Gerraty 	0x9B, 0x8E, 0xE7, 0xEB, 0x4A, 0x7C, 0x0F, 0x9E, 0x16, 0x2B,
13440957b409SSimon J. Gerraty 	0xCE, 0x33, 0x57, 0x6B, 0x31, 0x5E, 0xCE, 0xCB, 0xB6, 0x40,
13450957b409SSimon J. Gerraty 	0x68, 0x37, 0xBF, 0x51, 0xF5
13460957b409SSimon J. Gerraty };
13470957b409SSimon J. Gerraty 
13480957b409SSimon J. Gerraty static const unsigned char P256_N[] = {
13490957b409SSimon J. Gerraty 	0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF,
13500957b409SSimon J. Gerraty 	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xBC, 0xE6, 0xFA, 0xAD,
13510957b409SSimon J. Gerraty 	0xA7, 0x17, 0x9E, 0x84, 0xF3, 0xB9, 0xCA, 0xC2, 0xFC, 0x63,
13520957b409SSimon J. Gerraty 	0x25, 0x51
13530957b409SSimon J. Gerraty };
13540957b409SSimon J. Gerraty 
13550957b409SSimon J. Gerraty static const unsigned char *
api_generator(int curve,size_t * len)13560957b409SSimon J. Gerraty api_generator(int curve, size_t *len)
13570957b409SSimon J. Gerraty {
13580957b409SSimon J. Gerraty 	(void)curve;
13590957b409SSimon J. Gerraty 	*len = sizeof P256_G;
13600957b409SSimon J. Gerraty 	return P256_G;
13610957b409SSimon J. Gerraty }
13620957b409SSimon J. Gerraty 
13630957b409SSimon J. Gerraty static const unsigned char *
api_order(int curve,size_t * len)13640957b409SSimon J. Gerraty api_order(int curve, size_t *len)
13650957b409SSimon J. Gerraty {
13660957b409SSimon J. Gerraty 	(void)curve;
13670957b409SSimon J. Gerraty 	*len = sizeof P256_N;
13680957b409SSimon J. Gerraty 	return P256_N;
13690957b409SSimon J. Gerraty }
13700957b409SSimon J. Gerraty 
13710957b409SSimon J. Gerraty static size_t
api_xoff(int curve,size_t * len)13720957b409SSimon J. Gerraty api_xoff(int curve, size_t *len)
13730957b409SSimon J. Gerraty {
13740957b409SSimon J. Gerraty 	(void)curve;
13750957b409SSimon J. Gerraty 	*len = 32;
13760957b409SSimon J. Gerraty 	return 1;
13770957b409SSimon J. Gerraty }
13780957b409SSimon J. Gerraty 
13790957b409SSimon J. Gerraty static uint32_t
api_mul(unsigned char * G,size_t Glen,const unsigned char * x,size_t xlen,int curve)13800957b409SSimon J. Gerraty api_mul(unsigned char *G, size_t Glen,
13810957b409SSimon J. Gerraty 	const unsigned char *x, size_t xlen, int curve)
13820957b409SSimon J. Gerraty {
13830957b409SSimon J. Gerraty 	uint32_t r;
13840957b409SSimon J. Gerraty 	p256_jacobian P;
13850957b409SSimon J. Gerraty 
13860957b409SSimon J. Gerraty 	(void)curve;
1387*cc9e6590SSimon J. Gerraty 	if (Glen != 65) {
1388*cc9e6590SSimon J. Gerraty 		return 0;
1389*cc9e6590SSimon J. Gerraty 	}
13900957b409SSimon J. Gerraty 	r = p256_decode(&P, G, Glen);
13910957b409SSimon J. Gerraty 	p256_mul(&P, x, xlen);
13920957b409SSimon J. Gerraty 	p256_to_affine(&P);
13930957b409SSimon J. Gerraty 	p256_encode(G, &P);
13940957b409SSimon J. Gerraty 	return r;
13950957b409SSimon J. Gerraty }
13960957b409SSimon J. Gerraty 
13970957b409SSimon J. Gerraty static size_t
api_mulgen(unsigned char * R,const unsigned char * x,size_t xlen,int curve)13980957b409SSimon J. Gerraty api_mulgen(unsigned char *R,
13990957b409SSimon J. Gerraty 	const unsigned char *x, size_t xlen, int curve)
14000957b409SSimon J. Gerraty {
14010957b409SSimon J. Gerraty 	p256_jacobian P;
14020957b409SSimon J. Gerraty 
14030957b409SSimon J. Gerraty 	(void)curve;
14040957b409SSimon J. Gerraty 	p256_mulgen(&P, x, xlen);
14050957b409SSimon J. Gerraty 	p256_to_affine(&P);
14060957b409SSimon J. Gerraty 	p256_encode(R, &P);
14070957b409SSimon J. Gerraty 	return 65;
14080957b409SSimon J. Gerraty }
14090957b409SSimon J. Gerraty 
14100957b409SSimon J. Gerraty static uint32_t
api_muladd(unsigned char * A,const unsigned char * B,size_t len,const unsigned char * x,size_t xlen,const unsigned char * y,size_t ylen,int curve)14110957b409SSimon J. Gerraty api_muladd(unsigned char *A, const unsigned char *B, size_t len,
14120957b409SSimon J. Gerraty 	const unsigned char *x, size_t xlen,
14130957b409SSimon J. Gerraty 	const unsigned char *y, size_t ylen, int curve)
14140957b409SSimon J. Gerraty {
14150957b409SSimon J. Gerraty 	p256_jacobian P, Q;
14160957b409SSimon J. Gerraty 	uint32_t r, t, z;
14170957b409SSimon J. Gerraty 	int i;
14180957b409SSimon J. Gerraty 
14190957b409SSimon J. Gerraty 	(void)curve;
1420*cc9e6590SSimon J. Gerraty 	if (len != 65) {
1421*cc9e6590SSimon J. Gerraty 		return 0;
1422*cc9e6590SSimon J. Gerraty 	}
14230957b409SSimon J. Gerraty 	r = p256_decode(&P, A, len);
14240957b409SSimon J. Gerraty 	p256_mul(&P, x, xlen);
14250957b409SSimon J. Gerraty 	if (B == NULL) {
14260957b409SSimon J. Gerraty 		p256_mulgen(&Q, y, ylen);
14270957b409SSimon J. Gerraty 	} else {
14280957b409SSimon J. Gerraty 		r &= p256_decode(&Q, B, len);
14290957b409SSimon J. Gerraty 		p256_mul(&Q, y, ylen);
14300957b409SSimon J. Gerraty 	}
14310957b409SSimon J. Gerraty 
14320957b409SSimon J. Gerraty 	/*
14330957b409SSimon J. Gerraty 	 * The final addition may fail in case both points are equal.
14340957b409SSimon J. Gerraty 	 */
14350957b409SSimon J. Gerraty 	t = p256_add(&P, &Q);
14360957b409SSimon J. Gerraty 	reduce_final_f256(P.z);
14370957b409SSimon J. Gerraty 	z = 0;
14380957b409SSimon J. Gerraty 	for (i = 0; i < 9; i ++) {
14390957b409SSimon J. Gerraty 		z |= P.z[i];
14400957b409SSimon J. Gerraty 	}
14410957b409SSimon J. Gerraty 	z = EQ(z, 0);
14420957b409SSimon J. Gerraty 	p256_double(&Q);
14430957b409SSimon J. Gerraty 
14440957b409SSimon J. Gerraty 	/*
14450957b409SSimon J. Gerraty 	 * If z is 1 then either P+Q = 0 (t = 1) or P = Q (t = 0). So we
14460957b409SSimon J. Gerraty 	 * have the following:
14470957b409SSimon J. Gerraty 	 *
14480957b409SSimon J. Gerraty 	 *   z = 0, t = 0   return P (normal addition)
14490957b409SSimon J. Gerraty 	 *   z = 0, t = 1   return P (normal addition)
14500957b409SSimon J. Gerraty 	 *   z = 1, t = 0   return Q (a 'double' case)
14510957b409SSimon J. Gerraty 	 *   z = 1, t = 1   report an error (P+Q = 0)
14520957b409SSimon J. Gerraty 	 */
14530957b409SSimon J. Gerraty 	CCOPY(z & ~t, &P, &Q, sizeof Q);
14540957b409SSimon J. Gerraty 	p256_to_affine(&P);
14550957b409SSimon J. Gerraty 	p256_encode(A, &P);
14560957b409SSimon J. Gerraty 	r &= ~(z & t);
14570957b409SSimon J. Gerraty 	return r;
14580957b409SSimon J. Gerraty }
14590957b409SSimon J. Gerraty 
14600957b409SSimon J. Gerraty /* see bearssl_ec.h */
14610957b409SSimon J. Gerraty const br_ec_impl br_ec_p256_m31 = {
14620957b409SSimon J. Gerraty 	(uint32_t)0x00800000,
14630957b409SSimon J. Gerraty 	&api_generator,
14640957b409SSimon J. Gerraty 	&api_order,
14650957b409SSimon J. Gerraty 	&api_xoff,
14660957b409SSimon J. Gerraty 	&api_mul,
14670957b409SSimon J. Gerraty 	&api_mulgen,
14680957b409SSimon J. Gerraty 	&api_muladd
14690957b409SSimon J. Gerraty };
1470