ec_p256_m15.c - OpenGrok cross reference for /freebsd/contrib/bearssl/src/ec/ec_p256

Lines Matching +full:40 +full:- +full:bit
29  * that right-shifting a signed negative integer copies the sign bit
30  * (arithmetic right-shift). This is "implementation-defined behaviour",
39                     | ((-((uint32_t)(x) >> 31)) << (32 - (n))))
45  * Convert an integer from unsigned big-endian encoding to a sequence of
46  * 13-bit words in little-endian order. The final "partial" word is
57 	while (len -- > 0) {  in be8_to_le13()
63 			acc_len -= 13;  in be8_to_le13()
70  * Convert an integer (13-bit words, little-endian) to unsigned
71  * big-endian encoding. The total encoding length is provided; all
82 	while (len -- > 0) {  in le13_to_be8()
89 		acc_len -= 8;  in le13_to_be8()
116  * mul20() multiplies two 260-bit integers together. Each word must fit
118  * receives 40 words. All overlaps allowed.
120  * square20() computes the square of a 260-bit integer. Each word must
122  * receives 40 words. All overlaps allowed.
131 	 * Two-level Karatsuba: turns a 20x20 multiplication into  in mul20()
132 	 * nine 5x5 multiplications. We use 13-bit words but do not  in mul20()
135 	 *  - First Karatsuba decomposition turns the 20x20 mul on  in mul20()
136 	 *    13-bit words into three 10x10 muls, two on 13-bit words  in mul20()
137 	 *    and one on 14-bit words.  in mul20()
139 	 *  - Second Karatsuba decomposition further splits these into:  in mul20()
141 	 *     * four 5x5 muls on 13-bit words  in mul20()
142 	 *     * four 5x5 muls on 14-bit words  in mul20()
143 	 *     * one 5x5 mul on 15-bit words  in mul20()
145 	 * Highest word value is 8191, 16382 or 32764, for 13-bit, 14-bit  in mul20()
146 	 * or 15-bit words, respectively.  in mul20()
174 		(dw)[5 * (d_off) + 0] -= (s1w)[5 * (s1_off) + 0] \  in mul20()
176 		(dw)[5 * (d_off) + 1] -= (s1w)[5 * (s1_off) + 1] \  in mul20()
178 		(dw)[5 * (d_off) + 2] -= (s1w)[5 * (s1_off) + 2] \  in mul20()
180 		(dw)[5 * (d_off) + 3] -= (s1w)[5 * (s1_off) + 3] \  in mul20()
182 		(dw)[5 * (d_off) + 4] -= (s1w)[5 * (s1_off) + 4] \  in mul20()
223 	 * each, so we can add product results together "as is" in 32-bit  in mul20()
226 	for (i = 0; i < 40; i += 5) {  in mul20()
267 	w[80 + 0] = MUL15(u[40 + 0], v[40 + 0]);  in mul20()
268 	w[80 + 1] = MUL15(u[40 + 0], v[40 + 1])  in mul20()
269 		+ MUL15(u[40 + 1], v[40 + 0]);  in mul20()
270 	w[80 + 2] = MUL15(u[40 + 0], v[40 + 2])  in mul20()
271 		+ MUL15(u[40 + 1], v[40 + 1])  in mul20()
272 		+ MUL15(u[40 + 2], v[40 + 0]);  in mul20()
273 	w[80 + 3] = MUL15(u[40 + 0], v[40 + 3])  in mul20()
274 		+ MUL15(u[40 + 1], v[40 + 2])  in mul20()
275 		+ MUL15(u[40 + 2], v[40 + 1])  in mul20()
276 		+ MUL15(u[40 + 3], v[40 + 0]);  in mul20()
277 	w[80 + 4] = MUL15(u[40 + 0], v[40 + 4])  in mul20()
278 		+ MUL15(u[40 + 1], v[40 + 3])  in mul20()
279 		+ MUL15(u[40 + 2], v[40 + 2])  in mul20()
280 		+ MUL15(u[40 + 3], v[40 + 1]);  in mul20()
281 		/* + MUL15(u[40 + 4], v[40 + 0]) */  in mul20()
282 	w[80 + 5] = MUL15(u[40 + 1], v[40 + 4])  in mul20()
283 		+ MUL15(u[40 + 2], v[40 + 3])  in mul20()
284 		+ MUL15(u[40 + 3], v[40 + 2])  in mul20()
285 		+ MUL15(u[40 + 4], v[40 + 1]);  in mul20()
286 	w[80 + 6] = MUL15(u[40 + 2], v[40 + 4])  in mul20()
287 		+ MUL15(u[40 + 3], v[40 + 3])  in mul20()
288 		+ MUL15(u[40 + 4], v[40 + 2]);  in mul20()
289 	w[80 + 7] = MUL15(u[40 + 3], v[40 + 4])  in mul20()
290 		+ MUL15(u[40 + 4], v[40 + 3]);  in mul20()
291 	w[80 + 8] = MUL15(u[40 + 4], v[40 + 4]);  in mul20()
295 	w[80 + 4] += MUL15(u[40 + 4], v[40 + 0]);  in mul20()
298 	 * The products on 14-bit words in slots 6 and 7 yield values  in mul20()
301 	 * in a _signed_ 32-bit integer, i.e. 31 bits + a sign bit.  in mul20()
303 	 * bit of reduction here.  in mul20()
330 	/* first-level recomposition */  in mul20()
343 	cc = norm13(d, w, 40);  in mul20()
990  * Modulus for field F256 (field for point coordinates in curve P-256).
999  * The 'b' curve equation coefficient for P-256.
1008  * Perform a "short reduction" in field F256 (field for curve P-256).
1020 	d[14] -= x << 10;  in reduce_f256()
1021 	d[7] -= x << 5;  in reduce_f256()
1027  * Perform a "final reduction" in field F256 (field for curve P-256).
1045 		w = t[i] - F256[i] - cc;  in reduce_final_f256()
1056  * 2^256-2^224+2^192+2^96-1 (for NIST curve P-256). Operands are arrays
1057  * of 20 words, each containing 13 bits of data, in little-endian order.
1058  * On input, upper word may be up to 13 bits (hence value up to 2^260-1);
1064 	uint32_t t[40], cc;  in mul_f256()
1078 	 *    p = 2^256 - 2^224 + 2^192 + 2^96 - 1  in mul_f256()
1080 	 *    2^256 = 2^224 - 2^192 - 2^96 + 1 mod p  in mul_f256()
1082 	 * For a word x at bit offset n (n >= 256), we have:  in mul_f256()
1083 	 *    x*2^n = x*2^(n-32) - x*2^(n-64)  in mul_f256()
1084 	 *            - x*2^(n - 160) + x*2^(n-256) mod p  in mul_f256()
1089 	for (i = 39; i >= 20; i --) {  in mul_f256()
1093 		t[i - 2] += ARSH(x, 6);  in mul_f256()
1094 		t[i - 3] += (x << 7) & 0x1FFF;  in mul_f256()
1095 		t[i - 4] -= ARSH(x, 12);  in mul_f256()
1096 		t[i - 5] -= (x << 1) & 0x1FFF;  in mul_f256()
1097 		t[i - 12] -= ARSH(x, 4);  in mul_f256()
1098 		t[i - 13] -= (x << 9) & 0x1FFF;  in mul_f256()
1099 		t[i - 19] += ARSH(x, 9);  in mul_f256()
1100 		t[i - 20] += (x << 4) & 0x1FFF;  in mul_f256()
1106 	 * but not two much: worst case is the chain involving t[i - 3],  in mul_f256()
1108 	 * starting values are 13-bit each, all words fit on 20 bits  in mul_f256()
1109 	 * (21 to account for the sign bit).  in mul_f256()
1116 	 * bits, and the values fit on 21 bits, values fit in 32-bit words,  in mul_f256()
1122 	t[14] -= cc << 10;  in mul_f256()
1123 	t[7] -= cc << 5;  in mul_f256()
1135 	t[0] -= cc;  in mul_f256()
1138 	t[17] -= cc << 3;  in mul_f256()
1145  * Square an integer modulo 2^256-2^224+2^192+2^96-1 (for NIST curve
1146  * P-256). Operand is an array of 20 words, each containing 13 bits of
1147  * data, in little-endian order. On input, upper word may be up to 13
1148  * bits (hence value up to 2^260-1); on output, value fits on 257 bits
1154 	uint32_t t[40], cc;  in square_f256()
1167 	 *    p = 2^256 - 2^224 + 2^192 + 2^96 - 1  in square_f256()
1169 	 *    2^256 = 2^224 - 2^192 - 2^96 + 1 mod p  in square_f256()
1171 	 * For a word x at bit offset n (n >= 256), we have:  in square_f256()
1172 	 *    x*2^n = x*2^(n-32) - x*2^(n-64)  in square_f256()
1173 	 *            - x*2^(n - 160) + x*2^(n-256) mod p  in square_f256()
1178 	for (i = 39; i >= 20; i --) {  in square_f256()
1182 		t[i - 2] += ARSH(x, 6);  in square_f256()
1183 		t[i - 3] += (x << 7) & 0x1FFF;  in square_f256()
1184 		t[i - 4] -= ARSH(x, 12);  in square_f256()
1185 		t[i - 5] -= (x << 1) & 0x1FFF;  in square_f256()
1186 		t[i - 12] -= ARSH(x, 4);  in square_f256()
1187 		t[i - 13] -= (x << 9) & 0x1FFF;  in square_f256()
1188 		t[i - 19] += ARSH(x, 9);  in square_f256()
1189 		t[i - 20] += (x << 4) & 0x1FFF;  in square_f256()
1195 	 * but not two much: worst case is the chain involving t[i - 3],  in square_f256()
1197 	 * starting values are 13-bit each, all words fit on 20 bits  in square_f256()
1198 	 * (21 to account for the sign bit).  in square_f256()
1205 	 * bits, and the values fit on 21 bits, values fit in 32-bit words,  in square_f256()
1211 	t[14] -= cc << 10;  in square_f256()
1212 	t[7] -= cc << 5;  in square_f256()
1224 	t[0] -= cc;  in square_f256()
1227 	t[17] -= cc << 3;  in square_f256()
1234  * Jacobian coordinates for a point in P-256: affine coordinates (X,Y)
1241  * Coordinates are represented in arrays of 32-bit integers, each holding
1253  *  - If the point is the point at infinity, then all three coordinates
1255  *  - Otherwise, the 'z' coordinate is set to 1, and the 'x' and 'y'
1267 	 * p = 2^256 - 2^224 + 2^192 + 2^96 - 1, and the exponent is  in p256_to_affine()
1268 	 * p-2. Exponent bit pattern (from high to low) is:  in p256_to_affine()
1269 	 *  - 32 bits of value 1  in p256_to_affine()
1270 	 *  - 31 bits of value 0  in p256_to_affine()
1271 	 *  - 1 bit of value 1  in p256_to_affine()
1272 	 *  - 96 bits of value 0  in p256_to_affine()
1273 	 *  - 94 bits of value 1  in p256_to_affine()
1274 	 *  - 1 bit of value 0  in p256_to_affine()
1275 	 *  - 1 bit of value 1  in p256_to_affine()
1276 	 * Thus, we precompute z^(2^31-1) to speed things up.  in p256_to_affine()
1284 	 * A simple square-and-multiply for z^(2^31-1). We could save about  in p256_to_affine()
1286 	 * this would require a bit more code, and extra stack buffers.  in p256_to_affine()
1288 	memcpy(t1, P->z, sizeof P->z);  in p256_to_affine()
1291 		mul_f256(t1, t1, P->z);  in p256_to_affine()
1295 	 * Square-and-multiply. Apart from the squarings, we have a few  in p256_to_affine()
1297 	 * for setting 1 bit, and by t1 for setting 31 bits.  in p256_to_affine()
1299 	memcpy(t2, P->z, sizeof P->z);  in p256_to_affine()
1312 			mul_f256(t2, t2, P->z);  in p256_to_affine()
1321 	mul_f256(P->x, t1, P->x);  in p256_to_affine()
1323 	mul_f256(P->y, t1, P->y);  in p256_to_affine()
1324 	reduce_final_f256(P->x);  in p256_to_affine()
1325 	reduce_final_f256(P->y);  in p256_to_affine()
1331 	mul_f256(P->z, P->z, t2);  in p256_to_affine()
1332 	reduce_final_f256(P->z);  in p256_to_affine()
1336  * Double a point in P-256. This function works for all valid points,
1346 	 *   m = 3*(x + z^2)*(x - z^2)  in p256_double()
1347 	 *   x' = m^2 - 2*s  in p256_double()
1348 	 *   y' = m*(s - x') - 8*y^4  in p256_double()
1353 	 *   - If y = 0 then z' = 0. But there is no such point in P-256  in p256_double()
1355 	 *   - If z = 0 then z' = 0.  in p256_double()
1363 	square_f256(t1, Q->z);  in p256_double()
1366 	 * Compute x-z^2 in t2 and x+z^2 in t1.  in p256_double()
1369 		t2[i] = (F256[i] << 1) + Q->x[i] - t1[i];  in p256_double()
1370 		t1[i] += Q->x[i];  in p256_double()
1376 	 * Compute 3*(x+z^2)*(x-z^2) in t1.  in p256_double()
1387 	square_f256(t3, Q->y);  in p256_double()
1392 	mul_f256(t2, Q->x, t3);  in p256_double()
1400 	 * Compute x' = m^2 - 2*s.  in p256_double()
1402 	square_f256(Q->x, t1);  in p256_double()
1404 		Q->x[i] += (F256[i] << 2) - (t2[i] << 1);  in p256_double()
1406 	norm13(Q->x, Q->x, 20);  in p256_double()
1407 	reduce_f256(Q->x);  in p256_double()
1412 	mul_f256(t4, Q->y, Q->z);  in p256_double()
1414 		Q->z[i] = t4[i] << 1;  in p256_double()
1416 	norm13(Q->z, Q->z, 20);  in p256_double()
1417 	reduce_f256(Q->z);  in p256_double()
1420 	 * Compute y' = m*(s - x') - 8*y^4. Note that we already have  in p256_double()
1424 		t2[i] += (F256[i] << 1) - Q->x[i];  in p256_double()
1427 	mul_f256(Q->y, t1, t2);  in p256_double()
1430 		Q->y[i] += (F256[i] << 2) - (t4[i] << 1);  in p256_double()
1432 	norm13(Q->y, Q->y, 20);  in p256_double()
1433 	reduce_f256(Q->y);  in p256_double()
1441  *   - If P1 == 0 but P2 != 0
1442  *   - If P1 != 0 but P2 == 0
1443  *   - If P1 == P2
1449  *   - P1 and P2 have the same Y coordinate
1450  *   - P1 == 0 and P2 == 0
1451  *   - The Y coordinate of one of the points is 0 and the other point is
1456  * curve P-256.
1461  *   - If the result is not the point at infinity, then it is correct.
1462  *   - Otherwise, if the returned value is 1, then this is a case of
1464  *   - Otherwise, P1 == P2, so a "double" operation should have been
1477 	 *   h = u2 - u1  in p256_add()
1478 	 *   r = s2 - s1  in p256_add()
1479 	 *   x3 = r^2 - h^3 - 2 * u1 * h^2  in p256_add()
1480 	 *   y3 = r * (u1 * h^2 - x3) - s1 * h^3  in p256_add()
1490 	square_f256(t3, P2->z);  in p256_add()
1491 	mul_f256(t1, P1->x, t3);  in p256_add()
1492 	mul_f256(t4, P2->z, t3);  in p256_add()
1493 	mul_f256(t3, P1->y, t4);  in p256_add()
1498 	square_f256(t4, P1->z);  in p256_add()
1499 	mul_f256(t2, P2->x, t4);  in p256_add()
1500 	mul_f256(t5, P1->z, t4);  in p256_add()
1501 	mul_f256(t4, P2->y, t5);  in p256_add()
1504 	 * Compute h = h2 - u1 (in t2) and r = s2 - s1 (in t4).  in p256_add()
1509 		t2[i] += (F256[i] << 1) - t1[i];  in p256_add()
1510 		t4[i] += (F256[i] << 1) - t3[i];  in p256_add()
1520 	ret = (ret | -ret) >> 31;  in p256_add()
1530 	 * Compute x3 = r^2 - h^3 - 2*u1*h^2.  in p256_add()
1532 	square_f256(P1->x, t4);  in p256_add()
1534 		P1->x[i] += (F256[i] << 3) - t5[i] - (t6[i] << 1);  in p256_add()
1536 	norm13(P1->x, P1->x, 20);  in p256_add()
1537 	reduce_f256(P1->x);  in p256_add()
1540 	 * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.  in p256_add()
1543 		t6[i] += (F256[i] << 1) - P1->x[i];  in p256_add()
1546 	mul_f256(P1->y, t4, t6);  in p256_add()
1549 		P1->y[i] += (F256[i] << 1) - t1[i];  in p256_add()
1551 	norm13(P1->y, P1->y, 20);  in p256_add()
1552 	reduce_f256(P1->y);  in p256_add()
1557 	mul_f256(t1, P1->z, P2->z);  in p256_add()
1558 	mul_f256(P1->z, t1, t2);  in p256_add()
1565  * case when P2 is a non-zero point in affine coordinate.
1569  *   - If P1 == 0
1570  *   - If P1 == P2
1576  *   - P1 and P2 have the same Y coordinate
1577  *   - The Y coordinate of P2 is 0 and P1 is the point at infinity.
1581  * curve P-256.
1586  *   - If the result is not the point at infinity, then it is correct.
1587  *   - Otherwise, if the returned value is 1, then this is a case of
1589  *   - Otherwise, P1 == P2, so a "double" operation should have been
1602 	 *   h = u2 - u1  in p256_add_mixed()
1603 	 *   r = s2 - s1  in p256_add_mixed()
1604 	 *   x3 = r^2 - h^3 - 2 * u1 * h^2  in p256_add_mixed()
1605 	 *   y3 = r * (u1 * h^2 - x3) - s1 * h^3  in p256_add_mixed()
1615 	memcpy(t1, P1->x, sizeof t1);  in p256_add_mixed()
1616 	memcpy(t3, P1->y, sizeof t3);  in p256_add_mixed()
1621 	square_f256(t4, P1->z);  in p256_add_mixed()
1622 	mul_f256(t2, P2->x, t4);  in p256_add_mixed()
1623 	mul_f256(t5, P1->z, t4);  in p256_add_mixed()
1624 	mul_f256(t4, P2->y, t5);  in p256_add_mixed()
1627 	 * Compute h = h2 - u1 (in t2) and r = s2 - s1 (in t4).  in p256_add_mixed()
1632 		t2[i] += (F256[i] << 1) - t1[i];  in p256_add_mixed()
1633 		t4[i] += (F256[i] << 1) - t3[i];  in p256_add_mixed()
1643 	ret = (ret | -ret) >> 31;  in p256_add_mixed()
1653 	 * Compute x3 = r^2 - h^3 - 2*u1*h^2.  in p256_add_mixed()
1655 	square_f256(P1->x, t4);  in p256_add_mixed()
1657 		P1->x[i] += (F256[i] << 3) - t5[i] - (t6[i] << 1);  in p256_add_mixed()
1659 	norm13(P1->x, P1->x, 20);  in p256_add_mixed()
1660 	reduce_f256(P1->x);  in p256_add_mixed()
1663 	 * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.  in p256_add_mixed()
1666 		t6[i] += (F256[i] << 1) - P1->x[i];  in p256_add_mixed()
1669 	mul_f256(P1->y, t4, t6);  in p256_add_mixed()
1672 		P1->y[i] += (F256[i] << 1) - t1[i];  in p256_add_mixed()
1674 	norm13(P1->y, P1->y, 20);  in p256_add_mixed()
1675 	reduce_f256(P1->y);  in p256_add_mixed()
1680 	mul_f256(P1->z, P1->z, t2);  in p256_add_mixed()
1686  * Decode a P-256 point. This function does not support the point at
1705 	 * least significant bit of the Y coordinate), but it is explicitly  in p256_decode()
1726 		t1[i] += (F256[i] << 3) - MUL15(3, tx[i]) + P256_B[i] - t2[i];  in p256_decode()
1738 	memcpy(P->x, tx, sizeof tx);  in p256_decode()
1739 	memcpy(P->y, ty, sizeof ty);  in p256_decode()
1740 	memset(P->z, 0, sizeof P->z);  in p256_decode()
1741 	P->z[0] = 1;  in p256_decode()
1756 	le13_to_be8(buf + 1, 32, P->x);  in p256_encode()
1757 	le13_to_be8(buf + 33, 32, P->y);  in p256_encode()
1772 	 * We use a 2-bit window to handle multiplier bits by pairs.  in p256_mul()
1791 	while (xlen -- > 0) {  in p256_mul()
1794 		for (k = 6; k >= 0; k -= 2) {  in p256_mul()
1819  * the point are encoded as 20 words of 13 bits each (little-endian
1820  * order); 13-bit words are then grouped 2-by-2 into 32-bit words
1821  * (little-endian order within each word).
1917  * Lookup one of the Gwin[] values, by index. This is constant-time.
1930 		m = -EQ(idx, k + 1);  in lookup_Gwin()
1936 		T->x[(u << 1) + 0] = xy[u] & 0xFFFF;  in lookup_Gwin()
1937 		T->x[(u << 1) + 1] = xy[u] >> 16;  in lookup_Gwin()
1938 		T->y[(u << 1) + 0] = xy[u + 10] & 0xFFFF;  in lookup_Gwin()
1939 		T->y[(u << 1) + 1] = xy[u + 10] >> 16;  in lookup_Gwin()
1941 	memset(T->z, 0, sizeof T->z);  in lookup_Gwin()
1942 	T->z[0] = 1;  in lookup_Gwin()
1946  * Multiply the generator by an integer. The integer is assumed non-zero
1956 	 * We use a 4-bit window to handle multiplier bits by groups  in p256_mulgen()
1958 	 * points in affine coordinates; we use a constant-time lookup.  in p256_mulgen()
1965 	while (xlen -- > 0) {  in p256_mulgen()