xref: /freebsd/contrib/bearssl/src/ec/ec_p256_m31.c (revision 13ec1e3155c7e9bf037b12af186351b7fa9b9450)
1 /*
2  * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining
5  * a copy of this software and associated documentation files (the
6  * "Software"), to deal in the Software without restriction, including
7  * without limitation the rights to use, copy, modify, merge, publish,
8  * distribute, sublicense, and/or sell copies of the Software, and to
9  * permit persons to whom the Software is furnished to do so, subject to
10  * the following conditions:
11  *
12  * The above copyright notice and this permission notice shall be
13  * included in all copies or substantial portions of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 
25 #include "inner.h"
26 
27 /*
28  * If BR_NO_ARITH_SHIFT is undefined, or defined to 0, then we _assume_
29  * that right-shifting a signed negative integer copies the sign bit
30  * (arithmetic right-shift). This is "implementation-defined behaviour",
31  * i.e. it is not undefined, but it may differ between compilers. Each
32  * compiler is supposed to document its behaviour in that respect. GCC
33  * explicitly defines that an arithmetic right shift is used. We expect
34  * all other compilers to do the same, because underlying CPU offer an
35  * arithmetic right shift opcode that could not be used otherwise.
36  */
37 #if BR_NO_ARITH_SHIFT
38 #define ARSH(x, n)    (((uint32_t)(x) >> (n)) \
39                       | ((-((uint32_t)(x) >> 31)) << (32 - (n))))
40 #define ARSHW(x, n)   (((uint64_t)(x) >> (n)) \
41                       | ((-((uint64_t)(x) >> 63)) << (64 - (n))))
42 #else
43 #define ARSH(x, n)    ((*(int32_t *)&(x)) >> (n))
44 #define ARSHW(x, n)   ((*(int64_t *)&(x)) >> (n))
45 #endif
46 
47 /*
48  * Convert an integer from unsigned big-endian encoding to a sequence of
49  * 30-bit words in little-endian order. The final "partial" word is
50  * returned.
51  */
52 static uint32_t
53 be8_to_le30(uint32_t *dst, const unsigned char *src, size_t len)
54 {
55 	uint32_t acc;
56 	int acc_len;
57 
58 	acc = 0;
59 	acc_len = 0;
60 	while (len -- > 0) {
61 		uint32_t b;
62 
63 		b = src[len];
64 		if (acc_len < 22) {
65 			acc |= b << acc_len;
66 			acc_len += 8;
67 		} else {
68 			*dst ++ = (acc | (b << acc_len)) & 0x3FFFFFFF;
69 			acc = b >> (30 - acc_len);
70 			acc_len -= 22;
71 		}
72 	}
73 	return acc;
74 }
75 
76 /*
77  * Convert an integer (30-bit words, little-endian) to unsigned
78  * big-endian encoding. The total encoding length is provided; all
79  * the destination bytes will be filled.
80  */
81 static void
82 le30_to_be8(unsigned char *dst, size_t len, const uint32_t *src)
83 {
84 	uint32_t acc;
85 	int acc_len;
86 
87 	acc = 0;
88 	acc_len = 0;
89 	while (len -- > 0) {
90 		if (acc_len < 8) {
91 			uint32_t w;
92 
93 			w = *src ++;
94 			dst[len] = (unsigned char)(acc | (w << acc_len));
95 			acc = w >> (8 - acc_len);
96 			acc_len += 22;
97 		} else {
98 			dst[len] = (unsigned char)acc;
99 			acc >>= 8;
100 			acc_len -= 8;
101 		}
102 	}
103 }
104 
105 /*
106  * Multiply two integers. Source integers are represented as arrays of
107  * nine 30-bit words, for values up to 2^270-1. Result is encoded over
108  * 18 words of 30 bits each.
109  */
110 static void
111 mul9(uint32_t *d, const uint32_t *a, const uint32_t *b)
112 {
113 	/*
114 	 * Maximum intermediate result is no more than
115 	 * 10376293531797946367, which fits in 64 bits. Reason:
116 	 *
117 	 *   10376293531797946367 = 9 * (2^30-1)^2 + 9663676406
118 	 *   10376293531797946367 < 9663676407 * 2^30
119 	 *
120 	 * Thus, adding together 9 products of 30-bit integers, with
121 	 * a carry of at most 9663676406, yields an integer that fits
122 	 * on 64 bits and generates a carry of at most 9663676406.
123 	 */
124 	uint64_t t[17];
125 	uint64_t cc;
126 	int i;
127 
128 	t[ 0] = MUL31(a[0], b[0]);
129 	t[ 1] = MUL31(a[0], b[1])
130 		+ MUL31(a[1], b[0]);
131 	t[ 2] = MUL31(a[0], b[2])
132 		+ MUL31(a[1], b[1])
133 		+ MUL31(a[2], b[0]);
134 	t[ 3] = MUL31(a[0], b[3])
135 		+ MUL31(a[1], b[2])
136 		+ MUL31(a[2], b[1])
137 		+ MUL31(a[3], b[0]);
138 	t[ 4] = MUL31(a[0], b[4])
139 		+ MUL31(a[1], b[3])
140 		+ MUL31(a[2], b[2])
141 		+ MUL31(a[3], b[1])
142 		+ MUL31(a[4], b[0]);
143 	t[ 5] = MUL31(a[0], b[5])
144 		+ MUL31(a[1], b[4])
145 		+ MUL31(a[2], b[3])
146 		+ MUL31(a[3], b[2])
147 		+ MUL31(a[4], b[1])
148 		+ MUL31(a[5], b[0]);
149 	t[ 6] = MUL31(a[0], b[6])
150 		+ MUL31(a[1], b[5])
151 		+ MUL31(a[2], b[4])
152 		+ MUL31(a[3], b[3])
153 		+ MUL31(a[4], b[2])
154 		+ MUL31(a[5], b[1])
155 		+ MUL31(a[6], b[0]);
156 	t[ 7] = MUL31(a[0], b[7])
157 		+ MUL31(a[1], b[6])
158 		+ MUL31(a[2], b[5])
159 		+ MUL31(a[3], b[4])
160 		+ MUL31(a[4], b[3])
161 		+ MUL31(a[5], b[2])
162 		+ MUL31(a[6], b[1])
163 		+ MUL31(a[7], b[0]);
164 	t[ 8] = MUL31(a[0], b[8])
165 		+ MUL31(a[1], b[7])
166 		+ MUL31(a[2], b[6])
167 		+ MUL31(a[3], b[5])
168 		+ MUL31(a[4], b[4])
169 		+ MUL31(a[5], b[3])
170 		+ MUL31(a[6], b[2])
171 		+ MUL31(a[7], b[1])
172 		+ MUL31(a[8], b[0]);
173 	t[ 9] = MUL31(a[1], b[8])
174 		+ MUL31(a[2], b[7])
175 		+ MUL31(a[3], b[6])
176 		+ MUL31(a[4], b[5])
177 		+ MUL31(a[5], b[4])
178 		+ MUL31(a[6], b[3])
179 		+ MUL31(a[7], b[2])
180 		+ MUL31(a[8], b[1]);
181 	t[10] = MUL31(a[2], b[8])
182 		+ MUL31(a[3], b[7])
183 		+ MUL31(a[4], b[6])
184 		+ MUL31(a[5], b[5])
185 		+ MUL31(a[6], b[4])
186 		+ MUL31(a[7], b[3])
187 		+ MUL31(a[8], b[2]);
188 	t[11] = MUL31(a[3], b[8])
189 		+ MUL31(a[4], b[7])
190 		+ MUL31(a[5], b[6])
191 		+ MUL31(a[6], b[5])
192 		+ MUL31(a[7], b[4])
193 		+ MUL31(a[8], b[3]);
194 	t[12] = MUL31(a[4], b[8])
195 		+ MUL31(a[5], b[7])
196 		+ MUL31(a[6], b[6])
197 		+ MUL31(a[7], b[5])
198 		+ MUL31(a[8], b[4]);
199 	t[13] = MUL31(a[5], b[8])
200 		+ MUL31(a[6], b[7])
201 		+ MUL31(a[7], b[6])
202 		+ MUL31(a[8], b[5]);
203 	t[14] = MUL31(a[6], b[8])
204 		+ MUL31(a[7], b[7])
205 		+ MUL31(a[8], b[6]);
206 	t[15] = MUL31(a[7], b[8])
207 		+ MUL31(a[8], b[7]);
208 	t[16] = MUL31(a[8], b[8]);
209 
210 	/*
211 	 * Propagate carries.
212 	 */
213 	cc = 0;
214 	for (i = 0; i < 17; i ++) {
215 		uint64_t w;
216 
217 		w = t[i] + cc;
218 		d[i] = (uint32_t)w & 0x3FFFFFFF;
219 		cc = w >> 30;
220 	}
221 	d[17] = (uint32_t)cc;
222 }
223 
224 /*
225  * Square a 270-bit integer, represented as an array of nine 30-bit words.
226  * Result uses 18 words of 30 bits each.
227  */
228 static void
229 square9(uint32_t *d, const uint32_t *a)
230 {
231 	uint64_t t[17];
232 	uint64_t cc;
233 	int i;
234 
235 	t[ 0] = MUL31(a[0], a[0]);
236 	t[ 1] = ((MUL31(a[0], a[1])) << 1);
237 	t[ 2] = MUL31(a[1], a[1])
238 		+ ((MUL31(a[0], a[2])) << 1);
239 	t[ 3] = ((MUL31(a[0], a[3])
240 		+ MUL31(a[1], a[2])) << 1);
241 	t[ 4] = MUL31(a[2], a[2])
242 		+ ((MUL31(a[0], a[4])
243 		+ MUL31(a[1], a[3])) << 1);
244 	t[ 5] = ((MUL31(a[0], a[5])
245 		+ MUL31(a[1], a[4])
246 		+ MUL31(a[2], a[3])) << 1);
247 	t[ 6] = MUL31(a[3], a[3])
248 		+ ((MUL31(a[0], a[6])
249 		+ MUL31(a[1], a[5])
250 		+ MUL31(a[2], a[4])) << 1);
251 	t[ 7] = ((MUL31(a[0], a[7])
252 		+ MUL31(a[1], a[6])
253 		+ MUL31(a[2], a[5])
254 		+ MUL31(a[3], a[4])) << 1);
255 	t[ 8] = MUL31(a[4], a[4])
256 		+ ((MUL31(a[0], a[8])
257 		+ MUL31(a[1], a[7])
258 		+ MUL31(a[2], a[6])
259 		+ MUL31(a[3], a[5])) << 1);
260 	t[ 9] = ((MUL31(a[1], a[8])
261 		+ MUL31(a[2], a[7])
262 		+ MUL31(a[3], a[6])
263 		+ MUL31(a[4], a[5])) << 1);
264 	t[10] = MUL31(a[5], a[5])
265 		+ ((MUL31(a[2], a[8])
266 		+ MUL31(a[3], a[7])
267 		+ MUL31(a[4], a[6])) << 1);
268 	t[11] = ((MUL31(a[3], a[8])
269 		+ MUL31(a[4], a[7])
270 		+ MUL31(a[5], a[6])) << 1);
271 	t[12] = MUL31(a[6], a[6])
272 		+ ((MUL31(a[4], a[8])
273 		+ MUL31(a[5], a[7])) << 1);
274 	t[13] = ((MUL31(a[5], a[8])
275 		+ MUL31(a[6], a[7])) << 1);
276 	t[14] = MUL31(a[7], a[7])
277 		+ ((MUL31(a[6], a[8])) << 1);
278 	t[15] = ((MUL31(a[7], a[8])) << 1);
279 	t[16] = MUL31(a[8], a[8]);
280 
281 	/*
282 	 * Propagate carries.
283 	 */
284 	cc = 0;
285 	for (i = 0; i < 17; i ++) {
286 		uint64_t w;
287 
288 		w = t[i] + cc;
289 		d[i] = (uint32_t)w & 0x3FFFFFFF;
290 		cc = w >> 30;
291 	}
292 	d[17] = (uint32_t)cc;
293 }
294 
295 /*
296  * Base field modulus for P-256.
297  */
298 static const uint32_t F256[] = {
299 
300 	0x3FFFFFFF, 0x3FFFFFFF, 0x3FFFFFFF, 0x0000003F, 0x00000000,
301 	0x00000000, 0x00001000, 0x3FFFC000, 0x0000FFFF
302 };
303 
304 /*
305  * The 'b' curve equation coefficient for P-256.
306  */
307 static const uint32_t P256_B[] = {
308 
309 	0x27D2604B, 0x2F38F0F8, 0x053B0F63, 0x0741AC33, 0x1886BC65,
310 	0x2EF555DA, 0x293E7B3E, 0x0D762A8E, 0x00005AC6
311 };
312 
313 /*
314  * Addition in the field. Source operands shall fit on 257 bits; output
315  * will be lower than twice the modulus.
316  */
317 static void
318 add_f256(uint32_t *d, const uint32_t *a, const uint32_t *b)
319 {
320 	uint32_t w, cc;
321 	int i;
322 
323 	cc = 0;
324 	for (i = 0; i < 9; i ++) {
325 		w = a[i] + b[i] + cc;
326 		d[i] = w & 0x3FFFFFFF;
327 		cc = w >> 30;
328 	}
329 	w >>= 16;
330 	d[8] &= 0xFFFF;
331 	d[3] -= w << 6;
332 	d[6] -= w << 12;
333 	d[7] += w << 14;
334 	cc = w;
335 	for (i = 0; i < 9; i ++) {
336 		w = d[i] + cc;
337 		d[i] = w & 0x3FFFFFFF;
338 		cc = ARSH(w, 30);
339 	}
340 }
341 
342 /*
343  * Subtraction in the field. Source operands shall be smaller than twice
344  * the modulus; the result will fulfil the same property.
345  */
346 static void
347 sub_f256(uint32_t *d, const uint32_t *a, const uint32_t *b)
348 {
349 	uint32_t w, cc;
350 	int i;
351 
352 	/*
353 	 * We really compute a - b + 2*p to make sure that the result is
354 	 * positive.
355 	 */
356 	w = a[0] - b[0] - 0x00002;
357 	d[0] = w & 0x3FFFFFFF;
358 	w = a[1] - b[1] + ARSH(w, 30);
359 	d[1] = w & 0x3FFFFFFF;
360 	w = a[2] - b[2] + ARSH(w, 30);
361 	d[2] = w & 0x3FFFFFFF;
362 	w = a[3] - b[3] + ARSH(w, 30) + 0x00080;
363 	d[3] = w & 0x3FFFFFFF;
364 	w = a[4] - b[4] + ARSH(w, 30);
365 	d[4] = w & 0x3FFFFFFF;
366 	w = a[5] - b[5] + ARSH(w, 30);
367 	d[5] = w & 0x3FFFFFFF;
368 	w = a[6] - b[6] + ARSH(w, 30) + 0x02000;
369 	d[6] = w & 0x3FFFFFFF;
370 	w = a[7] - b[7] + ARSH(w, 30) - 0x08000;
371 	d[7] = w & 0x3FFFFFFF;
372 	w = a[8] - b[8] + ARSH(w, 30) + 0x20000;
373 	d[8] = w & 0xFFFF;
374 	w >>= 16;
375 	d[8] &= 0xFFFF;
376 	d[3] -= w << 6;
377 	d[6] -= w << 12;
378 	d[7] += w << 14;
379 	cc = w;
380 	for (i = 0; i < 9; i ++) {
381 		w = d[i] + cc;
382 		d[i] = w & 0x3FFFFFFF;
383 		cc = ARSH(w, 30);
384 	}
385 }
386 
387 /*
388  * Compute a multiplication in F256. Source operands shall be less than
389  * twice the modulus.
390  */
391 static void
392 mul_f256(uint32_t *d, const uint32_t *a, const uint32_t *b)
393 {
394 	uint32_t t[18];
395 	uint64_t s[18];
396 	uint64_t cc, x;
397 	uint32_t z, c;
398 	int i;
399 
400 	mul9(t, a, b);
401 
402 	/*
403 	 * Modular reduction: each high word in added/subtracted where
404 	 * necessary.
405 	 *
406 	 * The modulus is:
407 	 *    p = 2^256 - 2^224 + 2^192 + 2^96 - 1
408 	 * Therefore:
409 	 *    2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
410 	 *
411 	 * For a word x at bit offset n (n >= 256), we have:
412 	 *    x*2^n = x*2^(n-32) - x*2^(n-64)
413 	 *            - x*2^(n - 160) + x*2^(n-256) mod p
414 	 *
415 	 * Thus, we can nullify the high word if we reinject it at some
416 	 * proper emplacements.
417 	 *
418 	 * We use 64-bit intermediate words to allow for carries to
419 	 * accumulate easily, before performing the final propagation.
420 	 */
421 	for (i = 0; i < 18; i ++) {
422 		s[i] = t[i];
423 	}
424 
425 	for (i = 17; i >= 9; i --) {
426 		uint64_t y;
427 
428 		y = s[i];
429 		s[i - 1] += ARSHW(y, 2);
430 		s[i - 2] += (y << 28) & 0x3FFFFFFF;
431 		s[i - 2] -= ARSHW(y, 4);
432 		s[i - 3] -= (y << 26) & 0x3FFFFFFF;
433 		s[i - 5] -= ARSHW(y, 10);
434 		s[i - 6] -= (y << 20) & 0x3FFFFFFF;
435 		s[i - 8] += ARSHW(y, 16);
436 		s[i - 9] += (y << 14) & 0x3FFFFFFF;
437 	}
438 
439 	/*
440 	 * Carry propagation must be signed. Moreover, we may have overdone
441 	 * it a bit, and obtain a negative result.
442 	 *
443 	 * The loop above ran 9 times; each time, each word was augmented
444 	 * by at most one extra word (in absolute value). Thus, the top
445 	 * word must in fine fit in 39 bits, so the carry below will fit
446 	 * on 9 bits.
447 	 */
448 	cc = 0;
449 	for (i = 0; i < 9; i ++) {
450 		x = s[i] + cc;
451 		d[i] = (uint32_t)x & 0x3FFFFFFF;
452 		cc = ARSHW(x, 30);
453 	}
454 
455 	/*
456 	 * All nine words fit on 30 bits, but there may be an extra
457 	 * carry for a few bits (at most 9), and that carry may be
458 	 * negative. Moreover, we want the result to fit on 257 bits.
459 	 * The two lines below ensure that the word in d[] has length
460 	 * 256 bits, and the (signed) carry (beyond 2^256) is in cc. The
461 	 * significant length of cc is less than 24 bits, so we will be
462 	 * able to switch to 32-bit operations.
463 	 */
464 	cc = ARSHW(x, 16);
465 	d[8] &= 0xFFFF;
466 
467 	/*
468 	 * One extra round of reduction, for cc*2^256, which means
469 	 * adding cc*(2^224-2^192-2^96+1) to a 256-bit (nonnegative)
470 	 * value. If cc is negative, then it may happen (rarely, but
471 	 * not neglectibly so) that the result would be negative. In
472 	 * order to avoid that, if cc is negative, then we add the
473 	 * modulus once. Note that if cc is negative, then propagating
474 	 * that carry must yield a value lower than the modulus, so
475 	 * adding the modulus once will keep the final result under
476 	 * twice the modulus.
477 	 */
478 	z = (uint32_t)cc;
479 	d[3] -= z << 6;
480 	d[6] -= (z << 12) & 0x3FFFFFFF;
481 	d[7] -= ARSH(z, 18);
482 	d[7] += (z << 14) & 0x3FFFFFFF;
483 	d[8] += ARSH(z, 16);
484 	c = z >> 31;
485 	d[0] -= c;
486 	d[3] += c << 6;
487 	d[6] += c << 12;
488 	d[7] -= c << 14;
489 	d[8] += c << 16;
490 	for (i = 0; i < 9; i ++) {
491 		uint32_t w;
492 
493 		w = d[i] + z;
494 		d[i] = w & 0x3FFFFFFF;
495 		z = ARSH(w, 30);
496 	}
497 }
498 
499 /*
500  * Compute a square in F256. Source operand shall be less than
501  * twice the modulus.
502  */
503 static void
504 square_f256(uint32_t *d, const uint32_t *a)
505 {
506 	uint32_t t[18];
507 	uint64_t s[18];
508 	uint64_t cc, x;
509 	uint32_t z, c;
510 	int i;
511 
512 	square9(t, a);
513 
514 	/*
515 	 * Modular reduction: each high word in added/subtracted where
516 	 * necessary.
517 	 *
518 	 * The modulus is:
519 	 *    p = 2^256 - 2^224 + 2^192 + 2^96 - 1
520 	 * Therefore:
521 	 *    2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
522 	 *
523 	 * For a word x at bit offset n (n >= 256), we have:
524 	 *    x*2^n = x*2^(n-32) - x*2^(n-64)
525 	 *            - x*2^(n - 160) + x*2^(n-256) mod p
526 	 *
527 	 * Thus, we can nullify the high word if we reinject it at some
528 	 * proper emplacements.
529 	 *
530 	 * We use 64-bit intermediate words to allow for carries to
531 	 * accumulate easily, before performing the final propagation.
532 	 */
533 	for (i = 0; i < 18; i ++) {
534 		s[i] = t[i];
535 	}
536 
537 	for (i = 17; i >= 9; i --) {
538 		uint64_t y;
539 
540 		y = s[i];
541 		s[i - 1] += ARSHW(y, 2);
542 		s[i - 2] += (y << 28) & 0x3FFFFFFF;
543 		s[i - 2] -= ARSHW(y, 4);
544 		s[i - 3] -= (y << 26) & 0x3FFFFFFF;
545 		s[i - 5] -= ARSHW(y, 10);
546 		s[i - 6] -= (y << 20) & 0x3FFFFFFF;
547 		s[i - 8] += ARSHW(y, 16);
548 		s[i - 9] += (y << 14) & 0x3FFFFFFF;
549 	}
550 
551 	/*
552 	 * Carry propagation must be signed. Moreover, we may have overdone
553 	 * it a bit, and obtain a negative result.
554 	 *
555 	 * The loop above ran 9 times; each time, each word was augmented
556 	 * by at most one extra word (in absolute value). Thus, the top
557 	 * word must in fine fit in 39 bits, so the carry below will fit
558 	 * on 9 bits.
559 	 */
560 	cc = 0;
561 	for (i = 0; i < 9; i ++) {
562 		x = s[i] + cc;
563 		d[i] = (uint32_t)x & 0x3FFFFFFF;
564 		cc = ARSHW(x, 30);
565 	}
566 
567 	/*
568 	 * All nine words fit on 30 bits, but there may be an extra
569 	 * carry for a few bits (at most 9), and that carry may be
570 	 * negative. Moreover, we want the result to fit on 257 bits.
571 	 * The two lines below ensure that the word in d[] has length
572 	 * 256 bits, and the (signed) carry (beyond 2^256) is in cc. The
573 	 * significant length of cc is less than 24 bits, so we will be
574 	 * able to switch to 32-bit operations.
575 	 */
576 	cc = ARSHW(x, 16);
577 	d[8] &= 0xFFFF;
578 
579 	/*
580 	 * One extra round of reduction, for cc*2^256, which means
581 	 * adding cc*(2^224-2^192-2^96+1) to a 256-bit (nonnegative)
582 	 * value. If cc is negative, then it may happen (rarely, but
583 	 * not neglectibly so) that the result would be negative. In
584 	 * order to avoid that, if cc is negative, then we add the
585 	 * modulus once. Note that if cc is negative, then propagating
586 	 * that carry must yield a value lower than the modulus, so
587 	 * adding the modulus once will keep the final result under
588 	 * twice the modulus.
589 	 */
590 	z = (uint32_t)cc;
591 	d[3] -= z << 6;
592 	d[6] -= (z << 12) & 0x3FFFFFFF;
593 	d[7] -= ARSH(z, 18);
594 	d[7] += (z << 14) & 0x3FFFFFFF;
595 	d[8] += ARSH(z, 16);
596 	c = z >> 31;
597 	d[0] -= c;
598 	d[3] += c << 6;
599 	d[6] += c << 12;
600 	d[7] -= c << 14;
601 	d[8] += c << 16;
602 	for (i = 0; i < 9; i ++) {
603 		uint32_t w;
604 
605 		w = d[i] + z;
606 		d[i] = w & 0x3FFFFFFF;
607 		z = ARSH(w, 30);
608 	}
609 }
610 
611 /*
612  * Perform a "final reduction" in field F256 (field for curve P-256).
613  * The source value must be less than twice the modulus. If the value
614  * is not lower than the modulus, then the modulus is subtracted and
615  * this function returns 1; otherwise, it leaves it untouched and it
616  * returns 0.
617  */
618 static uint32_t
619 reduce_final_f256(uint32_t *d)
620 {
621 	uint32_t t[9];
622 	uint32_t cc;
623 	int i;
624 
625 	cc = 0;
626 	for (i = 0; i < 9; i ++) {
627 		uint32_t w;
628 
629 		w = d[i] - F256[i] - cc;
630 		cc = w >> 31;
631 		t[i] = w & 0x3FFFFFFF;
632 	}
633 	cc ^= 1;
634 	CCOPY(cc, d, t, sizeof t);
635 	return cc;
636 }
637 
638 /*
639  * Jacobian coordinates for a point in P-256: affine coordinates (X,Y)
640  * are such that:
641  *   X = x / z^2
642  *   Y = y / z^3
643  * For the point at infinity, z = 0.
644  * Each point thus admits many possible representations.
645  *
646  * Coordinates are represented in arrays of 32-bit integers, each holding
647  * 30 bits of data. Values may also be slightly greater than the modulus,
648  * but they will always be lower than twice the modulus.
649  */
650 typedef struct {
651 	uint32_t x[9];
652 	uint32_t y[9];
653 	uint32_t z[9];
654 } p256_jacobian;
655 
656 /*
657  * Convert a point to affine coordinates:
658  *  - If the point is the point at infinity, then all three coordinates
659  *    are set to 0.
660  *  - Otherwise, the 'z' coordinate is set to 1, and the 'x' and 'y'
661  *    coordinates are the 'X' and 'Y' affine coordinates.
662  * The coordinates are guaranteed to be lower than the modulus.
663  */
664 static void
665 p256_to_affine(p256_jacobian *P)
666 {
667 	uint32_t t1[9], t2[9];
668 	int i;
669 
670 	/*
671 	 * Invert z with a modular exponentiation: the modulus is
672 	 * p = 2^256 - 2^224 + 2^192 + 2^96 - 1, and the exponent is
673 	 * p-2. Exponent bit pattern (from high to low) is:
674 	 *  - 32 bits of value 1
675 	 *  - 31 bits of value 0
676 	 *  - 1 bit of value 1
677 	 *  - 96 bits of value 0
678 	 *  - 94 bits of value 1
679 	 *  - 1 bit of value 0
680 	 *  - 1 bit of value 1
681 	 * Thus, we precompute z^(2^31-1) to speed things up.
682 	 *
683 	 * If z = 0 (point at infinity) then the modular exponentiation
684 	 * will yield 0, which leads to the expected result (all three
685 	 * coordinates set to 0).
686 	 */
687 
688 	/*
689 	 * A simple square-and-multiply for z^(2^31-1). We could save about
690 	 * two dozen multiplications here with an addition chain, but
691 	 * this would require a bit more code, and extra stack buffers.
692 	 */
693 	memcpy(t1, P->z, sizeof P->z);
694 	for (i = 0; i < 30; i ++) {
695 		square_f256(t1, t1);
696 		mul_f256(t1, t1, P->z);
697 	}
698 
699 	/*
700 	 * Square-and-multiply. Apart from the squarings, we have a few
701 	 * multiplications to set bits to 1; we multiply by the original z
702 	 * for setting 1 bit, and by t1 for setting 31 bits.
703 	 */
704 	memcpy(t2, P->z, sizeof P->z);
705 	for (i = 1; i < 256; i ++) {
706 		square_f256(t2, t2);
707 		switch (i) {
708 		case 31:
709 		case 190:
710 		case 221:
711 		case 252:
712 			mul_f256(t2, t2, t1);
713 			break;
714 		case 63:
715 		case 253:
716 		case 255:
717 			mul_f256(t2, t2, P->z);
718 			break;
719 		}
720 	}
721 
722 	/*
723 	 * Now that we have 1/z, multiply x by 1/z^2 and y by 1/z^3.
724 	 */
725 	mul_f256(t1, t2, t2);
726 	mul_f256(P->x, t1, P->x);
727 	mul_f256(t1, t1, t2);
728 	mul_f256(P->y, t1, P->y);
729 	reduce_final_f256(P->x);
730 	reduce_final_f256(P->y);
731 
732 	/*
733 	 * Multiply z by 1/z. If z = 0, then this will yield 0, otherwise
734 	 * this will set z to 1.
735 	 */
736 	mul_f256(P->z, P->z, t2);
737 	reduce_final_f256(P->z);
738 }
739 
740 /*
741  * Double a point in P-256. This function works for all valid points,
742  * including the point at infinity.
743  */
744 static void
745 p256_double(p256_jacobian *Q)
746 {
747 	/*
748 	 * Doubling formulas are:
749 	 *
750 	 *   s = 4*x*y^2
751 	 *   m = 3*(x + z^2)*(x - z^2)
752 	 *   x' = m^2 - 2*s
753 	 *   y' = m*(s - x') - 8*y^4
754 	 *   z' = 2*y*z
755 	 *
756 	 * These formulas work for all points, including points of order 2
757 	 * and points at infinity:
758 	 *   - If y = 0 then z' = 0. But there is no such point in P-256
759 	 *     anyway.
760 	 *   - If z = 0 then z' = 0.
761 	 */
762 	uint32_t t1[9], t2[9], t3[9], t4[9];
763 
764 	/*
765 	 * Compute z^2 in t1.
766 	 */
767 	square_f256(t1, Q->z);
768 
769 	/*
770 	 * Compute x-z^2 in t2 and x+z^2 in t1.
771 	 */
772 	add_f256(t2, Q->x, t1);
773 	sub_f256(t1, Q->x, t1);
774 
775 	/*
776 	 * Compute 3*(x+z^2)*(x-z^2) in t1.
777 	 */
778 	mul_f256(t3, t1, t2);
779 	add_f256(t1, t3, t3);
780 	add_f256(t1, t3, t1);
781 
782 	/*
783 	 * Compute 4*x*y^2 (in t2) and 2*y^2 (in t3).
784 	 */
785 	square_f256(t3, Q->y);
786 	add_f256(t3, t3, t3);
787 	mul_f256(t2, Q->x, t3);
788 	add_f256(t2, t2, t2);
789 
790 	/*
791 	 * Compute x' = m^2 - 2*s.
792 	 */
793 	square_f256(Q->x, t1);
794 	sub_f256(Q->x, Q->x, t2);
795 	sub_f256(Q->x, Q->x, t2);
796 
797 	/*
798 	 * Compute z' = 2*y*z.
799 	 */
800 	mul_f256(t4, Q->y, Q->z);
801 	add_f256(Q->z, t4, t4);
802 
803 	/*
804 	 * Compute y' = m*(s - x') - 8*y^4. Note that we already have
805 	 * 2*y^2 in t3.
806 	 */
807 	sub_f256(t2, t2, Q->x);
808 	mul_f256(Q->y, t1, t2);
809 	square_f256(t4, t3);
810 	add_f256(t4, t4, t4);
811 	sub_f256(Q->y, Q->y, t4);
812 }
813 
814 /*
815  * Add point P2 to point P1.
816  *
817  * This function computes the wrong result in the following cases:
818  *
819  *   - If P1 == 0 but P2 != 0
820  *   - If P1 != 0 but P2 == 0
821  *   - If P1 == P2
822  *
823  * In all three cases, P1 is set to the point at infinity.
824  *
825  * Returned value is 0 if one of the following occurs:
826  *
827  *   - P1 and P2 have the same Y coordinate
828  *   - P1 == 0 and P2 == 0
829  *   - The Y coordinate of one of the points is 0 and the other point is
830  *     the point at infinity.
831  *
832  * The third case cannot actually happen with valid points, since a point
833  * with Y == 0 is a point of order 2, and there is no point of order 2 on
834  * curve P-256.
835  *
836  * Therefore, assuming that P1 != 0 and P2 != 0 on input, then the caller
837  * can apply the following:
838  *
839  *   - If the result is not the point at infinity, then it is correct.
840  *   - Otherwise, if the returned value is 1, then this is a case of
841  *     P1+P2 == 0, so the result is indeed the point at infinity.
842  *   - Otherwise, P1 == P2, so a "double" operation should have been
843  *     performed.
844  */
845 static uint32_t
846 p256_add(p256_jacobian *P1, const p256_jacobian *P2)
847 {
848 	/*
849 	 * Addtions formulas are:
850 	 *
851 	 *   u1 = x1 * z2^2
852 	 *   u2 = x2 * z1^2
853 	 *   s1 = y1 * z2^3
854 	 *   s2 = y2 * z1^3
855 	 *   h = u2 - u1
856 	 *   r = s2 - s1
857 	 *   x3 = r^2 - h^3 - 2 * u1 * h^2
858 	 *   y3 = r * (u1 * h^2 - x3) - s1 * h^3
859 	 *   z3 = h * z1 * z2
860 	 */
861 	uint32_t t1[9], t2[9], t3[9], t4[9], t5[9], t6[9], t7[9];
862 	uint32_t ret;
863 	int i;
864 
865 	/*
866 	 * Compute u1 = x1*z2^2 (in t1) and s1 = y1*z2^3 (in t3).
867 	 */
868 	square_f256(t3, P2->z);
869 	mul_f256(t1, P1->x, t3);
870 	mul_f256(t4, P2->z, t3);
871 	mul_f256(t3, P1->y, t4);
872 
873 	/*
874 	 * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
875 	 */
876 	square_f256(t4, P1->z);
877 	mul_f256(t2, P2->x, t4);
878 	mul_f256(t5, P1->z, t4);
879 	mul_f256(t4, P2->y, t5);
880 
881 	/*
882 	 * Compute h = h2 - u1 (in t2) and r = s2 - s1 (in t4).
883 	 * We need to test whether r is zero, so we will do some extra
884 	 * reduce.
885 	 */
886 	sub_f256(t2, t2, t1);
887 	sub_f256(t4, t4, t3);
888 	reduce_final_f256(t4);
889 	ret = 0;
890 	for (i = 0; i < 9; i ++) {
891 		ret |= t4[i];
892 	}
893 	ret = (ret | -ret) >> 31;
894 
895 	/*
896 	 * Compute u1*h^2 (in t6) and h^3 (in t5);
897 	 */
898 	square_f256(t7, t2);
899 	mul_f256(t6, t1, t7);
900 	mul_f256(t5, t7, t2);
901 
902 	/*
903 	 * Compute x3 = r^2 - h^3 - 2*u1*h^2.
904 	 */
905 	square_f256(P1->x, t4);
906 	sub_f256(P1->x, P1->x, t5);
907 	sub_f256(P1->x, P1->x, t6);
908 	sub_f256(P1->x, P1->x, t6);
909 
910 	/*
911 	 * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
912 	 */
913 	sub_f256(t6, t6, P1->x);
914 	mul_f256(P1->y, t4, t6);
915 	mul_f256(t1, t5, t3);
916 	sub_f256(P1->y, P1->y, t1);
917 
918 	/*
919 	 * Compute z3 = h*z1*z2.
920 	 */
921 	mul_f256(t1, P1->z, P2->z);
922 	mul_f256(P1->z, t1, t2);
923 
924 	return ret;
925 }
926 
927 /*
928  * Add point P2 to point P1. This is a specialised function for the
929  * case when P2 is a non-zero point in affine coordinate.
930  *
931  * This function computes the wrong result in the following cases:
932  *
933  *   - If P1 == 0
934  *   - If P1 == P2
935  *
936  * In both cases, P1 is set to the point at infinity.
937  *
938  * Returned value is 0 if one of the following occurs:
939  *
940  *   - P1 and P2 have the same Y coordinate
941  *   - The Y coordinate of P2 is 0 and P1 is the point at infinity.
942  *
943  * The second case cannot actually happen with valid points, since a point
944  * with Y == 0 is a point of order 2, and there is no point of order 2 on
945  * curve P-256.
946  *
947  * Therefore, assuming that P1 != 0 on input, then the caller
948  * can apply the following:
949  *
950  *   - If the result is not the point at infinity, then it is correct.
951  *   - Otherwise, if the returned value is 1, then this is a case of
952  *     P1+P2 == 0, so the result is indeed the point at infinity.
953  *   - Otherwise, P1 == P2, so a "double" operation should have been
954  *     performed.
955  */
956 static uint32_t
957 p256_add_mixed(p256_jacobian *P1, const p256_jacobian *P2)
958 {
959 	/*
960 	 * Addtions formulas are:
961 	 *
962 	 *   u1 = x1
963 	 *   u2 = x2 * z1^2
964 	 *   s1 = y1
965 	 *   s2 = y2 * z1^3
966 	 *   h = u2 - u1
967 	 *   r = s2 - s1
968 	 *   x3 = r^2 - h^3 - 2 * u1 * h^2
969 	 *   y3 = r * (u1 * h^2 - x3) - s1 * h^3
970 	 *   z3 = h * z1
971 	 */
972 	uint32_t t1[9], t2[9], t3[9], t4[9], t5[9], t6[9], t7[9];
973 	uint32_t ret;
974 	int i;
975 
976 	/*
977 	 * Compute u1 = x1 (in t1) and s1 = y1 (in t3).
978 	 */
979 	memcpy(t1, P1->x, sizeof t1);
980 	memcpy(t3, P1->y, sizeof t3);
981 
982 	/*
983 	 * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
984 	 */
985 	square_f256(t4, P1->z);
986 	mul_f256(t2, P2->x, t4);
987 	mul_f256(t5, P1->z, t4);
988 	mul_f256(t4, P2->y, t5);
989 
990 	/*
991 	 * Compute h = h2 - u1 (in t2) and r = s2 - s1 (in t4).
992 	 * We need to test whether r is zero, so we will do some extra
993 	 * reduce.
994 	 */
995 	sub_f256(t2, t2, t1);
996 	sub_f256(t4, t4, t3);
997 	reduce_final_f256(t4);
998 	ret = 0;
999 	for (i = 0; i < 9; i ++) {
1000 		ret |= t4[i];
1001 	}
1002 	ret = (ret | -ret) >> 31;
1003 
1004 	/*
1005 	 * Compute u1*h^2 (in t6) and h^3 (in t5);
1006 	 */
1007 	square_f256(t7, t2);
1008 	mul_f256(t6, t1, t7);
1009 	mul_f256(t5, t7, t2);
1010 
1011 	/*
1012 	 * Compute x3 = r^2 - h^3 - 2*u1*h^2.
1013 	 */
1014 	square_f256(P1->x, t4);
1015 	sub_f256(P1->x, P1->x, t5);
1016 	sub_f256(P1->x, P1->x, t6);
1017 	sub_f256(P1->x, P1->x, t6);
1018 
1019 	/*
1020 	 * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
1021 	 */
1022 	sub_f256(t6, t6, P1->x);
1023 	mul_f256(P1->y, t4, t6);
1024 	mul_f256(t1, t5, t3);
1025 	sub_f256(P1->y, P1->y, t1);
1026 
1027 	/*
1028 	 * Compute z3 = h*z1*z2.
1029 	 */
1030 	mul_f256(P1->z, P1->z, t2);
1031 
1032 	return ret;
1033 }
1034 
1035 /*
1036  * Decode a P-256 point. This function does not support the point at
1037  * infinity. Returned value is 0 if the point is invalid, 1 otherwise.
1038  */
1039 static uint32_t
1040 p256_decode(p256_jacobian *P, const void *src, size_t len)
1041 {
1042 	const unsigned char *buf;
1043 	uint32_t tx[9], ty[9], t1[9], t2[9];
1044 	uint32_t bad;
1045 	int i;
1046 
1047 	if (len != 65) {
1048 		return 0;
1049 	}
1050 	buf = src;
1051 
1052 	/*
1053 	 * First byte must be 0x04 (uncompressed format). We could support
1054 	 * "hybrid format" (first byte is 0x06 or 0x07, and encodes the
1055 	 * least significant bit of the Y coordinate), but it is explicitly
1056 	 * forbidden by RFC 5480 (section 2.2).
1057 	 */
1058 	bad = NEQ(buf[0], 0x04);
1059 
1060 	/*
1061 	 * Decode the coordinates, and check that they are both lower
1062 	 * than the modulus.
1063 	 */
1064 	tx[8] = be8_to_le30(tx, buf + 1, 32);
1065 	ty[8] = be8_to_le30(ty, buf + 33, 32);
1066 	bad |= reduce_final_f256(tx);
1067 	bad |= reduce_final_f256(ty);
1068 
1069 	/*
1070 	 * Check curve equation.
1071 	 */
1072 	square_f256(t1, tx);
1073 	mul_f256(t1, tx, t1);
1074 	square_f256(t2, ty);
1075 	sub_f256(t1, t1, tx);
1076 	sub_f256(t1, t1, tx);
1077 	sub_f256(t1, t1, tx);
1078 	add_f256(t1, t1, P256_B);
1079 	sub_f256(t1, t1, t2);
1080 	reduce_final_f256(t1);
1081 	for (i = 0; i < 9; i ++) {
1082 		bad |= t1[i];
1083 	}
1084 
1085 	/*
1086 	 * Copy coordinates to the point structure.
1087 	 */
1088 	memcpy(P->x, tx, sizeof tx);
1089 	memcpy(P->y, ty, sizeof ty);
1090 	memset(P->z, 0, sizeof P->z);
1091 	P->z[0] = 1;
1092 	return EQ(bad, 0);
1093 }
1094 
1095 /*
1096  * Encode a point into a buffer. This function assumes that the point is
1097  * valid, in affine coordinates, and not the point at infinity.
1098  */
1099 static void
1100 p256_encode(void *dst, const p256_jacobian *P)
1101 {
1102 	unsigned char *buf;
1103 
1104 	buf = dst;
1105 	buf[0] = 0x04;
1106 	le30_to_be8(buf + 1, 32, P->x);
1107 	le30_to_be8(buf + 33, 32, P->y);
1108 }
1109 
1110 /*
1111  * Multiply a curve point by an integer. The integer is assumed to be
1112  * lower than the curve order, and the base point must not be the point
1113  * at infinity.
1114  */
1115 static void
1116 p256_mul(p256_jacobian *P, const unsigned char *x, size_t xlen)
1117 {
1118 	/*
1119 	 * qz is a flag that is initially 1, and remains equal to 1
1120 	 * as long as the point is the point at infinity.
1121 	 *
1122 	 * We use a 2-bit window to handle multiplier bits by pairs.
1123 	 * The precomputed window really is the points P2 and P3.
1124 	 */
1125 	uint32_t qz;
1126 	p256_jacobian P2, P3, Q, T, U;
1127 
1128 	/*
1129 	 * Compute window values.
1130 	 */
1131 	P2 = *P;
1132 	p256_double(&P2);
1133 	P3 = *P;
1134 	p256_add(&P3, &P2);
1135 
1136 	/*
1137 	 * We start with Q = 0. We process multiplier bits 2 by 2.
1138 	 */
1139 	memset(&Q, 0, sizeof Q);
1140 	qz = 1;
1141 	while (xlen -- > 0) {
1142 		int k;
1143 
1144 		for (k = 6; k >= 0; k -= 2) {
1145 			uint32_t bits;
1146 			uint32_t bnz;
1147 
1148 			p256_double(&Q);
1149 			p256_double(&Q);
1150 			T = *P;
1151 			U = Q;
1152 			bits = (*x >> k) & (uint32_t)3;
1153 			bnz = NEQ(bits, 0);
1154 			CCOPY(EQ(bits, 2), &T, &P2, sizeof T);
1155 			CCOPY(EQ(bits, 3), &T, &P3, sizeof T);
1156 			p256_add(&U, &T);
1157 			CCOPY(bnz & qz, &Q, &T, sizeof Q);
1158 			CCOPY(bnz & ~qz, &Q, &U, sizeof Q);
1159 			qz &= ~bnz;
1160 		}
1161 		x ++;
1162 	}
1163 	*P = Q;
1164 }
1165 
1166 /*
1167  * Precomputed window: k*G points, where G is the curve generator, and k
1168  * is an integer from 1 to 15 (inclusive). The X and Y coordinates of
1169  * the point are encoded as 9 words of 30 bits each (little-endian
1170  * order).
1171  */
1172 static const uint32_t Gwin[15][18] = {
1173 
1174 	{ 0x1898C296, 0x1284E517, 0x1EB33A0F, 0x00DF604B,
1175 	  0x2440F277, 0x339B958E, 0x04247F8B, 0x347CB84B,
1176 	  0x00006B17, 0x37BF51F5, 0x2ED901A0, 0x3315ECEC,
1177 	  0x338CD5DA, 0x0F9E162B, 0x1FAD29F0, 0x27F9B8EE,
1178 	  0x10B8BF86, 0x00004FE3 },
1179 
1180 	{ 0x07669978, 0x182D23F1, 0x3F21B35A, 0x225A789D,
1181 	  0x351AC3C0, 0x08E00C12, 0x34F7E8A5, 0x1EC62340,
1182 	  0x00007CF2, 0x227873D1, 0x3812DE74, 0x0E982299,
1183 	  0x1F6B798F, 0x3430DBBA, 0x366B1A7D, 0x2D040293,
1184 	  0x154436E3, 0x00000777 },
1185 
1186 	{ 0x06E7FD6C, 0x2D05986F, 0x3ADA985F, 0x31ADC87B,
1187 	  0x0BF165E6, 0x1FBE5475, 0x30A44C8F, 0x3934698C,
1188 	  0x00005ECB, 0x227D5032, 0x29E6C49E, 0x04FB83D9,
1189 	  0x0AAC0D8E, 0x24A2ECD8, 0x2C1B3869, 0x0FF7E374,
1190 	  0x19031266, 0x00008734 },
1191 
1192 	{ 0x2B030852, 0x024C0911, 0x05596EF5, 0x07F8B6DE,
1193 	  0x262BD003, 0x3779967B, 0x08FBBA02, 0x128D4CB4,
1194 	  0x0000E253, 0x184ED8C6, 0x310B08FC, 0x30EE0055,
1195 	  0x3F25B0FC, 0x062D764E, 0x3FB97F6A, 0x33CC719D,
1196 	  0x15D69318, 0x0000E0F1 },
1197 
1198 	{ 0x03D033ED, 0x05552837, 0x35BE5242, 0x2320BF47,
1199 	  0x268FDFEF, 0x13215821, 0x140D2D78, 0x02DE9454,
1200 	  0x00005159, 0x3DA16DA4, 0x0742ED13, 0x0D80888D,
1201 	  0x004BC035, 0x0A79260D, 0x06FCDAFE, 0x2727D8AE,
1202 	  0x1F6A2412, 0x0000E0C1 },
1203 
1204 	{ 0x3C2291A9, 0x1AC2ABA4, 0x3B215B4C, 0x131D037A,
1205 	  0x17DDE302, 0x0C90B2E2, 0x0602C92D, 0x05CA9DA9,
1206 	  0x0000B01A, 0x0FC77FE2, 0x35F1214E, 0x07E16BDF,
1207 	  0x003DDC07, 0x2703791C, 0x3038B7EE, 0x3DAD56FE,
1208 	  0x041D0C8D, 0x0000E85C },
1209 
1210 	{ 0x3187B2A3, 0x0018A1C0, 0x00FEF5B3, 0x3E7E2E2A,
1211 	  0x01FB607E, 0x2CC199F0, 0x37B4625B, 0x0EDBE82F,
1212 	  0x00008E53, 0x01F400B4, 0x15786A1B, 0x3041B21C,
1213 	  0x31CD8CF2, 0x35900053, 0x1A7E0E9B, 0x318366D0,
1214 	  0x076F780C, 0x000073EB },
1215 
1216 	{ 0x1B6FB393, 0x13767707, 0x3CE97DBB, 0x348E2603,
1217 	  0x354CADC1, 0x09D0B4EA, 0x1B053404, 0x1DE76FBA,
1218 	  0x000062D9, 0x0F09957E, 0x295029A8, 0x3E76A78D,
1219 	  0x3B547DAE, 0x27CEE0A2, 0x0575DC45, 0x1D8244FF,
1220 	  0x332F647A, 0x0000AD5A },
1221 
1222 	{ 0x10949EE0, 0x1E7A292E, 0x06DF8B3D, 0x02B2E30B,
1223 	  0x31F8729E, 0x24E35475, 0x30B71878, 0x35EDBFB7,
1224 	  0x0000EA68, 0x0DD048FA, 0x21688929, 0x0DE823FE,
1225 	  0x1C53FAA9, 0x0EA0C84D, 0x052A592A, 0x1FCE7870,
1226 	  0x11325CB2, 0x00002A27 },
1227 
1228 	{ 0x04C5723F, 0x30D81A50, 0x048306E4, 0x329B11C7,
1229 	  0x223FB545, 0x085347A8, 0x2993E591, 0x1B5ACA8E,
1230 	  0x0000CEF6, 0x04AF0773, 0x28D2EEA9, 0x2751EEEC,
1231 	  0x037B4A7F, 0x3B4C1059, 0x08F37674, 0x2AE906E1,
1232 	  0x18A88A6A, 0x00008786 },
1233 
1234 	{ 0x34BC21D1, 0x0CCE474D, 0x15048BF4, 0x1D0BB409,
1235 	  0x021CDA16, 0x20DE76C3, 0x34C59063, 0x04EDE20E,
1236 	  0x00003ED1, 0x282A3740, 0x0BE3BBF3, 0x29889DAE,
1237 	  0x03413697, 0x34C68A09, 0x210EBE93, 0x0C8A224C,
1238 	  0x0826B331, 0x00009099 },
1239 
1240 	{ 0x0624E3C4, 0x140317BA, 0x2F82C99D, 0x260C0A2C,
1241 	  0x25D55179, 0x194DCC83, 0x3D95E462, 0x356F6A05,
1242 	  0x0000741D, 0x0D4481D3, 0x2657FC8B, 0x1BA5CA71,
1243 	  0x3AE44B0D, 0x07B1548E, 0x0E0D5522, 0x05FDC567,
1244 	  0x2D1AA70E, 0x00000770 },
1245 
1246 	{ 0x06072C01, 0x23857675, 0x1EAD58A9, 0x0B8A12D9,
1247 	  0x1EE2FC79, 0x0177CB61, 0x0495A618, 0x20DEB82B,
1248 	  0x0000177C, 0x2FC7BFD8, 0x310EEF8B, 0x1FB4DF39,
1249 	  0x3B8530E8, 0x0F4E7226, 0x0246B6D0, 0x2A558A24,
1250 	  0x163353AF, 0x000063BB },
1251 
1252 	{ 0x24D2920B, 0x1C249DCC, 0x2069C5E5, 0x09AB2F9E,
1253 	  0x36DF3CF1, 0x1991FD0C, 0x062B97A7, 0x1E80070E,
1254 	  0x000054E7, 0x20D0B375, 0x2E9F20BD, 0x35090081,
1255 	  0x1C7A9DDC, 0x22E7C371, 0x087E3016, 0x03175421,
1256 	  0x3C6ECA7D, 0x0000F599 },
1257 
1258 	{ 0x259B9D5F, 0x0D9A318F, 0x23A0EF16, 0x00EBE4B7,
1259 	  0x088265AE, 0x2CDE2666, 0x2BAE7ADF, 0x1371A5C6,
1260 	  0x0000F045, 0x0D034F36, 0x1F967378, 0x1B5FA3F4,
1261 	  0x0EC8739D, 0x1643E62A, 0x1653947E, 0x22D1F4E6,
1262 	  0x0FB8D64B, 0x0000B5B9 }
1263 };
1264 
1265 /*
1266  * Lookup one of the Gwin[] values, by index. This is constant-time.
1267  */
1268 static void
1269 lookup_Gwin(p256_jacobian *T, uint32_t idx)
1270 {
1271 	uint32_t xy[18];
1272 	uint32_t k;
1273 	size_t u;
1274 
1275 	memset(xy, 0, sizeof xy);
1276 	for (k = 0; k < 15; k ++) {
1277 		uint32_t m;
1278 
1279 		m = -EQ(idx, k + 1);
1280 		for (u = 0; u < 18; u ++) {
1281 			xy[u] |= m & Gwin[k][u];
1282 		}
1283 	}
1284 	memcpy(T->x, &xy[0], sizeof T->x);
1285 	memcpy(T->y, &xy[9], sizeof T->y);
1286 	memset(T->z, 0, sizeof T->z);
1287 	T->z[0] = 1;
1288 }
1289 
1290 /*
1291  * Multiply the generator by an integer. The integer is assumed non-zero
1292  * and lower than the curve order.
1293  */
1294 static void
1295 p256_mulgen(p256_jacobian *P, const unsigned char *x, size_t xlen)
1296 {
1297 	/*
1298 	 * qz is a flag that is initially 1, and remains equal to 1
1299 	 * as long as the point is the point at infinity.
1300 	 *
1301 	 * We use a 4-bit window to handle multiplier bits by groups
1302 	 * of 4. The precomputed window is constant static data, with
1303 	 * points in affine coordinates; we use a constant-time lookup.
1304 	 */
1305 	p256_jacobian Q;
1306 	uint32_t qz;
1307 
1308 	memset(&Q, 0, sizeof Q);
1309 	qz = 1;
1310 	while (xlen -- > 0) {
1311 		int k;
1312 		unsigned bx;
1313 
1314 		bx = *x ++;
1315 		for (k = 0; k < 2; k ++) {
1316 			uint32_t bits;
1317 			uint32_t bnz;
1318 			p256_jacobian T, U;
1319 
1320 			p256_double(&Q);
1321 			p256_double(&Q);
1322 			p256_double(&Q);
1323 			p256_double(&Q);
1324 			bits = (bx >> 4) & 0x0F;
1325 			bnz = NEQ(bits, 0);
1326 			lookup_Gwin(&T, bits);
1327 			U = Q;
1328 			p256_add_mixed(&U, &T);
1329 			CCOPY(bnz & qz, &Q, &T, sizeof Q);
1330 			CCOPY(bnz & ~qz, &Q, &U, sizeof Q);
1331 			qz &= ~bnz;
1332 			bx <<= 4;
1333 		}
1334 	}
1335 	*P = Q;
1336 }
1337 
1338 static const unsigned char P256_G[] = {
1339 	0x04, 0x6B, 0x17, 0xD1, 0xF2, 0xE1, 0x2C, 0x42, 0x47, 0xF8,
1340 	0xBC, 0xE6, 0xE5, 0x63, 0xA4, 0x40, 0xF2, 0x77, 0x03, 0x7D,
1341 	0x81, 0x2D, 0xEB, 0x33, 0xA0, 0xF4, 0xA1, 0x39, 0x45, 0xD8,
1342 	0x98, 0xC2, 0x96, 0x4F, 0xE3, 0x42, 0xE2, 0xFE, 0x1A, 0x7F,
1343 	0x9B, 0x8E, 0xE7, 0xEB, 0x4A, 0x7C, 0x0F, 0x9E, 0x16, 0x2B,
1344 	0xCE, 0x33, 0x57, 0x6B, 0x31, 0x5E, 0xCE, 0xCB, 0xB6, 0x40,
1345 	0x68, 0x37, 0xBF, 0x51, 0xF5
1346 };
1347 
1348 static const unsigned char P256_N[] = {
1349 	0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF,
1350 	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xBC, 0xE6, 0xFA, 0xAD,
1351 	0xA7, 0x17, 0x9E, 0x84, 0xF3, 0xB9, 0xCA, 0xC2, 0xFC, 0x63,
1352 	0x25, 0x51
1353 };
1354 
1355 static const unsigned char *
1356 api_generator(int curve, size_t *len)
1357 {
1358 	(void)curve;
1359 	*len = sizeof P256_G;
1360 	return P256_G;
1361 }
1362 
1363 static const unsigned char *
1364 api_order(int curve, size_t *len)
1365 {
1366 	(void)curve;
1367 	*len = sizeof P256_N;
1368 	return P256_N;
1369 }
1370 
1371 static size_t
1372 api_xoff(int curve, size_t *len)
1373 {
1374 	(void)curve;
1375 	*len = 32;
1376 	return 1;
1377 }
1378 
1379 static uint32_t
1380 api_mul(unsigned char *G, size_t Glen,
1381 	const unsigned char *x, size_t xlen, int curve)
1382 {
1383 	uint32_t r;
1384 	p256_jacobian P;
1385 
1386 	(void)curve;
1387 	r = p256_decode(&P, G, Glen);
1388 	p256_mul(&P, x, xlen);
1389 	if (Glen >= 65) {
1390 		p256_to_affine(&P);
1391 		p256_encode(G, &P);
1392 	}
1393 	return r;
1394 }
1395 
1396 static size_t
1397 api_mulgen(unsigned char *R,
1398 	const unsigned char *x, size_t xlen, int curve)
1399 {
1400 	p256_jacobian P;
1401 
1402 	(void)curve;
1403 	p256_mulgen(&P, x, xlen);
1404 	p256_to_affine(&P);
1405 	p256_encode(R, &P);
1406 	return 65;
1407 
1408 	/*
1409 	const unsigned char *G;
1410 	size_t Glen;
1411 
1412 	G = api_generator(curve, &Glen);
1413 	memcpy(R, G, Glen);
1414 	api_mul(R, Glen, x, xlen, curve);
1415 	return Glen;
1416 	*/
1417 }
1418 
1419 static uint32_t
1420 api_muladd(unsigned char *A, const unsigned char *B, size_t len,
1421 	const unsigned char *x, size_t xlen,
1422 	const unsigned char *y, size_t ylen, int curve)
1423 {
1424 	p256_jacobian P, Q;
1425 	uint32_t r, t, z;
1426 	int i;
1427 
1428 	(void)curve;
1429 	r = p256_decode(&P, A, len);
1430 	p256_mul(&P, x, xlen);
1431 	if (B == NULL) {
1432 		p256_mulgen(&Q, y, ylen);
1433 	} else {
1434 		r &= p256_decode(&Q, B, len);
1435 		p256_mul(&Q, y, ylen);
1436 	}
1437 
1438 	/*
1439 	 * The final addition may fail in case both points are equal.
1440 	 */
1441 	t = p256_add(&P, &Q);
1442 	reduce_final_f256(P.z);
1443 	z = 0;
1444 	for (i = 0; i < 9; i ++) {
1445 		z |= P.z[i];
1446 	}
1447 	z = EQ(z, 0);
1448 	p256_double(&Q);
1449 
1450 	/*
1451 	 * If z is 1 then either P+Q = 0 (t = 1) or P = Q (t = 0). So we
1452 	 * have the following:
1453 	 *
1454 	 *   z = 0, t = 0   return P (normal addition)
1455 	 *   z = 0, t = 1   return P (normal addition)
1456 	 *   z = 1, t = 0   return Q (a 'double' case)
1457 	 *   z = 1, t = 1   report an error (P+Q = 0)
1458 	 */
1459 	CCOPY(z & ~t, &P, &Q, sizeof Q);
1460 	p256_to_affine(&P);
1461 	p256_encode(A, &P);
1462 	r &= ~(z & t);
1463 	return r;
1464 }
1465 
1466 /* see bearssl_ec.h */
1467 const br_ec_impl br_ec_p256_m31 = {
1468 	(uint32_t)0x00800000,
1469 	&api_generator,
1470 	&api_order,
1471 	&api_xoff,
1472 	&api_mul,
1473 	&api_mulgen,
1474 	&api_muladd
1475 };
1476