xref: /freebsd/contrib/bearssl/src/ec/ec_p256_m15.c (revision 29fc4075e69fd27de0cded313ac6000165d99f8b)
1 /*
2  * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining
5  * a copy of this software and associated documentation files (the
6  * "Software"), to deal in the Software without restriction, including
7  * without limitation the rights to use, copy, modify, merge, publish,
8  * distribute, sublicense, and/or sell copies of the Software, and to
9  * permit persons to whom the Software is furnished to do so, subject to
10  * the following conditions:
11  *
12  * The above copyright notice and this permission notice shall be
13  * included in all copies or substantial portions of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 
25 #include "inner.h"
26 
27 /*
28  * If BR_NO_ARITH_SHIFT is undefined, or defined to 0, then we _assume_
29  * that right-shifting a signed negative integer copies the sign bit
30  * (arithmetic right-shift). This is "implementation-defined behaviour",
31  * i.e. it is not undefined, but it may differ between compilers. Each
32  * compiler is supposed to document its behaviour in that respect. GCC
33  * explicitly defines that an arithmetic right shift is used. We expect
34  * all other compilers to do the same, because underlying CPU offer an
35  * arithmetic right shift opcode that could not be used otherwise.
36  */
37 #if BR_NO_ARITH_SHIFT
38 #define ARSH(x, n)   (((uint32_t)(x) >> (n)) \
39                     | ((-((uint32_t)(x) >> 31)) << (32 - (n))))
40 #else
41 #define ARSH(x, n)   ((*(int32_t *)&(x)) >> (n))
42 #endif
43 
44 /*
45  * Convert an integer from unsigned big-endian encoding to a sequence of
46  * 13-bit words in little-endian order. The final "partial" word is
47  * returned.
48  */
49 static uint32_t
50 be8_to_le13(uint32_t *dst, const unsigned char *src, size_t len)
51 {
52 	uint32_t acc;
53 	int acc_len;
54 
55 	acc = 0;
56 	acc_len = 0;
57 	while (len -- > 0) {
58 		acc |= (uint32_t)src[len] << acc_len;
59 		acc_len += 8;
60 		if (acc_len >= 13) {
61 			*dst ++ = acc & 0x1FFF;
62 			acc >>= 13;
63 			acc_len -= 13;
64 		}
65 	}
66 	return acc;
67 }
68 
69 /*
70  * Convert an integer (13-bit words, little-endian) to unsigned
71  * big-endian encoding. The total encoding length is provided; all
72  * the destination bytes will be filled.
73  */
74 static void
75 le13_to_be8(unsigned char *dst, size_t len, const uint32_t *src)
76 {
77 	uint32_t acc;
78 	int acc_len;
79 
80 	acc = 0;
81 	acc_len = 0;
82 	while (len -- > 0) {
83 		if (acc_len < 8) {
84 			acc |= (*src ++) << acc_len;
85 			acc_len += 13;
86 		}
87 		dst[len] = (unsigned char)acc;
88 		acc >>= 8;
89 		acc_len -= 8;
90 	}
91 }
92 
93 /*
94  * Normalise an array of words to a strict 13 bits per word. Returned
95  * value is the resulting carry. The source (w) and destination (d)
96  * arrays may be identical, but shall not overlap partially.
97  */
98 static inline uint32_t
99 norm13(uint32_t *d, const uint32_t *w, size_t len)
100 {
101 	size_t u;
102 	uint32_t cc;
103 
104 	cc = 0;
105 	for (u = 0; u < len; u ++) {
106 		int32_t z;
107 
108 		z = w[u] + cc;
109 		d[u] = z & 0x1FFF;
110 		cc = ARSH(z, 13);
111 	}
112 	return cc;
113 }
114 
115 /*
116  * mul20() multiplies two 260-bit integers together. Each word must fit
117  * on 13 bits; source operands use 20 words, destination operand
118  * receives 40 words. All overlaps allowed.
119  *
120  * square20() computes the square of a 260-bit integer. Each word must
121  * fit on 13 bits; source operand uses 20 words, destination operand
122  * receives 40 words. All overlaps allowed.
123  */
124 
125 #if BR_SLOW_MUL15
126 
127 static void
128 mul20(uint32_t *d, const uint32_t *a, const uint32_t *b)
129 {
130 	/*
131 	 * Two-level Karatsuba: turns a 20x20 multiplication into
132 	 * nine 5x5 multiplications. We use 13-bit words but do not
133 	 * propagate carries immediately, so words may expand:
134 	 *
135 	 *  - First Karatsuba decomposition turns the 20x20 mul on
136 	 *    13-bit words into three 10x10 muls, two on 13-bit words
137 	 *    and one on 14-bit words.
138 	 *
139 	 *  - Second Karatsuba decomposition further splits these into:
140 	 *
141 	 *     * four 5x5 muls on 13-bit words
142 	 *     * four 5x5 muls on 14-bit words
143 	 *     * one 5x5 mul on 15-bit words
144 	 *
145 	 * Highest word value is 8191, 16382 or 32764, for 13-bit, 14-bit
146 	 * or 15-bit words, respectively.
147 	 */
148 	uint32_t u[45], v[45], w[90];
149 	uint32_t cc;
150 	int i;
151 
152 #define ZADD(dw, d_off, s1w, s1_off, s2w, s2_off)   do { \
153 		(dw)[5 * (d_off) + 0] = (s1w)[5 * (s1_off) + 0] \
154 			+ (s2w)[5 * (s2_off) + 0]; \
155 		(dw)[5 * (d_off) + 1] = (s1w)[5 * (s1_off) + 1] \
156 			+ (s2w)[5 * (s2_off) + 1]; \
157 		(dw)[5 * (d_off) + 2] = (s1w)[5 * (s1_off) + 2] \
158 			+ (s2w)[5 * (s2_off) + 2]; \
159 		(dw)[5 * (d_off) + 3] = (s1w)[5 * (s1_off) + 3] \
160 			+ (s2w)[5 * (s2_off) + 3]; \
161 		(dw)[5 * (d_off) + 4] = (s1w)[5 * (s1_off) + 4] \
162 			+ (s2w)[5 * (s2_off) + 4]; \
163 	} while (0)
164 
165 #define ZADDT(dw, d_off, sw, s_off)   do { \
166 		(dw)[5 * (d_off) + 0] += (sw)[5 * (s_off) + 0]; \
167 		(dw)[5 * (d_off) + 1] += (sw)[5 * (s_off) + 1]; \
168 		(dw)[5 * (d_off) + 2] += (sw)[5 * (s_off) + 2]; \
169 		(dw)[5 * (d_off) + 3] += (sw)[5 * (s_off) + 3]; \
170 		(dw)[5 * (d_off) + 4] += (sw)[5 * (s_off) + 4]; \
171 	} while (0)
172 
173 #define ZSUB2F(dw, d_off, s1w, s1_off, s2w, s2_off)   do { \
174 		(dw)[5 * (d_off) + 0] -= (s1w)[5 * (s1_off) + 0] \
175 			+ (s2w)[5 * (s2_off) + 0]; \
176 		(dw)[5 * (d_off) + 1] -= (s1w)[5 * (s1_off) + 1] \
177 			+ (s2w)[5 * (s2_off) + 1]; \
178 		(dw)[5 * (d_off) + 2] -= (s1w)[5 * (s1_off) + 2] \
179 			+ (s2w)[5 * (s2_off) + 2]; \
180 		(dw)[5 * (d_off) + 3] -= (s1w)[5 * (s1_off) + 3] \
181 			+ (s2w)[5 * (s2_off) + 3]; \
182 		(dw)[5 * (d_off) + 4] -= (s1w)[5 * (s1_off) + 4] \
183 			+ (s2w)[5 * (s2_off) + 4]; \
184 	} while (0)
185 
186 #define CPR1(w, cprcc)   do { \
187 		uint32_t cprz = (w) + cprcc; \
188 		(w) = cprz & 0x1FFF; \
189 		cprcc = cprz >> 13; \
190 	} while (0)
191 
192 #define CPR(dw, d_off)   do { \
193 		uint32_t cprcc; \
194 		cprcc = 0; \
195 		CPR1((dw)[(d_off) + 0], cprcc); \
196 		CPR1((dw)[(d_off) + 1], cprcc); \
197 		CPR1((dw)[(d_off) + 2], cprcc); \
198 		CPR1((dw)[(d_off) + 3], cprcc); \
199 		CPR1((dw)[(d_off) + 4], cprcc); \
200 		CPR1((dw)[(d_off) + 5], cprcc); \
201 		CPR1((dw)[(d_off) + 6], cprcc); \
202 		CPR1((dw)[(d_off) + 7], cprcc); \
203 		CPR1((dw)[(d_off) + 8], cprcc); \
204 		(dw)[(d_off) + 9] = cprcc; \
205 	} while (0)
206 
207 	memcpy(u, a, 20 * sizeof *a);
208 	ZADD(u, 4, a, 0, a, 1);
209 	ZADD(u, 5, a, 2, a, 3);
210 	ZADD(u, 6, a, 0, a, 2);
211 	ZADD(u, 7, a, 1, a, 3);
212 	ZADD(u, 8, u, 6, u, 7);
213 
214 	memcpy(v, b, 20 * sizeof *b);
215 	ZADD(v, 4, b, 0, b, 1);
216 	ZADD(v, 5, b, 2, b, 3);
217 	ZADD(v, 6, b, 0, b, 2);
218 	ZADD(v, 7, b, 1, b, 3);
219 	ZADD(v, 8, v, 6, v, 7);
220 
221 	/*
222 	 * Do the eight first 8x8 muls. Source words are at most 16382
223 	 * each, so we can add product results together "as is" in 32-bit
224 	 * words.
225 	 */
226 	for (i = 0; i < 40; i += 5) {
227 		w[(i << 1) + 0] = MUL15(u[i + 0], v[i + 0]);
228 		w[(i << 1) + 1] = MUL15(u[i + 0], v[i + 1])
229 			+ MUL15(u[i + 1], v[i + 0]);
230 		w[(i << 1) + 2] = MUL15(u[i + 0], v[i + 2])
231 			+ MUL15(u[i + 1], v[i + 1])
232 			+ MUL15(u[i + 2], v[i + 0]);
233 		w[(i << 1) + 3] = MUL15(u[i + 0], v[i + 3])
234 			+ MUL15(u[i + 1], v[i + 2])
235 			+ MUL15(u[i + 2], v[i + 1])
236 			+ MUL15(u[i + 3], v[i + 0]);
237 		w[(i << 1) + 4] = MUL15(u[i + 0], v[i + 4])
238 			+ MUL15(u[i + 1], v[i + 3])
239 			+ MUL15(u[i + 2], v[i + 2])
240 			+ MUL15(u[i + 3], v[i + 1])
241 			+ MUL15(u[i + 4], v[i + 0]);
242 		w[(i << 1) + 5] = MUL15(u[i + 1], v[i + 4])
243 			+ MUL15(u[i + 2], v[i + 3])
244 			+ MUL15(u[i + 3], v[i + 2])
245 			+ MUL15(u[i + 4], v[i + 1]);
246 		w[(i << 1) + 6] = MUL15(u[i + 2], v[i + 4])
247 			+ MUL15(u[i + 3], v[i + 3])
248 			+ MUL15(u[i + 4], v[i + 2]);
249 		w[(i << 1) + 7] = MUL15(u[i + 3], v[i + 4])
250 			+ MUL15(u[i + 4], v[i + 3]);
251 		w[(i << 1) + 8] = MUL15(u[i + 4], v[i + 4]);
252 		w[(i << 1) + 9] = 0;
253 	}
254 
255 	/*
256 	 * For the 9th multiplication, source words are up to 32764,
257 	 * so we must do some carry propagation. If we add up to
258 	 * 4 products and the carry is no more than 524224, then the
259 	 * result fits in 32 bits, and the next carry will be no more
260 	 * than 524224 (because 4*(32764^2)+524224 < 8192*524225).
261 	 *
262 	 * We thus just skip one of the products in the middle word,
263 	 * then do a carry propagation (this reduces words to 13 bits
264 	 * each, except possibly the last, which may use up to 17 bits
265 	 * or so), then add the missing product.
266 	 */
267 	w[80 + 0] = MUL15(u[40 + 0], v[40 + 0]);
268 	w[80 + 1] = MUL15(u[40 + 0], v[40 + 1])
269 		+ MUL15(u[40 + 1], v[40 + 0]);
270 	w[80 + 2] = MUL15(u[40 + 0], v[40 + 2])
271 		+ MUL15(u[40 + 1], v[40 + 1])
272 		+ MUL15(u[40 + 2], v[40 + 0]);
273 	w[80 + 3] = MUL15(u[40 + 0], v[40 + 3])
274 		+ MUL15(u[40 + 1], v[40 + 2])
275 		+ MUL15(u[40 + 2], v[40 + 1])
276 		+ MUL15(u[40 + 3], v[40 + 0]);
277 	w[80 + 4] = MUL15(u[40 + 0], v[40 + 4])
278 		+ MUL15(u[40 + 1], v[40 + 3])
279 		+ MUL15(u[40 + 2], v[40 + 2])
280 		+ MUL15(u[40 + 3], v[40 + 1]);
281 		/* + MUL15(u[40 + 4], v[40 + 0]) */
282 	w[80 + 5] = MUL15(u[40 + 1], v[40 + 4])
283 		+ MUL15(u[40 + 2], v[40 + 3])
284 		+ MUL15(u[40 + 3], v[40 + 2])
285 		+ MUL15(u[40 + 4], v[40 + 1]);
286 	w[80 + 6] = MUL15(u[40 + 2], v[40 + 4])
287 		+ MUL15(u[40 + 3], v[40 + 3])
288 		+ MUL15(u[40 + 4], v[40 + 2]);
289 	w[80 + 7] = MUL15(u[40 + 3], v[40 + 4])
290 		+ MUL15(u[40 + 4], v[40 + 3]);
291 	w[80 + 8] = MUL15(u[40 + 4], v[40 + 4]);
292 
293 	CPR(w, 80);
294 
295 	w[80 + 4] += MUL15(u[40 + 4], v[40 + 0]);
296 
297 	/*
298 	 * The products on 14-bit words in slots 6 and 7 yield values
299 	 * up to 5*(16382^2) each, and we need to subtract two such
300 	 * values from the higher word. We need the subtraction to fit
301 	 * in a _signed_ 32-bit integer, i.e. 31 bits + a sign bit.
302 	 * However, 10*(16382^2) does not fit. So we must perform a
303 	 * bit of reduction here.
304 	 */
305 	CPR(w, 60);
306 	CPR(w, 70);
307 
308 	/*
309 	 * Recompose results.
310 	 */
311 
312 	/* 0..1*0..1 into 0..3 */
313 	ZSUB2F(w, 8, w, 0, w, 2);
314 	ZSUB2F(w, 9, w, 1, w, 3);
315 	ZADDT(w, 1, w, 8);
316 	ZADDT(w, 2, w, 9);
317 
318 	/* 2..3*2..3 into 4..7 */
319 	ZSUB2F(w, 10, w, 4, w, 6);
320 	ZSUB2F(w, 11, w, 5, w, 7);
321 	ZADDT(w, 5, w, 10);
322 	ZADDT(w, 6, w, 11);
323 
324 	/* (0..1+2..3)*(0..1+2..3) into 12..15 */
325 	ZSUB2F(w, 16, w, 12, w, 14);
326 	ZSUB2F(w, 17, w, 13, w, 15);
327 	ZADDT(w, 13, w, 16);
328 	ZADDT(w, 14, w, 17);
329 
330 	/* first-level recomposition */
331 	ZSUB2F(w, 12, w, 0, w, 4);
332 	ZSUB2F(w, 13, w, 1, w, 5);
333 	ZSUB2F(w, 14, w, 2, w, 6);
334 	ZSUB2F(w, 15, w, 3, w, 7);
335 	ZADDT(w, 2, w, 12);
336 	ZADDT(w, 3, w, 13);
337 	ZADDT(w, 4, w, 14);
338 	ZADDT(w, 5, w, 15);
339 
340 	/*
341 	 * Perform carry propagation to bring all words down to 13 bits.
342 	 */
343 	cc = norm13(d, w, 40);
344 	d[39] += (cc << 13);
345 
346 #undef ZADD
347 #undef ZADDT
348 #undef ZSUB2F
349 #undef CPR1
350 #undef CPR
351 }
352 
353 static inline void
354 square20(uint32_t *d, const uint32_t *a)
355 {
356 	mul20(d, a, a);
357 }
358 
359 #else
360 
361 static void
362 mul20(uint32_t *d, const uint32_t *a, const uint32_t *b)
363 {
364 	uint32_t t[39];
365 
366 	t[ 0] = MUL15(a[ 0], b[ 0]);
367 	t[ 1] = MUL15(a[ 0], b[ 1])
368 		+ MUL15(a[ 1], b[ 0]);
369 	t[ 2] = MUL15(a[ 0], b[ 2])
370 		+ MUL15(a[ 1], b[ 1])
371 		+ MUL15(a[ 2], b[ 0]);
372 	t[ 3] = MUL15(a[ 0], b[ 3])
373 		+ MUL15(a[ 1], b[ 2])
374 		+ MUL15(a[ 2], b[ 1])
375 		+ MUL15(a[ 3], b[ 0]);
376 	t[ 4] = MUL15(a[ 0], b[ 4])
377 		+ MUL15(a[ 1], b[ 3])
378 		+ MUL15(a[ 2], b[ 2])
379 		+ MUL15(a[ 3], b[ 1])
380 		+ MUL15(a[ 4], b[ 0]);
381 	t[ 5] = MUL15(a[ 0], b[ 5])
382 		+ MUL15(a[ 1], b[ 4])
383 		+ MUL15(a[ 2], b[ 3])
384 		+ MUL15(a[ 3], b[ 2])
385 		+ MUL15(a[ 4], b[ 1])
386 		+ MUL15(a[ 5], b[ 0]);
387 	t[ 6] = MUL15(a[ 0], b[ 6])
388 		+ MUL15(a[ 1], b[ 5])
389 		+ MUL15(a[ 2], b[ 4])
390 		+ MUL15(a[ 3], b[ 3])
391 		+ MUL15(a[ 4], b[ 2])
392 		+ MUL15(a[ 5], b[ 1])
393 		+ MUL15(a[ 6], b[ 0]);
394 	t[ 7] = MUL15(a[ 0], b[ 7])
395 		+ MUL15(a[ 1], b[ 6])
396 		+ MUL15(a[ 2], b[ 5])
397 		+ MUL15(a[ 3], b[ 4])
398 		+ MUL15(a[ 4], b[ 3])
399 		+ MUL15(a[ 5], b[ 2])
400 		+ MUL15(a[ 6], b[ 1])
401 		+ MUL15(a[ 7], b[ 0]);
402 	t[ 8] = MUL15(a[ 0], b[ 8])
403 		+ MUL15(a[ 1], b[ 7])
404 		+ MUL15(a[ 2], b[ 6])
405 		+ MUL15(a[ 3], b[ 5])
406 		+ MUL15(a[ 4], b[ 4])
407 		+ MUL15(a[ 5], b[ 3])
408 		+ MUL15(a[ 6], b[ 2])
409 		+ MUL15(a[ 7], b[ 1])
410 		+ MUL15(a[ 8], b[ 0]);
411 	t[ 9] = MUL15(a[ 0], b[ 9])
412 		+ MUL15(a[ 1], b[ 8])
413 		+ MUL15(a[ 2], b[ 7])
414 		+ MUL15(a[ 3], b[ 6])
415 		+ MUL15(a[ 4], b[ 5])
416 		+ MUL15(a[ 5], b[ 4])
417 		+ MUL15(a[ 6], b[ 3])
418 		+ MUL15(a[ 7], b[ 2])
419 		+ MUL15(a[ 8], b[ 1])
420 		+ MUL15(a[ 9], b[ 0]);
421 	t[10] = MUL15(a[ 0], b[10])
422 		+ MUL15(a[ 1], b[ 9])
423 		+ MUL15(a[ 2], b[ 8])
424 		+ MUL15(a[ 3], b[ 7])
425 		+ MUL15(a[ 4], b[ 6])
426 		+ MUL15(a[ 5], b[ 5])
427 		+ MUL15(a[ 6], b[ 4])
428 		+ MUL15(a[ 7], b[ 3])
429 		+ MUL15(a[ 8], b[ 2])
430 		+ MUL15(a[ 9], b[ 1])
431 		+ MUL15(a[10], b[ 0]);
432 	t[11] = MUL15(a[ 0], b[11])
433 		+ MUL15(a[ 1], b[10])
434 		+ MUL15(a[ 2], b[ 9])
435 		+ MUL15(a[ 3], b[ 8])
436 		+ MUL15(a[ 4], b[ 7])
437 		+ MUL15(a[ 5], b[ 6])
438 		+ MUL15(a[ 6], b[ 5])
439 		+ MUL15(a[ 7], b[ 4])
440 		+ MUL15(a[ 8], b[ 3])
441 		+ MUL15(a[ 9], b[ 2])
442 		+ MUL15(a[10], b[ 1])
443 		+ MUL15(a[11], b[ 0]);
444 	t[12] = MUL15(a[ 0], b[12])
445 		+ MUL15(a[ 1], b[11])
446 		+ MUL15(a[ 2], b[10])
447 		+ MUL15(a[ 3], b[ 9])
448 		+ MUL15(a[ 4], b[ 8])
449 		+ MUL15(a[ 5], b[ 7])
450 		+ MUL15(a[ 6], b[ 6])
451 		+ MUL15(a[ 7], b[ 5])
452 		+ MUL15(a[ 8], b[ 4])
453 		+ MUL15(a[ 9], b[ 3])
454 		+ MUL15(a[10], b[ 2])
455 		+ MUL15(a[11], b[ 1])
456 		+ MUL15(a[12], b[ 0]);
457 	t[13] = MUL15(a[ 0], b[13])
458 		+ MUL15(a[ 1], b[12])
459 		+ MUL15(a[ 2], b[11])
460 		+ MUL15(a[ 3], b[10])
461 		+ MUL15(a[ 4], b[ 9])
462 		+ MUL15(a[ 5], b[ 8])
463 		+ MUL15(a[ 6], b[ 7])
464 		+ MUL15(a[ 7], b[ 6])
465 		+ MUL15(a[ 8], b[ 5])
466 		+ MUL15(a[ 9], b[ 4])
467 		+ MUL15(a[10], b[ 3])
468 		+ MUL15(a[11], b[ 2])
469 		+ MUL15(a[12], b[ 1])
470 		+ MUL15(a[13], b[ 0]);
471 	t[14] = MUL15(a[ 0], b[14])
472 		+ MUL15(a[ 1], b[13])
473 		+ MUL15(a[ 2], b[12])
474 		+ MUL15(a[ 3], b[11])
475 		+ MUL15(a[ 4], b[10])
476 		+ MUL15(a[ 5], b[ 9])
477 		+ MUL15(a[ 6], b[ 8])
478 		+ MUL15(a[ 7], b[ 7])
479 		+ MUL15(a[ 8], b[ 6])
480 		+ MUL15(a[ 9], b[ 5])
481 		+ MUL15(a[10], b[ 4])
482 		+ MUL15(a[11], b[ 3])
483 		+ MUL15(a[12], b[ 2])
484 		+ MUL15(a[13], b[ 1])
485 		+ MUL15(a[14], b[ 0]);
486 	t[15] = MUL15(a[ 0], b[15])
487 		+ MUL15(a[ 1], b[14])
488 		+ MUL15(a[ 2], b[13])
489 		+ MUL15(a[ 3], b[12])
490 		+ MUL15(a[ 4], b[11])
491 		+ MUL15(a[ 5], b[10])
492 		+ MUL15(a[ 6], b[ 9])
493 		+ MUL15(a[ 7], b[ 8])
494 		+ MUL15(a[ 8], b[ 7])
495 		+ MUL15(a[ 9], b[ 6])
496 		+ MUL15(a[10], b[ 5])
497 		+ MUL15(a[11], b[ 4])
498 		+ MUL15(a[12], b[ 3])
499 		+ MUL15(a[13], b[ 2])
500 		+ MUL15(a[14], b[ 1])
501 		+ MUL15(a[15], b[ 0]);
502 	t[16] = MUL15(a[ 0], b[16])
503 		+ MUL15(a[ 1], b[15])
504 		+ MUL15(a[ 2], b[14])
505 		+ MUL15(a[ 3], b[13])
506 		+ MUL15(a[ 4], b[12])
507 		+ MUL15(a[ 5], b[11])
508 		+ MUL15(a[ 6], b[10])
509 		+ MUL15(a[ 7], b[ 9])
510 		+ MUL15(a[ 8], b[ 8])
511 		+ MUL15(a[ 9], b[ 7])
512 		+ MUL15(a[10], b[ 6])
513 		+ MUL15(a[11], b[ 5])
514 		+ MUL15(a[12], b[ 4])
515 		+ MUL15(a[13], b[ 3])
516 		+ MUL15(a[14], b[ 2])
517 		+ MUL15(a[15], b[ 1])
518 		+ MUL15(a[16], b[ 0]);
519 	t[17] = MUL15(a[ 0], b[17])
520 		+ MUL15(a[ 1], b[16])
521 		+ MUL15(a[ 2], b[15])
522 		+ MUL15(a[ 3], b[14])
523 		+ MUL15(a[ 4], b[13])
524 		+ MUL15(a[ 5], b[12])
525 		+ MUL15(a[ 6], b[11])
526 		+ MUL15(a[ 7], b[10])
527 		+ MUL15(a[ 8], b[ 9])
528 		+ MUL15(a[ 9], b[ 8])
529 		+ MUL15(a[10], b[ 7])
530 		+ MUL15(a[11], b[ 6])
531 		+ MUL15(a[12], b[ 5])
532 		+ MUL15(a[13], b[ 4])
533 		+ MUL15(a[14], b[ 3])
534 		+ MUL15(a[15], b[ 2])
535 		+ MUL15(a[16], b[ 1])
536 		+ MUL15(a[17], b[ 0]);
537 	t[18] = MUL15(a[ 0], b[18])
538 		+ MUL15(a[ 1], b[17])
539 		+ MUL15(a[ 2], b[16])
540 		+ MUL15(a[ 3], b[15])
541 		+ MUL15(a[ 4], b[14])
542 		+ MUL15(a[ 5], b[13])
543 		+ MUL15(a[ 6], b[12])
544 		+ MUL15(a[ 7], b[11])
545 		+ MUL15(a[ 8], b[10])
546 		+ MUL15(a[ 9], b[ 9])
547 		+ MUL15(a[10], b[ 8])
548 		+ MUL15(a[11], b[ 7])
549 		+ MUL15(a[12], b[ 6])
550 		+ MUL15(a[13], b[ 5])
551 		+ MUL15(a[14], b[ 4])
552 		+ MUL15(a[15], b[ 3])
553 		+ MUL15(a[16], b[ 2])
554 		+ MUL15(a[17], b[ 1])
555 		+ MUL15(a[18], b[ 0]);
556 	t[19] = MUL15(a[ 0], b[19])
557 		+ MUL15(a[ 1], b[18])
558 		+ MUL15(a[ 2], b[17])
559 		+ MUL15(a[ 3], b[16])
560 		+ MUL15(a[ 4], b[15])
561 		+ MUL15(a[ 5], b[14])
562 		+ MUL15(a[ 6], b[13])
563 		+ MUL15(a[ 7], b[12])
564 		+ MUL15(a[ 8], b[11])
565 		+ MUL15(a[ 9], b[10])
566 		+ MUL15(a[10], b[ 9])
567 		+ MUL15(a[11], b[ 8])
568 		+ MUL15(a[12], b[ 7])
569 		+ MUL15(a[13], b[ 6])
570 		+ MUL15(a[14], b[ 5])
571 		+ MUL15(a[15], b[ 4])
572 		+ MUL15(a[16], b[ 3])
573 		+ MUL15(a[17], b[ 2])
574 		+ MUL15(a[18], b[ 1])
575 		+ MUL15(a[19], b[ 0]);
576 	t[20] = MUL15(a[ 1], b[19])
577 		+ MUL15(a[ 2], b[18])
578 		+ MUL15(a[ 3], b[17])
579 		+ MUL15(a[ 4], b[16])
580 		+ MUL15(a[ 5], b[15])
581 		+ MUL15(a[ 6], b[14])
582 		+ MUL15(a[ 7], b[13])
583 		+ MUL15(a[ 8], b[12])
584 		+ MUL15(a[ 9], b[11])
585 		+ MUL15(a[10], b[10])
586 		+ MUL15(a[11], b[ 9])
587 		+ MUL15(a[12], b[ 8])
588 		+ MUL15(a[13], b[ 7])
589 		+ MUL15(a[14], b[ 6])
590 		+ MUL15(a[15], b[ 5])
591 		+ MUL15(a[16], b[ 4])
592 		+ MUL15(a[17], b[ 3])
593 		+ MUL15(a[18], b[ 2])
594 		+ MUL15(a[19], b[ 1]);
595 	t[21] = MUL15(a[ 2], b[19])
596 		+ MUL15(a[ 3], b[18])
597 		+ MUL15(a[ 4], b[17])
598 		+ MUL15(a[ 5], b[16])
599 		+ MUL15(a[ 6], b[15])
600 		+ MUL15(a[ 7], b[14])
601 		+ MUL15(a[ 8], b[13])
602 		+ MUL15(a[ 9], b[12])
603 		+ MUL15(a[10], b[11])
604 		+ MUL15(a[11], b[10])
605 		+ MUL15(a[12], b[ 9])
606 		+ MUL15(a[13], b[ 8])
607 		+ MUL15(a[14], b[ 7])
608 		+ MUL15(a[15], b[ 6])
609 		+ MUL15(a[16], b[ 5])
610 		+ MUL15(a[17], b[ 4])
611 		+ MUL15(a[18], b[ 3])
612 		+ MUL15(a[19], b[ 2]);
613 	t[22] = MUL15(a[ 3], b[19])
614 		+ MUL15(a[ 4], b[18])
615 		+ MUL15(a[ 5], b[17])
616 		+ MUL15(a[ 6], b[16])
617 		+ MUL15(a[ 7], b[15])
618 		+ MUL15(a[ 8], b[14])
619 		+ MUL15(a[ 9], b[13])
620 		+ MUL15(a[10], b[12])
621 		+ MUL15(a[11], b[11])
622 		+ MUL15(a[12], b[10])
623 		+ MUL15(a[13], b[ 9])
624 		+ MUL15(a[14], b[ 8])
625 		+ MUL15(a[15], b[ 7])
626 		+ MUL15(a[16], b[ 6])
627 		+ MUL15(a[17], b[ 5])
628 		+ MUL15(a[18], b[ 4])
629 		+ MUL15(a[19], b[ 3]);
630 	t[23] = MUL15(a[ 4], b[19])
631 		+ MUL15(a[ 5], b[18])
632 		+ MUL15(a[ 6], b[17])
633 		+ MUL15(a[ 7], b[16])
634 		+ MUL15(a[ 8], b[15])
635 		+ MUL15(a[ 9], b[14])
636 		+ MUL15(a[10], b[13])
637 		+ MUL15(a[11], b[12])
638 		+ MUL15(a[12], b[11])
639 		+ MUL15(a[13], b[10])
640 		+ MUL15(a[14], b[ 9])
641 		+ MUL15(a[15], b[ 8])
642 		+ MUL15(a[16], b[ 7])
643 		+ MUL15(a[17], b[ 6])
644 		+ MUL15(a[18], b[ 5])
645 		+ MUL15(a[19], b[ 4]);
646 	t[24] = MUL15(a[ 5], b[19])
647 		+ MUL15(a[ 6], b[18])
648 		+ MUL15(a[ 7], b[17])
649 		+ MUL15(a[ 8], b[16])
650 		+ MUL15(a[ 9], b[15])
651 		+ MUL15(a[10], b[14])
652 		+ MUL15(a[11], b[13])
653 		+ MUL15(a[12], b[12])
654 		+ MUL15(a[13], b[11])
655 		+ MUL15(a[14], b[10])
656 		+ MUL15(a[15], b[ 9])
657 		+ MUL15(a[16], b[ 8])
658 		+ MUL15(a[17], b[ 7])
659 		+ MUL15(a[18], b[ 6])
660 		+ MUL15(a[19], b[ 5]);
661 	t[25] = MUL15(a[ 6], b[19])
662 		+ MUL15(a[ 7], b[18])
663 		+ MUL15(a[ 8], b[17])
664 		+ MUL15(a[ 9], b[16])
665 		+ MUL15(a[10], b[15])
666 		+ MUL15(a[11], b[14])
667 		+ MUL15(a[12], b[13])
668 		+ MUL15(a[13], b[12])
669 		+ MUL15(a[14], b[11])
670 		+ MUL15(a[15], b[10])
671 		+ MUL15(a[16], b[ 9])
672 		+ MUL15(a[17], b[ 8])
673 		+ MUL15(a[18], b[ 7])
674 		+ MUL15(a[19], b[ 6]);
675 	t[26] = MUL15(a[ 7], b[19])
676 		+ MUL15(a[ 8], b[18])
677 		+ MUL15(a[ 9], b[17])
678 		+ MUL15(a[10], b[16])
679 		+ MUL15(a[11], b[15])
680 		+ MUL15(a[12], b[14])
681 		+ MUL15(a[13], b[13])
682 		+ MUL15(a[14], b[12])
683 		+ MUL15(a[15], b[11])
684 		+ MUL15(a[16], b[10])
685 		+ MUL15(a[17], b[ 9])
686 		+ MUL15(a[18], b[ 8])
687 		+ MUL15(a[19], b[ 7]);
688 	t[27] = MUL15(a[ 8], b[19])
689 		+ MUL15(a[ 9], b[18])
690 		+ MUL15(a[10], b[17])
691 		+ MUL15(a[11], b[16])
692 		+ MUL15(a[12], b[15])
693 		+ MUL15(a[13], b[14])
694 		+ MUL15(a[14], b[13])
695 		+ MUL15(a[15], b[12])
696 		+ MUL15(a[16], b[11])
697 		+ MUL15(a[17], b[10])
698 		+ MUL15(a[18], b[ 9])
699 		+ MUL15(a[19], b[ 8]);
700 	t[28] = MUL15(a[ 9], b[19])
701 		+ MUL15(a[10], b[18])
702 		+ MUL15(a[11], b[17])
703 		+ MUL15(a[12], b[16])
704 		+ MUL15(a[13], b[15])
705 		+ MUL15(a[14], b[14])
706 		+ MUL15(a[15], b[13])
707 		+ MUL15(a[16], b[12])
708 		+ MUL15(a[17], b[11])
709 		+ MUL15(a[18], b[10])
710 		+ MUL15(a[19], b[ 9]);
711 	t[29] = MUL15(a[10], b[19])
712 		+ MUL15(a[11], b[18])
713 		+ MUL15(a[12], b[17])
714 		+ MUL15(a[13], b[16])
715 		+ MUL15(a[14], b[15])
716 		+ MUL15(a[15], b[14])
717 		+ MUL15(a[16], b[13])
718 		+ MUL15(a[17], b[12])
719 		+ MUL15(a[18], b[11])
720 		+ MUL15(a[19], b[10]);
721 	t[30] = MUL15(a[11], b[19])
722 		+ MUL15(a[12], b[18])
723 		+ MUL15(a[13], b[17])
724 		+ MUL15(a[14], b[16])
725 		+ MUL15(a[15], b[15])
726 		+ MUL15(a[16], b[14])
727 		+ MUL15(a[17], b[13])
728 		+ MUL15(a[18], b[12])
729 		+ MUL15(a[19], b[11]);
730 	t[31] = MUL15(a[12], b[19])
731 		+ MUL15(a[13], b[18])
732 		+ MUL15(a[14], b[17])
733 		+ MUL15(a[15], b[16])
734 		+ MUL15(a[16], b[15])
735 		+ MUL15(a[17], b[14])
736 		+ MUL15(a[18], b[13])
737 		+ MUL15(a[19], b[12]);
738 	t[32] = MUL15(a[13], b[19])
739 		+ MUL15(a[14], b[18])
740 		+ MUL15(a[15], b[17])
741 		+ MUL15(a[16], b[16])
742 		+ MUL15(a[17], b[15])
743 		+ MUL15(a[18], b[14])
744 		+ MUL15(a[19], b[13]);
745 	t[33] = MUL15(a[14], b[19])
746 		+ MUL15(a[15], b[18])
747 		+ MUL15(a[16], b[17])
748 		+ MUL15(a[17], b[16])
749 		+ MUL15(a[18], b[15])
750 		+ MUL15(a[19], b[14]);
751 	t[34] = MUL15(a[15], b[19])
752 		+ MUL15(a[16], b[18])
753 		+ MUL15(a[17], b[17])
754 		+ MUL15(a[18], b[16])
755 		+ MUL15(a[19], b[15]);
756 	t[35] = MUL15(a[16], b[19])
757 		+ MUL15(a[17], b[18])
758 		+ MUL15(a[18], b[17])
759 		+ MUL15(a[19], b[16]);
760 	t[36] = MUL15(a[17], b[19])
761 		+ MUL15(a[18], b[18])
762 		+ MUL15(a[19], b[17]);
763 	t[37] = MUL15(a[18], b[19])
764 		+ MUL15(a[19], b[18]);
765 	t[38] = MUL15(a[19], b[19]);
766 	d[39] = norm13(d, t, 39);
767 }
768 
769 static void
770 square20(uint32_t *d, const uint32_t *a)
771 {
772 	uint32_t t[39];
773 
774 	t[ 0] = MUL15(a[ 0], a[ 0]);
775 	t[ 1] = ((MUL15(a[ 0], a[ 1])) << 1);
776 	t[ 2] = MUL15(a[ 1], a[ 1])
777 		+ ((MUL15(a[ 0], a[ 2])) << 1);
778 	t[ 3] = ((MUL15(a[ 0], a[ 3])
779 		+ MUL15(a[ 1], a[ 2])) << 1);
780 	t[ 4] = MUL15(a[ 2], a[ 2])
781 		+ ((MUL15(a[ 0], a[ 4])
782 		+ MUL15(a[ 1], a[ 3])) << 1);
783 	t[ 5] = ((MUL15(a[ 0], a[ 5])
784 		+ MUL15(a[ 1], a[ 4])
785 		+ MUL15(a[ 2], a[ 3])) << 1);
786 	t[ 6] = MUL15(a[ 3], a[ 3])
787 		+ ((MUL15(a[ 0], a[ 6])
788 		+ MUL15(a[ 1], a[ 5])
789 		+ MUL15(a[ 2], a[ 4])) << 1);
790 	t[ 7] = ((MUL15(a[ 0], a[ 7])
791 		+ MUL15(a[ 1], a[ 6])
792 		+ MUL15(a[ 2], a[ 5])
793 		+ MUL15(a[ 3], a[ 4])) << 1);
794 	t[ 8] = MUL15(a[ 4], a[ 4])
795 		+ ((MUL15(a[ 0], a[ 8])
796 		+ MUL15(a[ 1], a[ 7])
797 		+ MUL15(a[ 2], a[ 6])
798 		+ MUL15(a[ 3], a[ 5])) << 1);
799 	t[ 9] = ((MUL15(a[ 0], a[ 9])
800 		+ MUL15(a[ 1], a[ 8])
801 		+ MUL15(a[ 2], a[ 7])
802 		+ MUL15(a[ 3], a[ 6])
803 		+ MUL15(a[ 4], a[ 5])) << 1);
804 	t[10] = MUL15(a[ 5], a[ 5])
805 		+ ((MUL15(a[ 0], a[10])
806 		+ MUL15(a[ 1], a[ 9])
807 		+ MUL15(a[ 2], a[ 8])
808 		+ MUL15(a[ 3], a[ 7])
809 		+ MUL15(a[ 4], a[ 6])) << 1);
810 	t[11] = ((MUL15(a[ 0], a[11])
811 		+ MUL15(a[ 1], a[10])
812 		+ MUL15(a[ 2], a[ 9])
813 		+ MUL15(a[ 3], a[ 8])
814 		+ MUL15(a[ 4], a[ 7])
815 		+ MUL15(a[ 5], a[ 6])) << 1);
816 	t[12] = MUL15(a[ 6], a[ 6])
817 		+ ((MUL15(a[ 0], a[12])
818 		+ MUL15(a[ 1], a[11])
819 		+ MUL15(a[ 2], a[10])
820 		+ MUL15(a[ 3], a[ 9])
821 		+ MUL15(a[ 4], a[ 8])
822 		+ MUL15(a[ 5], a[ 7])) << 1);
823 	t[13] = ((MUL15(a[ 0], a[13])
824 		+ MUL15(a[ 1], a[12])
825 		+ MUL15(a[ 2], a[11])
826 		+ MUL15(a[ 3], a[10])
827 		+ MUL15(a[ 4], a[ 9])
828 		+ MUL15(a[ 5], a[ 8])
829 		+ MUL15(a[ 6], a[ 7])) << 1);
830 	t[14] = MUL15(a[ 7], a[ 7])
831 		+ ((MUL15(a[ 0], a[14])
832 		+ MUL15(a[ 1], a[13])
833 		+ MUL15(a[ 2], a[12])
834 		+ MUL15(a[ 3], a[11])
835 		+ MUL15(a[ 4], a[10])
836 		+ MUL15(a[ 5], a[ 9])
837 		+ MUL15(a[ 6], a[ 8])) << 1);
838 	t[15] = ((MUL15(a[ 0], a[15])
839 		+ MUL15(a[ 1], a[14])
840 		+ MUL15(a[ 2], a[13])
841 		+ MUL15(a[ 3], a[12])
842 		+ MUL15(a[ 4], a[11])
843 		+ MUL15(a[ 5], a[10])
844 		+ MUL15(a[ 6], a[ 9])
845 		+ MUL15(a[ 7], a[ 8])) << 1);
846 	t[16] = MUL15(a[ 8], a[ 8])
847 		+ ((MUL15(a[ 0], a[16])
848 		+ MUL15(a[ 1], a[15])
849 		+ MUL15(a[ 2], a[14])
850 		+ MUL15(a[ 3], a[13])
851 		+ MUL15(a[ 4], a[12])
852 		+ MUL15(a[ 5], a[11])
853 		+ MUL15(a[ 6], a[10])
854 		+ MUL15(a[ 7], a[ 9])) << 1);
855 	t[17] = ((MUL15(a[ 0], a[17])
856 		+ MUL15(a[ 1], a[16])
857 		+ MUL15(a[ 2], a[15])
858 		+ MUL15(a[ 3], a[14])
859 		+ MUL15(a[ 4], a[13])
860 		+ MUL15(a[ 5], a[12])
861 		+ MUL15(a[ 6], a[11])
862 		+ MUL15(a[ 7], a[10])
863 		+ MUL15(a[ 8], a[ 9])) << 1);
864 	t[18] = MUL15(a[ 9], a[ 9])
865 		+ ((MUL15(a[ 0], a[18])
866 		+ MUL15(a[ 1], a[17])
867 		+ MUL15(a[ 2], a[16])
868 		+ MUL15(a[ 3], a[15])
869 		+ MUL15(a[ 4], a[14])
870 		+ MUL15(a[ 5], a[13])
871 		+ MUL15(a[ 6], a[12])
872 		+ MUL15(a[ 7], a[11])
873 		+ MUL15(a[ 8], a[10])) << 1);
874 	t[19] = ((MUL15(a[ 0], a[19])
875 		+ MUL15(a[ 1], a[18])
876 		+ MUL15(a[ 2], a[17])
877 		+ MUL15(a[ 3], a[16])
878 		+ MUL15(a[ 4], a[15])
879 		+ MUL15(a[ 5], a[14])
880 		+ MUL15(a[ 6], a[13])
881 		+ MUL15(a[ 7], a[12])
882 		+ MUL15(a[ 8], a[11])
883 		+ MUL15(a[ 9], a[10])) << 1);
884 	t[20] = MUL15(a[10], a[10])
885 		+ ((MUL15(a[ 1], a[19])
886 		+ MUL15(a[ 2], a[18])
887 		+ MUL15(a[ 3], a[17])
888 		+ MUL15(a[ 4], a[16])
889 		+ MUL15(a[ 5], a[15])
890 		+ MUL15(a[ 6], a[14])
891 		+ MUL15(a[ 7], a[13])
892 		+ MUL15(a[ 8], a[12])
893 		+ MUL15(a[ 9], a[11])) << 1);
894 	t[21] = ((MUL15(a[ 2], a[19])
895 		+ MUL15(a[ 3], a[18])
896 		+ MUL15(a[ 4], a[17])
897 		+ MUL15(a[ 5], a[16])
898 		+ MUL15(a[ 6], a[15])
899 		+ MUL15(a[ 7], a[14])
900 		+ MUL15(a[ 8], a[13])
901 		+ MUL15(a[ 9], a[12])
902 		+ MUL15(a[10], a[11])) << 1);
903 	t[22] = MUL15(a[11], a[11])
904 		+ ((MUL15(a[ 3], a[19])
905 		+ MUL15(a[ 4], a[18])
906 		+ MUL15(a[ 5], a[17])
907 		+ MUL15(a[ 6], a[16])
908 		+ MUL15(a[ 7], a[15])
909 		+ MUL15(a[ 8], a[14])
910 		+ MUL15(a[ 9], a[13])
911 		+ MUL15(a[10], a[12])) << 1);
912 	t[23] = ((MUL15(a[ 4], a[19])
913 		+ MUL15(a[ 5], a[18])
914 		+ MUL15(a[ 6], a[17])
915 		+ MUL15(a[ 7], a[16])
916 		+ MUL15(a[ 8], a[15])
917 		+ MUL15(a[ 9], a[14])
918 		+ MUL15(a[10], a[13])
919 		+ MUL15(a[11], a[12])) << 1);
920 	t[24] = MUL15(a[12], a[12])
921 		+ ((MUL15(a[ 5], a[19])
922 		+ MUL15(a[ 6], a[18])
923 		+ MUL15(a[ 7], a[17])
924 		+ MUL15(a[ 8], a[16])
925 		+ MUL15(a[ 9], a[15])
926 		+ MUL15(a[10], a[14])
927 		+ MUL15(a[11], a[13])) << 1);
928 	t[25] = ((MUL15(a[ 6], a[19])
929 		+ MUL15(a[ 7], a[18])
930 		+ MUL15(a[ 8], a[17])
931 		+ MUL15(a[ 9], a[16])
932 		+ MUL15(a[10], a[15])
933 		+ MUL15(a[11], a[14])
934 		+ MUL15(a[12], a[13])) << 1);
935 	t[26] = MUL15(a[13], a[13])
936 		+ ((MUL15(a[ 7], a[19])
937 		+ MUL15(a[ 8], a[18])
938 		+ MUL15(a[ 9], a[17])
939 		+ MUL15(a[10], a[16])
940 		+ MUL15(a[11], a[15])
941 		+ MUL15(a[12], a[14])) << 1);
942 	t[27] = ((MUL15(a[ 8], a[19])
943 		+ MUL15(a[ 9], a[18])
944 		+ MUL15(a[10], a[17])
945 		+ MUL15(a[11], a[16])
946 		+ MUL15(a[12], a[15])
947 		+ MUL15(a[13], a[14])) << 1);
948 	t[28] = MUL15(a[14], a[14])
949 		+ ((MUL15(a[ 9], a[19])
950 		+ MUL15(a[10], a[18])
951 		+ MUL15(a[11], a[17])
952 		+ MUL15(a[12], a[16])
953 		+ MUL15(a[13], a[15])) << 1);
954 	t[29] = ((MUL15(a[10], a[19])
955 		+ MUL15(a[11], a[18])
956 		+ MUL15(a[12], a[17])
957 		+ MUL15(a[13], a[16])
958 		+ MUL15(a[14], a[15])) << 1);
959 	t[30] = MUL15(a[15], a[15])
960 		+ ((MUL15(a[11], a[19])
961 		+ MUL15(a[12], a[18])
962 		+ MUL15(a[13], a[17])
963 		+ MUL15(a[14], a[16])) << 1);
964 	t[31] = ((MUL15(a[12], a[19])
965 		+ MUL15(a[13], a[18])
966 		+ MUL15(a[14], a[17])
967 		+ MUL15(a[15], a[16])) << 1);
968 	t[32] = MUL15(a[16], a[16])
969 		+ ((MUL15(a[13], a[19])
970 		+ MUL15(a[14], a[18])
971 		+ MUL15(a[15], a[17])) << 1);
972 	t[33] = ((MUL15(a[14], a[19])
973 		+ MUL15(a[15], a[18])
974 		+ MUL15(a[16], a[17])) << 1);
975 	t[34] = MUL15(a[17], a[17])
976 		+ ((MUL15(a[15], a[19])
977 		+ MUL15(a[16], a[18])) << 1);
978 	t[35] = ((MUL15(a[16], a[19])
979 		+ MUL15(a[17], a[18])) << 1);
980 	t[36] = MUL15(a[18], a[18])
981 		+ ((MUL15(a[17], a[19])) << 1);
982 	t[37] = ((MUL15(a[18], a[19])) << 1);
983 	t[38] = MUL15(a[19], a[19]);
984 	d[39] = norm13(d, t, 39);
985 }
986 
987 #endif
988 
989 /*
990  * Modulus for field F256 (field for point coordinates in curve P-256).
991  */
992 static const uint32_t F256[] = {
993 	0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x001F,
994 	0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0400, 0x0000,
995 	0x0000, 0x1FF8, 0x1FFF, 0x01FF
996 };
997 
998 /*
999  * The 'b' curve equation coefficient for P-256.
1000  */
1001 static const uint32_t P256_B[] = {
1002 	0x004B, 0x1E93, 0x0F89, 0x1C78, 0x03BC, 0x187B, 0x114E, 0x1619,
1003 	0x1D06, 0x0328, 0x01AF, 0x0D31, 0x1557, 0x15DE, 0x1ECF, 0x127C,
1004 	0x0A3A, 0x0EC5, 0x118D, 0x00B5
1005 };
1006 
1007 /*
1008  * Perform a "short reduction" in field F256 (field for curve P-256).
1009  * The source value should be less than 262 bits; on output, it will
1010  * be at most 257 bits, and less than twice the modulus.
1011  */
1012 static void
1013 reduce_f256(uint32_t *d)
1014 {
1015 	uint32_t x;
1016 
1017 	x = d[19] >> 9;
1018 	d[19] &= 0x01FF;
1019 	d[17] += x << 3;
1020 	d[14] -= x << 10;
1021 	d[7] -= x << 5;
1022 	d[0] += x;
1023 	norm13(d, d, 20);
1024 }
1025 
1026 /*
1027  * Perform a "final reduction" in field F256 (field for curve P-256).
1028  * The source value must be less than twice the modulus. If the value
1029  * is not lower than the modulus, then the modulus is subtracted and
1030  * this function returns 1; otherwise, it leaves it untouched and it
1031  * returns 0.
1032  */
1033 static uint32_t
1034 reduce_final_f256(uint32_t *d)
1035 {
1036 	uint32_t t[20];
1037 	uint32_t cc;
1038 	int i;
1039 
1040 	memcpy(t, d, sizeof t);
1041 	cc = 0;
1042 	for (i = 0; i < 20; i ++) {
1043 		uint32_t w;
1044 
1045 		w = t[i] - F256[i] - cc;
1046 		cc = w >> 31;
1047 		t[i] = w & 0x1FFF;
1048 	}
1049 	cc ^= 1;
1050 	CCOPY(cc, d, t, sizeof t);
1051 	return cc;
1052 }
1053 
1054 /*
1055  * Perform a multiplication of two integers modulo
1056  * 2^256-2^224+2^192+2^96-1 (for NIST curve P-256). Operands are arrays
1057  * of 20 words, each containing 13 bits of data, in little-endian order.
1058  * On input, upper word may be up to 13 bits (hence value up to 2^260-1);
1059  * on output, value fits on 257 bits and is lower than twice the modulus.
1060  */
1061 static void
1062 mul_f256(uint32_t *d, const uint32_t *a, const uint32_t *b)
1063 {
1064 	uint32_t t[40], cc;
1065 	int i;
1066 
1067 	/*
1068 	 * Compute raw multiplication. All result words fit in 13 bits
1069 	 * each.
1070 	 */
1071 	mul20(t, a, b);
1072 
1073 	/*
1074 	 * Modular reduction: each high word in added/subtracted where
1075 	 * necessary.
1076 	 *
1077 	 * The modulus is:
1078 	 *    p = 2^256 - 2^224 + 2^192 + 2^96 - 1
1079 	 * Therefore:
1080 	 *    2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
1081 	 *
1082 	 * For a word x at bit offset n (n >= 256), we have:
1083 	 *    x*2^n = x*2^(n-32) - x*2^(n-64)
1084 	 *            - x*2^(n - 160) + x*2^(n-256) mod p
1085 	 *
1086 	 * Thus, we can nullify the high word if we reinject it at some
1087 	 * proper emplacements.
1088 	 */
1089 	for (i = 39; i >= 20; i --) {
1090 		uint32_t x;
1091 
1092 		x = t[i];
1093 		t[i - 2] += ARSH(x, 6);
1094 		t[i - 3] += (x << 7) & 0x1FFF;
1095 		t[i - 4] -= ARSH(x, 12);
1096 		t[i - 5] -= (x << 1) & 0x1FFF;
1097 		t[i - 12] -= ARSH(x, 4);
1098 		t[i - 13] -= (x << 9) & 0x1FFF;
1099 		t[i - 19] += ARSH(x, 9);
1100 		t[i - 20] += (x << 4) & 0x1FFF;
1101 	}
1102 
1103 	/*
1104 	 * Propagate carries. This is a signed propagation, and the
1105 	 * result may be negative. The loop above may enlarge values,
1106 	 * but not two much: worst case is the chain involving t[i - 3],
1107 	 * in which a value may be added to itself up to 7 times. Since
1108 	 * starting values are 13-bit each, all words fit on 20 bits
1109 	 * (21 to account for the sign bit).
1110 	 */
1111 	cc = norm13(t, t, 20);
1112 
1113 	/*
1114 	 * Perform modular reduction again for the bits beyond 256 (the carry
1115 	 * and the bits 256..259). Since the largest shift below is by 10
1116 	 * bits, and the values fit on 21 bits, values fit in 32-bit words,
1117 	 * thereby allowing injecting full word values.
1118 	 */
1119 	cc = (cc << 4) | (t[19] >> 9);
1120 	t[19] &= 0x01FF;
1121 	t[17] += cc << 3;
1122 	t[14] -= cc << 10;
1123 	t[7] -= cc << 5;
1124 	t[0] += cc;
1125 
1126 	/*
1127 	 * If the carry is negative, then after carry propagation, we may
1128 	 * end up with a value which is negative, and we don't want that.
1129 	 * Thus, in that case, we add the modulus. Note that the subtraction
1130 	 * result, when the carry is negative, is always smaller than the
1131 	 * modulus, so the extra addition will not make the value exceed
1132 	 * twice the modulus.
1133 	 */
1134 	cc >>= 31;
1135 	t[0] -= cc;
1136 	t[7] += cc << 5;
1137 	t[14] += cc << 10;
1138 	t[17] -= cc << 3;
1139 	t[19] += cc << 9;
1140 
1141 	norm13(d, t, 20);
1142 }
1143 
1144 /*
1145  * Square an integer modulo 2^256-2^224+2^192+2^96-1 (for NIST curve
1146  * P-256). Operand is an array of 20 words, each containing 13 bits of
1147  * data, in little-endian order. On input, upper word may be up to 13
1148  * bits (hence value up to 2^260-1); on output, value fits on 257 bits
1149  * and is lower than twice the modulus.
1150  */
1151 static void
1152 square_f256(uint32_t *d, const uint32_t *a)
1153 {
1154 	uint32_t t[40], cc;
1155 	int i;
1156 
1157 	/*
1158 	 * Compute raw square. All result words fit in 13 bits each.
1159 	 */
1160 	square20(t, a);
1161 
1162 	/*
1163 	 * Modular reduction: each high word in added/subtracted where
1164 	 * necessary.
1165 	 *
1166 	 * The modulus is:
1167 	 *    p = 2^256 - 2^224 + 2^192 + 2^96 - 1
1168 	 * Therefore:
1169 	 *    2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
1170 	 *
1171 	 * For a word x at bit offset n (n >= 256), we have:
1172 	 *    x*2^n = x*2^(n-32) - x*2^(n-64)
1173 	 *            - x*2^(n - 160) + x*2^(n-256) mod p
1174 	 *
1175 	 * Thus, we can nullify the high word if we reinject it at some
1176 	 * proper emplacements.
1177 	 */
1178 	for (i = 39; i >= 20; i --) {
1179 		uint32_t x;
1180 
1181 		x = t[i];
1182 		t[i - 2] += ARSH(x, 6);
1183 		t[i - 3] += (x << 7) & 0x1FFF;
1184 		t[i - 4] -= ARSH(x, 12);
1185 		t[i - 5] -= (x << 1) & 0x1FFF;
1186 		t[i - 12] -= ARSH(x, 4);
1187 		t[i - 13] -= (x << 9) & 0x1FFF;
1188 		t[i - 19] += ARSH(x, 9);
1189 		t[i - 20] += (x << 4) & 0x1FFF;
1190 	}
1191 
1192 	/*
1193 	 * Propagate carries. This is a signed propagation, and the
1194 	 * result may be negative. The loop above may enlarge values,
1195 	 * but not two much: worst case is the chain involving t[i - 3],
1196 	 * in which a value may be added to itself up to 7 times. Since
1197 	 * starting values are 13-bit each, all words fit on 20 bits
1198 	 * (21 to account for the sign bit).
1199 	 */
1200 	cc = norm13(t, t, 20);
1201 
1202 	/*
1203 	 * Perform modular reduction again for the bits beyond 256 (the carry
1204 	 * and the bits 256..259). Since the largest shift below is by 10
1205 	 * bits, and the values fit on 21 bits, values fit in 32-bit words,
1206 	 * thereby allowing injecting full word values.
1207 	 */
1208 	cc = (cc << 4) | (t[19] >> 9);
1209 	t[19] &= 0x01FF;
1210 	t[17] += cc << 3;
1211 	t[14] -= cc << 10;
1212 	t[7] -= cc << 5;
1213 	t[0] += cc;
1214 
1215 	/*
1216 	 * If the carry is negative, then after carry propagation, we may
1217 	 * end up with a value which is negative, and we don't want that.
1218 	 * Thus, in that case, we add the modulus. Note that the subtraction
1219 	 * result, when the carry is negative, is always smaller than the
1220 	 * modulus, so the extra addition will not make the value exceed
1221 	 * twice the modulus.
1222 	 */
1223 	cc >>= 31;
1224 	t[0] -= cc;
1225 	t[7] += cc << 5;
1226 	t[14] += cc << 10;
1227 	t[17] -= cc << 3;
1228 	t[19] += cc << 9;
1229 
1230 	norm13(d, t, 20);
1231 }
1232 
1233 /*
1234  * Jacobian coordinates for a point in P-256: affine coordinates (X,Y)
1235  * are such that:
1236  *   X = x / z^2
1237  *   Y = y / z^3
1238  * For the point at infinity, z = 0.
1239  * Each point thus admits many possible representations.
1240  *
1241  * Coordinates are represented in arrays of 32-bit integers, each holding
1242  * 13 bits of data. Values may also be slightly greater than the modulus,
1243  * but they will always be lower than twice the modulus.
1244  */
1245 typedef struct {
1246 	uint32_t x[20];
1247 	uint32_t y[20];
1248 	uint32_t z[20];
1249 } p256_jacobian;
1250 
1251 /*
1252  * Convert a point to affine coordinates:
1253  *  - If the point is the point at infinity, then all three coordinates
1254  *    are set to 0.
1255  *  - Otherwise, the 'z' coordinate is set to 1, and the 'x' and 'y'
1256  *    coordinates are the 'X' and 'Y' affine coordinates.
1257  * The coordinates are guaranteed to be lower than the modulus.
1258  */
1259 static void
1260 p256_to_affine(p256_jacobian *P)
1261 {
1262 	uint32_t t1[20], t2[20];
1263 	int i;
1264 
1265 	/*
1266 	 * Invert z with a modular exponentiation: the modulus is
1267 	 * p = 2^256 - 2^224 + 2^192 + 2^96 - 1, and the exponent is
1268 	 * p-2. Exponent bit pattern (from high to low) is:
1269 	 *  - 32 bits of value 1
1270 	 *  - 31 bits of value 0
1271 	 *  - 1 bit of value 1
1272 	 *  - 96 bits of value 0
1273 	 *  - 94 bits of value 1
1274 	 *  - 1 bit of value 0
1275 	 *  - 1 bit of value 1
1276 	 * Thus, we precompute z^(2^31-1) to speed things up.
1277 	 *
1278 	 * If z = 0 (point at infinity) then the modular exponentiation
1279 	 * will yield 0, which leads to the expected result (all three
1280 	 * coordinates set to 0).
1281 	 */
1282 
1283 	/*
1284 	 * A simple square-and-multiply for z^(2^31-1). We could save about
1285 	 * two dozen multiplications here with an addition chain, but
1286 	 * this would require a bit more code, and extra stack buffers.
1287 	 */
1288 	memcpy(t1, P->z, sizeof P->z);
1289 	for (i = 0; i < 30; i ++) {
1290 		square_f256(t1, t1);
1291 		mul_f256(t1, t1, P->z);
1292 	}
1293 
1294 	/*
1295 	 * Square-and-multiply. Apart from the squarings, we have a few
1296 	 * multiplications to set bits to 1; we multiply by the original z
1297 	 * for setting 1 bit, and by t1 for setting 31 bits.
1298 	 */
1299 	memcpy(t2, P->z, sizeof P->z);
1300 	for (i = 1; i < 256; i ++) {
1301 		square_f256(t2, t2);
1302 		switch (i) {
1303 		case 31:
1304 		case 190:
1305 		case 221:
1306 		case 252:
1307 			mul_f256(t2, t2, t1);
1308 			break;
1309 		case 63:
1310 		case 253:
1311 		case 255:
1312 			mul_f256(t2, t2, P->z);
1313 			break;
1314 		}
1315 	}
1316 
1317 	/*
1318 	 * Now that we have 1/z, multiply x by 1/z^2 and y by 1/z^3.
1319 	 */
1320 	mul_f256(t1, t2, t2);
1321 	mul_f256(P->x, t1, P->x);
1322 	mul_f256(t1, t1, t2);
1323 	mul_f256(P->y, t1, P->y);
1324 	reduce_final_f256(P->x);
1325 	reduce_final_f256(P->y);
1326 
1327 	/*
1328 	 * Multiply z by 1/z. If z = 0, then this will yield 0, otherwise
1329 	 * this will set z to 1.
1330 	 */
1331 	mul_f256(P->z, P->z, t2);
1332 	reduce_final_f256(P->z);
1333 }
1334 
1335 /*
1336  * Double a point in P-256. This function works for all valid points,
1337  * including the point at infinity.
1338  */
1339 static void
1340 p256_double(p256_jacobian *Q)
1341 {
1342 	/*
1343 	 * Doubling formulas are:
1344 	 *
1345 	 *   s = 4*x*y^2
1346 	 *   m = 3*(x + z^2)*(x - z^2)
1347 	 *   x' = m^2 - 2*s
1348 	 *   y' = m*(s - x') - 8*y^4
1349 	 *   z' = 2*y*z
1350 	 *
1351 	 * These formulas work for all points, including points of order 2
1352 	 * and points at infinity:
1353 	 *   - If y = 0 then z' = 0. But there is no such point in P-256
1354 	 *     anyway.
1355 	 *   - If z = 0 then z' = 0.
1356 	 */
1357 	uint32_t t1[20], t2[20], t3[20], t4[20];
1358 	int i;
1359 
1360 	/*
1361 	 * Compute z^2 in t1.
1362 	 */
1363 	square_f256(t1, Q->z);
1364 
1365 	/*
1366 	 * Compute x-z^2 in t2 and x+z^2 in t1.
1367 	 */
1368 	for (i = 0; i < 20; i ++) {
1369 		t2[i] = (F256[i] << 1) + Q->x[i] - t1[i];
1370 		t1[i] += Q->x[i];
1371 	}
1372 	norm13(t1, t1, 20);
1373 	norm13(t2, t2, 20);
1374 
1375 	/*
1376 	 * Compute 3*(x+z^2)*(x-z^2) in t1.
1377 	 */
1378 	mul_f256(t3, t1, t2);
1379 	for (i = 0; i < 20; i ++) {
1380 		t1[i] = MUL15(3, t3[i]);
1381 	}
1382 	norm13(t1, t1, 20);
1383 
1384 	/*
1385 	 * Compute 4*x*y^2 (in t2) and 2*y^2 (in t3).
1386 	 */
1387 	square_f256(t3, Q->y);
1388 	for (i = 0; i < 20; i ++) {
1389 		t3[i] <<= 1;
1390 	}
1391 	norm13(t3, t3, 20);
1392 	mul_f256(t2, Q->x, t3);
1393 	for (i = 0; i < 20; i ++) {
1394 		t2[i] <<= 1;
1395 	}
1396 	norm13(t2, t2, 20);
1397 	reduce_f256(t2);
1398 
1399 	/*
1400 	 * Compute x' = m^2 - 2*s.
1401 	 */
1402 	square_f256(Q->x, t1);
1403 	for (i = 0; i < 20; i ++) {
1404 		Q->x[i] += (F256[i] << 2) - (t2[i] << 1);
1405 	}
1406 	norm13(Q->x, Q->x, 20);
1407 	reduce_f256(Q->x);
1408 
1409 	/*
1410 	 * Compute z' = 2*y*z.
1411 	 */
1412 	mul_f256(t4, Q->y, Q->z);
1413 	for (i = 0; i < 20; i ++) {
1414 		Q->z[i] = t4[i] << 1;
1415 	}
1416 	norm13(Q->z, Q->z, 20);
1417 	reduce_f256(Q->z);
1418 
1419 	/*
1420 	 * Compute y' = m*(s - x') - 8*y^4. Note that we already have
1421 	 * 2*y^2 in t3.
1422 	 */
1423 	for (i = 0; i < 20; i ++) {
1424 		t2[i] += (F256[i] << 1) - Q->x[i];
1425 	}
1426 	norm13(t2, t2, 20);
1427 	mul_f256(Q->y, t1, t2);
1428 	square_f256(t4, t3);
1429 	for (i = 0; i < 20; i ++) {
1430 		Q->y[i] += (F256[i] << 2) - (t4[i] << 1);
1431 	}
1432 	norm13(Q->y, Q->y, 20);
1433 	reduce_f256(Q->y);
1434 }
1435 
1436 /*
1437  * Add point P2 to point P1.
1438  *
1439  * This function computes the wrong result in the following cases:
1440  *
1441  *   - If P1 == 0 but P2 != 0
1442  *   - If P1 != 0 but P2 == 0
1443  *   - If P1 == P2
1444  *
1445  * In all three cases, P1 is set to the point at infinity.
1446  *
1447  * Returned value is 0 if one of the following occurs:
1448  *
1449  *   - P1 and P2 have the same Y coordinate
1450  *   - P1 == 0 and P2 == 0
1451  *   - The Y coordinate of one of the points is 0 and the other point is
1452  *     the point at infinity.
1453  *
1454  * The third case cannot actually happen with valid points, since a point
1455  * with Y == 0 is a point of order 2, and there is no point of order 2 on
1456  * curve P-256.
1457  *
1458  * Therefore, assuming that P1 != 0 and P2 != 0 on input, then the caller
1459  * can apply the following:
1460  *
1461  *   - If the result is not the point at infinity, then it is correct.
1462  *   - Otherwise, if the returned value is 1, then this is a case of
1463  *     P1+P2 == 0, so the result is indeed the point at infinity.
1464  *   - Otherwise, P1 == P2, so a "double" operation should have been
1465  *     performed.
1466  */
1467 static uint32_t
1468 p256_add(p256_jacobian *P1, const p256_jacobian *P2)
1469 {
1470 	/*
1471 	 * Addtions formulas are:
1472 	 *
1473 	 *   u1 = x1 * z2^2
1474 	 *   u2 = x2 * z1^2
1475 	 *   s1 = y1 * z2^3
1476 	 *   s2 = y2 * z1^3
1477 	 *   h = u2 - u1
1478 	 *   r = s2 - s1
1479 	 *   x3 = r^2 - h^3 - 2 * u1 * h^2
1480 	 *   y3 = r * (u1 * h^2 - x3) - s1 * h^3
1481 	 *   z3 = h * z1 * z2
1482 	 */
1483 	uint32_t t1[20], t2[20], t3[20], t4[20], t5[20], t6[20], t7[20];
1484 	uint32_t ret;
1485 	int i;
1486 
1487 	/*
1488 	 * Compute u1 = x1*z2^2 (in t1) and s1 = y1*z2^3 (in t3).
1489 	 */
1490 	square_f256(t3, P2->z);
1491 	mul_f256(t1, P1->x, t3);
1492 	mul_f256(t4, P2->z, t3);
1493 	mul_f256(t3, P1->y, t4);
1494 
1495 	/*
1496 	 * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
1497 	 */
1498 	square_f256(t4, P1->z);
1499 	mul_f256(t2, P2->x, t4);
1500 	mul_f256(t5, P1->z, t4);
1501 	mul_f256(t4, P2->y, t5);
1502 
1503 	/*
1504 	 * Compute h = h2 - u1 (in t2) and r = s2 - s1 (in t4).
1505 	 * We need to test whether r is zero, so we will do some extra
1506 	 * reduce.
1507 	 */
1508 	for (i = 0; i < 20; i ++) {
1509 		t2[i] += (F256[i] << 1) - t1[i];
1510 		t4[i] += (F256[i] << 1) - t3[i];
1511 	}
1512 	norm13(t2, t2, 20);
1513 	norm13(t4, t4, 20);
1514 	reduce_f256(t4);
1515 	reduce_final_f256(t4);
1516 	ret = 0;
1517 	for (i = 0; i < 20; i ++) {
1518 		ret |= t4[i];
1519 	}
1520 	ret = (ret | -ret) >> 31;
1521 
1522 	/*
1523 	 * Compute u1*h^2 (in t6) and h^3 (in t5);
1524 	 */
1525 	square_f256(t7, t2);
1526 	mul_f256(t6, t1, t7);
1527 	mul_f256(t5, t7, t2);
1528 
1529 	/*
1530 	 * Compute x3 = r^2 - h^3 - 2*u1*h^2.
1531 	 */
1532 	square_f256(P1->x, t4);
1533 	for (i = 0; i < 20; i ++) {
1534 		P1->x[i] += (F256[i] << 3) - t5[i] - (t6[i] << 1);
1535 	}
1536 	norm13(P1->x, P1->x, 20);
1537 	reduce_f256(P1->x);
1538 
1539 	/*
1540 	 * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
1541 	 */
1542 	for (i = 0; i < 20; i ++) {
1543 		t6[i] += (F256[i] << 1) - P1->x[i];
1544 	}
1545 	norm13(t6, t6, 20);
1546 	mul_f256(P1->y, t4, t6);
1547 	mul_f256(t1, t5, t3);
1548 	for (i = 0; i < 20; i ++) {
1549 		P1->y[i] += (F256[i] << 1) - t1[i];
1550 	}
1551 	norm13(P1->y, P1->y, 20);
1552 	reduce_f256(P1->y);
1553 
1554 	/*
1555 	 * Compute z3 = h*z1*z2.
1556 	 */
1557 	mul_f256(t1, P1->z, P2->z);
1558 	mul_f256(P1->z, t1, t2);
1559 
1560 	return ret;
1561 }
1562 
1563 /*
1564  * Add point P2 to point P1. This is a specialised function for the
1565  * case when P2 is a non-zero point in affine coordinate.
1566  *
1567  * This function computes the wrong result in the following cases:
1568  *
1569  *   - If P1 == 0
1570  *   - If P1 == P2
1571  *
1572  * In both cases, P1 is set to the point at infinity.
1573  *
1574  * Returned value is 0 if one of the following occurs:
1575  *
1576  *   - P1 and P2 have the same Y coordinate
1577  *   - The Y coordinate of P2 is 0 and P1 is the point at infinity.
1578  *
1579  * The second case cannot actually happen with valid points, since a point
1580  * with Y == 0 is a point of order 2, and there is no point of order 2 on
1581  * curve P-256.
1582  *
1583  * Therefore, assuming that P1 != 0 on input, then the caller
1584  * can apply the following:
1585  *
1586  *   - If the result is not the point at infinity, then it is correct.
1587  *   - Otherwise, if the returned value is 1, then this is a case of
1588  *     P1+P2 == 0, so the result is indeed the point at infinity.
1589  *   - Otherwise, P1 == P2, so a "double" operation should have been
1590  *     performed.
1591  */
1592 static uint32_t
1593 p256_add_mixed(p256_jacobian *P1, const p256_jacobian *P2)
1594 {
1595 	/*
1596 	 * Addtions formulas are:
1597 	 *
1598 	 *   u1 = x1
1599 	 *   u2 = x2 * z1^2
1600 	 *   s1 = y1
1601 	 *   s2 = y2 * z1^3
1602 	 *   h = u2 - u1
1603 	 *   r = s2 - s1
1604 	 *   x3 = r^2 - h^3 - 2 * u1 * h^2
1605 	 *   y3 = r * (u1 * h^2 - x3) - s1 * h^3
1606 	 *   z3 = h * z1
1607 	 */
1608 	uint32_t t1[20], t2[20], t3[20], t4[20], t5[20], t6[20], t7[20];
1609 	uint32_t ret;
1610 	int i;
1611 
1612 	/*
1613 	 * Compute u1 = x1 (in t1) and s1 = y1 (in t3).
1614 	 */
1615 	memcpy(t1, P1->x, sizeof t1);
1616 	memcpy(t3, P1->y, sizeof t3);
1617 
1618 	/*
1619 	 * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
1620 	 */
1621 	square_f256(t4, P1->z);
1622 	mul_f256(t2, P2->x, t4);
1623 	mul_f256(t5, P1->z, t4);
1624 	mul_f256(t4, P2->y, t5);
1625 
1626 	/*
1627 	 * Compute h = h2 - u1 (in t2) and r = s2 - s1 (in t4).
1628 	 * We need to test whether r is zero, so we will do some extra
1629 	 * reduce.
1630 	 */
1631 	for (i = 0; i < 20; i ++) {
1632 		t2[i] += (F256[i] << 1) - t1[i];
1633 		t4[i] += (F256[i] << 1) - t3[i];
1634 	}
1635 	norm13(t2, t2, 20);
1636 	norm13(t4, t4, 20);
1637 	reduce_f256(t4);
1638 	reduce_final_f256(t4);
1639 	ret = 0;
1640 	for (i = 0; i < 20; i ++) {
1641 		ret |= t4[i];
1642 	}
1643 	ret = (ret | -ret) >> 31;
1644 
1645 	/*
1646 	 * Compute u1*h^2 (in t6) and h^3 (in t5);
1647 	 */
1648 	square_f256(t7, t2);
1649 	mul_f256(t6, t1, t7);
1650 	mul_f256(t5, t7, t2);
1651 
1652 	/*
1653 	 * Compute x3 = r^2 - h^3 - 2*u1*h^2.
1654 	 */
1655 	square_f256(P1->x, t4);
1656 	for (i = 0; i < 20; i ++) {
1657 		P1->x[i] += (F256[i] << 3) - t5[i] - (t6[i] << 1);
1658 	}
1659 	norm13(P1->x, P1->x, 20);
1660 	reduce_f256(P1->x);
1661 
1662 	/*
1663 	 * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
1664 	 */
1665 	for (i = 0; i < 20; i ++) {
1666 		t6[i] += (F256[i] << 1) - P1->x[i];
1667 	}
1668 	norm13(t6, t6, 20);
1669 	mul_f256(P1->y, t4, t6);
1670 	mul_f256(t1, t5, t3);
1671 	for (i = 0; i < 20; i ++) {
1672 		P1->y[i] += (F256[i] << 1) - t1[i];
1673 	}
1674 	norm13(P1->y, P1->y, 20);
1675 	reduce_f256(P1->y);
1676 
1677 	/*
1678 	 * Compute z3 = h*z1*z2.
1679 	 */
1680 	mul_f256(P1->z, P1->z, t2);
1681 
1682 	return ret;
1683 }
1684 
1685 /*
1686  * Decode a P-256 point. This function does not support the point at
1687  * infinity. Returned value is 0 if the point is invalid, 1 otherwise.
1688  */
1689 static uint32_t
1690 p256_decode(p256_jacobian *P, const void *src, size_t len)
1691 {
1692 	const unsigned char *buf;
1693 	uint32_t tx[20], ty[20], t1[20], t2[20];
1694 	uint32_t bad;
1695 	int i;
1696 
1697 	if (len != 65) {
1698 		return 0;
1699 	}
1700 	buf = src;
1701 
1702 	/*
1703 	 * First byte must be 0x04 (uncompressed format). We could support
1704 	 * "hybrid format" (first byte is 0x06 or 0x07, and encodes the
1705 	 * least significant bit of the Y coordinate), but it is explicitly
1706 	 * forbidden by RFC 5480 (section 2.2).
1707 	 */
1708 	bad = NEQ(buf[0], 0x04);
1709 
1710 	/*
1711 	 * Decode the coordinates, and check that they are both lower
1712 	 * than the modulus.
1713 	 */
1714 	tx[19] = be8_to_le13(tx, buf + 1, 32);
1715 	ty[19] = be8_to_le13(ty, buf + 33, 32);
1716 	bad |= reduce_final_f256(tx);
1717 	bad |= reduce_final_f256(ty);
1718 
1719 	/*
1720 	 * Check curve equation.
1721 	 */
1722 	square_f256(t1, tx);
1723 	mul_f256(t1, tx, t1);
1724 	square_f256(t2, ty);
1725 	for (i = 0; i < 20; i ++) {
1726 		t1[i] += (F256[i] << 3) - MUL15(3, tx[i]) + P256_B[i] - t2[i];
1727 	}
1728 	norm13(t1, t1, 20);
1729 	reduce_f256(t1);
1730 	reduce_final_f256(t1);
1731 	for (i = 0; i < 20; i ++) {
1732 		bad |= t1[i];
1733 	}
1734 
1735 	/*
1736 	 * Copy coordinates to the point structure.
1737 	 */
1738 	memcpy(P->x, tx, sizeof tx);
1739 	memcpy(P->y, ty, sizeof ty);
1740 	memset(P->z, 0, sizeof P->z);
1741 	P->z[0] = 1;
1742 	return EQ(bad, 0);
1743 }
1744 
1745 /*
1746  * Encode a point into a buffer. This function assumes that the point is
1747  * valid, in affine coordinates, and not the point at infinity.
1748  */
1749 static void
1750 p256_encode(void *dst, const p256_jacobian *P)
1751 {
1752 	unsigned char *buf;
1753 
1754 	buf = dst;
1755 	buf[0] = 0x04;
1756 	le13_to_be8(buf + 1, 32, P->x);
1757 	le13_to_be8(buf + 33, 32, P->y);
1758 }
1759 
1760 /*
1761  * Multiply a curve point by an integer. The integer is assumed to be
1762  * lower than the curve order, and the base point must not be the point
1763  * at infinity.
1764  */
1765 static void
1766 p256_mul(p256_jacobian *P, const unsigned char *x, size_t xlen)
1767 {
1768 	/*
1769 	 * qz is a flag that is initially 1, and remains equal to 1
1770 	 * as long as the point is the point at infinity.
1771 	 *
1772 	 * We use a 2-bit window to handle multiplier bits by pairs.
1773 	 * The precomputed window really is the points P2 and P3.
1774 	 */
1775 	uint32_t qz;
1776 	p256_jacobian P2, P3, Q, T, U;
1777 
1778 	/*
1779 	 * Compute window values.
1780 	 */
1781 	P2 = *P;
1782 	p256_double(&P2);
1783 	P3 = *P;
1784 	p256_add(&P3, &P2);
1785 
1786 	/*
1787 	 * We start with Q = 0. We process multiplier bits 2 by 2.
1788 	 */
1789 	memset(&Q, 0, sizeof Q);
1790 	qz = 1;
1791 	while (xlen -- > 0) {
1792 		int k;
1793 
1794 		for (k = 6; k >= 0; k -= 2) {
1795 			uint32_t bits;
1796 			uint32_t bnz;
1797 
1798 			p256_double(&Q);
1799 			p256_double(&Q);
1800 			T = *P;
1801 			U = Q;
1802 			bits = (*x >> k) & (uint32_t)3;
1803 			bnz = NEQ(bits, 0);
1804 			CCOPY(EQ(bits, 2), &T, &P2, sizeof T);
1805 			CCOPY(EQ(bits, 3), &T, &P3, sizeof T);
1806 			p256_add(&U, &T);
1807 			CCOPY(bnz & qz, &Q, &T, sizeof Q);
1808 			CCOPY(bnz & ~qz, &Q, &U, sizeof Q);
1809 			qz &= ~bnz;
1810 		}
1811 		x ++;
1812 	}
1813 	*P = Q;
1814 }
1815 
1816 /*
1817  * Precomputed window: k*G points, where G is the curve generator, and k
1818  * is an integer from 1 to 15 (inclusive). The X and Y coordinates of
1819  * the point are encoded as 20 words of 13 bits each (little-endian
1820  * order); 13-bit words are then grouped 2-by-2 into 32-bit words
1821  * (little-endian order within each word).
1822  */
1823 static const uint32_t Gwin[15][20] = {
1824 
1825 	{ 0x04C60296, 0x02721176, 0x19D00F4A, 0x102517AC,
1826 	  0x13B8037D, 0x0748103C, 0x1E730E56, 0x08481FE2,
1827 	  0x0F97012C, 0x00D605F4, 0x1DFA11F5, 0x0C801A0D,
1828 	  0x0F670CBB, 0x0AED0CC5, 0x115E0E33, 0x181F0785,
1829 	  0x13F514A7, 0x0FF30E3B, 0x17171E1A, 0x009F18D0 },
1830 
1831 	{ 0x1B341978, 0x16911F11, 0x0D9A1A60, 0x1C4E1FC8,
1832 	  0x1E040969, 0x096A06B0, 0x091C0030, 0x09EF1A29,
1833 	  0x18C40D03, 0x00F91C9E, 0x13C313D1, 0x096F0748,
1834 	  0x011419E0, 0x1CC713A6, 0x1DD31DAD, 0x1EE80C36,
1835 	  0x1ECD0C69, 0x1A0800A4, 0x08861B8E, 0x000E1DD5 },
1836 
1837 	{ 0x173F1D6C, 0x02CC06F1, 0x14C21FB4, 0x043D1EB6,
1838 	  0x0F3606B7, 0x1A971C59, 0x1BF71951, 0x01481323,
1839 	  0x068D0633, 0x00BD12F9, 0x13EA1032, 0x136209E8,
1840 	  0x1C1E19A7, 0x06C7013E, 0x06C10AB0, 0x14C908BB,
1841 	  0x05830CE1, 0x1FEF18DD, 0x00620998, 0x010E0D19 },
1842 
1843 	{ 0x18180852, 0x0604111A, 0x0B771509, 0x1B6F0156,
1844 	  0x00181FE2, 0x1DCC0AF4, 0x16EF0659, 0x11F70E80,
1845 	  0x11A912D0, 0x01C414D2, 0x027618C6, 0x05840FC6,
1846 	  0x100215C4, 0x187E0C3B, 0x12771C96, 0x150C0B5D,
1847 	  0x0FF705FD, 0x07981C67, 0x1AD20C63, 0x01C11C55 },
1848 
1849 	{ 0x1E8113ED, 0x0A940370, 0x12920215, 0x1FA31D6F,
1850 	  0x1F7C0C82, 0x10CD03F7, 0x02640560, 0x081A0B5E,
1851 	  0x1BD21151, 0x00A21642, 0x0D0B0DA4, 0x0176113F,
1852 	  0x04440D1D, 0x001A1360, 0x1068012F, 0x1F141E49,
1853 	  0x10DF136B, 0x0E4F162B, 0x0D44104A, 0x01C1105F },
1854 
1855 	{ 0x011411A9, 0x01551A4F, 0x0ADA0C6B, 0x01BD0EC8,
1856 	  0x18120C74, 0x112F1778, 0x099202CB, 0x0C05124B,
1857 	  0x195316A4, 0x01600685, 0x1E3B1FE2, 0x189014E3,
1858 	  0x0B5E1FD7, 0x0E0311F8, 0x08E000F7, 0x174E00DE,
1859 	  0x160702DF, 0x1B5A15BF, 0x03A11237, 0x01D01704 },
1860 
1861 	{ 0x0C3D12A3, 0x0C501C0C, 0x17AD1300, 0x1715003F,
1862 	  0x03F719F8, 0x18031ED8, 0x1D980667, 0x0F681896,
1863 	  0x1B7D00BF, 0x011C14CE, 0x0FA000B4, 0x1C3501B0,
1864 	  0x0D901C55, 0x06790C10, 0x029E0736, 0x0DEB0400,
1865 	  0x034F183A, 0x030619B4, 0x0DEF0033, 0x00E71AC7 },
1866 
1867 	{ 0x1B7D1393, 0x1B3B1076, 0x0BED1B4D, 0x13011F3A,
1868 	  0x0E0E1238, 0x156A132B, 0x013A02D3, 0x160A0D01,
1869 	  0x1CED1EE9, 0x00C5165D, 0x184C157E, 0x08141A83,
1870 	  0x153C0DA5, 0x1ED70F9D, 0x05170D51, 0x02CF13B8,
1871 	  0x18AE1771, 0x1B04113F, 0x05EC11E9, 0x015A16B3 },
1872 
1873 	{ 0x04A41EE0, 0x1D1412E4, 0x1C591D79, 0x118511B7,
1874 	  0x14F00ACB, 0x1AE31E1C, 0x049C0D51, 0x016E061E,
1875 	  0x1DB71EDF, 0x01D41A35, 0x0E8208FA, 0x14441293,
1876 	  0x011F1E85, 0x1D54137A, 0x026B114F, 0x151D0832,
1877 	  0x00A50964, 0x1F9C1E1C, 0x064B12C9, 0x005409D1 },
1878 
1879 	{ 0x062B123F, 0x0C0D0501, 0x183704C3, 0x08E31120,
1880 	  0x0A2E0A6C, 0x14440FED, 0x090A0D1E, 0x13271964,
1881 	  0x0B590A3A, 0x019D1D9B, 0x05780773, 0x09770A91,
1882 	  0x0F770CA3, 0x053F19D4, 0x02C80DED, 0x1A761304,
1883 	  0x091E0DD9, 0x15D201B8, 0x151109AA, 0x010F0198 },
1884 
1885 	{ 0x05E101D1, 0x072314DD, 0x045F1433, 0x1A041541,
1886 	  0x10B3142E, 0x01840736, 0x1C1B19DB, 0x098B0418,
1887 	  0x1DBC083B, 0x007D1444, 0x01511740, 0x11DD1F3A,
1888 	  0x04ED0E2F, 0x1B4B1A62, 0x10480D04, 0x09E911A2,
1889 	  0x04211AFA, 0x19140893, 0x04D60CC4, 0x01210648 },
1890 
1891 	{ 0x112703C4, 0x018B1BA1, 0x164C1D50, 0x05160BE0,
1892 	  0x0BCC1830, 0x01CB1554, 0x13291732, 0x1B2B1918,
1893 	  0x0DED0817, 0x00E80775, 0x0A2401D3, 0x0BFE08B3,
1894 	  0x0E531199, 0x058616E9, 0x04770B91, 0x110F0C55,
1895 	  0x19C11554, 0x0BFB1159, 0x03541C38, 0x000E1C2D },
1896 
1897 	{ 0x10390C01, 0x02BB0751, 0x0AC5098E, 0x096C17AB,
1898 	  0x03C90E28, 0x10BD18BF, 0x002E1F2D, 0x092B0986,
1899 	  0x1BD700AC, 0x002E1F20, 0x1E3D1FD8, 0x077718BB,
1900 	  0x06F919C4, 0x187407ED, 0x11370E14, 0x081E139C,
1901 	  0x00481ADB, 0x14AB0289, 0x066A0EBE, 0x00C70ED6 },
1902 
1903 	{ 0x0694120B, 0x124E1CC9, 0x0E2F0570, 0x17CF081A,
1904 	  0x078906AC, 0x066D17CF, 0x1B3207F4, 0x0C5705E9,
1905 	  0x10001C38, 0x00A919DE, 0x06851375, 0x0F900BD8,
1906 	  0x080401BA, 0x0EEE0D42, 0x1B8B11EA, 0x0B4519F0,
1907 	  0x090F18C0, 0x062E1508, 0x0DD909F4, 0x01EB067C },
1908 
1909 	{ 0x0CDC1D5F, 0x0D1818F9, 0x07781636, 0x125B18E8,
1910 	  0x0D7003AF, 0x13110099, 0x1D9B1899, 0x175C1EB7,
1911 	  0x0E34171A, 0x01E01153, 0x081A0F36, 0x0B391783,
1912 	  0x1D1F147E, 0x19CE16D7, 0x11511B21, 0x1F2C10F9,
1913 	  0x12CA0E51, 0x05A31D39, 0x171A192E, 0x016B0E4F }
1914 };
1915 
1916 /*
1917  * Lookup one of the Gwin[] values, by index. This is constant-time.
1918  */
1919 static void
1920 lookup_Gwin(p256_jacobian *T, uint32_t idx)
1921 {
1922 	uint32_t xy[20];
1923 	uint32_t k;
1924 	size_t u;
1925 
1926 	memset(xy, 0, sizeof xy);
1927 	for (k = 0; k < 15; k ++) {
1928 		uint32_t m;
1929 
1930 		m = -EQ(idx, k + 1);
1931 		for (u = 0; u < 20; u ++) {
1932 			xy[u] |= m & Gwin[k][u];
1933 		}
1934 	}
1935 	for (u = 0; u < 10; u ++) {
1936 		T->x[(u << 1) + 0] = xy[u] & 0xFFFF;
1937 		T->x[(u << 1) + 1] = xy[u] >> 16;
1938 		T->y[(u << 1) + 0] = xy[u + 10] & 0xFFFF;
1939 		T->y[(u << 1) + 1] = xy[u + 10] >> 16;
1940 	}
1941 	memset(T->z, 0, sizeof T->z);
1942 	T->z[0] = 1;
1943 }
1944 
1945 /*
1946  * Multiply the generator by an integer. The integer is assumed non-zero
1947  * and lower than the curve order.
1948  */
1949 static void
1950 p256_mulgen(p256_jacobian *P, const unsigned char *x, size_t xlen)
1951 {
1952 	/*
1953 	 * qz is a flag that is initially 1, and remains equal to 1
1954 	 * as long as the point is the point at infinity.
1955 	 *
1956 	 * We use a 4-bit window to handle multiplier bits by groups
1957 	 * of 4. The precomputed window is constant static data, with
1958 	 * points in affine coordinates; we use a constant-time lookup.
1959 	 */
1960 	p256_jacobian Q;
1961 	uint32_t qz;
1962 
1963 	memset(&Q, 0, sizeof Q);
1964 	qz = 1;
1965 	while (xlen -- > 0) {
1966 		int k;
1967 		unsigned bx;
1968 
1969 		bx = *x ++;
1970 		for (k = 0; k < 2; k ++) {
1971 			uint32_t bits;
1972 			uint32_t bnz;
1973 			p256_jacobian T, U;
1974 
1975 			p256_double(&Q);
1976 			p256_double(&Q);
1977 			p256_double(&Q);
1978 			p256_double(&Q);
1979 			bits = (bx >> 4) & 0x0F;
1980 			bnz = NEQ(bits, 0);
1981 			lookup_Gwin(&T, bits);
1982 			U = Q;
1983 			p256_add_mixed(&U, &T);
1984 			CCOPY(bnz & qz, &Q, &T, sizeof Q);
1985 			CCOPY(bnz & ~qz, &Q, &U, sizeof Q);
1986 			qz &= ~bnz;
1987 			bx <<= 4;
1988 		}
1989 	}
1990 	*P = Q;
1991 }
1992 
1993 static const unsigned char P256_G[] = {
1994 	0x04, 0x6B, 0x17, 0xD1, 0xF2, 0xE1, 0x2C, 0x42, 0x47, 0xF8,
1995 	0xBC, 0xE6, 0xE5, 0x63, 0xA4, 0x40, 0xF2, 0x77, 0x03, 0x7D,
1996 	0x81, 0x2D, 0xEB, 0x33, 0xA0, 0xF4, 0xA1, 0x39, 0x45, 0xD8,
1997 	0x98, 0xC2, 0x96, 0x4F, 0xE3, 0x42, 0xE2, 0xFE, 0x1A, 0x7F,
1998 	0x9B, 0x8E, 0xE7, 0xEB, 0x4A, 0x7C, 0x0F, 0x9E, 0x16, 0x2B,
1999 	0xCE, 0x33, 0x57, 0x6B, 0x31, 0x5E, 0xCE, 0xCB, 0xB6, 0x40,
2000 	0x68, 0x37, 0xBF, 0x51, 0xF5
2001 };
2002 
2003 static const unsigned char P256_N[] = {
2004 	0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF,
2005 	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xBC, 0xE6, 0xFA, 0xAD,
2006 	0xA7, 0x17, 0x9E, 0x84, 0xF3, 0xB9, 0xCA, 0xC2, 0xFC, 0x63,
2007 	0x25, 0x51
2008 };
2009 
2010 static const unsigned char *
2011 api_generator(int curve, size_t *len)
2012 {
2013 	(void)curve;
2014 	*len = sizeof P256_G;
2015 	return P256_G;
2016 }
2017 
2018 static const unsigned char *
2019 api_order(int curve, size_t *len)
2020 {
2021 	(void)curve;
2022 	*len = sizeof P256_N;
2023 	return P256_N;
2024 }
2025 
2026 static size_t
2027 api_xoff(int curve, size_t *len)
2028 {
2029 	(void)curve;
2030 	*len = 32;
2031 	return 1;
2032 }
2033 
2034 static uint32_t
2035 api_mul(unsigned char *G, size_t Glen,
2036 	const unsigned char *x, size_t xlen, int curve)
2037 {
2038 	uint32_t r;
2039 	p256_jacobian P;
2040 
2041 	(void)curve;
2042 	if (Glen != 65) {
2043 		return 0;
2044 	}
2045 	r = p256_decode(&P, G, Glen);
2046 	p256_mul(&P, x, xlen);
2047 	p256_to_affine(&P);
2048 	p256_encode(G, &P);
2049 	return r;
2050 }
2051 
2052 static size_t
2053 api_mulgen(unsigned char *R,
2054 	const unsigned char *x, size_t xlen, int curve)
2055 {
2056 	p256_jacobian P;
2057 
2058 	(void)curve;
2059 	p256_mulgen(&P, x, xlen);
2060 	p256_to_affine(&P);
2061 	p256_encode(R, &P);
2062 	return 65;
2063 }
2064 
2065 static uint32_t
2066 api_muladd(unsigned char *A, const unsigned char *B, size_t len,
2067 	const unsigned char *x, size_t xlen,
2068 	const unsigned char *y, size_t ylen, int curve)
2069 {
2070 	p256_jacobian P, Q;
2071 	uint32_t r, t, z;
2072 	int i;
2073 
2074 	(void)curve;
2075 	if (len != 65) {
2076 		return 0;
2077 	}
2078 	r = p256_decode(&P, A, len);
2079 	p256_mul(&P, x, xlen);
2080 	if (B == NULL) {
2081 		p256_mulgen(&Q, y, ylen);
2082 	} else {
2083 		r &= p256_decode(&Q, B, len);
2084 		p256_mul(&Q, y, ylen);
2085 	}
2086 
2087 	/*
2088 	 * The final addition may fail in case both points are equal.
2089 	 */
2090 	t = p256_add(&P, &Q);
2091 	reduce_final_f256(P.z);
2092 	z = 0;
2093 	for (i = 0; i < 20; i ++) {
2094 		z |= P.z[i];
2095 	}
2096 	z = EQ(z, 0);
2097 	p256_double(&Q);
2098 
2099 	/*
2100 	 * If z is 1 then either P+Q = 0 (t = 1) or P = Q (t = 0). So we
2101 	 * have the following:
2102 	 *
2103 	 *   z = 0, t = 0   return P (normal addition)
2104 	 *   z = 0, t = 1   return P (normal addition)
2105 	 *   z = 1, t = 0   return Q (a 'double' case)
2106 	 *   z = 1, t = 1   report an error (P+Q = 0)
2107 	 */
2108 	CCOPY(z & ~t, &P, &Q, sizeof Q);
2109 	p256_to_affine(&P);
2110 	p256_encode(A, &P);
2111 	r &= ~(z & t);
2112 	return r;
2113 }
2114 
2115 /* see bearssl_ec.h */
2116 const br_ec_impl br_ec_p256_m15 = {
2117 	(uint32_t)0x00800000,
2118 	&api_generator,
2119 	&api_order,
2120 	&api_xoff,
2121 	&api_mul,
2122 	&api_mulgen,
2123 	&api_muladd
2124 };
2125