xref: /freebsd/contrib/bearssl/src/ec/ec_c25519_m15.c (revision 0e8011faf58b743cc652e3b2ad0f7671227610df)
1 /*
2  * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining
5  * a copy of this software and associated documentation files (the
6  * "Software"), to deal in the Software without restriction, including
7  * without limitation the rights to use, copy, modify, merge, publish,
8  * distribute, sublicense, and/or sell copies of the Software, and to
9  * permit persons to whom the Software is furnished to do so, subject to
10  * the following conditions:
11  *
12  * The above copyright notice and this permission notice shall be
13  * included in all copies or substantial portions of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 
25 #include "inner.h"
26 
27 /* obsolete
28 #include <stdio.h>
29 #include <stdlib.h>
30 static void
31 print_int(const char *name, const uint32_t *x)
32 {
33 	size_t u;
34 	unsigned char tmp[36];
35 
36 	printf("%s = ", name);
37 	for (u = 0; u < 20; u ++) {
38 		if (x[u] > 0x1FFF) {
39 			printf("INVALID:");
40 			for (u = 0; u < 20; u ++) {
41 				printf(" %04X", x[u]);
42 			}
43 			printf("\n");
44 			return;
45 		}
46 	}
47 	memset(tmp, 0, sizeof tmp);
48 	for (u = 0; u < 20; u ++) {
49 		uint32_t w;
50 		int j, k;
51 
52 		w = x[u];
53 		j = 13 * (int)u;
54 		k = j & 7;
55 		if (k != 0) {
56 			w <<= k;
57 			j -= k;
58 		}
59 		k = j >> 3;
60 		tmp[35 - k] |= (unsigned char)w;
61 		tmp[34 - k] |= (unsigned char)(w >> 8);
62 		tmp[33 - k] |= (unsigned char)(w >> 16);
63 		tmp[32 - k] |= (unsigned char)(w >> 24);
64 	}
65 	for (u = 4; u < 36; u ++) {
66 		printf("%02X", tmp[u]);
67 	}
68 	printf("\n");
69 }
70 */
71 
72 /*
73  * If BR_NO_ARITH_SHIFT is undefined, or defined to 0, then we _assume_
74  * that right-shifting a signed negative integer copies the sign bit
75  * (arithmetic right-shift). This is "implementation-defined behaviour",
76  * i.e. it is not undefined, but it may differ between compilers. Each
77  * compiler is supposed to document its behaviour in that respect. GCC
78  * explicitly defines that an arithmetic right shift is used. We expect
79  * all other compilers to do the same, because underlying CPU offer an
80  * arithmetic right shift opcode that could not be used otherwise.
81  */
82 #if BR_NO_ARITH_SHIFT
83 #define ARSH(x, n)   (((uint32_t)(x) >> (n)) \
84                     | ((-((uint32_t)(x) >> 31)) << (32 - (n))))
85 #else
86 #define ARSH(x, n)   ((*(int32_t *)&(x)) >> (n))
87 #endif
88 
89 /*
90  * Convert an integer from unsigned little-endian encoding to a sequence of
91  * 13-bit words in little-endian order. The final "partial" word is
92  * returned.
93  */
94 static uint32_t
95 le8_to_le13(uint32_t *dst, const unsigned char *src, size_t len)
96 {
97 	uint32_t acc;
98 	int acc_len;
99 
100 	acc = 0;
101 	acc_len = 0;
102 	while (len -- > 0) {
103 		acc |= (uint32_t)(*src ++) << acc_len;
104 		acc_len += 8;
105 		if (acc_len >= 13) {
106 			*dst ++ = acc & 0x1FFF;
107 			acc >>= 13;
108 			acc_len -= 13;
109 		}
110 	}
111 	return acc;
112 }
113 
114 /*
115  * Convert an integer (13-bit words, little-endian) to unsigned
116  * little-endian encoding. The total encoding length is provided; all
117  * the destination bytes will be filled.
118  */
119 static void
120 le13_to_le8(unsigned char *dst, size_t len, const uint32_t *src)
121 {
122 	uint32_t acc;
123 	int acc_len;
124 
125 	acc = 0;
126 	acc_len = 0;
127 	while (len -- > 0) {
128 		if (acc_len < 8) {
129 			acc |= (*src ++) << acc_len;
130 			acc_len += 13;
131 		}
132 		*dst ++ = (unsigned char)acc;
133 		acc >>= 8;
134 		acc_len -= 8;
135 	}
136 }
137 
138 /*
139  * Normalise an array of words to a strict 13 bits per word. Returned
140  * value is the resulting carry. The source (w) and destination (d)
141  * arrays may be identical, but shall not overlap partially.
142  */
143 static inline uint32_t
144 norm13(uint32_t *d, const uint32_t *w, size_t len)
145 {
146 	size_t u;
147 	uint32_t cc;
148 
149 	cc = 0;
150 	for (u = 0; u < len; u ++) {
151 		int32_t z;
152 
153 		z = w[u] + cc;
154 		d[u] = z & 0x1FFF;
155 		cc = ARSH(z, 13);
156 	}
157 	return cc;
158 }
159 
160 /*
161  * mul20() multiplies two 260-bit integers together. Each word must fit
162  * on 13 bits; source operands use 20 words, destination operand
163  * receives 40 words. All overlaps allowed.
164  *
165  * square20() computes the square of a 260-bit integer. Each word must
166  * fit on 13 bits; source operand uses 20 words, destination operand
167  * receives 40 words. All overlaps allowed.
168  */
169 
170 #if BR_SLOW_MUL15
171 
172 static void
173 mul20(uint32_t *d, const uint32_t *a, const uint32_t *b)
174 {
175 	/*
176 	 * Two-level Karatsuba: turns a 20x20 multiplication into
177 	 * nine 5x5 multiplications. We use 13-bit words but do not
178 	 * propagate carries immediately, so words may expand:
179 	 *
180 	 *  - First Karatsuba decomposition turns the 20x20 mul on
181 	 *    13-bit words into three 10x10 muls, two on 13-bit words
182 	 *    and one on 14-bit words.
183 	 *
184 	 *  - Second Karatsuba decomposition further splits these into:
185 	 *
186 	 *     * four 5x5 muls on 13-bit words
187 	 *     * four 5x5 muls on 14-bit words
188 	 *     * one 5x5 mul on 15-bit words
189 	 *
190 	 * Highest word value is 8191, 16382 or 32764, for 13-bit, 14-bit
191 	 * or 15-bit words, respectively.
192 	 */
193 	uint32_t u[45], v[45], w[90];
194 	uint32_t cc;
195 	int i;
196 
197 #define ZADD(dw, d_off, s1w, s1_off, s2w, s2_off)   do { \
198 		(dw)[5 * (d_off) + 0] = (s1w)[5 * (s1_off) + 0] \
199 			+ (s2w)[5 * (s2_off) + 0]; \
200 		(dw)[5 * (d_off) + 1] = (s1w)[5 * (s1_off) + 1] \
201 			+ (s2w)[5 * (s2_off) + 1]; \
202 		(dw)[5 * (d_off) + 2] = (s1w)[5 * (s1_off) + 2] \
203 			+ (s2w)[5 * (s2_off) + 2]; \
204 		(dw)[5 * (d_off) + 3] = (s1w)[5 * (s1_off) + 3] \
205 			+ (s2w)[5 * (s2_off) + 3]; \
206 		(dw)[5 * (d_off) + 4] = (s1w)[5 * (s1_off) + 4] \
207 			+ (s2w)[5 * (s2_off) + 4]; \
208 	} while (0)
209 
210 #define ZADDT(dw, d_off, sw, s_off)   do { \
211 		(dw)[5 * (d_off) + 0] += (sw)[5 * (s_off) + 0]; \
212 		(dw)[5 * (d_off) + 1] += (sw)[5 * (s_off) + 1]; \
213 		(dw)[5 * (d_off) + 2] += (sw)[5 * (s_off) + 2]; \
214 		(dw)[5 * (d_off) + 3] += (sw)[5 * (s_off) + 3]; \
215 		(dw)[5 * (d_off) + 4] += (sw)[5 * (s_off) + 4]; \
216 	} while (0)
217 
218 #define ZSUB2F(dw, d_off, s1w, s1_off, s2w, s2_off)   do { \
219 		(dw)[5 * (d_off) + 0] -= (s1w)[5 * (s1_off) + 0] \
220 			+ (s2w)[5 * (s2_off) + 0]; \
221 		(dw)[5 * (d_off) + 1] -= (s1w)[5 * (s1_off) + 1] \
222 			+ (s2w)[5 * (s2_off) + 1]; \
223 		(dw)[5 * (d_off) + 2] -= (s1w)[5 * (s1_off) + 2] \
224 			+ (s2w)[5 * (s2_off) + 2]; \
225 		(dw)[5 * (d_off) + 3] -= (s1w)[5 * (s1_off) + 3] \
226 			+ (s2w)[5 * (s2_off) + 3]; \
227 		(dw)[5 * (d_off) + 4] -= (s1w)[5 * (s1_off) + 4] \
228 			+ (s2w)[5 * (s2_off) + 4]; \
229 	} while (0)
230 
231 #define CPR1(w, cprcc)   do { \
232 		uint32_t cprz = (w) + cprcc; \
233 		(w) = cprz & 0x1FFF; \
234 		cprcc = cprz >> 13; \
235 	} while (0)
236 
237 #define CPR(dw, d_off)   do { \
238 		uint32_t cprcc; \
239 		cprcc = 0; \
240 		CPR1((dw)[(d_off) + 0], cprcc); \
241 		CPR1((dw)[(d_off) + 1], cprcc); \
242 		CPR1((dw)[(d_off) + 2], cprcc); \
243 		CPR1((dw)[(d_off) + 3], cprcc); \
244 		CPR1((dw)[(d_off) + 4], cprcc); \
245 		CPR1((dw)[(d_off) + 5], cprcc); \
246 		CPR1((dw)[(d_off) + 6], cprcc); \
247 		CPR1((dw)[(d_off) + 7], cprcc); \
248 		CPR1((dw)[(d_off) + 8], cprcc); \
249 		(dw)[(d_off) + 9] = cprcc; \
250 	} while (0)
251 
252 	memcpy(u, a, 20 * sizeof *a);
253 	ZADD(u, 4, a, 0, a, 1);
254 	ZADD(u, 5, a, 2, a, 3);
255 	ZADD(u, 6, a, 0, a, 2);
256 	ZADD(u, 7, a, 1, a, 3);
257 	ZADD(u, 8, u, 6, u, 7);
258 
259 	memcpy(v, b, 20 * sizeof *b);
260 	ZADD(v, 4, b, 0, b, 1);
261 	ZADD(v, 5, b, 2, b, 3);
262 	ZADD(v, 6, b, 0, b, 2);
263 	ZADD(v, 7, b, 1, b, 3);
264 	ZADD(v, 8, v, 6, v, 7);
265 
266 	/*
267 	 * Do the eight first 8x8 muls. Source words are at most 16382
268 	 * each, so we can add product results together "as is" in 32-bit
269 	 * words.
270 	 */
271 	for (i = 0; i < 40; i += 5) {
272 		w[(i << 1) + 0] = MUL15(u[i + 0], v[i + 0]);
273 		w[(i << 1) + 1] = MUL15(u[i + 0], v[i + 1])
274 			+ MUL15(u[i + 1], v[i + 0]);
275 		w[(i << 1) + 2] = MUL15(u[i + 0], v[i + 2])
276 			+ MUL15(u[i + 1], v[i + 1])
277 			+ MUL15(u[i + 2], v[i + 0]);
278 		w[(i << 1) + 3] = MUL15(u[i + 0], v[i + 3])
279 			+ MUL15(u[i + 1], v[i + 2])
280 			+ MUL15(u[i + 2], v[i + 1])
281 			+ MUL15(u[i + 3], v[i + 0]);
282 		w[(i << 1) + 4] = MUL15(u[i + 0], v[i + 4])
283 			+ MUL15(u[i + 1], v[i + 3])
284 			+ MUL15(u[i + 2], v[i + 2])
285 			+ MUL15(u[i + 3], v[i + 1])
286 			+ MUL15(u[i + 4], v[i + 0]);
287 		w[(i << 1) + 5] = MUL15(u[i + 1], v[i + 4])
288 			+ MUL15(u[i + 2], v[i + 3])
289 			+ MUL15(u[i + 3], v[i + 2])
290 			+ MUL15(u[i + 4], v[i + 1]);
291 		w[(i << 1) + 6] = MUL15(u[i + 2], v[i + 4])
292 			+ MUL15(u[i + 3], v[i + 3])
293 			+ MUL15(u[i + 4], v[i + 2]);
294 		w[(i << 1) + 7] = MUL15(u[i + 3], v[i + 4])
295 			+ MUL15(u[i + 4], v[i + 3]);
296 		w[(i << 1) + 8] = MUL15(u[i + 4], v[i + 4]);
297 		w[(i << 1) + 9] = 0;
298 	}
299 
300 	/*
301 	 * For the 9th multiplication, source words are up to 32764,
302 	 * so we must do some carry propagation. If we add up to
303 	 * 4 products and the carry is no more than 524224, then the
304 	 * result fits in 32 bits, and the next carry will be no more
305 	 * than 524224 (because 4*(32764^2)+524224 < 8192*524225).
306 	 *
307 	 * We thus just skip one of the products in the middle word,
308 	 * then do a carry propagation (this reduces words to 13 bits
309 	 * each, except possibly the last, which may use up to 17 bits
310 	 * or so), then add the missing product.
311 	 */
312 	w[80 + 0] = MUL15(u[40 + 0], v[40 + 0]);
313 	w[80 + 1] = MUL15(u[40 + 0], v[40 + 1])
314 		+ MUL15(u[40 + 1], v[40 + 0]);
315 	w[80 + 2] = MUL15(u[40 + 0], v[40 + 2])
316 		+ MUL15(u[40 + 1], v[40 + 1])
317 		+ MUL15(u[40 + 2], v[40 + 0]);
318 	w[80 + 3] = MUL15(u[40 + 0], v[40 + 3])
319 		+ MUL15(u[40 + 1], v[40 + 2])
320 		+ MUL15(u[40 + 2], v[40 + 1])
321 		+ MUL15(u[40 + 3], v[40 + 0]);
322 	w[80 + 4] = MUL15(u[40 + 0], v[40 + 4])
323 		+ MUL15(u[40 + 1], v[40 + 3])
324 		+ MUL15(u[40 + 2], v[40 + 2])
325 		+ MUL15(u[40 + 3], v[40 + 1]);
326 		/* + MUL15(u[40 + 4], v[40 + 0]) */
327 	w[80 + 5] = MUL15(u[40 + 1], v[40 + 4])
328 		+ MUL15(u[40 + 2], v[40 + 3])
329 		+ MUL15(u[40 + 3], v[40 + 2])
330 		+ MUL15(u[40 + 4], v[40 + 1]);
331 	w[80 + 6] = MUL15(u[40 + 2], v[40 + 4])
332 		+ MUL15(u[40 + 3], v[40 + 3])
333 		+ MUL15(u[40 + 4], v[40 + 2]);
334 	w[80 + 7] = MUL15(u[40 + 3], v[40 + 4])
335 		+ MUL15(u[40 + 4], v[40 + 3]);
336 	w[80 + 8] = MUL15(u[40 + 4], v[40 + 4]);
337 
338 	CPR(w, 80);
339 
340 	w[80 + 4] += MUL15(u[40 + 4], v[40 + 0]);
341 
342 	/*
343 	 * The products on 14-bit words in slots 6 and 7 yield values
344 	 * up to 5*(16382^2) each, and we need to subtract two such
345 	 * values from the higher word. We need the subtraction to fit
346 	 * in a _signed_ 32-bit integer, i.e. 31 bits + a sign bit.
347 	 * However, 10*(16382^2) does not fit. So we must perform a
348 	 * bit of reduction here.
349 	 */
350 	CPR(w, 60);
351 	CPR(w, 70);
352 
353 	/*
354 	 * Recompose results.
355 	 */
356 
357 	/* 0..1*0..1 into 0..3 */
358 	ZSUB2F(w, 8, w, 0, w, 2);
359 	ZSUB2F(w, 9, w, 1, w, 3);
360 	ZADDT(w, 1, w, 8);
361 	ZADDT(w, 2, w, 9);
362 
363 	/* 2..3*2..3 into 4..7 */
364 	ZSUB2F(w, 10, w, 4, w, 6);
365 	ZSUB2F(w, 11, w, 5, w, 7);
366 	ZADDT(w, 5, w, 10);
367 	ZADDT(w, 6, w, 11);
368 
369 	/* (0..1+2..3)*(0..1+2..3) into 12..15 */
370 	ZSUB2F(w, 16, w, 12, w, 14);
371 	ZSUB2F(w, 17, w, 13, w, 15);
372 	ZADDT(w, 13, w, 16);
373 	ZADDT(w, 14, w, 17);
374 
375 	/* first-level recomposition */
376 	ZSUB2F(w, 12, w, 0, w, 4);
377 	ZSUB2F(w, 13, w, 1, w, 5);
378 	ZSUB2F(w, 14, w, 2, w, 6);
379 	ZSUB2F(w, 15, w, 3, w, 7);
380 	ZADDT(w, 2, w, 12);
381 	ZADDT(w, 3, w, 13);
382 	ZADDT(w, 4, w, 14);
383 	ZADDT(w, 5, w, 15);
384 
385 	/*
386 	 * Perform carry propagation to bring all words down to 13 bits.
387 	 */
388 	cc = norm13(d, w, 40);
389 	d[39] += (cc << 13);
390 
391 #undef ZADD
392 #undef ZADDT
393 #undef ZSUB2F
394 #undef CPR1
395 #undef CPR
396 }
397 
398 static inline void
399 square20(uint32_t *d, const uint32_t *a)
400 {
401 	mul20(d, a, a);
402 }
403 
404 #else
405 
406 static void
407 mul20(uint32_t *d, const uint32_t *a, const uint32_t *b)
408 {
409 	uint32_t t[39];
410 
411 	t[ 0] = MUL15(a[ 0], b[ 0]);
412 	t[ 1] = MUL15(a[ 0], b[ 1])
413 		+ MUL15(a[ 1], b[ 0]);
414 	t[ 2] = MUL15(a[ 0], b[ 2])
415 		+ MUL15(a[ 1], b[ 1])
416 		+ MUL15(a[ 2], b[ 0]);
417 	t[ 3] = MUL15(a[ 0], b[ 3])
418 		+ MUL15(a[ 1], b[ 2])
419 		+ MUL15(a[ 2], b[ 1])
420 		+ MUL15(a[ 3], b[ 0]);
421 	t[ 4] = MUL15(a[ 0], b[ 4])
422 		+ MUL15(a[ 1], b[ 3])
423 		+ MUL15(a[ 2], b[ 2])
424 		+ MUL15(a[ 3], b[ 1])
425 		+ MUL15(a[ 4], b[ 0]);
426 	t[ 5] = MUL15(a[ 0], b[ 5])
427 		+ MUL15(a[ 1], b[ 4])
428 		+ MUL15(a[ 2], b[ 3])
429 		+ MUL15(a[ 3], b[ 2])
430 		+ MUL15(a[ 4], b[ 1])
431 		+ MUL15(a[ 5], b[ 0]);
432 	t[ 6] = MUL15(a[ 0], b[ 6])
433 		+ MUL15(a[ 1], b[ 5])
434 		+ MUL15(a[ 2], b[ 4])
435 		+ MUL15(a[ 3], b[ 3])
436 		+ MUL15(a[ 4], b[ 2])
437 		+ MUL15(a[ 5], b[ 1])
438 		+ MUL15(a[ 6], b[ 0]);
439 	t[ 7] = MUL15(a[ 0], b[ 7])
440 		+ MUL15(a[ 1], b[ 6])
441 		+ MUL15(a[ 2], b[ 5])
442 		+ MUL15(a[ 3], b[ 4])
443 		+ MUL15(a[ 4], b[ 3])
444 		+ MUL15(a[ 5], b[ 2])
445 		+ MUL15(a[ 6], b[ 1])
446 		+ MUL15(a[ 7], b[ 0]);
447 	t[ 8] = MUL15(a[ 0], b[ 8])
448 		+ MUL15(a[ 1], b[ 7])
449 		+ MUL15(a[ 2], b[ 6])
450 		+ MUL15(a[ 3], b[ 5])
451 		+ MUL15(a[ 4], b[ 4])
452 		+ MUL15(a[ 5], b[ 3])
453 		+ MUL15(a[ 6], b[ 2])
454 		+ MUL15(a[ 7], b[ 1])
455 		+ MUL15(a[ 8], b[ 0]);
456 	t[ 9] = MUL15(a[ 0], b[ 9])
457 		+ MUL15(a[ 1], b[ 8])
458 		+ MUL15(a[ 2], b[ 7])
459 		+ MUL15(a[ 3], b[ 6])
460 		+ MUL15(a[ 4], b[ 5])
461 		+ MUL15(a[ 5], b[ 4])
462 		+ MUL15(a[ 6], b[ 3])
463 		+ MUL15(a[ 7], b[ 2])
464 		+ MUL15(a[ 8], b[ 1])
465 		+ MUL15(a[ 9], b[ 0]);
466 	t[10] = MUL15(a[ 0], b[10])
467 		+ MUL15(a[ 1], b[ 9])
468 		+ MUL15(a[ 2], b[ 8])
469 		+ MUL15(a[ 3], b[ 7])
470 		+ MUL15(a[ 4], b[ 6])
471 		+ MUL15(a[ 5], b[ 5])
472 		+ MUL15(a[ 6], b[ 4])
473 		+ MUL15(a[ 7], b[ 3])
474 		+ MUL15(a[ 8], b[ 2])
475 		+ MUL15(a[ 9], b[ 1])
476 		+ MUL15(a[10], b[ 0]);
477 	t[11] = MUL15(a[ 0], b[11])
478 		+ MUL15(a[ 1], b[10])
479 		+ MUL15(a[ 2], b[ 9])
480 		+ MUL15(a[ 3], b[ 8])
481 		+ MUL15(a[ 4], b[ 7])
482 		+ MUL15(a[ 5], b[ 6])
483 		+ MUL15(a[ 6], b[ 5])
484 		+ MUL15(a[ 7], b[ 4])
485 		+ MUL15(a[ 8], b[ 3])
486 		+ MUL15(a[ 9], b[ 2])
487 		+ MUL15(a[10], b[ 1])
488 		+ MUL15(a[11], b[ 0]);
489 	t[12] = MUL15(a[ 0], b[12])
490 		+ MUL15(a[ 1], b[11])
491 		+ MUL15(a[ 2], b[10])
492 		+ MUL15(a[ 3], b[ 9])
493 		+ MUL15(a[ 4], b[ 8])
494 		+ MUL15(a[ 5], b[ 7])
495 		+ MUL15(a[ 6], b[ 6])
496 		+ MUL15(a[ 7], b[ 5])
497 		+ MUL15(a[ 8], b[ 4])
498 		+ MUL15(a[ 9], b[ 3])
499 		+ MUL15(a[10], b[ 2])
500 		+ MUL15(a[11], b[ 1])
501 		+ MUL15(a[12], b[ 0]);
502 	t[13] = MUL15(a[ 0], b[13])
503 		+ MUL15(a[ 1], b[12])
504 		+ MUL15(a[ 2], b[11])
505 		+ MUL15(a[ 3], b[10])
506 		+ MUL15(a[ 4], b[ 9])
507 		+ MUL15(a[ 5], b[ 8])
508 		+ MUL15(a[ 6], b[ 7])
509 		+ MUL15(a[ 7], b[ 6])
510 		+ MUL15(a[ 8], b[ 5])
511 		+ MUL15(a[ 9], b[ 4])
512 		+ MUL15(a[10], b[ 3])
513 		+ MUL15(a[11], b[ 2])
514 		+ MUL15(a[12], b[ 1])
515 		+ MUL15(a[13], b[ 0]);
516 	t[14] = MUL15(a[ 0], b[14])
517 		+ MUL15(a[ 1], b[13])
518 		+ MUL15(a[ 2], b[12])
519 		+ MUL15(a[ 3], b[11])
520 		+ MUL15(a[ 4], b[10])
521 		+ MUL15(a[ 5], b[ 9])
522 		+ MUL15(a[ 6], b[ 8])
523 		+ MUL15(a[ 7], b[ 7])
524 		+ MUL15(a[ 8], b[ 6])
525 		+ MUL15(a[ 9], b[ 5])
526 		+ MUL15(a[10], b[ 4])
527 		+ MUL15(a[11], b[ 3])
528 		+ MUL15(a[12], b[ 2])
529 		+ MUL15(a[13], b[ 1])
530 		+ MUL15(a[14], b[ 0]);
531 	t[15] = MUL15(a[ 0], b[15])
532 		+ MUL15(a[ 1], b[14])
533 		+ MUL15(a[ 2], b[13])
534 		+ MUL15(a[ 3], b[12])
535 		+ MUL15(a[ 4], b[11])
536 		+ MUL15(a[ 5], b[10])
537 		+ MUL15(a[ 6], b[ 9])
538 		+ MUL15(a[ 7], b[ 8])
539 		+ MUL15(a[ 8], b[ 7])
540 		+ MUL15(a[ 9], b[ 6])
541 		+ MUL15(a[10], b[ 5])
542 		+ MUL15(a[11], b[ 4])
543 		+ MUL15(a[12], b[ 3])
544 		+ MUL15(a[13], b[ 2])
545 		+ MUL15(a[14], b[ 1])
546 		+ MUL15(a[15], b[ 0]);
547 	t[16] = MUL15(a[ 0], b[16])
548 		+ MUL15(a[ 1], b[15])
549 		+ MUL15(a[ 2], b[14])
550 		+ MUL15(a[ 3], b[13])
551 		+ MUL15(a[ 4], b[12])
552 		+ MUL15(a[ 5], b[11])
553 		+ MUL15(a[ 6], b[10])
554 		+ MUL15(a[ 7], b[ 9])
555 		+ MUL15(a[ 8], b[ 8])
556 		+ MUL15(a[ 9], b[ 7])
557 		+ MUL15(a[10], b[ 6])
558 		+ MUL15(a[11], b[ 5])
559 		+ MUL15(a[12], b[ 4])
560 		+ MUL15(a[13], b[ 3])
561 		+ MUL15(a[14], b[ 2])
562 		+ MUL15(a[15], b[ 1])
563 		+ MUL15(a[16], b[ 0]);
564 	t[17] = MUL15(a[ 0], b[17])
565 		+ MUL15(a[ 1], b[16])
566 		+ MUL15(a[ 2], b[15])
567 		+ MUL15(a[ 3], b[14])
568 		+ MUL15(a[ 4], b[13])
569 		+ MUL15(a[ 5], b[12])
570 		+ MUL15(a[ 6], b[11])
571 		+ MUL15(a[ 7], b[10])
572 		+ MUL15(a[ 8], b[ 9])
573 		+ MUL15(a[ 9], b[ 8])
574 		+ MUL15(a[10], b[ 7])
575 		+ MUL15(a[11], b[ 6])
576 		+ MUL15(a[12], b[ 5])
577 		+ MUL15(a[13], b[ 4])
578 		+ MUL15(a[14], b[ 3])
579 		+ MUL15(a[15], b[ 2])
580 		+ MUL15(a[16], b[ 1])
581 		+ MUL15(a[17], b[ 0]);
582 	t[18] = MUL15(a[ 0], b[18])
583 		+ MUL15(a[ 1], b[17])
584 		+ MUL15(a[ 2], b[16])
585 		+ MUL15(a[ 3], b[15])
586 		+ MUL15(a[ 4], b[14])
587 		+ MUL15(a[ 5], b[13])
588 		+ MUL15(a[ 6], b[12])
589 		+ MUL15(a[ 7], b[11])
590 		+ MUL15(a[ 8], b[10])
591 		+ MUL15(a[ 9], b[ 9])
592 		+ MUL15(a[10], b[ 8])
593 		+ MUL15(a[11], b[ 7])
594 		+ MUL15(a[12], b[ 6])
595 		+ MUL15(a[13], b[ 5])
596 		+ MUL15(a[14], b[ 4])
597 		+ MUL15(a[15], b[ 3])
598 		+ MUL15(a[16], b[ 2])
599 		+ MUL15(a[17], b[ 1])
600 		+ MUL15(a[18], b[ 0]);
601 	t[19] = MUL15(a[ 0], b[19])
602 		+ MUL15(a[ 1], b[18])
603 		+ MUL15(a[ 2], b[17])
604 		+ MUL15(a[ 3], b[16])
605 		+ MUL15(a[ 4], b[15])
606 		+ MUL15(a[ 5], b[14])
607 		+ MUL15(a[ 6], b[13])
608 		+ MUL15(a[ 7], b[12])
609 		+ MUL15(a[ 8], b[11])
610 		+ MUL15(a[ 9], b[10])
611 		+ MUL15(a[10], b[ 9])
612 		+ MUL15(a[11], b[ 8])
613 		+ MUL15(a[12], b[ 7])
614 		+ MUL15(a[13], b[ 6])
615 		+ MUL15(a[14], b[ 5])
616 		+ MUL15(a[15], b[ 4])
617 		+ MUL15(a[16], b[ 3])
618 		+ MUL15(a[17], b[ 2])
619 		+ MUL15(a[18], b[ 1])
620 		+ MUL15(a[19], b[ 0]);
621 	t[20] = MUL15(a[ 1], b[19])
622 		+ MUL15(a[ 2], b[18])
623 		+ MUL15(a[ 3], b[17])
624 		+ MUL15(a[ 4], b[16])
625 		+ MUL15(a[ 5], b[15])
626 		+ MUL15(a[ 6], b[14])
627 		+ MUL15(a[ 7], b[13])
628 		+ MUL15(a[ 8], b[12])
629 		+ MUL15(a[ 9], b[11])
630 		+ MUL15(a[10], b[10])
631 		+ MUL15(a[11], b[ 9])
632 		+ MUL15(a[12], b[ 8])
633 		+ MUL15(a[13], b[ 7])
634 		+ MUL15(a[14], b[ 6])
635 		+ MUL15(a[15], b[ 5])
636 		+ MUL15(a[16], b[ 4])
637 		+ MUL15(a[17], b[ 3])
638 		+ MUL15(a[18], b[ 2])
639 		+ MUL15(a[19], b[ 1]);
640 	t[21] = MUL15(a[ 2], b[19])
641 		+ MUL15(a[ 3], b[18])
642 		+ MUL15(a[ 4], b[17])
643 		+ MUL15(a[ 5], b[16])
644 		+ MUL15(a[ 6], b[15])
645 		+ MUL15(a[ 7], b[14])
646 		+ MUL15(a[ 8], b[13])
647 		+ MUL15(a[ 9], b[12])
648 		+ MUL15(a[10], b[11])
649 		+ MUL15(a[11], b[10])
650 		+ MUL15(a[12], b[ 9])
651 		+ MUL15(a[13], b[ 8])
652 		+ MUL15(a[14], b[ 7])
653 		+ MUL15(a[15], b[ 6])
654 		+ MUL15(a[16], b[ 5])
655 		+ MUL15(a[17], b[ 4])
656 		+ MUL15(a[18], b[ 3])
657 		+ MUL15(a[19], b[ 2]);
658 	t[22] = MUL15(a[ 3], b[19])
659 		+ MUL15(a[ 4], b[18])
660 		+ MUL15(a[ 5], b[17])
661 		+ MUL15(a[ 6], b[16])
662 		+ MUL15(a[ 7], b[15])
663 		+ MUL15(a[ 8], b[14])
664 		+ MUL15(a[ 9], b[13])
665 		+ MUL15(a[10], b[12])
666 		+ MUL15(a[11], b[11])
667 		+ MUL15(a[12], b[10])
668 		+ MUL15(a[13], b[ 9])
669 		+ MUL15(a[14], b[ 8])
670 		+ MUL15(a[15], b[ 7])
671 		+ MUL15(a[16], b[ 6])
672 		+ MUL15(a[17], b[ 5])
673 		+ MUL15(a[18], b[ 4])
674 		+ MUL15(a[19], b[ 3]);
675 	t[23] = MUL15(a[ 4], b[19])
676 		+ MUL15(a[ 5], b[18])
677 		+ MUL15(a[ 6], b[17])
678 		+ MUL15(a[ 7], b[16])
679 		+ MUL15(a[ 8], b[15])
680 		+ MUL15(a[ 9], b[14])
681 		+ MUL15(a[10], b[13])
682 		+ MUL15(a[11], b[12])
683 		+ MUL15(a[12], b[11])
684 		+ MUL15(a[13], b[10])
685 		+ MUL15(a[14], b[ 9])
686 		+ MUL15(a[15], b[ 8])
687 		+ MUL15(a[16], b[ 7])
688 		+ MUL15(a[17], b[ 6])
689 		+ MUL15(a[18], b[ 5])
690 		+ MUL15(a[19], b[ 4]);
691 	t[24] = MUL15(a[ 5], b[19])
692 		+ MUL15(a[ 6], b[18])
693 		+ MUL15(a[ 7], b[17])
694 		+ MUL15(a[ 8], b[16])
695 		+ MUL15(a[ 9], b[15])
696 		+ MUL15(a[10], b[14])
697 		+ MUL15(a[11], b[13])
698 		+ MUL15(a[12], b[12])
699 		+ MUL15(a[13], b[11])
700 		+ MUL15(a[14], b[10])
701 		+ MUL15(a[15], b[ 9])
702 		+ MUL15(a[16], b[ 8])
703 		+ MUL15(a[17], b[ 7])
704 		+ MUL15(a[18], b[ 6])
705 		+ MUL15(a[19], b[ 5]);
706 	t[25] = MUL15(a[ 6], b[19])
707 		+ MUL15(a[ 7], b[18])
708 		+ MUL15(a[ 8], b[17])
709 		+ MUL15(a[ 9], b[16])
710 		+ MUL15(a[10], b[15])
711 		+ MUL15(a[11], b[14])
712 		+ MUL15(a[12], b[13])
713 		+ MUL15(a[13], b[12])
714 		+ MUL15(a[14], b[11])
715 		+ MUL15(a[15], b[10])
716 		+ MUL15(a[16], b[ 9])
717 		+ MUL15(a[17], b[ 8])
718 		+ MUL15(a[18], b[ 7])
719 		+ MUL15(a[19], b[ 6]);
720 	t[26] = MUL15(a[ 7], b[19])
721 		+ MUL15(a[ 8], b[18])
722 		+ MUL15(a[ 9], b[17])
723 		+ MUL15(a[10], b[16])
724 		+ MUL15(a[11], b[15])
725 		+ MUL15(a[12], b[14])
726 		+ MUL15(a[13], b[13])
727 		+ MUL15(a[14], b[12])
728 		+ MUL15(a[15], b[11])
729 		+ MUL15(a[16], b[10])
730 		+ MUL15(a[17], b[ 9])
731 		+ MUL15(a[18], b[ 8])
732 		+ MUL15(a[19], b[ 7]);
733 	t[27] = MUL15(a[ 8], b[19])
734 		+ MUL15(a[ 9], b[18])
735 		+ MUL15(a[10], b[17])
736 		+ MUL15(a[11], b[16])
737 		+ MUL15(a[12], b[15])
738 		+ MUL15(a[13], b[14])
739 		+ MUL15(a[14], b[13])
740 		+ MUL15(a[15], b[12])
741 		+ MUL15(a[16], b[11])
742 		+ MUL15(a[17], b[10])
743 		+ MUL15(a[18], b[ 9])
744 		+ MUL15(a[19], b[ 8]);
745 	t[28] = MUL15(a[ 9], b[19])
746 		+ MUL15(a[10], b[18])
747 		+ MUL15(a[11], b[17])
748 		+ MUL15(a[12], b[16])
749 		+ MUL15(a[13], b[15])
750 		+ MUL15(a[14], b[14])
751 		+ MUL15(a[15], b[13])
752 		+ MUL15(a[16], b[12])
753 		+ MUL15(a[17], b[11])
754 		+ MUL15(a[18], b[10])
755 		+ MUL15(a[19], b[ 9]);
756 	t[29] = MUL15(a[10], b[19])
757 		+ MUL15(a[11], b[18])
758 		+ MUL15(a[12], b[17])
759 		+ MUL15(a[13], b[16])
760 		+ MUL15(a[14], b[15])
761 		+ MUL15(a[15], b[14])
762 		+ MUL15(a[16], b[13])
763 		+ MUL15(a[17], b[12])
764 		+ MUL15(a[18], b[11])
765 		+ MUL15(a[19], b[10]);
766 	t[30] = MUL15(a[11], b[19])
767 		+ MUL15(a[12], b[18])
768 		+ MUL15(a[13], b[17])
769 		+ MUL15(a[14], b[16])
770 		+ MUL15(a[15], b[15])
771 		+ MUL15(a[16], b[14])
772 		+ MUL15(a[17], b[13])
773 		+ MUL15(a[18], b[12])
774 		+ MUL15(a[19], b[11]);
775 	t[31] = MUL15(a[12], b[19])
776 		+ MUL15(a[13], b[18])
777 		+ MUL15(a[14], b[17])
778 		+ MUL15(a[15], b[16])
779 		+ MUL15(a[16], b[15])
780 		+ MUL15(a[17], b[14])
781 		+ MUL15(a[18], b[13])
782 		+ MUL15(a[19], b[12]);
783 	t[32] = MUL15(a[13], b[19])
784 		+ MUL15(a[14], b[18])
785 		+ MUL15(a[15], b[17])
786 		+ MUL15(a[16], b[16])
787 		+ MUL15(a[17], b[15])
788 		+ MUL15(a[18], b[14])
789 		+ MUL15(a[19], b[13]);
790 	t[33] = MUL15(a[14], b[19])
791 		+ MUL15(a[15], b[18])
792 		+ MUL15(a[16], b[17])
793 		+ MUL15(a[17], b[16])
794 		+ MUL15(a[18], b[15])
795 		+ MUL15(a[19], b[14]);
796 	t[34] = MUL15(a[15], b[19])
797 		+ MUL15(a[16], b[18])
798 		+ MUL15(a[17], b[17])
799 		+ MUL15(a[18], b[16])
800 		+ MUL15(a[19], b[15]);
801 	t[35] = MUL15(a[16], b[19])
802 		+ MUL15(a[17], b[18])
803 		+ MUL15(a[18], b[17])
804 		+ MUL15(a[19], b[16]);
805 	t[36] = MUL15(a[17], b[19])
806 		+ MUL15(a[18], b[18])
807 		+ MUL15(a[19], b[17]);
808 	t[37] = MUL15(a[18], b[19])
809 		+ MUL15(a[19], b[18]);
810 	t[38] = MUL15(a[19], b[19]);
811 
812 	d[39] = norm13(d, t, 39);
813 }
814 
815 static void
816 square20(uint32_t *d, const uint32_t *a)
817 {
818 	uint32_t t[39];
819 
820 	t[ 0] = MUL15(a[ 0], a[ 0]);
821 	t[ 1] = ((MUL15(a[ 0], a[ 1])) << 1);
822 	t[ 2] = MUL15(a[ 1], a[ 1])
823 		+ ((MUL15(a[ 0], a[ 2])) << 1);
824 	t[ 3] = ((MUL15(a[ 0], a[ 3])
825 		+ MUL15(a[ 1], a[ 2])) << 1);
826 	t[ 4] = MUL15(a[ 2], a[ 2])
827 		+ ((MUL15(a[ 0], a[ 4])
828 		+ MUL15(a[ 1], a[ 3])) << 1);
829 	t[ 5] = ((MUL15(a[ 0], a[ 5])
830 		+ MUL15(a[ 1], a[ 4])
831 		+ MUL15(a[ 2], a[ 3])) << 1);
832 	t[ 6] = MUL15(a[ 3], a[ 3])
833 		+ ((MUL15(a[ 0], a[ 6])
834 		+ MUL15(a[ 1], a[ 5])
835 		+ MUL15(a[ 2], a[ 4])) << 1);
836 	t[ 7] = ((MUL15(a[ 0], a[ 7])
837 		+ MUL15(a[ 1], a[ 6])
838 		+ MUL15(a[ 2], a[ 5])
839 		+ MUL15(a[ 3], a[ 4])) << 1);
840 	t[ 8] = MUL15(a[ 4], a[ 4])
841 		+ ((MUL15(a[ 0], a[ 8])
842 		+ MUL15(a[ 1], a[ 7])
843 		+ MUL15(a[ 2], a[ 6])
844 		+ MUL15(a[ 3], a[ 5])) << 1);
845 	t[ 9] = ((MUL15(a[ 0], a[ 9])
846 		+ MUL15(a[ 1], a[ 8])
847 		+ MUL15(a[ 2], a[ 7])
848 		+ MUL15(a[ 3], a[ 6])
849 		+ MUL15(a[ 4], a[ 5])) << 1);
850 	t[10] = MUL15(a[ 5], a[ 5])
851 		+ ((MUL15(a[ 0], a[10])
852 		+ MUL15(a[ 1], a[ 9])
853 		+ MUL15(a[ 2], a[ 8])
854 		+ MUL15(a[ 3], a[ 7])
855 		+ MUL15(a[ 4], a[ 6])) << 1);
856 	t[11] = ((MUL15(a[ 0], a[11])
857 		+ MUL15(a[ 1], a[10])
858 		+ MUL15(a[ 2], a[ 9])
859 		+ MUL15(a[ 3], a[ 8])
860 		+ MUL15(a[ 4], a[ 7])
861 		+ MUL15(a[ 5], a[ 6])) << 1);
862 	t[12] = MUL15(a[ 6], a[ 6])
863 		+ ((MUL15(a[ 0], a[12])
864 		+ MUL15(a[ 1], a[11])
865 		+ MUL15(a[ 2], a[10])
866 		+ MUL15(a[ 3], a[ 9])
867 		+ MUL15(a[ 4], a[ 8])
868 		+ MUL15(a[ 5], a[ 7])) << 1);
869 	t[13] = ((MUL15(a[ 0], a[13])
870 		+ MUL15(a[ 1], a[12])
871 		+ MUL15(a[ 2], a[11])
872 		+ MUL15(a[ 3], a[10])
873 		+ MUL15(a[ 4], a[ 9])
874 		+ MUL15(a[ 5], a[ 8])
875 		+ MUL15(a[ 6], a[ 7])) << 1);
876 	t[14] = MUL15(a[ 7], a[ 7])
877 		+ ((MUL15(a[ 0], a[14])
878 		+ MUL15(a[ 1], a[13])
879 		+ MUL15(a[ 2], a[12])
880 		+ MUL15(a[ 3], a[11])
881 		+ MUL15(a[ 4], a[10])
882 		+ MUL15(a[ 5], a[ 9])
883 		+ MUL15(a[ 6], a[ 8])) << 1);
884 	t[15] = ((MUL15(a[ 0], a[15])
885 		+ MUL15(a[ 1], a[14])
886 		+ MUL15(a[ 2], a[13])
887 		+ MUL15(a[ 3], a[12])
888 		+ MUL15(a[ 4], a[11])
889 		+ MUL15(a[ 5], a[10])
890 		+ MUL15(a[ 6], a[ 9])
891 		+ MUL15(a[ 7], a[ 8])) << 1);
892 	t[16] = MUL15(a[ 8], a[ 8])
893 		+ ((MUL15(a[ 0], a[16])
894 		+ MUL15(a[ 1], a[15])
895 		+ MUL15(a[ 2], a[14])
896 		+ MUL15(a[ 3], a[13])
897 		+ MUL15(a[ 4], a[12])
898 		+ MUL15(a[ 5], a[11])
899 		+ MUL15(a[ 6], a[10])
900 		+ MUL15(a[ 7], a[ 9])) << 1);
901 	t[17] = ((MUL15(a[ 0], a[17])
902 		+ MUL15(a[ 1], a[16])
903 		+ MUL15(a[ 2], a[15])
904 		+ MUL15(a[ 3], a[14])
905 		+ MUL15(a[ 4], a[13])
906 		+ MUL15(a[ 5], a[12])
907 		+ MUL15(a[ 6], a[11])
908 		+ MUL15(a[ 7], a[10])
909 		+ MUL15(a[ 8], a[ 9])) << 1);
910 	t[18] = MUL15(a[ 9], a[ 9])
911 		+ ((MUL15(a[ 0], a[18])
912 		+ MUL15(a[ 1], a[17])
913 		+ MUL15(a[ 2], a[16])
914 		+ MUL15(a[ 3], a[15])
915 		+ MUL15(a[ 4], a[14])
916 		+ MUL15(a[ 5], a[13])
917 		+ MUL15(a[ 6], a[12])
918 		+ MUL15(a[ 7], a[11])
919 		+ MUL15(a[ 8], a[10])) << 1);
920 	t[19] = ((MUL15(a[ 0], a[19])
921 		+ MUL15(a[ 1], a[18])
922 		+ MUL15(a[ 2], a[17])
923 		+ MUL15(a[ 3], a[16])
924 		+ MUL15(a[ 4], a[15])
925 		+ MUL15(a[ 5], a[14])
926 		+ MUL15(a[ 6], a[13])
927 		+ MUL15(a[ 7], a[12])
928 		+ MUL15(a[ 8], a[11])
929 		+ MUL15(a[ 9], a[10])) << 1);
930 	t[20] = MUL15(a[10], a[10])
931 		+ ((MUL15(a[ 1], a[19])
932 		+ MUL15(a[ 2], a[18])
933 		+ MUL15(a[ 3], a[17])
934 		+ MUL15(a[ 4], a[16])
935 		+ MUL15(a[ 5], a[15])
936 		+ MUL15(a[ 6], a[14])
937 		+ MUL15(a[ 7], a[13])
938 		+ MUL15(a[ 8], a[12])
939 		+ MUL15(a[ 9], a[11])) << 1);
940 	t[21] = ((MUL15(a[ 2], a[19])
941 		+ MUL15(a[ 3], a[18])
942 		+ MUL15(a[ 4], a[17])
943 		+ MUL15(a[ 5], a[16])
944 		+ MUL15(a[ 6], a[15])
945 		+ MUL15(a[ 7], a[14])
946 		+ MUL15(a[ 8], a[13])
947 		+ MUL15(a[ 9], a[12])
948 		+ MUL15(a[10], a[11])) << 1);
949 	t[22] = MUL15(a[11], a[11])
950 		+ ((MUL15(a[ 3], a[19])
951 		+ MUL15(a[ 4], a[18])
952 		+ MUL15(a[ 5], a[17])
953 		+ MUL15(a[ 6], a[16])
954 		+ MUL15(a[ 7], a[15])
955 		+ MUL15(a[ 8], a[14])
956 		+ MUL15(a[ 9], a[13])
957 		+ MUL15(a[10], a[12])) << 1);
958 	t[23] = ((MUL15(a[ 4], a[19])
959 		+ MUL15(a[ 5], a[18])
960 		+ MUL15(a[ 6], a[17])
961 		+ MUL15(a[ 7], a[16])
962 		+ MUL15(a[ 8], a[15])
963 		+ MUL15(a[ 9], a[14])
964 		+ MUL15(a[10], a[13])
965 		+ MUL15(a[11], a[12])) << 1);
966 	t[24] = MUL15(a[12], a[12])
967 		+ ((MUL15(a[ 5], a[19])
968 		+ MUL15(a[ 6], a[18])
969 		+ MUL15(a[ 7], a[17])
970 		+ MUL15(a[ 8], a[16])
971 		+ MUL15(a[ 9], a[15])
972 		+ MUL15(a[10], a[14])
973 		+ MUL15(a[11], a[13])) << 1);
974 	t[25] = ((MUL15(a[ 6], a[19])
975 		+ MUL15(a[ 7], a[18])
976 		+ MUL15(a[ 8], a[17])
977 		+ MUL15(a[ 9], a[16])
978 		+ MUL15(a[10], a[15])
979 		+ MUL15(a[11], a[14])
980 		+ MUL15(a[12], a[13])) << 1);
981 	t[26] = MUL15(a[13], a[13])
982 		+ ((MUL15(a[ 7], a[19])
983 		+ MUL15(a[ 8], a[18])
984 		+ MUL15(a[ 9], a[17])
985 		+ MUL15(a[10], a[16])
986 		+ MUL15(a[11], a[15])
987 		+ MUL15(a[12], a[14])) << 1);
988 	t[27] = ((MUL15(a[ 8], a[19])
989 		+ MUL15(a[ 9], a[18])
990 		+ MUL15(a[10], a[17])
991 		+ MUL15(a[11], a[16])
992 		+ MUL15(a[12], a[15])
993 		+ MUL15(a[13], a[14])) << 1);
994 	t[28] = MUL15(a[14], a[14])
995 		+ ((MUL15(a[ 9], a[19])
996 		+ MUL15(a[10], a[18])
997 		+ MUL15(a[11], a[17])
998 		+ MUL15(a[12], a[16])
999 		+ MUL15(a[13], a[15])) << 1);
1000 	t[29] = ((MUL15(a[10], a[19])
1001 		+ MUL15(a[11], a[18])
1002 		+ MUL15(a[12], a[17])
1003 		+ MUL15(a[13], a[16])
1004 		+ MUL15(a[14], a[15])) << 1);
1005 	t[30] = MUL15(a[15], a[15])
1006 		+ ((MUL15(a[11], a[19])
1007 		+ MUL15(a[12], a[18])
1008 		+ MUL15(a[13], a[17])
1009 		+ MUL15(a[14], a[16])) << 1);
1010 	t[31] = ((MUL15(a[12], a[19])
1011 		+ MUL15(a[13], a[18])
1012 		+ MUL15(a[14], a[17])
1013 		+ MUL15(a[15], a[16])) << 1);
1014 	t[32] = MUL15(a[16], a[16])
1015 		+ ((MUL15(a[13], a[19])
1016 		+ MUL15(a[14], a[18])
1017 		+ MUL15(a[15], a[17])) << 1);
1018 	t[33] = ((MUL15(a[14], a[19])
1019 		+ MUL15(a[15], a[18])
1020 		+ MUL15(a[16], a[17])) << 1);
1021 	t[34] = MUL15(a[17], a[17])
1022 		+ ((MUL15(a[15], a[19])
1023 		+ MUL15(a[16], a[18])) << 1);
1024 	t[35] = ((MUL15(a[16], a[19])
1025 		+ MUL15(a[17], a[18])) << 1);
1026 	t[36] = MUL15(a[18], a[18])
1027 		+ ((MUL15(a[17], a[19])) << 1);
1028 	t[37] = ((MUL15(a[18], a[19])) << 1);
1029 	t[38] = MUL15(a[19], a[19]);
1030 
1031 	d[39] = norm13(d, t, 39);
1032 }
1033 
1034 #endif
1035 
1036 /*
1037  * Perform a "final reduction" in field F255 (field for Curve25519)
1038  * The source value must be less than twice the modulus. If the value
1039  * is not lower than the modulus, then the modulus is subtracted and
1040  * this function returns 1; otherwise, it leaves it untouched and it
1041  * returns 0.
1042  */
1043 static uint32_t
1044 reduce_final_f255(uint32_t *d)
1045 {
1046 	uint32_t t[20];
1047 	uint32_t cc;
1048 	int i;
1049 
1050 	memcpy(t, d, sizeof t);
1051 	cc = 19;
1052 	for (i = 0; i < 20; i ++) {
1053 		uint32_t w;
1054 
1055 		w = t[i] + cc;
1056 		cc = w >> 13;
1057 		t[i] = w & 0x1FFF;
1058 	}
1059 	cc = t[19] >> 8;
1060 	t[19] &= 0xFF;
1061 	CCOPY(cc, d, t, sizeof t);
1062 	return cc;
1063 }
1064 
1065 static void
1066 f255_mulgen(uint32_t *d, const uint32_t *a, const uint32_t *b, int square)
1067 {
1068 	uint32_t t[40], cc, w;
1069 
1070 	/*
1071 	 * Compute raw multiplication. All result words fit in 13 bits
1072 	 * each; upper word (t[39]) must fit on 5 bits, since the product
1073 	 * of two 256-bit integers must fit on 512 bits.
1074 	 */
1075 	if (square) {
1076 		square20(t, a);
1077 	} else {
1078 		mul20(t, a, b);
1079 	}
1080 
1081 	/*
1082 	 * Modular reduction: each high word is added where necessary.
1083 	 * Since the modulus is 2^255-19 and word 20 corresponds to
1084 	 * offset 20*13 = 260, word 20+k must be added to word k with
1085 	 * a factor of 19*2^5 = 608. The extra bits in word 19 are also
1086 	 * added that way.
1087 	 */
1088 	cc = MUL15(t[19] >> 8, 19);
1089 	t[19] &= 0xFF;
1090 
1091 #define MM1(x)   do { \
1092 		w = t[x] + cc + MUL15(t[(x) + 20], 608); \
1093 		t[x] = w & 0x1FFF; \
1094 		cc = w >> 13; \
1095 	} while (0)
1096 
1097 	MM1( 0);
1098 	MM1( 1);
1099 	MM1( 2);
1100 	MM1( 3);
1101 	MM1( 4);
1102 	MM1( 5);
1103 	MM1( 6);
1104 	MM1( 7);
1105 	MM1( 8);
1106 	MM1( 9);
1107 	MM1(10);
1108 	MM1(11);
1109 	MM1(12);
1110 	MM1(13);
1111 	MM1(14);
1112 	MM1(15);
1113 	MM1(16);
1114 	MM1(17);
1115 	MM1(18);
1116 	MM1(19);
1117 
1118 #undef MM1
1119 
1120 	cc = MUL15(w >> 8, 19);
1121 	t[19] &= 0xFF;
1122 
1123 #define MM2(x)   do { \
1124 		w = t[x] + cc; \
1125 		d[x] = w & 0x1FFF; \
1126 		cc = w >> 13; \
1127 	} while (0)
1128 
1129 	MM2( 0);
1130 	MM2( 1);
1131 	MM2( 2);
1132 	MM2( 3);
1133 	MM2( 4);
1134 	MM2( 5);
1135 	MM2( 6);
1136 	MM2( 7);
1137 	MM2( 8);
1138 	MM2( 9);
1139 	MM2(10);
1140 	MM2(11);
1141 	MM2(12);
1142 	MM2(13);
1143 	MM2(14);
1144 	MM2(15);
1145 	MM2(16);
1146 	MM2(17);
1147 	MM2(18);
1148 	MM2(19);
1149 
1150 #undef MM2
1151 }
1152 
1153 /*
1154  * Perform a multiplication of two integers modulo 2^255-19.
1155  * Operands are arrays of 20 words, each containing 13 bits of data, in
1156  * little-endian order. Input value may be up to 2^256-1; on output, value
1157  * fits on 256 bits and is lower than twice the modulus.
1158  *
1159  * f255_mul() is the general multiplication, f255_square() is specialised
1160  * for squarings.
1161  */
1162 #define f255_mul(d, a, b)   f255_mulgen(d, a, b, 0)
1163 #define f255_square(d, a)   f255_mulgen(d, a, a, 1)
1164 
1165 /*
1166  * Add two values in F255. Partial reduction is performed (down to less
1167  * than twice the modulus).
1168  */
1169 static void
1170 f255_add(uint32_t *d, const uint32_t *a, const uint32_t *b)
1171 {
1172 	int i;
1173 	uint32_t cc, w;
1174 
1175 	cc = 0;
1176 	for (i = 0; i < 20; i ++) {
1177 		w = a[i] + b[i] + cc;
1178 		d[i] = w & 0x1FFF;
1179 		cc = w >> 13;
1180 	}
1181 	cc = MUL15(w >> 8, 19);
1182 	d[19] &= 0xFF;
1183 	for (i = 0; i < 20; i ++) {
1184 		w = d[i] + cc;
1185 		d[i] = w & 0x1FFF;
1186 		cc = w >> 13;
1187 	}
1188 }
1189 
1190 /*
1191  * Subtract one value from another in F255. Partial reduction is
1192  * performed (down to less than twice the modulus).
1193  */
1194 static void
1195 f255_sub(uint32_t *d, const uint32_t *a, const uint32_t *b)
1196 {
1197 	/*
1198 	 * We actually compute a - b + 2*p, so that the final value is
1199 	 * necessarily positive.
1200 	 */
1201 	int i;
1202 	uint32_t cc, w;
1203 
1204 	cc = (uint32_t)-38;
1205 	for (i = 0; i < 20; i ++) {
1206 		w = a[i] - b[i] + cc;
1207 		d[i] = w & 0x1FFF;
1208 		cc = ARSH(w, 13);
1209 	}
1210 	cc = MUL15((w + 0x200) >> 8, 19);
1211 	d[19] &= 0xFF;
1212 	for (i = 0; i < 20; i ++) {
1213 		w = d[i] + cc;
1214 		d[i] = w & 0x1FFF;
1215 		cc = w >> 13;
1216 	}
1217 }
1218 
1219 /*
1220  * Multiply an integer by the 'A24' constant (121665). Partial reduction
1221  * is performed (down to less than twice the modulus).
1222  */
1223 static void
1224 f255_mul_a24(uint32_t *d, const uint32_t *a)
1225 {
1226 	int i;
1227 	uint32_t cc, w;
1228 
1229 	cc = 0;
1230 	for (i = 0; i < 20; i ++) {
1231 		w = MUL15(a[i], 121665) + cc;
1232 		d[i] = w & 0x1FFF;
1233 		cc = w >> 13;
1234 	}
1235 	cc = MUL15(w >> 8, 19);
1236 	d[19] &= 0xFF;
1237 	for (i = 0; i < 20; i ++) {
1238 		w = d[i] + cc;
1239 		d[i] = w & 0x1FFF;
1240 		cc = w >> 13;
1241 	}
1242 }
1243 
1244 static const unsigned char GEN[] = {
1245 	0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1246 	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1247 	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
1248 	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
1249 };
1250 
1251 static const unsigned char ORDER[] = {
1252 	0x7F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1253 	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1254 	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
1255 	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
1256 };
1257 
1258 static const unsigned char *
1259 api_generator(int curve, size_t *len)
1260 {
1261 	(void)curve;
1262 	*len = 32;
1263 	return GEN;
1264 }
1265 
1266 static const unsigned char *
1267 api_order(int curve, size_t *len)
1268 {
1269 	(void)curve;
1270 	*len = 32;
1271 	return ORDER;
1272 }
1273 
1274 static size_t
1275 api_xoff(int curve, size_t *len)
1276 {
1277 	(void)curve;
1278 	*len = 32;
1279 	return 0;
1280 }
1281 
1282 static void
1283 cswap(uint32_t *a, uint32_t *b, uint32_t ctl)
1284 {
1285 	int i;
1286 
1287 	ctl = -ctl;
1288 	for (i = 0; i < 20; i ++) {
1289 		uint32_t aw, bw, tw;
1290 
1291 		aw = a[i];
1292 		bw = b[i];
1293 		tw = ctl & (aw ^ bw);
1294 		a[i] = aw ^ tw;
1295 		b[i] = bw ^ tw;
1296 	}
1297 }
1298 
1299 static uint32_t
1300 api_mul(unsigned char *G, size_t Glen,
1301 	const unsigned char *kb, size_t kblen, int curve)
1302 {
1303 	uint32_t x1[20], x2[20], x3[20], z2[20], z3[20];
1304 	uint32_t a[20], aa[20], b[20], bb[20];
1305 	uint32_t c[20], d[20], e[20], da[20], cb[20];
1306 	unsigned char k[32];
1307 	uint32_t swap;
1308 	int i;
1309 
1310 	(void)curve;
1311 
1312 	/*
1313 	 * Points are encoded over exactly 32 bytes. Multipliers must fit
1314 	 * in 32 bytes as well.
1315 	 * RFC 7748 mandates that the high bit of the last point byte must
1316 	 * be ignored/cleared.
1317 	 */
1318 	if (Glen != 32 || kblen > 32) {
1319 		return 0;
1320 	}
1321 	G[31] &= 0x7F;
1322 
1323 	/*
1324 	 * Initialise variables x1, x2, z2, x3 and z3. We set all of them
1325 	 * into Montgomery representation.
1326 	 */
1327 	x1[19] = le8_to_le13(x1, G, 32);
1328 	memcpy(x3, x1, sizeof x1);
1329 	memset(z2, 0, sizeof z2);
1330 	memset(x2, 0, sizeof x2);
1331 	x2[0] = 1;
1332 	memset(z3, 0, sizeof z3);
1333 	z3[0] = 1;
1334 
1335 	memset(k, 0, (sizeof k) - kblen);
1336 	memcpy(k + (sizeof k) - kblen, kb, kblen);
1337 	k[31] &= 0xF8;
1338 	k[0] &= 0x7F;
1339 	k[0] |= 0x40;
1340 
1341 	/* obsolete
1342 	print_int("x1", x1);
1343 	*/
1344 
1345 	swap = 0;
1346 	for (i = 254; i >= 0; i --) {
1347 		uint32_t kt;
1348 
1349 		kt = (k[31 - (i >> 3)] >> (i & 7)) & 1;
1350 		swap ^= kt;
1351 		cswap(x2, x3, swap);
1352 		cswap(z2, z3, swap);
1353 		swap = kt;
1354 
1355 		/* obsolete
1356 		print_int("x2", x2);
1357 		print_int("z2", z2);
1358 		print_int("x3", x3);
1359 		print_int("z3", z3);
1360 		*/
1361 
1362 		f255_add(a, x2, z2);
1363 		f255_square(aa, a);
1364 		f255_sub(b, x2, z2);
1365 		f255_square(bb, b);
1366 		f255_sub(e, aa, bb);
1367 		f255_add(c, x3, z3);
1368 		f255_sub(d, x3, z3);
1369 		f255_mul(da, d, a);
1370 		f255_mul(cb, c, b);
1371 
1372 		/* obsolete
1373 		print_int("a ", a);
1374 		print_int("aa", aa);
1375 		print_int("b ", b);
1376 		print_int("bb", bb);
1377 		print_int("e ", e);
1378 		print_int("c ", c);
1379 		print_int("d ", d);
1380 		print_int("da", da);
1381 		print_int("cb", cb);
1382 		*/
1383 
1384 		f255_add(x3, da, cb);
1385 		f255_square(x3, x3);
1386 		f255_sub(z3, da, cb);
1387 		f255_square(z3, z3);
1388 		f255_mul(z3, z3, x1);
1389 		f255_mul(x2, aa, bb);
1390 		f255_mul_a24(z2, e);
1391 		f255_add(z2, z2, aa);
1392 		f255_mul(z2, e, z2);
1393 
1394 		/* obsolete
1395 		print_int("x2", x2);
1396 		print_int("z2", z2);
1397 		print_int("x3", x3);
1398 		print_int("z3", z3);
1399 		*/
1400 	}
1401 	cswap(x2, x3, swap);
1402 	cswap(z2, z3, swap);
1403 
1404 	/*
1405 	 * Inverse z2 with a modular exponentiation. This is a simple
1406 	 * square-and-multiply algorithm; we mutualise most non-squarings
1407 	 * since the exponent contains almost only ones.
1408 	 */
1409 	memcpy(a, z2, sizeof z2);
1410 	for (i = 0; i < 15; i ++) {
1411 		f255_square(a, a);
1412 		f255_mul(a, a, z2);
1413 	}
1414 	memcpy(b, a, sizeof a);
1415 	for (i = 0; i < 14; i ++) {
1416 		int j;
1417 
1418 		for (j = 0; j < 16; j ++) {
1419 			f255_square(b, b);
1420 		}
1421 		f255_mul(b, b, a);
1422 	}
1423 	for (i = 14; i >= 0; i --) {
1424 		f255_square(b, b);
1425 		if ((0xFFEB >> i) & 1) {
1426 			f255_mul(b, z2, b);
1427 		}
1428 	}
1429 	f255_mul(x2, x2, b);
1430 	reduce_final_f255(x2);
1431 	le13_to_le8(G, 32, x2);
1432 	return 1;
1433 }
1434 
1435 static size_t
1436 api_mulgen(unsigned char *R,
1437 	const unsigned char *x, size_t xlen, int curve)
1438 {
1439 	const unsigned char *G;
1440 	size_t Glen;
1441 
1442 	G = api_generator(curve, &Glen);
1443 	memcpy(R, G, Glen);
1444 	api_mul(R, Glen, x, xlen, curve);
1445 	return Glen;
1446 }
1447 
1448 static uint32_t
1449 api_muladd(unsigned char *A, const unsigned char *B, size_t len,
1450 	const unsigned char *x, size_t xlen,
1451 	const unsigned char *y, size_t ylen, int curve)
1452 {
1453 	/*
1454 	 * We don't implement this method, since it is used for ECDSA
1455 	 * only, and there is no ECDSA over Curve25519 (which instead
1456 	 * uses EdDSA).
1457 	 */
1458 	(void)A;
1459 	(void)B;
1460 	(void)len;
1461 	(void)x;
1462 	(void)xlen;
1463 	(void)y;
1464 	(void)ylen;
1465 	(void)curve;
1466 	return 0;
1467 }
1468 
1469 /* see bearssl_ec.h */
1470 const br_ec_impl br_ec_c25519_m15 = {
1471 	(uint32_t)0x20000000,
1472 	&api_generator,
1473 	&api_order,
1474 	&api_xoff,
1475 	&api_mul,
1476 	&api_mulgen,
1477 	&api_muladd
1478 };
1479