xref: /freebsd/contrib/bearssl/src/ec/ec_c25519_m31.c (revision 2f513db72b034fd5ef7f080b11be5c711c15186a)
1 /*
2  * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining
5  * a copy of this software and associated documentation files (the
6  * "Software"), to deal in the Software without restriction, including
7  * without limitation the rights to use, copy, modify, merge, publish,
8  * distribute, sublicense, and/or sell copies of the Software, and to
9  * permit persons to whom the Software is furnished to do so, subject to
10  * the following conditions:
11  *
12  * The above copyright notice and this permission notice shall be
13  * included in all copies or substantial portions of the Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22  * SOFTWARE.
23  */
24 
25 #include "inner.h"
26 
27 /* obsolete
28 #include <stdio.h>
29 #include <stdlib.h>
30 static void
31 print_int(const char *name, const uint32_t *x)
32 {
33 	size_t u;
34 	unsigned char tmp[40];
35 
36 	printf("%s = ", name);
37 	for (u = 0; u < 9; u ++) {
38 		if (x[u] > 0x3FFFFFFF) {
39 			printf("INVALID:");
40 			for (u = 0; u < 9; u ++) {
41 				printf(" %08X", x[u]);
42 			}
43 			printf("\n");
44 			return;
45 		}
46 	}
47 	memset(tmp, 0, sizeof tmp);
48 	for (u = 0; u < 9; u ++) {
49 		uint64_t w;
50 		int j, k;
51 
52 		w = x[u];
53 		j = 30 * (int)u;
54 		k = j & 7;
55 		if (k != 0) {
56 			w <<= k;
57 			j -= k;
58 		}
59 		k = j >> 3;
60 		for (j = 0; j < 8; j ++) {
61 			tmp[39 - k - j] |= (unsigned char)w;
62 			w >>= 8;
63 		}
64 	}
65 	for (u = 8; u < 40; u ++) {
66 		printf("%02X", tmp[u]);
67 	}
68 	printf("\n");
69 }
70 */
71 
72 /*
73  * If BR_NO_ARITH_SHIFT is undefined, or defined to 0, then we _assume_
74  * that right-shifting a signed negative integer copies the sign bit
75  * (arithmetic right-shift). This is "implementation-defined behaviour",
76  * i.e. it is not undefined, but it may differ between compilers. Each
77  * compiler is supposed to document its behaviour in that respect. GCC
78  * explicitly defines that an arithmetic right shift is used. We expect
79  * all other compilers to do the same, because underlying CPU offer an
80  * arithmetic right shift opcode that could not be used otherwise.
81  */
82 #if BR_NO_ARITH_SHIFT
83 #define ARSH(x, n)   (((uint32_t)(x) >> (n)) \
84                     | ((-((uint32_t)(x) >> 31)) << (32 - (n))))
85 #else
86 #define ARSH(x, n)   ((*(int32_t *)&(x)) >> (n))
87 #endif
88 
89 /*
90  * Convert an integer from unsigned little-endian encoding to a sequence of
91  * 30-bit words in little-endian order. The final "partial" word is
92  * returned.
93  */
94 static uint32_t
95 le8_to_le30(uint32_t *dst, const unsigned char *src, size_t len)
96 {
97 	uint32_t acc;
98 	int acc_len;
99 
100 	acc = 0;
101 	acc_len = 0;
102 	while (len -- > 0) {
103 		uint32_t b;
104 
105 		b = *src ++;
106 		if (acc_len < 22) {
107 			acc |= b << acc_len;
108 			acc_len += 8;
109 		} else {
110 			*dst ++ = (acc | (b << acc_len)) & 0x3FFFFFFF;
111 			acc = b >> (30 - acc_len);
112 			acc_len -= 22;
113 		}
114 	}
115 	return acc;
116 }
117 
118 /*
119  * Convert an integer (30-bit words, little-endian) to unsigned
120  * little-endian encoding. The total encoding length is provided; all
121  * the destination bytes will be filled.
122  */
123 static void
124 le30_to_le8(unsigned char *dst, size_t len, const uint32_t *src)
125 {
126 	uint32_t acc;
127 	int acc_len;
128 
129 	acc = 0;
130 	acc_len = 0;
131 	while (len -- > 0) {
132 		if (acc_len < 8) {
133 			uint32_t w;
134 
135 			w = *src ++;
136 			*dst ++ = (unsigned char)(acc | (w << acc_len));
137 			acc = w >> (8 - acc_len);
138 			acc_len += 22;
139 		} else {
140 			*dst ++ = (unsigned char)acc;
141 			acc >>= 8;
142 			acc_len -= 8;
143 		}
144 	}
145 }
146 
147 /*
148  * Multiply two integers. Source integers are represented as arrays of
149  * nine 30-bit words, for values up to 2^270-1. Result is encoded over
150  * 18 words of 30 bits each.
151  */
152 static void
153 mul9(uint32_t *d, const uint32_t *a, const uint32_t *b)
154 {
155 	/*
156 	 * Maximum intermediate result is no more than
157 	 * 10376293531797946367, which fits in 64 bits. Reason:
158 	 *
159 	 *   10376293531797946367 = 9 * (2^30-1)^2 + 9663676406
160 	 *   10376293531797946367 < 9663676407 * 2^30
161 	 *
162 	 * Thus, adding together 9 products of 30-bit integers, with
163 	 * a carry of at most 9663676406, yields an integer that fits
164 	 * on 64 bits and generates a carry of at most 9663676406.
165 	 */
166 	uint64_t t[17];
167 	uint64_t cc;
168 	int i;
169 
170 	t[ 0] = MUL31(a[0], b[0]);
171 	t[ 1] = MUL31(a[0], b[1])
172 		+ MUL31(a[1], b[0]);
173 	t[ 2] = MUL31(a[0], b[2])
174 		+ MUL31(a[1], b[1])
175 		+ MUL31(a[2], b[0]);
176 	t[ 3] = MUL31(a[0], b[3])
177 		+ MUL31(a[1], b[2])
178 		+ MUL31(a[2], b[1])
179 		+ MUL31(a[3], b[0]);
180 	t[ 4] = MUL31(a[0], b[4])
181 		+ MUL31(a[1], b[3])
182 		+ MUL31(a[2], b[2])
183 		+ MUL31(a[3], b[1])
184 		+ MUL31(a[4], b[0]);
185 	t[ 5] = MUL31(a[0], b[5])
186 		+ MUL31(a[1], b[4])
187 		+ MUL31(a[2], b[3])
188 		+ MUL31(a[3], b[2])
189 		+ MUL31(a[4], b[1])
190 		+ MUL31(a[5], b[0]);
191 	t[ 6] = MUL31(a[0], b[6])
192 		+ MUL31(a[1], b[5])
193 		+ MUL31(a[2], b[4])
194 		+ MUL31(a[3], b[3])
195 		+ MUL31(a[4], b[2])
196 		+ MUL31(a[5], b[1])
197 		+ MUL31(a[6], b[0]);
198 	t[ 7] = MUL31(a[0], b[7])
199 		+ MUL31(a[1], b[6])
200 		+ MUL31(a[2], b[5])
201 		+ MUL31(a[3], b[4])
202 		+ MUL31(a[4], b[3])
203 		+ MUL31(a[5], b[2])
204 		+ MUL31(a[6], b[1])
205 		+ MUL31(a[7], b[0]);
206 	t[ 8] = MUL31(a[0], b[8])
207 		+ MUL31(a[1], b[7])
208 		+ MUL31(a[2], b[6])
209 		+ MUL31(a[3], b[5])
210 		+ MUL31(a[4], b[4])
211 		+ MUL31(a[5], b[3])
212 		+ MUL31(a[6], b[2])
213 		+ MUL31(a[7], b[1])
214 		+ MUL31(a[8], b[0]);
215 	t[ 9] = MUL31(a[1], b[8])
216 		+ MUL31(a[2], b[7])
217 		+ MUL31(a[3], b[6])
218 		+ MUL31(a[4], b[5])
219 		+ MUL31(a[5], b[4])
220 		+ MUL31(a[6], b[3])
221 		+ MUL31(a[7], b[2])
222 		+ MUL31(a[8], b[1]);
223 	t[10] = MUL31(a[2], b[8])
224 		+ MUL31(a[3], b[7])
225 		+ MUL31(a[4], b[6])
226 		+ MUL31(a[5], b[5])
227 		+ MUL31(a[6], b[4])
228 		+ MUL31(a[7], b[3])
229 		+ MUL31(a[8], b[2]);
230 	t[11] = MUL31(a[3], b[8])
231 		+ MUL31(a[4], b[7])
232 		+ MUL31(a[5], b[6])
233 		+ MUL31(a[6], b[5])
234 		+ MUL31(a[7], b[4])
235 		+ MUL31(a[8], b[3]);
236 	t[12] = MUL31(a[4], b[8])
237 		+ MUL31(a[5], b[7])
238 		+ MUL31(a[6], b[6])
239 		+ MUL31(a[7], b[5])
240 		+ MUL31(a[8], b[4]);
241 	t[13] = MUL31(a[5], b[8])
242 		+ MUL31(a[6], b[7])
243 		+ MUL31(a[7], b[6])
244 		+ MUL31(a[8], b[5]);
245 	t[14] = MUL31(a[6], b[8])
246 		+ MUL31(a[7], b[7])
247 		+ MUL31(a[8], b[6]);
248 	t[15] = MUL31(a[7], b[8])
249 		+ MUL31(a[8], b[7]);
250 	t[16] = MUL31(a[8], b[8]);
251 
252 	/*
253 	 * Propagate carries.
254 	 */
255 	cc = 0;
256 	for (i = 0; i < 17; i ++) {
257 		uint64_t w;
258 
259 		w = t[i] + cc;
260 		d[i] = (uint32_t)w & 0x3FFFFFFF;
261 		cc = w >> 30;
262 	}
263 	d[17] = (uint32_t)cc;
264 }
265 
266 /*
267  * Square a 270-bit integer, represented as an array of nine 30-bit words.
268  * Result uses 18 words of 30 bits each.
269  */
270 static void
271 square9(uint32_t *d, const uint32_t *a)
272 {
273 	uint64_t t[17];
274 	uint64_t cc;
275 	int i;
276 
277 	t[ 0] = MUL31(a[0], a[0]);
278 	t[ 1] = ((MUL31(a[0], a[1])) << 1);
279 	t[ 2] = MUL31(a[1], a[1])
280 		+ ((MUL31(a[0], a[2])) << 1);
281 	t[ 3] = ((MUL31(a[0], a[3])
282 		+ MUL31(a[1], a[2])) << 1);
283 	t[ 4] = MUL31(a[2], a[2])
284 		+ ((MUL31(a[0], a[4])
285 		+ MUL31(a[1], a[3])) << 1);
286 	t[ 5] = ((MUL31(a[0], a[5])
287 		+ MUL31(a[1], a[4])
288 		+ MUL31(a[2], a[3])) << 1);
289 	t[ 6] = MUL31(a[3], a[3])
290 		+ ((MUL31(a[0], a[6])
291 		+ MUL31(a[1], a[5])
292 		+ MUL31(a[2], a[4])) << 1);
293 	t[ 7] = ((MUL31(a[0], a[7])
294 		+ MUL31(a[1], a[6])
295 		+ MUL31(a[2], a[5])
296 		+ MUL31(a[3], a[4])) << 1);
297 	t[ 8] = MUL31(a[4], a[4])
298 		+ ((MUL31(a[0], a[8])
299 		+ MUL31(a[1], a[7])
300 		+ MUL31(a[2], a[6])
301 		+ MUL31(a[3], a[5])) << 1);
302 	t[ 9] = ((MUL31(a[1], a[8])
303 		+ MUL31(a[2], a[7])
304 		+ MUL31(a[3], a[6])
305 		+ MUL31(a[4], a[5])) << 1);
306 	t[10] = MUL31(a[5], a[5])
307 		+ ((MUL31(a[2], a[8])
308 		+ MUL31(a[3], a[7])
309 		+ MUL31(a[4], a[6])) << 1);
310 	t[11] = ((MUL31(a[3], a[8])
311 		+ MUL31(a[4], a[7])
312 		+ MUL31(a[5], a[6])) << 1);
313 	t[12] = MUL31(a[6], a[6])
314 		+ ((MUL31(a[4], a[8])
315 		+ MUL31(a[5], a[7])) << 1);
316 	t[13] = ((MUL31(a[5], a[8])
317 		+ MUL31(a[6], a[7])) << 1);
318 	t[14] = MUL31(a[7], a[7])
319 		+ ((MUL31(a[6], a[8])) << 1);
320 	t[15] = ((MUL31(a[7], a[8])) << 1);
321 	t[16] = MUL31(a[8], a[8]);
322 
323 	/*
324 	 * Propagate carries.
325 	 */
326 	cc = 0;
327 	for (i = 0; i < 17; i ++) {
328 		uint64_t w;
329 
330 		w = t[i] + cc;
331 		d[i] = (uint32_t)w & 0x3FFFFFFF;
332 		cc = w >> 30;
333 	}
334 	d[17] = (uint32_t)cc;
335 }
336 
337 /*
338  * Perform a "final reduction" in field F255 (field for Curve25519)
339  * The source value must be less than twice the modulus. If the value
340  * is not lower than the modulus, then the modulus is subtracted and
341  * this function returns 1; otherwise, it leaves it untouched and it
342  * returns 0.
343  */
344 static uint32_t
345 reduce_final_f255(uint32_t *d)
346 {
347 	uint32_t t[9];
348 	uint32_t cc;
349 	int i;
350 
351 	memcpy(t, d, sizeof t);
352 	cc = 19;
353 	for (i = 0; i < 9; i ++) {
354 		uint32_t w;
355 
356 		w = t[i] + cc;
357 		cc = w >> 30;
358 		t[i] = w & 0x3FFFFFFF;
359 	}
360 	cc = t[8] >> 15;
361 	t[8] &= 0x7FFF;
362 	CCOPY(cc, d, t, sizeof t);
363 	return cc;
364 }
365 
366 /*
367  * Perform a multiplication of two integers modulo 2^255-19.
368  * Operands are arrays of 9 words, each containing 30 bits of data, in
369  * little-endian order. Input value may be up to 2^256-1; on output, value
370  * fits on 256 bits and is lower than twice the modulus.
371  */
372 static void
373 f255_mul(uint32_t *d, const uint32_t *a, const uint32_t *b)
374 {
375 	uint32_t t[18], cc;
376 	int i;
377 
378 	/*
379 	 * Compute raw multiplication. All result words fit in 30 bits
380 	 * each; upper word (t[17]) must fit on 2 bits, since the product
381 	 * of two 256-bit integers must fit on 512 bits.
382 	 */
383 	mul9(t, a, b);
384 
385 	/*
386 	 * Modular reduction: each high word is added where necessary.
387 	 * Since the modulus is 2^255-19 and word 9 corresponds to
388 	 * offset 9*30 = 270, word 9+k must be added to word k with
389 	 * a factor of 19*2^15 = 622592. The extra bits in word 8 are also
390 	 * added that way.
391 	 *
392 	 * Keeping the carry on 32 bits helps with 32-bit architectures,
393 	 * and does not noticeably impact performance on 64-bit systems.
394 	 */
395 	cc = MUL15(t[8] >> 15, 19);  /* at most 19*(2^15-1) = 622573 */
396 	t[8] &= 0x7FFF;
397 	for (i = 0; i < 9; i ++) {
398 		uint64_t w;
399 
400 		w = (uint64_t)t[i] + (uint64_t)cc + MUL31(t[i + 9], 622592);
401 		t[i] = (uint32_t)w & 0x3FFFFFFF;
402 		cc = (uint32_t)(w >> 30);  /* at most 622592 */
403 	}
404 
405 	/*
406 	 * Original product was up to (2^256-1)^2, i.e. a 512-bit integer.
407 	 * This was split into two parts (upper of 257 bits, lower of 255
408 	 * bits), and the upper was added to the lower with a factor 19,
409 	 * which means that the intermediate value is less than 77*2^255
410 	 * (19*2^257 + 2^255). Therefore, the extra bits "t[8] >> 15" are
411 	 * less than 77, and the initial carry cc is at most 76*19 = 1444.
412 	 */
413 	cc = MUL15(t[8] >> 15, 19);
414 	t[8] &= 0x7FFF;
415 	for (i = 0; i < 9; i ++) {
416 		uint32_t z;
417 
418 		z = t[i] + cc;
419 		d[i] = z & 0x3FFFFFFF;
420 		cc = z >> 30;
421 	}
422 
423 	/*
424 	 * Final result is at most 2^255 + 1443. In particular, the last
425 	 * carry is necessarily 0, since t[8] was truncated to 15 bits.
426 	 */
427 }
428 
429 /*
430  * Perform a squaring of an integer modulo 2^255-19.
431  * Operands are arrays of 9 words, each containing 30 bits of data, in
432  * little-endian order. Input value may be up to 2^256-1; on output, value
433  * fits on 256 bits and is lower than twice the modulus.
434  */
435 static void
436 f255_square(uint32_t *d, const uint32_t *a)
437 {
438 	uint32_t t[18], cc;
439 	int i;
440 
441 	/*
442 	 * Compute raw squaring. All result words fit in 30 bits
443 	 * each; upper word (t[17]) must fit on 2 bits, since the square
444 	 * of a 256-bit integers must fit on 512 bits.
445 	 */
446 	square9(t, a);
447 
448 	/*
449 	 * Modular reduction: each high word is added where necessary.
450 	 * See f255_mul() for details on the reduction and carry limits.
451 	 */
452 	cc = MUL15(t[8] >> 15, 19);
453 	t[8] &= 0x7FFF;
454 	for (i = 0; i < 9; i ++) {
455 		uint64_t w;
456 
457 		w = (uint64_t)t[i] + (uint64_t)cc + MUL31(t[i + 9], 622592);
458 		t[i] = (uint32_t)w & 0x3FFFFFFF;
459 		cc = (uint32_t)(w >> 30);
460 	}
461 	cc = MUL15(t[8] >> 15, 19);
462 	t[8] &= 0x7FFF;
463 	for (i = 0; i < 9; i ++) {
464 		uint32_t z;
465 
466 		z = t[i] + cc;
467 		d[i] = z & 0x3FFFFFFF;
468 		cc = z >> 30;
469 	}
470 }
471 
472 /*
473  * Add two values in F255. Partial reduction is performed (down to less
474  * than twice the modulus).
475  */
476 static void
477 f255_add(uint32_t *d, const uint32_t *a, const uint32_t *b)
478 {
479 	/*
480 	 * Since operand words fit on 30 bits, we can use 32-bit
481 	 * variables throughout.
482 	 */
483 	int i;
484 	uint32_t cc, w;
485 
486 	cc = 0;
487 	for (i = 0; i < 9; i ++) {
488 		w = a[i] + b[i] + cc;
489 		d[i] = w & 0x3FFFFFFF;
490 		cc = w >> 30;
491 	}
492 	cc = MUL15(w >> 15, 19);
493 	d[8] &= 0x7FFF;
494 	for (i = 0; i < 9; i ++) {
495 		w = d[i] + cc;
496 		d[i] = w & 0x3FFFFFFF;
497 		cc = w >> 30;
498 	}
499 }
500 
501 /*
502  * Subtract one value from another in F255. Partial reduction is
503  * performed (down to less than twice the modulus).
504  */
505 static void
506 f255_sub(uint32_t *d, const uint32_t *a, const uint32_t *b)
507 {
508 	/*
509 	 * We actually compute a - b + 2*p, so that the final value is
510 	 * necessarily positive.
511 	 */
512 	int i;
513 	uint32_t cc, w;
514 
515 	cc = (uint32_t)-38;
516 	for (i = 0; i < 9; i ++) {
517 		w = a[i] - b[i] + cc;
518 		d[i] = w & 0x3FFFFFFF;
519 		cc = ARSH(w, 30);
520 	}
521 	cc = MUL15((w + 0x10000) >> 15, 19);
522 	d[8] &= 0x7FFF;
523 	for (i = 0; i < 9; i ++) {
524 		w = d[i] + cc;
525 		d[i] = w & 0x3FFFFFFF;
526 		cc = w >> 30;
527 	}
528 }
529 
530 /*
531  * Multiply an integer by the 'A24' constant (121665). Partial reduction
532  * is performed (down to less than twice the modulus).
533  */
534 static void
535 f255_mul_a24(uint32_t *d, const uint32_t *a)
536 {
537 	int i;
538 	uint64_t w;
539 	uint32_t cc;
540 
541 	/*
542 	 * a[] is over 256 bits, thus a[8] has length at most 16 bits.
543 	 * We single out the processing of the last word: intermediate
544 	 * value w is up to 121665*2^16, yielding a carry for the next
545 	 * loop of at most 19*(121665*2^16/2^15) = 4623289.
546 	 */
547 	cc = 0;
548 	for (i = 0; i < 8; i ++) {
549 		w = MUL31(a[i], 121665) + (uint64_t)cc;
550 		d[i] = (uint32_t)w & 0x3FFFFFFF;
551 		cc = (uint32_t)(w >> 30);
552 	}
553 	w = MUL31(a[8], 121665) + (uint64_t)cc;
554 	d[8] = (uint32_t)w & 0x7FFF;
555 	cc = MUL15((uint32_t)(w >> 15), 19);
556 
557 	for (i = 0; i < 9; i ++) {
558 		uint32_t z;
559 
560 		z = d[i] + cc;
561 		d[i] = z & 0x3FFFFFFF;
562 		cc = z >> 30;
563 	}
564 }
565 
566 static const unsigned char GEN[] = {
567 	0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
568 	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
569 	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
570 	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
571 };
572 
573 static const unsigned char ORDER[] = {
574 	0x7F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
575 	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
576 	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
577 	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
578 };
579 
580 static const unsigned char *
581 api_generator(int curve, size_t *len)
582 {
583 	(void)curve;
584 	*len = 32;
585 	return GEN;
586 }
587 
588 static const unsigned char *
589 api_order(int curve, size_t *len)
590 {
591 	(void)curve;
592 	*len = 32;
593 	return ORDER;
594 }
595 
596 static size_t
597 api_xoff(int curve, size_t *len)
598 {
599 	(void)curve;
600 	*len = 32;
601 	return 0;
602 }
603 
604 static void
605 cswap(uint32_t *a, uint32_t *b, uint32_t ctl)
606 {
607 	int i;
608 
609 	ctl = -ctl;
610 	for (i = 0; i < 9; i ++) {
611 		uint32_t aw, bw, tw;
612 
613 		aw = a[i];
614 		bw = b[i];
615 		tw = ctl & (aw ^ bw);
616 		a[i] = aw ^ tw;
617 		b[i] = bw ^ tw;
618 	}
619 }
620 
621 static uint32_t
622 api_mul(unsigned char *G, size_t Glen,
623 	const unsigned char *kb, size_t kblen, int curve)
624 {
625 	uint32_t x1[9], x2[9], x3[9], z2[9], z3[9];
626 	uint32_t a[9], aa[9], b[9], bb[9];
627 	uint32_t c[9], d[9], e[9], da[9], cb[9];
628 	unsigned char k[32];
629 	uint32_t swap;
630 	int i;
631 
632 	(void)curve;
633 
634 	/*
635 	 * Points are encoded over exactly 32 bytes. Multipliers must fit
636 	 * in 32 bytes as well.
637 	 * RFC 7748 mandates that the high bit of the last point byte must
638 	 * be ignored/cleared.
639 	 */
640 	if (Glen != 32 || kblen > 32) {
641 		return 0;
642 	}
643 	G[31] &= 0x7F;
644 
645 	/*
646 	 * Initialise variables x1, x2, z2, x3 and z3. We set all of them
647 	 * into Montgomery representation.
648 	 */
649 	x1[8] = le8_to_le30(x1, G, 32);
650 	memcpy(x3, x1, sizeof x1);
651 	memset(z2, 0, sizeof z2);
652 	memset(x2, 0, sizeof x2);
653 	x2[0] = 1;
654 	memset(z3, 0, sizeof z3);
655 	z3[0] = 1;
656 
657 	memset(k, 0, (sizeof k) - kblen);
658 	memcpy(k + (sizeof k) - kblen, kb, kblen);
659 	k[31] &= 0xF8;
660 	k[0] &= 0x7F;
661 	k[0] |= 0x40;
662 
663 	/* obsolete
664 	print_int("x1", x1);
665 	*/
666 
667 	swap = 0;
668 	for (i = 254; i >= 0; i --) {
669 		uint32_t kt;
670 
671 		kt = (k[31 - (i >> 3)] >> (i & 7)) & 1;
672 		swap ^= kt;
673 		cswap(x2, x3, swap);
674 		cswap(z2, z3, swap);
675 		swap = kt;
676 
677 		/* obsolete
678 		print_int("x2", x2);
679 		print_int("z2", z2);
680 		print_int("x3", x3);
681 		print_int("z3", z3);
682 		*/
683 
684 		f255_add(a, x2, z2);
685 		f255_square(aa, a);
686 		f255_sub(b, x2, z2);
687 		f255_square(bb, b);
688 		f255_sub(e, aa, bb);
689 		f255_add(c, x3, z3);
690 		f255_sub(d, x3, z3);
691 		f255_mul(da, d, a);
692 		f255_mul(cb, c, b);
693 
694 		/* obsolete
695 		print_int("a ", a);
696 		print_int("aa", aa);
697 		print_int("b ", b);
698 		print_int("bb", bb);
699 		print_int("e ", e);
700 		print_int("c ", c);
701 		print_int("d ", d);
702 		print_int("da", da);
703 		print_int("cb", cb);
704 		*/
705 
706 		f255_add(x3, da, cb);
707 		f255_square(x3, x3);
708 		f255_sub(z3, da, cb);
709 		f255_square(z3, z3);
710 		f255_mul(z3, z3, x1);
711 		f255_mul(x2, aa, bb);
712 		f255_mul_a24(z2, e);
713 		f255_add(z2, z2, aa);
714 		f255_mul(z2, e, z2);
715 
716 		/* obsolete
717 		print_int("x2", x2);
718 		print_int("z2", z2);
719 		print_int("x3", x3);
720 		print_int("z3", z3);
721 		*/
722 	}
723 	cswap(x2, x3, swap);
724 	cswap(z2, z3, swap);
725 
726 	/*
727 	 * Inverse z2 with a modular exponentiation. This is a simple
728 	 * square-and-multiply algorithm; we mutualise most non-squarings
729 	 * since the exponent contains almost only ones.
730 	 */
731 	memcpy(a, z2, sizeof z2);
732 	for (i = 0; i < 15; i ++) {
733 		f255_square(a, a);
734 		f255_mul(a, a, z2);
735 	}
736 	memcpy(b, a, sizeof a);
737 	for (i = 0; i < 14; i ++) {
738 		int j;
739 
740 		for (j = 0; j < 16; j ++) {
741 			f255_square(b, b);
742 		}
743 		f255_mul(b, b, a);
744 	}
745 	for (i = 14; i >= 0; i --) {
746 		f255_square(b, b);
747 		if ((0xFFEB >> i) & 1) {
748 			f255_mul(b, z2, b);
749 		}
750 	}
751 	f255_mul(x2, x2, b);
752 	reduce_final_f255(x2);
753 	le30_to_le8(G, 32, x2);
754 	return 1;
755 }
756 
757 static size_t
758 api_mulgen(unsigned char *R,
759 	const unsigned char *x, size_t xlen, int curve)
760 {
761 	const unsigned char *G;
762 	size_t Glen;
763 
764 	G = api_generator(curve, &Glen);
765 	memcpy(R, G, Glen);
766 	api_mul(R, Glen, x, xlen, curve);
767 	return Glen;
768 }
769 
770 static uint32_t
771 api_muladd(unsigned char *A, const unsigned char *B, size_t len,
772 	const unsigned char *x, size_t xlen,
773 	const unsigned char *y, size_t ylen, int curve)
774 {
775 	/*
776 	 * We don't implement this method, since it is used for ECDSA
777 	 * only, and there is no ECDSA over Curve25519 (which instead
778 	 * uses EdDSA).
779 	 */
780 	(void)A;
781 	(void)B;
782 	(void)len;
783 	(void)x;
784 	(void)xlen;
785 	(void)y;
786 	(void)ylen;
787 	(void)curve;
788 	return 0;
789 }
790 
791 /* see bearssl_ec.h */
792 const br_ec_impl br_ec_c25519_m31 = {
793 	(uint32_t)0x20000000,
794 	&api_generator,
795 	&api_order,
796 	&api_xoff,
797 	&api_mul,
798 	&api_mulgen,
799 	&api_muladd
800 };
801