xref: /freebsd/crypto/openssl/crypto/ec/ecp_nistp521.c (revision f39bffc62c1395bde25d152c7f68fdf7cbaab414)
1 /* crypto/ec/ecp_nistp521.c */
2 /*
3  * Written by Adam Langley (Google) for the OpenSSL project
4  */
5 /* Copyright 2011 Google Inc.
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  *
9  * you may not use this file except in compliance with the License.
10  * You may obtain a copy of the License at
11  *
12  *     http://www.apache.org/licenses/LICENSE-2.0
13  *
14  *  Unless required by applicable law or agreed to in writing, software
15  *  distributed under the License is distributed on an "AS IS" BASIS,
16  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17  *  See the License for the specific language governing permissions and
18  *  limitations under the License.
19  */
20 
21 /*
22  * A 64-bit implementation of the NIST P-521 elliptic curve point multiplication
23  *
24  * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
25  * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
26  * work which got its smarts from Daniel J. Bernstein's work on the same.
27  */
28 
29 #include <openssl/opensslconf.h>
30 #ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
31 
32 # ifndef OPENSSL_SYS_VMS
33 #  include <stdint.h>
34 # else
35 #  include <inttypes.h>
36 # endif
37 
38 # include <string.h>
39 # include <openssl/err.h>
40 # include "ec_lcl.h"
41 
# if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
  /*
   * A 128-bit unsigned type is needed to hold the product of two 64-bit
   * limbs. Even with gcc, the typedef won't work for 32-bit platforms.
   */
typedef __uint128_t uint128_t;  /* nonstandard; implemented by gcc on 64-bit
                                 * platforms */
# else
#  error "Need GCC 3.1 or later to define type uint128_t"
# endif

/* Short aliases for the fixed-width unsigned types used in this file. */
typedef uint8_t u8;
typedef uint64_t u64;
52 
/*
 * The underlying field. P521 operates over GF(2^521-1). We can serialise an
 * element of this field into 66 bytes (528 bits, of which 521 are used), so
 * the most significant byte contains only a single bit. We call this an
 * felem_bytearray.
 */

typedef u8 felem_bytearray[66];
60 
/*
 * These are the parameters of P521, taken from FIPS 186-3, section D.1.2.5.
 * These values are big-endian: byte 0 of each entry is the most significant
 * byte.
 *
 * The five entries are, in order: the field prime p, the curve coefficient
 * a (stored as p - 3, i.e. -3 mod p), the curve coefficient b, and the x and
 * y coordinates listed in that section.
 */
static const felem_bytearray nistp521_curve_params[5] = {
    {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* p */
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff},
    {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* a = -3 */
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
     0xff, 0xfc},
    {0x00, 0x51, 0x95, 0x3e, 0xb9, 0x61, 0x8e, 0x1c, /* b */
     0x9a, 0x1f, 0x92, 0x9a, 0x21, 0xa0, 0xb6, 0x85,
     0x40, 0xee, 0xa2, 0xda, 0x72, 0x5b, 0x99, 0xb3,
     0x15, 0xf3, 0xb8, 0xb4, 0x89, 0x91, 0x8e, 0xf1,
     0x09, 0xe1, 0x56, 0x19, 0x39, 0x51, 0xec, 0x7e,
     0x93, 0x7b, 0x16, 0x52, 0xc0, 0xbd, 0x3b, 0xb1,
     0xbf, 0x07, 0x35, 0x73, 0xdf, 0x88, 0x3d, 0x2c,
     0x34, 0xf1, 0xef, 0x45, 0x1f, 0xd4, 0x6b, 0x50,
     0x3f, 0x00},
    {0x00, 0xc6, 0x85, 0x8e, 0x06, 0xb7, 0x04, 0x04, /* x */
     0xe9, 0xcd, 0x9e, 0x3e, 0xcb, 0x66, 0x23, 0x95,
     0xb4, 0x42, 0x9c, 0x64, 0x81, 0x39, 0x05, 0x3f,
     0xb5, 0x21, 0xf8, 0x28, 0xaf, 0x60, 0x6b, 0x4d,
     0x3d, 0xba, 0xa1, 0x4b, 0x5e, 0x77, 0xef, 0xe7,
     0x59, 0x28, 0xfe, 0x1d, 0xc1, 0x27, 0xa2, 0xff,
     0xa8, 0xde, 0x33, 0x48, 0xb3, 0xc1, 0x85, 0x6a,
     0x42, 0x9b, 0xf9, 0x7e, 0x7e, 0x31, 0xc2, 0xe5,
     0xbd, 0x66},
    {0x01, 0x18, 0x39, 0x29, 0x6a, 0x78, 0x9a, 0x3b, /* y */
     0xc0, 0x04, 0x5c, 0x8a, 0x5f, 0xb4, 0x2c, 0x7d,
     0x1b, 0xd9, 0x98, 0xf5, 0x44, 0x49, 0x57, 0x9b,
     0x44, 0x68, 0x17, 0xaf, 0xbd, 0x17, 0x27, 0x3e,
     0x66, 0x2c, 0x97, 0xee, 0x72, 0x99, 0x5e, 0xf4,
     0x26, 0x40, 0xc5, 0x50, 0xb9, 0x01, 0x3f, 0xad,
     0x07, 0x61, 0x35, 0x3c, 0x70, 0x86, 0xa2, 0x72,
     0xc2, 0x40, 0x88, 0xbe, 0x94, 0x76, 0x9f, 0xd1,
     0x66, 0x50}
};
112 
/*-
 * The representation of field elements.
 * ------------------------------------
 *
 * We represent field elements with nine values. These values are either 64 or
 * 128 bits and the field element represented is:
 *   v[0]*2^0 + v[1]*2^58 + v[2]*2^116 + ... + v[8]*2^464  (mod p)
 * Each of the nine values is called a 'limb'. Since the limbs are spaced only
 * 58 bits apart, but are greater than 58 bits in length, the most significant
 * bits of each limb overlap with the least significant bits of the next.
 *
 * A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a
 * 'largefelem'.
 */

/* Number of limbs per field element: 9 * 58 = 522 bits >= 521. */
# define NLIMBS 9

typedef uint64_t limb;
typedef limb felem[NLIMBS];
typedef uint128_t largefelem[NLIMBS];

/* Mask of the low 57 bits: the width of the top limb (464 + 57 = 521). */
static const limb bottom57bits = 0x1ffffffffffffff;
/* Mask of the low 58 bits: the width of each of the other eight limbs. */
static const limb bottom58bits = 0x3ffffffffffffff;
135 
136 /*
137  * bin66_to_felem takes a little-endian byte array and converts it into felem
138  * form. This assumes that the CPU is little-endian.
139  */
140 static void bin66_to_felem(felem out, const u8 in[66])
141 {
142     out[0] = (*((limb *) & in[0])) & bottom58bits;
143     out[1] = (*((limb *) & in[7]) >> 2) & bottom58bits;
144     out[2] = (*((limb *) & in[14]) >> 4) & bottom58bits;
145     out[3] = (*((limb *) & in[21]) >> 6) & bottom58bits;
146     out[4] = (*((limb *) & in[29])) & bottom58bits;
147     out[5] = (*((limb *) & in[36]) >> 2) & bottom58bits;
148     out[6] = (*((limb *) & in[43]) >> 4) & bottom58bits;
149     out[7] = (*((limb *) & in[50]) >> 6) & bottom58bits;
150     out[8] = (*((limb *) & in[58])) & bottom57bits;
151 }
152 
153 /*
154  * felem_to_bin66 takes an felem and serialises into a little endian, 66 byte
155  * array. This assumes that the CPU is little-endian.
156  */
157 static void felem_to_bin66(u8 out[66], const felem in)
158 {
159     memset(out, 0, 66);
160     (*((limb *) & out[0])) = in[0];
161     (*((limb *) & out[7])) |= in[1] << 2;
162     (*((limb *) & out[14])) |= in[2] << 4;
163     (*((limb *) & out[21])) |= in[3] << 6;
164     (*((limb *) & out[29])) = in[4];
165     (*((limb *) & out[36])) |= in[5] << 2;
166     (*((limb *) & out[43])) |= in[6] << 4;
167     (*((limb *) & out[50])) |= in[7] << 6;
168     (*((limb *) & out[58])) = in[8];
169 }
170 
171 /* To preserve endianness when using BN_bn2bin and BN_bin2bn */
172 static void flip_endian(u8 *out, const u8 *in, unsigned len)
173 {
174     unsigned i;
175     for (i = 0; i < len; ++i)
176         out[i] = in[len - 1 - i];
177 }
178 
179 /* BN_to_felem converts an OpenSSL BIGNUM into an felem */
180 static int BN_to_felem(felem out, const BIGNUM *bn)
181 {
182     felem_bytearray b_in;
183     felem_bytearray b_out;
184     unsigned num_bytes;
185 
186     /* BN_bn2bin eats leading zeroes */
187     memset(b_out, 0, sizeof(b_out));
188     num_bytes = BN_num_bytes(bn);
189     if (num_bytes > sizeof(b_out)) {
190         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
191         return 0;
192     }
193     if (BN_is_negative(bn)) {
194         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
195         return 0;
196     }
197     num_bytes = BN_bn2bin(bn, b_in);
198     flip_endian(b_out, b_in, num_bytes);
199     bin66_to_felem(out, b_out);
200     return 1;
201 }
202 
203 /* felem_to_BN converts an felem into an OpenSSL BIGNUM */
204 static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
205 {
206     felem_bytearray b_in, b_out;
207     felem_to_bin66(b_in, in);
208     flip_endian(b_out, b_in, sizeof(b_out));
209     return BN_bin2bn(b_out, sizeof(b_out), out);
210 }
211 
212 /*-
213  * Field operations
214  * ----------------
215  */
216 
217 static void felem_one(felem out)
218 {
219     out[0] = 1;
220     out[1] = 0;
221     out[2] = 0;
222     out[3] = 0;
223     out[4] = 0;
224     out[5] = 0;
225     out[6] = 0;
226     out[7] = 0;
227     out[8] = 0;
228 }
229 
230 static void felem_assign(felem out, const felem in)
231 {
232     out[0] = in[0];
233     out[1] = in[1];
234     out[2] = in[2];
235     out[3] = in[3];
236     out[4] = in[4];
237     out[5] = in[5];
238     out[6] = in[6];
239     out[7] = in[7];
240     out[8] = in[8];
241 }
242 
243 /* felem_sum64 sets out = out + in. */
244 static void felem_sum64(felem out, const felem in)
245 {
246     out[0] += in[0];
247     out[1] += in[1];
248     out[2] += in[2];
249     out[3] += in[3];
250     out[4] += in[4];
251     out[5] += in[5];
252     out[6] += in[6];
253     out[7] += in[7];
254     out[8] += in[8];
255 }
256 
257 /* felem_scalar sets out = in * scalar */
258 static void felem_scalar(felem out, const felem in, limb scalar)
259 {
260     out[0] = in[0] * scalar;
261     out[1] = in[1] * scalar;
262     out[2] = in[2] * scalar;
263     out[3] = in[3] * scalar;
264     out[4] = in[4] * scalar;
265     out[5] = in[5] * scalar;
266     out[6] = in[6] * scalar;
267     out[7] = in[7] * scalar;
268     out[8] = in[8] * scalar;
269 }
270 
271 /* felem_scalar64 sets out = out * scalar */
272 static void felem_scalar64(felem out, limb scalar)
273 {
274     out[0] *= scalar;
275     out[1] *= scalar;
276     out[2] *= scalar;
277     out[3] *= scalar;
278     out[4] *= scalar;
279     out[5] *= scalar;
280     out[6] *= scalar;
281     out[7] *= scalar;
282     out[8] *= scalar;
283 }
284 
285 /* felem_scalar128 sets out = out * scalar */
286 static void felem_scalar128(largefelem out, limb scalar)
287 {
288     out[0] *= scalar;
289     out[1] *= scalar;
290     out[2] *= scalar;
291     out[3] *= scalar;
292     out[4] *= scalar;
293     out[5] *= scalar;
294     out[6] *= scalar;
295     out[7] *= scalar;
296     out[8] *= scalar;
297 }
298 
299 /*-
300  * felem_neg sets |out| to |-in|
301  * On entry:
302  *   in[i] < 2^59 + 2^14
303  * On exit:
304  *   out[i] < 2^62
305  */
306 static void felem_neg(felem out, const felem in)
307 {
308     /* In order to prevent underflow, we subtract from 0 mod p. */
309     static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
310     static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
311 
312     out[0] = two62m3 - in[0];
313     out[1] = two62m2 - in[1];
314     out[2] = two62m2 - in[2];
315     out[3] = two62m2 - in[3];
316     out[4] = two62m2 - in[4];
317     out[5] = two62m2 - in[5];
318     out[6] = two62m2 - in[6];
319     out[7] = two62m2 - in[7];
320     out[8] = two62m2 - in[8];
321 }
322 
323 /*-
324  * felem_diff64 subtracts |in| from |out|
325  * On entry:
326  *   in[i] < 2^59 + 2^14
327  * On exit:
328  *   out[i] < out[i] + 2^62
329  */
330 static void felem_diff64(felem out, const felem in)
331 {
332     /*
333      * In order to prevent underflow, we add 0 mod p before subtracting.
334      */
335     static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
336     static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
337 
338     out[0] += two62m3 - in[0];
339     out[1] += two62m2 - in[1];
340     out[2] += two62m2 - in[2];
341     out[3] += two62m2 - in[3];
342     out[4] += two62m2 - in[4];
343     out[5] += two62m2 - in[5];
344     out[6] += two62m2 - in[6];
345     out[7] += two62m2 - in[7];
346     out[8] += two62m2 - in[8];
347 }
348 
349 /*-
350  * felem_diff_128_64 subtracts |in| from |out|
351  * On entry:
352  *   in[i] < 2^62 + 2^17
353  * On exit:
354  *   out[i] < out[i] + 2^63
355  */
356 static void felem_diff_128_64(largefelem out, const felem in)
357 {
358     /*
359      * In order to prevent underflow, we add 0 mod p before subtracting.
360      */
361     static const limb two63m6 = (((limb) 1) << 62) - (((limb) 1) << 5);
362     static const limb two63m5 = (((limb) 1) << 62) - (((limb) 1) << 4);
363 
364     out[0] += two63m6 - in[0];
365     out[1] += two63m5 - in[1];
366     out[2] += two63m5 - in[2];
367     out[3] += two63m5 - in[3];
368     out[4] += two63m5 - in[4];
369     out[5] += two63m5 - in[5];
370     out[6] += two63m5 - in[6];
371     out[7] += two63m5 - in[7];
372     out[8] += two63m5 - in[8];
373 }
374 
375 /*-
376  * felem_diff_128_64 subtracts |in| from |out|
377  * On entry:
378  *   in[i] < 2^126
379  * On exit:
380  *   out[i] < out[i] + 2^127 - 2^69
381  */
382 static void felem_diff128(largefelem out, const largefelem in)
383 {
384     /*
385      * In order to prevent underflow, we add 0 mod p before subtracting.
386      */
387     static const uint128_t two127m70 =
388         (((uint128_t) 1) << 127) - (((uint128_t) 1) << 70);
389     static const uint128_t two127m69 =
390         (((uint128_t) 1) << 127) - (((uint128_t) 1) << 69);
391 
392     out[0] += (two127m70 - in[0]);
393     out[1] += (two127m69 - in[1]);
394     out[2] += (two127m69 - in[2]);
395     out[3] += (two127m69 - in[3]);
396     out[4] += (two127m69 - in[4]);
397     out[5] += (two127m69 - in[5]);
398     out[6] += (two127m69 - in[6]);
399     out[7] += (two127m69 - in[7]);
400     out[8] += (two127m69 - in[8]);
401 }
402 
403 /*-
404  * felem_square sets |out| = |in|^2
405  * On entry:
406  *   in[i] < 2^62
407  * On exit:
408  *   out[i] < 17 * max(in[i]) * max(in[i])
409  */
410 static void felem_square(largefelem out, const felem in)
411 {
412     felem inx2, inx4;
413     felem_scalar(inx2, in, 2);
414     felem_scalar(inx4, in, 4);
415 
416     /*-
417      * We have many cases were we want to do
418      *   in[x] * in[y] +
419      *   in[y] * in[x]
420      * This is obviously just
421      *   2 * in[x] * in[y]
422      * However, rather than do the doubling on the 128 bit result, we
423      * double one of the inputs to the multiplication by reading from
424      * |inx2|
425      */
426 
427     out[0] = ((uint128_t) in[0]) * in[0];
428     out[1] = ((uint128_t) in[0]) * inx2[1];
429     out[2] = ((uint128_t) in[0]) * inx2[2] + ((uint128_t) in[1]) * in[1];
430     out[3] = ((uint128_t) in[0]) * inx2[3] + ((uint128_t) in[1]) * inx2[2];
431     out[4] = ((uint128_t) in[0]) * inx2[4] +
432         ((uint128_t) in[1]) * inx2[3] + ((uint128_t) in[2]) * in[2];
433     out[5] = ((uint128_t) in[0]) * inx2[5] +
434         ((uint128_t) in[1]) * inx2[4] + ((uint128_t) in[2]) * inx2[3];
435     out[6] = ((uint128_t) in[0]) * inx2[6] +
436         ((uint128_t) in[1]) * inx2[5] +
437         ((uint128_t) in[2]) * inx2[4] + ((uint128_t) in[3]) * in[3];
438     out[7] = ((uint128_t) in[0]) * inx2[7] +
439         ((uint128_t) in[1]) * inx2[6] +
440         ((uint128_t) in[2]) * inx2[5] + ((uint128_t) in[3]) * inx2[4];
441     out[8] = ((uint128_t) in[0]) * inx2[8] +
442         ((uint128_t) in[1]) * inx2[7] +
443         ((uint128_t) in[2]) * inx2[6] +
444         ((uint128_t) in[3]) * inx2[5] + ((uint128_t) in[4]) * in[4];
445 
446     /*
447      * The remaining limbs fall above 2^521, with the first falling at 2^522.
448      * They correspond to locations one bit up from the limbs produced above
449      * so we would have to multiply by two to align them. Again, rather than
450      * operate on the 128-bit result, we double one of the inputs to the
451      * multiplication. If we want to double for both this reason, and the
452      * reason above, then we end up multiplying by four.
453      */
454 
455     /* 9 */
456     out[0] += ((uint128_t) in[1]) * inx4[8] +
457         ((uint128_t) in[2]) * inx4[7] +
458         ((uint128_t) in[3]) * inx4[6] + ((uint128_t) in[4]) * inx4[5];
459 
460     /* 10 */
461     out[1] += ((uint128_t) in[2]) * inx4[8] +
462         ((uint128_t) in[3]) * inx4[7] +
463         ((uint128_t) in[4]) * inx4[6] + ((uint128_t) in[5]) * inx2[5];
464 
465     /* 11 */
466     out[2] += ((uint128_t) in[3]) * inx4[8] +
467         ((uint128_t) in[4]) * inx4[7] + ((uint128_t) in[5]) * inx4[6];
468 
469     /* 12 */
470     out[3] += ((uint128_t) in[4]) * inx4[8] +
471         ((uint128_t) in[5]) * inx4[7] + ((uint128_t) in[6]) * inx2[6];
472 
473     /* 13 */
474     out[4] += ((uint128_t) in[5]) * inx4[8] + ((uint128_t) in[6]) * inx4[7];
475 
476     /* 14 */
477     out[5] += ((uint128_t) in[6]) * inx4[8] + ((uint128_t) in[7]) * inx2[7];
478 
479     /* 15 */
480     out[6] += ((uint128_t) in[7]) * inx4[8];
481 
482     /* 16 */
483     out[7] += ((uint128_t) in[8]) * inx2[8];
484 }
485 
486 /*-
487  * felem_mul sets |out| = |in1| * |in2|
488  * On entry:
489  *   in1[i] < 2^64
490  *   in2[i] < 2^63
491  * On exit:
492  *   out[i] < 17 * max(in1[i]) * max(in2[i])
493  */
494 static void felem_mul(largefelem out, const felem in1, const felem in2)
495 {
496     felem in2x2;
497     felem_scalar(in2x2, in2, 2);
498 
499     out[0] = ((uint128_t) in1[0]) * in2[0];
500 
501     out[1] = ((uint128_t) in1[0]) * in2[1] + ((uint128_t) in1[1]) * in2[0];
502 
503     out[2] = ((uint128_t) in1[0]) * in2[2] +
504         ((uint128_t) in1[1]) * in2[1] + ((uint128_t) in1[2]) * in2[0];
505 
506     out[3] = ((uint128_t) in1[0]) * in2[3] +
507         ((uint128_t) in1[1]) * in2[2] +
508         ((uint128_t) in1[2]) * in2[1] + ((uint128_t) in1[3]) * in2[0];
509 
510     out[4] = ((uint128_t) in1[0]) * in2[4] +
511         ((uint128_t) in1[1]) * in2[3] +
512         ((uint128_t) in1[2]) * in2[2] +
513         ((uint128_t) in1[3]) * in2[1] + ((uint128_t) in1[4]) * in2[0];
514 
515     out[5] = ((uint128_t) in1[0]) * in2[5] +
516         ((uint128_t) in1[1]) * in2[4] +
517         ((uint128_t) in1[2]) * in2[3] +
518         ((uint128_t) in1[3]) * in2[2] +
519         ((uint128_t) in1[4]) * in2[1] + ((uint128_t) in1[5]) * in2[0];
520 
521     out[6] = ((uint128_t) in1[0]) * in2[6] +
522         ((uint128_t) in1[1]) * in2[5] +
523         ((uint128_t) in1[2]) * in2[4] +
524         ((uint128_t) in1[3]) * in2[3] +
525         ((uint128_t) in1[4]) * in2[2] +
526         ((uint128_t) in1[5]) * in2[1] + ((uint128_t) in1[6]) * in2[0];
527 
528     out[7] = ((uint128_t) in1[0]) * in2[7] +
529         ((uint128_t) in1[1]) * in2[6] +
530         ((uint128_t) in1[2]) * in2[5] +
531         ((uint128_t) in1[3]) * in2[4] +
532         ((uint128_t) in1[4]) * in2[3] +
533         ((uint128_t) in1[5]) * in2[2] +
534         ((uint128_t) in1[6]) * in2[1] + ((uint128_t) in1[7]) * in2[0];
535 
536     out[8] = ((uint128_t) in1[0]) * in2[8] +
537         ((uint128_t) in1[1]) * in2[7] +
538         ((uint128_t) in1[2]) * in2[6] +
539         ((uint128_t) in1[3]) * in2[5] +
540         ((uint128_t) in1[4]) * in2[4] +
541         ((uint128_t) in1[5]) * in2[3] +
542         ((uint128_t) in1[6]) * in2[2] +
543         ((uint128_t) in1[7]) * in2[1] + ((uint128_t) in1[8]) * in2[0];
544 
545     /* See comment in felem_square about the use of in2x2 here */
546 
547     out[0] += ((uint128_t) in1[1]) * in2x2[8] +
548         ((uint128_t) in1[2]) * in2x2[7] +
549         ((uint128_t) in1[3]) * in2x2[6] +
550         ((uint128_t) in1[4]) * in2x2[5] +
551         ((uint128_t) in1[5]) * in2x2[4] +
552         ((uint128_t) in1[6]) * in2x2[3] +
553         ((uint128_t) in1[7]) * in2x2[2] + ((uint128_t) in1[8]) * in2x2[1];
554 
555     out[1] += ((uint128_t) in1[2]) * in2x2[8] +
556         ((uint128_t) in1[3]) * in2x2[7] +
557         ((uint128_t) in1[4]) * in2x2[6] +
558         ((uint128_t) in1[5]) * in2x2[5] +
559         ((uint128_t) in1[6]) * in2x2[4] +
560         ((uint128_t) in1[7]) * in2x2[3] + ((uint128_t) in1[8]) * in2x2[2];
561 
562     out[2] += ((uint128_t) in1[3]) * in2x2[8] +
563         ((uint128_t) in1[4]) * in2x2[7] +
564         ((uint128_t) in1[5]) * in2x2[6] +
565         ((uint128_t) in1[6]) * in2x2[5] +
566         ((uint128_t) in1[7]) * in2x2[4] + ((uint128_t) in1[8]) * in2x2[3];
567 
568     out[3] += ((uint128_t) in1[4]) * in2x2[8] +
569         ((uint128_t) in1[5]) * in2x2[7] +
570         ((uint128_t) in1[6]) * in2x2[6] +
571         ((uint128_t) in1[7]) * in2x2[5] + ((uint128_t) in1[8]) * in2x2[4];
572 
573     out[4] += ((uint128_t) in1[5]) * in2x2[8] +
574         ((uint128_t) in1[6]) * in2x2[7] +
575         ((uint128_t) in1[7]) * in2x2[6] + ((uint128_t) in1[8]) * in2x2[5];
576 
577     out[5] += ((uint128_t) in1[6]) * in2x2[8] +
578         ((uint128_t) in1[7]) * in2x2[7] + ((uint128_t) in1[8]) * in2x2[6];
579 
580     out[6] += ((uint128_t) in1[7]) * in2x2[8] +
581         ((uint128_t) in1[8]) * in2x2[7];
582 
583     out[7] += ((uint128_t) in1[8]) * in2x2[8];
584 }
585 
586 static const limb bottom52bits = 0xfffffffffffff;
587 
588 /*-
589  * felem_reduce converts a largefelem to an felem.
590  * On entry:
591  *   in[i] < 2^128
592  * On exit:
593  *   out[i] < 2^59 + 2^14
594  */
595 static void felem_reduce(felem out, const largefelem in)
596 {
597     u64 overflow1, overflow2;
598 
599     out[0] = ((limb) in[0]) & bottom58bits;
600     out[1] = ((limb) in[1]) & bottom58bits;
601     out[2] = ((limb) in[2]) & bottom58bits;
602     out[3] = ((limb) in[3]) & bottom58bits;
603     out[4] = ((limb) in[4]) & bottom58bits;
604     out[5] = ((limb) in[5]) & bottom58bits;
605     out[6] = ((limb) in[6]) & bottom58bits;
606     out[7] = ((limb) in[7]) & bottom58bits;
607     out[8] = ((limb) in[8]) & bottom58bits;
608 
609     /* out[i] < 2^58 */
610 
611     out[1] += ((limb) in[0]) >> 58;
612     out[1] += (((limb) (in[0] >> 64)) & bottom52bits) << 6;
613     /*-
614      * out[1] < 2^58 + 2^6 + 2^58
615      *        = 2^59 + 2^6
616      */
617     out[2] += ((limb) (in[0] >> 64)) >> 52;
618 
619     out[2] += ((limb) in[1]) >> 58;
620     out[2] += (((limb) (in[1] >> 64)) & bottom52bits) << 6;
621     out[3] += ((limb) (in[1] >> 64)) >> 52;
622 
623     out[3] += ((limb) in[2]) >> 58;
624     out[3] += (((limb) (in[2] >> 64)) & bottom52bits) << 6;
625     out[4] += ((limb) (in[2] >> 64)) >> 52;
626 
627     out[4] += ((limb) in[3]) >> 58;
628     out[4] += (((limb) (in[3] >> 64)) & bottom52bits) << 6;
629     out[5] += ((limb) (in[3] >> 64)) >> 52;
630 
631     out[5] += ((limb) in[4]) >> 58;
632     out[5] += (((limb) (in[4] >> 64)) & bottom52bits) << 6;
633     out[6] += ((limb) (in[4] >> 64)) >> 52;
634 
635     out[6] += ((limb) in[5]) >> 58;
636     out[6] += (((limb) (in[5] >> 64)) & bottom52bits) << 6;
637     out[7] += ((limb) (in[5] >> 64)) >> 52;
638 
639     out[7] += ((limb) in[6]) >> 58;
640     out[7] += (((limb) (in[6] >> 64)) & bottom52bits) << 6;
641     out[8] += ((limb) (in[6] >> 64)) >> 52;
642 
643     out[8] += ((limb) in[7]) >> 58;
644     out[8] += (((limb) (in[7] >> 64)) & bottom52bits) << 6;
645     /*-
646      * out[x > 1] < 2^58 + 2^6 + 2^58 + 2^12
647      *            < 2^59 + 2^13
648      */
649     overflow1 = ((limb) (in[7] >> 64)) >> 52;
650 
651     overflow1 += ((limb) in[8]) >> 58;
652     overflow1 += (((limb) (in[8] >> 64)) & bottom52bits) << 6;
653     overflow2 = ((limb) (in[8] >> 64)) >> 52;
654 
655     overflow1 <<= 1;            /* overflow1 < 2^13 + 2^7 + 2^59 */
656     overflow2 <<= 1;            /* overflow2 < 2^13 */
657 
658     out[0] += overflow1;        /* out[0] < 2^60 */
659     out[1] += overflow2;        /* out[1] < 2^59 + 2^6 + 2^13 */
660 
661     out[1] += out[0] >> 58;
662     out[0] &= bottom58bits;
663     /*-
664      * out[0] < 2^58
665      * out[1] < 2^59 + 2^6 + 2^13 + 2^2
666      *        < 2^59 + 2^14
667      */
668 }
669 
670 static void felem_square_reduce(felem out, const felem in)
671 {
672     largefelem tmp;
673     felem_square(tmp, in);
674     felem_reduce(out, tmp);
675 }
676 
677 static void felem_mul_reduce(felem out, const felem in1, const felem in2)
678 {
679     largefelem tmp;
680     felem_mul(tmp, in1, in2);
681     felem_reduce(out, tmp);
682 }
683 
684 /*-
685  * felem_inv calculates |out| = |in|^{-1}
686  *
687  * Based on Fermat's Little Theorem:
688  *   a^p = a (mod p)
689  *   a^{p-1} = 1 (mod p)
690  *   a^{p-2} = a^{-1} (mod p)
691  */
692 static void felem_inv(felem out, const felem in)
693 {
694     felem ftmp, ftmp2, ftmp3, ftmp4;
695     largefelem tmp;
696     unsigned i;
697 
698     felem_square(tmp, in);
699     felem_reduce(ftmp, tmp);    /* 2^1 */
700     felem_mul(tmp, in, ftmp);
701     felem_reduce(ftmp, tmp);    /* 2^2 - 2^0 */
702     felem_assign(ftmp2, ftmp);
703     felem_square(tmp, ftmp);
704     felem_reduce(ftmp, tmp);    /* 2^3 - 2^1 */
705     felem_mul(tmp, in, ftmp);
706     felem_reduce(ftmp, tmp);    /* 2^3 - 2^0 */
707     felem_square(tmp, ftmp);
708     felem_reduce(ftmp, tmp);    /* 2^4 - 2^1 */
709 
710     felem_square(tmp, ftmp2);
711     felem_reduce(ftmp3, tmp);   /* 2^3 - 2^1 */
712     felem_square(tmp, ftmp3);
713     felem_reduce(ftmp3, tmp);   /* 2^4 - 2^2 */
714     felem_mul(tmp, ftmp3, ftmp2);
715     felem_reduce(ftmp3, tmp);   /* 2^4 - 2^0 */
716 
717     felem_assign(ftmp2, ftmp3);
718     felem_square(tmp, ftmp3);
719     felem_reduce(ftmp3, tmp);   /* 2^5 - 2^1 */
720     felem_square(tmp, ftmp3);
721     felem_reduce(ftmp3, tmp);   /* 2^6 - 2^2 */
722     felem_square(tmp, ftmp3);
723     felem_reduce(ftmp3, tmp);   /* 2^7 - 2^3 */
724     felem_square(tmp, ftmp3);
725     felem_reduce(ftmp3, tmp);   /* 2^8 - 2^4 */
726     felem_assign(ftmp4, ftmp3);
727     felem_mul(tmp, ftmp3, ftmp);
728     felem_reduce(ftmp4, tmp);   /* 2^8 - 2^1 */
729     felem_square(tmp, ftmp4);
730     felem_reduce(ftmp4, tmp);   /* 2^9 - 2^2 */
731     felem_mul(tmp, ftmp3, ftmp2);
732     felem_reduce(ftmp3, tmp);   /* 2^8 - 2^0 */
733     felem_assign(ftmp2, ftmp3);
734 
735     for (i = 0; i < 8; i++) {
736         felem_square(tmp, ftmp3);
737         felem_reduce(ftmp3, tmp); /* 2^16 - 2^8 */
738     }
739     felem_mul(tmp, ftmp3, ftmp2);
740     felem_reduce(ftmp3, tmp);   /* 2^16 - 2^0 */
741     felem_assign(ftmp2, ftmp3);
742 
743     for (i = 0; i < 16; i++) {
744         felem_square(tmp, ftmp3);
745         felem_reduce(ftmp3, tmp); /* 2^32 - 2^16 */
746     }
747     felem_mul(tmp, ftmp3, ftmp2);
748     felem_reduce(ftmp3, tmp);   /* 2^32 - 2^0 */
749     felem_assign(ftmp2, ftmp3);
750 
751     for (i = 0; i < 32; i++) {
752         felem_square(tmp, ftmp3);
753         felem_reduce(ftmp3, tmp); /* 2^64 - 2^32 */
754     }
755     felem_mul(tmp, ftmp3, ftmp2);
756     felem_reduce(ftmp3, tmp);   /* 2^64 - 2^0 */
757     felem_assign(ftmp2, ftmp3);
758 
759     for (i = 0; i < 64; i++) {
760         felem_square(tmp, ftmp3);
761         felem_reduce(ftmp3, tmp); /* 2^128 - 2^64 */
762     }
763     felem_mul(tmp, ftmp3, ftmp2);
764     felem_reduce(ftmp3, tmp);   /* 2^128 - 2^0 */
765     felem_assign(ftmp2, ftmp3);
766 
767     for (i = 0; i < 128; i++) {
768         felem_square(tmp, ftmp3);
769         felem_reduce(ftmp3, tmp); /* 2^256 - 2^128 */
770     }
771     felem_mul(tmp, ftmp3, ftmp2);
772     felem_reduce(ftmp3, tmp);   /* 2^256 - 2^0 */
773     felem_assign(ftmp2, ftmp3);
774 
775     for (i = 0; i < 256; i++) {
776         felem_square(tmp, ftmp3);
777         felem_reduce(ftmp3, tmp); /* 2^512 - 2^256 */
778     }
779     felem_mul(tmp, ftmp3, ftmp2);
780     felem_reduce(ftmp3, tmp);   /* 2^512 - 2^0 */
781 
782     for (i = 0; i < 9; i++) {
783         felem_square(tmp, ftmp3);
784         felem_reduce(ftmp3, tmp); /* 2^521 - 2^9 */
785     }
786     felem_mul(tmp, ftmp3, ftmp4);
787     felem_reduce(ftmp3, tmp);   /* 2^512 - 2^2 */
788     felem_mul(tmp, ftmp3, in);
789     felem_reduce(out, tmp);     /* 2^512 - 3 */
790 }
791 
/*
 * This is 2^521-1, expressed as an felem: eight limbs with all 58 low bits
 * set followed by a top limb with all 57 low bits set (8 * 58 + 57 = 521).
 */
static const felem kPrime = {
    0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
    0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
    0x03ffffffffffffff, 0x03ffffffffffffff, 0x01ffffffffffffff
};
798 
799 /*-
800  * felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
801  * otherwise.
802  * On entry:
803  *   in[i] < 2^59 + 2^14
804  */
805 static limb felem_is_zero(const felem in)
806 {
807     felem ftmp;
808     limb is_zero, is_p;
809     felem_assign(ftmp, in);
810 
811     ftmp[0] += ftmp[8] >> 57;
812     ftmp[8] &= bottom57bits;
813     /* ftmp[8] < 2^57 */
814     ftmp[1] += ftmp[0] >> 58;
815     ftmp[0] &= bottom58bits;
816     ftmp[2] += ftmp[1] >> 58;
817     ftmp[1] &= bottom58bits;
818     ftmp[3] += ftmp[2] >> 58;
819     ftmp[2] &= bottom58bits;
820     ftmp[4] += ftmp[3] >> 58;
821     ftmp[3] &= bottom58bits;
822     ftmp[5] += ftmp[4] >> 58;
823     ftmp[4] &= bottom58bits;
824     ftmp[6] += ftmp[5] >> 58;
825     ftmp[5] &= bottom58bits;
826     ftmp[7] += ftmp[6] >> 58;
827     ftmp[6] &= bottom58bits;
828     ftmp[8] += ftmp[7] >> 58;
829     ftmp[7] &= bottom58bits;
830     /* ftmp[8] < 2^57 + 4 */
831 
832     /*
833      * The ninth limb of 2*(2^521-1) is 0x03ffffffffffffff, which is greater
834      * than our bound for ftmp[8]. Therefore we only have to check if the
835      * zero is zero or 2^521-1.
836      */
837 
838     is_zero = 0;
839     is_zero |= ftmp[0];
840     is_zero |= ftmp[1];
841     is_zero |= ftmp[2];
842     is_zero |= ftmp[3];
843     is_zero |= ftmp[4];
844     is_zero |= ftmp[5];
845     is_zero |= ftmp[6];
846     is_zero |= ftmp[7];
847     is_zero |= ftmp[8];
848 
849     is_zero--;
850     /*
851      * We know that ftmp[i] < 2^63, therefore the only way that the top bit
852      * can be set is if is_zero was 0 before the decrement.
853      */
854     is_zero = 0 - (is_zero >> 63);
855 
856     is_p = ftmp[0] ^ kPrime[0];
857     is_p |= ftmp[1] ^ kPrime[1];
858     is_p |= ftmp[2] ^ kPrime[2];
859     is_p |= ftmp[3] ^ kPrime[3];
860     is_p |= ftmp[4] ^ kPrime[4];
861     is_p |= ftmp[5] ^ kPrime[5];
862     is_p |= ftmp[6] ^ kPrime[6];
863     is_p |= ftmp[7] ^ kPrime[7];
864     is_p |= ftmp[8] ^ kPrime[8];
865 
866     is_p--;
867     is_p = 0 - (is_p >> 63);
868 
869     is_zero |= is_p;
870     return is_zero;
871 }
872 
873 static int felem_is_zero_int(const void *in)
874 {
875     return (int)(felem_is_zero(in) & ((limb) 1));
876 }
877 
878 /*-
879  * felem_contract converts |in| to its unique, minimal representation.
880  * On entry:
881  *   in[i] < 2^59 + 2^14
882  */
883 static void felem_contract(felem out, const felem in)
884 {
885     limb is_p, is_greater, sign;
886     static const limb two58 = ((limb) 1) << 58;
887 
888     felem_assign(out, in);
889 
890     out[0] += out[8] >> 57;
891     out[8] &= bottom57bits;
892     /* out[8] < 2^57 */
893     out[1] += out[0] >> 58;
894     out[0] &= bottom58bits;
895     out[2] += out[1] >> 58;
896     out[1] &= bottom58bits;
897     out[3] += out[2] >> 58;
898     out[2] &= bottom58bits;
899     out[4] += out[3] >> 58;
900     out[3] &= bottom58bits;
901     out[5] += out[4] >> 58;
902     out[4] &= bottom58bits;
903     out[6] += out[5] >> 58;
904     out[5] &= bottom58bits;
905     out[7] += out[6] >> 58;
906     out[6] &= bottom58bits;
907     out[8] += out[7] >> 58;
908     out[7] &= bottom58bits;
909     /* out[8] < 2^57 + 4 */
910 
911     /*
912      * If the value is greater than 2^521-1 then we have to subtract 2^521-1
913      * out. See the comments in felem_is_zero regarding why we don't test for
914      * other multiples of the prime.
915      */
916 
917     /*
918      * First, if |out| is equal to 2^521-1, we subtract it out to get zero.
919      */
920 
921     is_p = out[0] ^ kPrime[0];
922     is_p |= out[1] ^ kPrime[1];
923     is_p |= out[2] ^ kPrime[2];
924     is_p |= out[3] ^ kPrime[3];
925     is_p |= out[4] ^ kPrime[4];
926     is_p |= out[5] ^ kPrime[5];
927     is_p |= out[6] ^ kPrime[6];
928     is_p |= out[7] ^ kPrime[7];
929     is_p |= out[8] ^ kPrime[8];
930 
931     is_p--;
932     is_p &= is_p << 32;
933     is_p &= is_p << 16;
934     is_p &= is_p << 8;
935     is_p &= is_p << 4;
936     is_p &= is_p << 2;
937     is_p &= is_p << 1;
938     is_p = 0 - (is_p >> 63);
939     is_p = ~is_p;
940 
941     /* is_p is 0 iff |out| == 2^521-1 and all ones otherwise */
942 
943     out[0] &= is_p;
944     out[1] &= is_p;
945     out[2] &= is_p;
946     out[3] &= is_p;
947     out[4] &= is_p;
948     out[5] &= is_p;
949     out[6] &= is_p;
950     out[7] &= is_p;
951     out[8] &= is_p;
952 
953     /*
954      * In order to test that |out| >= 2^521-1 we need only test if out[8] >>
955      * 57 is greater than zero as (2^521-1) + x >= 2^522
956      */
957     is_greater = out[8] >> 57;
958     is_greater |= is_greater << 32;
959     is_greater |= is_greater << 16;
960     is_greater |= is_greater << 8;
961     is_greater |= is_greater << 4;
962     is_greater |= is_greater << 2;
963     is_greater |= is_greater << 1;
964     is_greater = 0 - (is_greater >> 63);
965 
966     out[0] -= kPrime[0] & is_greater;
967     out[1] -= kPrime[1] & is_greater;
968     out[2] -= kPrime[2] & is_greater;
969     out[3] -= kPrime[3] & is_greater;
970     out[4] -= kPrime[4] & is_greater;
971     out[5] -= kPrime[5] & is_greater;
972     out[6] -= kPrime[6] & is_greater;
973     out[7] -= kPrime[7] & is_greater;
974     out[8] -= kPrime[8] & is_greater;
975 
976     /* Eliminate negative coefficients */
977     sign = -(out[0] >> 63);
978     out[0] += (two58 & sign);
979     out[1] -= (1 & sign);
980     sign = -(out[1] >> 63);
981     out[1] += (two58 & sign);
982     out[2] -= (1 & sign);
983     sign = -(out[2] >> 63);
984     out[2] += (two58 & sign);
985     out[3] -= (1 & sign);
986     sign = -(out[3] >> 63);
987     out[3] += (two58 & sign);
988     out[4] -= (1 & sign);
989     sign = -(out[4] >> 63);
990     out[4] += (two58 & sign);
991     out[5] -= (1 & sign);
992     sign = -(out[0] >> 63);
993     out[5] += (two58 & sign);
994     out[6] -= (1 & sign);
995     sign = -(out[6] >> 63);
996     out[6] += (two58 & sign);
997     out[7] -= (1 & sign);
998     sign = -(out[7] >> 63);
999     out[7] += (two58 & sign);
1000     out[8] -= (1 & sign);
1001     sign = -(out[5] >> 63);
1002     out[5] += (two58 & sign);
1003     out[6] -= (1 & sign);
1004     sign = -(out[6] >> 63);
1005     out[6] += (two58 & sign);
1006     out[7] -= (1 & sign);
1007     sign = -(out[7] >> 63);
1008     out[7] += (two58 & sign);
1009     out[8] -= (1 & sign);
1010 }
1011 
1012 /*-
1013  * Group operations
1014  * ----------------
1015  *
1016  * Building on top of the field operations we have the operations on the
1017  * elliptic curve group itself. Points on the curve are represented in Jacobian
1018  * coordinates */
1019 
/*-
 * point_double calculates 2*(x_in, y_in, z_in)
 *
 * The method is taken from:
 *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
 *
 * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
 * while x_out == y_in is not (maybe this works, but it's not tested).
 *
 * Statement order matters for that aliasing guarantee: x_in is copied into
 * temporaries and consumed (for beta and alpha) before x_out is written, and
 * y_in/z_in are still read after x_out has been written.
 */
static void
point_double(felem x_out, felem y_out, felem z_out,
             const felem x_in, const felem y_in, const felem z_in)
{
    largefelem tmp, tmp2;
    felem delta, gamma, beta, alpha, ftmp, ftmp2;

    /* two private copies of x, later turned into (x - delta) and (x + delta) */
    felem_assign(ftmp, x_in);
    felem_assign(ftmp2, x_in);

    /* delta = z^2 */
    felem_square(tmp, z_in);
    felem_reduce(delta, tmp);   /* delta[i] < 2^59 + 2^14 */

    /* gamma = y^2 */
    felem_square(tmp, y_in);
    felem_reduce(gamma, tmp);   /* gamma[i] < 2^59 + 2^14 */

    /* beta = x*gamma */
    felem_mul(tmp, x_in, gamma);
    felem_reduce(beta, tmp);    /* beta[i] < 2^59 + 2^14 */

    /* alpha = 3*(x-delta)*(x+delta) */
    felem_diff64(ftmp, delta);
    /* ftmp[i] < 2^61 */
    felem_sum64(ftmp2, delta);
    /* ftmp2[i] < 2^60 + 2^15 */
    felem_scalar64(ftmp2, 3);
    /* ftmp2[i] < 3*2^60 + 3*2^15 */
    felem_mul(tmp, ftmp, ftmp2);
    /*-
     * tmp[i] < 17(3*2^121 + 3*2^76)
     *        = 51*2^121 + 51*2^76
     *        < 64*2^121 + 64*2^76
     *        = 2^127 + 2^82
     *        < 2^128
     */
    felem_reduce(alpha, tmp);

    /* x' = alpha^2 - 8*beta */
    felem_square(tmp, alpha);
    /*
     * tmp[i] < 17*2^120 < 2^125
     */
    felem_assign(ftmp, beta);
    felem_scalar64(ftmp, 8);
    /* ftmp[i] < 2^62 + 2^17 */
    felem_diff_128_64(tmp, ftmp);
    /* tmp[i] < 2^125 + 2^63 + 2^62 + 2^17 */
    felem_reduce(x_out, tmp);

    /* z' = (y + z)^2 - gamma - delta */
    /* delta is reused to hold gamma + delta so a single subtraction suffices */
    felem_sum64(delta, gamma);
    /* delta[i] < 2^60 + 2^15 */
    felem_assign(ftmp, y_in);
    felem_sum64(ftmp, z_in);
    /* ftmp[i] < 2^60 + 2^15 */
    felem_square(tmp, ftmp);
    /*
     * tmp[i] < 17(2^122) < 2^127
     */
    felem_diff_128_64(tmp, delta);
    /* tmp[i] < 2^127 + 2^63 */
    felem_reduce(z_out, tmp);

    /* y' = alpha*(4*beta - x') - 8*gamma^2 */
    felem_scalar64(beta, 4);
    /* beta[i] < 2^61 + 2^16 */
    felem_diff64(beta, x_out);
    /* beta[i] < 2^61 + 2^60 + 2^16 */
    felem_mul(tmp, alpha, beta);
    /*-
     * tmp[i] < 17*((2^59 + 2^14)(2^61 + 2^60 + 2^16))
     *        = 17*(2^120 + 2^75 + 2^119 + 2^74 + 2^75 + 2^30)
     *        = 17*(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
     *        < 2^128
     */
    felem_square(tmp2, gamma);
    /*-
     * tmp2[i] < 17*(2^59 + 2^14)^2
     *         = 17*(2^118 + 2^74 + 2^28)
     */
    felem_scalar128(tmp2, 8);
    /*-
     * tmp2[i] < 8*17*(2^118 + 2^74 + 2^28)
     *         = 2^125 + 2^121 + 2^81 + 2^77 + 2^35 + 2^31
     *         < 2^126
     */
    felem_diff128(tmp, tmp2);
    /*-
     * tmp[i] < 2^127 - 2^69 + 17(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
     *        = 2^127 + 2^124 + 2^122 + 2^120 + 2^118 + 2^80 + 2^78 + 2^76 +
     *          2^74 + 2^69 + 2^34 + 2^30
     *        < 2^128
     */
    felem_reduce(y_out, tmp);
}
1125 
1126 /* copy_conditional copies in to out iff mask is all ones. */
1127 static void copy_conditional(felem out, const felem in, limb mask)
1128 {
1129     unsigned i;
1130     for (i = 0; i < NLIMBS; ++i) {
1131         const limb tmp = mask & (in[i] ^ out[i]);
1132         out[i] ^= tmp;
1133     }
1134 }
1135 
/*-
 * point_add calculates (x1, y1, z1) + (x2, y2, z2)
 *
 * The method is taken from
 *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
 * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
 *
 * This function includes a branch for checking whether the two input points
 * are equal (while not equal to the point at infinity). This case never
 * happens during single point multiplication, so there is no timing leak for
 * ECDH or ECDSA signing.
 *
 * When |mixed| is non-zero the z2-dependent products are skipped and z2 is
 * treated as 1. The sum is built in local buffers and copied to
 * (x3, y3, z3) only at the very end, so the outputs may be the same arrays
 * as the first input point (batch_mul calls it that way).
 */
static void point_add(felem x3, felem y3, felem z3,
                      const felem x1, const felem y1, const felem z1,
                      const int mixed, const felem x2, const felem y2,
                      const felem z2)
{
    felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
    largefelem tmp, tmp2;
    limb x_equal, y_equal, z1_is_zero, z2_is_zero;

    /* limb masks recording whether either input is the point at infinity */
    z1_is_zero = felem_is_zero(z1);
    z2_is_zero = felem_is_zero(z2);

    /* ftmp = z1z1 = z1**2 */
    felem_square(tmp, z1);
    felem_reduce(ftmp, tmp);

    if (!mixed) {
        /* ftmp2 = z2z2 = z2**2 */
        felem_square(tmp, z2);
        felem_reduce(ftmp2, tmp);

        /* u1 = ftmp3 = x1*z2z2 */
        felem_mul(tmp, x1, ftmp2);
        felem_reduce(ftmp3, tmp);

        /* ftmp5 = z1 + z2 */
        felem_assign(ftmp5, z1);
        felem_sum64(ftmp5, z2);
        /* ftmp5[i] < 2^61 */

        /* ftmp5 = (z1 + z2)**2 - z1z1 - z2z2 = 2*z1z2 */
        felem_square(tmp, ftmp5);
        /* tmp[i] < 17*2^122 */
        felem_diff_128_64(tmp, ftmp);
        /* tmp[i] < 17*2^122 + 2^63 */
        felem_diff_128_64(tmp, ftmp2);
        /* tmp[i] < 17*2^122 + 2^64 */
        felem_reduce(ftmp5, tmp);

        /* ftmp2 = z2 * z2z2 */
        felem_mul(tmp, ftmp2, z2);
        felem_reduce(ftmp2, tmp);

        /* s1 = ftmp6 = y1 * z2**3 */
        felem_mul(tmp, y1, ftmp2);
        felem_reduce(ftmp6, tmp);
    } else {
        /*
         * We'll assume z2 = 1 (special case z2 = 0 is handled later)
         */

        /* u1 = ftmp3 = x1*z2z2 */
        felem_assign(ftmp3, x1);

        /* ftmp5 = 2*z1z2 */
        felem_scalar(ftmp5, z1, 2);

        /* s1 = ftmp6 = y1 * z2**3 */
        felem_assign(ftmp6, y1);
    }

    /* u2 = x2*z1z1 */
    felem_mul(tmp, x2, ftmp);
    /* tmp[i] < 17*2^120 */

    /* h = ftmp4 = u2 - u1 */
    felem_diff_128_64(tmp, ftmp3);
    /* tmp[i] < 17*2^120 + 2^63 */
    felem_reduce(ftmp4, tmp);

    /* h == 0 means both points share the same affine x */
    x_equal = felem_is_zero(ftmp4);

    /* z_out = ftmp5 * h */
    felem_mul(tmp, ftmp5, ftmp4);
    felem_reduce(z_out, tmp);

    /* ftmp = z1 * z1z1 */
    felem_mul(tmp, ftmp, z1);
    felem_reduce(ftmp, tmp);

    /* s2 = tmp = y2 * z1**3 */
    felem_mul(tmp, y2, ftmp);
    /* tmp[i] < 17*2^120 */

    /* r = ftmp5 = (s2 - s1)*2 */
    felem_diff_128_64(tmp, ftmp6);
    /* tmp[i] < 17*2^120 + 2^63 */
    felem_reduce(ftmp5, tmp);
    /* tested before the doubling below; s2 - s1 == 0 means equal affine y */
    y_equal = felem_is_zero(ftmp5);
    felem_scalar64(ftmp5, 2);
    /* ftmp5[i] < 2^61 */

    /*
     * Same finite point on both sides: hand over to point_double. This is
     * the data-dependent branch mentioned in the header comment.
     */
    if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) {
        point_double(x3, y3, z3, x1, y1, z1);
        return;
    }

    /* I = ftmp = (2h)**2 */
    felem_assign(ftmp, ftmp4);
    felem_scalar64(ftmp, 2);
    /* ftmp[i] < 2^61 */
    felem_square(tmp, ftmp);
    /* tmp[i] < 17*2^122 */
    felem_reduce(ftmp, tmp);

    /* J = ftmp2 = h * I */
    felem_mul(tmp, ftmp4, ftmp);
    felem_reduce(ftmp2, tmp);

    /* V = ftmp4 = U1 * I */
    felem_mul(tmp, ftmp3, ftmp);
    felem_reduce(ftmp4, tmp);

    /* x_out = r**2 - J - 2V */
    felem_square(tmp, ftmp5);
    /* tmp[i] < 17*2^122 */
    felem_diff_128_64(tmp, ftmp2);
    /* tmp[i] < 17*2^122 + 2^63 */
    /* keep an undoubled copy of V in ftmp3 for the y_out computation */
    felem_assign(ftmp3, ftmp4);
    felem_scalar64(ftmp4, 2);
    /* ftmp4[i] < 2^61 */
    felem_diff_128_64(tmp, ftmp4);
    /* tmp[i] < 17*2^122 + 2^64 */
    felem_reduce(x_out, tmp);

    /* y_out = r(V-x_out) - 2 * s1 * J */
    felem_diff64(ftmp3, x_out);
    /*
     * ftmp3[i] < 2^60 + 2^60 = 2^61
     */
    felem_mul(tmp, ftmp5, ftmp3);
    /* tmp[i] < 17*2^122 */
    felem_mul(tmp2, ftmp6, ftmp2);
    /* tmp2[i] < 17*2^120 */
    felem_scalar128(tmp2, 2);
    /* tmp2[i] < 17*2^121 */
    felem_diff128(tmp, tmp2);
    /*-
     * tmp[i] < 2^127 - 2^69 + 17*2^122
     *        = 2^127 + 2^126 + 2^122 - 2^69
     *        < 2^128
     * NOTE(review): the expansion previously written here did not equal the
     * first line; the figures above are plain arithmetic on that first line.
     */
    felem_reduce(y_out, tmp);

    /*
     * If one input was the point at infinity the computed sum is replaced by
     * the other input, selected with masks rather than branches.
     */
    copy_conditional(x_out, x2, z1_is_zero);
    copy_conditional(x_out, x1, z2_is_zero);
    copy_conditional(y_out, y2, z1_is_zero);
    copy_conditional(y_out, y1, z2_is_zero);
    copy_conditional(z_out, z2, z1_is_zero);
    copy_conditional(z_out, z1, z2_is_zero);
    felem_assign(x3, x_out);
    felem_assign(y3, y_out);
    felem_assign(z3, z_out);
}
1301 
1302 /*-
1303  * Base point pre computation
1304  * --------------------------
1305  *
 * Two different sorts of precomputed tables are used in the following code.
 * Each contains various points on the curve, where each point is three field
 * elements (x, y, z).
1309  *
1310  * For the base point table, z is usually 1 (0 for the point at infinity).
1311  * This table has 16 elements:
1312  * index | bits    | point
1313  * ------+---------+------------------------------
1314  *     0 | 0 0 0 0 | 0G
1315  *     1 | 0 0 0 1 | 1G
1316  *     2 | 0 0 1 0 | 2^130G
1317  *     3 | 0 0 1 1 | (2^130 + 1)G
1318  *     4 | 0 1 0 0 | 2^260G
1319  *     5 | 0 1 0 1 | (2^260 + 1)G
1320  *     6 | 0 1 1 0 | (2^260 + 2^130)G
1321  *     7 | 0 1 1 1 | (2^260 + 2^130 + 1)G
1322  *     8 | 1 0 0 0 | 2^390G
1323  *     9 | 1 0 0 1 | (2^390 + 1)G
1324  *    10 | 1 0 1 0 | (2^390 + 2^130)G
1325  *    11 | 1 0 1 1 | (2^390 + 2^130 + 1)G
1326  *    12 | 1 1 0 0 | (2^390 + 2^260)G
1327  *    13 | 1 1 0 1 | (2^390 + 2^260 + 1)G
1328  *    14 | 1 1 1 0 | (2^390 + 2^260 + 2^130)G
1329  *    15 | 1 1 1 1 | (2^390 + 2^260 + 2^130 + 1)G
1330  *
1331  * The reason for this is so that we can clock bits into four different
1332  * locations when doing simple scalar multiplies against the base point.
1333  *
 * Tables for any other point P have table[i] = iP for i in 0 .. 16. */
1335 
/*
 * gmul is the table of precomputed base points.
 *
 * Layout: 16 entries, each an {x, y, z} triple of 9-limb field elements.
 * Entry 0 is all zeros (z = 0, the point at infinity); entries 1..15 are
 * stored with z = 1. The index-to-multiple mapping is the one tabulated in
 * the "Base point pre computation" comment.
 */
static const felem gmul[16][3] = { {{0, 0, 0, 0, 0, 0, 0, 0, 0},
                                    {0, 0, 0, 0, 0, 0, 0, 0, 0},
                                    {0, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x017e7e31c2e5bd66, 0x022cf0615a90a6fe, 0x00127a2ffa8de334,
  0x01dfbf9d64a3f877, 0x006b4d3dbaa14b5e, 0x014fed487e0a2bd8,
  0x015b4429c6481390, 0x03a73678fb2d988e, 0x00c6858e06b70404},
 {0x00be94769fd16650, 0x031c21a89cb09022, 0x039013fad0761353,
  0x02657bd099031542, 0x03273e662c97ee72, 0x01e6d11a05ebef45,
  0x03d1bd998f544495, 0x03001172297ed0b1, 0x011839296a789a3b},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x0373faacbc875bae, 0x00f325023721c671, 0x00f666fd3dbde5ad,
  0x01a6932363f88ea7, 0x01fc6d9e13f9c47b, 0x03bcbffc2bbf734e,
  0x013ee3c3647f3a92, 0x029409fefe75d07d, 0x00ef9199963d85e5},
 {0x011173743ad5b178, 0x02499c7c21bf7d46, 0x035beaeabb8b1a58,
  0x00f989c4752ea0a3, 0x0101e1de48a9c1a3, 0x01a20076be28ba6c,
  0x02f8052e5eb2de95, 0x01bfe8f82dea117c, 0x0160074d3c36ddb7},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x012f3fc373393b3b, 0x03d3d6172f1419fa, 0x02adc943c0b86873,
  0x00d475584177952b, 0x012a4d1673750ee2, 0x00512517a0f13b0c,
  0x02b184671a7b1734, 0x0315b84236f1a50a, 0x00a4afc472edbdb9},
 {0x00152a7077f385c4, 0x03044007d8d1c2ee, 0x0065829d61d52b52,
  0x00494ff6b6631d0d, 0x00a11d94d5f06bcf, 0x02d2f89474d9282e,
  0x0241c5727c06eeb9, 0x0386928710fbdb9d, 0x01f883f727b0dfbe},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x019b0c3c9185544d, 0x006243a37c9d97db, 0x02ee3cbe030a2ad2,
  0x00cfdd946bb51e0d, 0x0271c00932606b91, 0x03f817d1ec68c561,
  0x03f37009806a369c, 0x03c1f30baf184fd5, 0x01091022d6d2f065},
 {0x0292c583514c45ed, 0x0316fca51f9a286c, 0x00300af507c1489a,
  0x0295f69008298cf1, 0x02c0ed8274943d7b, 0x016509b9b47a431e,
  0x02bc9de9634868ce, 0x005b34929bffcb09, 0x000c1a0121681524},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x0286abc0292fb9f2, 0x02665eee9805b3f7, 0x01ed7455f17f26d6,
  0x0346355b83175d13, 0x006284944cd0a097, 0x0191895bcdec5e51,
  0x02e288370afda7d9, 0x03b22312bfefa67a, 0x01d104d3fc0613fe},
 {0x0092421a12f7e47f, 0x0077a83fa373c501, 0x03bd25c5f696bd0d,
  0x035c41e4d5459761, 0x01ca0d1742b24f53, 0x00aaab27863a509c,
  0x018b6de47df73917, 0x025c0b771705cd01, 0x01fd51d566d760a7},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x01dd92ff6b0d1dbd, 0x039c5e2e8f8afa69, 0x0261ed13242c3b27,
  0x0382c6e67026e6a0, 0x01d60b10be2089f9, 0x03c15f3dce86723f,
  0x03c764a32d2a062d, 0x017307eac0fad056, 0x018207c0b96c5256},
 {0x0196a16d60e13154, 0x03e6ce74c0267030, 0x00ddbf2b4e52a5aa,
  0x012738241bbf31c8, 0x00ebe8dc04685a28, 0x024c2ad6d380d4a2,
  0x035ee062a6e62d0e, 0x0029ed74af7d3a0f, 0x00eef32aec142ebd},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x00c31ec398993b39, 0x03a9f45bcda68253, 0x00ac733c24c70890,
  0x00872b111401ff01, 0x01d178c23195eafb, 0x03bca2c816b87f74,
  0x0261a9af46fbad7a, 0x0324b2a8dd3d28f9, 0x00918121d8f24e23},
 {0x032bc8c1ca983cd7, 0x00d869dfb08fc8c6, 0x01693cb61fce1516,
  0x012a5ea68f4e88a8, 0x010869cab88d7ae3, 0x009081ad277ceee1,
  0x033a77166d064cdc, 0x03955235a1fb3a95, 0x01251a4a9b25b65e},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x00148a3a1b27f40b, 0x0123186df1b31fdc, 0x00026e7beaad34ce,
  0x01db446ac1d3dbba, 0x0299c1a33437eaec, 0x024540610183cbb7,
  0x0173bb0e9ce92e46, 0x02b937e43921214b, 0x01ab0436a9bf01b5},
 {0x0383381640d46948, 0x008dacbf0e7f330f, 0x03602122bcc3f318,
  0x01ee596b200620d6, 0x03bd0585fda430b3, 0x014aed77fd123a83,
  0x005ace749e52f742, 0x0390fe041da2b842, 0x0189a8ceb3299242},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x012a19d6b3282473, 0x00c0915918b423ce, 0x023a954eb94405ae,
  0x00529f692be26158, 0x0289fa1b6fa4b2aa, 0x0198ae4ceea346ef,
  0x0047d8cdfbdedd49, 0x00cc8c8953f0f6b8, 0x001424abbff49203},
 {0x0256732a1115a03a, 0x0351bc38665c6733, 0x03f7b950fb4a6447,
  0x000afffa94c22155, 0x025763d0a4dab540, 0x000511e92d4fc283,
  0x030a7e9eda0ee96c, 0x004c3cd93a28bf0a, 0x017edb3a8719217f},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x011de5675a88e673, 0x031d7d0f5e567fbe, 0x0016b2062c970ae5,
  0x03f4a2be49d90aa7, 0x03cef0bd13822866, 0x03f0923dcf774a6c,
  0x0284bebc4f322f72, 0x016ab2645302bb2c, 0x01793f95dace0e2a},
 {0x010646e13527a28f, 0x01ca1babd59dc5e7, 0x01afedfd9a5595df,
  0x01f15785212ea6b1, 0x0324e5d64f6ae3f4, 0x02d680f526d00645,
  0x0127920fadf627a7, 0x03b383f75df4f684, 0x0089e0057e783b0a},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x00f334b9eb3c26c6, 0x0298fdaa98568dce, 0x01c2d24843a82292,
  0x020bcb24fa1b0711, 0x02cbdb3d2b1875e6, 0x0014907598f89422,
  0x03abe3aa43b26664, 0x02cbf47f720bc168, 0x0133b5e73014b79b},
 {0x034aab5dab05779d, 0x00cdc5d71fee9abb, 0x0399f16bd4bd9d30,
  0x03582fa592d82647, 0x02be1cdfb775b0e9, 0x0034f7cea32e94cb,
  0x0335a7f08f56f286, 0x03b707e9565d1c8b, 0x0015c946ea5b614f},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x024676f6cff72255, 0x00d14625cac96378, 0x00532b6008bc3767,
  0x01fc16721b985322, 0x023355ea1b091668, 0x029de7afdc0317c3,
  0x02fc8a7ca2da037c, 0x02de1217d74a6f30, 0x013f7173175b73bf},
 {0x0344913f441490b5, 0x0200f9e272b61eca, 0x0258a246b1dd55d2,
  0x03753db9ea496f36, 0x025e02937a09c5ef, 0x030cbd3d14012692,
  0x01793a67e70dc72a, 0x03ec1d37048a662e, 0x006550f700c32a8d},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x00d3f48a347eba27, 0x008e636649b61bd8, 0x00d3b93716778fb3,
  0x004d1915757bd209, 0x019d5311a3da44e0, 0x016d1afcbbe6aade,
  0x0241bf5f73265616, 0x0384672e5d50d39b, 0x005009fee522b684},
 {0x029b4fab064435fe, 0x018868ee095bbb07, 0x01ea3d6936cc92b8,
  0x000608b00f78a2f3, 0x02db911073d1c20f, 0x018205938470100a,
  0x01f1e4964cbe6ff2, 0x021a19a29eed4663, 0x01414485f42afa81},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x01612b3a17f63e34, 0x03813992885428e6, 0x022b3c215b5a9608,
  0x029b4057e19f2fcb, 0x0384059a587af7e6, 0x02d6400ace6fe610,
  0x029354d896e8e331, 0x00c047ee6dfba65e, 0x0037720542e9d49d},
 {0x02ce9eed7c5e9278, 0x0374ed703e79643b, 0x01316c54c4072006,
  0x005aaa09054b2ee8, 0x002824000c840d57, 0x03d4eba24771ed86,
  0x0189c50aabc3bdae, 0x0338c01541e15510, 0x00466d56e38eed42},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}},
{{0x007efd8330ad8bd6, 0x02465ed48047710b, 0x0034c6606b215e0c,
  0x016ae30c53cbf839, 0x01fa17bd37161216, 0x018ead4e61ce8ab9,
  0x005482ed5f5dee46, 0x037543755bba1d7f, 0x005e5ac7e70a9d0f},
 {0x0117e1bb2fdcb2a2, 0x03deea36249f40c4, 0x028d09b4a6246cb7,
  0x03524b8855bcf756, 0x023d7d109d5ceb58, 0x0178e43e3223ef9c,
  0x0154536a0c6e966a, 0x037964d1286ee9fe, 0x0199bcd90e125055},
 {1, 0, 0, 0, 0, 0, 0, 0, 0}}
};
1446 
1447 /*
1448  * select_point selects the |idx|th point from a precomputation table and
1449  * copies it to out.
1450  */
1451  /* pre_comp below is of the size provided in |size| */
1452 static void select_point(const limb idx, unsigned int size,
1453                          const felem pre_comp[][3], felem out[3])
1454 {
1455     unsigned i, j;
1456     limb *outlimbs = &out[0][0];
1457     memset(outlimbs, 0, 3 * sizeof(felem));
1458 
1459     for (i = 0; i < size; i++) {
1460         const limb *inlimbs = &pre_comp[i][0][0];
1461         limb mask = i ^ idx;
1462         mask |= mask >> 4;
1463         mask |= mask >> 2;
1464         mask |= mask >> 1;
1465         mask &= 1;
1466         mask--;
1467         for (j = 0; j < NLIMBS * 3; j++)
1468             outlimbs[j] |= inlimbs[j] & mask;
1469     }
1470 }
1471 
1472 /* get_bit returns the |i|th bit in |in| */
1473 static char get_bit(const felem_bytearray in, int i)
1474 {
1475     if (i < 0)
1476         return 0;
1477     return (in[i >> 3] >> (i & 7)) & 1;
1478 }
1479 
1480 /*
1481  * Interleaved point multiplication using precomputed point multiples: The
1482  * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
1483  * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1484  * generator, using certain (large) precomputed multiples in g_pre_comp.
1485  * Output point (X, Y, Z) is stored in x_out, y_out, z_out
1486  */
1487 static void batch_mul(felem x_out, felem y_out, felem z_out,
1488                       const felem_bytearray scalars[],
1489                       const unsigned num_points, const u8 *g_scalar,
1490                       const int mixed, const felem pre_comp[][17][3],
1491                       const felem g_pre_comp[16][3])
1492 {
1493     int i, skip;
1494     unsigned num, gen_mul = (g_scalar != NULL);
1495     felem nq[3], tmp[4];
1496     limb bits;
1497     u8 sign, digit;
1498 
1499     /* set nq to the point at infinity */
1500     memset(nq, 0, 3 * sizeof(felem));
1501 
1502     /*
1503      * Loop over all scalars msb-to-lsb, interleaving additions of multiples
1504      * of the generator (last quarter of rounds) and additions of other
1505      * points multiples (every 5th round).
1506      */
1507     skip = 1;                   /* save two point operations in the first
1508                                  * round */
1509     for (i = (num_points ? 520 : 130); i >= 0; --i) {
1510         /* double */
1511         if (!skip)
1512             point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1513 
1514         /* add multiples of the generator */
1515         if (gen_mul && (i <= 130)) {
1516             bits = get_bit(g_scalar, i + 390) << 3;
1517             if (i < 130) {
1518                 bits |= get_bit(g_scalar, i + 260) << 2;
1519                 bits |= get_bit(g_scalar, i + 130) << 1;
1520                 bits |= get_bit(g_scalar, i);
1521             }
1522             /* select the point to add, in constant time */
1523             select_point(bits, 16, g_pre_comp, tmp);
1524             if (!skip) {
1525                 /* The 1 argument below is for "mixed" */
1526                 point_add(nq[0], nq[1], nq[2],
1527                           nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1528             } else {
1529                 memcpy(nq, tmp, 3 * sizeof(felem));
1530                 skip = 0;
1531             }
1532         }
1533 
1534         /* do other additions every 5 doublings */
1535         if (num_points && (i % 5 == 0)) {
1536             /* loop over all scalars */
1537             for (num = 0; num < num_points; ++num) {
1538                 bits = get_bit(scalars[num], i + 4) << 5;
1539                 bits |= get_bit(scalars[num], i + 3) << 4;
1540                 bits |= get_bit(scalars[num], i + 2) << 3;
1541                 bits |= get_bit(scalars[num], i + 1) << 2;
1542                 bits |= get_bit(scalars[num], i) << 1;
1543                 bits |= get_bit(scalars[num], i - 1);
1544                 ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1545 
1546                 /*
1547                  * select the point to add or subtract, in constant time
1548                  */
1549                 select_point(digit, 17, pre_comp[num], tmp);
1550                 felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative
1551                                             * point */
1552                 copy_conditional(tmp[1], tmp[3], (-(limb) sign));
1553 
1554                 if (!skip) {
1555                     point_add(nq[0], nq[1], nq[2],
1556                               nq[0], nq[1], nq[2],
1557                               mixed, tmp[0], tmp[1], tmp[2]);
1558                 } else {
1559                     memcpy(nq, tmp, 3 * sizeof(felem));
1560                     skip = 0;
1561                 }
1562             }
1563         }
1564     }
1565     felem_assign(x_out, nq[0]);
1566     felem_assign(y_out, nq[1]);
1567     felem_assign(z_out, nq[2]);
1568 }
1569 
/*
 * Precomputation for the group generator: a reference-counted holder for a
 * generator table with the same dimensions as gmul.
 */
typedef struct {
    /* 16 points, each stored as three field elements */
    felem g_pre_comp[16][3];
    /*
     * Reference count: set to 1 on creation, incremented by
     * nistp521_pre_comp_dup and decremented by the two free routines.
     */
    int references;
} NISTP521_PRE_COMP;
1575 
/*
 * EC_GFp_nistp521_method returns a pointer to a function-local static
 * EC_METHOD, so every call yields the same address. The initializer is
 * positional: entries must stay in this exact order. Slots set to 0 carry
 * the name of the unset member in a trailing comment.
 */
const EC_METHOD *EC_GFp_nistp521_method(void)
{
    static const EC_METHOD ret = {
        EC_FLAGS_DEFAULT_OCT,
        NID_X9_62_prime_field,
        /* group handling */
        ec_GFp_nistp521_group_init,
        ec_GFp_simple_group_finish,
        ec_GFp_simple_group_clear_finish,
        ec_GFp_nist_group_copy,
        ec_GFp_nistp521_group_set_curve,
        ec_GFp_simple_group_get_curve,
        ec_GFp_simple_group_get_degree,
        ec_GFp_simple_group_check_discriminant,
        /* point handling */
        ec_GFp_simple_point_init,
        ec_GFp_simple_point_finish,
        ec_GFp_simple_point_clear_finish,
        ec_GFp_simple_point_copy,
        ec_GFp_simple_point_set_to_infinity,
        ec_GFp_simple_set_Jprojective_coordinates_GFp,
        ec_GFp_simple_get_Jprojective_coordinates_GFp,
        ec_GFp_simple_point_set_affine_coordinates,
        ec_GFp_nistp521_point_get_affine_coordinates,
        0 /* point_set_compressed_coordinates */ ,
        0 /* point2oct */ ,
        0 /* oct2point */ ,
        ec_GFp_simple_add,
        ec_GFp_simple_dbl,
        ec_GFp_simple_invert,
        ec_GFp_simple_is_at_infinity,
        ec_GFp_simple_is_on_curve,
        ec_GFp_simple_cmp,
        ec_GFp_simple_make_affine,
        ec_GFp_simple_points_make_affine,
        /* scalar multiplication and its precomputation hooks */
        ec_GFp_nistp521_points_mul,
        ec_GFp_nistp521_precompute_mult,
        ec_GFp_nistp521_have_precompute_mult,
        /* field arithmetic */
        ec_GFp_nist_field_mul,
        ec_GFp_nist_field_sqr,
        0 /* field_div */ ,
        0 /* field_encode */ ,
        0 /* field_decode */ ,
        0                       /* field_set_to_one */
    };

    return &ret;
}
1622 
1623 /******************************************************************************/
1624 /*
1625  * FUNCTIONS TO MANAGE PRECOMPUTATION
1626  */
1627 
1628 static NISTP521_PRE_COMP *nistp521_pre_comp_new()
1629 {
1630     NISTP521_PRE_COMP *ret = NULL;
1631     ret = (NISTP521_PRE_COMP *) OPENSSL_malloc(sizeof(NISTP521_PRE_COMP));
1632     if (!ret) {
1633         ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1634         return ret;
1635     }
1636     memset(ret->g_pre_comp, 0, sizeof(ret->g_pre_comp));
1637     ret->references = 1;
1638     return ret;
1639 }
1640 
1641 static void *nistp521_pre_comp_dup(void *src_)
1642 {
1643     NISTP521_PRE_COMP *src = src_;
1644 
1645     /* no need to actually copy, these objects never change! */
1646     CRYPTO_add(&src->references, 1, CRYPTO_LOCK_EC_PRE_COMP);
1647 
1648     return src_;
1649 }
1650 
1651 static void nistp521_pre_comp_free(void *pre_)
1652 {
1653     int i;
1654     NISTP521_PRE_COMP *pre = pre_;
1655 
1656     if (!pre)
1657         return;
1658 
1659     i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1660     if (i > 0)
1661         return;
1662 
1663     OPENSSL_free(pre);
1664 }
1665 
1666 static void nistp521_pre_comp_clear_free(void *pre_)
1667 {
1668     int i;
1669     NISTP521_PRE_COMP *pre = pre_;
1670 
1671     if (!pre)
1672         return;
1673 
1674     i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1675     if (i > 0)
1676         return;
1677 
1678     OPENSSL_cleanse(pre, sizeof(*pre));
1679     OPENSSL_free(pre);
1680 }
1681 
1682 /******************************************************************************/
1683 /*
1684  * OPENSSL EC_METHOD FUNCTIONS
1685  */
1686 
1687 int ec_GFp_nistp521_group_init(EC_GROUP *group)
1688 {
1689     int ret;
1690     ret = ec_GFp_simple_group_init(group);
1691     group->a_is_minus3 = 1;
1692     return ret;
1693 }
1694 
1695 int ec_GFp_nistp521_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1696                                     const BIGNUM *a, const BIGNUM *b,
1697                                     BN_CTX *ctx)
1698 {
1699     int ret = 0;
1700     BN_CTX *new_ctx = NULL;
1701     BIGNUM *curve_p, *curve_a, *curve_b;
1702 
1703     if (ctx == NULL)
1704         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1705             return 0;
1706     BN_CTX_start(ctx);
1707     if (((curve_p = BN_CTX_get(ctx)) == NULL) ||
1708         ((curve_a = BN_CTX_get(ctx)) == NULL) ||
1709         ((curve_b = BN_CTX_get(ctx)) == NULL))
1710         goto err;
1711     BN_bin2bn(nistp521_curve_params[0], sizeof(felem_bytearray), curve_p);
1712     BN_bin2bn(nistp521_curve_params[1], sizeof(felem_bytearray), curve_a);
1713     BN_bin2bn(nistp521_curve_params[2], sizeof(felem_bytearray), curve_b);
1714     if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1715         ECerr(EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE,
1716               EC_R_WRONG_CURVE_PARAMETERS);
1717         goto err;
1718     }
1719     group->field_mod_func = BN_nist_mod_521;
1720     ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1721  err:
1722     BN_CTX_end(ctx);
1723     if (new_ctx != NULL)
1724         BN_CTX_free(new_ctx);
1725     return ret;
1726 }
1727 
1728 /*
1729  * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1730  * (X/Z^2, Y/Z^3)
1731  */
1732 int ec_GFp_nistp521_point_get_affine_coordinates(const EC_GROUP *group,
1733                                                  const EC_POINT *point,
1734                                                  BIGNUM *x, BIGNUM *y,
1735                                                  BN_CTX *ctx)
1736 {
1737     felem z1, z2, x_in, y_in, x_out, y_out;
1738     largefelem tmp;
1739 
1740     if (EC_POINT_is_at_infinity(group, point)) {
1741         ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1742               EC_R_POINT_AT_INFINITY);
1743         return 0;
1744     }
1745     if ((!BN_to_felem(x_in, &point->X)) || (!BN_to_felem(y_in, &point->Y)) ||
1746         (!BN_to_felem(z1, &point->Z)))
1747         return 0;
1748     felem_inv(z2, z1);
1749     felem_square(tmp, z2);
1750     felem_reduce(z1, tmp);
1751     felem_mul(tmp, x_in, z1);
1752     felem_reduce(x_in, tmp);
1753     felem_contract(x_out, x_in);
1754     if (x != NULL) {
1755         if (!felem_to_BN(x, x_out)) {
1756             ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1757                   ERR_R_BN_LIB);
1758             return 0;
1759         }
1760     }
1761     felem_mul(tmp, z1, z2);
1762     felem_reduce(z1, tmp);
1763     felem_mul(tmp, y_in, z1);
1764     felem_reduce(y_in, tmp);
1765     felem_contract(y_out, y_in);
1766     if (y != NULL) {
1767         if (!felem_to_BN(y, y_out)) {
1768             ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1769                   ERR_R_BN_LIB);
1770             return 0;
1771         }
1772     }
1773     return 1;
1774 }
1775 
1776 /* points below is of size |num|, and tmp_felems is of size |num+1/ */
1777 static void make_points_affine(size_t num, felem points[][3],
1778                                felem tmp_felems[])
1779 {
1780     /*
1781      * Runs in constant time, unless an input is the point at infinity (which
1782      * normally shouldn't happen).
1783      */
1784     ec_GFp_nistp_points_make_affine_internal(num,
1785                                              points,
1786                                              sizeof(felem),
1787                                              tmp_felems,
1788                                              (void (*)(void *))felem_one,
1789                                              felem_is_zero_int,
1790                                              (void (*)(void *, const void *))
1791                                              felem_assign,
1792                                              (void (*)(void *, const void *))
1793                                              felem_square_reduce, (void (*)
1794                                                                    (void *,
1795                                                                     const void
1796                                                                     *,
1797                                                                     const void
1798                                                                     *))
1799                                              felem_mul_reduce,
1800                                              (void (*)(void *, const void *))
1801                                              felem_inv,
1802                                              (void (*)(void *, const void *))
1803                                              felem_contract);
1804 }
1805 
1806 /*
1807  * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
1808  * values Result is stored in r (r can equal one of the inputs).
1809  */
1810 int ec_GFp_nistp521_points_mul(const EC_GROUP *group, EC_POINT *r,
1811                                const BIGNUM *scalar, size_t num,
1812                                const EC_POINT *points[],
1813                                const BIGNUM *scalars[], BN_CTX *ctx)
1814 {
1815     int ret = 0;
1816     int j;
1817     int mixed = 0;
1818     BN_CTX *new_ctx = NULL;
1819     BIGNUM *x, *y, *z, *tmp_scalar;
1820     felem_bytearray g_secret;
1821     felem_bytearray *secrets = NULL;
1822     felem(*pre_comp)[17][3] = NULL;
1823     felem *tmp_felems = NULL;
1824     felem_bytearray tmp;
1825     unsigned i, num_bytes;
1826     int have_pre_comp = 0;
1827     size_t num_points = num;
1828     felem x_in, y_in, z_in, x_out, y_out, z_out;
1829     NISTP521_PRE_COMP *pre = NULL;
1830     felem(*g_pre_comp)[3] = NULL;
1831     EC_POINT *generator = NULL;
1832     const EC_POINT *p = NULL;
1833     const BIGNUM *p_scalar = NULL;
1834 
1835     if (ctx == NULL)
1836         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1837             return 0;
1838     BN_CTX_start(ctx);
1839     if (((x = BN_CTX_get(ctx)) == NULL) ||
1840         ((y = BN_CTX_get(ctx)) == NULL) ||
1841         ((z = BN_CTX_get(ctx)) == NULL) ||
1842         ((tmp_scalar = BN_CTX_get(ctx)) == NULL))
1843         goto err;
1844 
1845     if (scalar != NULL) {
1846         pre = EC_EX_DATA_get_data(group->extra_data,
1847                                   nistp521_pre_comp_dup,
1848                                   nistp521_pre_comp_free,
1849                                   nistp521_pre_comp_clear_free);
1850         if (pre)
1851             /* we have precomputation, try to use it */
1852             g_pre_comp = &pre->g_pre_comp[0];
1853         else
1854             /* try to use the standard precomputation */
1855             g_pre_comp = (felem(*)[3]) gmul;
1856         generator = EC_POINT_new(group);
1857         if (generator == NULL)
1858             goto err;
1859         /* get the generator from precomputation */
1860         if (!felem_to_BN(x, g_pre_comp[1][0]) ||
1861             !felem_to_BN(y, g_pre_comp[1][1]) ||
1862             !felem_to_BN(z, g_pre_comp[1][2])) {
1863             ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1864             goto err;
1865         }
1866         if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
1867                                                       generator, x, y, z,
1868                                                       ctx))
1869             goto err;
1870         if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1871             /* precomputation matches generator */
1872             have_pre_comp = 1;
1873         else
1874             /*
1875              * we don't have valid precomputation: treat the generator as a
1876              * random point
1877              */
1878             num_points++;
1879     }
1880 
1881     if (num_points > 0) {
1882         if (num_points >= 2) {
1883             /*
1884              * unless we precompute multiples for just one point, converting
1885              * those into affine form is time well spent
1886              */
1887             mixed = 1;
1888         }
1889         secrets = OPENSSL_malloc(num_points * sizeof(felem_bytearray));
1890         pre_comp = OPENSSL_malloc(num_points * 17 * 3 * sizeof(felem));
1891         if (mixed)
1892             tmp_felems =
1893                 OPENSSL_malloc((num_points * 17 + 1) * sizeof(felem));
1894         if ((secrets == NULL) || (pre_comp == NULL)
1895             || (mixed && (tmp_felems == NULL))) {
1896             ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_MALLOC_FAILURE);
1897             goto err;
1898         }
1899 
1900         /*
1901          * we treat NULL scalars as 0, and NULL points as points at infinity,
1902          * i.e., they contribute nothing to the linear combination
1903          */
1904         memset(secrets, 0, num_points * sizeof(felem_bytearray));
1905         memset(pre_comp, 0, num_points * 17 * 3 * sizeof(felem));
1906         for (i = 0; i < num_points; ++i) {
1907             if (i == num)
1908                 /*
1909                  * we didn't have a valid precomputation, so we pick the
1910                  * generator
1911                  */
1912             {
1913                 p = EC_GROUP_get0_generator(group);
1914                 p_scalar = scalar;
1915             } else
1916                 /* the i^th point */
1917             {
1918                 p = points[i];
1919                 p_scalar = scalars[i];
1920             }
1921             if ((p_scalar != NULL) && (p != NULL)) {
1922                 /* reduce scalar to 0 <= scalar < 2^521 */
1923                 if ((BN_num_bits(p_scalar) > 521)
1924                     || (BN_is_negative(p_scalar))) {
1925                     /*
1926                      * this is an unusual input, and we don't guarantee
1927                      * constant-timeness
1928                      */
1929                     if (!BN_nnmod(tmp_scalar, p_scalar, &group->order, ctx)) {
1930                         ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1931                         goto err;
1932                     }
1933                     num_bytes = BN_bn2bin(tmp_scalar, tmp);
1934                 } else
1935                     num_bytes = BN_bn2bin(p_scalar, tmp);
1936                 flip_endian(secrets[i], tmp, num_bytes);
1937                 /* precompute multiples */
1938                 if ((!BN_to_felem(x_out, &p->X)) ||
1939                     (!BN_to_felem(y_out, &p->Y)) ||
1940                     (!BN_to_felem(z_out, &p->Z)))
1941                     goto err;
1942                 memcpy(pre_comp[i][1][0], x_out, sizeof(felem));
1943                 memcpy(pre_comp[i][1][1], y_out, sizeof(felem));
1944                 memcpy(pre_comp[i][1][2], z_out, sizeof(felem));
1945                 for (j = 2; j <= 16; ++j) {
1946                     if (j & 1) {
1947                         point_add(pre_comp[i][j][0], pre_comp[i][j][1],
1948                                   pre_comp[i][j][2], pre_comp[i][1][0],
1949                                   pre_comp[i][1][1], pre_comp[i][1][2], 0,
1950                                   pre_comp[i][j - 1][0],
1951                                   pre_comp[i][j - 1][1],
1952                                   pre_comp[i][j - 1][2]);
1953                     } else {
1954                         point_double(pre_comp[i][j][0], pre_comp[i][j][1],
1955                                      pre_comp[i][j][2], pre_comp[i][j / 2][0],
1956                                      pre_comp[i][j / 2][1],
1957                                      pre_comp[i][j / 2][2]);
1958                     }
1959                 }
1960             }
1961         }
1962         if (mixed)
1963             make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
1964     }
1965 
1966     /* the scalar for the generator */
1967     if ((scalar != NULL) && (have_pre_comp)) {
1968         memset(g_secret, 0, sizeof(g_secret));
1969         /* reduce scalar to 0 <= scalar < 2^521 */
1970         if ((BN_num_bits(scalar) > 521) || (BN_is_negative(scalar))) {
1971             /*
1972              * this is an unusual input, and we don't guarantee
1973              * constant-timeness
1974              */
1975             if (!BN_nnmod(tmp_scalar, scalar, &group->order, ctx)) {
1976                 ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1977                 goto err;
1978             }
1979             num_bytes = BN_bn2bin(tmp_scalar, tmp);
1980         } else
1981             num_bytes = BN_bn2bin(scalar, tmp);
1982         flip_endian(g_secret, tmp, num_bytes);
1983         /* do the multiplication with generator precomputation */
1984         batch_mul(x_out, y_out, z_out,
1985                   (const felem_bytearray(*))secrets, num_points,
1986                   g_secret,
1987                   mixed, (const felem(*)[17][3])pre_comp,
1988                   (const felem(*)[3])g_pre_comp);
1989     } else
1990         /* do the multiplication without generator precomputation */
1991         batch_mul(x_out, y_out, z_out,
1992                   (const felem_bytearray(*))secrets, num_points,
1993                   NULL, mixed, (const felem(*)[17][3])pre_comp, NULL);
1994     /* reduce the output to its unique minimal representation */
1995     felem_contract(x_in, x_out);
1996     felem_contract(y_in, y_out);
1997     felem_contract(z_in, z_out);
1998     if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
1999         (!felem_to_BN(z, z_in))) {
2000         ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
2001         goto err;
2002     }
2003     ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
2004 
2005  err:
2006     BN_CTX_end(ctx);
2007     if (generator != NULL)
2008         EC_POINT_free(generator);
2009     if (new_ctx != NULL)
2010         BN_CTX_free(new_ctx);
2011     if (secrets != NULL)
2012         OPENSSL_free(secrets);
2013     if (pre_comp != NULL)
2014         OPENSSL_free(pre_comp);
2015     if (tmp_felems != NULL)
2016         OPENSSL_free(tmp_felems);
2017     return ret;
2018 }
2019 
2020 int ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
2021 {
2022     int ret = 0;
2023     NISTP521_PRE_COMP *pre = NULL;
2024     int i, j;
2025     BN_CTX *new_ctx = NULL;
2026     BIGNUM *x, *y;
2027     EC_POINT *generator = NULL;
2028     felem tmp_felems[16];
2029 
2030     /* throw away old precomputation */
2031     EC_EX_DATA_free_data(&group->extra_data, nistp521_pre_comp_dup,
2032                          nistp521_pre_comp_free,
2033                          nistp521_pre_comp_clear_free);
2034     if (ctx == NULL)
2035         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
2036             return 0;
2037     BN_CTX_start(ctx);
2038     if (((x = BN_CTX_get(ctx)) == NULL) || ((y = BN_CTX_get(ctx)) == NULL))
2039         goto err;
2040     /* get the generator */
2041     if (group->generator == NULL)
2042         goto err;
2043     generator = EC_POINT_new(group);
2044     if (generator == NULL)
2045         goto err;
2046     BN_bin2bn(nistp521_curve_params[3], sizeof(felem_bytearray), x);
2047     BN_bin2bn(nistp521_curve_params[4], sizeof(felem_bytearray), y);
2048     if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
2049         goto err;
2050     if ((pre = nistp521_pre_comp_new()) == NULL)
2051         goto err;
2052     /*
2053      * if the generator is the standard one, use built-in precomputation
2054      */
2055     if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
2056         memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
2057         goto done;
2058     }
2059     if ((!BN_to_felem(pre->g_pre_comp[1][0], &group->generator->X)) ||
2060         (!BN_to_felem(pre->g_pre_comp[1][1], &group->generator->Y)) ||
2061         (!BN_to_felem(pre->g_pre_comp[1][2], &group->generator->Z)))
2062         goto err;
2063     /* compute 2^130*G, 2^260*G, 2^390*G */
2064     for (i = 1; i <= 4; i <<= 1) {
2065         point_double(pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1],
2066                      pre->g_pre_comp[2 * i][2], pre->g_pre_comp[i][0],
2067                      pre->g_pre_comp[i][1], pre->g_pre_comp[i][2]);
2068         for (j = 0; j < 129; ++j) {
2069             point_double(pre->g_pre_comp[2 * i][0],
2070                          pre->g_pre_comp[2 * i][1],
2071                          pre->g_pre_comp[2 * i][2],
2072                          pre->g_pre_comp[2 * i][0],
2073                          pre->g_pre_comp[2 * i][1],
2074                          pre->g_pre_comp[2 * i][2]);
2075         }
2076     }
2077     /* g_pre_comp[0] is the point at infinity */
2078     memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0]));
2079     /* the remaining multiples */
2080     /* 2^130*G + 2^260*G */
2081     point_add(pre->g_pre_comp[6][0], pre->g_pre_comp[6][1],
2082               pre->g_pre_comp[6][2], pre->g_pre_comp[4][0],
2083               pre->g_pre_comp[4][1], pre->g_pre_comp[4][2],
2084               0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2085               pre->g_pre_comp[2][2]);
2086     /* 2^130*G + 2^390*G */
2087     point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1],
2088               pre->g_pre_comp[10][2], pre->g_pre_comp[8][0],
2089               pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2090               0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2091               pre->g_pre_comp[2][2]);
2092     /* 2^260*G + 2^390*G */
2093     point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1],
2094               pre->g_pre_comp[12][2], pre->g_pre_comp[8][0],
2095               pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2096               0, pre->g_pre_comp[4][0], pre->g_pre_comp[4][1],
2097               pre->g_pre_comp[4][2]);
2098     /* 2^130*G + 2^260*G + 2^390*G */
2099     point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1],
2100               pre->g_pre_comp[14][2], pre->g_pre_comp[12][0],
2101               pre->g_pre_comp[12][1], pre->g_pre_comp[12][2],
2102               0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2103               pre->g_pre_comp[2][2]);
2104     for (i = 1; i < 8; ++i) {
2105         /* odd multiples: add G */
2106         point_add(pre->g_pre_comp[2 * i + 1][0],
2107                   pre->g_pre_comp[2 * i + 1][1],
2108                   pre->g_pre_comp[2 * i + 1][2], pre->g_pre_comp[2 * i][0],
2109                   pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2], 0,
2110                   pre->g_pre_comp[1][0], pre->g_pre_comp[1][1],
2111                   pre->g_pre_comp[1][2]);
2112     }
2113     make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);
2114 
2115  done:
2116     if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp521_pre_comp_dup,
2117                              nistp521_pre_comp_free,
2118                              nistp521_pre_comp_clear_free))
2119         goto err;
2120     ret = 1;
2121     pre = NULL;
2122  err:
2123     BN_CTX_end(ctx);
2124     if (generator != NULL)
2125         EC_POINT_free(generator);
2126     if (new_ctx != NULL)
2127         BN_CTX_free(new_ctx);
2128     if (pre)
2129         nistp521_pre_comp_free(pre);
2130     return ret;
2131 }
2132 
2133 int ec_GFp_nistp521_have_precompute_mult(const EC_GROUP *group)
2134 {
2135     if (EC_EX_DATA_get_data(group->extra_data, nistp521_pre_comp_dup,
2136                             nistp521_pre_comp_free,
2137                             nistp521_pre_comp_clear_free)
2138         != NULL)
2139         return 1;
2140     else
2141         return 0;
2142 }
2143 
2144 #else
/*
 * Placeholder definition for the preprocessor branch where the NIST P-521
 * implementation is compiled out (see OPENSSL_NO_EC_NISTP_64_GCC_128 at the
 * top of the file); presumably present so the translation unit is not empty
 * -- confirm.
 */
static void *dummy = &dummy;
2146 #endif
2147