xref: /freebsd/crypto/openssl/crypto/ec/ecp_nistp521.c (revision a3d9bf49b57923118c339642594246ef73872ee8)
1 /*
2  * Copyright 2011-2020 The OpenSSL Project Authors. All Rights Reserved.
3  *
4  * Licensed under the OpenSSL license (the "License").  You may not use
5  * this file except in compliance with the License.  You can obtain a copy
6  * in the file LICENSE in the source distribution or at
7  * https://www.openssl.org/source/license.html
8  */
9 
10 /* Copyright 2011 Google Inc.
11  *
12  * Licensed under the Apache License, Version 2.0 (the "License");
13  *
14  * you may not use this file except in compliance with the License.
15  * You may obtain a copy of the License at
16  *
17  *     http://www.apache.org/licenses/LICENSE-2.0
18  *
19  *  Unless required by applicable law or agreed to in writing, software
20  *  distributed under the License is distributed on an "AS IS" BASIS,
21  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
22  *  See the License for the specific language governing permissions and
23  *  limitations under the License.
24  */
25 
26 /*
27  * A 64-bit implementation of the NIST P-521 elliptic curve point multiplication
28  *
29  * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
30  * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
31  * work which got its smarts from Daniel J. Bernstein's work on the same.
32  */
33 
34 #include <openssl/e_os2.h>
35 #ifdef OPENSSL_NO_EC_NISTP_64_GCC_128
36 NON_EMPTY_TRANSLATION_UNIT
37 #else
38 
39 # include <string.h>
40 # include <openssl/err.h>
41 # include "ec_local.h"
42 
43 # if defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16
44   /* even with gcc, the typedef won't work for 32-bit platforms */
45 typedef __uint128_t uint128_t;  /* nonstandard; implemented by gcc on 64-bit
46                                  * platforms */
47 # else
48 #  error "Your compiler doesn't appear to support 128-bit integer types"
49 # endif
50 
51 typedef uint8_t u8;
52 typedef uint64_t u64;
53 
54 /*
55  * The underlying field. P521 operates over GF(2^521-1). We can serialise an
56  * element of this field into 66 bytes where the most significant byte
57  * contains only a single bit. We call this an felem_bytearray.
58  */
59 
60 typedef u8 felem_bytearray[66];
61 
62 /*
63  * These are the parameters of P521, taken from FIPS 186-3, section D.1.2.5.
64  * These values are big-endian.
65  */
66 static const felem_bytearray nistp521_curve_params[5] = {
67     {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* p */
68      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
69      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
70      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
71      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
72      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
73      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
74      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
75      0xff, 0xff},
76     {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* a = -3 */
77      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
78      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
79      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
80      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
81      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
82      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
83      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
84      0xff, 0xfc},
85     {0x00, 0x51, 0x95, 0x3e, 0xb9, 0x61, 0x8e, 0x1c, /* b */
86      0x9a, 0x1f, 0x92, 0x9a, 0x21, 0xa0, 0xb6, 0x85,
87      0x40, 0xee, 0xa2, 0xda, 0x72, 0x5b, 0x99, 0xb3,
88      0x15, 0xf3, 0xb8, 0xb4, 0x89, 0x91, 0x8e, 0xf1,
89      0x09, 0xe1, 0x56, 0x19, 0x39, 0x51, 0xec, 0x7e,
90      0x93, 0x7b, 0x16, 0x52, 0xc0, 0xbd, 0x3b, 0xb1,
91      0xbf, 0x07, 0x35, 0x73, 0xdf, 0x88, 0x3d, 0x2c,
92      0x34, 0xf1, 0xef, 0x45, 0x1f, 0xd4, 0x6b, 0x50,
93      0x3f, 0x00},
94     {0x00, 0xc6, 0x85, 0x8e, 0x06, 0xb7, 0x04, 0x04, /* x */
95      0xe9, 0xcd, 0x9e, 0x3e, 0xcb, 0x66, 0x23, 0x95,
96      0xb4, 0x42, 0x9c, 0x64, 0x81, 0x39, 0x05, 0x3f,
97      0xb5, 0x21, 0xf8, 0x28, 0xaf, 0x60, 0x6b, 0x4d,
98      0x3d, 0xba, 0xa1, 0x4b, 0x5e, 0x77, 0xef, 0xe7,
99      0x59, 0x28, 0xfe, 0x1d, 0xc1, 0x27, 0xa2, 0xff,
100      0xa8, 0xde, 0x33, 0x48, 0xb3, 0xc1, 0x85, 0x6a,
101      0x42, 0x9b, 0xf9, 0x7e, 0x7e, 0x31, 0xc2, 0xe5,
102      0xbd, 0x66},
103     {0x01, 0x18, 0x39, 0x29, 0x6a, 0x78, 0x9a, 0x3b, /* y */
104      0xc0, 0x04, 0x5c, 0x8a, 0x5f, 0xb4, 0x2c, 0x7d,
105      0x1b, 0xd9, 0x98, 0xf5, 0x44, 0x49, 0x57, 0x9b,
106      0x44, 0x68, 0x17, 0xaf, 0xbd, 0x17, 0x27, 0x3e,
107      0x66, 0x2c, 0x97, 0xee, 0x72, 0x99, 0x5e, 0xf4,
108      0x26, 0x40, 0xc5, 0x50, 0xb9, 0x01, 0x3f, 0xad,
109      0x07, 0x61, 0x35, 0x3c, 0x70, 0x86, 0xa2, 0x72,
110      0xc2, 0x40, 0x88, 0xbe, 0x94, 0x76, 0x9f, 0xd1,
111      0x66, 0x50}
112 };
113 
114 /*-
115  * The representation of field elements.
116  * ------------------------------------
117  *
118  * We represent field elements with nine values. These values are either 64 or
119  * 128 bits and the field element represented is:
120  *   v[0]*2^0 + v[1]*2^58 + v[2]*2^116 + ... + v[8]*2^464  (mod p)
121  * Each of the nine values is called a 'limb'. Since the limbs are spaced only
122  * 58 bits apart, but are greater than 58 bits in length, the most significant
123  * bits of each limb overlap with the least significant bits of the next.
124  *
125  * A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a
126  * 'largefelem' */
127 
128 # define NLIMBS 9
129 
130 typedef uint64_t limb;
131 typedef limb limb_aX __attribute((__aligned__(1)));
132 typedef limb felem[NLIMBS];
133 typedef uint128_t largefelem[NLIMBS];
134 
135 static const limb bottom57bits = 0x1ffffffffffffff;
136 static const limb bottom58bits = 0x3ffffffffffffff;
137 
138 /*
139  * bin66_to_felem takes a little-endian byte array and converts it into felem
140  * form. This assumes that the CPU is little-endian.
141  */
142 static void bin66_to_felem(felem out, const u8 in[66])
143 {
144     out[0] = (*((limb *) & in[0])) & bottom58bits;
145     out[1] = (*((limb_aX *) & in[7]) >> 2) & bottom58bits;
146     out[2] = (*((limb_aX *) & in[14]) >> 4) & bottom58bits;
147     out[3] = (*((limb_aX *) & in[21]) >> 6) & bottom58bits;
148     out[4] = (*((limb_aX *) & in[29])) & bottom58bits;
149     out[5] = (*((limb_aX *) & in[36]) >> 2) & bottom58bits;
150     out[6] = (*((limb_aX *) & in[43]) >> 4) & bottom58bits;
151     out[7] = (*((limb_aX *) & in[50]) >> 6) & bottom58bits;
152     out[8] = (*((limb_aX *) & in[58])) & bottom57bits;
153 }
154 
155 /*
156  * felem_to_bin66 takes an felem and serialises into a little endian, 66 byte
157  * array. This assumes that the CPU is little-endian.
158  */
159 static void felem_to_bin66(u8 out[66], const felem in)
160 {
161     memset(out, 0, 66);
162     (*((limb *) & out[0])) = in[0];
163     (*((limb_aX *) & out[7])) |= in[1] << 2;
164     (*((limb_aX *) & out[14])) |= in[2] << 4;
165     (*((limb_aX *) & out[21])) |= in[3] << 6;
166     (*((limb_aX *) & out[29])) = in[4];
167     (*((limb_aX *) & out[36])) |= in[5] << 2;
168     (*((limb_aX *) & out[43])) |= in[6] << 4;
169     (*((limb_aX *) & out[50])) |= in[7] << 6;
170     (*((limb_aX *) & out[58])) = in[8];
171 }
172 
173 /* BN_to_felem converts an OpenSSL BIGNUM into an felem */
174 static int BN_to_felem(felem out, const BIGNUM *bn)
175 {
176     felem_bytearray b_out;
177     int num_bytes;
178 
179     if (BN_is_negative(bn)) {
180         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
181         return 0;
182     }
183     num_bytes = BN_bn2lebinpad(bn, b_out, sizeof(b_out));
184     if (num_bytes < 0) {
185         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
186         return 0;
187     }
188     bin66_to_felem(out, b_out);
189     return 1;
190 }
191 
192 /* felem_to_BN converts an felem into an OpenSSL BIGNUM */
193 static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
194 {
195     felem_bytearray b_out;
196     felem_to_bin66(b_out, in);
197     return BN_lebin2bn(b_out, sizeof(b_out), out);
198 }
199 
200 /*-
201  * Field operations
202  * ----------------
203  */
204 
205 static void felem_one(felem out)
206 {
207     out[0] = 1;
208     out[1] = 0;
209     out[2] = 0;
210     out[3] = 0;
211     out[4] = 0;
212     out[5] = 0;
213     out[6] = 0;
214     out[7] = 0;
215     out[8] = 0;
216 }
217 
218 static void felem_assign(felem out, const felem in)
219 {
220     out[0] = in[0];
221     out[1] = in[1];
222     out[2] = in[2];
223     out[3] = in[3];
224     out[4] = in[4];
225     out[5] = in[5];
226     out[6] = in[6];
227     out[7] = in[7];
228     out[8] = in[8];
229 }
230 
231 /* felem_sum64 sets out = out + in. */
232 static void felem_sum64(felem out, const felem in)
233 {
234     out[0] += in[0];
235     out[1] += in[1];
236     out[2] += in[2];
237     out[3] += in[3];
238     out[4] += in[4];
239     out[5] += in[5];
240     out[6] += in[6];
241     out[7] += in[7];
242     out[8] += in[8];
243 }
244 
245 /* felem_scalar sets out = in * scalar */
246 static void felem_scalar(felem out, const felem in, limb scalar)
247 {
248     out[0] = in[0] * scalar;
249     out[1] = in[1] * scalar;
250     out[2] = in[2] * scalar;
251     out[3] = in[3] * scalar;
252     out[4] = in[4] * scalar;
253     out[5] = in[5] * scalar;
254     out[6] = in[6] * scalar;
255     out[7] = in[7] * scalar;
256     out[8] = in[8] * scalar;
257 }
258 
259 /* felem_scalar64 sets out = out * scalar */
260 static void felem_scalar64(felem out, limb scalar)
261 {
262     out[0] *= scalar;
263     out[1] *= scalar;
264     out[2] *= scalar;
265     out[3] *= scalar;
266     out[4] *= scalar;
267     out[5] *= scalar;
268     out[6] *= scalar;
269     out[7] *= scalar;
270     out[8] *= scalar;
271 }
272 
273 /* felem_scalar128 sets out = out * scalar */
274 static void felem_scalar128(largefelem out, limb scalar)
275 {
276     out[0] *= scalar;
277     out[1] *= scalar;
278     out[2] *= scalar;
279     out[3] *= scalar;
280     out[4] *= scalar;
281     out[5] *= scalar;
282     out[6] *= scalar;
283     out[7] *= scalar;
284     out[8] *= scalar;
285 }
286 
287 /*-
288  * felem_neg sets |out| to |-in|
289  * On entry:
290  *   in[i] < 2^59 + 2^14
291  * On exit:
292  *   out[i] < 2^62
293  */
294 static void felem_neg(felem out, const felem in)
295 {
296     /* In order to prevent underflow, we subtract from 0 mod p. */
297     static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
298     static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
299 
300     out[0] = two62m3 - in[0];
301     out[1] = two62m2 - in[1];
302     out[2] = two62m2 - in[2];
303     out[3] = two62m2 - in[3];
304     out[4] = two62m2 - in[4];
305     out[5] = two62m2 - in[5];
306     out[6] = two62m2 - in[6];
307     out[7] = two62m2 - in[7];
308     out[8] = two62m2 - in[8];
309 }
310 
311 /*-
312  * felem_diff64 subtracts |in| from |out|
313  * On entry:
314  *   in[i] < 2^59 + 2^14
315  * On exit:
316  *   out[i] < out[i] + 2^62
317  */
318 static void felem_diff64(felem out, const felem in)
319 {
320     /*
321      * In order to prevent underflow, we add 0 mod p before subtracting.
322      */
323     static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
324     static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
325 
326     out[0] += two62m3 - in[0];
327     out[1] += two62m2 - in[1];
328     out[2] += two62m2 - in[2];
329     out[3] += two62m2 - in[3];
330     out[4] += two62m2 - in[4];
331     out[5] += two62m2 - in[5];
332     out[6] += two62m2 - in[6];
333     out[7] += two62m2 - in[7];
334     out[8] += two62m2 - in[8];
335 }
336 
337 /*-
338  * felem_diff_128_64 subtracts |in| from |out|
339  * On entry:
340  *   in[i] < 2^62 + 2^17
341  * On exit:
342  *   out[i] < out[i] + 2^63
343  */
344 static void felem_diff_128_64(largefelem out, const felem in)
345 {
346     /*
347      * In order to prevent underflow, we add 64p mod p (which is equivalent
348      * to 0 mod p) before subtracting. p is 2^521 - 1, i.e. in binary a 521
349      * digit number with all bits set to 1. See "The representation of field
350      * elements" comment above for a description of how limbs are used to
351      * represent a number. 64p is represented with 8 limbs containing a number
352      * with 58 bits set and one limb with a number with 57 bits set.
353      */
354     static const limb two63m6 = (((limb) 1) << 63) - (((limb) 1) << 6);
355     static const limb two63m5 = (((limb) 1) << 63) - (((limb) 1) << 5);
356 
357     out[0] += two63m6 - in[0];
358     out[1] += two63m5 - in[1];
359     out[2] += two63m5 - in[2];
360     out[3] += two63m5 - in[3];
361     out[4] += two63m5 - in[4];
362     out[5] += two63m5 - in[5];
363     out[6] += two63m5 - in[6];
364     out[7] += two63m5 - in[7];
365     out[8] += two63m5 - in[8];
366 }
367 
368 /*-
369  * felem_diff_128_64 subtracts |in| from |out|
370  * On entry:
371  *   in[i] < 2^126
372  * On exit:
373  *   out[i] < out[i] + 2^127 - 2^69
374  */
375 static void felem_diff128(largefelem out, const largefelem in)
376 {
377     /*
378      * In order to prevent underflow, we add 0 mod p before subtracting.
379      */
380     static const uint128_t two127m70 =
381         (((uint128_t) 1) << 127) - (((uint128_t) 1) << 70);
382     static const uint128_t two127m69 =
383         (((uint128_t) 1) << 127) - (((uint128_t) 1) << 69);
384 
385     out[0] += (two127m70 - in[0]);
386     out[1] += (two127m69 - in[1]);
387     out[2] += (two127m69 - in[2]);
388     out[3] += (two127m69 - in[3]);
389     out[4] += (two127m69 - in[4]);
390     out[5] += (two127m69 - in[5]);
391     out[6] += (two127m69 - in[6]);
392     out[7] += (two127m69 - in[7]);
393     out[8] += (two127m69 - in[8]);
394 }
395 
396 /*-
397  * felem_square sets |out| = |in|^2
398  * On entry:
399  *   in[i] < 2^62
400  * On exit:
401  *   out[i] < 17 * max(in[i]) * max(in[i])
402  */
403 static void felem_square(largefelem out, const felem in)
404 {
405     felem inx2, inx4;
406     felem_scalar(inx2, in, 2);
407     felem_scalar(inx4, in, 4);
408 
409     /*-
410      * We have many cases were we want to do
411      *   in[x] * in[y] +
412      *   in[y] * in[x]
413      * This is obviously just
414      *   2 * in[x] * in[y]
415      * However, rather than do the doubling on the 128 bit result, we
416      * double one of the inputs to the multiplication by reading from
417      * |inx2|
418      */
419 
420     out[0] = ((uint128_t) in[0]) * in[0];
421     out[1] = ((uint128_t) in[0]) * inx2[1];
422     out[2] = ((uint128_t) in[0]) * inx2[2] + ((uint128_t) in[1]) * in[1];
423     out[3] = ((uint128_t) in[0]) * inx2[3] + ((uint128_t) in[1]) * inx2[2];
424     out[4] = ((uint128_t) in[0]) * inx2[4] +
425              ((uint128_t) in[1]) * inx2[3] + ((uint128_t) in[2]) * in[2];
426     out[5] = ((uint128_t) in[0]) * inx2[5] +
427              ((uint128_t) in[1]) * inx2[4] + ((uint128_t) in[2]) * inx2[3];
428     out[6] = ((uint128_t) in[0]) * inx2[6] +
429              ((uint128_t) in[1]) * inx2[5] +
430              ((uint128_t) in[2]) * inx2[4] + ((uint128_t) in[3]) * in[3];
431     out[7] = ((uint128_t) in[0]) * inx2[7] +
432              ((uint128_t) in[1]) * inx2[6] +
433              ((uint128_t) in[2]) * inx2[5] + ((uint128_t) in[3]) * inx2[4];
434     out[8] = ((uint128_t) in[0]) * inx2[8] +
435              ((uint128_t) in[1]) * inx2[7] +
436              ((uint128_t) in[2]) * inx2[6] +
437              ((uint128_t) in[3]) * inx2[5] + ((uint128_t) in[4]) * in[4];
438 
439     /*
440      * The remaining limbs fall above 2^521, with the first falling at 2^522.
441      * They correspond to locations one bit up from the limbs produced above
442      * so we would have to multiply by two to align them. Again, rather than
443      * operate on the 128-bit result, we double one of the inputs to the
444      * multiplication. If we want to double for both this reason, and the
445      * reason above, then we end up multiplying by four.
446      */
447 
448     /* 9 */
449     out[0] += ((uint128_t) in[1]) * inx4[8] +
450               ((uint128_t) in[2]) * inx4[7] +
451               ((uint128_t) in[3]) * inx4[6] + ((uint128_t) in[4]) * inx4[5];
452 
453     /* 10 */
454     out[1] += ((uint128_t) in[2]) * inx4[8] +
455               ((uint128_t) in[3]) * inx4[7] +
456               ((uint128_t) in[4]) * inx4[6] + ((uint128_t) in[5]) * inx2[5];
457 
458     /* 11 */
459     out[2] += ((uint128_t) in[3]) * inx4[8] +
460               ((uint128_t) in[4]) * inx4[7] + ((uint128_t) in[5]) * inx4[6];
461 
462     /* 12 */
463     out[3] += ((uint128_t) in[4]) * inx4[8] +
464               ((uint128_t) in[5]) * inx4[7] + ((uint128_t) in[6]) * inx2[6];
465 
466     /* 13 */
467     out[4] += ((uint128_t) in[5]) * inx4[8] + ((uint128_t) in[6]) * inx4[7];
468 
469     /* 14 */
470     out[5] += ((uint128_t) in[6]) * inx4[8] + ((uint128_t) in[7]) * inx2[7];
471 
472     /* 15 */
473     out[6] += ((uint128_t) in[7]) * inx4[8];
474 
475     /* 16 */
476     out[7] += ((uint128_t) in[8]) * inx2[8];
477 }
478 
479 /*-
480  * felem_mul sets |out| = |in1| * |in2|
481  * On entry:
482  *   in1[i] < 2^64
483  *   in2[i] < 2^63
484  * On exit:
485  *   out[i] < 17 * max(in1[i]) * max(in2[i])
486  */
487 static void felem_mul(largefelem out, const felem in1, const felem in2)
488 {
489     felem in2x2;
490     felem_scalar(in2x2, in2, 2);
491 
492     out[0] = ((uint128_t) in1[0]) * in2[0];
493 
494     out[1] = ((uint128_t) in1[0]) * in2[1] +
495              ((uint128_t) in1[1]) * in2[0];
496 
497     out[2] = ((uint128_t) in1[0]) * in2[2] +
498              ((uint128_t) in1[1]) * in2[1] +
499              ((uint128_t) in1[2]) * in2[0];
500 
501     out[3] = ((uint128_t) in1[0]) * in2[3] +
502              ((uint128_t) in1[1]) * in2[2] +
503              ((uint128_t) in1[2]) * in2[1] +
504              ((uint128_t) in1[3]) * in2[0];
505 
506     out[4] = ((uint128_t) in1[0]) * in2[4] +
507              ((uint128_t) in1[1]) * in2[3] +
508              ((uint128_t) in1[2]) * in2[2] +
509              ((uint128_t) in1[3]) * in2[1] +
510              ((uint128_t) in1[4]) * in2[0];
511 
512     out[5] = ((uint128_t) in1[0]) * in2[5] +
513              ((uint128_t) in1[1]) * in2[4] +
514              ((uint128_t) in1[2]) * in2[3] +
515              ((uint128_t) in1[3]) * in2[2] +
516              ((uint128_t) in1[4]) * in2[1] +
517              ((uint128_t) in1[5]) * in2[0];
518 
519     out[6] = ((uint128_t) in1[0]) * in2[6] +
520              ((uint128_t) in1[1]) * in2[5] +
521              ((uint128_t) in1[2]) * in2[4] +
522              ((uint128_t) in1[3]) * in2[3] +
523              ((uint128_t) in1[4]) * in2[2] +
524              ((uint128_t) in1[5]) * in2[1] +
525              ((uint128_t) in1[6]) * in2[0];
526 
527     out[7] = ((uint128_t) in1[0]) * in2[7] +
528              ((uint128_t) in1[1]) * in2[6] +
529              ((uint128_t) in1[2]) * in2[5] +
530              ((uint128_t) in1[3]) * in2[4] +
531              ((uint128_t) in1[4]) * in2[3] +
532              ((uint128_t) in1[5]) * in2[2] +
533              ((uint128_t) in1[6]) * in2[1] +
534              ((uint128_t) in1[7]) * in2[0];
535 
536     out[8] = ((uint128_t) in1[0]) * in2[8] +
537              ((uint128_t) in1[1]) * in2[7] +
538              ((uint128_t) in1[2]) * in2[6] +
539              ((uint128_t) in1[3]) * in2[5] +
540              ((uint128_t) in1[4]) * in2[4] +
541              ((uint128_t) in1[5]) * in2[3] +
542              ((uint128_t) in1[6]) * in2[2] +
543              ((uint128_t) in1[7]) * in2[1] +
544              ((uint128_t) in1[8]) * in2[0];
545 
546     /* See comment in felem_square about the use of in2x2 here */
547 
548     out[0] += ((uint128_t) in1[1]) * in2x2[8] +
549               ((uint128_t) in1[2]) * in2x2[7] +
550               ((uint128_t) in1[3]) * in2x2[6] +
551               ((uint128_t) in1[4]) * in2x2[5] +
552               ((uint128_t) in1[5]) * in2x2[4] +
553               ((uint128_t) in1[6]) * in2x2[3] +
554               ((uint128_t) in1[7]) * in2x2[2] +
555               ((uint128_t) in1[8]) * in2x2[1];
556 
557     out[1] += ((uint128_t) in1[2]) * in2x2[8] +
558               ((uint128_t) in1[3]) * in2x2[7] +
559               ((uint128_t) in1[4]) * in2x2[6] +
560               ((uint128_t) in1[5]) * in2x2[5] +
561               ((uint128_t) in1[6]) * in2x2[4] +
562               ((uint128_t) in1[7]) * in2x2[3] +
563               ((uint128_t) in1[8]) * in2x2[2];
564 
565     out[2] += ((uint128_t) in1[3]) * in2x2[8] +
566               ((uint128_t) in1[4]) * in2x2[7] +
567               ((uint128_t) in1[5]) * in2x2[6] +
568               ((uint128_t) in1[6]) * in2x2[5] +
569               ((uint128_t) in1[7]) * in2x2[4] +
570               ((uint128_t) in1[8]) * in2x2[3];
571 
572     out[3] += ((uint128_t) in1[4]) * in2x2[8] +
573               ((uint128_t) in1[5]) * in2x2[7] +
574               ((uint128_t) in1[6]) * in2x2[6] +
575               ((uint128_t) in1[7]) * in2x2[5] +
576               ((uint128_t) in1[8]) * in2x2[4];
577 
578     out[4] += ((uint128_t) in1[5]) * in2x2[8] +
579               ((uint128_t) in1[6]) * in2x2[7] +
580               ((uint128_t) in1[7]) * in2x2[6] +
581               ((uint128_t) in1[8]) * in2x2[5];
582 
583     out[5] += ((uint128_t) in1[6]) * in2x2[8] +
584               ((uint128_t) in1[7]) * in2x2[7] +
585               ((uint128_t) in1[8]) * in2x2[6];
586 
587     out[6] += ((uint128_t) in1[7]) * in2x2[8] +
588               ((uint128_t) in1[8]) * in2x2[7];
589 
590     out[7] += ((uint128_t) in1[8]) * in2x2[8];
591 }
592 
593 static const limb bottom52bits = 0xfffffffffffff;
594 
595 /*-
596  * felem_reduce converts a largefelem to an felem.
597  * On entry:
598  *   in[i] < 2^128
599  * On exit:
600  *   out[i] < 2^59 + 2^14
601  */
602 static void felem_reduce(felem out, const largefelem in)
603 {
604     u64 overflow1, overflow2;
605 
606     out[0] = ((limb) in[0]) & bottom58bits;
607     out[1] = ((limb) in[1]) & bottom58bits;
608     out[2] = ((limb) in[2]) & bottom58bits;
609     out[3] = ((limb) in[3]) & bottom58bits;
610     out[4] = ((limb) in[4]) & bottom58bits;
611     out[5] = ((limb) in[5]) & bottom58bits;
612     out[6] = ((limb) in[6]) & bottom58bits;
613     out[7] = ((limb) in[7]) & bottom58bits;
614     out[8] = ((limb) in[8]) & bottom58bits;
615 
616     /* out[i] < 2^58 */
617 
618     out[1] += ((limb) in[0]) >> 58;
619     out[1] += (((limb) (in[0] >> 64)) & bottom52bits) << 6;
620     /*-
621      * out[1] < 2^58 + 2^6 + 2^58
622      *        = 2^59 + 2^6
623      */
624     out[2] += ((limb) (in[0] >> 64)) >> 52;
625 
626     out[2] += ((limb) in[1]) >> 58;
627     out[2] += (((limb) (in[1] >> 64)) & bottom52bits) << 6;
628     out[3] += ((limb) (in[1] >> 64)) >> 52;
629 
630     out[3] += ((limb) in[2]) >> 58;
631     out[3] += (((limb) (in[2] >> 64)) & bottom52bits) << 6;
632     out[4] += ((limb) (in[2] >> 64)) >> 52;
633 
634     out[4] += ((limb) in[3]) >> 58;
635     out[4] += (((limb) (in[3] >> 64)) & bottom52bits) << 6;
636     out[5] += ((limb) (in[3] >> 64)) >> 52;
637 
638     out[5] += ((limb) in[4]) >> 58;
639     out[5] += (((limb) (in[4] >> 64)) & bottom52bits) << 6;
640     out[6] += ((limb) (in[4] >> 64)) >> 52;
641 
642     out[6] += ((limb) in[5]) >> 58;
643     out[6] += (((limb) (in[5] >> 64)) & bottom52bits) << 6;
644     out[7] += ((limb) (in[5] >> 64)) >> 52;
645 
646     out[7] += ((limb) in[6]) >> 58;
647     out[7] += (((limb) (in[6] >> 64)) & bottom52bits) << 6;
648     out[8] += ((limb) (in[6] >> 64)) >> 52;
649 
650     out[8] += ((limb) in[7]) >> 58;
651     out[8] += (((limb) (in[7] >> 64)) & bottom52bits) << 6;
652     /*-
653      * out[x > 1] < 2^58 + 2^6 + 2^58 + 2^12
654      *            < 2^59 + 2^13
655      */
656     overflow1 = ((limb) (in[7] >> 64)) >> 52;
657 
658     overflow1 += ((limb) in[8]) >> 58;
659     overflow1 += (((limb) (in[8] >> 64)) & bottom52bits) << 6;
660     overflow2 = ((limb) (in[8] >> 64)) >> 52;
661 
662     overflow1 <<= 1;            /* overflow1 < 2^13 + 2^7 + 2^59 */
663     overflow2 <<= 1;            /* overflow2 < 2^13 */
664 
665     out[0] += overflow1;        /* out[0] < 2^60 */
666     out[1] += overflow2;        /* out[1] < 2^59 + 2^6 + 2^13 */
667 
668     out[1] += out[0] >> 58;
669     out[0] &= bottom58bits;
670     /*-
671      * out[0] < 2^58
672      * out[1] < 2^59 + 2^6 + 2^13 + 2^2
673      *        < 2^59 + 2^14
674      */
675 }
676 
677 static void felem_square_reduce(felem out, const felem in)
678 {
679     largefelem tmp;
680     felem_square(tmp, in);
681     felem_reduce(out, tmp);
682 }
683 
684 static void felem_mul_reduce(felem out, const felem in1, const felem in2)
685 {
686     largefelem tmp;
687     felem_mul(tmp, in1, in2);
688     felem_reduce(out, tmp);
689 }
690 
691 /*-
692  * felem_inv calculates |out| = |in|^{-1}
693  *
694  * Based on Fermat's Little Theorem:
695  *   a^p = a (mod p)
696  *   a^{p-1} = 1 (mod p)
697  *   a^{p-2} = a^{-1} (mod p)
698  */
699 static void felem_inv(felem out, const felem in)
700 {
701     felem ftmp, ftmp2, ftmp3, ftmp4;
702     largefelem tmp;
703     unsigned i;
704 
705     felem_square(tmp, in);
706     felem_reduce(ftmp, tmp);    /* 2^1 */
707     felem_mul(tmp, in, ftmp);
708     felem_reduce(ftmp, tmp);    /* 2^2 - 2^0 */
709     felem_assign(ftmp2, ftmp);
710     felem_square(tmp, ftmp);
711     felem_reduce(ftmp, tmp);    /* 2^3 - 2^1 */
712     felem_mul(tmp, in, ftmp);
713     felem_reduce(ftmp, tmp);    /* 2^3 - 2^0 */
714     felem_square(tmp, ftmp);
715     felem_reduce(ftmp, tmp);    /* 2^4 - 2^1 */
716 
717     felem_square(tmp, ftmp2);
718     felem_reduce(ftmp3, tmp);   /* 2^3 - 2^1 */
719     felem_square(tmp, ftmp3);
720     felem_reduce(ftmp3, tmp);   /* 2^4 - 2^2 */
721     felem_mul(tmp, ftmp3, ftmp2);
722     felem_reduce(ftmp3, tmp);   /* 2^4 - 2^0 */
723 
724     felem_assign(ftmp2, ftmp3);
725     felem_square(tmp, ftmp3);
726     felem_reduce(ftmp3, tmp);   /* 2^5 - 2^1 */
727     felem_square(tmp, ftmp3);
728     felem_reduce(ftmp3, tmp);   /* 2^6 - 2^2 */
729     felem_square(tmp, ftmp3);
730     felem_reduce(ftmp3, tmp);   /* 2^7 - 2^3 */
731     felem_square(tmp, ftmp3);
732     felem_reduce(ftmp3, tmp);   /* 2^8 - 2^4 */
733     felem_assign(ftmp4, ftmp3);
734     felem_mul(tmp, ftmp3, ftmp);
735     felem_reduce(ftmp4, tmp);   /* 2^8 - 2^1 */
736     felem_square(tmp, ftmp4);
737     felem_reduce(ftmp4, tmp);   /* 2^9 - 2^2 */
738     felem_mul(tmp, ftmp3, ftmp2);
739     felem_reduce(ftmp3, tmp);   /* 2^8 - 2^0 */
740     felem_assign(ftmp2, ftmp3);
741 
742     for (i = 0; i < 8; i++) {
743         felem_square(tmp, ftmp3);
744         felem_reduce(ftmp3, tmp); /* 2^16 - 2^8 */
745     }
746     felem_mul(tmp, ftmp3, ftmp2);
747     felem_reduce(ftmp3, tmp);   /* 2^16 - 2^0 */
748     felem_assign(ftmp2, ftmp3);
749 
750     for (i = 0; i < 16; i++) {
751         felem_square(tmp, ftmp3);
752         felem_reduce(ftmp3, tmp); /* 2^32 - 2^16 */
753     }
754     felem_mul(tmp, ftmp3, ftmp2);
755     felem_reduce(ftmp3, tmp);   /* 2^32 - 2^0 */
756     felem_assign(ftmp2, ftmp3);
757 
758     for (i = 0; i < 32; i++) {
759         felem_square(tmp, ftmp3);
760         felem_reduce(ftmp3, tmp); /* 2^64 - 2^32 */
761     }
762     felem_mul(tmp, ftmp3, ftmp2);
763     felem_reduce(ftmp3, tmp);   /* 2^64 - 2^0 */
764     felem_assign(ftmp2, ftmp3);
765 
766     for (i = 0; i < 64; i++) {
767         felem_square(tmp, ftmp3);
768         felem_reduce(ftmp3, tmp); /* 2^128 - 2^64 */
769     }
770     felem_mul(tmp, ftmp3, ftmp2);
771     felem_reduce(ftmp3, tmp);   /* 2^128 - 2^0 */
772     felem_assign(ftmp2, ftmp3);
773 
774     for (i = 0; i < 128; i++) {
775         felem_square(tmp, ftmp3);
776         felem_reduce(ftmp3, tmp); /* 2^256 - 2^128 */
777     }
778     felem_mul(tmp, ftmp3, ftmp2);
779     felem_reduce(ftmp3, tmp);   /* 2^256 - 2^0 */
780     felem_assign(ftmp2, ftmp3);
781 
782     for (i = 0; i < 256; i++) {
783         felem_square(tmp, ftmp3);
784         felem_reduce(ftmp3, tmp); /* 2^512 - 2^256 */
785     }
786     felem_mul(tmp, ftmp3, ftmp2);
787     felem_reduce(ftmp3, tmp);   /* 2^512 - 2^0 */
788 
789     for (i = 0; i < 9; i++) {
790         felem_square(tmp, ftmp3);
791         felem_reduce(ftmp3, tmp); /* 2^521 - 2^9 */
792     }
793     felem_mul(tmp, ftmp3, ftmp4);
794     felem_reduce(ftmp3, tmp);   /* 2^512 - 2^2 */
795     felem_mul(tmp, ftmp3, in);
796     felem_reduce(out, tmp);     /* 2^512 - 3 */
797 }
798 
799 /* This is 2^521-1, expressed as an felem */
800 static const felem kPrime = {
801     0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
802     0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
803     0x03ffffffffffffff, 0x03ffffffffffffff, 0x01ffffffffffffff
804 };
805 
806 /*-
807  * felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
808  * otherwise.
809  * On entry:
810  *   in[i] < 2^59 + 2^14
811  */
812 static limb felem_is_zero(const felem in)
813 {
814     felem ftmp;
815     limb is_zero, is_p;
816     felem_assign(ftmp, in);
817 
818     ftmp[0] += ftmp[8] >> 57;
819     ftmp[8] &= bottom57bits;
820     /* ftmp[8] < 2^57 */
821     ftmp[1] += ftmp[0] >> 58;
822     ftmp[0] &= bottom58bits;
823     ftmp[2] += ftmp[1] >> 58;
824     ftmp[1] &= bottom58bits;
825     ftmp[3] += ftmp[2] >> 58;
826     ftmp[2] &= bottom58bits;
827     ftmp[4] += ftmp[3] >> 58;
828     ftmp[3] &= bottom58bits;
829     ftmp[5] += ftmp[4] >> 58;
830     ftmp[4] &= bottom58bits;
831     ftmp[6] += ftmp[5] >> 58;
832     ftmp[5] &= bottom58bits;
833     ftmp[7] += ftmp[6] >> 58;
834     ftmp[6] &= bottom58bits;
835     ftmp[8] += ftmp[7] >> 58;
836     ftmp[7] &= bottom58bits;
837     /* ftmp[8] < 2^57 + 4 */
838 
839     /*
840      * The ninth limb of 2*(2^521-1) is 0x03ffffffffffffff, which is greater
841      * than our bound for ftmp[8]. Therefore we only have to check if the
842      * zero is zero or 2^521-1.
843      */
844 
845     is_zero = 0;
846     is_zero |= ftmp[0];
847     is_zero |= ftmp[1];
848     is_zero |= ftmp[2];
849     is_zero |= ftmp[3];
850     is_zero |= ftmp[4];
851     is_zero |= ftmp[5];
852     is_zero |= ftmp[6];
853     is_zero |= ftmp[7];
854     is_zero |= ftmp[8];
855 
856     is_zero--;
857     /*
858      * We know that ftmp[i] < 2^63, therefore the only way that the top bit
859      * can be set is if is_zero was 0 before the decrement.
860      */
861     is_zero = 0 - (is_zero >> 63);
862 
863     is_p = ftmp[0] ^ kPrime[0];
864     is_p |= ftmp[1] ^ kPrime[1];
865     is_p |= ftmp[2] ^ kPrime[2];
866     is_p |= ftmp[3] ^ kPrime[3];
867     is_p |= ftmp[4] ^ kPrime[4];
868     is_p |= ftmp[5] ^ kPrime[5];
869     is_p |= ftmp[6] ^ kPrime[6];
870     is_p |= ftmp[7] ^ kPrime[7];
871     is_p |= ftmp[8] ^ kPrime[8];
872 
873     is_p--;
874     is_p = 0 - (is_p >> 63);
875 
876     is_zero |= is_p;
877     return is_zero;
878 }
879 
880 static int felem_is_zero_int(const void *in)
881 {
882     return (int)(felem_is_zero(in) & ((limb) 1));
883 }
884 
885 /*-
886  * felem_contract converts |in| to its unique, minimal representation.
887  * On entry:
888  *   in[i] < 2^59 + 2^14
889  */
890 static void felem_contract(felem out, const felem in)
891 {
892     limb is_p, is_greater, sign;
893     static const limb two58 = ((limb) 1) << 58;
894 
895     felem_assign(out, in);
896 
897     out[0] += out[8] >> 57;
898     out[8] &= bottom57bits;
899     /* out[8] < 2^57 */
900     out[1] += out[0] >> 58;
901     out[0] &= bottom58bits;
902     out[2] += out[1] >> 58;
903     out[1] &= bottom58bits;
904     out[3] += out[2] >> 58;
905     out[2] &= bottom58bits;
906     out[4] += out[3] >> 58;
907     out[3] &= bottom58bits;
908     out[5] += out[4] >> 58;
909     out[4] &= bottom58bits;
910     out[6] += out[5] >> 58;
911     out[5] &= bottom58bits;
912     out[7] += out[6] >> 58;
913     out[6] &= bottom58bits;
914     out[8] += out[7] >> 58;
915     out[7] &= bottom58bits;
916     /* out[8] < 2^57 + 4 */
917 
918     /*
919      * If the value is greater than 2^521-1 then we have to subtract 2^521-1
920      * out. See the comments in felem_is_zero regarding why we don't test for
921      * other multiples of the prime.
922      */
923 
924     /*
925      * First, if |out| is equal to 2^521-1, we subtract it out to get zero.
926      */
927 
928     is_p = out[0] ^ kPrime[0];
929     is_p |= out[1] ^ kPrime[1];
930     is_p |= out[2] ^ kPrime[2];
931     is_p |= out[3] ^ kPrime[3];
932     is_p |= out[4] ^ kPrime[4];
933     is_p |= out[5] ^ kPrime[5];
934     is_p |= out[6] ^ kPrime[6];
935     is_p |= out[7] ^ kPrime[7];
936     is_p |= out[8] ^ kPrime[8];
937 
938     is_p--;
939     is_p &= is_p << 32;
940     is_p &= is_p << 16;
941     is_p &= is_p << 8;
942     is_p &= is_p << 4;
943     is_p &= is_p << 2;
944     is_p &= is_p << 1;
945     is_p = 0 - (is_p >> 63);
946     is_p = ~is_p;
947 
948     /* is_p is 0 iff |out| == 2^521-1 and all ones otherwise */
949 
950     out[0] &= is_p;
951     out[1] &= is_p;
952     out[2] &= is_p;
953     out[3] &= is_p;
954     out[4] &= is_p;
955     out[5] &= is_p;
956     out[6] &= is_p;
957     out[7] &= is_p;
958     out[8] &= is_p;
959 
960     /*
961      * In order to test that |out| >= 2^521-1 we need only test if out[8] >>
962      * 57 is greater than zero as (2^521-1) + x >= 2^522
963      */
964     is_greater = out[8] >> 57;
965     is_greater |= is_greater << 32;
966     is_greater |= is_greater << 16;
967     is_greater |= is_greater << 8;
968     is_greater |= is_greater << 4;
969     is_greater |= is_greater << 2;
970     is_greater |= is_greater << 1;
971     is_greater = 0 - (is_greater >> 63);
972 
973     out[0] -= kPrime[0] & is_greater;
974     out[1] -= kPrime[1] & is_greater;
975     out[2] -= kPrime[2] & is_greater;
976     out[3] -= kPrime[3] & is_greater;
977     out[4] -= kPrime[4] & is_greater;
978     out[5] -= kPrime[5] & is_greater;
979     out[6] -= kPrime[6] & is_greater;
980     out[7] -= kPrime[7] & is_greater;
981     out[8] -= kPrime[8] & is_greater;
982 
983     /* Eliminate negative coefficients */
984     sign = -(out[0] >> 63);
985     out[0] += (two58 & sign);
986     out[1] -= (1 & sign);
987     sign = -(out[1] >> 63);
988     out[1] += (two58 & sign);
989     out[2] -= (1 & sign);
990     sign = -(out[2] >> 63);
991     out[2] += (two58 & sign);
992     out[3] -= (1 & sign);
993     sign = -(out[3] >> 63);
994     out[3] += (two58 & sign);
995     out[4] -= (1 & sign);
996     sign = -(out[4] >> 63);
997     out[4] += (two58 & sign);
998     out[5] -= (1 & sign);
999     sign = -(out[0] >> 63);
1000     out[5] += (two58 & sign);
1001     out[6] -= (1 & sign);
1002     sign = -(out[6] >> 63);
1003     out[6] += (two58 & sign);
1004     out[7] -= (1 & sign);
1005     sign = -(out[7] >> 63);
1006     out[7] += (two58 & sign);
1007     out[8] -= (1 & sign);
1008     sign = -(out[5] >> 63);
1009     out[5] += (two58 & sign);
1010     out[6] -= (1 & sign);
1011     sign = -(out[6] >> 63);
1012     out[6] += (two58 & sign);
1013     out[7] -= (1 & sign);
1014     sign = -(out[7] >> 63);
1015     out[7] += (two58 & sign);
1016     out[8] -= (1 & sign);
1017 }
1018 
1019 /*-
1020  * Group operations
1021  * ----------------
1022  *
1023  * Building on top of the field operations we have the operations on the
1024  * elliptic curve group itself. Points on the curve are represented in Jacobian
1025  * coordinates */
1026 
1027 /*-
1028  * point_double calculates 2*(x_in, y_in, z_in)
1029  *
1030  * The method is taken from:
1031  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
1032  *
1033  * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
1034  * while x_out == y_in is not (maybe this works, but it's not tested). */
1035 static void
1036 point_double(felem x_out, felem y_out, felem z_out,
1037              const felem x_in, const felem y_in, const felem z_in)
1038 {
1039     largefelem tmp, tmp2;
1040     felem delta, gamma, beta, alpha, ftmp, ftmp2;
1041 
1042     felem_assign(ftmp, x_in);
1043     felem_assign(ftmp2, x_in);
1044 
1045     /* delta = z^2 */
1046     felem_square(tmp, z_in);
1047     felem_reduce(delta, tmp);   /* delta[i] < 2^59 + 2^14 */
1048 
1049     /* gamma = y^2 */
1050     felem_square(tmp, y_in);
1051     felem_reduce(gamma, tmp);   /* gamma[i] < 2^59 + 2^14 */
1052 
1053     /* beta = x*gamma */
1054     felem_mul(tmp, x_in, gamma);
1055     felem_reduce(beta, tmp);    /* beta[i] < 2^59 + 2^14 */
1056 
1057     /* alpha = 3*(x-delta)*(x+delta) */
1058     felem_diff64(ftmp, delta);
1059     /* ftmp[i] < 2^61 */
1060     felem_sum64(ftmp2, delta);
1061     /* ftmp2[i] < 2^60 + 2^15 */
1062     felem_scalar64(ftmp2, 3);
1063     /* ftmp2[i] < 3*2^60 + 3*2^15 */
1064     felem_mul(tmp, ftmp, ftmp2);
1065     /*-
1066      * tmp[i] < 17(3*2^121 + 3*2^76)
1067      *        = 61*2^121 + 61*2^76
1068      *        < 64*2^121 + 64*2^76
1069      *        = 2^127 + 2^82
1070      *        < 2^128
1071      */
1072     felem_reduce(alpha, tmp);
1073 
1074     /* x' = alpha^2 - 8*beta */
1075     felem_square(tmp, alpha);
1076     /*
1077      * tmp[i] < 17*2^120 < 2^125
1078      */
1079     felem_assign(ftmp, beta);
1080     felem_scalar64(ftmp, 8);
1081     /* ftmp[i] < 2^62 + 2^17 */
1082     felem_diff_128_64(tmp, ftmp);
1083     /* tmp[i] < 2^125 + 2^63 + 2^62 + 2^17 */
1084     felem_reduce(x_out, tmp);
1085 
1086     /* z' = (y + z)^2 - gamma - delta */
1087     felem_sum64(delta, gamma);
1088     /* delta[i] < 2^60 + 2^15 */
1089     felem_assign(ftmp, y_in);
1090     felem_sum64(ftmp, z_in);
1091     /* ftmp[i] < 2^60 + 2^15 */
1092     felem_square(tmp, ftmp);
1093     /*
1094      * tmp[i] < 17(2^122) < 2^127
1095      */
1096     felem_diff_128_64(tmp, delta);
1097     /* tmp[i] < 2^127 + 2^63 */
1098     felem_reduce(z_out, tmp);
1099 
1100     /* y' = alpha*(4*beta - x') - 8*gamma^2 */
1101     felem_scalar64(beta, 4);
1102     /* beta[i] < 2^61 + 2^16 */
1103     felem_diff64(beta, x_out);
1104     /* beta[i] < 2^61 + 2^60 + 2^16 */
1105     felem_mul(tmp, alpha, beta);
1106     /*-
1107      * tmp[i] < 17*((2^59 + 2^14)(2^61 + 2^60 + 2^16))
1108      *        = 17*(2^120 + 2^75 + 2^119 + 2^74 + 2^75 + 2^30)
1109      *        = 17*(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1110      *        < 2^128
1111      */
1112     felem_square(tmp2, gamma);
1113     /*-
1114      * tmp2[i] < 17*(2^59 + 2^14)^2
1115      *         = 17*(2^118 + 2^74 + 2^28)
1116      */
1117     felem_scalar128(tmp2, 8);
1118     /*-
1119      * tmp2[i] < 8*17*(2^118 + 2^74 + 2^28)
1120      *         = 2^125 + 2^121 + 2^81 + 2^77 + 2^35 + 2^31
1121      *         < 2^126
1122      */
1123     felem_diff128(tmp, tmp2);
1124     /*-
1125      * tmp[i] < 2^127 - 2^69 + 17(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1126      *        = 2^127 + 2^124 + 2^122 + 2^120 + 2^118 + 2^80 + 2^78 + 2^76 +
1127      *          2^74 + 2^69 + 2^34 + 2^30
1128      *        < 2^128
1129      */
1130     felem_reduce(y_out, tmp);
1131 }
1132 
1133 /* copy_conditional copies in to out iff mask is all ones. */
1134 static void copy_conditional(felem out, const felem in, limb mask)
1135 {
1136     unsigned i;
1137     for (i = 0; i < NLIMBS; ++i) {
1138         const limb tmp = mask & (in[i] ^ out[i]);
1139         out[i] ^= tmp;
1140     }
1141 }
1142 
1143 /*-
1144  * point_add calculates (x1, y1, z1) + (x2, y2, z2)
1145  *
1146  * The method is taken from
1147  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1148  * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1149  *
1150  * This function includes a branch for checking whether the two input points
1151  * are equal (while not equal to the point at infinity). See comment below
1152  * on constant-time.
1153  */
1154 static void point_add(felem x3, felem y3, felem z3,
1155                       const felem x1, const felem y1, const felem z1,
1156                       const int mixed, const felem x2, const felem y2,
1157                       const felem z2)
1158 {
1159     felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1160     largefelem tmp, tmp2;
1161     limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1162     limb points_equal;
1163 
1164     z1_is_zero = felem_is_zero(z1);
1165     z2_is_zero = felem_is_zero(z2);
1166 
1167     /* ftmp = z1z1 = z1**2 */
1168     felem_square(tmp, z1);
1169     felem_reduce(ftmp, tmp);
1170 
1171     if (!mixed) {
1172         /* ftmp2 = z2z2 = z2**2 */
1173         felem_square(tmp, z2);
1174         felem_reduce(ftmp2, tmp);
1175 
1176         /* u1 = ftmp3 = x1*z2z2 */
1177         felem_mul(tmp, x1, ftmp2);
1178         felem_reduce(ftmp3, tmp);
1179 
1180         /* ftmp5 = z1 + z2 */
1181         felem_assign(ftmp5, z1);
1182         felem_sum64(ftmp5, z2);
1183         /* ftmp5[i] < 2^61 */
1184 
1185         /* ftmp5 = (z1 + z2)**2 - z1z1 - z2z2 = 2*z1z2 */
1186         felem_square(tmp, ftmp5);
1187         /* tmp[i] < 17*2^122 */
1188         felem_diff_128_64(tmp, ftmp);
1189         /* tmp[i] < 17*2^122 + 2^63 */
1190         felem_diff_128_64(tmp, ftmp2);
1191         /* tmp[i] < 17*2^122 + 2^64 */
1192         felem_reduce(ftmp5, tmp);
1193 
1194         /* ftmp2 = z2 * z2z2 */
1195         felem_mul(tmp, ftmp2, z2);
1196         felem_reduce(ftmp2, tmp);
1197 
1198         /* s1 = ftmp6 = y1 * z2**3 */
1199         felem_mul(tmp, y1, ftmp2);
1200         felem_reduce(ftmp6, tmp);
1201     } else {
1202         /*
1203          * We'll assume z2 = 1 (special case z2 = 0 is handled later)
1204          */
1205 
1206         /* u1 = ftmp3 = x1*z2z2 */
1207         felem_assign(ftmp3, x1);
1208 
1209         /* ftmp5 = 2*z1z2 */
1210         felem_scalar(ftmp5, z1, 2);
1211 
1212         /* s1 = ftmp6 = y1 * z2**3 */
1213         felem_assign(ftmp6, y1);
1214     }
1215 
1216     /* u2 = x2*z1z1 */
1217     felem_mul(tmp, x2, ftmp);
1218     /* tmp[i] < 17*2^120 */
1219 
1220     /* h = ftmp4 = u2 - u1 */
1221     felem_diff_128_64(tmp, ftmp3);
1222     /* tmp[i] < 17*2^120 + 2^63 */
1223     felem_reduce(ftmp4, tmp);
1224 
1225     x_equal = felem_is_zero(ftmp4);
1226 
1227     /* z_out = ftmp5 * h */
1228     felem_mul(tmp, ftmp5, ftmp4);
1229     felem_reduce(z_out, tmp);
1230 
1231     /* ftmp = z1 * z1z1 */
1232     felem_mul(tmp, ftmp, z1);
1233     felem_reduce(ftmp, tmp);
1234 
1235     /* s2 = tmp = y2 * z1**3 */
1236     felem_mul(tmp, y2, ftmp);
1237     /* tmp[i] < 17*2^120 */
1238 
1239     /* r = ftmp5 = (s2 - s1)*2 */
1240     felem_diff_128_64(tmp, ftmp6);
1241     /* tmp[i] < 17*2^120 + 2^63 */
1242     felem_reduce(ftmp5, tmp);
1243     y_equal = felem_is_zero(ftmp5);
1244     felem_scalar64(ftmp5, 2);
1245     /* ftmp5[i] < 2^61 */
1246 
1247     /*
1248      * The formulae are incorrect if the points are equal, in affine coordinates
1249      * (X_1, Y_1) == (X_2, Y_2), so we check for this and do doubling if this
1250      * happens.
1251      *
1252      * We use bitwise operations to avoid potential side-channels introduced by
1253      * the short-circuiting behaviour of boolean operators.
1254      *
1255      * The special case of either point being the point at infinity (z1 and/or
1256      * z2 are zero), is handled separately later on in this function, so we
1257      * avoid jumping to point_double here in those special cases.
1258      *
1259      * Notice the comment below on the implications of this branching for timing
1260      * leaks and why it is considered practically irrelevant.
1261      */
1262     points_equal = (x_equal & y_equal & (~z1_is_zero) & (~z2_is_zero));
1263 
1264     if (points_equal) {
1265         /*
1266          * This is obviously not constant-time but it will almost-never happen
1267          * for ECDH / ECDSA. The case where it can happen is during scalar-mult
1268          * where the intermediate value gets very close to the group order.
1269          * Since |ec_GFp_nistp_recode_scalar_bits| produces signed digits for
1270          * the scalar, it's possible for the intermediate value to be a small
1271          * negative multiple of the base point, and for the final signed digit
1272          * to be the same value. We believe that this only occurs for the scalar
1273          * 1fffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
1274          * ffffffa51868783bf2f966b7fcc0148f709a5d03bb5c9b8899c47aebb6fb
1275          * 71e913863f7, in that case the penultimate intermediate is -9G and
1276          * the final digit is also -9G. Since this only happens for a single
1277          * scalar, the timing leak is irrelevant. (Any attacker who wanted to
1278          * check whether a secret scalar was that exact value, can already do
1279          * so.)
1280          */
1281         point_double(x3, y3, z3, x1, y1, z1);
1282         return;
1283     }
1284 
1285     /* I = ftmp = (2h)**2 */
1286     felem_assign(ftmp, ftmp4);
1287     felem_scalar64(ftmp, 2);
1288     /* ftmp[i] < 2^61 */
1289     felem_square(tmp, ftmp);
1290     /* tmp[i] < 17*2^122 */
1291     felem_reduce(ftmp, tmp);
1292 
1293     /* J = ftmp2 = h * I */
1294     felem_mul(tmp, ftmp4, ftmp);
1295     felem_reduce(ftmp2, tmp);
1296 
1297     /* V = ftmp4 = U1 * I */
1298     felem_mul(tmp, ftmp3, ftmp);
1299     felem_reduce(ftmp4, tmp);
1300 
1301     /* x_out = r**2 - J - 2V */
1302     felem_square(tmp, ftmp5);
1303     /* tmp[i] < 17*2^122 */
1304     felem_diff_128_64(tmp, ftmp2);
1305     /* tmp[i] < 17*2^122 + 2^63 */
1306     felem_assign(ftmp3, ftmp4);
1307     felem_scalar64(ftmp4, 2);
1308     /* ftmp4[i] < 2^61 */
1309     felem_diff_128_64(tmp, ftmp4);
1310     /* tmp[i] < 17*2^122 + 2^64 */
1311     felem_reduce(x_out, tmp);
1312 
1313     /* y_out = r(V-x_out) - 2 * s1 * J */
1314     felem_diff64(ftmp3, x_out);
1315     /*
1316      * ftmp3[i] < 2^60 + 2^60 = 2^61
1317      */
1318     felem_mul(tmp, ftmp5, ftmp3);
1319     /* tmp[i] < 17*2^122 */
1320     felem_mul(tmp2, ftmp6, ftmp2);
1321     /* tmp2[i] < 17*2^120 */
1322     felem_scalar128(tmp2, 2);
1323     /* tmp2[i] < 17*2^121 */
1324     felem_diff128(tmp, tmp2);
1325         /*-
1326          * tmp[i] < 2^127 - 2^69 + 17*2^122
1327          *        = 2^126 - 2^122 - 2^6 - 2^2 - 1
1328          *        < 2^127
1329          */
1330     felem_reduce(y_out, tmp);
1331 
1332     copy_conditional(x_out, x2, z1_is_zero);
1333     copy_conditional(x_out, x1, z2_is_zero);
1334     copy_conditional(y_out, y2, z1_is_zero);
1335     copy_conditional(y_out, y1, z2_is_zero);
1336     copy_conditional(z_out, z2, z1_is_zero);
1337     copy_conditional(z_out, z1, z2_is_zero);
1338     felem_assign(x3, x_out);
1339     felem_assign(y3, y_out);
1340     felem_assign(z3, z_out);
1341 }
1342 
1343 /*-
1344  * Base point pre computation
1345  * --------------------------
1346  *
1347  * Two different sorts of precomputed tables are used in the following code.
1348  * Each contain various points on the curve, where each point is three field
1349  * elements (x, y, z).
1350  *
1351  * For the base point table, z is usually 1 (0 for the point at infinity).
1352  * This table has 16 elements:
1353  * index | bits    | point
1354  * ------+---------+------------------------------
1355  *     0 | 0 0 0 0 | 0G
1356  *     1 | 0 0 0 1 | 1G
1357  *     2 | 0 0 1 0 | 2^130G
1358  *     3 | 0 0 1 1 | (2^130 + 1)G
1359  *     4 | 0 1 0 0 | 2^260G
1360  *     5 | 0 1 0 1 | (2^260 + 1)G
1361  *     6 | 0 1 1 0 | (2^260 + 2^130)G
1362  *     7 | 0 1 1 1 | (2^260 + 2^130 + 1)G
1363  *     8 | 1 0 0 0 | 2^390G
1364  *     9 | 1 0 0 1 | (2^390 + 1)G
1365  *    10 | 1 0 1 0 | (2^390 + 2^130)G
1366  *    11 | 1 0 1 1 | (2^390 + 2^130 + 1)G
1367  *    12 | 1 1 0 0 | (2^390 + 2^260)G
1368  *    13 | 1 1 0 1 | (2^390 + 2^260 + 1)G
1369  *    14 | 1 1 1 0 | (2^390 + 2^260 + 2^130)G
1370  *    15 | 1 1 1 1 | (2^390 + 2^260 + 2^130 + 1)G
1371  *
1372  * The reason for this is so that we can clock bits into four different
1373  * locations when doing simple scalar multiplies against the base point.
1374  *
1375  * Tables for other points have table[i] = iG for i in 0 .. 16. */
1376 
1377 /* gmul is the table of precomputed base points */
1378 static const felem gmul[16][3] = {
1379 {{0, 0, 0, 0, 0, 0, 0, 0, 0},
1380  {0, 0, 0, 0, 0, 0, 0, 0, 0},
1381  {0, 0, 0, 0, 0, 0, 0, 0, 0}},
1382 {{0x017e7e31c2e5bd66, 0x022cf0615a90a6fe, 0x00127a2ffa8de334,
1383   0x01dfbf9d64a3f877, 0x006b4d3dbaa14b5e, 0x014fed487e0a2bd8,
1384   0x015b4429c6481390, 0x03a73678fb2d988e, 0x00c6858e06b70404},
1385  {0x00be94769fd16650, 0x031c21a89cb09022, 0x039013fad0761353,
1386   0x02657bd099031542, 0x03273e662c97ee72, 0x01e6d11a05ebef45,
1387   0x03d1bd998f544495, 0x03001172297ed0b1, 0x011839296a789a3b},
1388  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1389 {{0x0373faacbc875bae, 0x00f325023721c671, 0x00f666fd3dbde5ad,
1390   0x01a6932363f88ea7, 0x01fc6d9e13f9c47b, 0x03bcbffc2bbf734e,
1391   0x013ee3c3647f3a92, 0x029409fefe75d07d, 0x00ef9199963d85e5},
1392  {0x011173743ad5b178, 0x02499c7c21bf7d46, 0x035beaeabb8b1a58,
1393   0x00f989c4752ea0a3, 0x0101e1de48a9c1a3, 0x01a20076be28ba6c,
1394   0x02f8052e5eb2de95, 0x01bfe8f82dea117c, 0x0160074d3c36ddb7},
1395  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1396 {{0x012f3fc373393b3b, 0x03d3d6172f1419fa, 0x02adc943c0b86873,
1397   0x00d475584177952b, 0x012a4d1673750ee2, 0x00512517a0f13b0c,
1398   0x02b184671a7b1734, 0x0315b84236f1a50a, 0x00a4afc472edbdb9},
1399  {0x00152a7077f385c4, 0x03044007d8d1c2ee, 0x0065829d61d52b52,
1400   0x00494ff6b6631d0d, 0x00a11d94d5f06bcf, 0x02d2f89474d9282e,
1401   0x0241c5727c06eeb9, 0x0386928710fbdb9d, 0x01f883f727b0dfbe},
1402  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1403 {{0x019b0c3c9185544d, 0x006243a37c9d97db, 0x02ee3cbe030a2ad2,
1404   0x00cfdd946bb51e0d, 0x0271c00932606b91, 0x03f817d1ec68c561,
1405   0x03f37009806a369c, 0x03c1f30baf184fd5, 0x01091022d6d2f065},
1406  {0x0292c583514c45ed, 0x0316fca51f9a286c, 0x00300af507c1489a,
1407   0x0295f69008298cf1, 0x02c0ed8274943d7b, 0x016509b9b47a431e,
1408   0x02bc9de9634868ce, 0x005b34929bffcb09, 0x000c1a0121681524},
1409  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1410 {{0x0286abc0292fb9f2, 0x02665eee9805b3f7, 0x01ed7455f17f26d6,
1411   0x0346355b83175d13, 0x006284944cd0a097, 0x0191895bcdec5e51,
1412   0x02e288370afda7d9, 0x03b22312bfefa67a, 0x01d104d3fc0613fe},
1413  {0x0092421a12f7e47f, 0x0077a83fa373c501, 0x03bd25c5f696bd0d,
1414   0x035c41e4d5459761, 0x01ca0d1742b24f53, 0x00aaab27863a509c,
1415   0x018b6de47df73917, 0x025c0b771705cd01, 0x01fd51d566d760a7},
1416  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1417 {{0x01dd92ff6b0d1dbd, 0x039c5e2e8f8afa69, 0x0261ed13242c3b27,
1418   0x0382c6e67026e6a0, 0x01d60b10be2089f9, 0x03c15f3dce86723f,
1419   0x03c764a32d2a062d, 0x017307eac0fad056, 0x018207c0b96c5256},
1420  {0x0196a16d60e13154, 0x03e6ce74c0267030, 0x00ddbf2b4e52a5aa,
1421   0x012738241bbf31c8, 0x00ebe8dc04685a28, 0x024c2ad6d380d4a2,
1422   0x035ee062a6e62d0e, 0x0029ed74af7d3a0f, 0x00eef32aec142ebd},
1423  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1424 {{0x00c31ec398993b39, 0x03a9f45bcda68253, 0x00ac733c24c70890,
1425   0x00872b111401ff01, 0x01d178c23195eafb, 0x03bca2c816b87f74,
1426   0x0261a9af46fbad7a, 0x0324b2a8dd3d28f9, 0x00918121d8f24e23},
1427  {0x032bc8c1ca983cd7, 0x00d869dfb08fc8c6, 0x01693cb61fce1516,
1428   0x012a5ea68f4e88a8, 0x010869cab88d7ae3, 0x009081ad277ceee1,
1429   0x033a77166d064cdc, 0x03955235a1fb3a95, 0x01251a4a9b25b65e},
1430  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1431 {{0x00148a3a1b27f40b, 0x0123186df1b31fdc, 0x00026e7beaad34ce,
1432   0x01db446ac1d3dbba, 0x0299c1a33437eaec, 0x024540610183cbb7,
1433   0x0173bb0e9ce92e46, 0x02b937e43921214b, 0x01ab0436a9bf01b5},
1434  {0x0383381640d46948, 0x008dacbf0e7f330f, 0x03602122bcc3f318,
1435   0x01ee596b200620d6, 0x03bd0585fda430b3, 0x014aed77fd123a83,
1436   0x005ace749e52f742, 0x0390fe041da2b842, 0x0189a8ceb3299242},
1437  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1438 {{0x012a19d6b3282473, 0x00c0915918b423ce, 0x023a954eb94405ae,
1439   0x00529f692be26158, 0x0289fa1b6fa4b2aa, 0x0198ae4ceea346ef,
1440   0x0047d8cdfbdedd49, 0x00cc8c8953f0f6b8, 0x001424abbff49203},
1441  {0x0256732a1115a03a, 0x0351bc38665c6733, 0x03f7b950fb4a6447,
1442   0x000afffa94c22155, 0x025763d0a4dab540, 0x000511e92d4fc283,
1443   0x030a7e9eda0ee96c, 0x004c3cd93a28bf0a, 0x017edb3a8719217f},
1444  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1445 {{0x011de5675a88e673, 0x031d7d0f5e567fbe, 0x0016b2062c970ae5,
1446   0x03f4a2be49d90aa7, 0x03cef0bd13822866, 0x03f0923dcf774a6c,
1447   0x0284bebc4f322f72, 0x016ab2645302bb2c, 0x01793f95dace0e2a},
1448  {0x010646e13527a28f, 0x01ca1babd59dc5e7, 0x01afedfd9a5595df,
1449   0x01f15785212ea6b1, 0x0324e5d64f6ae3f4, 0x02d680f526d00645,
1450   0x0127920fadf627a7, 0x03b383f75df4f684, 0x0089e0057e783b0a},
1451  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1452 {{0x00f334b9eb3c26c6, 0x0298fdaa98568dce, 0x01c2d24843a82292,
1453   0x020bcb24fa1b0711, 0x02cbdb3d2b1875e6, 0x0014907598f89422,
1454   0x03abe3aa43b26664, 0x02cbf47f720bc168, 0x0133b5e73014b79b},
1455  {0x034aab5dab05779d, 0x00cdc5d71fee9abb, 0x0399f16bd4bd9d30,
1456   0x03582fa592d82647, 0x02be1cdfb775b0e9, 0x0034f7cea32e94cb,
1457   0x0335a7f08f56f286, 0x03b707e9565d1c8b, 0x0015c946ea5b614f},
1458  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1459 {{0x024676f6cff72255, 0x00d14625cac96378, 0x00532b6008bc3767,
1460   0x01fc16721b985322, 0x023355ea1b091668, 0x029de7afdc0317c3,
1461   0x02fc8a7ca2da037c, 0x02de1217d74a6f30, 0x013f7173175b73bf},
1462  {0x0344913f441490b5, 0x0200f9e272b61eca, 0x0258a246b1dd55d2,
1463   0x03753db9ea496f36, 0x025e02937a09c5ef, 0x030cbd3d14012692,
1464   0x01793a67e70dc72a, 0x03ec1d37048a662e, 0x006550f700c32a8d},
1465  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1466 {{0x00d3f48a347eba27, 0x008e636649b61bd8, 0x00d3b93716778fb3,
1467   0x004d1915757bd209, 0x019d5311a3da44e0, 0x016d1afcbbe6aade,
1468   0x0241bf5f73265616, 0x0384672e5d50d39b, 0x005009fee522b684},
1469  {0x029b4fab064435fe, 0x018868ee095bbb07, 0x01ea3d6936cc92b8,
1470   0x000608b00f78a2f3, 0x02db911073d1c20f, 0x018205938470100a,
1471   0x01f1e4964cbe6ff2, 0x021a19a29eed4663, 0x01414485f42afa81},
1472  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1473 {{0x01612b3a17f63e34, 0x03813992885428e6, 0x022b3c215b5a9608,
1474   0x029b4057e19f2fcb, 0x0384059a587af7e6, 0x02d6400ace6fe610,
1475   0x029354d896e8e331, 0x00c047ee6dfba65e, 0x0037720542e9d49d},
1476  {0x02ce9eed7c5e9278, 0x0374ed703e79643b, 0x01316c54c4072006,
1477   0x005aaa09054b2ee8, 0x002824000c840d57, 0x03d4eba24771ed86,
1478   0x0189c50aabc3bdae, 0x0338c01541e15510, 0x00466d56e38eed42},
1479  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1480 {{0x007efd8330ad8bd6, 0x02465ed48047710b, 0x0034c6606b215e0c,
1481   0x016ae30c53cbf839, 0x01fa17bd37161216, 0x018ead4e61ce8ab9,
1482   0x005482ed5f5dee46, 0x037543755bba1d7f, 0x005e5ac7e70a9d0f},
1483  {0x0117e1bb2fdcb2a2, 0x03deea36249f40c4, 0x028d09b4a6246cb7,
1484   0x03524b8855bcf756, 0x023d7d109d5ceb58, 0x0178e43e3223ef9c,
1485   0x0154536a0c6e966a, 0x037964d1286ee9fe, 0x0199bcd90e125055},
1486  {1, 0, 0, 0, 0, 0, 0, 0, 0}}
1487 };
1488 
1489 /*
1490  * select_point selects the |idx|th point from a precomputation table and
1491  * copies it to out.
1492  */
1493  /* pre_comp below is of the size provided in |size| */
1494 static void select_point(const limb idx, unsigned int size,
1495                          const felem pre_comp[][3], felem out[3])
1496 {
1497     unsigned i, j;
1498     limb *outlimbs = &out[0][0];
1499 
1500     memset(out, 0, sizeof(*out) * 3);
1501 
1502     for (i = 0; i < size; i++) {
1503         const limb *inlimbs = &pre_comp[i][0][0];
1504         limb mask = i ^ idx;
1505         mask |= mask >> 4;
1506         mask |= mask >> 2;
1507         mask |= mask >> 1;
1508         mask &= 1;
1509         mask--;
1510         for (j = 0; j < NLIMBS * 3; j++)
1511             outlimbs[j] |= inlimbs[j] & mask;
1512     }
1513 }
1514 
1515 /* get_bit returns the |i|th bit in |in| */
1516 static char get_bit(const felem_bytearray in, int i)
1517 {
1518     if (i < 0)
1519         return 0;
1520     return (in[i >> 3] >> (i & 7)) & 1;
1521 }
1522 
1523 /*
1524  * Interleaved point multiplication using precomputed point multiples: The
1525  * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
1526  * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1527  * generator, using certain (large) precomputed multiples in g_pre_comp.
1528  * Output point (X, Y, Z) is stored in x_out, y_out, z_out
1529  */
1530 static void batch_mul(felem x_out, felem y_out, felem z_out,
1531                       const felem_bytearray scalars[],
1532                       const unsigned num_points, const u8 *g_scalar,
1533                       const int mixed, const felem pre_comp[][17][3],
1534                       const felem g_pre_comp[16][3])
1535 {
1536     int i, skip;
1537     unsigned num, gen_mul = (g_scalar != NULL);
1538     felem nq[3], tmp[4];
1539     limb bits;
1540     u8 sign, digit;
1541 
1542     /* set nq to the point at infinity */
1543     memset(nq, 0, sizeof(nq));
1544 
1545     /*
1546      * Loop over all scalars msb-to-lsb, interleaving additions of multiples
1547      * of the generator (last quarter of rounds) and additions of other
1548      * points multiples (every 5th round).
1549      */
1550     skip = 1;                   /* save two point operations in the first
1551                                  * round */
1552     for (i = (num_points ? 520 : 130); i >= 0; --i) {
1553         /* double */
1554         if (!skip)
1555             point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1556 
1557         /* add multiples of the generator */
1558         if (gen_mul && (i <= 130)) {
1559             bits = get_bit(g_scalar, i + 390) << 3;
1560             if (i < 130) {
1561                 bits |= get_bit(g_scalar, i + 260) << 2;
1562                 bits |= get_bit(g_scalar, i + 130) << 1;
1563                 bits |= get_bit(g_scalar, i);
1564             }
1565             /* select the point to add, in constant time */
1566             select_point(bits, 16, g_pre_comp, tmp);
1567             if (!skip) {
1568                 /* The 1 argument below is for "mixed" */
1569                 point_add(nq[0], nq[1], nq[2],
1570                           nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1571             } else {
1572                 memcpy(nq, tmp, 3 * sizeof(felem));
1573                 skip = 0;
1574             }
1575         }
1576 
1577         /* do other additions every 5 doublings */
1578         if (num_points && (i % 5 == 0)) {
1579             /* loop over all scalars */
1580             for (num = 0; num < num_points; ++num) {
1581                 bits = get_bit(scalars[num], i + 4) << 5;
1582                 bits |= get_bit(scalars[num], i + 3) << 4;
1583                 bits |= get_bit(scalars[num], i + 2) << 3;
1584                 bits |= get_bit(scalars[num], i + 1) << 2;
1585                 bits |= get_bit(scalars[num], i) << 1;
1586                 bits |= get_bit(scalars[num], i - 1);
1587                 ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1588 
1589                 /*
1590                  * select the point to add or subtract, in constant time
1591                  */
1592                 select_point(digit, 17, pre_comp[num], tmp);
1593                 felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative
1594                                             * point */
1595                 copy_conditional(tmp[1], tmp[3], (-(limb) sign));
1596 
1597                 if (!skip) {
1598                     point_add(nq[0], nq[1], nq[2],
1599                               nq[0], nq[1], nq[2],
1600                               mixed, tmp[0], tmp[1], tmp[2]);
1601                 } else {
1602                     memcpy(nq, tmp, 3 * sizeof(felem));
1603                     skip = 0;
1604                 }
1605             }
1606         }
1607     }
1608     felem_assign(x_out, nq[0]);
1609     felem_assign(y_out, nq[1]);
1610     felem_assign(z_out, nq[2]);
1611 }
1612 
1613 /* Precomputation for the group generator. */
1614 struct nistp521_pre_comp_st {
1615     felem g_pre_comp[16][3];
1616     CRYPTO_REF_COUNT references;
1617     CRYPTO_RWLOCK *lock;
1618 };
1619 
1620 const EC_METHOD *EC_GFp_nistp521_method(void)
1621 {
1622     static const EC_METHOD ret = {
1623         EC_FLAGS_DEFAULT_OCT,
1624         NID_X9_62_prime_field,
1625         ec_GFp_nistp521_group_init,
1626         ec_GFp_simple_group_finish,
1627         ec_GFp_simple_group_clear_finish,
1628         ec_GFp_nist_group_copy,
1629         ec_GFp_nistp521_group_set_curve,
1630         ec_GFp_simple_group_get_curve,
1631         ec_GFp_simple_group_get_degree,
1632         ec_group_simple_order_bits,
1633         ec_GFp_simple_group_check_discriminant,
1634         ec_GFp_simple_point_init,
1635         ec_GFp_simple_point_finish,
1636         ec_GFp_simple_point_clear_finish,
1637         ec_GFp_simple_point_copy,
1638         ec_GFp_simple_point_set_to_infinity,
1639         ec_GFp_simple_set_Jprojective_coordinates_GFp,
1640         ec_GFp_simple_get_Jprojective_coordinates_GFp,
1641         ec_GFp_simple_point_set_affine_coordinates,
1642         ec_GFp_nistp521_point_get_affine_coordinates,
1643         0 /* point_set_compressed_coordinates */ ,
1644         0 /* point2oct */ ,
1645         0 /* oct2point */ ,
1646         ec_GFp_simple_add,
1647         ec_GFp_simple_dbl,
1648         ec_GFp_simple_invert,
1649         ec_GFp_simple_is_at_infinity,
1650         ec_GFp_simple_is_on_curve,
1651         ec_GFp_simple_cmp,
1652         ec_GFp_simple_make_affine,
1653         ec_GFp_simple_points_make_affine,
1654         ec_GFp_nistp521_points_mul,
1655         ec_GFp_nistp521_precompute_mult,
1656         ec_GFp_nistp521_have_precompute_mult,
1657         ec_GFp_nist_field_mul,
1658         ec_GFp_nist_field_sqr,
1659         0 /* field_div */ ,
1660         ec_GFp_simple_field_inv,
1661         0 /* field_encode */ ,
1662         0 /* field_decode */ ,
1663         0,                      /* field_set_to_one */
1664         ec_key_simple_priv2oct,
1665         ec_key_simple_oct2priv,
1666         0, /* set private */
1667         ec_key_simple_generate_key,
1668         ec_key_simple_check_key,
1669         ec_key_simple_generate_public_key,
1670         0, /* keycopy */
1671         0, /* keyfinish */
1672         ecdh_simple_compute_key,
1673         0, /* field_inverse_mod_ord */
1674         0, /* blind_coordinates */
1675         0, /* ladder_pre */
1676         0, /* ladder_step */
1677         0  /* ladder_post */
1678     };
1679 
1680     return &ret;
1681 }
1682 
1683 /******************************************************************************/
1684 /*
1685  * FUNCTIONS TO MANAGE PRECOMPUTATION
1686  */
1687 
1688 static NISTP521_PRE_COMP *nistp521_pre_comp_new(void)
1689 {
1690     NISTP521_PRE_COMP *ret = OPENSSL_zalloc(sizeof(*ret));
1691 
1692     if (ret == NULL) {
1693         ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1694         return ret;
1695     }
1696 
1697     ret->references = 1;
1698 
1699     ret->lock = CRYPTO_THREAD_lock_new();
1700     if (ret->lock == NULL) {
1701         ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1702         OPENSSL_free(ret);
1703         return NULL;
1704     }
1705     return ret;
1706 }
1707 
1708 NISTP521_PRE_COMP *EC_nistp521_pre_comp_dup(NISTP521_PRE_COMP *p)
1709 {
1710     int i;
1711     if (p != NULL)
1712         CRYPTO_UP_REF(&p->references, &i, p->lock);
1713     return p;
1714 }
1715 
1716 void EC_nistp521_pre_comp_free(NISTP521_PRE_COMP *p)
1717 {
1718     int i;
1719 
1720     if (p == NULL)
1721         return;
1722 
1723     CRYPTO_DOWN_REF(&p->references, &i, p->lock);
1724     REF_PRINT_COUNT("EC_nistp521", x);
1725     if (i > 0)
1726         return;
1727     REF_ASSERT_ISNT(i < 0);
1728 
1729     CRYPTO_THREAD_lock_free(p->lock);
1730     OPENSSL_free(p);
1731 }
1732 
1733 /******************************************************************************/
1734 /*
1735  * OPENSSL EC_METHOD FUNCTIONS
1736  */
1737 
1738 int ec_GFp_nistp521_group_init(EC_GROUP *group)
1739 {
1740     int ret;
1741     ret = ec_GFp_simple_group_init(group);
1742     group->a_is_minus3 = 1;
1743     return ret;
1744 }
1745 
1746 int ec_GFp_nistp521_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1747                                     const BIGNUM *a, const BIGNUM *b,
1748                                     BN_CTX *ctx)
1749 {
1750     int ret = 0;
1751     BN_CTX *new_ctx = NULL;
1752     BIGNUM *curve_p, *curve_a, *curve_b;
1753 
1754     if (ctx == NULL)
1755         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1756             return 0;
1757     BN_CTX_start(ctx);
1758     curve_p = BN_CTX_get(ctx);
1759     curve_a = BN_CTX_get(ctx);
1760     curve_b = BN_CTX_get(ctx);
1761     if (curve_b == NULL)
1762         goto err;
1763     BN_bin2bn(nistp521_curve_params[0], sizeof(felem_bytearray), curve_p);
1764     BN_bin2bn(nistp521_curve_params[1], sizeof(felem_bytearray), curve_a);
1765     BN_bin2bn(nistp521_curve_params[2], sizeof(felem_bytearray), curve_b);
1766     if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1767         ECerr(EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE,
1768               EC_R_WRONG_CURVE_PARAMETERS);
1769         goto err;
1770     }
1771     group->field_mod_func = BN_nist_mod_521;
1772     ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1773  err:
1774     BN_CTX_end(ctx);
1775     BN_CTX_free(new_ctx);
1776     return ret;
1777 }
1778 
1779 /*
1780  * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1781  * (X/Z^2, Y/Z^3)
1782  */
1783 int ec_GFp_nistp521_point_get_affine_coordinates(const EC_GROUP *group,
1784                                                  const EC_POINT *point,
1785                                                  BIGNUM *x, BIGNUM *y,
1786                                                  BN_CTX *ctx)
1787 {
1788     felem z1, z2, x_in, y_in, x_out, y_out;
1789     largefelem tmp;
1790 
1791     if (EC_POINT_is_at_infinity(group, point)) {
1792         ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1793               EC_R_POINT_AT_INFINITY);
1794         return 0;
1795     }
1796     if ((!BN_to_felem(x_in, point->X)) || (!BN_to_felem(y_in, point->Y)) ||
1797         (!BN_to_felem(z1, point->Z)))
1798         return 0;
1799     felem_inv(z2, z1);
1800     felem_square(tmp, z2);
1801     felem_reduce(z1, tmp);
1802     felem_mul(tmp, x_in, z1);
1803     felem_reduce(x_in, tmp);
1804     felem_contract(x_out, x_in);
1805     if (x != NULL) {
1806         if (!felem_to_BN(x, x_out)) {
1807             ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1808                   ERR_R_BN_LIB);
1809             return 0;
1810         }
1811     }
1812     felem_mul(tmp, z1, z2);
1813     felem_reduce(z1, tmp);
1814     felem_mul(tmp, y_in, z1);
1815     felem_reduce(y_in, tmp);
1816     felem_contract(y_out, y_in);
1817     if (y != NULL) {
1818         if (!felem_to_BN(y, y_out)) {
1819             ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1820                   ERR_R_BN_LIB);
1821             return 0;
1822         }
1823     }
1824     return 1;
1825 }
1826 
1827 /* points below is of size |num|, and tmp_felems is of size |num+1/ */
1828 static void make_points_affine(size_t num, felem points[][3],
1829                                felem tmp_felems[])
1830 {
1831     /*
1832      * Runs in constant time, unless an input is the point at infinity (which
1833      * normally shouldn't happen).
1834      */
1835     ec_GFp_nistp_points_make_affine_internal(num,
1836                                              points,
1837                                              sizeof(felem),
1838                                              tmp_felems,
1839                                              (void (*)(void *))felem_one,
1840                                              felem_is_zero_int,
1841                                              (void (*)(void *, const void *))
1842                                              felem_assign,
1843                                              (void (*)(void *, const void *))
1844                                              felem_square_reduce, (void (*)
1845                                                                    (void *,
1846                                                                     const void
1847                                                                     *,
1848                                                                     const void
1849                                                                     *))
1850                                              felem_mul_reduce,
1851                                              (void (*)(void *, const void *))
1852                                              felem_inv,
1853                                              (void (*)(void *, const void *))
1854                                              felem_contract);
1855 }
1856 
1857 /*
1858  * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
1859  * values Result is stored in r (r can equal one of the inputs).
1860  */
1861 int ec_GFp_nistp521_points_mul(const EC_GROUP *group, EC_POINT *r,
1862                                const BIGNUM *scalar, size_t num,
1863                                const EC_POINT *points[],
1864                                const BIGNUM *scalars[], BN_CTX *ctx)
1865 {
1866     int ret = 0;
1867     int j;
1868     int mixed = 0;
1869     BIGNUM *x, *y, *z, *tmp_scalar;
1870     felem_bytearray g_secret;
1871     felem_bytearray *secrets = NULL;
1872     felem (*pre_comp)[17][3] = NULL;
1873     felem *tmp_felems = NULL;
1874     unsigned i;
1875     int num_bytes;
1876     int have_pre_comp = 0;
1877     size_t num_points = num;
1878     felem x_in, y_in, z_in, x_out, y_out, z_out;
1879     NISTP521_PRE_COMP *pre = NULL;
1880     felem(*g_pre_comp)[3] = NULL;
1881     EC_POINT *generator = NULL;
1882     const EC_POINT *p = NULL;
1883     const BIGNUM *p_scalar = NULL;
1884 
1885     BN_CTX_start(ctx);
1886     x = BN_CTX_get(ctx);
1887     y = BN_CTX_get(ctx);
1888     z = BN_CTX_get(ctx);
1889     tmp_scalar = BN_CTX_get(ctx);
1890     if (tmp_scalar == NULL)
1891         goto err;
1892 
1893     if (scalar != NULL) {
1894         pre = group->pre_comp.nistp521;
1895         if (pre)
1896             /* we have precomputation, try to use it */
1897             g_pre_comp = &pre->g_pre_comp[0];
1898         else
1899             /* try to use the standard precomputation */
1900             g_pre_comp = (felem(*)[3]) gmul;
1901         generator = EC_POINT_new(group);
1902         if (generator == NULL)
1903             goto err;
1904         /* get the generator from precomputation */
1905         if (!felem_to_BN(x, g_pre_comp[1][0]) ||
1906             !felem_to_BN(y, g_pre_comp[1][1]) ||
1907             !felem_to_BN(z, g_pre_comp[1][2])) {
1908             ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1909             goto err;
1910         }
1911         if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
1912                                                       generator, x, y, z,
1913                                                       ctx))
1914             goto err;
1915         if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1916             /* precomputation matches generator */
1917             have_pre_comp = 1;
1918         else
1919             /*
1920              * we don't have valid precomputation: treat the generator as a
1921              * random point
1922              */
1923             num_points++;
1924     }
1925 
1926     if (num_points > 0) {
1927         if (num_points >= 2) {
1928             /*
1929              * unless we precompute multiples for just one point, converting
1930              * those into affine form is time well spent
1931              */
1932             mixed = 1;
1933         }
1934         secrets = OPENSSL_zalloc(sizeof(*secrets) * num_points);
1935         pre_comp = OPENSSL_zalloc(sizeof(*pre_comp) * num_points);
1936         if (mixed)
1937             tmp_felems =
1938                 OPENSSL_malloc(sizeof(*tmp_felems) * (num_points * 17 + 1));
1939         if ((secrets == NULL) || (pre_comp == NULL)
1940             || (mixed && (tmp_felems == NULL))) {
1941             ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_MALLOC_FAILURE);
1942             goto err;
1943         }
1944 
1945         /*
1946          * we treat NULL scalars as 0, and NULL points as points at infinity,
1947          * i.e., they contribute nothing to the linear combination
1948          */
1949         for (i = 0; i < num_points; ++i) {
1950             if (i == num) {
1951                 /*
1952                  * we didn't have a valid precomputation, so we pick the
1953                  * generator
1954                  */
1955                 p = EC_GROUP_get0_generator(group);
1956                 p_scalar = scalar;
1957             } else {
1958                 /* the i^th point */
1959                 p = points[i];
1960                 p_scalar = scalars[i];
1961             }
1962             if ((p_scalar != NULL) && (p != NULL)) {
1963                 /* reduce scalar to 0 <= scalar < 2^521 */
1964                 if ((BN_num_bits(p_scalar) > 521)
1965                     || (BN_is_negative(p_scalar))) {
1966                     /*
1967                      * this is an unusual input, and we don't guarantee
1968                      * constant-timeness
1969                      */
1970                     if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) {
1971                         ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1972                         goto err;
1973                     }
1974                     num_bytes = BN_bn2lebinpad(tmp_scalar,
1975                                                secrets[i], sizeof(secrets[i]));
1976                 } else {
1977                     num_bytes = BN_bn2lebinpad(p_scalar,
1978                                                secrets[i], sizeof(secrets[i]));
1979                 }
1980                 if (num_bytes < 0) {
1981                     ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1982                     goto err;
1983                 }
1984                 /* precompute multiples */
1985                 if ((!BN_to_felem(x_out, p->X)) ||
1986                     (!BN_to_felem(y_out, p->Y)) ||
1987                     (!BN_to_felem(z_out, p->Z)))
1988                     goto err;
1989                 memcpy(pre_comp[i][1][0], x_out, sizeof(felem));
1990                 memcpy(pre_comp[i][1][1], y_out, sizeof(felem));
1991                 memcpy(pre_comp[i][1][2], z_out, sizeof(felem));
1992                 for (j = 2; j <= 16; ++j) {
1993                     if (j & 1) {
1994                         point_add(pre_comp[i][j][0], pre_comp[i][j][1],
1995                                   pre_comp[i][j][2], pre_comp[i][1][0],
1996                                   pre_comp[i][1][1], pre_comp[i][1][2], 0,
1997                                   pre_comp[i][j - 1][0],
1998                                   pre_comp[i][j - 1][1],
1999                                   pre_comp[i][j - 1][2]);
2000                     } else {
2001                         point_double(pre_comp[i][j][0], pre_comp[i][j][1],
2002                                      pre_comp[i][j][2], pre_comp[i][j / 2][0],
2003                                      pre_comp[i][j / 2][1],
2004                                      pre_comp[i][j / 2][2]);
2005                     }
2006                 }
2007             }
2008         }
2009         if (mixed)
2010             make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
2011     }
2012 
2013     /* the scalar for the generator */
2014     if ((scalar != NULL) && (have_pre_comp)) {
2015         memset(g_secret, 0, sizeof(g_secret));
2016         /* reduce scalar to 0 <= scalar < 2^521 */
2017         if ((BN_num_bits(scalar) > 521) || (BN_is_negative(scalar))) {
2018             /*
2019              * this is an unusual input, and we don't guarantee
2020              * constant-timeness
2021              */
2022             if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) {
2023                 ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
2024                 goto err;
2025             }
2026             num_bytes = BN_bn2lebinpad(tmp_scalar, g_secret, sizeof(g_secret));
2027         } else {
2028             num_bytes = BN_bn2lebinpad(scalar, g_secret, sizeof(g_secret));
2029         }
2030         /* do the multiplication with generator precomputation */
2031         batch_mul(x_out, y_out, z_out,
2032                   (const felem_bytearray(*))secrets, num_points,
2033                   g_secret,
2034                   mixed, (const felem(*)[17][3])pre_comp,
2035                   (const felem(*)[3])g_pre_comp);
2036     } else {
2037         /* do the multiplication without generator precomputation */
2038         batch_mul(x_out, y_out, z_out,
2039                   (const felem_bytearray(*))secrets, num_points,
2040                   NULL, mixed, (const felem(*)[17][3])pre_comp, NULL);
2041     }
2042     /* reduce the output to its unique minimal representation */
2043     felem_contract(x_in, x_out);
2044     felem_contract(y_in, y_out);
2045     felem_contract(z_in, z_out);
2046     if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
2047         (!felem_to_BN(z, z_in))) {
2048         ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
2049         goto err;
2050     }
2051     ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
2052 
2053  err:
2054     BN_CTX_end(ctx);
2055     EC_POINT_free(generator);
2056     OPENSSL_free(secrets);
2057     OPENSSL_free(pre_comp);
2058     OPENSSL_free(tmp_felems);
2059     return ret;
2060 }
2061 
2062 int ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
2063 {
2064     int ret = 0;
2065     NISTP521_PRE_COMP *pre = NULL;
2066     int i, j;
2067     BN_CTX *new_ctx = NULL;
2068     BIGNUM *x, *y;
2069     EC_POINT *generator = NULL;
2070     felem tmp_felems[16];
2071 
2072     /* throw away old precomputation */
2073     EC_pre_comp_free(group);
2074     if (ctx == NULL)
2075         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
2076             return 0;
2077     BN_CTX_start(ctx);
2078     x = BN_CTX_get(ctx);
2079     y = BN_CTX_get(ctx);
2080     if (y == NULL)
2081         goto err;
2082     /* get the generator */
2083     if (group->generator == NULL)
2084         goto err;
2085     generator = EC_POINT_new(group);
2086     if (generator == NULL)
2087         goto err;
2088     BN_bin2bn(nistp521_curve_params[3], sizeof(felem_bytearray), x);
2089     BN_bin2bn(nistp521_curve_params[4], sizeof(felem_bytearray), y);
2090     if (!EC_POINT_set_affine_coordinates(group, generator, x, y, ctx))
2091         goto err;
2092     if ((pre = nistp521_pre_comp_new()) == NULL)
2093         goto err;
2094     /*
2095      * if the generator is the standard one, use built-in precomputation
2096      */
2097     if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
2098         memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
2099         goto done;
2100     }
2101     if ((!BN_to_felem(pre->g_pre_comp[1][0], group->generator->X)) ||
2102         (!BN_to_felem(pre->g_pre_comp[1][1], group->generator->Y)) ||
2103         (!BN_to_felem(pre->g_pre_comp[1][2], group->generator->Z)))
2104         goto err;
2105     /* compute 2^130*G, 2^260*G, 2^390*G */
2106     for (i = 1; i <= 4; i <<= 1) {
2107         point_double(pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1],
2108                      pre->g_pre_comp[2 * i][2], pre->g_pre_comp[i][0],
2109                      pre->g_pre_comp[i][1], pre->g_pre_comp[i][2]);
2110         for (j = 0; j < 129; ++j) {
2111             point_double(pre->g_pre_comp[2 * i][0],
2112                          pre->g_pre_comp[2 * i][1],
2113                          pre->g_pre_comp[2 * i][2],
2114                          pre->g_pre_comp[2 * i][0],
2115                          pre->g_pre_comp[2 * i][1],
2116                          pre->g_pre_comp[2 * i][2]);
2117         }
2118     }
2119     /* g_pre_comp[0] is the point at infinity */
2120     memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0]));
2121     /* the remaining multiples */
2122     /* 2^130*G + 2^260*G */
2123     point_add(pre->g_pre_comp[6][0], pre->g_pre_comp[6][1],
2124               pre->g_pre_comp[6][2], pre->g_pre_comp[4][0],
2125               pre->g_pre_comp[4][1], pre->g_pre_comp[4][2],
2126               0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2127               pre->g_pre_comp[2][2]);
2128     /* 2^130*G + 2^390*G */
2129     point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1],
2130               pre->g_pre_comp[10][2], pre->g_pre_comp[8][0],
2131               pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2132               0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2133               pre->g_pre_comp[2][2]);
2134     /* 2^260*G + 2^390*G */
2135     point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1],
2136               pre->g_pre_comp[12][2], pre->g_pre_comp[8][0],
2137               pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2138               0, pre->g_pre_comp[4][0], pre->g_pre_comp[4][1],
2139               pre->g_pre_comp[4][2]);
2140     /* 2^130*G + 2^260*G + 2^390*G */
2141     point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1],
2142               pre->g_pre_comp[14][2], pre->g_pre_comp[12][0],
2143               pre->g_pre_comp[12][1], pre->g_pre_comp[12][2],
2144               0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2145               pre->g_pre_comp[2][2]);
2146     for (i = 1; i < 8; ++i) {
2147         /* odd multiples: add G */
2148         point_add(pre->g_pre_comp[2 * i + 1][0],
2149                   pre->g_pre_comp[2 * i + 1][1],
2150                   pre->g_pre_comp[2 * i + 1][2], pre->g_pre_comp[2 * i][0],
2151                   pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2], 0,
2152                   pre->g_pre_comp[1][0], pre->g_pre_comp[1][1],
2153                   pre->g_pre_comp[1][2]);
2154     }
2155     make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);
2156 
2157  done:
2158     SETPRECOMP(group, nistp521, pre);
2159     ret = 1;
2160     pre = NULL;
2161  err:
2162     BN_CTX_end(ctx);
2163     EC_POINT_free(generator);
2164     BN_CTX_free(new_ctx);
2165     EC_nistp521_pre_comp_free(pre);
2166     return ret;
2167 }
2168 
2169 int ec_GFp_nistp521_have_precompute_mult(const EC_GROUP *group)
2170 {
2171     return HAVEPRECOMP(group, nistp521);
2172 }
2173 
2174 #endif
2175