xref: /freebsd/crypto/openssl/crypto/ec/ecp_nistp521.c (revision 036d2e814bf0f5d88ffb4b24c159320894541757)
1 /*
2  * Copyright 2011-2019 The OpenSSL Project Authors. All Rights Reserved.
3  *
4  * Licensed under the OpenSSL license (the "License").  You may not use
5  * this file except in compliance with the License.  You can obtain a copy
6  * in the file LICENSE in the source distribution or at
7  * https://www.openssl.org/source/license.html
8  */
9 
10 /* Copyright 2011 Google Inc.
11  *
12  * Licensed under the Apache License, Version 2.0 (the "License");
13  *
14  * you may not use this file except in compliance with the License.
15  * You may obtain a copy of the License at
16  *
17  *     http://www.apache.org/licenses/LICENSE-2.0
18  *
19  *  Unless required by applicable law or agreed to in writing, software
20  *  distributed under the License is distributed on an "AS IS" BASIS,
21  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
22  *  See the License for the specific language governing permissions and
23  *  limitations under the License.
24  */
25 
26 /*
27  * A 64-bit implementation of the NIST P-521 elliptic curve point multiplication
28  *
29  * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
30  * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
31  * work which got its smarts from Daniel J. Bernstein's work on the same.
32  */
33 
34 #include <openssl/e_os2.h>
35 #ifdef OPENSSL_NO_EC_NISTP_64_GCC_128
36 NON_EMPTY_TRANSLATION_UNIT
37 #else
38 
39 # include <string.h>
40 # include <openssl/err.h>
41 # include "ec_lcl.h"
42 
43 # if defined(__SIZEOF_INT128__) && __SIZEOF_INT128__==16
44   /* even with gcc, the typedef won't work for 32-bit platforms */
45 typedef __uint128_t uint128_t;  /* nonstandard; implemented by gcc on 64-bit
46                                  * platforms */
47 # else
48 #  error "Your compiler doesn't appear to support 128-bit integer types"
49 # endif
50 
51 typedef uint8_t u8;
52 typedef uint64_t u64;
53 
54 /*
55  * The underlying field. P521 operates over GF(2^521-1). We can serialise an
56  * element of this field into 66 bytes where the most significant byte
57  * contains only a single bit. We call this an felem_bytearray.
58  */
59 
60 typedef u8 felem_bytearray[66];
61 
62 /*
63  * These are the parameters of P521, taken from FIPS 186-3, section D.1.2.5.
64  * These values are big-endian.
65  */
66 static const felem_bytearray nistp521_curve_params[5] = {
67     {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* p */
68      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
69      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
70      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
71      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
72      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
73      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
74      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
75      0xff, 0xff},
76     {0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, /* a = -3 */
77      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
78      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
79      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
80      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
81      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
82      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
83      0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
84      0xff, 0xfc},
85     {0x00, 0x51, 0x95, 0x3e, 0xb9, 0x61, 0x8e, 0x1c, /* b */
86      0x9a, 0x1f, 0x92, 0x9a, 0x21, 0xa0, 0xb6, 0x85,
87      0x40, 0xee, 0xa2, 0xda, 0x72, 0x5b, 0x99, 0xb3,
88      0x15, 0xf3, 0xb8, 0xb4, 0x89, 0x91, 0x8e, 0xf1,
89      0x09, 0xe1, 0x56, 0x19, 0x39, 0x51, 0xec, 0x7e,
90      0x93, 0x7b, 0x16, 0x52, 0xc0, 0xbd, 0x3b, 0xb1,
91      0xbf, 0x07, 0x35, 0x73, 0xdf, 0x88, 0x3d, 0x2c,
92      0x34, 0xf1, 0xef, 0x45, 0x1f, 0xd4, 0x6b, 0x50,
93      0x3f, 0x00},
94     {0x00, 0xc6, 0x85, 0x8e, 0x06, 0xb7, 0x04, 0x04, /* x */
95      0xe9, 0xcd, 0x9e, 0x3e, 0xcb, 0x66, 0x23, 0x95,
96      0xb4, 0x42, 0x9c, 0x64, 0x81, 0x39, 0x05, 0x3f,
97      0xb5, 0x21, 0xf8, 0x28, 0xaf, 0x60, 0x6b, 0x4d,
98      0x3d, 0xba, 0xa1, 0x4b, 0x5e, 0x77, 0xef, 0xe7,
99      0x59, 0x28, 0xfe, 0x1d, 0xc1, 0x27, 0xa2, 0xff,
100      0xa8, 0xde, 0x33, 0x48, 0xb3, 0xc1, 0x85, 0x6a,
101      0x42, 0x9b, 0xf9, 0x7e, 0x7e, 0x31, 0xc2, 0xe5,
102      0xbd, 0x66},
103     {0x01, 0x18, 0x39, 0x29, 0x6a, 0x78, 0x9a, 0x3b, /* y */
104      0xc0, 0x04, 0x5c, 0x8a, 0x5f, 0xb4, 0x2c, 0x7d,
105      0x1b, 0xd9, 0x98, 0xf5, 0x44, 0x49, 0x57, 0x9b,
106      0x44, 0x68, 0x17, 0xaf, 0xbd, 0x17, 0x27, 0x3e,
107      0x66, 0x2c, 0x97, 0xee, 0x72, 0x99, 0x5e, 0xf4,
108      0x26, 0x40, 0xc5, 0x50, 0xb9, 0x01, 0x3f, 0xad,
109      0x07, 0x61, 0x35, 0x3c, 0x70, 0x86, 0xa2, 0x72,
110      0xc2, 0x40, 0x88, 0xbe, 0x94, 0x76, 0x9f, 0xd1,
111      0x66, 0x50}
112 };
113 
114 /*-
115  * The representation of field elements.
116  * ------------------------------------
117  *
118  * We represent field elements with nine values. These values are either 64 or
119  * 128 bits and the field element represented is:
120  *   v[0]*2^0 + v[1]*2^58 + v[2]*2^116 + ... + v[8]*2^464  (mod p)
121  * Each of the nine values is called a 'limb'. Since the limbs are spaced only
122  * 58 bits apart, but are greater than 58 bits in length, the most significant
123  * bits of each limb overlap with the least significant bits of the next.
124  *
125  * A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a
126  * 'largefelem' */
127 
128 # define NLIMBS 9
129 
130 typedef uint64_t limb;
131 typedef limb felem[NLIMBS];
132 typedef uint128_t largefelem[NLIMBS];
133 
134 static const limb bottom57bits = 0x1ffffffffffffff;
135 static const limb bottom58bits = 0x3ffffffffffffff;
136 
137 /*
138  * bin66_to_felem takes a little-endian byte array and converts it into felem
139  * form. This assumes that the CPU is little-endian.
140  */
141 static void bin66_to_felem(felem out, const u8 in[66])
142 {
143     out[0] = (*((limb *) & in[0])) & bottom58bits;
144     out[1] = (*((limb *) & in[7]) >> 2) & bottom58bits;
145     out[2] = (*((limb *) & in[14]) >> 4) & bottom58bits;
146     out[3] = (*((limb *) & in[21]) >> 6) & bottom58bits;
147     out[4] = (*((limb *) & in[29])) & bottom58bits;
148     out[5] = (*((limb *) & in[36]) >> 2) & bottom58bits;
149     out[6] = (*((limb *) & in[43]) >> 4) & bottom58bits;
150     out[7] = (*((limb *) & in[50]) >> 6) & bottom58bits;
151     out[8] = (*((limb *) & in[58])) & bottom57bits;
152 }
153 
154 /*
155  * felem_to_bin66 takes an felem and serialises into a little endian, 66 byte
156  * array. This assumes that the CPU is little-endian.
157  */
158 static void felem_to_bin66(u8 out[66], const felem in)
159 {
160     memset(out, 0, 66);
161     (*((limb *) & out[0])) = in[0];
162     (*((limb *) & out[7])) |= in[1] << 2;
163     (*((limb *) & out[14])) |= in[2] << 4;
164     (*((limb *) & out[21])) |= in[3] << 6;
165     (*((limb *) & out[29])) = in[4];
166     (*((limb *) & out[36])) |= in[5] << 2;
167     (*((limb *) & out[43])) |= in[6] << 4;
168     (*((limb *) & out[50])) |= in[7] << 6;
169     (*((limb *) & out[58])) = in[8];
170 }
171 
172 /* BN_to_felem converts an OpenSSL BIGNUM into an felem */
173 static int BN_to_felem(felem out, const BIGNUM *bn)
174 {
175     felem_bytearray b_out;
176     int num_bytes;
177 
178     if (BN_is_negative(bn)) {
179         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
180         return 0;
181     }
182     num_bytes = BN_bn2lebinpad(bn, b_out, sizeof(b_out));
183     if (num_bytes < 0) {
184         ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
185         return 0;
186     }
187     bin66_to_felem(out, b_out);
188     return 1;
189 }
190 
191 /* felem_to_BN converts an felem into an OpenSSL BIGNUM */
192 static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
193 {
194     felem_bytearray b_out;
195     felem_to_bin66(b_out, in);
196     return BN_lebin2bn(b_out, sizeof(b_out), out);
197 }
198 
199 /*-
200  * Field operations
201  * ----------------
202  */
203 
204 static void felem_one(felem out)
205 {
206     out[0] = 1;
207     out[1] = 0;
208     out[2] = 0;
209     out[3] = 0;
210     out[4] = 0;
211     out[5] = 0;
212     out[6] = 0;
213     out[7] = 0;
214     out[8] = 0;
215 }
216 
217 static void felem_assign(felem out, const felem in)
218 {
219     out[0] = in[0];
220     out[1] = in[1];
221     out[2] = in[2];
222     out[3] = in[3];
223     out[4] = in[4];
224     out[5] = in[5];
225     out[6] = in[6];
226     out[7] = in[7];
227     out[8] = in[8];
228 }
229 
230 /* felem_sum64 sets out = out + in. */
231 static void felem_sum64(felem out, const felem in)
232 {
233     out[0] += in[0];
234     out[1] += in[1];
235     out[2] += in[2];
236     out[3] += in[3];
237     out[4] += in[4];
238     out[5] += in[5];
239     out[6] += in[6];
240     out[7] += in[7];
241     out[8] += in[8];
242 }
243 
244 /* felem_scalar sets out = in * scalar */
245 static void felem_scalar(felem out, const felem in, limb scalar)
246 {
247     out[0] = in[0] * scalar;
248     out[1] = in[1] * scalar;
249     out[2] = in[2] * scalar;
250     out[3] = in[3] * scalar;
251     out[4] = in[4] * scalar;
252     out[5] = in[5] * scalar;
253     out[6] = in[6] * scalar;
254     out[7] = in[7] * scalar;
255     out[8] = in[8] * scalar;
256 }
257 
258 /* felem_scalar64 sets out = out * scalar */
259 static void felem_scalar64(felem out, limb scalar)
260 {
261     out[0] *= scalar;
262     out[1] *= scalar;
263     out[2] *= scalar;
264     out[3] *= scalar;
265     out[4] *= scalar;
266     out[5] *= scalar;
267     out[6] *= scalar;
268     out[7] *= scalar;
269     out[8] *= scalar;
270 }
271 
272 /* felem_scalar128 sets out = out * scalar */
273 static void felem_scalar128(largefelem out, limb scalar)
274 {
275     out[0] *= scalar;
276     out[1] *= scalar;
277     out[2] *= scalar;
278     out[3] *= scalar;
279     out[4] *= scalar;
280     out[5] *= scalar;
281     out[6] *= scalar;
282     out[7] *= scalar;
283     out[8] *= scalar;
284 }
285 
286 /*-
287  * felem_neg sets |out| to |-in|
288  * On entry:
289  *   in[i] < 2^59 + 2^14
290  * On exit:
291  *   out[i] < 2^62
292  */
293 static void felem_neg(felem out, const felem in)
294 {
295     /* In order to prevent underflow, we subtract from 0 mod p. */
296     static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
297     static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
298 
299     out[0] = two62m3 - in[0];
300     out[1] = two62m2 - in[1];
301     out[2] = two62m2 - in[2];
302     out[3] = two62m2 - in[3];
303     out[4] = two62m2 - in[4];
304     out[5] = two62m2 - in[5];
305     out[6] = two62m2 - in[6];
306     out[7] = two62m2 - in[7];
307     out[8] = two62m2 - in[8];
308 }
309 
310 /*-
311  * felem_diff64 subtracts |in| from |out|
312  * On entry:
313  *   in[i] < 2^59 + 2^14
314  * On exit:
315  *   out[i] < out[i] + 2^62
316  */
317 static void felem_diff64(felem out, const felem in)
318 {
319     /*
320      * In order to prevent underflow, we add 0 mod p before subtracting.
321      */
322     static const limb two62m3 = (((limb) 1) << 62) - (((limb) 1) << 5);
323     static const limb two62m2 = (((limb) 1) << 62) - (((limb) 1) << 4);
324 
325     out[0] += two62m3 - in[0];
326     out[1] += two62m2 - in[1];
327     out[2] += two62m2 - in[2];
328     out[3] += two62m2 - in[3];
329     out[4] += two62m2 - in[4];
330     out[5] += two62m2 - in[5];
331     out[6] += two62m2 - in[6];
332     out[7] += two62m2 - in[7];
333     out[8] += two62m2 - in[8];
334 }
335 
336 /*-
337  * felem_diff_128_64 subtracts |in| from |out|
338  * On entry:
339  *   in[i] < 2^62 + 2^17
340  * On exit:
341  *   out[i] < out[i] + 2^63
342  */
343 static void felem_diff_128_64(largefelem out, const felem in)
344 {
345     /*
346      * In order to prevent underflow, we add 64p mod p (which is equivalent
347      * to 0 mod p) before subtracting. p is 2^521 - 1, i.e. in binary a 521
348      * digit number with all bits set to 1. See "The representation of field
349      * elements" comment above for a description of how limbs are used to
350      * represent a number. 64p is represented with 8 limbs containing a number
351      * with 58 bits set and one limb with a number with 57 bits set.
352      */
353     static const limb two63m6 = (((limb) 1) << 63) - (((limb) 1) << 6);
354     static const limb two63m5 = (((limb) 1) << 63) - (((limb) 1) << 5);
355 
356     out[0] += two63m6 - in[0];
357     out[1] += two63m5 - in[1];
358     out[2] += two63m5 - in[2];
359     out[3] += two63m5 - in[3];
360     out[4] += two63m5 - in[4];
361     out[5] += two63m5 - in[5];
362     out[6] += two63m5 - in[6];
363     out[7] += two63m5 - in[7];
364     out[8] += two63m5 - in[8];
365 }
366 
367 /*-
368  * felem_diff_128_64 subtracts |in| from |out|
369  * On entry:
370  *   in[i] < 2^126
371  * On exit:
372  *   out[i] < out[i] + 2^127 - 2^69
373  */
374 static void felem_diff128(largefelem out, const largefelem in)
375 {
376     /*
377      * In order to prevent underflow, we add 0 mod p before subtracting.
378      */
379     static const uint128_t two127m70 =
380         (((uint128_t) 1) << 127) - (((uint128_t) 1) << 70);
381     static const uint128_t two127m69 =
382         (((uint128_t) 1) << 127) - (((uint128_t) 1) << 69);
383 
384     out[0] += (two127m70 - in[0]);
385     out[1] += (two127m69 - in[1]);
386     out[2] += (two127m69 - in[2]);
387     out[3] += (two127m69 - in[3]);
388     out[4] += (two127m69 - in[4]);
389     out[5] += (two127m69 - in[5]);
390     out[6] += (two127m69 - in[6]);
391     out[7] += (two127m69 - in[7]);
392     out[8] += (two127m69 - in[8]);
393 }
394 
395 /*-
396  * felem_square sets |out| = |in|^2
397  * On entry:
398  *   in[i] < 2^62
399  * On exit:
400  *   out[i] < 17 * max(in[i]) * max(in[i])
401  */
402 static void felem_square(largefelem out, const felem in)
403 {
404     felem inx2, inx4;
405     felem_scalar(inx2, in, 2);
406     felem_scalar(inx4, in, 4);
407 
408     /*-
409      * We have many cases were we want to do
410      *   in[x] * in[y] +
411      *   in[y] * in[x]
412      * This is obviously just
413      *   2 * in[x] * in[y]
414      * However, rather than do the doubling on the 128 bit result, we
415      * double one of the inputs to the multiplication by reading from
416      * |inx2|
417      */
418 
419     out[0] = ((uint128_t) in[0]) * in[0];
420     out[1] = ((uint128_t) in[0]) * inx2[1];
421     out[2] = ((uint128_t) in[0]) * inx2[2] + ((uint128_t) in[1]) * in[1];
422     out[3] = ((uint128_t) in[0]) * inx2[3] + ((uint128_t) in[1]) * inx2[2];
423     out[4] = ((uint128_t) in[0]) * inx2[4] +
424              ((uint128_t) in[1]) * inx2[3] + ((uint128_t) in[2]) * in[2];
425     out[5] = ((uint128_t) in[0]) * inx2[5] +
426              ((uint128_t) in[1]) * inx2[4] + ((uint128_t) in[2]) * inx2[3];
427     out[6] = ((uint128_t) in[0]) * inx2[6] +
428              ((uint128_t) in[1]) * inx2[5] +
429              ((uint128_t) in[2]) * inx2[4] + ((uint128_t) in[3]) * in[3];
430     out[7] = ((uint128_t) in[0]) * inx2[7] +
431              ((uint128_t) in[1]) * inx2[6] +
432              ((uint128_t) in[2]) * inx2[5] + ((uint128_t) in[3]) * inx2[4];
433     out[8] = ((uint128_t) in[0]) * inx2[8] +
434              ((uint128_t) in[1]) * inx2[7] +
435              ((uint128_t) in[2]) * inx2[6] +
436              ((uint128_t) in[3]) * inx2[5] + ((uint128_t) in[4]) * in[4];
437 
438     /*
439      * The remaining limbs fall above 2^521, with the first falling at 2^522.
440      * They correspond to locations one bit up from the limbs produced above
441      * so we would have to multiply by two to align them. Again, rather than
442      * operate on the 128-bit result, we double one of the inputs to the
443      * multiplication. If we want to double for both this reason, and the
444      * reason above, then we end up multiplying by four.
445      */
446 
447     /* 9 */
448     out[0] += ((uint128_t) in[1]) * inx4[8] +
449               ((uint128_t) in[2]) * inx4[7] +
450               ((uint128_t) in[3]) * inx4[6] + ((uint128_t) in[4]) * inx4[5];
451 
452     /* 10 */
453     out[1] += ((uint128_t) in[2]) * inx4[8] +
454               ((uint128_t) in[3]) * inx4[7] +
455               ((uint128_t) in[4]) * inx4[6] + ((uint128_t) in[5]) * inx2[5];
456 
457     /* 11 */
458     out[2] += ((uint128_t) in[3]) * inx4[8] +
459               ((uint128_t) in[4]) * inx4[7] + ((uint128_t) in[5]) * inx4[6];
460 
461     /* 12 */
462     out[3] += ((uint128_t) in[4]) * inx4[8] +
463               ((uint128_t) in[5]) * inx4[7] + ((uint128_t) in[6]) * inx2[6];
464 
465     /* 13 */
466     out[4] += ((uint128_t) in[5]) * inx4[8] + ((uint128_t) in[6]) * inx4[7];
467 
468     /* 14 */
469     out[5] += ((uint128_t) in[6]) * inx4[8] + ((uint128_t) in[7]) * inx2[7];
470 
471     /* 15 */
472     out[6] += ((uint128_t) in[7]) * inx4[8];
473 
474     /* 16 */
475     out[7] += ((uint128_t) in[8]) * inx2[8];
476 }
477 
478 /*-
479  * felem_mul sets |out| = |in1| * |in2|
480  * On entry:
481  *   in1[i] < 2^64
482  *   in2[i] < 2^63
483  * On exit:
484  *   out[i] < 17 * max(in1[i]) * max(in2[i])
485  */
486 static void felem_mul(largefelem out, const felem in1, const felem in2)
487 {
488     felem in2x2;
489     felem_scalar(in2x2, in2, 2);
490 
491     out[0] = ((uint128_t) in1[0]) * in2[0];
492 
493     out[1] = ((uint128_t) in1[0]) * in2[1] +
494              ((uint128_t) in1[1]) * in2[0];
495 
496     out[2] = ((uint128_t) in1[0]) * in2[2] +
497              ((uint128_t) in1[1]) * in2[1] +
498              ((uint128_t) in1[2]) * in2[0];
499 
500     out[3] = ((uint128_t) in1[0]) * in2[3] +
501              ((uint128_t) in1[1]) * in2[2] +
502              ((uint128_t) in1[2]) * in2[1] +
503              ((uint128_t) in1[3]) * in2[0];
504 
505     out[4] = ((uint128_t) in1[0]) * in2[4] +
506              ((uint128_t) in1[1]) * in2[3] +
507              ((uint128_t) in1[2]) * in2[2] +
508              ((uint128_t) in1[3]) * in2[1] +
509              ((uint128_t) in1[4]) * in2[0];
510 
511     out[5] = ((uint128_t) in1[0]) * in2[5] +
512              ((uint128_t) in1[1]) * in2[4] +
513              ((uint128_t) in1[2]) * in2[3] +
514              ((uint128_t) in1[3]) * in2[2] +
515              ((uint128_t) in1[4]) * in2[1] +
516              ((uint128_t) in1[5]) * in2[0];
517 
518     out[6] = ((uint128_t) in1[0]) * in2[6] +
519              ((uint128_t) in1[1]) * in2[5] +
520              ((uint128_t) in1[2]) * in2[4] +
521              ((uint128_t) in1[3]) * in2[3] +
522              ((uint128_t) in1[4]) * in2[2] +
523              ((uint128_t) in1[5]) * in2[1] +
524              ((uint128_t) in1[6]) * in2[0];
525 
526     out[7] = ((uint128_t) in1[0]) * in2[7] +
527              ((uint128_t) in1[1]) * in2[6] +
528              ((uint128_t) in1[2]) * in2[5] +
529              ((uint128_t) in1[3]) * in2[4] +
530              ((uint128_t) in1[4]) * in2[3] +
531              ((uint128_t) in1[5]) * in2[2] +
532              ((uint128_t) in1[6]) * in2[1] +
533              ((uint128_t) in1[7]) * in2[0];
534 
535     out[8] = ((uint128_t) in1[0]) * in2[8] +
536              ((uint128_t) in1[1]) * in2[7] +
537              ((uint128_t) in1[2]) * in2[6] +
538              ((uint128_t) in1[3]) * in2[5] +
539              ((uint128_t) in1[4]) * in2[4] +
540              ((uint128_t) in1[5]) * in2[3] +
541              ((uint128_t) in1[6]) * in2[2] +
542              ((uint128_t) in1[7]) * in2[1] +
543              ((uint128_t) in1[8]) * in2[0];
544 
545     /* See comment in felem_square about the use of in2x2 here */
546 
547     out[0] += ((uint128_t) in1[1]) * in2x2[8] +
548               ((uint128_t) in1[2]) * in2x2[7] +
549               ((uint128_t) in1[3]) * in2x2[6] +
550               ((uint128_t) in1[4]) * in2x2[5] +
551               ((uint128_t) in1[5]) * in2x2[4] +
552               ((uint128_t) in1[6]) * in2x2[3] +
553               ((uint128_t) in1[7]) * in2x2[2] +
554               ((uint128_t) in1[8]) * in2x2[1];
555 
556     out[1] += ((uint128_t) in1[2]) * in2x2[8] +
557               ((uint128_t) in1[3]) * in2x2[7] +
558               ((uint128_t) in1[4]) * in2x2[6] +
559               ((uint128_t) in1[5]) * in2x2[5] +
560               ((uint128_t) in1[6]) * in2x2[4] +
561               ((uint128_t) in1[7]) * in2x2[3] +
562               ((uint128_t) in1[8]) * in2x2[2];
563 
564     out[2] += ((uint128_t) in1[3]) * in2x2[8] +
565               ((uint128_t) in1[4]) * in2x2[7] +
566               ((uint128_t) in1[5]) * in2x2[6] +
567               ((uint128_t) in1[6]) * in2x2[5] +
568               ((uint128_t) in1[7]) * in2x2[4] +
569               ((uint128_t) in1[8]) * in2x2[3];
570 
571     out[3] += ((uint128_t) in1[4]) * in2x2[8] +
572               ((uint128_t) in1[5]) * in2x2[7] +
573               ((uint128_t) in1[6]) * in2x2[6] +
574               ((uint128_t) in1[7]) * in2x2[5] +
575               ((uint128_t) in1[8]) * in2x2[4];
576 
577     out[4] += ((uint128_t) in1[5]) * in2x2[8] +
578               ((uint128_t) in1[6]) * in2x2[7] +
579               ((uint128_t) in1[7]) * in2x2[6] +
580               ((uint128_t) in1[8]) * in2x2[5];
581 
582     out[5] += ((uint128_t) in1[6]) * in2x2[8] +
583               ((uint128_t) in1[7]) * in2x2[7] +
584               ((uint128_t) in1[8]) * in2x2[6];
585 
586     out[6] += ((uint128_t) in1[7]) * in2x2[8] +
587               ((uint128_t) in1[8]) * in2x2[7];
588 
589     out[7] += ((uint128_t) in1[8]) * in2x2[8];
590 }
591 
592 static const limb bottom52bits = 0xfffffffffffff;
593 
594 /*-
595  * felem_reduce converts a largefelem to an felem.
596  * On entry:
597  *   in[i] < 2^128
598  * On exit:
599  *   out[i] < 2^59 + 2^14
600  */
601 static void felem_reduce(felem out, const largefelem in)
602 {
603     u64 overflow1, overflow2;
604 
605     out[0] = ((limb) in[0]) & bottom58bits;
606     out[1] = ((limb) in[1]) & bottom58bits;
607     out[2] = ((limb) in[2]) & bottom58bits;
608     out[3] = ((limb) in[3]) & bottom58bits;
609     out[4] = ((limb) in[4]) & bottom58bits;
610     out[5] = ((limb) in[5]) & bottom58bits;
611     out[6] = ((limb) in[6]) & bottom58bits;
612     out[7] = ((limb) in[7]) & bottom58bits;
613     out[8] = ((limb) in[8]) & bottom58bits;
614 
615     /* out[i] < 2^58 */
616 
617     out[1] += ((limb) in[0]) >> 58;
618     out[1] += (((limb) (in[0] >> 64)) & bottom52bits) << 6;
619     /*-
620      * out[1] < 2^58 + 2^6 + 2^58
621      *        = 2^59 + 2^6
622      */
623     out[2] += ((limb) (in[0] >> 64)) >> 52;
624 
625     out[2] += ((limb) in[1]) >> 58;
626     out[2] += (((limb) (in[1] >> 64)) & bottom52bits) << 6;
627     out[3] += ((limb) (in[1] >> 64)) >> 52;
628 
629     out[3] += ((limb) in[2]) >> 58;
630     out[3] += (((limb) (in[2] >> 64)) & bottom52bits) << 6;
631     out[4] += ((limb) (in[2] >> 64)) >> 52;
632 
633     out[4] += ((limb) in[3]) >> 58;
634     out[4] += (((limb) (in[3] >> 64)) & bottom52bits) << 6;
635     out[5] += ((limb) (in[3] >> 64)) >> 52;
636 
637     out[5] += ((limb) in[4]) >> 58;
638     out[5] += (((limb) (in[4] >> 64)) & bottom52bits) << 6;
639     out[6] += ((limb) (in[4] >> 64)) >> 52;
640 
641     out[6] += ((limb) in[5]) >> 58;
642     out[6] += (((limb) (in[5] >> 64)) & bottom52bits) << 6;
643     out[7] += ((limb) (in[5] >> 64)) >> 52;
644 
645     out[7] += ((limb) in[6]) >> 58;
646     out[7] += (((limb) (in[6] >> 64)) & bottom52bits) << 6;
647     out[8] += ((limb) (in[6] >> 64)) >> 52;
648 
649     out[8] += ((limb) in[7]) >> 58;
650     out[8] += (((limb) (in[7] >> 64)) & bottom52bits) << 6;
651     /*-
652      * out[x > 1] < 2^58 + 2^6 + 2^58 + 2^12
653      *            < 2^59 + 2^13
654      */
655     overflow1 = ((limb) (in[7] >> 64)) >> 52;
656 
657     overflow1 += ((limb) in[8]) >> 58;
658     overflow1 += (((limb) (in[8] >> 64)) & bottom52bits) << 6;
659     overflow2 = ((limb) (in[8] >> 64)) >> 52;
660 
661     overflow1 <<= 1;            /* overflow1 < 2^13 + 2^7 + 2^59 */
662     overflow2 <<= 1;            /* overflow2 < 2^13 */
663 
664     out[0] += overflow1;        /* out[0] < 2^60 */
665     out[1] += overflow2;        /* out[1] < 2^59 + 2^6 + 2^13 */
666 
667     out[1] += out[0] >> 58;
668     out[0] &= bottom58bits;
669     /*-
670      * out[0] < 2^58
671      * out[1] < 2^59 + 2^6 + 2^13 + 2^2
672      *        < 2^59 + 2^14
673      */
674 }
675 
676 static void felem_square_reduce(felem out, const felem in)
677 {
678     largefelem tmp;
679     felem_square(tmp, in);
680     felem_reduce(out, tmp);
681 }
682 
683 static void felem_mul_reduce(felem out, const felem in1, const felem in2)
684 {
685     largefelem tmp;
686     felem_mul(tmp, in1, in2);
687     felem_reduce(out, tmp);
688 }
689 
690 /*-
691  * felem_inv calculates |out| = |in|^{-1}
692  *
693  * Based on Fermat's Little Theorem:
694  *   a^p = a (mod p)
695  *   a^{p-1} = 1 (mod p)
696  *   a^{p-2} = a^{-1} (mod p)
697  */
698 static void felem_inv(felem out, const felem in)
699 {
700     felem ftmp, ftmp2, ftmp3, ftmp4;
701     largefelem tmp;
702     unsigned i;
703 
704     felem_square(tmp, in);
705     felem_reduce(ftmp, tmp);    /* 2^1 */
706     felem_mul(tmp, in, ftmp);
707     felem_reduce(ftmp, tmp);    /* 2^2 - 2^0 */
708     felem_assign(ftmp2, ftmp);
709     felem_square(tmp, ftmp);
710     felem_reduce(ftmp, tmp);    /* 2^3 - 2^1 */
711     felem_mul(tmp, in, ftmp);
712     felem_reduce(ftmp, tmp);    /* 2^3 - 2^0 */
713     felem_square(tmp, ftmp);
714     felem_reduce(ftmp, tmp);    /* 2^4 - 2^1 */
715 
716     felem_square(tmp, ftmp2);
717     felem_reduce(ftmp3, tmp);   /* 2^3 - 2^1 */
718     felem_square(tmp, ftmp3);
719     felem_reduce(ftmp3, tmp);   /* 2^4 - 2^2 */
720     felem_mul(tmp, ftmp3, ftmp2);
721     felem_reduce(ftmp3, tmp);   /* 2^4 - 2^0 */
722 
723     felem_assign(ftmp2, ftmp3);
724     felem_square(tmp, ftmp3);
725     felem_reduce(ftmp3, tmp);   /* 2^5 - 2^1 */
726     felem_square(tmp, ftmp3);
727     felem_reduce(ftmp3, tmp);   /* 2^6 - 2^2 */
728     felem_square(tmp, ftmp3);
729     felem_reduce(ftmp3, tmp);   /* 2^7 - 2^3 */
730     felem_square(tmp, ftmp3);
731     felem_reduce(ftmp3, tmp);   /* 2^8 - 2^4 */
732     felem_assign(ftmp4, ftmp3);
733     felem_mul(tmp, ftmp3, ftmp);
734     felem_reduce(ftmp4, tmp);   /* 2^8 - 2^1 */
735     felem_square(tmp, ftmp4);
736     felem_reduce(ftmp4, tmp);   /* 2^9 - 2^2 */
737     felem_mul(tmp, ftmp3, ftmp2);
738     felem_reduce(ftmp3, tmp);   /* 2^8 - 2^0 */
739     felem_assign(ftmp2, ftmp3);
740 
741     for (i = 0; i < 8; i++) {
742         felem_square(tmp, ftmp3);
743         felem_reduce(ftmp3, tmp); /* 2^16 - 2^8 */
744     }
745     felem_mul(tmp, ftmp3, ftmp2);
746     felem_reduce(ftmp3, tmp);   /* 2^16 - 2^0 */
747     felem_assign(ftmp2, ftmp3);
748 
749     for (i = 0; i < 16; i++) {
750         felem_square(tmp, ftmp3);
751         felem_reduce(ftmp3, tmp); /* 2^32 - 2^16 */
752     }
753     felem_mul(tmp, ftmp3, ftmp2);
754     felem_reduce(ftmp3, tmp);   /* 2^32 - 2^0 */
755     felem_assign(ftmp2, ftmp3);
756 
757     for (i = 0; i < 32; i++) {
758         felem_square(tmp, ftmp3);
759         felem_reduce(ftmp3, tmp); /* 2^64 - 2^32 */
760     }
761     felem_mul(tmp, ftmp3, ftmp2);
762     felem_reduce(ftmp3, tmp);   /* 2^64 - 2^0 */
763     felem_assign(ftmp2, ftmp3);
764 
765     for (i = 0; i < 64; i++) {
766         felem_square(tmp, ftmp3);
767         felem_reduce(ftmp3, tmp); /* 2^128 - 2^64 */
768     }
769     felem_mul(tmp, ftmp3, ftmp2);
770     felem_reduce(ftmp3, tmp);   /* 2^128 - 2^0 */
771     felem_assign(ftmp2, ftmp3);
772 
773     for (i = 0; i < 128; i++) {
774         felem_square(tmp, ftmp3);
775         felem_reduce(ftmp3, tmp); /* 2^256 - 2^128 */
776     }
777     felem_mul(tmp, ftmp3, ftmp2);
778     felem_reduce(ftmp3, tmp);   /* 2^256 - 2^0 */
779     felem_assign(ftmp2, ftmp3);
780 
781     for (i = 0; i < 256; i++) {
782         felem_square(tmp, ftmp3);
783         felem_reduce(ftmp3, tmp); /* 2^512 - 2^256 */
784     }
785     felem_mul(tmp, ftmp3, ftmp2);
786     felem_reduce(ftmp3, tmp);   /* 2^512 - 2^0 */
787 
788     for (i = 0; i < 9; i++) {
789         felem_square(tmp, ftmp3);
790         felem_reduce(ftmp3, tmp); /* 2^521 - 2^9 */
791     }
792     felem_mul(tmp, ftmp3, ftmp4);
793     felem_reduce(ftmp3, tmp);   /* 2^512 - 2^2 */
794     felem_mul(tmp, ftmp3, in);
795     felem_reduce(out, tmp);     /* 2^512 - 3 */
796 }
797 
798 /* This is 2^521-1, expressed as an felem */
799 static const felem kPrime = {
800     0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
801     0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
802     0x03ffffffffffffff, 0x03ffffffffffffff, 0x01ffffffffffffff
803 };
804 
805 /*-
806  * felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
807  * otherwise.
808  * On entry:
809  *   in[i] < 2^59 + 2^14
810  */
811 static limb felem_is_zero(const felem in)
812 {
813     felem ftmp;
814     limb is_zero, is_p;
815     felem_assign(ftmp, in);
816 
817     ftmp[0] += ftmp[8] >> 57;
818     ftmp[8] &= bottom57bits;
819     /* ftmp[8] < 2^57 */
820     ftmp[1] += ftmp[0] >> 58;
821     ftmp[0] &= bottom58bits;
822     ftmp[2] += ftmp[1] >> 58;
823     ftmp[1] &= bottom58bits;
824     ftmp[3] += ftmp[2] >> 58;
825     ftmp[2] &= bottom58bits;
826     ftmp[4] += ftmp[3] >> 58;
827     ftmp[3] &= bottom58bits;
828     ftmp[5] += ftmp[4] >> 58;
829     ftmp[4] &= bottom58bits;
830     ftmp[6] += ftmp[5] >> 58;
831     ftmp[5] &= bottom58bits;
832     ftmp[7] += ftmp[6] >> 58;
833     ftmp[6] &= bottom58bits;
834     ftmp[8] += ftmp[7] >> 58;
835     ftmp[7] &= bottom58bits;
836     /* ftmp[8] < 2^57 + 4 */
837 
838     /*
839      * The ninth limb of 2*(2^521-1) is 0x03ffffffffffffff, which is greater
840      * than our bound for ftmp[8]. Therefore we only have to check if the
841      * zero is zero or 2^521-1.
842      */
843 
844     is_zero = 0;
845     is_zero |= ftmp[0];
846     is_zero |= ftmp[1];
847     is_zero |= ftmp[2];
848     is_zero |= ftmp[3];
849     is_zero |= ftmp[4];
850     is_zero |= ftmp[5];
851     is_zero |= ftmp[6];
852     is_zero |= ftmp[7];
853     is_zero |= ftmp[8];
854 
855     is_zero--;
856     /*
857      * We know that ftmp[i] < 2^63, therefore the only way that the top bit
858      * can be set is if is_zero was 0 before the decrement.
859      */
860     is_zero = 0 - (is_zero >> 63);
861 
862     is_p = ftmp[0] ^ kPrime[0];
863     is_p |= ftmp[1] ^ kPrime[1];
864     is_p |= ftmp[2] ^ kPrime[2];
865     is_p |= ftmp[3] ^ kPrime[3];
866     is_p |= ftmp[4] ^ kPrime[4];
867     is_p |= ftmp[5] ^ kPrime[5];
868     is_p |= ftmp[6] ^ kPrime[6];
869     is_p |= ftmp[7] ^ kPrime[7];
870     is_p |= ftmp[8] ^ kPrime[8];
871 
872     is_p--;
873     is_p = 0 - (is_p >> 63);
874 
875     is_zero |= is_p;
876     return is_zero;
877 }
878 
879 static int felem_is_zero_int(const void *in)
880 {
881     return (int)(felem_is_zero(in) & ((limb) 1));
882 }
883 
884 /*-
885  * felem_contract converts |in| to its unique, minimal representation.
886  * On entry:
887  *   in[i] < 2^59 + 2^14
888  */
889 static void felem_contract(felem out, const felem in)
890 {
891     limb is_p, is_greater, sign;
892     static const limb two58 = ((limb) 1) << 58;
893 
894     felem_assign(out, in);
895 
896     out[0] += out[8] >> 57;
897     out[8] &= bottom57bits;
898     /* out[8] < 2^57 */
899     out[1] += out[0] >> 58;
900     out[0] &= bottom58bits;
901     out[2] += out[1] >> 58;
902     out[1] &= bottom58bits;
903     out[3] += out[2] >> 58;
904     out[2] &= bottom58bits;
905     out[4] += out[3] >> 58;
906     out[3] &= bottom58bits;
907     out[5] += out[4] >> 58;
908     out[4] &= bottom58bits;
909     out[6] += out[5] >> 58;
910     out[5] &= bottom58bits;
911     out[7] += out[6] >> 58;
912     out[6] &= bottom58bits;
913     out[8] += out[7] >> 58;
914     out[7] &= bottom58bits;
915     /* out[8] < 2^57 + 4 */
916 
917     /*
918      * If the value is greater than 2^521-1 then we have to subtract 2^521-1
919      * out. See the comments in felem_is_zero regarding why we don't test for
920      * other multiples of the prime.
921      */
922 
923     /*
924      * First, if |out| is equal to 2^521-1, we subtract it out to get zero.
925      */
926 
927     is_p = out[0] ^ kPrime[0];
928     is_p |= out[1] ^ kPrime[1];
929     is_p |= out[2] ^ kPrime[2];
930     is_p |= out[3] ^ kPrime[3];
931     is_p |= out[4] ^ kPrime[4];
932     is_p |= out[5] ^ kPrime[5];
933     is_p |= out[6] ^ kPrime[6];
934     is_p |= out[7] ^ kPrime[7];
935     is_p |= out[8] ^ kPrime[8];
936 
937     is_p--;
938     is_p &= is_p << 32;
939     is_p &= is_p << 16;
940     is_p &= is_p << 8;
941     is_p &= is_p << 4;
942     is_p &= is_p << 2;
943     is_p &= is_p << 1;
944     is_p = 0 - (is_p >> 63);
945     is_p = ~is_p;
946 
947     /* is_p is 0 iff |out| == 2^521-1 and all ones otherwise */
948 
949     out[0] &= is_p;
950     out[1] &= is_p;
951     out[2] &= is_p;
952     out[3] &= is_p;
953     out[4] &= is_p;
954     out[5] &= is_p;
955     out[6] &= is_p;
956     out[7] &= is_p;
957     out[8] &= is_p;
958 
959     /*
960      * In order to test that |out| >= 2^521-1 we need only test if out[8] >>
961      * 57 is greater than zero as (2^521-1) + x >= 2^522
962      */
963     is_greater = out[8] >> 57;
964     is_greater |= is_greater << 32;
965     is_greater |= is_greater << 16;
966     is_greater |= is_greater << 8;
967     is_greater |= is_greater << 4;
968     is_greater |= is_greater << 2;
969     is_greater |= is_greater << 1;
970     is_greater = 0 - (is_greater >> 63);
971 
972     out[0] -= kPrime[0] & is_greater;
973     out[1] -= kPrime[1] & is_greater;
974     out[2] -= kPrime[2] & is_greater;
975     out[3] -= kPrime[3] & is_greater;
976     out[4] -= kPrime[4] & is_greater;
977     out[5] -= kPrime[5] & is_greater;
978     out[6] -= kPrime[6] & is_greater;
979     out[7] -= kPrime[7] & is_greater;
980     out[8] -= kPrime[8] & is_greater;
981 
982     /* Eliminate negative coefficients */
983     sign = -(out[0] >> 63);
984     out[0] += (two58 & sign);
985     out[1] -= (1 & sign);
986     sign = -(out[1] >> 63);
987     out[1] += (two58 & sign);
988     out[2] -= (1 & sign);
989     sign = -(out[2] >> 63);
990     out[2] += (two58 & sign);
991     out[3] -= (1 & sign);
992     sign = -(out[3] >> 63);
993     out[3] += (two58 & sign);
994     out[4] -= (1 & sign);
995     sign = -(out[4] >> 63);
996     out[4] += (two58 & sign);
997     out[5] -= (1 & sign);
998     sign = -(out[0] >> 63);
999     out[5] += (two58 & sign);
1000     out[6] -= (1 & sign);
1001     sign = -(out[6] >> 63);
1002     out[6] += (two58 & sign);
1003     out[7] -= (1 & sign);
1004     sign = -(out[7] >> 63);
1005     out[7] += (two58 & sign);
1006     out[8] -= (1 & sign);
1007     sign = -(out[5] >> 63);
1008     out[5] += (two58 & sign);
1009     out[6] -= (1 & sign);
1010     sign = -(out[6] >> 63);
1011     out[6] += (two58 & sign);
1012     out[7] -= (1 & sign);
1013     sign = -(out[7] >> 63);
1014     out[7] += (two58 & sign);
1015     out[8] -= (1 & sign);
1016 }
1017 
1018 /*-
1019  * Group operations
1020  * ----------------
1021  *
1022  * Building on top of the field operations we have the operations on the
1023  * elliptic curve group itself. Points on the curve are represented in Jacobian
1024  * coordinates */
1025 
1026 /*-
1027  * point_double calculates 2*(x_in, y_in, z_in)
1028  *
1029  * The method is taken from:
1030  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
1031  *
1032  * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed.
1033  * while x_out == y_in is not (maybe this works, but it's not tested). */
1034 static void
1035 point_double(felem x_out, felem y_out, felem z_out,
1036              const felem x_in, const felem y_in, const felem z_in)
1037 {
1038     largefelem tmp, tmp2;
1039     felem delta, gamma, beta, alpha, ftmp, ftmp2;
1040 
1041     felem_assign(ftmp, x_in);
1042     felem_assign(ftmp2, x_in);
1043 
1044     /* delta = z^2 */
1045     felem_square(tmp, z_in);
1046     felem_reduce(delta, tmp);   /* delta[i] < 2^59 + 2^14 */
1047 
1048     /* gamma = y^2 */
1049     felem_square(tmp, y_in);
1050     felem_reduce(gamma, tmp);   /* gamma[i] < 2^59 + 2^14 */
1051 
1052     /* beta = x*gamma */
1053     felem_mul(tmp, x_in, gamma);
1054     felem_reduce(beta, tmp);    /* beta[i] < 2^59 + 2^14 */
1055 
1056     /* alpha = 3*(x-delta)*(x+delta) */
1057     felem_diff64(ftmp, delta);
1058     /* ftmp[i] < 2^61 */
1059     felem_sum64(ftmp2, delta);
1060     /* ftmp2[i] < 2^60 + 2^15 */
1061     felem_scalar64(ftmp2, 3);
1062     /* ftmp2[i] < 3*2^60 + 3*2^15 */
1063     felem_mul(tmp, ftmp, ftmp2);
1064     /*-
1065      * tmp[i] < 17(3*2^121 + 3*2^76)
1066      *        = 61*2^121 + 61*2^76
1067      *        < 64*2^121 + 64*2^76
1068      *        = 2^127 + 2^82
1069      *        < 2^128
1070      */
1071     felem_reduce(alpha, tmp);
1072 
1073     /* x' = alpha^2 - 8*beta */
1074     felem_square(tmp, alpha);
1075     /*
1076      * tmp[i] < 17*2^120 < 2^125
1077      */
1078     felem_assign(ftmp, beta);
1079     felem_scalar64(ftmp, 8);
1080     /* ftmp[i] < 2^62 + 2^17 */
1081     felem_diff_128_64(tmp, ftmp);
1082     /* tmp[i] < 2^125 + 2^63 + 2^62 + 2^17 */
1083     felem_reduce(x_out, tmp);
1084 
1085     /* z' = (y + z)^2 - gamma - delta */
1086     felem_sum64(delta, gamma);
1087     /* delta[i] < 2^60 + 2^15 */
1088     felem_assign(ftmp, y_in);
1089     felem_sum64(ftmp, z_in);
1090     /* ftmp[i] < 2^60 + 2^15 */
1091     felem_square(tmp, ftmp);
1092     /*
1093      * tmp[i] < 17(2^122) < 2^127
1094      */
1095     felem_diff_128_64(tmp, delta);
1096     /* tmp[i] < 2^127 + 2^63 */
1097     felem_reduce(z_out, tmp);
1098 
1099     /* y' = alpha*(4*beta - x') - 8*gamma^2 */
1100     felem_scalar64(beta, 4);
1101     /* beta[i] < 2^61 + 2^16 */
1102     felem_diff64(beta, x_out);
1103     /* beta[i] < 2^61 + 2^60 + 2^16 */
1104     felem_mul(tmp, alpha, beta);
1105     /*-
1106      * tmp[i] < 17*((2^59 + 2^14)(2^61 + 2^60 + 2^16))
1107      *        = 17*(2^120 + 2^75 + 2^119 + 2^74 + 2^75 + 2^30)
1108      *        = 17*(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1109      *        < 2^128
1110      */
1111     felem_square(tmp2, gamma);
1112     /*-
1113      * tmp2[i] < 17*(2^59 + 2^14)^2
1114      *         = 17*(2^118 + 2^74 + 2^28)
1115      */
1116     felem_scalar128(tmp2, 8);
1117     /*-
1118      * tmp2[i] < 8*17*(2^118 + 2^74 + 2^28)
1119      *         = 2^125 + 2^121 + 2^81 + 2^77 + 2^35 + 2^31
1120      *         < 2^126
1121      */
1122     felem_diff128(tmp, tmp2);
1123     /*-
1124      * tmp[i] < 2^127 - 2^69 + 17(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1125      *        = 2^127 + 2^124 + 2^122 + 2^120 + 2^118 + 2^80 + 2^78 + 2^76 +
1126      *          2^74 + 2^69 + 2^34 + 2^30
1127      *        < 2^128
1128      */
1129     felem_reduce(y_out, tmp);
1130 }
1131 
1132 /* copy_conditional copies in to out iff mask is all ones. */
1133 static void copy_conditional(felem out, const felem in, limb mask)
1134 {
1135     unsigned i;
1136     for (i = 0; i < NLIMBS; ++i) {
1137         const limb tmp = mask & (in[i] ^ out[i]);
1138         out[i] ^= tmp;
1139     }
1140 }
1141 
1142 /*-
1143  * point_add calculates (x1, y1, z1) + (x2, y2, z2)
1144  *
1145  * The method is taken from
1146  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1147  * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1148  *
1149  * This function includes a branch for checking whether the two input points
1150  * are equal (while not equal to the point at infinity). See comment below
1151  * on constant-time.
1152  */
1153 static void point_add(felem x3, felem y3, felem z3,
1154                       const felem x1, const felem y1, const felem z1,
1155                       const int mixed, const felem x2, const felem y2,
1156                       const felem z2)
1157 {
1158     felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1159     largefelem tmp, tmp2;
1160     limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1161 
1162     z1_is_zero = felem_is_zero(z1);
1163     z2_is_zero = felem_is_zero(z2);
1164 
1165     /* ftmp = z1z1 = z1**2 */
1166     felem_square(tmp, z1);
1167     felem_reduce(ftmp, tmp);
1168 
1169     if (!mixed) {
1170         /* ftmp2 = z2z2 = z2**2 */
1171         felem_square(tmp, z2);
1172         felem_reduce(ftmp2, tmp);
1173 
1174         /* u1 = ftmp3 = x1*z2z2 */
1175         felem_mul(tmp, x1, ftmp2);
1176         felem_reduce(ftmp3, tmp);
1177 
1178         /* ftmp5 = z1 + z2 */
1179         felem_assign(ftmp5, z1);
1180         felem_sum64(ftmp5, z2);
1181         /* ftmp5[i] < 2^61 */
1182 
1183         /* ftmp5 = (z1 + z2)**2 - z1z1 - z2z2 = 2*z1z2 */
1184         felem_square(tmp, ftmp5);
1185         /* tmp[i] < 17*2^122 */
1186         felem_diff_128_64(tmp, ftmp);
1187         /* tmp[i] < 17*2^122 + 2^63 */
1188         felem_diff_128_64(tmp, ftmp2);
1189         /* tmp[i] < 17*2^122 + 2^64 */
1190         felem_reduce(ftmp5, tmp);
1191 
1192         /* ftmp2 = z2 * z2z2 */
1193         felem_mul(tmp, ftmp2, z2);
1194         felem_reduce(ftmp2, tmp);
1195 
1196         /* s1 = ftmp6 = y1 * z2**3 */
1197         felem_mul(tmp, y1, ftmp2);
1198         felem_reduce(ftmp6, tmp);
1199     } else {
1200         /*
1201          * We'll assume z2 = 1 (special case z2 = 0 is handled later)
1202          */
1203 
1204         /* u1 = ftmp3 = x1*z2z2 */
1205         felem_assign(ftmp3, x1);
1206 
1207         /* ftmp5 = 2*z1z2 */
1208         felem_scalar(ftmp5, z1, 2);
1209 
1210         /* s1 = ftmp6 = y1 * z2**3 */
1211         felem_assign(ftmp6, y1);
1212     }
1213 
1214     /* u2 = x2*z1z1 */
1215     felem_mul(tmp, x2, ftmp);
1216     /* tmp[i] < 17*2^120 */
1217 
1218     /* h = ftmp4 = u2 - u1 */
1219     felem_diff_128_64(tmp, ftmp3);
1220     /* tmp[i] < 17*2^120 + 2^63 */
1221     felem_reduce(ftmp4, tmp);
1222 
1223     x_equal = felem_is_zero(ftmp4);
1224 
1225     /* z_out = ftmp5 * h */
1226     felem_mul(tmp, ftmp5, ftmp4);
1227     felem_reduce(z_out, tmp);
1228 
1229     /* ftmp = z1 * z1z1 */
1230     felem_mul(tmp, ftmp, z1);
1231     felem_reduce(ftmp, tmp);
1232 
1233     /* s2 = tmp = y2 * z1**3 */
1234     felem_mul(tmp, y2, ftmp);
1235     /* tmp[i] < 17*2^120 */
1236 
1237     /* r = ftmp5 = (s2 - s1)*2 */
1238     felem_diff_128_64(tmp, ftmp6);
1239     /* tmp[i] < 17*2^120 + 2^63 */
1240     felem_reduce(ftmp5, tmp);
1241     y_equal = felem_is_zero(ftmp5);
1242     felem_scalar64(ftmp5, 2);
1243     /* ftmp5[i] < 2^61 */
1244 
1245     if (x_equal && y_equal && !z1_is_zero && !z2_is_zero) {
1246         /*
1247          * This is obviously not constant-time but it will almost-never happen
1248          * for ECDH / ECDSA. The case where it can happen is during scalar-mult
1249          * where the intermediate value gets very close to the group order.
1250          * Since |ec_GFp_nistp_recode_scalar_bits| produces signed digits for
1251          * the scalar, it's possible for the intermediate value to be a small
1252          * negative multiple of the base point, and for the final signed digit
1253          * to be the same value. We believe that this only occurs for the scalar
1254          * 1fffffffffffffffffffffffffffffffffffffffffffffffffffffffffff
1255          * ffffffa51868783bf2f966b7fcc0148f709a5d03bb5c9b8899c47aebb6fb
1256          * 71e913863f7, in that case the penultimate intermediate is -9G and
1257          * the final digit is also -9G. Since this only happens for a single
1258          * scalar, the timing leak is irrelevant. (Any attacker who wanted to
1259          * check whether a secret scalar was that exact value, can already do
1260          * so.)
1261          */
1262         point_double(x3, y3, z3, x1, y1, z1);
1263         return;
1264     }
1265 
1266     /* I = ftmp = (2h)**2 */
1267     felem_assign(ftmp, ftmp4);
1268     felem_scalar64(ftmp, 2);
1269     /* ftmp[i] < 2^61 */
1270     felem_square(tmp, ftmp);
1271     /* tmp[i] < 17*2^122 */
1272     felem_reduce(ftmp, tmp);
1273 
1274     /* J = ftmp2 = h * I */
1275     felem_mul(tmp, ftmp4, ftmp);
1276     felem_reduce(ftmp2, tmp);
1277 
1278     /* V = ftmp4 = U1 * I */
1279     felem_mul(tmp, ftmp3, ftmp);
1280     felem_reduce(ftmp4, tmp);
1281 
1282     /* x_out = r**2 - J - 2V */
1283     felem_square(tmp, ftmp5);
1284     /* tmp[i] < 17*2^122 */
1285     felem_diff_128_64(tmp, ftmp2);
1286     /* tmp[i] < 17*2^122 + 2^63 */
1287     felem_assign(ftmp3, ftmp4);
1288     felem_scalar64(ftmp4, 2);
1289     /* ftmp4[i] < 2^61 */
1290     felem_diff_128_64(tmp, ftmp4);
1291     /* tmp[i] < 17*2^122 + 2^64 */
1292     felem_reduce(x_out, tmp);
1293 
1294     /* y_out = r(V-x_out) - 2 * s1 * J */
1295     felem_diff64(ftmp3, x_out);
1296     /*
1297      * ftmp3[i] < 2^60 + 2^60 = 2^61
1298      */
1299     felem_mul(tmp, ftmp5, ftmp3);
1300     /* tmp[i] < 17*2^122 */
1301     felem_mul(tmp2, ftmp6, ftmp2);
1302     /* tmp2[i] < 17*2^120 */
1303     felem_scalar128(tmp2, 2);
1304     /* tmp2[i] < 17*2^121 */
1305     felem_diff128(tmp, tmp2);
1306         /*-
1307          * tmp[i] < 2^127 - 2^69 + 17*2^122
1308          *        = 2^126 - 2^122 - 2^6 - 2^2 - 1
1309          *        < 2^127
1310          */
1311     felem_reduce(y_out, tmp);
1312 
1313     copy_conditional(x_out, x2, z1_is_zero);
1314     copy_conditional(x_out, x1, z2_is_zero);
1315     copy_conditional(y_out, y2, z1_is_zero);
1316     copy_conditional(y_out, y1, z2_is_zero);
1317     copy_conditional(z_out, z2, z1_is_zero);
1318     copy_conditional(z_out, z1, z2_is_zero);
1319     felem_assign(x3, x_out);
1320     felem_assign(y3, y_out);
1321     felem_assign(z3, z_out);
1322 }
1323 
1324 /*-
1325  * Base point pre computation
1326  * --------------------------
1327  *
1328  * Two different sorts of precomputed tables are used in the following code.
1329  * Each contain various points on the curve, where each point is three field
1330  * elements (x, y, z).
1331  *
1332  * For the base point table, z is usually 1 (0 for the point at infinity).
1333  * This table has 16 elements:
1334  * index | bits    | point
1335  * ------+---------+------------------------------
1336  *     0 | 0 0 0 0 | 0G
1337  *     1 | 0 0 0 1 | 1G
1338  *     2 | 0 0 1 0 | 2^130G
1339  *     3 | 0 0 1 1 | (2^130 + 1)G
1340  *     4 | 0 1 0 0 | 2^260G
1341  *     5 | 0 1 0 1 | (2^260 + 1)G
1342  *     6 | 0 1 1 0 | (2^260 + 2^130)G
1343  *     7 | 0 1 1 1 | (2^260 + 2^130 + 1)G
1344  *     8 | 1 0 0 0 | 2^390G
1345  *     9 | 1 0 0 1 | (2^390 + 1)G
1346  *    10 | 1 0 1 0 | (2^390 + 2^130)G
1347  *    11 | 1 0 1 1 | (2^390 + 2^130 + 1)G
1348  *    12 | 1 1 0 0 | (2^390 + 2^260)G
1349  *    13 | 1 1 0 1 | (2^390 + 2^260 + 1)G
1350  *    14 | 1 1 1 0 | (2^390 + 2^260 + 2^130)G
1351  *    15 | 1 1 1 1 | (2^390 + 2^260 + 2^130 + 1)G
1352  *
1353  * The reason for this is so that we can clock bits into four different
1354  * locations when doing simple scalar multiplies against the base point.
1355  *
1356  * Tables for other points have table[i] = iG for i in 0 .. 16. */
1357 
1358 /* gmul is the table of precomputed base points */
1359 static const felem gmul[16][3] = {
1360 {{0, 0, 0, 0, 0, 0, 0, 0, 0},
1361  {0, 0, 0, 0, 0, 0, 0, 0, 0},
1362  {0, 0, 0, 0, 0, 0, 0, 0, 0}},
1363 {{0x017e7e31c2e5bd66, 0x022cf0615a90a6fe, 0x00127a2ffa8de334,
1364   0x01dfbf9d64a3f877, 0x006b4d3dbaa14b5e, 0x014fed487e0a2bd8,
1365   0x015b4429c6481390, 0x03a73678fb2d988e, 0x00c6858e06b70404},
1366  {0x00be94769fd16650, 0x031c21a89cb09022, 0x039013fad0761353,
1367   0x02657bd099031542, 0x03273e662c97ee72, 0x01e6d11a05ebef45,
1368   0x03d1bd998f544495, 0x03001172297ed0b1, 0x011839296a789a3b},
1369  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1370 {{0x0373faacbc875bae, 0x00f325023721c671, 0x00f666fd3dbde5ad,
1371   0x01a6932363f88ea7, 0x01fc6d9e13f9c47b, 0x03bcbffc2bbf734e,
1372   0x013ee3c3647f3a92, 0x029409fefe75d07d, 0x00ef9199963d85e5},
1373  {0x011173743ad5b178, 0x02499c7c21bf7d46, 0x035beaeabb8b1a58,
1374   0x00f989c4752ea0a3, 0x0101e1de48a9c1a3, 0x01a20076be28ba6c,
1375   0x02f8052e5eb2de95, 0x01bfe8f82dea117c, 0x0160074d3c36ddb7},
1376  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1377 {{0x012f3fc373393b3b, 0x03d3d6172f1419fa, 0x02adc943c0b86873,
1378   0x00d475584177952b, 0x012a4d1673750ee2, 0x00512517a0f13b0c,
1379   0x02b184671a7b1734, 0x0315b84236f1a50a, 0x00a4afc472edbdb9},
1380  {0x00152a7077f385c4, 0x03044007d8d1c2ee, 0x0065829d61d52b52,
1381   0x00494ff6b6631d0d, 0x00a11d94d5f06bcf, 0x02d2f89474d9282e,
1382   0x0241c5727c06eeb9, 0x0386928710fbdb9d, 0x01f883f727b0dfbe},
1383  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1384 {{0x019b0c3c9185544d, 0x006243a37c9d97db, 0x02ee3cbe030a2ad2,
1385   0x00cfdd946bb51e0d, 0x0271c00932606b91, 0x03f817d1ec68c561,
1386   0x03f37009806a369c, 0x03c1f30baf184fd5, 0x01091022d6d2f065},
1387  {0x0292c583514c45ed, 0x0316fca51f9a286c, 0x00300af507c1489a,
1388   0x0295f69008298cf1, 0x02c0ed8274943d7b, 0x016509b9b47a431e,
1389   0x02bc9de9634868ce, 0x005b34929bffcb09, 0x000c1a0121681524},
1390  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1391 {{0x0286abc0292fb9f2, 0x02665eee9805b3f7, 0x01ed7455f17f26d6,
1392   0x0346355b83175d13, 0x006284944cd0a097, 0x0191895bcdec5e51,
1393   0x02e288370afda7d9, 0x03b22312bfefa67a, 0x01d104d3fc0613fe},
1394  {0x0092421a12f7e47f, 0x0077a83fa373c501, 0x03bd25c5f696bd0d,
1395   0x035c41e4d5459761, 0x01ca0d1742b24f53, 0x00aaab27863a509c,
1396   0x018b6de47df73917, 0x025c0b771705cd01, 0x01fd51d566d760a7},
1397  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1398 {{0x01dd92ff6b0d1dbd, 0x039c5e2e8f8afa69, 0x0261ed13242c3b27,
1399   0x0382c6e67026e6a0, 0x01d60b10be2089f9, 0x03c15f3dce86723f,
1400   0x03c764a32d2a062d, 0x017307eac0fad056, 0x018207c0b96c5256},
1401  {0x0196a16d60e13154, 0x03e6ce74c0267030, 0x00ddbf2b4e52a5aa,
1402   0x012738241bbf31c8, 0x00ebe8dc04685a28, 0x024c2ad6d380d4a2,
1403   0x035ee062a6e62d0e, 0x0029ed74af7d3a0f, 0x00eef32aec142ebd},
1404  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1405 {{0x00c31ec398993b39, 0x03a9f45bcda68253, 0x00ac733c24c70890,
1406   0x00872b111401ff01, 0x01d178c23195eafb, 0x03bca2c816b87f74,
1407   0x0261a9af46fbad7a, 0x0324b2a8dd3d28f9, 0x00918121d8f24e23},
1408  {0x032bc8c1ca983cd7, 0x00d869dfb08fc8c6, 0x01693cb61fce1516,
1409   0x012a5ea68f4e88a8, 0x010869cab88d7ae3, 0x009081ad277ceee1,
1410   0x033a77166d064cdc, 0x03955235a1fb3a95, 0x01251a4a9b25b65e},
1411  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1412 {{0x00148a3a1b27f40b, 0x0123186df1b31fdc, 0x00026e7beaad34ce,
1413   0x01db446ac1d3dbba, 0x0299c1a33437eaec, 0x024540610183cbb7,
1414   0x0173bb0e9ce92e46, 0x02b937e43921214b, 0x01ab0436a9bf01b5},
1415  {0x0383381640d46948, 0x008dacbf0e7f330f, 0x03602122bcc3f318,
1416   0x01ee596b200620d6, 0x03bd0585fda430b3, 0x014aed77fd123a83,
1417   0x005ace749e52f742, 0x0390fe041da2b842, 0x0189a8ceb3299242},
1418  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1419 {{0x012a19d6b3282473, 0x00c0915918b423ce, 0x023a954eb94405ae,
1420   0x00529f692be26158, 0x0289fa1b6fa4b2aa, 0x0198ae4ceea346ef,
1421   0x0047d8cdfbdedd49, 0x00cc8c8953f0f6b8, 0x001424abbff49203},
1422  {0x0256732a1115a03a, 0x0351bc38665c6733, 0x03f7b950fb4a6447,
1423   0x000afffa94c22155, 0x025763d0a4dab540, 0x000511e92d4fc283,
1424   0x030a7e9eda0ee96c, 0x004c3cd93a28bf0a, 0x017edb3a8719217f},
1425  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1426 {{0x011de5675a88e673, 0x031d7d0f5e567fbe, 0x0016b2062c970ae5,
1427   0x03f4a2be49d90aa7, 0x03cef0bd13822866, 0x03f0923dcf774a6c,
1428   0x0284bebc4f322f72, 0x016ab2645302bb2c, 0x01793f95dace0e2a},
1429  {0x010646e13527a28f, 0x01ca1babd59dc5e7, 0x01afedfd9a5595df,
1430   0x01f15785212ea6b1, 0x0324e5d64f6ae3f4, 0x02d680f526d00645,
1431   0x0127920fadf627a7, 0x03b383f75df4f684, 0x0089e0057e783b0a},
1432  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1433 {{0x00f334b9eb3c26c6, 0x0298fdaa98568dce, 0x01c2d24843a82292,
1434   0x020bcb24fa1b0711, 0x02cbdb3d2b1875e6, 0x0014907598f89422,
1435   0x03abe3aa43b26664, 0x02cbf47f720bc168, 0x0133b5e73014b79b},
1436  {0x034aab5dab05779d, 0x00cdc5d71fee9abb, 0x0399f16bd4bd9d30,
1437   0x03582fa592d82647, 0x02be1cdfb775b0e9, 0x0034f7cea32e94cb,
1438   0x0335a7f08f56f286, 0x03b707e9565d1c8b, 0x0015c946ea5b614f},
1439  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1440 {{0x024676f6cff72255, 0x00d14625cac96378, 0x00532b6008bc3767,
1441   0x01fc16721b985322, 0x023355ea1b091668, 0x029de7afdc0317c3,
1442   0x02fc8a7ca2da037c, 0x02de1217d74a6f30, 0x013f7173175b73bf},
1443  {0x0344913f441490b5, 0x0200f9e272b61eca, 0x0258a246b1dd55d2,
1444   0x03753db9ea496f36, 0x025e02937a09c5ef, 0x030cbd3d14012692,
1445   0x01793a67e70dc72a, 0x03ec1d37048a662e, 0x006550f700c32a8d},
1446  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1447 {{0x00d3f48a347eba27, 0x008e636649b61bd8, 0x00d3b93716778fb3,
1448   0x004d1915757bd209, 0x019d5311a3da44e0, 0x016d1afcbbe6aade,
1449   0x0241bf5f73265616, 0x0384672e5d50d39b, 0x005009fee522b684},
1450  {0x029b4fab064435fe, 0x018868ee095bbb07, 0x01ea3d6936cc92b8,
1451   0x000608b00f78a2f3, 0x02db911073d1c20f, 0x018205938470100a,
1452   0x01f1e4964cbe6ff2, 0x021a19a29eed4663, 0x01414485f42afa81},
1453  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1454 {{0x01612b3a17f63e34, 0x03813992885428e6, 0x022b3c215b5a9608,
1455   0x029b4057e19f2fcb, 0x0384059a587af7e6, 0x02d6400ace6fe610,
1456   0x029354d896e8e331, 0x00c047ee6dfba65e, 0x0037720542e9d49d},
1457  {0x02ce9eed7c5e9278, 0x0374ed703e79643b, 0x01316c54c4072006,
1458   0x005aaa09054b2ee8, 0x002824000c840d57, 0x03d4eba24771ed86,
1459   0x0189c50aabc3bdae, 0x0338c01541e15510, 0x00466d56e38eed42},
1460  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1461 {{0x007efd8330ad8bd6, 0x02465ed48047710b, 0x0034c6606b215e0c,
1462   0x016ae30c53cbf839, 0x01fa17bd37161216, 0x018ead4e61ce8ab9,
1463   0x005482ed5f5dee46, 0x037543755bba1d7f, 0x005e5ac7e70a9d0f},
1464  {0x0117e1bb2fdcb2a2, 0x03deea36249f40c4, 0x028d09b4a6246cb7,
1465   0x03524b8855bcf756, 0x023d7d109d5ceb58, 0x0178e43e3223ef9c,
1466   0x0154536a0c6e966a, 0x037964d1286ee9fe, 0x0199bcd90e125055},
1467  {1, 0, 0, 0, 0, 0, 0, 0, 0}}
1468 };
1469 
1470 /*
1471  * select_point selects the |idx|th point from a precomputation table and
1472  * copies it to out.
1473  */
1474  /* pre_comp below is of the size provided in |size| */
1475 static void select_point(const limb idx, unsigned int size,
1476                          const felem pre_comp[][3], felem out[3])
1477 {
1478     unsigned i, j;
1479     limb *outlimbs = &out[0][0];
1480 
1481     memset(out, 0, sizeof(*out) * 3);
1482 
1483     for (i = 0; i < size; i++) {
1484         const limb *inlimbs = &pre_comp[i][0][0];
1485         limb mask = i ^ idx;
1486         mask |= mask >> 4;
1487         mask |= mask >> 2;
1488         mask |= mask >> 1;
1489         mask &= 1;
1490         mask--;
1491         for (j = 0; j < NLIMBS * 3; j++)
1492             outlimbs[j] |= inlimbs[j] & mask;
1493     }
1494 }
1495 
1496 /* get_bit returns the |i|th bit in |in| */
1497 static char get_bit(const felem_bytearray in, int i)
1498 {
1499     if (i < 0)
1500         return 0;
1501     return (in[i >> 3] >> (i & 7)) & 1;
1502 }
1503 
1504 /*
1505  * Interleaved point multiplication using precomputed point multiples: The
1506  * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
1507  * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1508  * generator, using certain (large) precomputed multiples in g_pre_comp.
1509  * Output point (X, Y, Z) is stored in x_out, y_out, z_out
1510  */
1511 static void batch_mul(felem x_out, felem y_out, felem z_out,
1512                       const felem_bytearray scalars[],
1513                       const unsigned num_points, const u8 *g_scalar,
1514                       const int mixed, const felem pre_comp[][17][3],
1515                       const felem g_pre_comp[16][3])
1516 {
1517     int i, skip;
1518     unsigned num, gen_mul = (g_scalar != NULL);
1519     felem nq[3], tmp[4];
1520     limb bits;
1521     u8 sign, digit;
1522 
1523     /* set nq to the point at infinity */
1524     memset(nq, 0, sizeof(nq));
1525 
1526     /*
1527      * Loop over all scalars msb-to-lsb, interleaving additions of multiples
1528      * of the generator (last quarter of rounds) and additions of other
1529      * points multiples (every 5th round).
1530      */
1531     skip = 1;                   /* save two point operations in the first
1532                                  * round */
1533     for (i = (num_points ? 520 : 130); i >= 0; --i) {
1534         /* double */
1535         if (!skip)
1536             point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1537 
1538         /* add multiples of the generator */
1539         if (gen_mul && (i <= 130)) {
1540             bits = get_bit(g_scalar, i + 390) << 3;
1541             if (i < 130) {
1542                 bits |= get_bit(g_scalar, i + 260) << 2;
1543                 bits |= get_bit(g_scalar, i + 130) << 1;
1544                 bits |= get_bit(g_scalar, i);
1545             }
1546             /* select the point to add, in constant time */
1547             select_point(bits, 16, g_pre_comp, tmp);
1548             if (!skip) {
1549                 /* The 1 argument below is for "mixed" */
1550                 point_add(nq[0], nq[1], nq[2],
1551                           nq[0], nq[1], nq[2], 1, tmp[0], tmp[1], tmp[2]);
1552             } else {
1553                 memcpy(nq, tmp, 3 * sizeof(felem));
1554                 skip = 0;
1555             }
1556         }
1557 
1558         /* do other additions every 5 doublings */
1559         if (num_points && (i % 5 == 0)) {
1560             /* loop over all scalars */
1561             for (num = 0; num < num_points; ++num) {
1562                 bits = get_bit(scalars[num], i + 4) << 5;
1563                 bits |= get_bit(scalars[num], i + 3) << 4;
1564                 bits |= get_bit(scalars[num], i + 2) << 3;
1565                 bits |= get_bit(scalars[num], i + 1) << 2;
1566                 bits |= get_bit(scalars[num], i) << 1;
1567                 bits |= get_bit(scalars[num], i - 1);
1568                 ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1569 
1570                 /*
1571                  * select the point to add or subtract, in constant time
1572                  */
1573                 select_point(digit, 17, pre_comp[num], tmp);
1574                 felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative
1575                                             * point */
1576                 copy_conditional(tmp[1], tmp[3], (-(limb) sign));
1577 
1578                 if (!skip) {
1579                     point_add(nq[0], nq[1], nq[2],
1580                               nq[0], nq[1], nq[2],
1581                               mixed, tmp[0], tmp[1], tmp[2]);
1582                 } else {
1583                     memcpy(nq, tmp, 3 * sizeof(felem));
1584                     skip = 0;
1585                 }
1586             }
1587         }
1588     }
1589     felem_assign(x_out, nq[0]);
1590     felem_assign(y_out, nq[1]);
1591     felem_assign(z_out, nq[2]);
1592 }
1593 
1594 /* Precomputation for the group generator. */
1595 struct nistp521_pre_comp_st {
1596     felem g_pre_comp[16][3];
1597     CRYPTO_REF_COUNT references;
1598     CRYPTO_RWLOCK *lock;
1599 };
1600 
1601 const EC_METHOD *EC_GFp_nistp521_method(void)
1602 {
1603     static const EC_METHOD ret = {
1604         EC_FLAGS_DEFAULT_OCT,
1605         NID_X9_62_prime_field,
1606         ec_GFp_nistp521_group_init,
1607         ec_GFp_simple_group_finish,
1608         ec_GFp_simple_group_clear_finish,
1609         ec_GFp_nist_group_copy,
1610         ec_GFp_nistp521_group_set_curve,
1611         ec_GFp_simple_group_get_curve,
1612         ec_GFp_simple_group_get_degree,
1613         ec_group_simple_order_bits,
1614         ec_GFp_simple_group_check_discriminant,
1615         ec_GFp_simple_point_init,
1616         ec_GFp_simple_point_finish,
1617         ec_GFp_simple_point_clear_finish,
1618         ec_GFp_simple_point_copy,
1619         ec_GFp_simple_point_set_to_infinity,
1620         ec_GFp_simple_set_Jprojective_coordinates_GFp,
1621         ec_GFp_simple_get_Jprojective_coordinates_GFp,
1622         ec_GFp_simple_point_set_affine_coordinates,
1623         ec_GFp_nistp521_point_get_affine_coordinates,
1624         0 /* point_set_compressed_coordinates */ ,
1625         0 /* point2oct */ ,
1626         0 /* oct2point */ ,
1627         ec_GFp_simple_add,
1628         ec_GFp_simple_dbl,
1629         ec_GFp_simple_invert,
1630         ec_GFp_simple_is_at_infinity,
1631         ec_GFp_simple_is_on_curve,
1632         ec_GFp_simple_cmp,
1633         ec_GFp_simple_make_affine,
1634         ec_GFp_simple_points_make_affine,
1635         ec_GFp_nistp521_points_mul,
1636         ec_GFp_nistp521_precompute_mult,
1637         ec_GFp_nistp521_have_precompute_mult,
1638         ec_GFp_nist_field_mul,
1639         ec_GFp_nist_field_sqr,
1640         0 /* field_div */ ,
1641         ec_GFp_simple_field_inv,
1642         0 /* field_encode */ ,
1643         0 /* field_decode */ ,
1644         0,                      /* field_set_to_one */
1645         ec_key_simple_priv2oct,
1646         ec_key_simple_oct2priv,
1647         0, /* set private */
1648         ec_key_simple_generate_key,
1649         ec_key_simple_check_key,
1650         ec_key_simple_generate_public_key,
1651         0, /* keycopy */
1652         0, /* keyfinish */
1653         ecdh_simple_compute_key,
1654         0, /* field_inverse_mod_ord */
1655         0, /* blind_coordinates */
1656         0, /* ladder_pre */
1657         0, /* ladder_step */
1658         0  /* ladder_post */
1659     };
1660 
1661     return &ret;
1662 }
1663 
1664 /******************************************************************************/
1665 /*
1666  * FUNCTIONS TO MANAGE PRECOMPUTATION
1667  */
1668 
1669 static NISTP521_PRE_COMP *nistp521_pre_comp_new(void)
1670 {
1671     NISTP521_PRE_COMP *ret = OPENSSL_zalloc(sizeof(*ret));
1672 
1673     if (ret == NULL) {
1674         ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1675         return ret;
1676     }
1677 
1678     ret->references = 1;
1679 
1680     ret->lock = CRYPTO_THREAD_lock_new();
1681     if (ret->lock == NULL) {
1682         ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1683         OPENSSL_free(ret);
1684         return NULL;
1685     }
1686     return ret;
1687 }
1688 
1689 NISTP521_PRE_COMP *EC_nistp521_pre_comp_dup(NISTP521_PRE_COMP *p)
1690 {
1691     int i;
1692     if (p != NULL)
1693         CRYPTO_UP_REF(&p->references, &i, p->lock);
1694     return p;
1695 }
1696 
1697 void EC_nistp521_pre_comp_free(NISTP521_PRE_COMP *p)
1698 {
1699     int i;
1700 
1701     if (p == NULL)
1702         return;
1703 
1704     CRYPTO_DOWN_REF(&p->references, &i, p->lock);
1705     REF_PRINT_COUNT("EC_nistp521", x);
1706     if (i > 0)
1707         return;
1708     REF_ASSERT_ISNT(i < 0);
1709 
1710     CRYPTO_THREAD_lock_free(p->lock);
1711     OPENSSL_free(p);
1712 }
1713 
1714 /******************************************************************************/
1715 /*
1716  * OPENSSL EC_METHOD FUNCTIONS
1717  */
1718 
1719 int ec_GFp_nistp521_group_init(EC_GROUP *group)
1720 {
1721     int ret;
1722     ret = ec_GFp_simple_group_init(group);
1723     group->a_is_minus3 = 1;
1724     return ret;
1725 }
1726 
1727 int ec_GFp_nistp521_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1728                                     const BIGNUM *a, const BIGNUM *b,
1729                                     BN_CTX *ctx)
1730 {
1731     int ret = 0;
1732     BN_CTX *new_ctx = NULL;
1733     BIGNUM *curve_p, *curve_a, *curve_b;
1734 
1735     if (ctx == NULL)
1736         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
1737             return 0;
1738     BN_CTX_start(ctx);
1739     curve_p = BN_CTX_get(ctx);
1740     curve_a = BN_CTX_get(ctx);
1741     curve_b = BN_CTX_get(ctx);
1742     if (curve_b == NULL)
1743         goto err;
1744     BN_bin2bn(nistp521_curve_params[0], sizeof(felem_bytearray), curve_p);
1745     BN_bin2bn(nistp521_curve_params[1], sizeof(felem_bytearray), curve_a);
1746     BN_bin2bn(nistp521_curve_params[2], sizeof(felem_bytearray), curve_b);
1747     if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) || (BN_cmp(curve_b, b))) {
1748         ECerr(EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE,
1749               EC_R_WRONG_CURVE_PARAMETERS);
1750         goto err;
1751     }
1752     group->field_mod_func = BN_nist_mod_521;
1753     ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1754  err:
1755     BN_CTX_end(ctx);
1756     BN_CTX_free(new_ctx);
1757     return ret;
1758 }
1759 
1760 /*
1761  * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1762  * (X/Z^2, Y/Z^3)
1763  */
1764 int ec_GFp_nistp521_point_get_affine_coordinates(const EC_GROUP *group,
1765                                                  const EC_POINT *point,
1766                                                  BIGNUM *x, BIGNUM *y,
1767                                                  BN_CTX *ctx)
1768 {
1769     felem z1, z2, x_in, y_in, x_out, y_out;
1770     largefelem tmp;
1771 
1772     if (EC_POINT_is_at_infinity(group, point)) {
1773         ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1774               EC_R_POINT_AT_INFINITY);
1775         return 0;
1776     }
1777     if ((!BN_to_felem(x_in, point->X)) || (!BN_to_felem(y_in, point->Y)) ||
1778         (!BN_to_felem(z1, point->Z)))
1779         return 0;
1780     felem_inv(z2, z1);
1781     felem_square(tmp, z2);
1782     felem_reduce(z1, tmp);
1783     felem_mul(tmp, x_in, z1);
1784     felem_reduce(x_in, tmp);
1785     felem_contract(x_out, x_in);
1786     if (x != NULL) {
1787         if (!felem_to_BN(x, x_out)) {
1788             ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1789                   ERR_R_BN_LIB);
1790             return 0;
1791         }
1792     }
1793     felem_mul(tmp, z1, z2);
1794     felem_reduce(z1, tmp);
1795     felem_mul(tmp, y_in, z1);
1796     felem_reduce(y_in, tmp);
1797     felem_contract(y_out, y_in);
1798     if (y != NULL) {
1799         if (!felem_to_BN(y, y_out)) {
1800             ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1801                   ERR_R_BN_LIB);
1802             return 0;
1803         }
1804     }
1805     return 1;
1806 }
1807 
1808 /* points below is of size |num|, and tmp_felems is of size |num+1/ */
1809 static void make_points_affine(size_t num, felem points[][3],
1810                                felem tmp_felems[])
1811 {
1812     /*
1813      * Runs in constant time, unless an input is the point at infinity (which
1814      * normally shouldn't happen).
1815      */
1816     ec_GFp_nistp_points_make_affine_internal(num,
1817                                              points,
1818                                              sizeof(felem),
1819                                              tmp_felems,
1820                                              (void (*)(void *))felem_one,
1821                                              felem_is_zero_int,
1822                                              (void (*)(void *, const void *))
1823                                              felem_assign,
1824                                              (void (*)(void *, const void *))
1825                                              felem_square_reduce, (void (*)
1826                                                                    (void *,
1827                                                                     const void
1828                                                                     *,
1829                                                                     const void
1830                                                                     *))
1831                                              felem_mul_reduce,
1832                                              (void (*)(void *, const void *))
1833                                              felem_inv,
1834                                              (void (*)(void *, const void *))
1835                                              felem_contract);
1836 }
1837 
1838 /*
1839  * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
1840  * values Result is stored in r (r can equal one of the inputs).
1841  */
1842 int ec_GFp_nistp521_points_mul(const EC_GROUP *group, EC_POINT *r,
1843                                const BIGNUM *scalar, size_t num,
1844                                const EC_POINT *points[],
1845                                const BIGNUM *scalars[], BN_CTX *ctx)
1846 {
1847     int ret = 0;
1848     int j;
1849     int mixed = 0;
1850     BIGNUM *x, *y, *z, *tmp_scalar;
1851     felem_bytearray g_secret;
1852     felem_bytearray *secrets = NULL;
1853     felem (*pre_comp)[17][3] = NULL;
1854     felem *tmp_felems = NULL;
1855     unsigned i;
1856     int num_bytes;
1857     int have_pre_comp = 0;
1858     size_t num_points = num;
1859     felem x_in, y_in, z_in, x_out, y_out, z_out;
1860     NISTP521_PRE_COMP *pre = NULL;
1861     felem(*g_pre_comp)[3] = NULL;
1862     EC_POINT *generator = NULL;
1863     const EC_POINT *p = NULL;
1864     const BIGNUM *p_scalar = NULL;
1865 
1866     BN_CTX_start(ctx);
1867     x = BN_CTX_get(ctx);
1868     y = BN_CTX_get(ctx);
1869     z = BN_CTX_get(ctx);
1870     tmp_scalar = BN_CTX_get(ctx);
1871     if (tmp_scalar == NULL)
1872         goto err;
1873 
1874     if (scalar != NULL) {
1875         pre = group->pre_comp.nistp521;
1876         if (pre)
1877             /* we have precomputation, try to use it */
1878             g_pre_comp = &pre->g_pre_comp[0];
1879         else
1880             /* try to use the standard precomputation */
1881             g_pre_comp = (felem(*)[3]) gmul;
1882         generator = EC_POINT_new(group);
1883         if (generator == NULL)
1884             goto err;
1885         /* get the generator from precomputation */
1886         if (!felem_to_BN(x, g_pre_comp[1][0]) ||
1887             !felem_to_BN(y, g_pre_comp[1][1]) ||
1888             !felem_to_BN(z, g_pre_comp[1][2])) {
1889             ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1890             goto err;
1891         }
1892         if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
1893                                                       generator, x, y, z,
1894                                                       ctx))
1895             goto err;
1896         if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1897             /* precomputation matches generator */
1898             have_pre_comp = 1;
1899         else
1900             /*
1901              * we don't have valid precomputation: treat the generator as a
1902              * random point
1903              */
1904             num_points++;
1905     }
1906 
1907     if (num_points > 0) {
1908         if (num_points >= 2) {
1909             /*
1910              * unless we precompute multiples for just one point, converting
1911              * those into affine form is time well spent
1912              */
1913             mixed = 1;
1914         }
1915         secrets = OPENSSL_zalloc(sizeof(*secrets) * num_points);
1916         pre_comp = OPENSSL_zalloc(sizeof(*pre_comp) * num_points);
1917         if (mixed)
1918             tmp_felems =
1919                 OPENSSL_malloc(sizeof(*tmp_felems) * (num_points * 17 + 1));
1920         if ((secrets == NULL) || (pre_comp == NULL)
1921             || (mixed && (tmp_felems == NULL))) {
1922             ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_MALLOC_FAILURE);
1923             goto err;
1924         }
1925 
1926         /*
1927          * we treat NULL scalars as 0, and NULL points as points at infinity,
1928          * i.e., they contribute nothing to the linear combination
1929          */
1930         for (i = 0; i < num_points; ++i) {
1931             if (i == num) {
1932                 /*
1933                  * we didn't have a valid precomputation, so we pick the
1934                  * generator
1935                  */
1936                 p = EC_GROUP_get0_generator(group);
1937                 p_scalar = scalar;
1938             } else {
1939                 /* the i^th point */
1940                 p = points[i];
1941                 p_scalar = scalars[i];
1942             }
1943             if ((p_scalar != NULL) && (p != NULL)) {
1944                 /* reduce scalar to 0 <= scalar < 2^521 */
1945                 if ((BN_num_bits(p_scalar) > 521)
1946                     || (BN_is_negative(p_scalar))) {
1947                     /*
1948                      * this is an unusual input, and we don't guarantee
1949                      * constant-timeness
1950                      */
1951                     if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) {
1952                         ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1953                         goto err;
1954                     }
1955                     num_bytes = BN_bn2lebinpad(tmp_scalar,
1956                                                secrets[i], sizeof(secrets[i]));
1957                 } else {
1958                     num_bytes = BN_bn2lebinpad(p_scalar,
1959                                                secrets[i], sizeof(secrets[i]));
1960                 }
1961                 if (num_bytes < 0) {
1962                     ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1963                     goto err;
1964                 }
1965                 /* precompute multiples */
1966                 if ((!BN_to_felem(x_out, p->X)) ||
1967                     (!BN_to_felem(y_out, p->Y)) ||
1968                     (!BN_to_felem(z_out, p->Z)))
1969                     goto err;
1970                 memcpy(pre_comp[i][1][0], x_out, sizeof(felem));
1971                 memcpy(pre_comp[i][1][1], y_out, sizeof(felem));
1972                 memcpy(pre_comp[i][1][2], z_out, sizeof(felem));
1973                 for (j = 2; j <= 16; ++j) {
1974                     if (j & 1) {
1975                         point_add(pre_comp[i][j][0], pre_comp[i][j][1],
1976                                   pre_comp[i][j][2], pre_comp[i][1][0],
1977                                   pre_comp[i][1][1], pre_comp[i][1][2], 0,
1978                                   pre_comp[i][j - 1][0],
1979                                   pre_comp[i][j - 1][1],
1980                                   pre_comp[i][j - 1][2]);
1981                     } else {
1982                         point_double(pre_comp[i][j][0], pre_comp[i][j][1],
1983                                      pre_comp[i][j][2], pre_comp[i][j / 2][0],
1984                                      pre_comp[i][j / 2][1],
1985                                      pre_comp[i][j / 2][2]);
1986                     }
1987                 }
1988             }
1989         }
1990         if (mixed)
1991             make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
1992     }
1993 
1994     /* the scalar for the generator */
1995     if ((scalar != NULL) && (have_pre_comp)) {
1996         memset(g_secret, 0, sizeof(g_secret));
1997         /* reduce scalar to 0 <= scalar < 2^521 */
1998         if ((BN_num_bits(scalar) > 521) || (BN_is_negative(scalar))) {
1999             /*
2000              * this is an unusual input, and we don't guarantee
2001              * constant-timeness
2002              */
2003             if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) {
2004                 ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
2005                 goto err;
2006             }
2007             num_bytes = BN_bn2lebinpad(tmp_scalar, g_secret, sizeof(g_secret));
2008         } else {
2009             num_bytes = BN_bn2lebinpad(scalar, g_secret, sizeof(g_secret));
2010         }
2011         /* do the multiplication with generator precomputation */
2012         batch_mul(x_out, y_out, z_out,
2013                   (const felem_bytearray(*))secrets, num_points,
2014                   g_secret,
2015                   mixed, (const felem(*)[17][3])pre_comp,
2016                   (const felem(*)[3])g_pre_comp);
2017     } else {
2018         /* do the multiplication without generator precomputation */
2019         batch_mul(x_out, y_out, z_out,
2020                   (const felem_bytearray(*))secrets, num_points,
2021                   NULL, mixed, (const felem(*)[17][3])pre_comp, NULL);
2022     }
2023     /* reduce the output to its unique minimal representation */
2024     felem_contract(x_in, x_out);
2025     felem_contract(y_in, y_out);
2026     felem_contract(z_in, z_out);
2027     if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
2028         (!felem_to_BN(z, z_in))) {
2029         ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
2030         goto err;
2031     }
2032     ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
2033 
2034  err:
2035     BN_CTX_end(ctx);
2036     EC_POINT_free(generator);
2037     OPENSSL_free(secrets);
2038     OPENSSL_free(pre_comp);
2039     OPENSSL_free(tmp_felems);
2040     return ret;
2041 }
2042 
2043 int ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
2044 {
2045     int ret = 0;
2046     NISTP521_PRE_COMP *pre = NULL;
2047     int i, j;
2048     BN_CTX *new_ctx = NULL;
2049     BIGNUM *x, *y;
2050     EC_POINT *generator = NULL;
2051     felem tmp_felems[16];
2052 
2053     /* throw away old precomputation */
2054     EC_pre_comp_free(group);
2055     if (ctx == NULL)
2056         if ((ctx = new_ctx = BN_CTX_new()) == NULL)
2057             return 0;
2058     BN_CTX_start(ctx);
2059     x = BN_CTX_get(ctx);
2060     y = BN_CTX_get(ctx);
2061     if (y == NULL)
2062         goto err;
2063     /* get the generator */
2064     if (group->generator == NULL)
2065         goto err;
2066     generator = EC_POINT_new(group);
2067     if (generator == NULL)
2068         goto err;
2069     BN_bin2bn(nistp521_curve_params[3], sizeof(felem_bytearray), x);
2070     BN_bin2bn(nistp521_curve_params[4], sizeof(felem_bytearray), y);
2071     if (!EC_POINT_set_affine_coordinates(group, generator, x, y, ctx))
2072         goto err;
2073     if ((pre = nistp521_pre_comp_new()) == NULL)
2074         goto err;
2075     /*
2076      * if the generator is the standard one, use built-in precomputation
2077      */
2078     if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
2079         memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
2080         goto done;
2081     }
2082     if ((!BN_to_felem(pre->g_pre_comp[1][0], group->generator->X)) ||
2083         (!BN_to_felem(pre->g_pre_comp[1][1], group->generator->Y)) ||
2084         (!BN_to_felem(pre->g_pre_comp[1][2], group->generator->Z)))
2085         goto err;
2086     /* compute 2^130*G, 2^260*G, 2^390*G */
2087     for (i = 1; i <= 4; i <<= 1) {
2088         point_double(pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1],
2089                      pre->g_pre_comp[2 * i][2], pre->g_pre_comp[i][0],
2090                      pre->g_pre_comp[i][1], pre->g_pre_comp[i][2]);
2091         for (j = 0; j < 129; ++j) {
2092             point_double(pre->g_pre_comp[2 * i][0],
2093                          pre->g_pre_comp[2 * i][1],
2094                          pre->g_pre_comp[2 * i][2],
2095                          pre->g_pre_comp[2 * i][0],
2096                          pre->g_pre_comp[2 * i][1],
2097                          pre->g_pre_comp[2 * i][2]);
2098         }
2099     }
2100     /* g_pre_comp[0] is the point at infinity */
2101     memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0]));
2102     /* the remaining multiples */
2103     /* 2^130*G + 2^260*G */
2104     point_add(pre->g_pre_comp[6][0], pre->g_pre_comp[6][1],
2105               pre->g_pre_comp[6][2], pre->g_pre_comp[4][0],
2106               pre->g_pre_comp[4][1], pre->g_pre_comp[4][2],
2107               0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2108               pre->g_pre_comp[2][2]);
2109     /* 2^130*G + 2^390*G */
2110     point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1],
2111               pre->g_pre_comp[10][2], pre->g_pre_comp[8][0],
2112               pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2113               0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2114               pre->g_pre_comp[2][2]);
2115     /* 2^260*G + 2^390*G */
2116     point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1],
2117               pre->g_pre_comp[12][2], pre->g_pre_comp[8][0],
2118               pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
2119               0, pre->g_pre_comp[4][0], pre->g_pre_comp[4][1],
2120               pre->g_pre_comp[4][2]);
2121     /* 2^130*G + 2^260*G + 2^390*G */
2122     point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1],
2123               pre->g_pre_comp[14][2], pre->g_pre_comp[12][0],
2124               pre->g_pre_comp[12][1], pre->g_pre_comp[12][2],
2125               0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
2126               pre->g_pre_comp[2][2]);
2127     for (i = 1; i < 8; ++i) {
2128         /* odd multiples: add G */
2129         point_add(pre->g_pre_comp[2 * i + 1][0],
2130                   pre->g_pre_comp[2 * i + 1][1],
2131                   pre->g_pre_comp[2 * i + 1][2], pre->g_pre_comp[2 * i][0],
2132                   pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2], 0,
2133                   pre->g_pre_comp[1][0], pre->g_pre_comp[1][1],
2134                   pre->g_pre_comp[1][2]);
2135     }
2136     make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);
2137 
2138  done:
2139     SETPRECOMP(group, nistp521, pre);
2140     ret = 1;
2141     pre = NULL;
2142  err:
2143     BN_CTX_end(ctx);
2144     EC_POINT_free(generator);
2145     BN_CTX_free(new_ctx);
2146     EC_nistp521_pre_comp_free(pre);
2147     return ret;
2148 }
2149 
2150 int ec_GFp_nistp521_have_precompute_mult(const EC_GROUP *group)
2151 {
2152     return HAVEPRECOMP(group, nistp521);
2153 }
2154 
2155 #endif
2156