xref: /freebsd/crypto/openssl/crypto/ec/ecp_nistp521.c (revision fcb560670601b2a4d87bb31d7531c8dcc37ee71b)
1 /* crypto/ec/ecp_nistp521.c */
2 /*
3  * Written by Adam Langley (Google) for the OpenSSL project
4  */
5 /* Copyright 2011 Google Inc.
6  *
7  * Licensed under the Apache License, Version 2.0 (the "License");
8  *
9  * you may not use this file except in compliance with the License.
10  * You may obtain a copy of the License at
11  *
12  *     http://www.apache.org/licenses/LICENSE-2.0
13  *
14  *  Unless required by applicable law or agreed to in writing, software
15  *  distributed under the License is distributed on an "AS IS" BASIS,
16  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17  *  See the License for the specific language governing permissions and
18  *  limitations under the License.
19  */
20 
21 /*
22  * A 64-bit implementation of the NIST P-521 elliptic curve point multiplication
23  *
24  * OpenSSL integration was taken from Emilia Kasper's work in ecp_nistp224.c.
25  * Otherwise based on Emilia's P224 work, which was inspired by my curve25519
26  * work which got its smarts from Daniel J. Bernstein's work on the same.
27  */
28 
29 #include <openssl/opensslconf.h>
30 #ifndef OPENSSL_NO_EC_NISTP_64_GCC_128
31 
32 #ifndef OPENSSL_SYS_VMS
33 #include <stdint.h>
34 #else
35 #include <inttypes.h>
36 #endif
37 
38 #include <string.h>
39 #include <openssl/err.h>
40 #include "ec_lcl.h"
41 
42 #if defined(__GNUC__) && (__GNUC__ > 3 || (__GNUC__ == 3 && __GNUC_MINOR__ >= 1))
43   /* even with gcc, the typedef won't work for 32-bit platforms */
44   typedef __uint128_t uint128_t; /* nonstandard; implemented by gcc on 64-bit platforms */
45 #else
46   #error "Need GCC 3.1 or later to define type uint128_t"
47 #endif
48 
49 typedef uint8_t u8;
50 typedef uint64_t u64;
51 typedef int64_t s64;
52 
53 /* The underlying field.
54  *
55  * P521 operates over GF(2^521-1). We can serialise an element of this field
56  * into 66 bytes where the most significant byte contains only a single bit. We
57  * call this an felem_bytearray. */
58 
59 typedef u8 felem_bytearray[66];
60 
61 /* These are the parameters of P521, taken from FIPS 186-3, section D.1.2.5.
62  * These values are big-endian. */
63 static const felem_bytearray nistp521_curve_params[5] =
64 	{
65 	{0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,  /* p */
66 	 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
67 	 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
68 	 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
69 	 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
70 	 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
71 	 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
72 	 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
73 	 0xff, 0xff},
74 	{0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,  /* a = -3 */
75 	 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
76 	 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
77 	 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
78 	 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
79 	 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
80 	 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
81 	 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff,
82 	 0xff, 0xfc},
83 	{0x00, 0x51, 0x95, 0x3e, 0xb9, 0x61, 0x8e, 0x1c,  /* b */
84 	 0x9a, 0x1f, 0x92, 0x9a, 0x21, 0xa0, 0xb6, 0x85,
85 	 0x40, 0xee, 0xa2, 0xda, 0x72, 0x5b, 0x99, 0xb3,
86 	 0x15, 0xf3, 0xb8, 0xb4, 0x89, 0x91, 0x8e, 0xf1,
87 	 0x09, 0xe1, 0x56, 0x19, 0x39, 0x51, 0xec, 0x7e,
88 	 0x93, 0x7b, 0x16, 0x52, 0xc0, 0xbd, 0x3b, 0xb1,
89 	 0xbf, 0x07, 0x35, 0x73, 0xdf, 0x88, 0x3d, 0x2c,
90 	 0x34, 0xf1, 0xef, 0x45, 0x1f, 0xd4, 0x6b, 0x50,
91 	 0x3f, 0x00},
92 	{0x00, 0xc6, 0x85, 0x8e, 0x06, 0xb7, 0x04, 0x04,  /* x */
93 	 0xe9, 0xcd, 0x9e, 0x3e, 0xcb, 0x66, 0x23, 0x95,
94 	 0xb4, 0x42, 0x9c, 0x64, 0x81, 0x39, 0x05, 0x3f,
95 	 0xb5, 0x21, 0xf8, 0x28, 0xaf, 0x60, 0x6b, 0x4d,
96 	 0x3d, 0xba, 0xa1, 0x4b, 0x5e, 0x77, 0xef, 0xe7,
97 	 0x59, 0x28, 0xfe, 0x1d, 0xc1, 0x27, 0xa2, 0xff,
98 	 0xa8, 0xde, 0x33, 0x48, 0xb3, 0xc1, 0x85, 0x6a,
99 	 0x42, 0x9b, 0xf9, 0x7e, 0x7e, 0x31, 0xc2, 0xe5,
100 	 0xbd, 0x66},
101 	{0x01, 0x18, 0x39, 0x29, 0x6a, 0x78, 0x9a, 0x3b,  /* y */
102 	 0xc0, 0x04, 0x5c, 0x8a, 0x5f, 0xb4, 0x2c, 0x7d,
103 	 0x1b, 0xd9, 0x98, 0xf5, 0x44, 0x49, 0x57, 0x9b,
104 	 0x44, 0x68, 0x17, 0xaf, 0xbd, 0x17, 0x27, 0x3e,
105 	 0x66, 0x2c, 0x97, 0xee, 0x72, 0x99, 0x5e, 0xf4,
106 	 0x26, 0x40, 0xc5, 0x50, 0xb9, 0x01, 0x3f, 0xad,
107 	 0x07, 0x61, 0x35, 0x3c, 0x70, 0x86, 0xa2, 0x72,
108 	 0xc2, 0x40, 0x88, 0xbe, 0x94, 0x76, 0x9f, 0xd1,
109 	 0x66, 0x50}
110 	};
111 
112 /* The representation of field elements.
113  * ------------------------------------
114  *
115  * We represent field elements with nine values. These values are either 64 or
116  * 128 bits and the field element represented is:
117  *   v[0]*2^0 + v[1]*2^58 + v[2]*2^116 + ... + v[8]*2^464  (mod p)
118  * Each of the nine values is called a 'limb'. Since the limbs are spaced only
119  * 58 bits apart, but are greater than 58 bits in length, the most significant
120  * bits of each limb overlap with the least significant bits of the next.
121  *
122  * A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a
123  * 'largefelem' */
124 
125 #define NLIMBS 9
126 
127 typedef uint64_t limb;
128 typedef limb felem[NLIMBS];
129 typedef uint128_t largefelem[NLIMBS];
130 
131 static const limb bottom57bits = 0x1ffffffffffffff;
132 static const limb bottom58bits = 0x3ffffffffffffff;
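/* For illustration (not part of the original file): in this representation the
 * prime p = 2^521 - 1 is canonically held as eight limbs of value 2^58 - 1
 * followed by a top limb of value 2^57 - 1, since
 *   (2^58-1)*(2^0 + 2^58 + ... + 2^406) + (2^57-1)*2^464 = 2^521 - 1.
 * This is exactly the kPrime constant defined further below. */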
133 
134 /* bin66_to_felem takes a little-endian byte array and converts it into felem
135  * form. This assumes that the CPU is little-endian. */
136 static void bin66_to_felem(felem out, const u8 in[66])
137 	{
138 	out[0] = (*((limb*) &in[0])) & bottom58bits;
139 	out[1] = (*((limb*) &in[7]) >> 2) & bottom58bits;
140 	out[2] = (*((limb*) &in[14]) >> 4) & bottom58bits;
141 	out[3] = (*((limb*) &in[21]) >> 6) & bottom58bits;
142 	out[4] = (*((limb*) &in[29])) & bottom58bits;
143 	out[5] = (*((limb*) &in[36]) >> 2) & bottom58bits;
144 	out[6] = (*((limb*) &in[43]) >> 4) & bottom58bits;
145 	out[7] = (*((limb*) &in[50]) >> 6) & bottom58bits;
146 	out[8] = (*((limb*) &in[58])) & bottom57bits;
147 	}
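/* A hedged, illustrative sketch (not in the original file): limb i covers bits
 * [58*i, 58*i + 58) of the 521-bit value, so the unrolled loads above read
 * from byte offset (58*i)/8 and shift right by (58*i)%8, e.g. 58*3 = 174 =
 * 21*8 + 6 gives &in[21] with a shift of 6.  The hypothetical helper below
 * expresses the same conversion as a generic loop (still assuming a
 * little-endian CPU); it is disabled so it does not change the build. */
#if 0
static void bin66_to_felem_generic(felem out, const u8 in[66])
	{
	unsigned i;
	for (i = 0; i < NLIMBS; i++)
		{
		unsigned bit = 58 * i;	/* starting bit of limb i */
		limb v;
		/* unaligned little-endian load of 8 bytes */
		memcpy(&v, &in[bit / 8], sizeof(v));
		out[i] = (v >> (bit % 8)) &
			(i == NLIMBS - 1 ? bottom57bits : bottom58bits);
		}
	}
#endif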
148 
149 /* felem_to_bin66 takes an felem and serialises into a little endian, 66 byte
150  * array. This assumes that the CPU is little-endian. */
151 static void felem_to_bin66(u8 out[66], const felem in)
152 	{
153 	memset(out, 0, 66);
154 	(*((limb*) &out[0])) = in[0];
155 	(*((limb*) &out[7])) |= in[1] << 2;
156 	(*((limb*) &out[14])) |= in[2] << 4;
157 	(*((limb*) &out[21])) |= in[3] << 6;
158 	(*((limb*) &out[29])) = in[4];
159 	(*((limb*) &out[36])) |= in[5] << 2;
160 	(*((limb*) &out[43])) |= in[6] << 4;
161 	(*((limb*) &out[50])) |= in[7] << 6;
162 	(*((limb*) &out[58])) = in[8];
163 	}
164 
165 /* To preserve endianness when using BN_bn2bin and BN_bin2bn */
166 static void flip_endian(u8 *out, const u8 *in, unsigned len)
167 	{
168 	unsigned i;
169 	for (i = 0; i < len; ++i)
170 		out[i] = in[len-1-i];
171 	}
172 
173 /* BN_to_felem converts an OpenSSL BIGNUM into an felem */
174 static int BN_to_felem(felem out, const BIGNUM *bn)
175 	{
176 	felem_bytearray b_in;
177 	felem_bytearray b_out;
178 	unsigned num_bytes;
179 
180 	/* BN_bn2bin eats leading zeroes */
181 	memset(b_out, 0, sizeof b_out);
182 	num_bytes = BN_num_bytes(bn);
183 	if (num_bytes > sizeof b_out)
184 		{
185 		ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
186 		return 0;
187 		}
188 	if (BN_is_negative(bn))
189 		{
190 		ECerr(EC_F_BN_TO_FELEM, EC_R_BIGNUM_OUT_OF_RANGE);
191 		return 0;
192 		}
193 	num_bytes = BN_bn2bin(bn, b_in);
194 	flip_endian(b_out, b_in, num_bytes);
195 	bin66_to_felem(out, b_out);
196 	return 1;
197 	}
198 
199 /* felem_to_BN converts an felem into an OpenSSL BIGNUM */
200 static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
201 	{
202 	felem_bytearray b_in, b_out;
203 	felem_to_bin66(b_in, in);
204 	flip_endian(b_out, b_in, sizeof b_out);
205 	return BN_bin2bn(b_out, sizeof b_out, out);
206 	}
207 
208 
209 /* Field operations
210  * ---------------- */
211 
212 static void felem_one(felem out)
213 	{
214 	out[0] = 1;
215 	out[1] = 0;
216 	out[2] = 0;
217 	out[3] = 0;
218 	out[4] = 0;
219 	out[5] = 0;
220 	out[6] = 0;
221 	out[7] = 0;
222 	out[8] = 0;
223 	}
224 
225 static void felem_assign(felem out, const felem in)
226 	{
227 	out[0] = in[0];
228 	out[1] = in[1];
229 	out[2] = in[2];
230 	out[3] = in[3];
231 	out[4] = in[4];
232 	out[5] = in[5];
233 	out[6] = in[6];
234 	out[7] = in[7];
235 	out[8] = in[8];
236 	}
237 
238 /* felem_sum64 sets out = out + in. */
239 static void felem_sum64(felem out, const felem in)
240 	{
241 	out[0] += in[0];
242 	out[1] += in[1];
243 	out[2] += in[2];
244 	out[3] += in[3];
245 	out[4] += in[4];
246 	out[5] += in[5];
247 	out[6] += in[6];
248 	out[7] += in[7];
249 	out[8] += in[8];
250 	}
251 
252 /* felem_scalar sets out = in * scalar */
253 static void felem_scalar(felem out, const felem in, limb scalar)
254 	{
255 	out[0] = in[0] * scalar;
256 	out[1] = in[1] * scalar;
257 	out[2] = in[2] * scalar;
258 	out[3] = in[3] * scalar;
259 	out[4] = in[4] * scalar;
260 	out[5] = in[5] * scalar;
261 	out[6] = in[6] * scalar;
262 	out[7] = in[7] * scalar;
263 	out[8] = in[8] * scalar;
264 	}
265 
266 /* felem_scalar64 sets out = out * scalar */
267 static void felem_scalar64(felem out, limb scalar)
268 	{
269 	out[0] *= scalar;
270 	out[1] *= scalar;
271 	out[2] *= scalar;
272 	out[3] *= scalar;
273 	out[4] *= scalar;
274 	out[5] *= scalar;
275 	out[6] *= scalar;
276 	out[7] *= scalar;
277 	out[8] *= scalar;
278 	}
279 
280 /* felem_scalar128 sets out = out * scalar */
281 static void felem_scalar128(largefelem out, limb scalar)
282 	{
283 	out[0] *= scalar;
284 	out[1] *= scalar;
285 	out[2] *= scalar;
286 	out[3] *= scalar;
287 	out[4] *= scalar;
288 	out[5] *= scalar;
289 	out[6] *= scalar;
290 	out[7] *= scalar;
291 	out[8] *= scalar;
292 	}
293 
294 /* felem_neg sets |out| to |-in|
295  * On entry:
296  *   in[i] < 2^59 + 2^14
297  * On exit:
298  *   out[i] < 2^62
299  */
300 static void felem_neg(felem out, const felem in)
301 	{
302 	/* In order to prevent underflow, we subtract from 0 mod p. */
303 	static const limb two62m3 = (((limb)1) << 62) - (((limb)1) << 5);
304 	static const limb two62m2 = (((limb)1) << 62) - (((limb)1) << 4);
305 
306 	out[0] = two62m3 - in[0];
307 	out[1] = two62m2 - in[1];
308 	out[2] = two62m2 - in[2];
309 	out[3] = two62m2 - in[3];
310 	out[4] = two62m2 - in[4];
311 	out[5] = two62m2 - in[5];
312 	out[6] = two62m2 - in[6];
313 	out[7] = two62m2 - in[7];
314 	out[8] = two62m2 - in[8];
315 	}
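/* A worked check, for illustration (not in the original source): the constants
 * used above (and in felem_diff64 below) really do represent 0 mod p.  Their
 * value is
 *   (2^62 - 2^5) + (2^62 - 2^4)*(2^58 + 2^116 + ... + 2^464).
 * Each positive term 2^(58*i + 62), i = 0..7, cancels the negative term
 * 2^(58*(i+1) + 4) of the next limb, and the top term 2^(58*8 + 62) = 2^526 =
 * 2^5 * 2^521 == 2^5 (mod p) cancels the remaining -2^5, so the whole sum is
 * congruent to 0.  Since each constant exceeds the entry bound on in[i],
 * the subtractions cannot underflow. */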
316 
317 /* felem_diff64 subtracts |in| from |out|
318  * On entry:
319  *   in[i] < 2^59 + 2^14
320  * On exit:
321  *   out[i] < out[i] + 2^62
322  */
323 static void felem_diff64(felem out, const felem in)
324 	{
325 	/* In order to prevent underflow, we add 0 mod p before subtracting. */
326 	static const limb two62m3 = (((limb)1) << 62) - (((limb)1) << 5);
327 	static const limb two62m2 = (((limb)1) << 62) - (((limb)1) << 4);
328 
329 	out[0] += two62m3 - in[0];
330 	out[1] += two62m2 - in[1];
331 	out[2] += two62m2 - in[2];
332 	out[3] += two62m2 - in[3];
333 	out[4] += two62m2 - in[4];
334 	out[5] += two62m2 - in[5];
335 	out[6] += two62m2 - in[6];
336 	out[7] += two62m2 - in[7];
337 	out[8] += two62m2 - in[8];
338 	}
339 
340 /* felem_diff_128_64 subtracts |in| from |out|
341  * On entry:
342  *   in[i] < 2^62 + 2^17
343  * On exit:
344  *   out[i] < out[i] + 2^63
345  */
346 static void felem_diff_128_64(largefelem out, const felem in)
347 	{
348 	/* In order to prevent underflow, we add 0 mod p before subtracting. */
349 	static const limb two63m6 = (((limb)1) << 63) - (((limb)1) << 6);
350 	static const limb two63m5 = (((limb)1) << 63) - (((limb)1) << 5);
351 
352 	out[0] += two63m6 - in[0];
353 	out[1] += two63m5 - in[1];
354 	out[2] += two63m5 - in[2];
355 	out[3] += two63m5 - in[3];
356 	out[4] += two63m5 - in[4];
357 	out[5] += two63m5 - in[5];
358 	out[6] += two63m5 - in[6];
359 	out[7] += two63m5 - in[7];
360 	out[8] += two63m5 - in[8];
361 	}
362 
363 /* felem_diff128 subtracts |in| from |out|
364  * On entry:
365  *   in[i] < 2^126
366  * On exit:
367  *   out[i] < out[i] + 2^127 - 2^69
368  */
369 static void felem_diff128(largefelem out, const largefelem in)
370 	{
371 	/* In order to prevent underflow, we add 0 mod p before subtracting. */
372 	static const uint128_t two127m70 = (((uint128_t)1) << 127) - (((uint128_t)1) << 70);
373 	static const uint128_t two127m69 = (((uint128_t)1) << 127) - (((uint128_t)1) << 69);
374 
375 	out[0] += (two127m70 - in[0]);
376 	out[1] += (two127m69 - in[1]);
377 	out[2] += (two127m69 - in[2]);
378 	out[3] += (two127m69 - in[3]);
379 	out[4] += (two127m69 - in[4]);
380 	out[5] += (two127m69 - in[5]);
381 	out[6] += (two127m69 - in[6]);
382 	out[7] += (two127m69 - in[7]);
383 	out[8] += (two127m69 - in[8]);
384 	}
385 
386 /* felem_square sets |out| = |in|^2
387  * On entry:
388  *   in[i] < 2^62
389  * On exit:
390  *   out[i] < 17 * max(in[i]) * max(in[i])
391  */
392 static void felem_square(largefelem out, const felem in)
393 	{
394 	felem inx2, inx4;
395 	felem_scalar(inx2, in, 2);
396 	felem_scalar(inx4, in, 4);
397 
398 	/* We have many cases where we want to do
399 	 *   in[x] * in[y] +
400 	 *   in[y] * in[x]
401 	 * This is obviously just
402 	 *   2 * in[x] * in[y]
403 	 * However, rather than do the doubling on the 128 bit result, we
404 	 * double one of the inputs to the multiplication by reading from
405 	 * |inx2| */
406 
407 	out[0] = ((uint128_t) in[0]) * in[0];
408 	out[1] = ((uint128_t) in[0]) * inx2[1];
409 	out[2] = ((uint128_t) in[0]) * inx2[2] +
410 		 ((uint128_t) in[1]) * in[1];
411 	out[3] = ((uint128_t) in[0]) * inx2[3] +
412 		 ((uint128_t) in[1]) * inx2[2];
413 	out[4] = ((uint128_t) in[0]) * inx2[4] +
414 		 ((uint128_t) in[1]) * inx2[3] +
415 		 ((uint128_t) in[2]) * in[2];
416 	out[5] = ((uint128_t) in[0]) * inx2[5] +
417 		 ((uint128_t) in[1]) * inx2[4] +
418 		 ((uint128_t) in[2]) * inx2[3];
419 	out[6] = ((uint128_t) in[0]) * inx2[6] +
420 		 ((uint128_t) in[1]) * inx2[5] +
421 		 ((uint128_t) in[2]) * inx2[4] +
422 		 ((uint128_t) in[3]) * in[3];
423 	out[7] = ((uint128_t) in[0]) * inx2[7] +
424 		 ((uint128_t) in[1]) * inx2[6] +
425 		 ((uint128_t) in[2]) * inx2[5] +
426 		 ((uint128_t) in[3]) * inx2[4];
427 	out[8] = ((uint128_t) in[0]) * inx2[8] +
428 		 ((uint128_t) in[1]) * inx2[7] +
429 		 ((uint128_t) in[2]) * inx2[6] +
430 		 ((uint128_t) in[3]) * inx2[5] +
431 		 ((uint128_t) in[4]) * in[4];
432 
433 	/* The remaining limbs fall above 2^521, with the first falling at
434 	 * 2^522. They correspond to locations one bit up from the limbs
435 	 * produced above so we would have to multiply by two to align them.
436 	 * Again, rather than operate on the 128-bit result, we double one of
437 	 * the inputs to the multiplication. If we want to double for both this
438 	 * reason, and the reason above, then we end up multiplying by four. */
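	/* A concrete instance, for illustration (not in the original source):
	 * the cross term in[1]*in[8] has weight 2^(58+464) = 2^522, and
	 * 2^522 == 2*2^0 (mod 2^521 - 1), so it belongs in out[0] with an
	 * extra factor of two; combined with the factor of two for the
	 * symmetric in[8]*in[1] term, this is why inx4[8] appears below. */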
439 
440 	/* 9 */
441 	out[0] += ((uint128_t) in[1]) * inx4[8] +
442 		  ((uint128_t) in[2]) * inx4[7] +
443 		  ((uint128_t) in[3]) * inx4[6] +
444 		  ((uint128_t) in[4]) * inx4[5];
445 
446 	/* 10 */
447 	out[1] += ((uint128_t) in[2]) * inx4[8] +
448 		  ((uint128_t) in[3]) * inx4[7] +
449 		  ((uint128_t) in[4]) * inx4[6] +
450 		  ((uint128_t) in[5]) * inx2[5];
451 
452 	/* 11 */
453 	out[2] += ((uint128_t) in[3]) * inx4[8] +
454 		  ((uint128_t) in[4]) * inx4[7] +
455 		  ((uint128_t) in[5]) * inx4[6];
456 
457 	/* 12 */
458 	out[3] += ((uint128_t) in[4]) * inx4[8] +
459 		  ((uint128_t) in[5]) * inx4[7] +
460 		  ((uint128_t) in[6]) * inx2[6];
461 
462 	/* 13 */
463 	out[4] += ((uint128_t) in[5]) * inx4[8] +
464 		  ((uint128_t) in[6]) * inx4[7];
465 
466 	/* 14 */
467 	out[5] += ((uint128_t) in[6]) * inx4[8] +
468 		  ((uint128_t) in[7]) * inx2[7];
469 
470 	/* 15 */
471 	out[6] += ((uint128_t) in[7]) * inx4[8];
472 
473 	/* 16 */
474 	out[7] += ((uint128_t) in[8]) * inx2[8];
475 	}
476 
477 /* felem_mul sets |out| = |in1| * |in2|
478  * On entry:
479  *   in1[i] < 2^64
480  *   in2[i] < 2^63
481  * On exit:
482  *   out[i] < 17 * max(in1[i]) * max(in2[i])
483  */
484 static void felem_mul(largefelem out, const felem in1, const felem in2)
485 	{
486 	felem in2x2;
487 	felem_scalar(in2x2, in2, 2);
488 
489 	out[0] = ((uint128_t) in1[0]) * in2[0];
490 
491 	out[1] = ((uint128_t) in1[0]) * in2[1] +
492 	         ((uint128_t) in1[1]) * in2[0];
493 
494 	out[2] = ((uint128_t) in1[0]) * in2[2] +
495 		 ((uint128_t) in1[1]) * in2[1] +
496 	         ((uint128_t) in1[2]) * in2[0];
497 
498 	out[3] = ((uint128_t) in1[0]) * in2[3] +
499 		 ((uint128_t) in1[1]) * in2[2] +
500 		 ((uint128_t) in1[2]) * in2[1] +
501 		 ((uint128_t) in1[3]) * in2[0];
502 
503 	out[4] = ((uint128_t) in1[0]) * in2[4] +
504 		 ((uint128_t) in1[1]) * in2[3] +
505 		 ((uint128_t) in1[2]) * in2[2] +
506 		 ((uint128_t) in1[3]) * in2[1] +
507 		 ((uint128_t) in1[4]) * in2[0];
508 
509 	out[5] = ((uint128_t) in1[0]) * in2[5] +
510 		 ((uint128_t) in1[1]) * in2[4] +
511 		 ((uint128_t) in1[2]) * in2[3] +
512 		 ((uint128_t) in1[3]) * in2[2] +
513 		 ((uint128_t) in1[4]) * in2[1] +
514 		 ((uint128_t) in1[5]) * in2[0];
515 
516 	out[6] = ((uint128_t) in1[0]) * in2[6] +
517 		 ((uint128_t) in1[1]) * in2[5] +
518 		 ((uint128_t) in1[2]) * in2[4] +
519 		 ((uint128_t) in1[3]) * in2[3] +
520 		 ((uint128_t) in1[4]) * in2[2] +
521 		 ((uint128_t) in1[5]) * in2[1] +
522 		 ((uint128_t) in1[6]) * in2[0];
523 
524 	out[7] = ((uint128_t) in1[0]) * in2[7] +
525 		 ((uint128_t) in1[1]) * in2[6] +
526 		 ((uint128_t) in1[2]) * in2[5] +
527 		 ((uint128_t) in1[3]) * in2[4] +
528 		 ((uint128_t) in1[4]) * in2[3] +
529 		 ((uint128_t) in1[5]) * in2[2] +
530 		 ((uint128_t) in1[6]) * in2[1] +
531 		 ((uint128_t) in1[7]) * in2[0];
532 
533 	out[8] = ((uint128_t) in1[0]) * in2[8] +
534 		 ((uint128_t) in1[1]) * in2[7] +
535 		 ((uint128_t) in1[2]) * in2[6] +
536 		 ((uint128_t) in1[3]) * in2[5] +
537 		 ((uint128_t) in1[4]) * in2[4] +
538 		 ((uint128_t) in1[5]) * in2[3] +
539 		 ((uint128_t) in1[6]) * in2[2] +
540 		 ((uint128_t) in1[7]) * in2[1] +
541 		 ((uint128_t) in1[8]) * in2[0];
542 
543 	/* See comment in felem_square about the use of in2x2 here */
544 
545 	out[0] += ((uint128_t) in1[1]) * in2x2[8] +
546 		  ((uint128_t) in1[2]) * in2x2[7] +
547 		  ((uint128_t) in1[3]) * in2x2[6] +
548 		  ((uint128_t) in1[4]) * in2x2[5] +
549 		  ((uint128_t) in1[5]) * in2x2[4] +
550 		  ((uint128_t) in1[6]) * in2x2[3] +
551 		  ((uint128_t) in1[7]) * in2x2[2] +
552 		  ((uint128_t) in1[8]) * in2x2[1];
553 
554 	out[1] += ((uint128_t) in1[2]) * in2x2[8] +
555 		  ((uint128_t) in1[3]) * in2x2[7] +
556 		  ((uint128_t) in1[4]) * in2x2[6] +
557 		  ((uint128_t) in1[5]) * in2x2[5] +
558 		  ((uint128_t) in1[6]) * in2x2[4] +
559 		  ((uint128_t) in1[7]) * in2x2[3] +
560 		  ((uint128_t) in1[8]) * in2x2[2];
561 
562 	out[2] += ((uint128_t) in1[3]) * in2x2[8] +
563 		  ((uint128_t) in1[4]) * in2x2[7] +
564 		  ((uint128_t) in1[5]) * in2x2[6] +
565 		  ((uint128_t) in1[6]) * in2x2[5] +
566 		  ((uint128_t) in1[7]) * in2x2[4] +
567 		  ((uint128_t) in1[8]) * in2x2[3];
568 
569 	out[3] += ((uint128_t) in1[4]) * in2x2[8] +
570 		  ((uint128_t) in1[5]) * in2x2[7] +
571 		  ((uint128_t) in1[6]) * in2x2[6] +
572 		  ((uint128_t) in1[7]) * in2x2[5] +
573 		  ((uint128_t) in1[8]) * in2x2[4];
574 
575 	out[4] += ((uint128_t) in1[5]) * in2x2[8] +
576 		  ((uint128_t) in1[6]) * in2x2[7] +
577 		  ((uint128_t) in1[7]) * in2x2[6] +
578 		  ((uint128_t) in1[8]) * in2x2[5];
579 
580 	out[5] += ((uint128_t) in1[6]) * in2x2[8] +
581 		  ((uint128_t) in1[7]) * in2x2[7] +
582 		  ((uint128_t) in1[8]) * in2x2[6];
583 
584 	out[6] += ((uint128_t) in1[7]) * in2x2[8] +
585 		  ((uint128_t) in1[8]) * in2x2[7];
586 
587 	out[7] += ((uint128_t) in1[8]) * in2x2[8];
588 	}
589 
590 static const limb bottom52bits = 0xfffffffffffff;
591 
592 /* felem_reduce converts a largefelem to an felem.
593  * On entry:
594  *   in[i] < 2^128
595  * On exit:
596  *   out[i] < 2^59 + 2^14
597  */
598 static void felem_reduce(felem out, const largefelem in)
599 	{
600 	u64 overflow1, overflow2;
601 
602 	out[0] = ((limb) in[0]) & bottom58bits;
603 	out[1] = ((limb) in[1]) & bottom58bits;
604 	out[2] = ((limb) in[2]) & bottom58bits;
605 	out[3] = ((limb) in[3]) & bottom58bits;
606 	out[4] = ((limb) in[4]) & bottom58bits;
607 	out[5] = ((limb) in[5]) & bottom58bits;
608 	out[6] = ((limb) in[6]) & bottom58bits;
609 	out[7] = ((limb) in[7]) & bottom58bits;
610 	out[8] = ((limb) in[8]) & bottom58bits;
611 
612 	/* out[i] < 2^58 */
613 
614 	out[1] += ((limb) in[0]) >> 58;
615 	out[1] += (((limb) (in[0] >> 64)) & bottom52bits) << 6;
616 	/* out[1] < 2^58 + 2^6 + 2^58
617 	 *        = 2^59 + 2^6 */
618 	out[2] += ((limb) (in[0] >> 64)) >> 52;
619 
620 	out[2] += ((limb) in[1]) >> 58;
621 	out[2] += (((limb) (in[1] >> 64)) & bottom52bits) << 6;
622 	out[3] += ((limb) (in[1] >> 64)) >> 52;
623 
624 	out[3] += ((limb) in[2]) >> 58;
625 	out[3] += (((limb) (in[2] >> 64)) & bottom52bits) << 6;
626 	out[4] += ((limb) (in[2] >> 64)) >> 52;
627 
628 	out[4] += ((limb) in[3]) >> 58;
629 	out[4] += (((limb) (in[3] >> 64)) & bottom52bits) << 6;
630 	out[5] += ((limb) (in[3] >> 64)) >> 52;
631 
632 	out[5] += ((limb) in[4]) >> 58;
633 	out[5] += (((limb) (in[4] >> 64)) & bottom52bits) << 6;
634 	out[6] += ((limb) (in[4] >> 64)) >> 52;
635 
636 	out[6] += ((limb) in[5]) >> 58;
637 	out[6] += (((limb) (in[5] >> 64)) & bottom52bits) << 6;
638 	out[7] += ((limb) (in[5] >> 64)) >> 52;
639 
640 	out[7] += ((limb) in[6]) >> 58;
641 	out[7] += (((limb) (in[6] >> 64)) & bottom52bits) << 6;
642 	out[8] += ((limb) (in[6] >> 64)) >> 52;
643 
644 	out[8] += ((limb) in[7]) >> 58;
645 	out[8] += (((limb) (in[7] >> 64)) & bottom52bits) << 6;
646 	/* out[x > 1] < 2^58 + 2^6 + 2^58 + 2^12
647 	 *            < 2^59 + 2^13 */
648 	overflow1 = ((limb) (in[7] >> 64)) >> 52;
649 
650 	overflow1 += ((limb) in[8]) >> 58;
651 	overflow1 += (((limb) (in[8] >> 64)) & bottom52bits) << 6;
652 	overflow2 = ((limb) (in[8] >> 64)) >> 52;
653 
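	/* For illustration (not in the original source): the bits collected in
	 * overflow1 and overflow2 lie at positions 522 and above of the full
	 * product.  Reducing modulo p = 2^521 - 1 maps 2^522 to 2^1 and 2^580
	 * to 2^59, i.e. one bit above the limb boundaries at 2^0 and 2^58,
	 * which is why both values are doubled before being folded back into
	 * out[0] and out[1]. */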
654 	overflow1 <<= 1;  /* overflow1 < 2^13 + 2^7 + 2^59 */
655 	overflow2 <<= 1;  /* overflow2 < 2^13 */
656 
657 	out[0] += overflow1;  /* out[0] < 2^60 */
658 	out[1] += overflow2;  /* out[1] < 2^59 + 2^6 + 2^13 */
659 
660 	out[1] += out[0] >> 58; out[0] &= bottom58bits;
661 	/* out[0] < 2^58
662 	 * out[1] < 2^59 + 2^6 + 2^13 + 2^2
663 	 *        < 2^59 + 2^14 */
664 	}
665 
666 static void felem_square_reduce(felem out, const felem in)
667 	{
668 	largefelem tmp;
669 	felem_square(tmp, in);
670 	felem_reduce(out, tmp);
671 	}
672 
673 static void felem_mul_reduce(felem out, const felem in1, const felem in2)
674 	{
675 	largefelem tmp;
676 	felem_mul(tmp, in1, in2);
677 	felem_reduce(out, tmp);
678 	}
679 
680 /* felem_inv calculates |out| = |in|^{-1}
681  *
682  * Based on Fermat's Little Theorem:
683  *   a^p = a (mod p)
684  *   a^{p-1} = 1 (mod p)
685  *   a^{p-2} = a^{-1} (mod p)
686  */
687 static void felem_inv(felem out, const felem in)
688 	{
689 	felem ftmp, ftmp2, ftmp3, ftmp4;
690 	largefelem tmp;
691 	unsigned i;
692 
693 	felem_square(tmp, in); felem_reduce(ftmp, tmp);		/* 2^1 */
694 	felem_mul(tmp, in, ftmp); felem_reduce(ftmp, tmp);	/* 2^2 - 2^0 */
695 	felem_assign(ftmp2, ftmp);
696 	felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);	/* 2^3 - 2^1 */
697 	felem_mul(tmp, in, ftmp); felem_reduce(ftmp, tmp);	/* 2^3 - 2^0 */
698 	felem_square(tmp, ftmp); felem_reduce(ftmp, tmp);	/* 2^4 - 2^1 */
699 
700 	felem_square(tmp, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^3 - 2^1 */
701 	felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^4 - 2^2 */
702 	felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^4 - 2^0 */
703 
704 	felem_assign(ftmp2, ftmp3);
705 	felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^5 - 2^1 */
706 	felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^6 - 2^2 */
707 	felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^7 - 2^3 */
708 	felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^8 - 2^4 */
709 	felem_assign(ftmp4, ftmp3);
710 	felem_mul(tmp, ftmp3, ftmp); felem_reduce(ftmp4, tmp);	/* 2^8 - 2^1 */
711 	felem_square(tmp, ftmp4); felem_reduce(ftmp4, tmp);	/* 2^9 - 2^2 */
712 	felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^8 - 2^0 */
713 	felem_assign(ftmp2, ftmp3);
714 
715 	for (i = 0; i < 8; i++)
716 		{
717 		felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^16 - 2^8 */
718 		}
719 	felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^16 - 2^0 */
720 	felem_assign(ftmp2, ftmp3);
721 
722 	for (i = 0; i < 16; i++)
723 		{
724 		felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^32 - 2^16 */
725 		}
726 	felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^32 - 2^0 */
727 	felem_assign(ftmp2, ftmp3);
728 
729 	for (i = 0; i < 32; i++)
730 		{
731 		felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^64 - 2^32 */
732 		}
733 	felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^64 - 2^0 */
734 	felem_assign(ftmp2, ftmp3);
735 
736 	for (i = 0; i < 64; i++)
737 		{
738 		felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^128 - 2^64 */
739 		}
740 	felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^128 - 2^0 */
741 	felem_assign(ftmp2, ftmp3);
742 
743 	for (i = 0; i < 128; i++)
744 		{
745 		felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^256 - 2^128 */
746 		}
747 	felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^256 - 2^0 */
748 	felem_assign(ftmp2, ftmp3);
749 
750 	for (i = 0; i < 256; i++)
751 		{
752 		felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^512 - 2^256 */
753 		}
754 	felem_mul(tmp, ftmp3, ftmp2); felem_reduce(ftmp3, tmp);	/* 2^512 - 2^0 */
755 
756 	for (i = 0; i < 9; i++)
757 		{
758 		felem_square(tmp, ftmp3); felem_reduce(ftmp3, tmp);	/* 2^521 - 2^9 */
759 		}
760 	felem_mul(tmp, ftmp3, ftmp4); felem_reduce(ftmp3, tmp);	/* 2^521 - 2^2 */
761 	felem_mul(tmp, ftmp3, in); felem_reduce(out, tmp);	/* 2^521 - 3 */
762 }
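/* For illustration (not part of the original file): the trailing comments in
 * felem_inv track the exponent of |in| held in each temporary, so e.g.
 * "2^16 - 2^0" means the value is in^(2^16 - 1).  Squaring doubles the
 * exponent and multiplying adds exponents, and the chain ends with
 * in^(2^521 - 3) = in^(p - 2), which equals in^(-1) mod p by Fermat's
 * little theorem. */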
763 
764 /* This is 2^521-1, expressed as an felem */
765 static const felem kPrime =
766 	{
767 	0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
768 	0x03ffffffffffffff, 0x03ffffffffffffff, 0x03ffffffffffffff,
769 	0x03ffffffffffffff, 0x03ffffffffffffff, 0x01ffffffffffffff
770 	};
771 
772 /* felem_is_zero returns a limb with all bits set if |in| == 0 (mod p) and 0
773  * otherwise.
774  * On entry:
775  *   in[i] < 2^59 + 2^14
776  */
777 static limb felem_is_zero(const felem in)
778 	{
779 	felem ftmp;
780 	limb is_zero, is_p;
781 	felem_assign(ftmp, in);
782 
783 	ftmp[0] += ftmp[8] >> 57; ftmp[8] &= bottom57bits;
784 	/* ftmp[8] < 2^57 */
785 	ftmp[1] += ftmp[0] >> 58; ftmp[0] &= bottom58bits;
786 	ftmp[2] += ftmp[1] >> 58; ftmp[1] &= bottom58bits;
787 	ftmp[3] += ftmp[2] >> 58; ftmp[2] &= bottom58bits;
788 	ftmp[4] += ftmp[3] >> 58; ftmp[3] &= bottom58bits;
789 	ftmp[5] += ftmp[4] >> 58; ftmp[4] &= bottom58bits;
790 	ftmp[6] += ftmp[5] >> 58; ftmp[5] &= bottom58bits;
791 	ftmp[7] += ftmp[6] >> 58; ftmp[6] &= bottom58bits;
792 	ftmp[8] += ftmp[7] >> 58; ftmp[7] &= bottom58bits;
793 	/* ftmp[8] < 2^57 + 4 */
794 
795 	/* The ninth limb of 2*(2^521-1) is 0x03ffffffffffffff, which is
796 	 * greater than our bound for ftmp[8]. Therefore we only have to check
797 	 * whether the value is zero or 2^521-1. */
798 
799 	is_zero = 0;
800 	is_zero |= ftmp[0];
801 	is_zero |= ftmp[1];
802 	is_zero |= ftmp[2];
803 	is_zero |= ftmp[3];
804 	is_zero |= ftmp[4];
805 	is_zero |= ftmp[5];
806 	is_zero |= ftmp[6];
807 	is_zero |= ftmp[7];
808 	is_zero |= ftmp[8];
809 
810 	is_zero--;
811 	/* We know that ftmp[i] < 2^63, therefore the only way that the top bit
812 	 * can be set is if is_zero was 0 before the decrement. */
813 	is_zero = ((s64) is_zero) >> 63;
814 
815 	is_p = ftmp[0] ^ kPrime[0];
816 	is_p |= ftmp[1] ^ kPrime[1];
817 	is_p |= ftmp[2] ^ kPrime[2];
818 	is_p |= ftmp[3] ^ kPrime[3];
819 	is_p |= ftmp[4] ^ kPrime[4];
820 	is_p |= ftmp[5] ^ kPrime[5];
821 	is_p |= ftmp[6] ^ kPrime[6];
822 	is_p |= ftmp[7] ^ kPrime[7];
823 	is_p |= ftmp[8] ^ kPrime[8];
824 
825 	is_p--;
826 	is_p = ((s64) is_p) >> 63;
827 
828 	is_zero |= is_p;
829 	return is_zero;
830 	}
831 
832 static int felem_is_zero_int(const felem in)
833 	{
834 	return (int) (felem_is_zero(in) & ((limb)1));
835 	}
836 
837 /* felem_contract converts |in| to its unique, minimal representation.
838  * On entry:
839  *   in[i] < 2^59 + 2^14
840  */
841 static void felem_contract(felem out, const felem in)
842 	{
843 	limb is_p, is_greater, sign;
844 	static const limb two58 = ((limb)1) << 58;
845 
846 	felem_assign(out, in);
847 
848 	out[0] += out[8] >> 57; out[8] &= bottom57bits;
849 	/* out[8] < 2^57 */
850 	out[1] += out[0] >> 58; out[0] &= bottom58bits;
851 	out[2] += out[1] >> 58; out[1] &= bottom58bits;
852 	out[3] += out[2] >> 58; out[2] &= bottom58bits;
853 	out[4] += out[3] >> 58; out[3] &= bottom58bits;
854 	out[5] += out[4] >> 58; out[4] &= bottom58bits;
855 	out[6] += out[5] >> 58; out[5] &= bottom58bits;
856 	out[7] += out[6] >> 58; out[6] &= bottom58bits;
857 	out[8] += out[7] >> 58; out[7] &= bottom58bits;
858 	/* out[8] < 2^57 + 4 */
859 
860 	/* If the value is greater than 2^521-1 then we have to subtract
861 	 * 2^521-1 out. See the comments in felem_is_zero regarding why we
862 	 * don't test for other multiples of the prime. */
863 
864 	/* First, if |out| is equal to 2^521-1, we subtract it out to get zero. */
865 
866 	is_p = out[0] ^ kPrime[0];
867 	is_p |= out[1] ^ kPrime[1];
868 	is_p |= out[2] ^ kPrime[2];
869 	is_p |= out[3] ^ kPrime[3];
870 	is_p |= out[4] ^ kPrime[4];
871 	is_p |= out[5] ^ kPrime[5];
872 	is_p |= out[6] ^ kPrime[6];
873 	is_p |= out[7] ^ kPrime[7];
874 	is_p |= out[8] ^ kPrime[8];
875 
876 	is_p--;
877 	is_p &= is_p << 32;
878 	is_p &= is_p << 16;
879 	is_p &= is_p << 8;
880 	is_p &= is_p << 4;
881 	is_p &= is_p << 2;
882 	is_p &= is_p << 1;
883 	is_p = ((s64) is_p) >> 63;
884 	is_p = ~is_p;
885 
886 	/* is_p is 0 iff |out| == 2^521-1 and all ones otherwise */
887 
888 	out[0] &= is_p;
889 	out[1] &= is_p;
890 	out[2] &= is_p;
891 	out[3] &= is_p;
892 	out[4] &= is_p;
893 	out[5] &= is_p;
894 	out[6] &= is_p;
895 	out[7] &= is_p;
896 	out[8] &= is_p;
897 
898 	/* In order to test that |out| >= 2^521-1 we need only test if out[8]
899 	 * >> 57 is greater than zero as (2^521-1) + x >= 2^522 */
900 	is_greater = out[8] >> 57;
901 	is_greater |= is_greater << 32;
902 	is_greater |= is_greater << 16;
903 	is_greater |= is_greater << 8;
904 	is_greater |= is_greater << 4;
905 	is_greater |= is_greater << 2;
906 	is_greater |= is_greater << 1;
907 	is_greater = ((s64) is_greater) >> 63;
908 
909 	out[0] -= kPrime[0] & is_greater;
910 	out[1] -= kPrime[1] & is_greater;
911 	out[2] -= kPrime[2] & is_greater;
912 	out[3] -= kPrime[3] & is_greater;
913 	out[4] -= kPrime[4] & is_greater;
914 	out[5] -= kPrime[5] & is_greater;
915 	out[6] -= kPrime[6] & is_greater;
916 	out[7] -= kPrime[7] & is_greater;
917 	out[8] -= kPrime[8] & is_greater;
918 
919 	/* Eliminate negative coefficients */
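	/* For illustration (not in the original source): each step below
	 * checks the top bit of a limb; if the subtraction above or a
	 * preceding borrow made it go negative, sign becomes all ones,
	 * 2^58 is added back into that limb and 1 is borrowed from the next
	 * one.  The final three lines repeat the propagation for out[5..7]. */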
920 	sign = -(out[0] >> 63); out[0] += (two58 & sign); out[1] -= (1 & sign);
921 	sign = -(out[1] >> 63); out[1] += (two58 & sign); out[2] -= (1 & sign);
922 	sign = -(out[2] >> 63); out[2] += (two58 & sign); out[3] -= (1 & sign);
923 	sign = -(out[3] >> 63); out[3] += (two58 & sign); out[4] -= (1 & sign);
924 	sign = -(out[4] >> 63); out[4] += (two58 & sign); out[5] -= (1 & sign);
925 	sign = -(out[5] >> 63); out[5] += (two58 & sign); out[6] -= (1 & sign);
926 	sign = -(out[6] >> 63); out[6] += (two58 & sign); out[7] -= (1 & sign);
927 	sign = -(out[7] >> 63); out[7] += (two58 & sign); out[8] -= (1 & sign);
928 	sign = -(out[5] >> 63); out[5] += (two58 & sign); out[6] -= (1 & sign);
929 	sign = -(out[6] >> 63); out[6] += (two58 & sign); out[7] -= (1 & sign);
930 	sign = -(out[7] >> 63); out[7] += (two58 & sign); out[8] -= (1 & sign);
931 	}
932 
933 /* Group operations
934  * ----------------
935  *
936  * Building on top of the field operations we have the operations on the
937  * elliptic curve group itself. Points on the curve are represented in Jacobian
938  * coordinates */
939 
940 /* point_double calculates 2*(x_in, y_in, z_in)
941  *
942  * The method is taken from:
943  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
944  *
945  * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed,
946  * while x_out == y_in is not (maybe this works, but it's not tested). */
947 static void
948 point_double(felem x_out, felem y_out, felem z_out,
949 	     const felem x_in, const felem y_in, const felem z_in)
950 	{
951 	largefelem tmp, tmp2;
952 	felem delta, gamma, beta, alpha, ftmp, ftmp2;
953 
954 	felem_assign(ftmp, x_in);
955 	felem_assign(ftmp2, x_in);
956 
957 	/* delta = z^2 */
958 	felem_square(tmp, z_in);
959 	felem_reduce(delta, tmp);  /* delta[i] < 2^59 + 2^14 */
960 
961 	/* gamma = y^2 */
962 	felem_square(tmp, y_in);
963 	felem_reduce(gamma, tmp);  /* gamma[i] < 2^59 + 2^14 */
964 
965 	/* beta = x*gamma */
966 	felem_mul(tmp, x_in, gamma);
967 	felem_reduce(beta, tmp);  /* beta[i] < 2^59 + 2^14 */
968 
969 	/* alpha = 3*(x-delta)*(x+delta) */
970 	felem_diff64(ftmp, delta);
971 	/* ftmp[i] < 2^61 */
972 	felem_sum64(ftmp2, delta);
973 	/* ftmp2[i] < 2^60 + 2^15 */
974 	felem_scalar64(ftmp2, 3);
975 	/* ftmp2[i] < 3*2^60 + 3*2^15 */
976 	felem_mul(tmp, ftmp, ftmp2);
977 	/* tmp[i] < 17(3*2^121 + 3*2^76)
978 	 *        = 61*2^121 + 61*2^76
979 	 *        < 64*2^121 + 64*2^76
980 	 *        = 2^127 + 2^82
981 	 *        < 2^128 */
982 	felem_reduce(alpha, tmp);
983 
984 	/* x' = alpha^2 - 8*beta */
985 	felem_square(tmp, alpha);
986 	/* tmp[i] < 17*2^120
987 	 *        < 2^125 */
988 	felem_assign(ftmp, beta);
989 	felem_scalar64(ftmp, 8);
990 	/* ftmp[i] < 2^62 + 2^17 */
991 	felem_diff_128_64(tmp, ftmp);
992 	/* tmp[i] < 2^125 + 2^63 + 2^62 + 2^17 */
993 	felem_reduce(x_out, tmp);
994 
995 	/* z' = (y + z)^2 - gamma - delta */
996 	felem_sum64(delta, gamma);
997 	/* delta[i] < 2^60 + 2^15 */
998 	felem_assign(ftmp, y_in);
999 	felem_sum64(ftmp, z_in);
1000 	/* ftmp[i] < 2^60 + 2^15 */
1001 	felem_square(tmp, ftmp);
1002 	/* tmp[i] < 17(2^122)
1003 	 *        < 2^127 */
1004 	felem_diff_128_64(tmp, delta);
1005 	/* tmp[i] < 2^127 + 2^63 */
1006 	felem_reduce(z_out, tmp);
1007 
1008 	/* y' = alpha*(4*beta - x') - 8*gamma^2 */
1009 	felem_scalar64(beta, 4);
1010 	/* beta[i] < 2^61 + 2^16 */
1011 	felem_diff64(beta, x_out);
1012 	/* beta[i] < 2^61 + 2^60 + 2^16 */
1013 	felem_mul(tmp, alpha, beta);
1014 	/* tmp[i] < 17*((2^59 + 2^14)(2^61 + 2^60 + 2^16))
1015 	 *        = 17*(2^120 + 2^75 + 2^119 + 2^74 + 2^75 + 2^30)
1016 	 *        = 17*(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1017 	 *        < 2^128 */
1018 	felem_square(tmp2, gamma);
1019 	/* tmp2[i] < 17*(2^59 + 2^14)^2
1020 	 *         = 17*(2^118 + 2^74 + 2^28) */
1021 	felem_scalar128(tmp2, 8);
1022 	/* tmp2[i] < 8*17*(2^118 + 2^74 + 2^28)
1023 	 *         = 2^125 + 2^121 + 2^81 + 2^77 + 2^35 + 2^31
1024 	 *         < 2^126 */
1025 	felem_diff128(tmp, tmp2);
1026 	/* tmp[i] < 2^127 - 2^69 + 17(2^120 + 2^119 + 2^76 + 2^74 + 2^30)
1027 	 *        = 2^127 + 2^124 + 2^122 + 2^120 + 2^118 + 2^80 + 2^78 + 2^76 +
1028 	 *          2^74 + 2^69 + 2^34 + 2^30
1029 	 *        < 2^128 */
1030 	felem_reduce(y_out, tmp);
1031 	}
1032 
1033 /* copy_conditional copies in to out iff mask is all ones. */
1034 static void
1035 copy_conditional(felem out, const felem in, limb mask)
1036 	{
1037 	unsigned i;
1038 	for (i = 0; i < NLIMBS; ++i)
1039 		{
1040 		const limb tmp = mask & (in[i] ^ out[i]);
1041 		out[i] ^= tmp;
1042 		}
1043 	}
1044 
1045 /* point_add calculates (x1, y1, z1) + (x2, y2, z2)
1046  *
1047  * The method is taken from
1048  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1049  * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1050  *
1051  * This function includes a branch for checking whether the two input points
1052  * are equal (while not equal to the point at infinity). This case never
1053  * happens during single point multiplication, so there is no timing leak for
1054  * ECDH or ECDSA signing. */
1055 static void point_add(felem x3, felem y3, felem z3,
1056 	const felem x1, const felem y1, const felem z1,
1057 	const int mixed, const felem x2, const felem y2, const felem z2)
1058 	{
1059 	felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
1060 	largefelem tmp, tmp2;
1061 	limb x_equal, y_equal, z1_is_zero, z2_is_zero;
1062 
1063 	z1_is_zero = felem_is_zero(z1);
1064 	z2_is_zero = felem_is_zero(z2);
1065 
1066 	/* ftmp = z1z1 = z1**2 */
1067 	felem_square(tmp, z1);
1068 	felem_reduce(ftmp, tmp);
1069 
1070 	if (!mixed)
1071 		{
1072 		/* ftmp2 = z2z2 = z2**2 */
1073 		felem_square(tmp, z2);
1074 		felem_reduce(ftmp2, tmp);
1075 
1076 		/* u1 = ftmp3 = x1*z2z2 */
1077 		felem_mul(tmp, x1, ftmp2);
1078 		felem_reduce(ftmp3, tmp);
1079 
1080 		/* ftmp5 = z1 + z2 */
1081 		felem_assign(ftmp5, z1);
1082 		felem_sum64(ftmp5, z2);
1083 		/* ftmp5[i] < 2^61 */
1084 
1085 		/* ftmp5 = (z1 + z2)**2 - z1z1 - z2z2 = 2*z1z2 */
1086 		felem_square(tmp, ftmp5);
1087 		/* tmp[i] < 17*2^122 */
1088 		felem_diff_128_64(tmp, ftmp);
1089 		/* tmp[i] < 17*2^122 + 2^63 */
1090 		felem_diff_128_64(tmp, ftmp2);
1091 		/* tmp[i] < 17*2^122 + 2^64 */
1092 		felem_reduce(ftmp5, tmp);
1093 
1094 		/* ftmp2 = z2 * z2z2 */
1095 		felem_mul(tmp, ftmp2, z2);
1096 		felem_reduce(ftmp2, tmp);
1097 
1098 		/* s1 = ftmp6 = y1 * z2**3 */
1099 		felem_mul(tmp, y1, ftmp2);
1100 		felem_reduce(ftmp6, tmp);
1101 		}
1102 	else
1103 		{
1104 		/* We'll assume z2 = 1 (special case z2 = 0 is handled later) */
1105 
1106 		/* u1 = ftmp3 = x1*z2z2 */
1107 		felem_assign(ftmp3, x1);
1108 
1109 		/* ftmp5 = 2*z1z2 */
1110 		felem_scalar(ftmp5, z1, 2);
1111 
1112 		/* s1 = ftmp6 = y1 * z2**3 */
1113 		felem_assign(ftmp6, y1);
1114 		}
1115 
1116 	/* u2 = x2*z1z1 */
1117 	felem_mul(tmp, x2, ftmp);
1118 	/* tmp[i] < 17*2^120 */
1119 
1120 	/* h = ftmp4 = u2 - u1 */
1121 	felem_diff_128_64(tmp, ftmp3);
1122 	/* tmp[i] < 17*2^120 + 2^63 */
1123 	felem_reduce(ftmp4, tmp);
1124 
1125 	x_equal = felem_is_zero(ftmp4);
1126 
1127 	/* z_out = ftmp5 * h */
1128 	felem_mul(tmp, ftmp5, ftmp4);
1129 	felem_reduce(z_out, tmp);
1130 
1131 	/* ftmp = z1 * z1z1 */
1132 	felem_mul(tmp, ftmp, z1);
1133 	felem_reduce(ftmp, tmp);
1134 
1135 	/* s2 = tmp = y2 * z1**3 */
1136 	felem_mul(tmp, y2, ftmp);
1137 	/* tmp[i] < 17*2^120 */
1138 
1139 	/* r = ftmp5 = (s2 - s1)*2 */
1140 	felem_diff_128_64(tmp, ftmp6);
1141 	/* tmp[i] < 17*2^120 + 2^63 */
1142 	felem_reduce(ftmp5, tmp);
1143 	y_equal = felem_is_zero(ftmp5);
1144 	felem_scalar64(ftmp5, 2);
1145 	/* ftmp5[i] < 2^61 */
1146 
1147 	if (x_equal && y_equal && !z1_is_zero && !z2_is_zero)
1148 		{
1149 		point_double(x3, y3, z3, x1, y1, z1);
1150 		return;
1151 		}
1152 
1153 	/* I = ftmp = (2h)**2 */
1154 	felem_assign(ftmp, ftmp4);
1155 	felem_scalar64(ftmp, 2);
1156 	/* ftmp[i] < 2^61 */
1157 	felem_square(tmp, ftmp);
1158 	/* tmp[i] < 17*2^122 */
1159 	felem_reduce(ftmp, tmp);
1160 
1161 	/* J = ftmp2 = h * I */
1162 	felem_mul(tmp, ftmp4, ftmp);
1163 	felem_reduce(ftmp2, tmp);
1164 
1165 	/* V = ftmp4 = U1 * I */
1166 	felem_mul(tmp, ftmp3, ftmp);
1167 	felem_reduce(ftmp4, tmp);
1168 
1169 	/* x_out = r**2 - J - 2V */
1170 	felem_square(tmp, ftmp5);
1171 	/* tmp[i] < 17*2^122 */
1172 	felem_diff_128_64(tmp, ftmp2);
1173 	/* tmp[i] < 17*2^122 + 2^63 */
1174 	felem_assign(ftmp3, ftmp4);
1175 	felem_scalar64(ftmp4, 2);
1176 	/* ftmp4[i] < 2^61 */
1177 	felem_diff_128_64(tmp, ftmp4);
1178 	/* tmp[i] < 17*2^122 + 2^64 */
1179 	felem_reduce(x_out, tmp);
1180 
1181 	/* y_out = r(V-x_out) - 2 * s1 * J */
1182 	felem_diff64(ftmp3, x_out);
1183 	/* ftmp3[i] < 2^60 + 2^60
1184 	 *          = 2^61 */
1185 	felem_mul(tmp, ftmp5, ftmp3);
1186 	/* tmp[i] < 17*2^122 */
1187 	felem_mul(tmp2, ftmp6, ftmp2);
1188 	/* tmp2[i] < 17*2^120 */
1189 	felem_scalar128(tmp2, 2);
1190 	/* tmp2[i] < 17*2^121 */
1191 	felem_diff128(tmp, tmp2);
1192 	/* tmp[i] < 2^127 - 2^69 + 17*2^122
1193 	 *        = 2^126 - 2^122 - 2^6 - 2^2 - 1
1194 	 *        < 2^127 */
1195 	felem_reduce(y_out, tmp);
1196 
1197 	copy_conditional(x_out, x2, z1_is_zero);
1198 	copy_conditional(x_out, x1, z2_is_zero);
1199 	copy_conditional(y_out, y2, z1_is_zero);
1200 	copy_conditional(y_out, y1, z2_is_zero);
1201 	copy_conditional(z_out, z2, z1_is_zero);
1202 	copy_conditional(z_out, z1, z2_is_zero);
1203 	felem_assign(x3, x_out);
1204 	felem_assign(y3, y_out);
1205 	felem_assign(z3, z_out);
1206 	}
1207 
1208 /* Base point pre computation
1209  * --------------------------
1210  *
1211  * Two different sorts of precomputed tables are used in the following code.
1212  * Each contains various points on the curve, where each point is three field
1213  * elements (x, y, z).
1214  *
1215  * For the base point table, z is usually 1 (0 for the point at infinity).
1216  * This table has 16 elements:
1217  * index | bits    | point
1218  * ------+---------+------------------------------
1219  *     0 | 0 0 0 0 | 0G
1220  *     1 | 0 0 0 1 | 1G
1221  *     2 | 0 0 1 0 | 2^130G
1222  *     3 | 0 0 1 1 | (2^130 + 1)G
1223  *     4 | 0 1 0 0 | 2^260G
1224  *     5 | 0 1 0 1 | (2^260 + 1)G
1225  *     6 | 0 1 1 0 | (2^260 + 2^130)G
1226  *     7 | 0 1 1 1 | (2^260 + 2^130 + 1)G
1227  *     8 | 1 0 0 0 | 2^390G
1228  *     9 | 1 0 0 1 | (2^390 + 1)G
1229  *    10 | 1 0 1 0 | (2^390 + 2^130)G
1230  *    11 | 1 0 1 1 | (2^390 + 2^130 + 1)G
1231  *    12 | 1 1 0 0 | (2^390 + 2^260)G
1232  *    13 | 1 1 0 1 | (2^390 + 2^260 + 1)G
1233  *    14 | 1 1 1 0 | (2^390 + 2^260 + 2^130)G
1234  *    15 | 1 1 1 1 | (2^390 + 2^260 + 2^130 + 1)G
1235  *
1236  * The reason for this is so that we can clock bits into four different
1237  * locations when doing simple scalar multiplies against the base point.
1238  *
1239  * Tables for other points have table[i] = iG for i in 0 .. 16. */
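/* A worked example, for illustration (not in the original source): index 5 is
 * binary 0101, i.e. the entry (2^260 + 1)G, so a single constant-time table
 * lookup adds the contributions of scalar bits i and i + 260 at once.  The
 * four index bits correspond to the bits clocked in by batch_mul below:
 * bit(i + 390), bit(i + 260), bit(i + 130) and bit(i). */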
1240 
1241 /* gmul is the table of precomputed base points */
1242 static const felem gmul[16][3] =
1243 	{{{0, 0, 0, 0, 0, 0, 0, 0, 0},
1244 	  {0, 0, 0, 0, 0, 0, 0, 0, 0},
1245 	  {0, 0, 0, 0, 0, 0, 0, 0, 0}},
1246 	 {{0x017e7e31c2e5bd66, 0x022cf0615a90a6fe, 0x00127a2ffa8de334,
1247 	   0x01dfbf9d64a3f877, 0x006b4d3dbaa14b5e, 0x014fed487e0a2bd8,
1248 	   0x015b4429c6481390, 0x03a73678fb2d988e, 0x00c6858e06b70404},
1249 	  {0x00be94769fd16650, 0x031c21a89cb09022, 0x039013fad0761353,
1250 	   0x02657bd099031542, 0x03273e662c97ee72, 0x01e6d11a05ebef45,
1251 	   0x03d1bd998f544495, 0x03001172297ed0b1, 0x011839296a789a3b},
1252 	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1253 	 {{0x0373faacbc875bae, 0x00f325023721c671, 0x00f666fd3dbde5ad,
1254 	   0x01a6932363f88ea7, 0x01fc6d9e13f9c47b, 0x03bcbffc2bbf734e,
1255 	   0x013ee3c3647f3a92, 0x029409fefe75d07d, 0x00ef9199963d85e5},
1256 	  {0x011173743ad5b178, 0x02499c7c21bf7d46, 0x035beaeabb8b1a58,
1257 	   0x00f989c4752ea0a3, 0x0101e1de48a9c1a3, 0x01a20076be28ba6c,
1258 	   0x02f8052e5eb2de95, 0x01bfe8f82dea117c, 0x0160074d3c36ddb7},
1259 	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1260 	 {{0x012f3fc373393b3b, 0x03d3d6172f1419fa, 0x02adc943c0b86873,
1261 	   0x00d475584177952b, 0x012a4d1673750ee2, 0x00512517a0f13b0c,
1262 	   0x02b184671a7b1734, 0x0315b84236f1a50a, 0x00a4afc472edbdb9},
1263 	  {0x00152a7077f385c4, 0x03044007d8d1c2ee, 0x0065829d61d52b52,
1264 	   0x00494ff6b6631d0d, 0x00a11d94d5f06bcf, 0x02d2f89474d9282e,
1265 	   0x0241c5727c06eeb9, 0x0386928710fbdb9d, 0x01f883f727b0dfbe},
1266 	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1267 	 {{0x019b0c3c9185544d, 0x006243a37c9d97db, 0x02ee3cbe030a2ad2,
1268 	   0x00cfdd946bb51e0d, 0x0271c00932606b91, 0x03f817d1ec68c561,
1269 	   0x03f37009806a369c, 0x03c1f30baf184fd5, 0x01091022d6d2f065},
1270 	  {0x0292c583514c45ed, 0x0316fca51f9a286c, 0x00300af507c1489a,
1271 	   0x0295f69008298cf1, 0x02c0ed8274943d7b, 0x016509b9b47a431e,
1272 	   0x02bc9de9634868ce, 0x005b34929bffcb09, 0x000c1a0121681524},
1273 	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1274 	 {{0x0286abc0292fb9f2, 0x02665eee9805b3f7, 0x01ed7455f17f26d6,
1275 	   0x0346355b83175d13, 0x006284944cd0a097, 0x0191895bcdec5e51,
1276 	   0x02e288370afda7d9, 0x03b22312bfefa67a, 0x01d104d3fc0613fe},
1277 	  {0x0092421a12f7e47f, 0x0077a83fa373c501, 0x03bd25c5f696bd0d,
1278 	   0x035c41e4d5459761, 0x01ca0d1742b24f53, 0x00aaab27863a509c,
1279 	   0x018b6de47df73917, 0x025c0b771705cd01, 0x01fd51d566d760a7},
1280 	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1281 	 {{0x01dd92ff6b0d1dbd, 0x039c5e2e8f8afa69, 0x0261ed13242c3b27,
1282 	   0x0382c6e67026e6a0, 0x01d60b10be2089f9, 0x03c15f3dce86723f,
1283 	   0x03c764a32d2a062d, 0x017307eac0fad056, 0x018207c0b96c5256},
1284 	  {0x0196a16d60e13154, 0x03e6ce74c0267030, 0x00ddbf2b4e52a5aa,
1285 	   0x012738241bbf31c8, 0x00ebe8dc04685a28, 0x024c2ad6d380d4a2,
1286 	   0x035ee062a6e62d0e, 0x0029ed74af7d3a0f, 0x00eef32aec142ebd},
1287 	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1288 	 {{0x00c31ec398993b39, 0x03a9f45bcda68253, 0x00ac733c24c70890,
1289 	   0x00872b111401ff01, 0x01d178c23195eafb, 0x03bca2c816b87f74,
1290 	   0x0261a9af46fbad7a, 0x0324b2a8dd3d28f9, 0x00918121d8f24e23},
1291 	  {0x032bc8c1ca983cd7, 0x00d869dfb08fc8c6, 0x01693cb61fce1516,
1292 	   0x012a5ea68f4e88a8, 0x010869cab88d7ae3, 0x009081ad277ceee1,
1293 	   0x033a77166d064cdc, 0x03955235a1fb3a95, 0x01251a4a9b25b65e},
1294 	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1295 	 {{0x00148a3a1b27f40b, 0x0123186df1b31fdc, 0x00026e7beaad34ce,
1296 	   0x01db446ac1d3dbba, 0x0299c1a33437eaec, 0x024540610183cbb7,
1297 	   0x0173bb0e9ce92e46, 0x02b937e43921214b, 0x01ab0436a9bf01b5},
1298 	  {0x0383381640d46948, 0x008dacbf0e7f330f, 0x03602122bcc3f318,
1299 	   0x01ee596b200620d6, 0x03bd0585fda430b3, 0x014aed77fd123a83,
1300 	   0x005ace749e52f742, 0x0390fe041da2b842, 0x0189a8ceb3299242},
1301 	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1302 	 {{0x012a19d6b3282473, 0x00c0915918b423ce, 0x023a954eb94405ae,
1303 	   0x00529f692be26158, 0x0289fa1b6fa4b2aa, 0x0198ae4ceea346ef,
1304 	   0x0047d8cdfbdedd49, 0x00cc8c8953f0f6b8, 0x001424abbff49203},
1305 	  {0x0256732a1115a03a, 0x0351bc38665c6733, 0x03f7b950fb4a6447,
1306 	   0x000afffa94c22155, 0x025763d0a4dab540, 0x000511e92d4fc283,
1307 	   0x030a7e9eda0ee96c, 0x004c3cd93a28bf0a, 0x017edb3a8719217f},
1308 	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1309 	 {{0x011de5675a88e673, 0x031d7d0f5e567fbe, 0x0016b2062c970ae5,
1310 	   0x03f4a2be49d90aa7, 0x03cef0bd13822866, 0x03f0923dcf774a6c,
1311 	   0x0284bebc4f322f72, 0x016ab2645302bb2c, 0x01793f95dace0e2a},
1312 	  {0x010646e13527a28f, 0x01ca1babd59dc5e7, 0x01afedfd9a5595df,
1313 	   0x01f15785212ea6b1, 0x0324e5d64f6ae3f4, 0x02d680f526d00645,
1314 	   0x0127920fadf627a7, 0x03b383f75df4f684, 0x0089e0057e783b0a},
1315 	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1316 	 {{0x00f334b9eb3c26c6, 0x0298fdaa98568dce, 0x01c2d24843a82292,
1317 	   0x020bcb24fa1b0711, 0x02cbdb3d2b1875e6, 0x0014907598f89422,
1318 	   0x03abe3aa43b26664, 0x02cbf47f720bc168, 0x0133b5e73014b79b},
1319 	  {0x034aab5dab05779d, 0x00cdc5d71fee9abb, 0x0399f16bd4bd9d30,
1320 	   0x03582fa592d82647, 0x02be1cdfb775b0e9, 0x0034f7cea32e94cb,
1321 	   0x0335a7f08f56f286, 0x03b707e9565d1c8b, 0x0015c946ea5b614f},
1322 	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1323 	 {{0x024676f6cff72255, 0x00d14625cac96378, 0x00532b6008bc3767,
1324 	   0x01fc16721b985322, 0x023355ea1b091668, 0x029de7afdc0317c3,
1325 	   0x02fc8a7ca2da037c, 0x02de1217d74a6f30, 0x013f7173175b73bf},
1326 	  {0x0344913f441490b5, 0x0200f9e272b61eca, 0x0258a246b1dd55d2,
1327 	   0x03753db9ea496f36, 0x025e02937a09c5ef, 0x030cbd3d14012692,
1328 	   0x01793a67e70dc72a, 0x03ec1d37048a662e, 0x006550f700c32a8d},
1329 	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1330 	 {{0x00d3f48a347eba27, 0x008e636649b61bd8, 0x00d3b93716778fb3,
1331 	   0x004d1915757bd209, 0x019d5311a3da44e0, 0x016d1afcbbe6aade,
1332 	   0x0241bf5f73265616, 0x0384672e5d50d39b, 0x005009fee522b684},
1333 	  {0x029b4fab064435fe, 0x018868ee095bbb07, 0x01ea3d6936cc92b8,
1334 	   0x000608b00f78a2f3, 0x02db911073d1c20f, 0x018205938470100a,
1335 	   0x01f1e4964cbe6ff2, 0x021a19a29eed4663, 0x01414485f42afa81},
1336 	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1337 	 {{0x01612b3a17f63e34, 0x03813992885428e6, 0x022b3c215b5a9608,
1338 	   0x029b4057e19f2fcb, 0x0384059a587af7e6, 0x02d6400ace6fe610,
1339 	   0x029354d896e8e331, 0x00c047ee6dfba65e, 0x0037720542e9d49d},
1340 	  {0x02ce9eed7c5e9278, 0x0374ed703e79643b, 0x01316c54c4072006,
1341 	   0x005aaa09054b2ee8, 0x002824000c840d57, 0x03d4eba24771ed86,
1342 	   0x0189c50aabc3bdae, 0x0338c01541e15510, 0x00466d56e38eed42},
1343 	  {1, 0, 0, 0, 0, 0, 0, 0, 0}},
1344 	 {{0x007efd8330ad8bd6, 0x02465ed48047710b, 0x0034c6606b215e0c,
1345 	   0x016ae30c53cbf839, 0x01fa17bd37161216, 0x018ead4e61ce8ab9,
1346 	   0x005482ed5f5dee46, 0x037543755bba1d7f, 0x005e5ac7e70a9d0f},
1347 	  {0x0117e1bb2fdcb2a2, 0x03deea36249f40c4, 0x028d09b4a6246cb7,
1348 	   0x03524b8855bcf756, 0x023d7d109d5ceb58, 0x0178e43e3223ef9c,
1349 	   0x0154536a0c6e966a, 0x037964d1286ee9fe, 0x0199bcd90e125055},
1350 	 {1, 0, 0, 0, 0, 0, 0, 0, 0}}};
1351 
1352 /* select_point selects the |idx|th point from a precomputation table and
1353  * copies it to out. */
1354 static void select_point(const limb idx, unsigned int size, const felem pre_comp[/* size */][3],
1355 			 felem out[3])
1356 	{
1357 	unsigned i, j;
1358 	limb *outlimbs = &out[0][0];
1359 	memset(outlimbs, 0, 3 * sizeof(felem));
1360 
1361 	for (i = 0; i < size; i++)
1362 		{
1363 		const limb *inlimbs = &pre_comp[i][0][0];
1364 		limb mask = i ^ idx;
1365 		mask |= mask >> 4;
1366 		mask |= mask >> 2;
1367 		mask |= mask >> 1;
1368 		mask &= 1;
1369 		mask--;
1370 		for (j = 0; j < NLIMBS * 3; j++)
1371 			outlimbs[j] |= inlimbs[j] & mask;
1372 		}
1373 	}
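/* For illustration (not part of the original file): the mask computation above
 * folds every bit of i ^ idx into bit 0 (both values fit in five bits), so
 * after "mask &= 1; mask--" the mask is all ones exactly when i == idx and
 * zero otherwise.  Every table entry is read and masked, so the memory access
 * pattern is independent of the secret index. */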
1374 
1375 /* get_bit returns the |i|th bit in |in| */
1376 static char get_bit(const felem_bytearray in, int i)
1377 	{
1378 	if (i < 0)
1379 		return 0;
1380 	return (in[i >> 3] >> (i & 7)) & 1;
1381 	}
1382 
1383 /* Interleaved point multiplication using precomputed point multiples:
1384  * The small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[],
1385  * the scalars in scalars[]. If g_scalar is non-NULL, we also add this multiple
1386  * of the generator, using certain (large) precomputed multiples in g_pre_comp.
1387  * Output point (X, Y, Z) is stored in x_out, y_out, z_out */
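/* A brief note, for illustration (not in the original source): for the
 * non-generator points, a window of six scalar bits is consumed every five
 * doublings; ec_GFp_nistp_recode_scalar_bits turns that window into a sign
 * and a digit in [0, 16], which is why each pre_comp[] table holds the 17
 * multiples 0*P .. 16*P and why the negated point (X, -Y, Z) is selected
 * with copy_conditional when the sign bit is set. */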
1388 static void batch_mul(felem x_out, felem y_out, felem z_out,
1389 	const felem_bytearray scalars[], const unsigned num_points, const u8 *g_scalar,
1390 	const int mixed, const felem pre_comp[][17][3], const felem g_pre_comp[16][3])
1391 	{
1392 	int i, skip;
1393 	unsigned num, gen_mul = (g_scalar != NULL);
1394 	felem nq[3], tmp[4];
1395 	limb bits;
1396 	u8 sign, digit;
1397 
1398 	/* set nq to the point at infinity */
1399 	memset(nq, 0, 3 * sizeof(felem));
1400 
1401 	/* Loop over all scalars msb-to-lsb, interleaving additions
1402 	 * of multiples of the generator (last quarter of rounds)
1403 	 * and additions of the other points' multiples (every 5th round).
1404 	 */
1405 	skip = 1; /* save two point operations in the first round */
1406 	for (i = (num_points ? 520 : 130); i >= 0; --i)
1407 		{
1408 		/* double */
1409 		if (!skip)
1410 			point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);
1411 
1412 		/* add multiples of the generator */
1413 		if (gen_mul && (i <= 130))
1414 			{
1415 			bits = get_bit(g_scalar, i + 390) << 3;
1416 			if (i < 130)
1417 				{
1418 				bits |= get_bit(g_scalar, i + 260) << 2;
1419 				bits |= get_bit(g_scalar, i + 130) << 1;
1420 				bits |= get_bit(g_scalar, i);
1421 				}
1422 			/* select the point to add, in constant time */
1423 			select_point(bits, 16, g_pre_comp, tmp);
1424 			if (!skip)
1425 				{
1426 				point_add(nq[0], nq[1], nq[2],
1427 					nq[0], nq[1], nq[2],
1428 					1 /* mixed */, tmp[0], tmp[1], tmp[2]);
1429 				}
1430 			else
1431 				{
1432 				memcpy(nq, tmp, 3 * sizeof(felem));
1433 				skip = 0;
1434 				}
1435 			}
1436 
1437 		/* do other additions every 5 doublings */
1438 		if (num_points && (i % 5 == 0))
1439 			{
1440 			/* loop over all scalars */
1441 			for (num = 0; num < num_points; ++num)
1442 				{
1443 				bits = get_bit(scalars[num], i + 4) << 5;
1444 				bits |= get_bit(scalars[num], i + 3) << 4;
1445 				bits |= get_bit(scalars[num], i + 2) << 3;
1446 				bits |= get_bit(scalars[num], i + 1) << 2;
1447 				bits |= get_bit(scalars[num], i) << 1;
1448 				bits |= get_bit(scalars[num], i - 1);
1449 				ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);
1450 
1451 				/* select the point to add or subtract, in constant time */
1452 				select_point(digit, 17, pre_comp[num], tmp);
1453 				felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative point */
1454 				copy_conditional(tmp[1], tmp[3], (-(limb) sign));
1455 
1456 				if (!skip)
1457 					{
1458 					point_add(nq[0], nq[1], nq[2],
1459 						nq[0], nq[1], nq[2],
1460 						mixed, tmp[0], tmp[1], tmp[2]);
1461 					}
1462 				else
1463 					{
1464 					memcpy(nq, tmp, 3 * sizeof(felem));
1465 					skip = 0;
1466 					}
1467 				}
1468 			}
1469 		}
1470 	felem_assign(x_out, nq[0]);
1471 	felem_assign(y_out, nq[1]);
1472 	felem_assign(z_out, nq[2]);
1473 	}
1474 
1475 
1476 /* Precomputation for the group generator. */
1477 typedef struct {
1478 	felem g_pre_comp[16][3];
1479 	int references;
1480 } NISTP521_PRE_COMP;
1481 
1482 const EC_METHOD *EC_GFp_nistp521_method(void)
1483 	{
1484 	static const EC_METHOD ret = {
1485 		EC_FLAGS_DEFAULT_OCT,
1486 		NID_X9_62_prime_field,
1487 		ec_GFp_nistp521_group_init,
1488 		ec_GFp_simple_group_finish,
1489 		ec_GFp_simple_group_clear_finish,
1490 		ec_GFp_nist_group_copy,
1491 		ec_GFp_nistp521_group_set_curve,
1492 		ec_GFp_simple_group_get_curve,
1493 		ec_GFp_simple_group_get_degree,
1494 		ec_GFp_simple_group_check_discriminant,
1495 		ec_GFp_simple_point_init,
1496 		ec_GFp_simple_point_finish,
1497 		ec_GFp_simple_point_clear_finish,
1498 		ec_GFp_simple_point_copy,
1499 		ec_GFp_simple_point_set_to_infinity,
1500 		ec_GFp_simple_set_Jprojective_coordinates_GFp,
1501 		ec_GFp_simple_get_Jprojective_coordinates_GFp,
1502 		ec_GFp_simple_point_set_affine_coordinates,
1503 		ec_GFp_nistp521_point_get_affine_coordinates,
1504 		0 /* point_set_compressed_coordinates */,
1505 		0 /* point2oct */,
1506 		0 /* oct2point */,
1507 		ec_GFp_simple_add,
1508 		ec_GFp_simple_dbl,
1509 		ec_GFp_simple_invert,
1510 		ec_GFp_simple_is_at_infinity,
1511 		ec_GFp_simple_is_on_curve,
1512 		ec_GFp_simple_cmp,
1513 		ec_GFp_simple_make_affine,
1514 		ec_GFp_simple_points_make_affine,
1515 		ec_GFp_nistp521_points_mul,
1516 		ec_GFp_nistp521_precompute_mult,
1517 		ec_GFp_nistp521_have_precompute_mult,
1518 		ec_GFp_nist_field_mul,
1519 		ec_GFp_nist_field_sqr,
1520 		0 /* field_div */,
1521 		0 /* field_encode */,
1522 		0 /* field_decode */,
1523 		0 /* field_set_to_one */ };
1524 
1525 	return &ret;
1526 	}
1527 
1528 
1529 /******************************************************************************/
1530 /*		       FUNCTIONS TO MANAGE PRECOMPUTATION
1531  */
1532 
1533 static NISTP521_PRE_COMP *nistp521_pre_comp_new()
1534 	{
1535 	NISTP521_PRE_COMP *ret = NULL;
1536 	ret = (NISTP521_PRE_COMP *)OPENSSL_malloc(sizeof(NISTP521_PRE_COMP));
1537 	if (!ret)
1538 		{
1539 		ECerr(EC_F_NISTP521_PRE_COMP_NEW, ERR_R_MALLOC_FAILURE);
1540 		return ret;
1541 		}
1542 	memset(ret->g_pre_comp, 0, sizeof(ret->g_pre_comp));
1543 	ret->references = 1;
1544 	return ret;
1545 	}
1546 
1547 static void *nistp521_pre_comp_dup(void *src_)
1548 	{
1549 	NISTP521_PRE_COMP *src = src_;
1550 
1551 	/* no need to actually copy, these objects never change! */
1552 	CRYPTO_add(&src->references, 1, CRYPTO_LOCK_EC_PRE_COMP);
1553 
1554 	return src_;
1555 	}
1556 
1557 static void nistp521_pre_comp_free(void *pre_)
1558 	{
1559 	int i;
1560 	NISTP521_PRE_COMP *pre = pre_;
1561 
1562 	if (!pre)
1563 		return;
1564 
1565 	i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1566 	if (i > 0)
1567 		return;
1568 
1569 	OPENSSL_free(pre);
1570 	}
1571 
1572 static void nistp521_pre_comp_clear_free(void *pre_)
1573 	{
1574 	int i;
1575 	NISTP521_PRE_COMP *pre = pre_;
1576 
1577 	if (!pre)
1578 		return;
1579 
1580 	i = CRYPTO_add(&pre->references, -1, CRYPTO_LOCK_EC_PRE_COMP);
1581 	if (i > 0)
1582 		return;
1583 
1584 	OPENSSL_cleanse(pre, sizeof(*pre));
1585 	OPENSSL_free(pre);
1586 	}
1587 
1588 /******************************************************************************/
1589 /*			   OPENSSL EC_METHOD FUNCTIONS
1590  */
1591 
1592 int ec_GFp_nistp521_group_init(EC_GROUP *group)
1593 	{
1594 	int ret;
1595 	ret = ec_GFp_simple_group_init(group);
1596 	group->a_is_minus3 = 1;
1597 	return ret;
1598 	}
1599 
1600 int ec_GFp_nistp521_group_set_curve(EC_GROUP *group, const BIGNUM *p,
1601 	const BIGNUM *a, const BIGNUM *b, BN_CTX *ctx)
1602 	{
1603 	int ret = 0;
1604 	BN_CTX *new_ctx = NULL;
1605 	BIGNUM *curve_p, *curve_a, *curve_b;
1606 
1607 	if (ctx == NULL)
1608 		if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0;
1609 	BN_CTX_start(ctx);
1610 	if (((curve_p = BN_CTX_get(ctx)) == NULL) ||
1611 		((curve_a = BN_CTX_get(ctx)) == NULL) ||
1612 		((curve_b = BN_CTX_get(ctx)) == NULL)) goto err;
1613 	BN_bin2bn(nistp521_curve_params[0], sizeof(felem_bytearray), curve_p);
1614 	BN_bin2bn(nistp521_curve_params[1], sizeof(felem_bytearray), curve_a);
1615 	BN_bin2bn(nistp521_curve_params[2], sizeof(felem_bytearray), curve_b);
1616 	if ((BN_cmp(curve_p, p)) || (BN_cmp(curve_a, a)) ||
1617 		(BN_cmp(curve_b, b)))
1618 		{
1619 		ECerr(EC_F_EC_GFP_NISTP521_GROUP_SET_CURVE,
1620 			EC_R_WRONG_CURVE_PARAMETERS);
1621 		goto err;
1622 		}
1623 	group->field_mod_func = BN_nist_mod_521;
1624 	ret = ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
1625 err:
1626 	BN_CTX_end(ctx);
1627 	if (new_ctx != NULL)
1628 		BN_CTX_free(new_ctx);
1629 	return ret;
1630 	}
1631 
1632 /* Takes the Jacobian coordinates (X, Y, Z) of a point and returns
1633  * (X', Y') = (X/Z^2, Y/Z^3) */
1634 int ec_GFp_nistp521_point_get_affine_coordinates(const EC_GROUP *group,
1635 	const EC_POINT *point, BIGNUM *x, BIGNUM *y, BN_CTX *ctx)
1636 	{
1637 	felem z1, z2, x_in, y_in, x_out, y_out;
1638 	largefelem tmp;
1639 
1640 	if (EC_POINT_is_at_infinity(group, point))
1641 		{
1642 		ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES,
1643 			EC_R_POINT_AT_INFINITY);
1644 		return 0;
1645 		}
1646 	if ((!BN_to_felem(x_in, &point->X)) || (!BN_to_felem(y_in, &point->Y)) ||
1647 		(!BN_to_felem(z1, &point->Z))) return 0;
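	/*
	 * A single field inversion suffices: z2 = Z^-1, then z1 is reused to
	 * hold Z^-2 (for X/Z^2) and later Z^-3 (for Y/Z^3).
	 */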
1648 	felem_inv(z2, z1);
1649 	felem_square(tmp, z2); felem_reduce(z1, tmp);
1650 	felem_mul(tmp, x_in, z1); felem_reduce(x_in, tmp);
1651 	felem_contract(x_out, x_in);
1652 	if (x != NULL)
1653 		{
1654 		if (!felem_to_BN(x, x_out))
1655 			{
1656 			ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES, ERR_R_BN_LIB);
1657 			return 0;
1658 			}
1659 		}
1660 	felem_mul(tmp, z1, z2); felem_reduce(z1, tmp);
1661 	felem_mul(tmp, y_in, z1); felem_reduce(y_in, tmp);
1662 	felem_contract(y_out, y_in);
1663 	if (y != NULL)
1664 		{
1665 		if (!felem_to_BN(y, y_out))
1666 			{
1667 			ECerr(EC_F_EC_GFP_NISTP521_POINT_GET_AFFINE_COORDINATES, ERR_R_BN_LIB);
1668 			return 0;
1669 			}
1670 		}
1671 	return 1;
1672 	}
1673 
1674 static void make_points_affine(size_t num, felem points[/* num */][3], felem tmp_felems[/* num+1 */])
1675 	{
1676 	/* Runs in constant time, unless an input is the point at infinity
1677 	 * (which normally shouldn't happen). */
1678 	ec_GFp_nistp_points_make_affine_internal(
1679 		num,
1680 		points,
1681 		sizeof(felem),
1682 		tmp_felems,
1683 		(void (*)(void *)) felem_one,
1684 		(int (*)(const void *)) felem_is_zero_int,
1685 		(void (*)(void *, const void *)) felem_assign,
1686 		(void (*)(void *, const void *)) felem_square_reduce,
1687 		(void (*)(void *, const void *, const void *)) felem_mul_reduce,
1688 		(void (*)(void *, const void *)) felem_inv,
1689 		(void (*)(void *, const void *)) felem_contract);
1690 	}
1691 
1692 /* Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL values.
1693  * The result is stored in r (r can equal one of the inputs). */
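/*
 * This is the point-multiplication entry of the EC_METHOD returned above;
 * callers normally reach it through the public API, e.g. (illustrative only)
 *
 *	EC_POINT_mul(group, r, n, q, m, ctx);	 r = n*generator + m*q
 *
 * where either the (n) or the (q, m) part may be NULL.
 */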
1694 int ec_GFp_nistp521_points_mul(const EC_GROUP *group, EC_POINT *r,
1695 	const BIGNUM *scalar, size_t num, const EC_POINT *points[],
1696 	const BIGNUM *scalars[], BN_CTX *ctx)
1697 	{
1698 	int ret = 0;
1699 	int j;
1700 	int mixed = 0;
1701 	BN_CTX *new_ctx = NULL;
1702 	BIGNUM *x, *y, *z, *tmp_scalar;
1703 	felem_bytearray g_secret;
1704 	felem_bytearray *secrets = NULL;
1705 	felem (*pre_comp)[17][3] = NULL;
1706 	felem *tmp_felems = NULL;
1707 	felem_bytearray tmp;
1708 	unsigned i, num_bytes;
1709 	int have_pre_comp = 0;
1710 	size_t num_points = num;
1711 	felem x_in, y_in, z_in, x_out, y_out, z_out;
1712 	NISTP521_PRE_COMP *pre = NULL;
1713 	felem (*g_pre_comp)[3] = NULL;
1714 	EC_POINT *generator = NULL;
1715 	const EC_POINT *p = NULL;
1716 	const BIGNUM *p_scalar = NULL;
1717 
1718 	if (ctx == NULL)
1719 		if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0;
1720 	BN_CTX_start(ctx);
1721 	if (((x = BN_CTX_get(ctx)) == NULL) ||
1722 		((y = BN_CTX_get(ctx)) == NULL) ||
1723 		((z = BN_CTX_get(ctx)) == NULL) ||
1724 		((tmp_scalar = BN_CTX_get(ctx)) == NULL))
1725 		goto err;
1726 
1727 	if (scalar != NULL)
1728 		{
1729 		pre = EC_EX_DATA_get_data(group->extra_data,
1730 			nistp521_pre_comp_dup, nistp521_pre_comp_free,
1731 			nistp521_pre_comp_clear_free);
1732 		if (pre)
1733 			/* we have precomputation, try to use it */
1734 			g_pre_comp = &pre->g_pre_comp[0];
1735 		else
1736 			/* try to use the standard precomputation */
1737 			g_pre_comp = (felem (*)[3]) gmul;
1738 		generator = EC_POINT_new(group);
1739 		if (generator == NULL)
1740 			goto err;
1741 		/* get the generator from precomputation */
1742 		if (!felem_to_BN(x, g_pre_comp[1][0]) ||
1743 			!felem_to_BN(y, g_pre_comp[1][1]) ||
1744 			!felem_to_BN(z, g_pre_comp[1][2]))
1745 			{
1746 			ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1747 			goto err;
1748 			}
1749 		if (!EC_POINT_set_Jprojective_coordinates_GFp(group,
1750 				generator, x, y, z, ctx))
1751 			goto err;
1752 		if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1753 			/* precomputation matches generator */
1754 			have_pre_comp = 1;
1755 		else
1756 			/* we don't have valid precomputation:
1757 			 * treat the generator as a random point */
1758 			num_points++;
1759 		}
1760 
1761 	if (num_points > 0)
1762 		{
1763 		if (num_points >= 2)
1764 			{
1765 			/* unless we precompute multiples for just one point,
1766 			 * converting those into affine form is time well spent  */
1767 			mixed = 1;
1768 			}
1769 		secrets = OPENSSL_malloc(num_points * sizeof(felem_bytearray));
1770 		pre_comp = OPENSSL_malloc(num_points * 17 * 3 * sizeof(felem));
1771 		if (mixed)
1772 			tmp_felems = OPENSSL_malloc((num_points * 17 + 1) * sizeof(felem));
1773 		if ((secrets == NULL) || (pre_comp == NULL) || (mixed && (tmp_felems == NULL)))
1774 			{
1775 			ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_MALLOC_FAILURE);
1776 			goto err;
1777 			}
1778 
1779 		/* we treat NULL scalars as 0, and NULL points as points at infinity,
1780 		 * i.e., they contribute nothing to the linear combination */
1781 		memset(secrets, 0, num_points * sizeof(felem_bytearray));
1782 		memset(pre_comp, 0, num_points * 17 * 3 * sizeof(felem));
1783 		for (i = 0; i < num_points; ++i)
1784 			{
1785 			if (i == num)
1786 				/* we didn't have a valid precomputation, so we pick
1787 				 * the generator */
1788 				{
1789 				p = EC_GROUP_get0_generator(group);
1790 				p_scalar = scalar;
1791 				}
1792 			else
1793 				/* the i^th point */
1794 				{
1795 				p = points[i];
1796 				p_scalar = scalars[i];
1797 				}
1798 			if ((p_scalar != NULL) && (p != NULL))
1799 				{
1800 				/* reduce scalar to 0 <= scalar < 2^521 */
1801 				if ((BN_num_bits(p_scalar) > 521) || (BN_is_negative(p_scalar)))
1802 					{
1803 					/* this is an unusual input, and we don't guarantee
1804 					 * constant-timeness */
1805 					if (!BN_nnmod(tmp_scalar, p_scalar, &group->order, ctx))
1806 						{
1807 						ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1808 						goto err;
1809 						}
1810 					num_bytes = BN_bn2bin(tmp_scalar, tmp);
1811 					}
1812 				else
1813 					num_bytes = BN_bn2bin(p_scalar, tmp);
1814 				flip_endian(secrets[i], tmp, num_bytes);
1815 				/* precompute multiples */
1816 				if ((!BN_to_felem(x_out, &p->X)) ||
1817 					(!BN_to_felem(y_out, &p->Y)) ||
1818 					(!BN_to_felem(z_out, &p->Z))) goto err;
1819 				memcpy(pre_comp[i][1][0], x_out, sizeof(felem));
1820 				memcpy(pre_comp[i][1][1], y_out, sizeof(felem));
1821 				memcpy(pre_comp[i][1][2], z_out, sizeof(felem));
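				/*
				 * Fill in multiples 2*P .. 16*P: an even j is
				 * obtained by doubling entry j/2, an odd j by
				 * adding P (entry 1) to entry j-1.
				 */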
1822 				for (j = 2; j <= 16; ++j)
1823 					{
1824 					if (j & 1)
1825 						{
1826 						point_add(
1827 							pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2],
1828 							pre_comp[i][1][0], pre_comp[i][1][1], pre_comp[i][1][2],
1829 							0, pre_comp[i][j-1][0], pre_comp[i][j-1][1], pre_comp[i][j-1][2]);
1830 						}
1831 					else
1832 						{
1833 						point_double(
1834 							pre_comp[i][j][0], pre_comp[i][j][1], pre_comp[i][j][2],
1835 							pre_comp[i][j/2][0], pre_comp[i][j/2][1], pre_comp[i][j/2][2]);
1836 						}
1837 					}
1838 				}
1839 			}
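		/*
		 * Convert the precomputed multiples to affine form so the
		 * main loop can use mixed (Jacobian + affine) additions.
		 */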
1840 		if (mixed)
1841 			make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
1842 		}
1843 
1844 	/* the scalar for the generator */
1845 	if ((scalar != NULL) && (have_pre_comp))
1846 		{
1847 		memset(g_secret, 0, sizeof(g_secret));
1848 		/* reduce scalar to 0 <= scalar < 2^521 */
1849 		if ((BN_num_bits(scalar) > 521) || (BN_is_negative(scalar)))
1850 			{
1851 			/* this is an unusual input, and we don't guarantee
1852 			 * constant-timeness */
1853 			if (!BN_nnmod(tmp_scalar, scalar, &group->order, ctx))
1854 				{
1855 				ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1856 				goto err;
1857 				}
1858 			num_bytes = BN_bn2bin(tmp_scalar, tmp);
1859 			}
1860 		else
1861 			num_bytes = BN_bn2bin(scalar, tmp);
1862 		flip_endian(g_secret, tmp, num_bytes);
1863 		/* do the multiplication with generator precomputation */
1864 		batch_mul(x_out, y_out, z_out,
1865 			(const felem_bytearray (*)) secrets, num_points,
1866 			g_secret,
1867 			mixed, (const felem (*)[17][3]) pre_comp,
1868 			(const felem (*)[3]) g_pre_comp);
1869 		}
1870 	else
1871 		/* do the multiplication without generator precomputation */
1872 		batch_mul(x_out, y_out, z_out,
1873 			(const felem_bytearray (*)) secrets, num_points,
1874 			NULL, mixed, (const felem (*)[17][3]) pre_comp, NULL);
1875 	/* reduce the output to its unique minimal representation */
1876 	felem_contract(x_in, x_out);
1877 	felem_contract(y_in, y_out);
1878 	felem_contract(z_in, z_out);
1879 	if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
1880 		(!felem_to_BN(z, z_in)))
1881 		{
1882 		ECerr(EC_F_EC_GFP_NISTP521_POINTS_MUL, ERR_R_BN_LIB);
1883 		goto err;
1884 		}
1885 	ret = EC_POINT_set_Jprojective_coordinates_GFp(group, r, x, y, z, ctx);
1886 
1887 err:
1888 	BN_CTX_end(ctx);
1889 	if (generator != NULL)
1890 		EC_POINT_free(generator);
1891 	if (new_ctx != NULL)
1892 		BN_CTX_free(new_ctx);
1893 	if (secrets != NULL)
1894 		OPENSSL_free(secrets);
1895 	if (pre_comp != NULL)
1896 		OPENSSL_free(pre_comp);
1897 	if (tmp_felems != NULL)
1898 		OPENSSL_free(tmp_felems);
1899 	return ret;
1900 	}
1901 
1902 int ec_GFp_nistp521_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
1903 	{
1904 	int ret = 0;
1905 	NISTP521_PRE_COMP *pre = NULL;
1906 	int i, j;
1907 	BN_CTX *new_ctx = NULL;
1908 	BIGNUM *x, *y;
1909 	EC_POINT *generator = NULL;
1910 	felem tmp_felems[16];
1911 
1912 	/* throw away old precomputation */
1913 	EC_EX_DATA_free_data(&group->extra_data, nistp521_pre_comp_dup,
1914 		nistp521_pre_comp_free, nistp521_pre_comp_clear_free);
1915 	if (ctx == NULL)
1916 		if ((ctx = new_ctx = BN_CTX_new()) == NULL) return 0;
1917 	BN_CTX_start(ctx);
1918 	if (((x = BN_CTX_get(ctx)) == NULL) ||
1919 		((y = BN_CTX_get(ctx)) == NULL))
1920 		goto err;
1921 	/* get the generator */
1922 	if (group->generator == NULL) goto err;
1923 	generator = EC_POINT_new(group);
1924 	if (generator == NULL)
1925 		goto err;
1926 	BN_bin2bn(nistp521_curve_params[3], sizeof(felem_bytearray), x);
1927 	BN_bin2bn(nistp521_curve_params[4], sizeof(felem_bytearray), y);
1928 	if (!EC_POINT_set_affine_coordinates_GFp(group, generator, x, y, ctx))
1929 		goto err;
1930 	if ((pre = nistp521_pre_comp_new()) == NULL)
1931 		goto err;
1932 	/* if the generator is the standard one, use built-in precomputation */
1933 	if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
1934 		{
1935 		memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
1936 		ret = 1;
1937 		goto err;
1938 		}
1939 	if ((!BN_to_felem(pre->g_pre_comp[1][0], &group->generator->X)) ||
1940 		(!BN_to_felem(pre->g_pre_comp[1][1], &group->generator->Y)) ||
1941 		(!BN_to_felem(pre->g_pre_comp[1][2], &group->generator->Z)))
1942 		goto err;
1943 	/* compute 2^130*G, 2^260*G, 2^390*G */
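	/*
	 * Each pass through this loop doubles 130 times in total (once here
	 * plus 129 times in the inner loop), so
	 * g_pre_comp[2*i] = 2^130 * g_pre_comp[i].
	 */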
1944 	for (i = 1; i <= 4; i <<= 1)
1945 		{
1946 		point_double(pre->g_pre_comp[2*i][0], pre->g_pre_comp[2*i][1],
1947 			pre->g_pre_comp[2*i][2], pre->g_pre_comp[i][0],
1948 			pre->g_pre_comp[i][1], pre->g_pre_comp[i][2]);
1949 		for (j = 0; j < 129; ++j)
1950 			{
1951 			point_double(pre->g_pre_comp[2*i][0],
1952 				pre->g_pre_comp[2*i][1],
1953 				pre->g_pre_comp[2*i][2],
1954 				pre->g_pre_comp[2*i][0],
1955 				pre->g_pre_comp[2*i][1],
1956 				pre->g_pre_comp[2*i][2]);
1957 			}
1958 		}
1959 	/* g_pre_comp[0] is the point at infinity */
1960 	memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0]));
1961 	/* the remaining multiples */
1962 	/* 2^130*G + 2^260*G */
1963 	point_add(pre->g_pre_comp[6][0], pre->g_pre_comp[6][1],
1964 		pre->g_pre_comp[6][2], pre->g_pre_comp[4][0],
1965 		pre->g_pre_comp[4][1], pre->g_pre_comp[4][2],
1966 		0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
1967 		pre->g_pre_comp[2][2]);
1968 	/* 2^130*G + 2^390*G */
1969 	point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1],
1970 		pre->g_pre_comp[10][2], pre->g_pre_comp[8][0],
1971 		pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
1972 		0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
1973 		pre->g_pre_comp[2][2]);
1974 	/* 2^260*G + 2^390*G */
1975 	point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1],
1976 		pre->g_pre_comp[12][2], pre->g_pre_comp[8][0],
1977 		pre->g_pre_comp[8][1], pre->g_pre_comp[8][2],
1978 		0, pre->g_pre_comp[4][0], pre->g_pre_comp[4][1],
1979 		pre->g_pre_comp[4][2]);
1980 	/* 2^130*G + 2^260*G + 2^390*G */
1981 	point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1],
1982 		pre->g_pre_comp[14][2], pre->g_pre_comp[12][0],
1983 		pre->g_pre_comp[12][1], pre->g_pre_comp[12][2],
1984 		0, pre->g_pre_comp[2][0], pre->g_pre_comp[2][1],
1985 		pre->g_pre_comp[2][2]);
1986 	for (i = 1; i < 8; ++i)
1987 		{
1988 		/* odd multiples: add G */
1989 		point_add(pre->g_pre_comp[2*i+1][0], pre->g_pre_comp[2*i+1][1],
1990 			pre->g_pre_comp[2*i+1][2], pre->g_pre_comp[2*i][0],
1991 			pre->g_pre_comp[2*i][1], pre->g_pre_comp[2*i][2],
1992 			0, pre->g_pre_comp[1][0], pre->g_pre_comp[1][1],
1993 			pre->g_pre_comp[1][2]);
1994 		}
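	/*
	 * Convert entries 1..15 to affine form; entry 0 stays the all-zero
	 * representation of the point at infinity.  tmp_felems provides the
	 * 15+1 temporaries that make_points_affine needs.
	 */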
1995 	make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);
1996 
1997 	if (!EC_EX_DATA_set_data(&group->extra_data, pre, nistp521_pre_comp_dup,
1998 			nistp521_pre_comp_free, nistp521_pre_comp_clear_free))
1999 		goto err;
2000 	ret = 1;
2001 	pre = NULL;
2002 err:
2003 	BN_CTX_end(ctx);
2004 	if (generator != NULL)
2005 		EC_POINT_free(generator);
2006 	if (new_ctx != NULL)
2007 		BN_CTX_free(new_ctx);
2008 	if (pre)
2009 		nistp521_pre_comp_free(pre);
2010 	return ret;
2011 	}
2012 
2013 int ec_GFp_nistp521_have_precompute_mult(const EC_GROUP *group)
2014 	{
2015 	if (EC_EX_DATA_get_data(group->extra_data, nistp521_pre_comp_dup,
2016 			nistp521_pre_comp_free, nistp521_pre_comp_clear_free)
2017 		!= NULL)
2018 		return 1;
2019 	else
2020 		return 0;
2021 	}
2022 
2023 #else
2024 static void *dummy = &dummy; /* keep the translation unit non-empty when this code is compiled out */
2025 #endif
2026