xref: /freebsd/crypto/openssl/crypto/ec/ecp_nistp384.c (revision e7be843b4a162e68651d3911f0357ed464915629)
1 /*
2  * Copyright 2023-2025 The OpenSSL Project Authors. All Rights Reserved.
3  *
4  * Licensed under the Apache License 2.0 (the "License").  You may not use
5  * this file except in compliance with the License.  You can obtain a copy
6  * in the file LICENSE in the source distribution or at
7  * https://www.openssl.org/source/license.html
8  */
9 
10 /* Copyright 2023 IBM Corp.
11  *
12  * Licensed under the Apache License, Version 2.0 (the "License");
13  *
14  * you may not use this file except in compliance with the License.
15  * You may obtain a copy of the License at
16  *
17  *     http://www.apache.org/licenses/LICENSE-2.0
18  *
19  *  Unless required by applicable law or agreed to in writing, software
20  *  distributed under the License is distributed on an "AS IS" BASIS,
21  *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
22  *  See the License for the specific language governing permissions and
23  *  limitations under the License.
24  */
25 
26 /*
27  * Designed for 56-bit limbs by Rohan McLure <rohan.mclure@linux.ibm.com>.
28  * The layout is based on that of ecp_nistp{224,521}.c, allowing even for asm
29  * acceleration of felem_{square,mul} as supported in these files.
30  */
31 
32 #include <openssl/e_os2.h>
33 
34 #include <string.h>
35 #include <openssl/err.h>
36 #include "ec_local.h"
37 
38 #include "internal/numbers.h"
39 
40 #ifndef INT128_MAX
41 # error "Your compiler doesn't appear to support 128-bit integer types"
42 #endif
43 
44 typedef uint8_t u8;
45 typedef uint64_t u64;
46 
47 /*
48  * The underlying field. P384 operates over GF(2^384-2^128-2^96+2^32-1). We
49  * can serialize an element of this field into 48 bytes. We call this an
50  * felem_bytearray.
51  */
52 
53 typedef u8 felem_bytearray[48];
54 
55 /*
56  * These are the parameters of P384, taken from FIPS 186-3, section D.1.2.4.
57  * These values are big-endian.
58  */
59 static const felem_bytearray nistp384_curve_params[5] = {
60   {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* p */
61    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
62    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF,
63    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFF},
64   {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, /* a = -3 */
65    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
66    0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFE, 0xFF, 0xFF, 0xFF, 0xFF,
67    0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 0xFF, 0xFC},
68   {0xB3, 0x31, 0x2F, 0xA7, 0xE2, 0x3E, 0xE7, 0xE4, 0x98, 0x8E, 0x05, 0x6B, /* b */
69    0xE3, 0xF8, 0x2D, 0x19, 0x18, 0x1D, 0x9C, 0x6E, 0xFE, 0x81, 0x41, 0x12,
70    0x03, 0x14, 0x08, 0x8F, 0x50, 0x13, 0x87, 0x5A, 0xC6, 0x56, 0x39, 0x8D,
71    0x8A, 0x2E, 0xD1, 0x9D, 0x2A, 0x85, 0xC8, 0xED, 0xD3, 0xEC, 0x2A, 0xEF},
72   {0xAA, 0x87, 0xCA, 0x22, 0xBE, 0x8B, 0x05, 0x37, 0x8E, 0xB1, 0xC7, 0x1E, /* x */
73    0xF3, 0x20, 0xAD, 0x74, 0x6E, 0x1D, 0x3B, 0x62, 0x8B, 0xA7, 0x9B, 0x98,
74    0x59, 0xF7, 0x41, 0xE0, 0x82, 0x54, 0x2A, 0x38, 0x55, 0x02, 0xF2, 0x5D,
75    0xBF, 0x55, 0x29, 0x6C, 0x3A, 0x54, 0x5E, 0x38, 0x72, 0x76, 0x0A, 0xB7},
76   {0x36, 0x17, 0xDE, 0x4A, 0x96, 0x26, 0x2C, 0x6F, 0x5D, 0x9E, 0x98, 0xBF, /* y */
77    0x92, 0x92, 0xDC, 0x29, 0xF8, 0xF4, 0x1D, 0xBD, 0x28, 0x9A, 0x14, 0x7C,
78    0xE9, 0xDA, 0x31, 0x13, 0xB5, 0xF0, 0xB8, 0xC0, 0x0A, 0x60, 0xB1, 0xCE,
79    0x1D, 0x7E, 0x81, 0x9D, 0x7A, 0x43, 0x1D, 0x7C, 0x90, 0xEA, 0x0E, 0x5F},
80 };
81 
82 /*-
83  * The representation of field elements.
84  * ------------------------------------
85  *
86  * We represent field elements with seven values. These values are either 64 or
87  * 128 bits and the field element represented is:
88  *   v[0]*2^0 + v[1]*2^56 + v[2]*2^112 + ... + v[6]*2^336  (mod p)
89  * Each of the seven values is called a 'limb'. Since the limbs are spaced only
90  * 56 bits apart, but are greater than 56 bits in length, the most significant
91  * bits of each limb overlap with the least significant bits of the next
92  *
93  * This representation is considered to be 'redundant' in the sense that
94  * intermediate values can each contain more than a 56-bit value in each limb.
95  * Reduction causes all but the final limb to be reduced to contain a value less
96  * than 2^56, with the final value represented allowed to be larger than 2^384,
97  * inasmuch as we can be sure that arithmetic overflow remains impossible. The
98  * reduced value must of course be congruent to the unreduced value.
99  *
100  * A field element with 64-bit limbs is an 'felem'. One with 128-bit limbs is a
101  * 'widefelem', featuring enough bits to store the result of a multiplication
102  * and even some further arithmetic without need for immediate reduction.
103  */
104 
105 #define NLIMBS 7
106 
107 typedef uint64_t limb;
108 typedef uint128_t widelimb;
109 typedef limb limb_aX __attribute((__aligned__(1)));
110 typedef limb felem[NLIMBS];
111 typedef widelimb widefelem[2*NLIMBS-1];
112 
113 static const limb bottom56bits = 0xffffffffffffff;
114 
115 /* Helper functions (de)serialising reduced field elements in little endian */
/*
 * bin48_to_felem converts a 48-byte little-endian serialisation into the
 * seven-limb internal form: limb k holds bits [56*k, 56*(k+1)) of the value.
 *
 * NOTE(review): the limb loads read the byte array as native-endian 64-bit
 * words (limb_aX is a 1-byte-aligned limb), so this layout assumes a
 * little-endian target - confirm against the platforms this file builds on.
 */
static void bin48_to_felem(felem out, const u8 in[48])
{
    memset(out, 0, sizeof(felem)); /* was a magic '56' (= 7 limbs * 8 bytes) */
    out[0] = (*((limb *) & in[0])) & bottom56bits;
    out[1] = (*((limb_aX *) & in[7])) & bottom56bits;
    out[2] = (*((limb_aX *) & in[14])) & bottom56bits;
    out[3] = (*((limb_aX *) & in[21])) & bottom56bits;
    out[4] = (*((limb_aX *) & in[28])) & bottom56bits;
    out[5] = (*((limb_aX *) & in[35])) & bottom56bits;
    /* Only 6 input bytes remain for the top limb; its high bytes stay zero. */
    memmove(&out[6], &in[42], 6);
}
127 
/*
 * felem_to_bin48 serialises a reduced field element into 48 little-endian
 * bytes. Adjacent limbs overlap by one byte in the output, hence the
 * OR-stores into a zeroed buffer; with masked 56-bit limbs the store order
 * is immaterial, so we emit from the top limb down.
 */
static void felem_to_bin48(u8 out[48], const felem in)
{
    memset(out, 0, 48);
    memmove(&out[42], &in[6], 6);
    (*((limb_aX *) & out[35])) |= (in[5] & bottom56bits);
    (*((limb_aX *) & out[28])) |= (in[4] & bottom56bits);
    (*((limb_aX *) & out[21])) |= (in[3] & bottom56bits);
    (*((limb_aX *) & out[14])) |= (in[2] & bottom56bits);
    (*((limb_aX *) & out[7]))  |= (in[1] & bottom56bits);
    (*((limb *) & out[0]))     |= (in[0] & bottom56bits);
}
139 
140 /* BN_to_felem converts an OpenSSL BIGNUM into an felem */
/*
 * BN_to_felem converts an OpenSSL BIGNUM into an felem.
 * Returns 1 on success; returns 0 and raises EC_R_BIGNUM_OUT_OF_RANGE when
 * |bn| is negative or does not fit in 48 little-endian bytes.
 */
static int BN_to_felem(felem out, const BIGNUM *bn)
{
    felem_bytearray b_out;

    /* Reject negative values and values wider than the field element. */
    if (BN_is_negative(bn)
        || BN_bn2lebinpad(bn, b_out, sizeof(b_out)) < 0) {
        ERR_raise(ERR_LIB_EC, EC_R_BIGNUM_OUT_OF_RANGE);
        return 0;
    }
    bin48_to_felem(out, b_out);
    return 1;
}
158 
159 /* felem_to_BN converts an felem into an OpenSSL BIGNUM */
/* felem_to_BN converts an felem into an OpenSSL BIGNUM (NULL on failure). */
static BIGNUM *felem_to_BN(BIGNUM *out, const felem in)
{
    felem_bytearray bytes;

    felem_to_bin48(bytes, in);
    return BN_lebin2bn(bytes, sizeof(bytes), out);
}
167 
168 /*-
169  * Field operations
170  * ----------------
171  */
172 
/* felem_one sets |out| to the multiplicative identity, 1. */
static void felem_one(felem out)
{
    memset(out, 0, sizeof(felem));
    out[0] = 1;
}
178 
/* felem_assign copies |in| into |out| limb by limb. */
static void felem_assign(felem out, const felem in)
{
    unsigned int i;

    for (i = 0; i < NLIMBS; i++)
        out[i] = in[i];
}
183 
184 /* felem_sum64 sets out = out + in. */
/* felem_sum64 sets out = out + in, with no reduction. */
static void felem_sum64(felem out, const felem in)
{
    unsigned int i = 0;

    while (i < NLIMBS) {
        out[i] += in[i];
        i++;
    }
}
192 
193 /* felem_scalar sets out = in * scalar */
/* felem_scalar sets out = in * scalar, with no reduction. */
static void felem_scalar(felem out, const felem in, limb scalar)
{
    unsigned int i = NLIMBS;

    while (i--)
        out[i] = in[i] * scalar;
}
201 
202 /* felem_scalar64 sets out = out * scalar */
/* felem_scalar64 scales |out| in place: out = out * scalar, no reduction. */
static void felem_scalar64(felem out, limb scalar)
{
    unsigned int i;

    for (i = 0; i < NLIMBS; ++i)
        out[i] = out[i] * scalar;
}
210 
211 /* felem_scalar128 sets out = out * scalar */
/* felem_scalar128 scales a wide element in place: out = out * scalar. */
static void felem_scalar128(widefelem out, limb scalar)
{
    unsigned int i;

    for (i = 0; i < 2 * NLIMBS - 1; ++i)
        out[i] = out[i] * scalar;
}
219 
220 /*-
221  * felem_neg sets |out| to |-in|
222  * On entry:
223  *   in[i] < 2^60 - 2^29
224  * On exit:
225  *   out[i] < 2^60
226  */
static void felem_neg(felem out, const felem in)
{
    /*
     * To avoid underflow we subtract each input limb from the matching limb
     * of a redundant representation of 2^12 * p, built with telescopic sums
     * so that every limb has the form 2^60 + ...
     */
    static const limb negbias[NLIMBS] = {
        (((limb) 1) << 60) + (((limb) 1) << 44) - (((limb) 1) << 12),
        (((limb) 1) << 60) - (((limb) 1) << 52) - (((limb) 1) << 4),
        (((limb) 1) << 60) - (((limb) 1) << 28) - (((limb) 1) << 4),
        (((limb) 1) << 60) - (((limb) 1) << 4),
        (((limb) 1) << 60) - (((limb) 1) << 4),
        (((limb) 1) << 60) - (((limb) 1) << 4),
        (((limb) 1) << 60) - (((limb) 1) << 4)
    };
    unsigned int i;

    for (i = 0; i < NLIMBS; i++)
        out[i] = negbias[i] - in[i];
}
254 
255 #if defined(ECP_NISTP384_ASM)
256 void p384_felem_diff64(felem out, const felem in);
257 void p384_felem_diff128(widefelem out, const widefelem in);
258 void p384_felem_diff_128_64(widefelem out, const felem in);
259 
260 # define felem_diff64           p384_felem_diff64
261 # define felem_diff128          p384_felem_diff128
262 # define felem_diff_128_64      p384_felem_diff_128_64
263 
264 #else
265 /*-
266  * felem_diff64 subtracts |in| from |out|
267  * On entry:
268  *   in[i] < 2^60 - 2^52 - 2^4
269  * On exit:
270  *   out[i] < out_orig[i] + 2^60 + 2^44
271  */
static void felem_diff64(felem out, const felem in)
{
    /*
     * To avoid underflow, add a redundant representation of 2^12 * p before
     * subtracting; telescopic sums give every limb the form 2^60 + ...
     */
    static const limb bias[NLIMBS] = {
        (((limb) 1) << 60) + (((limb) 1) << 44) - (((limb) 1) << 12),
        (((limb) 1) << 60) - (((limb) 1) << 52) - (((limb) 1) << 4),
        (((limb) 1) << 60) - (((limb) 1) << 28) - (((limb) 1) << 4),
        (((limb) 1) << 60) - (((limb) 1) << 4),
        (((limb) 1) << 60) - (((limb) 1) << 4),
        (((limb) 1) << 60) - (((limb) 1) << 4),
        (((limb) 1) << 60) - (((limb) 1) << 4)
    };
    unsigned int i;

    for (i = 0; i < NLIMBS; i++)
        out[i] += bias[i] - in[i];
}
300 
301 /*
302  * in[i] < 2^63
303  * out[i] < out_orig[i] + 2^64 + 2^48
304  */
static void felem_diff_128_64(widefelem out, const felem in)
{
    /*
     * To avoid underflow, add a redundant representation of 2^16 * p before
     * subtracting; telescopic sums give every limb the form 2^64 + ...
     */
    static const widelimb bias[NLIMBS] = {
        (((widelimb) 1) << 64) + (((widelimb) 1) << 48)
            - (((widelimb) 1) << 16),
        (((widelimb) 1) << 64) - (((widelimb) 1) << 56)
            - (((widelimb) 1) << 8),
        (((widelimb) 1) << 64) - (((widelimb) 1) << 32)
            - (((widelimb) 1) << 8),
        (((widelimb) 1) << 64) - (((widelimb) 1) << 8),
        (((widelimb) 1) << 64) - (((widelimb) 1) << 8),
        (((widelimb) 1) << 64) - (((widelimb) 1) << 8),
        (((widelimb) 1) << 64) - (((widelimb) 1) << 8)
    };
    unsigned int i;

    for (i = 0; i < NLIMBS; i++)
        out[i] += bias[i] - in[i];
}
337 
338 /*
339  * in[i] < 2^127 - 2^119 - 2^71
340  * out[i] < out_orig[i] + 2^127 + 2^111
341  */
static void felem_diff128(widefelem out, const widefelem in)
{
    /*
     * To avoid underflow, add a redundant representation of 2^415 * p before
     * subtracting; telescopic sums give every limb the form 2^127 + ...
     */
    static const widelimb bias[2 * NLIMBS - 1] = {
        ((widelimb) 1) << 127,
        (((widelimb) 1) << 127) - (((widelimb) 1) << 71),
        (((widelimb) 1) << 127) - (((widelimb) 1) << 71),
        (((widelimb) 1) << 127) - (((widelimb) 1) << 71),
        (((widelimb) 1) << 127) - (((widelimb) 1) << 71),
        (((widelimb) 1) << 127) - (((widelimb) 1) << 71),
        (((widelimb) 1) << 127) + (((widelimb) 1) << 111)
            - (((widelimb) 1) << 79) - (((widelimb) 1) << 71),
        (((widelimb) 1) << 127) - (((widelimb) 1) << 119)
            - (((widelimb) 1) << 71),
        (((widelimb) 1) << 127) - (((widelimb) 1) << 95)
            - (((widelimb) 1) << 71),
        (((widelimb) 1) << 127) - (((widelimb) 1) << 71),
        (((widelimb) 1) << 127) - (((widelimb) 1) << 71),
        (((widelimb) 1) << 127) - (((widelimb) 1) << 71),
        (((widelimb) 1) << 127) - (((widelimb) 1) << 71)
    };
    unsigned int i;

    for (i = 0; i < 2 * NLIMBS - 1; i++)
        out[i] += bias[i] - in[i];
}
382 #endif /* ECP_NISTP384_ASM */
383 
/*
 * felem_square_ref computes out = in^2 as an unreduced 13-limb wide element.
 * Portable fallback used when no assembler implementation is selected.
 */
static void felem_square_ref(widefelem out, const felem in)
{
    unsigned int k, i, lo, hi;

    /*
     * Schoolbook squaring written as a convolution: limb k of the result is
     * the sum of in[i] * in[k - i] over all valid i. The symmetric cross
     * terms occur twice, matching the doubled terms of the unrolled form;
     * the sums are identical modulo 2^128.
     */
    for (k = 0; k < 2 * NLIMBS - 1; k++) {
        widelimb acc = 0;

        lo = (k < NLIMBS) ? 0 : k - (NLIMBS - 1);
        hi = (k < NLIMBS) ? k : NLIMBS - 1;
        for (i = lo; i <= hi; i++)
            acc += ((widelimb) in[i]) * in[k - i];
        out[k] = acc;
    }
}
430 
/*
 * felem_mul_ref computes out = in1 * in2 as an unreduced 13-limb wide
 * element. Portable fallback used when no assembler implementation is
 * selected.
 */
static void felem_mul_ref(widefelem out, const felem in1, const felem in2)
{
    unsigned int k, i, lo, hi;

    /* Schoolbook multiplication: out[k] = sum over i+j == k of in1[i]*in2[j]. */
    for (k = 0; k < 2 * NLIMBS - 1; k++) {
        widelimb acc = 0;

        lo = (k < NLIMBS) ? 0 : k - (NLIMBS - 1);
        hi = (k < NLIMBS) ? k : NLIMBS - 1;
        for (i = lo; i <= hi; i++)
            acc += ((widelimb) in1[i]) * in2[k - i];
        out[k] = acc;
    }
}
495 
496 /*-
497  * Reduce thirteen 128-bit coefficients to seven 64-bit coefficients.
498  * in[i] < 2^128 - 2^125
499  * out[i] < 2^56 for i < 6,
500  * out[6] <= 2^48
501  *
502  * The technique in use here stems from the format of the prime modulus:
503  * P384 = 2^384 - delta
504  *
505  * Thus we can reduce numbers of the form (X + 2^384 * Y) by substituting
506  * them with (X + delta Y), with delta = 2^128 + 2^96 + (-2^32 + 1). These
507  * coefficients are still quite large, and so we repeatedly apply this
508  * technique on high-order bits in order to guarantee the desired bounds on
509  * the size of our output.
510  *
511  * The three phases of elimination are as follows:
512  * [1]: Y = 2^120 (in[12] | in[11] | in[10] | in[9])
513  * [2]: Y = 2^8 (acc[8] | acc[7])
514  * [3]: Y = 2^48 (acc[6] >> 48)
515  * (Where a | b | c | d = (2^56)^3 a + (2^56)^2 b + (2^56) c + d)
516  */
static void felem_reduce_ref(felem out, const widefelem in)
{
    /*
     * To avoid underflow, add a redundant representation of 2^76 * p before
     * subtracting; telescopic sums give every limb the form 2^124 + ...
     */
    static const widelimb two124m68 = (((widelimb) 1) << 124)
                                    - (((widelimb) 1) << 68);
    static const widelimb two124m116m68 = (((widelimb) 1) << 124)
                                        - (((widelimb) 1) << 116)
                                        - (((widelimb) 1) << 68);
    static const widelimb two124p108m76 = (((widelimb) 1) << 124)
                                        + (((widelimb) 1) << 108)
                                        - (((widelimb) 1) << 76);
    static const widelimb two124m92m68 = (((widelimb) 1) << 124)
                                       - (((widelimb) 1) << 92)
                                       - (((widelimb) 1) << 68);
    widelimb temp, acc[9];
    unsigned int i;

    /* Only the low nine limbs need workspace; limbs 9-12 are folded in. */
    memcpy(acc, in, sizeof(widelimb) * 9);

    acc[0] += two124p108m76;
    acc[1] += two124m116m68;
    acc[2] += two124m92m68;
    acc[3] += two124m68;
    acc[4] += two124m68;
    acc[5] += two124m68;
    acc[6] += two124m68;

    /* [1]: Eliminate in[9], ..., in[12] by substituting 2^384 -> delta. */
    acc[8] += in[12] >> 32;
    acc[7] += (in[12] & 0xffffffff) << 24;
    acc[7] += in[12] >> 8;
    acc[6] += (in[12] & 0xff) << 48;
    acc[6] -= in[12] >> 16;
    acc[5] -= (in[12] & 0xffff) << 40;
    acc[6] += in[12] >> 48;
    acc[5] += (in[12] & 0xffffffffffff) << 8;

    acc[7] += in[11] >> 32;
    acc[6] += (in[11] & 0xffffffff) << 24;
    acc[6] += in[11] >> 8;
    acc[5] += (in[11] & 0xff) << 48;
    acc[5] -= in[11] >> 16;
    acc[4] -= (in[11] & 0xffff) << 40;
    acc[5] += in[11] >> 48;
    acc[4] += (in[11] & 0xffffffffffff) << 8;

    acc[6] += in[10] >> 32;
    acc[5] += (in[10] & 0xffffffff) << 24;
    acc[5] += in[10] >> 8;
    acc[4] += (in[10] & 0xff) << 48;
    acc[4] -= in[10] >> 16;
    acc[3] -= (in[10] & 0xffff) << 40;
    acc[4] += in[10] >> 48;
    acc[3] += (in[10] & 0xffffffffffff) << 8;

    acc[5] += in[9] >> 32;
    acc[4] += (in[9] & 0xffffffff) << 24;
    acc[4] += in[9] >> 8;
    acc[3] += (in[9] & 0xff) << 48;
    acc[3] -= in[9] >> 16;
    acc[2] -= (in[9] & 0xffff) << 40;
    acc[3] += in[9] >> 48;
    acc[2] += (in[9] & 0xffffffffffff) << 8;

    /*
     * [2]: Eliminate acc[7] and acc[8] (the eighth and ninth limbs),
     * including the contributions folded in from the step above.
     * acc[7] < in[7] + 2^120 + 2^56 < in[7] + 2^121
     * acc[8] < in[8] + 2^96
     */
    acc[4] += acc[8] >> 32;
    acc[3] += (acc[8] & 0xffffffff) << 24;
    acc[3] += acc[8] >> 8;
    acc[2] += (acc[8] & 0xff) << 48;
    acc[2] -= acc[8] >> 16;
    acc[1] -= (acc[8] & 0xffff) << 40;
    acc[2] += acc[8] >> 48;
    acc[1] += (acc[8] & 0xffffffffffff) << 8;

    acc[3] += acc[7] >> 32;
    acc[2] += (acc[7] & 0xffffffff) << 24;
    acc[2] += acc[7] >> 8;
    acc[1] += (acc[7] & 0xff) << 48;
    acc[1] -= acc[7] >> 16;
    acc[0] -= (acc[7] & 0xffff) << 40;
    acc[1] += acc[7] >> 48;
    acc[0] += (acc[7] & 0xffffffffffff) << 8;

    /*-
     * acc[k] < in[k] + 2^124 + 2^121
     *        < in[k] + 2^125
     *        < 2^128, for k <= 6
     */

    /*
     * Carry 4 -> 5 -> 6 so the more significant limbs are small before
     * high bits are eliminated from acc[6].
     */
    acc[5] += acc[4] >> 56;
    acc[4] &= 0x00ffffffffffffff;

    acc[6] += acc[5] >> 56;
    acc[5] &= 0x00ffffffffffffff;

    /*-
     * acc[6] < in[6] + 2^124 + 2^121 + 2^72 + 2^16
     *        < in[6] + 2^125
     *        < 2^128
     */

    /* [3]: Eliminate the bits of acc[6] above bit 48. */
    temp = acc[6] >> 48;
    acc[6] &= 0x0000ffffffffffff;

    /* temp < 2^80 */

    acc[3] += temp >> 40;
    acc[2] += (temp & 0xffffffffff) << 16;
    acc[2] += temp >> 16;
    acc[1] += (temp & 0xffff) << 40;
    acc[1] -= temp >> 24;
    acc[0] -= (temp & 0xffffff) << 32;
    acc[0] += temp;

    /*-
     * acc[k] < acc_old[k] + 2^64 + 2^56
     *        < in[k] + 2^124 + 2^121 + 2^72 + 2^64 + 2^56 + 2^16 , k < 4
     */

    /* Carry 0 -> 1 -> 2 -> 3 -> 4 -> 5 -> 6 */
    acc[1] += acc[0] >> 56;   /* acc[1] < acc_old[1] + 2^72 */
    acc[0] &= 0x00ffffffffffffff;

    acc[2] += acc[1] >> 56;   /* acc[2] < acc_old[2] + 2^72 + 2^16 */
    acc[1] &= 0x00ffffffffffffff;

    acc[3] += acc[2] >> 56;   /* acc[3] < acc_old[3] + 2^72 + 2^16 */
    acc[2] &= 0x00ffffffffffffff;

    /*-
     * acc[k] < acc_old[k] + 2^72 + 2^16
     *        < in[k] + 2^124 + 2^121 + 2^73 + 2^64 + 2^56 + 2^17
     *        < in[k] + 2^125
     *        < 2^128 , k < 4
     */

    acc[4] += acc[3] >> 56;   /*-
                               * acc[4] < acc_old[4] + 2^72 + 2^16
                               *        < 2^72 + 2^56 + 2^16
                               */
    acc[3] &= 0x00ffffffffffffff;

    acc[5] += acc[4] >> 56;   /*-
                               * acc[5] < acc_old[5] + 2^16 + 1
                               *        < 2^56 + 2^16 + 1
                               */
    acc[4] &= 0x00ffffffffffffff;

    acc[6] += acc[5] >> 56;   /* acc[6] < 2^48 + 1, i.e. acc[6] <= 2^48 */
    acc[5] &= 0x00ffffffffffffff;

    for (i = 0; i < NLIMBS; i++)
        out[i] = acc[i];
}
686 
felem_square_reduce_ref(felem out,const felem in)687 static ossl_inline void felem_square_reduce_ref(felem out, const felem in)
688 {
689     widefelem tmp;
690 
691     felem_square_ref(tmp, in);
692     felem_reduce_ref(out, tmp);
693 }
694 
felem_mul_reduce_ref(felem out,const felem in1,const felem in2)695 static ossl_inline void felem_mul_reduce_ref(felem out, const felem in1, const felem in2)
696 {
697     widefelem tmp;
698 
699     felem_mul_ref(tmp, in1, in2);
700     felem_reduce_ref(out, tmp);
701 }
702 
703 #if defined(ECP_NISTP384_ASM)
704 static void felem_square_wrapper(widefelem out, const felem in);
705 static void felem_mul_wrapper(widefelem out, const felem in1, const felem in2);
706 
707 static void (*felem_square_p)(widefelem out, const felem in) =
708     felem_square_wrapper;
709 static void (*felem_mul_p)(widefelem out, const felem in1, const felem in2) =
710     felem_mul_wrapper;
711 
712 static void (*felem_reduce_p)(felem out, const widefelem in) = felem_reduce_ref;
713 
714 static void (*felem_square_reduce_p)(felem out, const felem in) =
715     felem_square_reduce_ref;
716 static void (*felem_mul_reduce_p)(felem out, const felem in1, const felem in2) =
717     felem_mul_reduce_ref;
718 
719 void p384_felem_square(widefelem out, const felem in);
720 void p384_felem_mul(widefelem out, const felem in1, const felem in2);
721 void p384_felem_reduce(felem out, const widefelem in);
722 
723 void p384_felem_square_reduce(felem out, const felem in);
724 void p384_felem_mul_reduce(felem out, const felem in1, const felem in2);
725 
726 # if defined(_ARCH_PPC64)
727 #  include "crypto/ppc_arch.h"
728 # endif
729 
felem_select(void)730 static void felem_select(void)
731 {
732 # if defined(_ARCH_PPC64)
733     if ((OPENSSL_ppccap_P & PPC_MADD300) && (OPENSSL_ppccap_P & PPC_ALTIVEC)) {
734         felem_square_p = p384_felem_square;
735         felem_mul_p = p384_felem_mul;
736         felem_reduce_p = p384_felem_reduce;
737         felem_square_reduce_p = p384_felem_square_reduce;
738         felem_mul_reduce_p = p384_felem_mul_reduce;
739 
740         return;
741     }
742 # endif
743 
744     /* Default */
745     felem_square_p = felem_square_ref;
746     felem_mul_p = felem_mul_ref;
747     felem_reduce_p = felem_reduce_ref;
748     felem_square_reduce_p = felem_square_reduce_ref;
749     felem_mul_reduce_p = felem_mul_reduce_ref;
750 }
751 
/*
 * Initial value of felem_square_p: selects the real implementation on the
 * first call, then forwards to it. Later calls go direct.
 */
static void felem_square_wrapper(widefelem out, const felem in)
{
    felem_select();
    felem_square_p(out, in);
}
757 
/*
 * Initial value of felem_mul_p: selects the real implementation on the
 * first call, then forwards to it. Later calls go direct.
 */
static void felem_mul_wrapper(widefelem out, const felem in1, const felem in2)
{
    felem_select();
    felem_mul_p(out, in1, in2);
}
763 
764 # define felem_square felem_square_p
765 # define felem_mul felem_mul_p
766 # define felem_reduce felem_reduce_p
767 
768 # define felem_square_reduce felem_square_reduce_p
769 # define felem_mul_reduce felem_mul_reduce_p
770 #else
771 # define felem_square felem_square_ref
772 # define felem_mul felem_mul_ref
773 # define felem_reduce felem_reduce_ref
774 
775 # define felem_square_reduce felem_square_reduce_ref
776 # define felem_mul_reduce felem_mul_reduce_ref
777 #endif
778 
779 /*-
780  * felem_inv calculates |out| = |in|^{-1}
781  *
782  * Based on Fermat's Little Theorem:
783  *   a^p = a (mod p)
784  *   a^{p-1} = 1 (mod p)
785  *   a^{p-2} = a^{-1} (mod p)
786  */
/*
 * felem_inv calculates |out| = |in|^{-1} using Fermat's Little Theorem:
 * a^{p-2} = a^{-1} (mod p). The exponentiation uses a fixed addition chain,
 * so the sequence of operations does not depend on the value inverted.
 * Comments track the exponent accumulated in ftmp as sums of powers of two.
 */
static void felem_inv(felem out, const felem in)
{
    felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6;
    unsigned int i = 0;

    felem_square_reduce(ftmp, in);      /* 2^1 */
    felem_mul_reduce(ftmp, ftmp, in);   /* 2^1 + 2^0 */
    felem_assign(ftmp2, ftmp);

    felem_square_reduce(ftmp, ftmp);    /* 2^2 + 2^1 */
    felem_mul_reduce(ftmp, ftmp, in);   /* 2^2 + 2^1 + 2^0 */
    felem_assign(ftmp3, ftmp);

    for (i = 0; i < 3; i++)
        felem_square_reduce(ftmp, ftmp); /* 2^5 + 2^4 + 2^3 */
    felem_mul_reduce(ftmp, ftmp3, ftmp); /* 2^5 + 2^4 + 2^3 + 2^2 + 2^1 + 2^0 */
    felem_assign(ftmp4, ftmp);

    for (i = 0; i < 6; i++)
        felem_square_reduce(ftmp, ftmp); /* 2^11 + ... + 2^6 */
    felem_mul_reduce(ftmp, ftmp4, ftmp); /* 2^11 + ... + 2^0 */

    for (i = 0; i < 3; i++)
        felem_square_reduce(ftmp, ftmp); /* 2^14 + ... + 2^3 */
    felem_mul_reduce(ftmp, ftmp3, ftmp); /* 2^14 + ... + 2^0 */
    felem_assign(ftmp5, ftmp);

    for (i = 0; i < 15; i++)
        felem_square_reduce(ftmp, ftmp); /* 2^29 + ... + 2^15 */
    felem_mul_reduce(ftmp, ftmp5, ftmp); /* 2^29 + ... + 2^0 */
    felem_assign(ftmp6, ftmp);

    for (i = 0; i < 30; i++)
        felem_square_reduce(ftmp, ftmp); /* 2^59 + ... + 2^30 */
    felem_mul_reduce(ftmp, ftmp6, ftmp); /* 2^59 + ... + 2^0 */
    felem_assign(ftmp4, ftmp);

    for (i = 0; i < 60; i++)
        felem_square_reduce(ftmp, ftmp); /* 2^119 + ... + 2^60 */
    felem_mul_reduce(ftmp, ftmp4, ftmp); /* 2^119 + ... + 2^0 */
    felem_assign(ftmp4, ftmp);

    for (i = 0; i < 120; i++)
        felem_square_reduce(ftmp, ftmp); /* 2^239 + ... + 2^120 */
    felem_mul_reduce(ftmp, ftmp4, ftmp); /* 2^239 + ... + 2^0 */

    for (i = 0; i < 15; i++)
        felem_square_reduce(ftmp, ftmp); /* 2^254 + ... + 2^15 */
    felem_mul_reduce(ftmp, ftmp5, ftmp); /* 2^254 + ... + 2^0 */

    for (i = 0; i < 31; i++)
        felem_square_reduce(ftmp, ftmp); /* 2^285 + ... + 2^31 */
    felem_mul_reduce(ftmp, ftmp6, ftmp); /* 2^285 + ... + 2^31 + 2^29 + ... + 2^0 */

    for (i = 0; i < 2; i++)
        felem_square_reduce(ftmp, ftmp); /* 2^287 + ... + 2^33 + 2^31 + ... + 2^2 */
    felem_mul_reduce(ftmp, ftmp2, ftmp); /* 2^287 + ... + 2^33 + 2^31 + ... + 2^0 */

    for (i = 0; i < 94; i++)
        felem_square_reduce(ftmp, ftmp); /* 2^381 + ... + 2^127 + 2^125 + ... + 2^94 */
    felem_mul_reduce(ftmp, ftmp6, ftmp); /* 2^381 + ... + 2^127 + 2^125 + ... + 2^94 + 2^29 + ... + 2^0 */

    for (i = 0; i < 2; i++)
        felem_square_reduce(ftmp, ftmp); /* 2^383 + ... + 2^129 + 2^127 + ... + 2^96 + 2^31 + ... + 2^2 */
    felem_mul_reduce(ftmp, in, ftmp);    /* 2^383 + ... + 2^129 + 2^127 + ... + 2^96 + 2^31 + ... + 2^2 + 2^0 */

    felem_assign(out, ftmp);
}
855 
856 /*
857  * Zero-check: returns a limb with all bits set if |in| == 0 (mod p)
858  * and 0 otherwise. We know that field elements are reduced to
859  * 0 < in < 2p, so we only need to check two cases:
860  * 0 and 2^384 - 2^128 - 2^96 + 2^32 - 1
861  *   in[k] < 2^56, k < 6
862  *   in[6] <= 2^48
863  */
static limb felem_is_zero(const felem in)
{
    limb zero, p384;

    /* zero == 0 iff every limb of |in| is zero, i.e. |in| is literally 0 */
    zero = in[0] | in[1] | in[2] | in[3] | in[4] | in[5] | in[6];
    /*
     * Map 0 -> all-ones, non-zero -> 0, without branching.
     * NOTE(review): relies on arithmetic (sign-extending) right shift of a
     * negative int64_t, as the rest of this file does.
     */
    zero = ((int64_t) (zero) - 1) >> 63;
    /* p384 == 0 iff each limb of |in| matches the limb representation of p */
    p384 = (in[0] ^ 0x000000ffffffff) | (in[1] ^ 0xffff0000000000)
         | (in[2] ^ 0xfffffffffeffff) | (in[3] ^ 0xffffffffffffff)
         | (in[4] ^ 0xffffffffffffff) | (in[5] ^ 0xffffffffffffff)
         | (in[6] ^ 0xffffffffffff);
    p384 = ((int64_t) (p384) - 1) >> 63;

    /* all-ones if |in| is either representation of zero mod p, else 0 */
    return (zero | p384);
}
878 
felem_is_zero_int(const void * in)879 static int felem_is_zero_int(const void *in)
880 {
881     return (int)(felem_is_zero(in) & ((limb) 1));
882 }
883 
884 /*-
885  * felem_contract converts |in| to its unique, minimal representation.
886  * Assume we've removed all redundant bits.
887  * On entry:
888  *   in[k] < 2^56, k < 6
889  *   in[6] <= 2^48
890  */
static void felem_contract(felem out, const felem in)
{
    static const int64_t two56 = ((limb) 1) << 56;

    /*
     * We know for a fact that 0 <= |in| < 2*p, for p = 2^384 - 2^128 - 2^96 + 2^32 - 1
     * Perform two successive, idempotent subtractions to reduce if |in| >= p.
     * Everything below is branch-free so the reduction is constant-time.
     */

    int64_t tmp[NLIMBS], cond[5], a;
    unsigned int i;

    memcpy(tmp, in, sizeof(felem));

    /* Case 1: a = 1 iff |in| >= 2^384 */
    a = (in[6] >> 48);
    /*
     * Subtract a*2^384 (by masking tmp[6] below) and add back
     * a*(2^128 + 2^96 - 2^32 + 1), which is congruent to a*2^384 mod p:
     * the four limb adjustments land on bits 0, 32, 96 (limb 1, bit 40)
     * and 128 (limb 2, bit 16).
     */
    tmp[0] += a;
    tmp[0] -= a << 32;
    tmp[1] += a << 40;
    tmp[2] += a << 16;
    tmp[6] &= 0x0000ffffffffffff;

    /*
     * eliminate negative coefficients: if tmp[0] is negative, tmp[1] must be
     * non-zero, so we only need one step
     */

    a = tmp[0] >> 63;
    tmp[0] += a & two56;
    tmp[1] -= a & 1;

    /* Carry 1 -> 2 -> 3 -> 4 -> 5 -> 6 */
    tmp[2] += tmp[1] >> 56;
    tmp[1] &= 0x00ffffffffffffff;

    tmp[3] += tmp[2] >> 56;
    tmp[2] &= 0x00ffffffffffffff;

    tmp[4] += tmp[3] >> 56;
    tmp[3] &= 0x00ffffffffffffff;

    tmp[5] += tmp[4] >> 56;
    tmp[4] &= 0x00ffffffffffffff;

    tmp[6] += tmp[5] >> 56; /* tmp[6] < 2^48 */
    tmp[5] &= 0x00ffffffffffffff;

    /*
     * Case 2: a = all ones if p <= |in| < 2^384, 0 otherwise
     */

    /* 0 iff (2^129..2^383) are all one */
    cond[0] = ((tmp[6] | 0xff000000000000) & tmp[5] & tmp[4] & tmp[3] & (tmp[2] | 0x0000000001ffff)) + 1;
    /* 0 iff 2^128 bit is one */
    cond[1] = (tmp[2] | ~0x00000000010000) + 1;
    /* 0 iff (2^96..2^127) bits are all one */
    cond[2] = ((tmp[2] | 0xffffffffff0000) & (tmp[1] | 0x0000ffffffffff)) + 1;
    /* 0 iff (2^32..2^95) bits are all zero */
    cond[3] = (tmp[1] & ~0xffff0000000000) | (tmp[0] & ~((int64_t) 0x000000ffffffff));
    /* 0 iff (2^0..2^31) bits are all one */
    cond[4] = (tmp[0] | 0xffffff00000000) + 1;

    /*
     * In effect, invert our conditions, so that 0 values become all 1's,
     * any non-zero value in the low-order 56 bits becomes all 0's
     */
    for (i = 0; i < 5; i++)
       cond[i] = ((cond[i] & 0x00ffffffffffffff) - 1) >> 63;

    /*
     * The condition for determining whether in is greater than our
     * prime is given by the following condition.
     */

    /* First subtract 2^384 - 2^129 cheaply */
    a = cond[0] & (cond[1] | (cond[2] & (~cond[3] | cond[4])));
    tmp[6] &= ~a;
    tmp[5] &= ~a;
    tmp[4] &= ~a;
    tmp[3] &= ~a;
    tmp[2] &= ~a | 0x0000000001ffff;

    /*
     * Subtract 2^128 - 2^96 by
     * means of disjoint cases.
     */

    /* subtract 2^128 if that bit is present, and add 2^96 */
    a = cond[0] & cond[1];
    tmp[2] &= ~a | 0xfffffffffeffff;
    tmp[1] += a & ((int64_t) 1 << 40);

    /* otherwise, clear bits 2^127 .. 2^96  */
    a = cond[0] & ~cond[1] & (cond[2] & (~cond[3] | cond[4]));
    tmp[2] &= ~a | 0xffffffffff0000;
    tmp[1] &= ~a | 0x0000ffffffffff;

    /* finally, subtract the last 2^32 - 1 */
    a = cond[0] & (cond[1] | (cond[2] & (~cond[3] | cond[4])));
    tmp[0] += a & (-((int64_t) 1 << 32) + 1);

    /*
     * eliminate negative coefficients: if tmp[0] is negative, tmp[1] must be
     * non-zero, so we only need one step
     */
    a = tmp[0] >> 63;
    tmp[0] += a & two56;
    tmp[1] -= a & 1;

    /* Carry 1 -> 2 -> 3 -> 4 -> 5 -> 6 */
    tmp[2] += tmp[1] >> 56;
    tmp[1] &= 0x00ffffffffffffff;

    tmp[3] += tmp[2] >> 56;
    tmp[2] &= 0x00ffffffffffffff;

    tmp[4] += tmp[3] >> 56;
    tmp[3] &= 0x00ffffffffffffff;

    tmp[5] += tmp[4] >> 56;
    tmp[4] &= 0x00ffffffffffffff;

    tmp[6] += tmp[5] >> 56;
    tmp[5] &= 0x00ffffffffffffff;

    memcpy(out, tmp, sizeof(felem));
}
1018 
1019 /*-
1020  * Group operations
1021  * ----------------
1022  *
1023  * Building on top of the field operations we have the operations on the
1024  * elliptic curve group itself. Points on the curve are represented in Jacobian
1025  * coordinates
1026  */
1027 
1028 /*-
1029  * point_double calculates 2*(x_in, y_in, z_in)
1030  *
1031  * The method is taken from:
1032  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#doubling-dbl-2001-b
1033  *
 * Outputs can equal corresponding inputs, i.e., x_out == x_in is allowed,
 * while x_out == y_in is not (this may work, but it has not been tested).
1036  */
static void
point_double(felem x_out, felem y_out, felem z_out,
             const felem x_in, const felem y_in, const felem z_in)
{
    widefelem tmp, tmp2;
    felem delta, gamma, beta, alpha, ftmp, ftmp2;

    /* working copies of x_in so x_out may alias x_in safely */
    felem_assign(ftmp, x_in);
    felem_assign(ftmp2, x_in);

    /* delta = z^2 */
    felem_square_reduce(delta, z_in);     /* delta[i] < 2^56 */

    /* gamma = y^2 */
    felem_square_reduce(gamma, y_in);     /* gamma[i] < 2^56 */

    /* beta = x*gamma */
    felem_mul_reduce(beta, x_in, gamma);  /* beta[i] < 2^56 */

    /* alpha = 3*(x-delta)*(x+delta) */
    felem_diff64(ftmp, delta);            /* ftmp[i] < 2^60 + 2^58 + 2^44 */
    felem_sum64(ftmp2, delta);            /* ftmp2[i] < 2^59 */
    felem_scalar64(ftmp2, 3);             /* ftmp2[i] < 2^61 */
    felem_mul_reduce(alpha, ftmp, ftmp2); /* alpha[i] < 2^56 */

    /* x' = alpha^2 - 8*beta */
    felem_square(tmp, alpha);             /* tmp[i] < 2^115 */
    felem_assign(ftmp, beta);             /* ftmp[i] < 2^56 */
    felem_scalar64(ftmp, 8);              /* ftmp[i] < 2^59 */
    felem_diff_128_64(tmp, ftmp);         /* tmp[i] < 2^115 + 2^64 + 2^48 */
    felem_reduce(x_out, tmp);             /* x_out[i] < 2^56 */

    /* z' = (y + z)^2 - gamma - delta */
    felem_sum64(delta, gamma);     /* delta[i] < 2^57 */
    felem_assign(ftmp, y_in);      /* ftmp[i] < 2^56 */
    felem_sum64(ftmp, z_in);       /* ftmp[i] < 2^56 */
    felem_square(tmp, ftmp);       /* tmp[i] < 2^115 */
    felem_diff_128_64(tmp, delta); /* tmp[i] < 2^115 + 2^64 + 2^48 */
    felem_reduce(z_out, tmp);      /* z_out[i] < 2^56 */

    /* y' = alpha*(4*beta - x') - 8*gamma^2 */
    felem_scalar64(beta, 4);       /* beta[i] < 2^58 */
    felem_diff64(beta, x_out);     /* beta[i] < 2^60 + 2^58 + 2^44 */
    felem_mul(tmp, alpha, beta);   /* tmp[i] < 2^119 */
    felem_square(tmp2, gamma);     /* tmp2[i] < 2^115 */
    felem_scalar128(tmp2, 8);      /* tmp2[i] < 2^118 */
    felem_diff128(tmp, tmp2);      /* tmp[i] < 2^127 + 2^119 + 2^111 */
    felem_reduce(y_out, tmp);      /* tmp[i] < 2^56 */
}
1086 
1087 /* copy_conditional copies in to out iff mask is all ones. */
/*
 * copy_conditional copies in to out iff mask is all ones; if mask is 0,
 * out is left untouched. Branch-free, so it leaks nothing about mask.
 */
static void copy_conditional(felem out, const felem in, limb mask)
{
    unsigned int idx;

    for (idx = 0; idx < NLIMBS; idx++) {
        /* delta is (in ^ out) when mask is all-ones, 0 otherwise */
        limb delta = mask & (in[idx] ^ out[idx]);

        out[idx] ^= delta;
    }
}
1095 
1096 /*-
1097  * point_add calculates (x1, y1, z1) + (x2, y2, z2)
1098  *
1099  * The method is taken from
1100  *   http://hyperelliptic.org/EFD/g1p/auto-shortw-jacobian-3.html#addition-add-2007-bl,
1101  * adapted for mixed addition (z2 = 1, or z2 = 0 for the point at infinity).
1102  *
1103  * This function includes a branch for checking whether the two input points
1104  * are equal (while not equal to the point at infinity). See comment below
1105  * on constant-time.
1106  */
static void point_add(felem x3, felem y3, felem z3,
                      const felem x1, const felem y1, const felem z1,
                      const int mixed, const felem x2, const felem y2,
                      const felem z2)
{
    felem ftmp, ftmp2, ftmp3, ftmp4, ftmp5, ftmp6, x_out, y_out, z_out;
    widefelem tmp, tmp2;
    limb x_equal, y_equal, z1_is_zero, z2_is_zero;
    limb points_equal;

    z1_is_zero = felem_is_zero(z1);
    z2_is_zero = felem_is_zero(z2);

    /* ftmp = z1z1 = z1**2 */
    felem_square_reduce(ftmp, z1);      /* ftmp[i] < 2^56 */

    if (!mixed) {
        /* ftmp2 = z2z2 = z2**2 */
        felem_square_reduce(ftmp2, z2); /* ftmp2[i] < 2^56 */

        /* u1 = ftmp3 = x1*z2z2 */
        felem_mul_reduce(ftmp3, x1, ftmp2); /* ftmp3[i] < 2^56 */

        /* ftmp5 = z1 + z2 */
        felem_assign(ftmp5, z1);       /* ftmp5[i] < 2^56 */
        felem_sum64(ftmp5, z2);        /* ftmp5[i] < 2^57 */

        /* ftmp5 = (z1 + z2)**2 - z1z1 - z2z2 = 2*z1z2 */
        felem_square(tmp, ftmp5);      /* tmp[i] < 2^117 */
        felem_diff_128_64(tmp, ftmp);  /* tmp[i] < 2^117 + 2^64 + 2^48 */
        felem_diff_128_64(tmp, ftmp2); /* tmp[i] < 2^117 + 2^65 + 2^49 */
        felem_reduce(ftmp5, tmp);      /* ftmp5[i] < 2^56 */

        /* ftmp2 = z2 * z2z2 */
        felem_mul_reduce(ftmp2, ftmp2, z2); /* ftmp2[i] < 2^56 */

        /* s1 = ftmp6 = y1 * z2**3 */
        felem_mul_reduce(ftmp6, y1, ftmp2); /* ftmp6[i] < 2^56 */
    } else {
        /*
         * We'll assume z2 = 1 (special case z2 = 0 is handled later), so
         * the z2-dependent products above collapse to plain copies.
         */

        /* u1 = ftmp3 = x1*z2z2 */
        felem_assign(ftmp3, x1);     /* ftmp3[i] < 2^56 */

        /* ftmp5 = 2*z1z2 */
        felem_scalar(ftmp5, z1, 2);  /* ftmp5[i] < 2^57 */

        /* s1 = ftmp6 = y1 * z2**3 */
        felem_assign(ftmp6, y1);     /* ftmp6[i] < 2^56 */
    }
    /* ftmp3[i] < 2^56, ftmp5[i] < 2^57, ftmp6[i] < 2^56 */

    /* u2 = x2*z1z1 */
    felem_mul(tmp, x2, ftmp);        /* tmp[i] < 2^115 */

    /* h = ftmp4 = u2 - u1 */
    felem_diff_128_64(tmp, ftmp3);   /* tmp[i] < 2^115 + 2^64 + 2^48 */
    felem_reduce(ftmp4, tmp);        /* ftmp[4] < 2^56 */

    x_equal = felem_is_zero(ftmp4);

    /* z_out = ftmp5 * h */
    felem_mul_reduce(z_out, ftmp5, ftmp4);  /* z_out[i] < 2^56 */

    /* ftmp = z1 * z1z1 */
    felem_mul_reduce(ftmp, ftmp, z1);  /* ftmp[i] < 2^56 */

    /* s2 = tmp = y2 * z1**3 */
    felem_mul(tmp, y2, ftmp);      /* tmp[i] < 2^115 */

    /* r = ftmp5 = (s2 - s1)*2 */
    felem_diff_128_64(tmp, ftmp6); /* tmp[i] < 2^115 + 2^64 + 2^48 */
    felem_reduce(ftmp5, tmp);      /* ftmp5[i] < 2^56 */
    y_equal = felem_is_zero(ftmp5);
    felem_scalar64(ftmp5, 2);      /* ftmp5[i] < 2^57 */

    /*
     * The formulae are incorrect if the points are equal, in affine coordinates
     * (X_1, Y_1) == (X_2, Y_2), so we check for this and do doubling if this
     * happens.
     *
     * We use bitwise operations to avoid potential side-channels introduced by
     * the short-circuiting behaviour of boolean operators.
     *
     * The special case of either point being the point at infinity (z1 and/or
     * z2 are zero), is handled separately later on in this function, so we
     * avoid jumping to point_double here in those special cases.
     *
     * Notice the comment below on the implications of this branching for timing
     * leaks and why it is considered practically irrelevant.
     */
    points_equal = (x_equal & y_equal & (~z1_is_zero) & (~z2_is_zero));

    if (points_equal) {
        /*
         * This is obviously not constant-time but it will almost-never happen
         * for ECDH / ECDSA.
         */
        point_double(x3, y3, z3, x1, y1, z1);
        return;
    }

    /* I = ftmp = (2h)**2 */
    felem_assign(ftmp, ftmp4);        /* ftmp[i] < 2^56 */
    felem_scalar64(ftmp, 2);          /* ftmp[i] < 2^57 */
    felem_square_reduce(ftmp, ftmp);  /* ftmp[i] < 2^56 */

    /* J = ftmp2 = h * I */
    felem_mul_reduce(ftmp2, ftmp4, ftmp); /* ftmp2[i] < 2^56 */

    /* V = ftmp4 = U1 * I */
    felem_mul_reduce(ftmp4, ftmp3, ftmp); /* ftmp4[i] < 2^56 */

    /* x_out = r**2 - J - 2V */
    felem_square(tmp, ftmp5);      /* tmp[i] < 2^117 */
    felem_diff_128_64(tmp, ftmp2); /* tmp[i] < 2^117 + 2^64 + 2^48 */
    felem_assign(ftmp3, ftmp4);    /* ftmp3[i] < 2^56 */
    felem_scalar64(ftmp4, 2);      /* ftmp4[i] < 2^57 */
    felem_diff_128_64(tmp, ftmp4); /* tmp[i] < 2^117 + 2^65 + 2^49 */
    felem_reduce(x_out, tmp);      /* x_out[i] < 2^56 */

    /* y_out = r(V-x_out) - 2 * s1 * J */
    felem_diff64(ftmp3, x_out);    /* ftmp3[i] < 2^60 + 2^56 + 2^44 */
    felem_mul(tmp, ftmp5, ftmp3);  /* tmp[i] < 2^116 */
    felem_mul(tmp2, ftmp6, ftmp2); /* tmp2[i] < 2^115 */
    felem_scalar128(tmp2, 2);      /* tmp2[i] < 2^116 */
    felem_diff128(tmp, tmp2);      /* tmp[i] < 2^127 + 2^116 + 2^111 */
    felem_reduce(y_out, tmp);      /* y_out[i] < 2^56 */

    /*
     * Points at infinity: if z1 == 0 the result is (x2, y2, z2); if z2 == 0
     * it is (x1, y1, z1). Selected in constant time via the masks computed
     * at the top.
     */
    copy_conditional(x_out, x2, z1_is_zero);
    copy_conditional(x_out, x1, z2_is_zero);
    copy_conditional(y_out, y2, z1_is_zero);
    copy_conditional(y_out, y1, z2_is_zero);
    copy_conditional(z_out, z2, z1_is_zero);
    copy_conditional(z_out, z1, z2_is_zero);
    felem_assign(x3, x_out);
    felem_assign(y3, y_out);
    felem_assign(z3, z_out);
}
1248 
1249 /*-
1250  * Base point pre computation
1251  * --------------------------
1252  *
1253  * Two different sorts of precomputed tables are used in the following code.
1254  * Each contain various points on the curve, where each point is three field
1255  * elements (x, y, z).
1256  *
1257  * For the base point table, z is usually 1 (0 for the point at infinity).
1258  * This table has 16 elements:
1259  * index | bits    | point
1260  * ------+---------+------------------------------
1261  *     0 | 0 0 0 0 | 0G
1262  *     1 | 0 0 0 1 | 1G
1263  *     2 | 0 0 1 0 | 2^95G
1264  *     3 | 0 0 1 1 | (2^95 + 1)G
1265  *     4 | 0 1 0 0 | 2^190G
1266  *     5 | 0 1 0 1 | (2^190 + 1)G
1267  *     6 | 0 1 1 0 | (2^190 + 2^95)G
1268  *     7 | 0 1 1 1 | (2^190 + 2^95 + 1)G
1269  *     8 | 1 0 0 0 | 2^285G
1270  *     9 | 1 0 0 1 | (2^285 + 1)G
1271  *    10 | 1 0 1 0 | (2^285 + 2^95)G
1272  *    11 | 1 0 1 1 | (2^285 + 2^95 + 1)G
1273  *    12 | 1 1 0 0 | (2^285 + 2^190)G
1274  *    13 | 1 1 0 1 | (2^285 + 2^190 + 1)G
1275  *    14 | 1 1 1 0 | (2^285 + 2^190 + 2^95)G
1276  *    15 | 1 1 1 1 | (2^285 + 2^190 + 2^95 + 1)G
1277  *
1278  * The reason for this is so that we can clock bits into four different
1279  * locations when doing simple scalar multiplies against the base point.
1280  *
1281  * Tables for other points have table[i] = iG for i in 0 .. 16.
1282  */
1283 
1284 /* gmul is the table of precomputed base points */
/* Each entry is a Jacobian (X, Y, Z) triple; Z is 1 except for index 0,
 * the point at infinity. Index-to-multiple mapping is documented in the
 * table comment above. */
static const felem gmul[16][3] = {
/* 0G: the point at infinity */
{{0, 0, 0, 0, 0, 0, 0},
 {0, 0, 0, 0, 0, 0, 0},
 {0, 0, 0, 0, 0, 0, 0}},
/* 1G */
{{0x00545e3872760ab7, 0x00f25dbf55296c3a, 0x00e082542a385502, 0x008ba79b9859f741,
  0x0020ad746e1d3b62, 0x0005378eb1c71ef3, 0x0000aa87ca22be8b},
 {0x00431d7c90ea0e5f, 0x00b1ce1d7e819d7a, 0x0013b5f0b8c00a60, 0x00289a147ce9da31,
  0x0092dc29f8f41dbd, 0x002c6f5d9e98bf92, 0x00003617de4a9626},
 {1, 0, 0, 0, 0, 0, 0}},
/* 2^95G */
{{0x00024711cc902a90, 0x00acb2e579ab4fe1, 0x00af818a4b4d57b1, 0x00a17c7bec49c3de,
  0x004280482d726a8b, 0x00128dd0f0a90f3b, 0x00004387c1c3fa3c},
 {0x002ce76543cf5c3a, 0x00de6cee5ef58f0a, 0x00403e42fa561ca6, 0x00bc54d6f9cb9731,
  0x007155f925fb4ff1, 0x004a9ce731b7b9bc, 0x00002609076bd7b2},
 {1, 0, 0, 0, 0, 0, 0}},
/* (2^95 + 1)G */
{{0x00e74c9182f0251d, 0x0039bf54bb111974, 0x00b9d2f2eec511d2, 0x0036b1594eb3a6a4,
  0x00ac3bb82d9d564b, 0x00f9313f4615a100, 0x00006716a9a91b10},
 {0x0046698116e2f15c, 0x00f34347067d3d33, 0x008de4ccfdebd002, 0x00e838c6b8e8c97b,
  0x006faf0798def346, 0x007349794a57563c, 0x00002629e7e6ad84},
 {1, 0, 0, 0, 0, 0, 0}},
/* 2^190G */
{{0x0075300e34fd163b, 0x0092e9db4e8d0ad3, 0x00254be9f625f760, 0x00512c518c72ae68,
  0x009bfcf162bede5a, 0x00bf9341566ce311, 0x0000cd6175bd41cf},
 {0x007dfe52af4ac70f, 0x0002159d2d5c4880, 0x00b504d16f0af8d0, 0x0014585e11f5e64c,
  0x0089c6388e030967, 0x00ffb270cbfa5f71, 0x00009a15d92c3947},
 {1, 0, 0, 0, 0, 0, 0}},
/* (2^190 + 1)G */
{{0x0033fc1278dc4fe5, 0x00d53088c2caa043, 0x0085558827e2db66, 0x00c192bef387b736,
  0x00df6405a2225f2c, 0x0075205aa90fd91a, 0x0000137e3f12349d},
 {0x00ce5b115efcb07e, 0x00abc3308410deeb, 0x005dc6fc1de39904, 0x00907c1c496f36b4,
  0x0008e6ad3926cbe1, 0x00110747b787928c, 0x0000021b9162eb7e},
 {1, 0, 0, 0, 0, 0, 0}},
/* (2^190 + 2^95)G */
{{0x008180042cfa26e1, 0x007b826a96254967, 0x0082473694d6b194, 0x007bd6880a45b589,
  0x00c0a5097072d1a3, 0x0019186555e18b4e, 0x000020278190e5ca},
 {0x00b4bef17de61ac0, 0x009535e3c38ed348, 0x002d4aa8e468ceab, 0x00ef40b431036ad3,
  0x00defd52f4542857, 0x0086edbf98234266, 0x00002025b3a7814d},
 {1, 0, 0, 0, 0, 0, 0}},
/* (2^190 + 2^95 + 1)G */
{{0x00b238aa97b886be, 0x00ef3192d6dd3a32, 0x0079f9e01fd62df8, 0x00742e890daba6c5,
  0x008e5289144408ce, 0x0073bbcc8e0171a5, 0x0000c4fd329d3b52},
 {0x00c6f64a15ee23e7, 0x00dcfb7b171cad8b, 0x00039f6cbd805867, 0x00de024e428d4562,
  0x00be6a594d7c64c5, 0x0078467b70dbcd64, 0x0000251f2ed7079b},
 {1, 0, 0, 0, 0, 0, 0}},
/* 2^285G */
{{0x000e5cc25fc4b872, 0x005ebf10d31ef4e1, 0x0061e0ebd11e8256, 0x0076e026096f5a27,
  0x0013e6fc44662e9a, 0x0042b00289d3597e, 0x000024f089170d88},
 {0x001604d7e0effbe6, 0x0048d77cba64ec2c, 0x008166b16da19e36, 0x006b0d1a0f28c088,
  0x000259fcd47754fd, 0x00cc643e4d725f9a, 0x00007b10f3c79c14},
 {1, 0, 0, 0, 0, 0, 0}},
/* (2^285 + 1)G */
{{0x00430155e3b908af, 0x00b801e4fec25226, 0x00b0d4bcfe806d26, 0x009fc4014eb13d37,
  0x0066c94e44ec07e8, 0x00d16adc03874ba2, 0x000030c917a0d2a7},
 {0x00edac9e21eb891c, 0x00ef0fb768102eff, 0x00c088cef272a5f3, 0x00cbf782134e2964,
  0x0001044a7ba9a0e3, 0x00e363f5b194cf3c, 0x00009ce85249e372},
 {1, 0, 0, 0, 0, 0, 0}},
/* (2^285 + 2^95)G */
{{0x001dd492dda5a7eb, 0x008fd577be539fd1, 0x002ff4b25a5fc3f1, 0x0074a8a1b64df72f,
  0x002ba3d8c204a76c, 0x009d5cff95c8235a, 0x0000e014b9406e0f},
 {0x008c2e4dbfc98aba, 0x00f30bb89f1a1436, 0x00b46f7aea3e259c, 0x009224454ac02f54,
  0x00906401f5645fa2, 0x003a1d1940eabc77, 0x00007c9351d680e6},
 {1, 0, 0, 0, 0, 0, 0}},
/* (2^285 + 2^95 + 1)G */
{{0x005a35d872ef967c, 0x0049f1b7884e1987, 0x0059d46d7e31f552, 0x00ceb4869d2d0fb6,
  0x00e8e89eee56802a, 0x0049d806a774aaf2, 0x0000147e2af0ae24},
 {0x005fd1bd852c6e5e, 0x00b674b7b3de6885, 0x003b9ea5eb9b6c08, 0x005c9f03babf3ef7,
  0x00605337fecab3c7, 0x009a3f85b11bbcc8, 0x0000455470f330ec},
 {1, 0, 0, 0, 0, 0, 0}},
/* (2^285 + 2^190)G */
{{0x002197ff4d55498d, 0x00383e8916c2d8af, 0x00eb203f34d1c6d2, 0x0080367cbd11b542,
  0x00769b3be864e4f5, 0x0081a8458521c7bb, 0x0000c531b34d3539},
 {0x00e2a3d775fa2e13, 0x00534fc379573844, 0x00ff237d2a8db54a, 0x00d301b2335a8882,
  0x000f75ea96103a80, 0x0018fecb3cdd96fa, 0x0000304bf61e94eb},
 {1, 0, 0, 0, 0, 0, 0}},
/* (2^285 + 2^190 + 1)G */
{{0x00b2afc332a73dbd, 0x0029a0d5bb007bc5, 0x002d628eb210f577, 0x009f59a36dd05f50,
  0x006d339de4eca613, 0x00c75a71addc86bc, 0x000060384c5ea93c},
 {0x00aa9641c32a30b4, 0x00cc73ae8cce565d, 0x00ec911a4df07f61, 0x00aa4b762ea4b264,
  0x0096d395bb393629, 0x004efacfb7632fe0, 0x00006f252f46fa3f},
 {1, 0, 0, 0, 0, 0, 0}},
/* (2^285 + 2^190 + 2^95)G */
{{0x00567eec597c7af6, 0x0059ba6795204413, 0x00816d4e6f01196f, 0x004ae6b3eb57951d,
  0x00420f5abdda2108, 0x003401d1f57ca9d9, 0x0000cf5837b0b67a},
 {0x00eaa64b8aeeabf9, 0x00246ddf16bcb4de, 0x000e7e3c3aecd751, 0x0008449f04fed72e,
  0x00307b67ccf09183, 0x0017108c3556b7b1, 0x0000229b2483b3bf},
 {1, 0, 0, 0, 0, 0, 0}},
/* (2^285 + 2^190 + 2^95 + 1)G */
{{0x00e7c491a7bb78a1, 0x00eafddd1d3049ab, 0x00352c05e2bc7c98, 0x003d6880c165fa5c,
  0x00b6ac61cc11c97d, 0x00beeb54fcf90ce5, 0x0000dc1f0b455edc},
 {0x002db2e7aee34d60, 0x0073b5f415a2d8c0, 0x00dd84e4193e9a0c, 0x00d02d873467c572,
  0x0018baaeda60aee5, 0x0013fb11d697c61e, 0x000083aafcc3a973},
 {1, 0, 0, 0, 0, 0, 0}}
};
1365 
1366 /*
1367  * select_point selects the |idx|th point from a precomputation table and
1368  * copies it to out.
1369  *
1370  * pre_comp below is of the size provided in |size|.
1371  */
static void select_point(const limb idx, unsigned int size,
                         const felem pre_comp[][3], felem out[3])
{
    unsigned int i, j;
    limb *outlimbs = &out[0][0];

    /* start from the all-zero point and OR in exactly one table entry */
    memset(out, 0, sizeof(*out) * 3);

    /* scan the whole table so the access pattern is independent of idx */
    for (i = 0; i < size; i++) {
        const limb *inlimbs = &pre_comp[i][0][0];
        limb mask = i ^ idx;

        /* fold mask's low bits together: 0 iff i == idx (for i, idx < 32) */
        mask |= mask >> 4;
        mask |= mask >> 2;
        mask |= mask >> 1;
        mask &= 1;
        /* 0 -> all-ones (selected entry), 1 -> 0 (skipped entry) */
        mask--;
        for (j = 0; j < NLIMBS * 3; j++)
            outlimbs[j] |= inlimbs[j] & mask;
    }
}
1393 
1394 /* get_bit returns the |i|th bit in |in| */
get_bit(const felem_bytearray in,int i)1395 static char get_bit(const felem_bytearray in, int i)
1396 {
1397     if (i < 0 || i >= 384)
1398         return 0;
1399     return (in[i >> 3] >> (i & 7)) & 1;
1400 }
1401 
1402 /*
1403  * Interleaved point multiplication using precomputed point multiples: The
1404  * small point multiples 0*P, 1*P, ..., 16*P are in pre_comp[], the scalars
1405  * in scalars[]. If g_scalar is non-NULL, we also add this multiple of the
1406  * generator, using certain (large) precomputed multiples in g_pre_comp.
1407  * Output point (X, Y, Z) is stored in x_out, y_out, z_out
1408  */
static void batch_mul(felem x_out, felem y_out, felem z_out,
                      const felem_bytearray scalars[],
                      const unsigned int num_points, const u8 *g_scalar,
                      const int mixed, const felem pre_comp[][17][3],
                      const felem g_pre_comp[16][3])
{
    int i, skip;
    unsigned int num, gen_mul = (g_scalar != NULL);
    felem nq[3], tmp[4];
    limb bits;
    u8 sign, digit;

    /* set nq to the point at infinity */
    memset(nq, 0, sizeof(nq));

    /*
     * Loop over all scalars msb-to-lsb, interleaving additions of multiples
     * of the generator (last quarter of rounds) and additions of other
     * points multiples (every 5th round).
     */
    skip = 1;                   /* save two point operations in the first
                                 * round */
    for (i = (num_points ? 380 : 98); i >= 0; --i) {
        /* double */
        if (!skip)
            point_double(nq[0], nq[1], nq[2], nq[0], nq[1], nq[2]);

        /* add multiples of the generator */
        if (gen_mul && (i <= 98)) {
            /*
             * Build a 4-bit comb index from scalar bits i, i+95, i+190 and
             * i+285 — matching the g_pre_comp/gmul table layout documented
             * above the table.
             */
            bits = get_bit(g_scalar, i + 285) << 3;
            if (i < 95) {
                bits |= get_bit(g_scalar, i + 190) << 2;
                bits |= get_bit(g_scalar, i + 95) << 1;
                bits |= get_bit(g_scalar, i);
            }
            /* select the point to add, in constant time */
            select_point(bits, 16, g_pre_comp, tmp);
            if (!skip) {
                /* The 1 argument below is for "mixed" */
                point_add(nq[0],  nq[1],  nq[2],
                          nq[0],  nq[1],  nq[2], 1,
                          tmp[0], tmp[1], tmp[2]);
            } else {
                /* first addition: just load the selected point into nq */
                memcpy(nq, tmp, 3 * sizeof(felem));
                skip = 0;
            }
        }

        /* do other additions every 5 doublings */
        if (num_points && (i % 5 == 0)) {
            /* loop over all scalars */
            for (num = 0; num < num_points; ++num) {
                /* gather a 6-bit window (bits i-1 .. i+4) for recoding */
                bits = get_bit(scalars[num], i + 4) << 5;
                bits |= get_bit(scalars[num], i + 3) << 4;
                bits |= get_bit(scalars[num], i + 2) << 3;
                bits |= get_bit(scalars[num], i + 1) << 2;
                bits |= get_bit(scalars[num], i) << 1;
                bits |= get_bit(scalars[num], i - 1);
                ossl_ec_GFp_nistp_recode_scalar_bits(&sign, &digit, bits);

                /*
                 * select the point to add or subtract, in constant time
                 */
                select_point(digit, 17, pre_comp[num], tmp);
                felem_neg(tmp[3], tmp[1]); /* (X, -Y, Z) is the negative
                                            * point */
                copy_conditional(tmp[1], tmp[3], (-(limb) sign));

                if (!skip) {
                    point_add(nq[0],  nq[1],  nq[2],
                              nq[0],  nq[1],  nq[2], mixed,
                              tmp[0], tmp[1], tmp[2]);
                } else {
                    memcpy(nq, tmp, 3 * sizeof(felem));
                    skip = 0;
                }
            }
        }
    }
    felem_assign(x_out, nq[0]);
    felem_assign(y_out, nq[1]);
    felem_assign(z_out, nq[2]);
}
1492 
1493 /* Precomputation for the group generator. */
struct nistp384_pre_comp_st {
    /* precomputed generator multiples; same 16-entry layout as gmul above */
    felem g_pre_comp[16][3];
    /* reference count; see ossl_ec_nistp384_pre_comp_dup/free */
    CRYPTO_REF_COUNT references;
};
1498 
/*
 * Returns the EC_METHOD vtable for the optimised nistp384 implementation.
 * The structure is a static constant, so the returned pointer is valid for
 * the lifetime of the process. Entry order is fixed by the EC_METHOD
 * definition; zero entries are unimplemented hooks.
 */
const EC_METHOD *ossl_ec_GFp_nistp384_method(void)
{
    static const EC_METHOD ret = {
        EC_FLAGS_DEFAULT_OCT,
        NID_X9_62_prime_field,
        ossl_ec_GFp_nistp384_group_init,
        ossl_ec_GFp_simple_group_finish,
        ossl_ec_GFp_simple_group_clear_finish,
        ossl_ec_GFp_nist_group_copy,
        ossl_ec_GFp_nistp384_group_set_curve,
        ossl_ec_GFp_simple_group_get_curve,
        ossl_ec_GFp_simple_group_get_degree,
        ossl_ec_group_simple_order_bits,
        ossl_ec_GFp_simple_group_check_discriminant,
        ossl_ec_GFp_simple_point_init,
        ossl_ec_GFp_simple_point_finish,
        ossl_ec_GFp_simple_point_clear_finish,
        ossl_ec_GFp_simple_point_copy,
        ossl_ec_GFp_simple_point_set_to_infinity,
        ossl_ec_GFp_simple_point_set_affine_coordinates,
        ossl_ec_GFp_nistp384_point_get_affine_coordinates,
        0, /* point_set_compressed_coordinates */
        0, /* point2oct */
        0, /* oct2point */
        ossl_ec_GFp_simple_add,
        ossl_ec_GFp_simple_dbl,
        ossl_ec_GFp_simple_invert,
        ossl_ec_GFp_simple_is_at_infinity,
        ossl_ec_GFp_simple_is_on_curve,
        ossl_ec_GFp_simple_cmp,
        ossl_ec_GFp_simple_make_affine,
        ossl_ec_GFp_simple_points_make_affine,
        ossl_ec_GFp_nistp384_points_mul,
        ossl_ec_GFp_nistp384_precompute_mult,
        ossl_ec_GFp_nistp384_have_precompute_mult,
        ossl_ec_GFp_nist_field_mul,
        ossl_ec_GFp_nist_field_sqr,
        0, /* field_div */
        ossl_ec_GFp_simple_field_inv,
        0, /* field_encode */
        0, /* field_decode */
        0, /* field_set_to_one */
        ossl_ec_key_simple_priv2oct,
        ossl_ec_key_simple_oct2priv,
        0, /* set private */
        ossl_ec_key_simple_generate_key,
        ossl_ec_key_simple_check_key,
        ossl_ec_key_simple_generate_public_key,
        0, /* keycopy */
        0, /* keyfinish */
        ossl_ecdh_simple_compute_key,
        ossl_ecdsa_simple_sign_setup,
        ossl_ecdsa_simple_sign_sig,
        ossl_ecdsa_simple_verify_sig,
        0, /* field_inverse_mod_ord */
        0, /* blind_coordinates */
        0, /* ladder_pre */
        0, /* ladder_step */
        0  /* ladder_post */
    };

    return &ret;
}
1562 
1563 /******************************************************************************/
1564 /*
1565  * FUNCTIONS TO MANAGE PRECOMPUTATION
1566  */
1567 
nistp384_pre_comp_new(void)1568 static NISTP384_PRE_COMP *nistp384_pre_comp_new(void)
1569 {
1570     NISTP384_PRE_COMP *ret = OPENSSL_zalloc(sizeof(*ret));
1571 
1572     if (ret == NULL)
1573         return ret;
1574 
1575     if (!CRYPTO_NEW_REF(&ret->references, 1)) {
1576         OPENSSL_free(ret);
1577         return NULL;
1578     }
1579     return ret;
1580 }
1581 
/*
 * Shares |p| by bumping its reference count and returning it unchanged.
 * NULL input is passed through as NULL.
 */
NISTP384_PRE_COMP *ossl_ec_nistp384_pre_comp_dup(NISTP384_PRE_COMP *p)
{
    int refs;

    if (p == NULL)
        return NULL;

    CRYPTO_UP_REF(&p->references, &refs);
    return p;
}
1590 
/*
 * Drop one reference to a precomputation record, releasing it when the
 * count reaches zero.  NULL is a no-op.
 */
void ossl_ec_nistp384_pre_comp_free(NISTP384_PRE_COMP *p)
{
    int refs;

    if (p == NULL)
        return;

    CRYPTO_DOWN_REF(&p->references, &refs);
    REF_PRINT_COUNT("ossl_ec_nistp384", refs, p);
    if (refs > 0)
        return;
    /* refs must be exactly 0 here; a negative count indicates a double free */
    REF_ASSERT_ISNT(refs < 0);

    CRYPTO_FREE_REF(&p->references);
    OPENSSL_free(p);
}
1607 
1608 /******************************************************************************/
1609 /*
1610  * OPENSSL EC_METHOD FUNCTIONS
1611  */
1612 
/*
 * Initialise a P-384 group via the generic GFp setup and mark the curve
 * as having a = -3 (mod p), which enables the faster doubling formula.
 */
int ossl_ec_GFp_nistp384_group_init(EC_GROUP *group)
{
    int ok = ossl_ec_GFp_simple_group_init(group);

    group->a_is_minus3 = 1;
    return ok;
}
1621 
/*
 * Set the curve parameters on |group|, but only if (p, a, b) match the
 * canonical P-384 constants; any other curve is rejected with
 * EC_R_WRONG_CURVE_PARAMETERS.  Also installs the fast NIST reduction.
 */
int ossl_ec_GFp_nistp384_group_set_curve(EC_GROUP *group, const BIGNUM *p,
                                         const BIGNUM *a, const BIGNUM *b,
                                         BN_CTX *ctx)
{
    int ret = 0;
    BIGNUM *want_p, *want_a, *want_b;
#ifndef FIPS_MODULE
    BN_CTX *new_ctx = NULL;

    if (ctx == NULL)
        ctx = new_ctx = BN_CTX_new();
#endif
    if (ctx == NULL)
        return 0;

    BN_CTX_start(ctx);
    want_p = BN_CTX_get(ctx);
    want_a = BN_CTX_get(ctx);
    want_b = BN_CTX_get(ctx);
    /* checking the last BN_CTX_get result covers all three */
    if (want_b == NULL)
        goto err;
    BN_bin2bn(nistp384_curve_params[0], sizeof(felem_bytearray), want_p);
    BN_bin2bn(nistp384_curve_params[1], sizeof(felem_bytearray), want_a);
    BN_bin2bn(nistp384_curve_params[2], sizeof(felem_bytearray), want_b);
    if (BN_cmp(want_p, p) != 0 || BN_cmp(want_a, a) != 0
        || BN_cmp(want_b, b) != 0) {
        ERR_raise(ERR_LIB_EC, EC_R_WRONG_CURVE_PARAMETERS);
        goto err;
    }
    group->field_mod_func = BN_nist_mod_384;
    ret = ossl_ec_GFp_simple_group_set_curve(group, p, a, b, ctx);
 err:
    BN_CTX_end(ctx);
#ifndef FIPS_MODULE
    BN_CTX_free(new_ctx);
#endif
    return ret;
}
1659 
1660 /*
1661  * Takes the Jacobian coordinates (X, Y, Z) of a point and returns (X', Y') =
1662  * (X/Z^2, Y/Z^3)
1663  */
int ossl_ec_GFp_nistp384_point_get_affine_coordinates(const EC_GROUP *group,
                                                      const EC_POINT *point,
                                                      BIGNUM *x, BIGNUM *y,
                                                      BN_CTX *ctx)
{
    felem z1, z2, x_in, y_in, x_out, y_out;
    widefelem tmp;

    /* the point at infinity has no affine representation */
    if (EC_POINT_is_at_infinity(group, point)) {
        ERR_raise(ERR_LIB_EC, EC_R_POINT_AT_INFINITY);
        return 0;
    }
    if ((!BN_to_felem(x_in, point->X)) || (!BN_to_felem(y_in, point->Y)) ||
        (!BN_to_felem(z1, point->Z)))
        return 0;
    felem_inv(z2, z1);           /* z2 = Z^-1 */
    felem_square(tmp, z2);
    felem_reduce(z1, tmp);       /* z1 = Z^-2 (z1 is reused as scratch) */
    felem_mul(tmp, x_in, z1);
    felem_reduce(x_in, tmp);     /* x_in = X / Z^2 */
    felem_contract(x_out, x_in); /* unique minimal representation mod p */
    if (x != NULL) {
        if (!felem_to_BN(x, x_out)) {
            ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
            return 0;
        }
    }
    felem_mul(tmp, z1, z2);
    felem_reduce(z1, tmp);       /* z1 = Z^-3 */
    felem_mul(tmp, y_in, z1);
    felem_reduce(y_in, tmp);     /* y_in = Y / Z^3 */
    felem_contract(y_out, y_in);
    if (y != NULL) {
        if (!felem_to_BN(y, y_out)) {
            ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
            return 0;
        }
    }
    return 1;
}
1704 
/* points below is of size |num|, and tmp_felems is of size |num+1| */
/*
 * Batch-convert |num| Jacobian points to affine form by delegating to the
 * shared nistp helper, supplying the P-384 felem primitives as callbacks.
 * tmp_felems must hold at least |num|+1 elements of scratch space.
 */
static void make_points_affine(size_t num, felem points[][3],
                               felem tmp_felems[])
{
    /*
     * Runs in constant time, unless an input is the point at infinity (which
     * normally shouldn't happen).
     */
    ossl_ec_GFp_nistp_points_make_affine_internal(num,
                                                  points,
                                                  sizeof(felem),
                                                  tmp_felems,
                                                  (void (*)(void *))felem_one,
                                                  felem_is_zero_int,
                                                  (void (*)(void *, const void *))
                                                  felem_assign,
                                                  (void (*)(void *, const void *))
                                                  felem_square_reduce,
                                                  (void (*)(void *, const void *, const void*))
                                                  felem_mul_reduce,
                                                  (void (*)(void *, const void *))
                                                  felem_inv,
                                                  (void (*)(void *, const void *))
                                                  felem_contract);
}
1730 
1731 /*
1732  * Computes scalar*generator + \sum scalars[i]*points[i], ignoring NULL
1733  * values Result is stored in r (r can equal one of the inputs).
1734  */
int ossl_ec_GFp_nistp384_points_mul(const EC_GROUP *group, EC_POINT *r,
                                    const BIGNUM *scalar, size_t num,
                                    const EC_POINT *points[],
                                    const BIGNUM *scalars[], BN_CTX *ctx)
{
    int ret = 0;
    int j;
    int mixed = 0;
    BIGNUM *x, *y, *z, *tmp_scalar;
    felem_bytearray g_secret;
    felem_bytearray *secrets = NULL;
    /* per-point table of multiples 0..16, in Jacobian (X, Y, Z) form */
    felem (*pre_comp)[17][3] = NULL;
    felem *tmp_felems = NULL;
    unsigned int i;
    int num_bytes;
    int have_pre_comp = 0;
    size_t num_points = num;
    felem x_in, y_in, z_in, x_out, y_out, z_out;
    NISTP384_PRE_COMP *pre = NULL;
    felem(*g_pre_comp)[3] = NULL;
    EC_POINT *generator = NULL;
    const EC_POINT *p = NULL;
    const BIGNUM *p_scalar = NULL;

    BN_CTX_start(ctx);
    x = BN_CTX_get(ctx);
    y = BN_CTX_get(ctx);
    z = BN_CTX_get(ctx);
    tmp_scalar = BN_CTX_get(ctx);
    /* checking the last BN_CTX_get result covers all four */
    if (tmp_scalar == NULL)
        goto err;

    if (scalar != NULL) {
        pre = group->pre_comp.nistp384;
        if (pre)
            /* we have precomputation, try to use it */
            g_pre_comp = &pre->g_pre_comp[0];
        else
            /* try to use the standard precomputation */
            g_pre_comp = (felem(*)[3]) gmul;
        generator = EC_POINT_new(group);
        if (generator == NULL)
            goto err;
        /* get the generator from precomputation */
        if (!felem_to_BN(x, g_pre_comp[1][0]) ||
            !felem_to_BN(y, g_pre_comp[1][1]) ||
            !felem_to_BN(z, g_pre_comp[1][2])) {
            ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
            goto err;
        }
        if (!ossl_ec_GFp_simple_set_Jprojective_coordinates_GFp(group,
                                                                generator,
                                                                x, y, z, ctx))
            goto err;
        if (0 == EC_POINT_cmp(group, generator, group->generator, ctx))
            /* precomputation matches generator */
            have_pre_comp = 1;
        else
            /*
             * we don't have valid precomputation: treat the generator as a
             * random point
             */
            num_points++;
    }

    if (num_points > 0) {
        if (num_points >= 2) {
            /*
             * unless we precompute multiples for just one point, converting
             * those into affine form is time well spent
             */
            mixed = 1;
        }
        secrets = OPENSSL_zalloc(sizeof(*secrets) * num_points);
        pre_comp = OPENSSL_zalloc(sizeof(*pre_comp) * num_points);
        if (mixed)
            /* one extra felem of scratch, per make_points_affine's contract */
            tmp_felems =
                OPENSSL_malloc(sizeof(*tmp_felems) * (num_points * 17 + 1));
        if ((secrets == NULL) || (pre_comp == NULL)
            || (mixed && (tmp_felems == NULL)))
            goto err;

        /*
         * we treat NULL scalars as 0, and NULL points as points at infinity,
         * i.e., they contribute nothing to the linear combination
         */
        for (i = 0; i < num_points; ++i) {
            if (i == num) {
                /*
                 * we didn't have a valid precomputation, so we pick the
                 * generator
                 */
                p = EC_GROUP_get0_generator(group);
                p_scalar = scalar;
            } else {
                /* the i^th point */
                p = points[i];
                p_scalar = scalars[i];
            }
            if (p_scalar != NULL && p != NULL) {
                /* reduce scalar to 0 <= scalar < 2^384 */
                if ((BN_num_bits(p_scalar) > 384)
                    || (BN_is_negative(p_scalar))) {
                    /*
                     * this is an unusual input, and we don't guarantee
                     * constant-timeness
                     */
                    if (!BN_nnmod(tmp_scalar, p_scalar, group->order, ctx)) {
                        ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
                        goto err;
                    }
                    /* secrets are stored little-endian, zero-padded to 48 bytes */
                    num_bytes = BN_bn2lebinpad(tmp_scalar,
                                               secrets[i], sizeof(secrets[i]));
                } else {
                    num_bytes = BN_bn2lebinpad(p_scalar,
                                               secrets[i], sizeof(secrets[i]));
                }
                if (num_bytes < 0) {
                    ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
                    goto err;
                }
                /* precompute multiples */
                if ((!BN_to_felem(x_out, p->X)) ||
                    (!BN_to_felem(y_out, p->Y)) ||
                    (!BN_to_felem(z_out, p->Z)))
                    goto err;
                memcpy(pre_comp[i][1][0], x_out, sizeof(felem));
                memcpy(pre_comp[i][1][1], y_out, sizeof(felem));
                memcpy(pre_comp[i][1][2], z_out, sizeof(felem));
                /* build pre_comp[i][j] = j * p: odd j by adding p, even j by doubling j/2 */
                for (j = 2; j <= 16; ++j) {
                    if (j & 1) {
                        point_add(pre_comp[i][j][0],     pre_comp[i][j][1],     pre_comp[i][j][2],
                                  pre_comp[i][1][0],     pre_comp[i][1][1],     pre_comp[i][1][2], 0,
                                  pre_comp[i][j - 1][0], pre_comp[i][j - 1][1], pre_comp[i][j - 1][2]);
                    } else {
                        point_double(pre_comp[i][j][0],     pre_comp[i][j][1],     pre_comp[i][j][2],
                                     pre_comp[i][j / 2][0], pre_comp[i][j / 2][1], pre_comp[i][j / 2][2]);
                    }
                }
            }
        }
        if (mixed)
            make_points_affine(num_points * 17, pre_comp[0], tmp_felems);
    }

    /* the scalar for the generator */
    if (scalar != NULL && have_pre_comp) {
        memset(g_secret, 0, sizeof(g_secret));
        /* reduce scalar to 0 <= scalar < 2^384 */
        if ((BN_num_bits(scalar) > 384) || (BN_is_negative(scalar))) {
            /*
             * this is an unusual input, and we don't guarantee
             * constant-timeness
             */
            if (!BN_nnmod(tmp_scalar, scalar, group->order, ctx)) {
                ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
                goto err;
            }
            num_bytes = BN_bn2lebinpad(tmp_scalar, g_secret, sizeof(g_secret));
        } else {
            num_bytes = BN_bn2lebinpad(scalar, g_secret, sizeof(g_secret));
        }
        /* do the multiplication with generator precomputation */
        batch_mul(x_out, y_out, z_out,
                  (const felem_bytearray(*))secrets, num_points,
                  g_secret,
                  mixed, (const felem(*)[17][3])pre_comp,
                  (const felem(*)[3])g_pre_comp);
    } else {
        /* do the multiplication without generator precomputation */
        batch_mul(x_out, y_out, z_out,
                  (const felem_bytearray(*))secrets, num_points,
                  NULL, mixed, (const felem(*)[17][3])pre_comp, NULL);
    }
    /* reduce the output to its unique minimal representation */
    felem_contract(x_in, x_out);
    felem_contract(y_in, y_out);
    felem_contract(z_in, z_out);
    if ((!felem_to_BN(x, x_in)) || (!felem_to_BN(y, y_in)) ||
        (!felem_to_BN(z, z_in))) {
        ERR_raise(ERR_LIB_EC, ERR_R_BN_LIB);
        goto err;
    }
    ret = ossl_ec_GFp_simple_set_Jprojective_coordinates_GFp(group, r, x, y, z,
                                                             ctx);

 err:
    BN_CTX_end(ctx);
    EC_POINT_free(generator);
    OPENSSL_free(secrets);
    OPENSSL_free(pre_comp);
    OPENSSL_free(tmp_felems);
    return ret;
}
1929 
/*
 * Build and attach a 16-entry generator-multiples table for |group|.
 * If the group uses the standard P-384 generator, the built-in table is
 * copied; otherwise the table is computed from scratch.
 */
int ossl_ec_GFp_nistp384_precompute_mult(EC_GROUP *group, BN_CTX *ctx)
{
    int ret = 0;
    NISTP384_PRE_COMP *pre = NULL;
    int i, j;
    BIGNUM *x, *y;
    EC_POINT *generator = NULL;
    /* scratch for make_points_affine: 15 points need 15+1 felems */
    felem tmp_felems[16];
#ifndef FIPS_MODULE
    BN_CTX *new_ctx = NULL;
#endif

    /* throw away old precomputation */
    EC_pre_comp_free(group);

#ifndef FIPS_MODULE
    if (ctx == NULL)
        ctx = new_ctx = BN_CTX_new();
#endif
    if (ctx == NULL)
        return 0;

    BN_CTX_start(ctx);
    x = BN_CTX_get(ctx);
    y = BN_CTX_get(ctx);
    if (y == NULL)
        goto err;
    /* get the generator */
    if (group->generator == NULL)
        goto err;
    generator = EC_POINT_new(group);
    if (generator == NULL)
        goto err;
    /* nistp384_curve_params[3], [4] are the standard generator's (x, y) */
    BN_bin2bn(nistp384_curve_params[3], sizeof(felem_bytearray), x);
    BN_bin2bn(nistp384_curve_params[4], sizeof(felem_bytearray), y);
    if (!EC_POINT_set_affine_coordinates(group, generator, x, y, ctx))
        goto err;
    if ((pre = nistp384_pre_comp_new()) == NULL)
        goto err;
    /*
     * if the generator is the standard one, use built-in precomputation
     */
    if (0 == EC_POINT_cmp(group, generator, group->generator, ctx)) {
        memcpy(pre->g_pre_comp, gmul, sizeof(pre->g_pre_comp));
        goto done;
    }
    if ((!BN_to_felem(pre->g_pre_comp[1][0], group->generator->X)) ||
        (!BN_to_felem(pre->g_pre_comp[1][1], group->generator->Y)) ||
        (!BN_to_felem(pre->g_pre_comp[1][2], group->generator->Z)))
        goto err;
    /* compute 2^95*G, 2^190*G, 2^285*G */
    /* 95 doublings per step: entry 2i holds 2^95 * (entry i) */
    for (i = 1; i <= 4; i <<= 1) {
        point_double(pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2],
                     pre->g_pre_comp[i][0],  pre->g_pre_comp[i][1],    pre->g_pre_comp[i][2]);
        for (j = 0; j < 94; ++j) {
            point_double(pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2],
                         pre->g_pre_comp[2 * i][0], pre->g_pre_comp[2 * i][1], pre->g_pre_comp[2 * i][2]);
        }
    }
    /* g_pre_comp[0] is the point at infinity */
    memset(pre->g_pre_comp[0], 0, sizeof(pre->g_pre_comp[0]));
    /* the remaining multiples */
    /* 2^95*G + 2^190*G */
    point_add(pre->g_pre_comp[6][0],  pre->g_pre_comp[6][1],  pre->g_pre_comp[6][2],
              pre->g_pre_comp[4][0],  pre->g_pre_comp[4][1],  pre->g_pre_comp[4][2], 0,
              pre->g_pre_comp[2][0],  pre->g_pre_comp[2][1],  pre->g_pre_comp[2][2]);
    /* 2^95*G + 2^285*G */
    point_add(pre->g_pre_comp[10][0], pre->g_pre_comp[10][1], pre->g_pre_comp[10][2],
              pre->g_pre_comp[8][0],  pre->g_pre_comp[8][1],  pre->g_pre_comp[8][2], 0,
              pre->g_pre_comp[2][0],  pre->g_pre_comp[2][1],  pre->g_pre_comp[2][2]);
    /* 2^190*G + 2^285*G */
    point_add(pre->g_pre_comp[12][0], pre->g_pre_comp[12][1], pre->g_pre_comp[12][2],
              pre->g_pre_comp[8][0],  pre->g_pre_comp[8][1],  pre->g_pre_comp[8][2], 0,
              pre->g_pre_comp[4][0],  pre->g_pre_comp[4][1],  pre->g_pre_comp[4][2]);
    /* 2^95*G + 2^190*G + 2^285*G */
    point_add(pre->g_pre_comp[14][0], pre->g_pre_comp[14][1], pre->g_pre_comp[14][2],
              pre->g_pre_comp[12][0], pre->g_pre_comp[12][1], pre->g_pre_comp[12][2], 0,
              pre->g_pre_comp[2][0],  pre->g_pre_comp[2][1],  pre->g_pre_comp[2][2]);
    for (i = 1; i < 8; ++i) {
        /* odd multiples: add G */
        point_add(pre->g_pre_comp[2 * i + 1][0], pre->g_pre_comp[2 * i + 1][1], pre->g_pre_comp[2 * i + 1][2],
                  pre->g_pre_comp[2 * i][0],     pre->g_pre_comp[2 * i][1],     pre->g_pre_comp[2 * i][2], 0,
                  pre->g_pre_comp[1][0],         pre->g_pre_comp[1][1],         pre->g_pre_comp[1][2]);
    }
    /* entries 1..15 to affine; entry 0 (infinity) is left as-is */
    make_points_affine(15, &(pre->g_pre_comp[1]), tmp_felems);

 done:
    SETPRECOMP(group, nistp384, pre);
    ret = 1;
    pre = NULL;                  /* ownership transferred to the group */
 err:
    BN_CTX_end(ctx);
    EC_POINT_free(generator);
#ifndef FIPS_MODULE
    BN_CTX_free(new_ctx);
#endif
    ossl_ec_nistp384_pre_comp_free(pre);
    return ret;
}
2029 
ossl_ec_GFp_nistp384_have_precompute_mult(const EC_GROUP * group)2030 int ossl_ec_GFp_nistp384_have_precompute_mult(const EC_GROUP *group)
2031 {
2032     return HAVEPRECOMP(group, nistp384);
2033 }
2034