1 /* 2 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org> 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining 5 * a copy of this software and associated documentation files (the 6 * "Software"), to deal in the Software without restriction, including 7 * without limitation the rights to use, copy, modify, merge, publish, 8 * distribute, sublicense, and/or sell copies of the Software, and to 9 * permit persons to whom the Software is furnished to do so, subject to 10 * the following conditions: 11 * 12 * The above copyright notice and this permission notice shall be 13 * included in all copies or substantial portions of the Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 16 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 17 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 18 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 19 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 20 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 21 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 * SOFTWARE. 23 */ 24 25 #include "inner.h" 26 27 /* 28 * If BR_NO_ARITH_SHIFT is undefined, or defined to 0, then we _assume_ 29 * that right-shifting a signed negative integer copies the sign bit 30 * (arithmetic right-shift). This is "implementation-defined behaviour", 31 * i.e. it is not undefined, but it may differ between compilers. Each 32 * compiler is supposed to document its behaviour in that respect. GCC 33 * explicitly defines that an arithmetic right shift is used. We expect 34 * all other compilers to do the same, because underlying CPU offer an 35 * arithmetic right shift opcode that could not be used otherwise. 36 */ 37 #if BR_NO_ARITH_SHIFT 38 #define ARSH(x, n) (((uint32_t)(x) >> (n)) \ 39 | ((-((uint32_t)(x) >> 31)) << (32 - (n)))) 40 #else 41 #define ARSH(x, n) ((*(int32_t *)&(x)) >> (n)) 42 #endif 43 44 /* 45 * Convert an integer from unsigned big-endian encoding to a sequence of 46 * 13-bit words in little-endian order. The final "partial" word is 47 * returned. 48 */ 49 static uint32_t 50 be8_to_le13(uint32_t *dst, const unsigned char *src, size_t len) 51 { 52 uint32_t acc; 53 int acc_len; 54 55 acc = 0; 56 acc_len = 0; 57 while (len -- > 0) { 58 acc |= (uint32_t)src[len] << acc_len; 59 acc_len += 8; 60 if (acc_len >= 13) { 61 *dst ++ = acc & 0x1FFF; 62 acc >>= 13; 63 acc_len -= 13; 64 } 65 } 66 return acc; 67 } 68 69 /* 70 * Convert an integer (13-bit words, little-endian) to unsigned 71 * big-endian encoding. The total encoding length is provided; all 72 * the destination bytes will be filled. 73 */ 74 static void 75 le13_to_be8(unsigned char *dst, size_t len, const uint32_t *src) 76 { 77 uint32_t acc; 78 int acc_len; 79 80 acc = 0; 81 acc_len = 0; 82 while (len -- > 0) { 83 if (acc_len < 8) { 84 acc |= (*src ++) << acc_len; 85 acc_len += 13; 86 } 87 dst[len] = (unsigned char)acc; 88 acc >>= 8; 89 acc_len -= 8; 90 } 91 } 92 93 /* 94 * Normalise an array of words to a strict 13 bits per word. Returned 95 * value is the resulting carry. The source (w) and destination (d) 96 * arrays may be identical, but shall not overlap partially. 97 */ 98 static inline uint32_t 99 norm13(uint32_t *d, const uint32_t *w, size_t len) 100 { 101 size_t u; 102 uint32_t cc; 103 104 cc = 0; 105 for (u = 0; u < len; u ++) { 106 int32_t z; 107 108 z = w[u] + cc; 109 d[u] = z & 0x1FFF; 110 cc = ARSH(z, 13); 111 } 112 return cc; 113 } 114 115 /* 116 * mul20() multiplies two 260-bit integers together. Each word must fit 117 * on 13 bits; source operands use 20 words, destination operand 118 * receives 40 words. All overlaps allowed. 119 * 120 * square20() computes the square of a 260-bit integer. Each word must 121 * fit on 13 bits; source operand uses 20 words, destination operand 122 * receives 40 words. All overlaps allowed. 123 */ 124 125 #if BR_SLOW_MUL15 126 127 static void 128 mul20(uint32_t *d, const uint32_t *a, const uint32_t *b) 129 { 130 /* 131 * Two-level Karatsuba: turns a 20x20 multiplication into 132 * nine 5x5 multiplications. We use 13-bit words but do not 133 * propagate carries immediately, so words may expand: 134 * 135 * - First Karatsuba decomposition turns the 20x20 mul on 136 * 13-bit words into three 10x10 muls, two on 13-bit words 137 * and one on 14-bit words. 138 * 139 * - Second Karatsuba decomposition further splits these into: 140 * 141 * * four 5x5 muls on 13-bit words 142 * * four 5x5 muls on 14-bit words 143 * * one 5x5 mul on 15-bit words 144 * 145 * Highest word value is 8191, 16382 or 32764, for 13-bit, 14-bit 146 * or 15-bit words, respectively. 147 */ 148 uint32_t u[45], v[45], w[90]; 149 uint32_t cc; 150 int i; 151 152 #define ZADD(dw, d_off, s1w, s1_off, s2w, s2_off) do { \ 153 (dw)[5 * (d_off) + 0] = (s1w)[5 * (s1_off) + 0] \ 154 + (s2w)[5 * (s2_off) + 0]; \ 155 (dw)[5 * (d_off) + 1] = (s1w)[5 * (s1_off) + 1] \ 156 + (s2w)[5 * (s2_off) + 1]; \ 157 (dw)[5 * (d_off) + 2] = (s1w)[5 * (s1_off) + 2] \ 158 + (s2w)[5 * (s2_off) + 2]; \ 159 (dw)[5 * (d_off) + 3] = (s1w)[5 * (s1_off) + 3] \ 160 + (s2w)[5 * (s2_off) + 3]; \ 161 (dw)[5 * (d_off) + 4] = (s1w)[5 * (s1_off) + 4] \ 162 + (s2w)[5 * (s2_off) + 4]; \ 163 } while (0) 164 165 #define ZADDT(dw, d_off, sw, s_off) do { \ 166 (dw)[5 * (d_off) + 0] += (sw)[5 * (s_off) + 0]; \ 167 (dw)[5 * (d_off) + 1] += (sw)[5 * (s_off) + 1]; \ 168 (dw)[5 * (d_off) + 2] += (sw)[5 * (s_off) + 2]; \ 169 (dw)[5 * (d_off) + 3] += (sw)[5 * (s_off) + 3]; \ 170 (dw)[5 * (d_off) + 4] += (sw)[5 * (s_off) + 4]; \ 171 } while (0) 172 173 #define ZSUB2F(dw, d_off, s1w, s1_off, s2w, s2_off) do { \ 174 (dw)[5 * (d_off) + 0] -= (s1w)[5 * (s1_off) + 0] \ 175 + (s2w)[5 * (s2_off) + 0]; \ 176 (dw)[5 * (d_off) + 1] -= (s1w)[5 * (s1_off) + 1] \ 177 + (s2w)[5 * (s2_off) + 1]; \ 178 (dw)[5 * (d_off) + 2] -= (s1w)[5 * (s1_off) + 2] \ 179 + (s2w)[5 * (s2_off) + 2]; \ 180 (dw)[5 * (d_off) + 3] -= (s1w)[5 * (s1_off) + 3] \ 181 + (s2w)[5 * (s2_off) + 3]; \ 182 (dw)[5 * (d_off) + 4] -= (s1w)[5 * (s1_off) + 4] \ 183 + (s2w)[5 * (s2_off) + 4]; \ 184 } while (0) 185 186 #define CPR1(w, cprcc) do { \ 187 uint32_t cprz = (w) + cprcc; \ 188 (w) = cprz & 0x1FFF; \ 189 cprcc = cprz >> 13; \ 190 } while (0) 191 192 #define CPR(dw, d_off) do { \ 193 uint32_t cprcc; \ 194 cprcc = 0; \ 195 CPR1((dw)[(d_off) + 0], cprcc); \ 196 CPR1((dw)[(d_off) + 1], cprcc); \ 197 CPR1((dw)[(d_off) + 2], cprcc); \ 198 CPR1((dw)[(d_off) + 3], cprcc); \ 199 CPR1((dw)[(d_off) + 4], cprcc); \ 200 CPR1((dw)[(d_off) + 5], cprcc); \ 201 CPR1((dw)[(d_off) + 6], cprcc); \ 202 CPR1((dw)[(d_off) + 7], cprcc); \ 203 CPR1((dw)[(d_off) + 8], cprcc); \ 204 (dw)[(d_off) + 9] = cprcc; \ 205 } while (0) 206 207 memcpy(u, a, 20 * sizeof *a); 208 ZADD(u, 4, a, 0, a, 1); 209 ZADD(u, 5, a, 2, a, 3); 210 ZADD(u, 6, a, 0, a, 2); 211 ZADD(u, 7, a, 1, a, 3); 212 ZADD(u, 8, u, 6, u, 7); 213 214 memcpy(v, b, 20 * sizeof *b); 215 ZADD(v, 4, b, 0, b, 1); 216 ZADD(v, 5, b, 2, b, 3); 217 ZADD(v, 6, b, 0, b, 2); 218 ZADD(v, 7, b, 1, b, 3); 219 ZADD(v, 8, v, 6, v, 7); 220 221 /* 222 * Do the eight first 8x8 muls. Source words are at most 16382 223 * each, so we can add product results together "as is" in 32-bit 224 * words. 225 */ 226 for (i = 0; i < 40; i += 5) { 227 w[(i << 1) + 0] = MUL15(u[i + 0], v[i + 0]); 228 w[(i << 1) + 1] = MUL15(u[i + 0], v[i + 1]) 229 + MUL15(u[i + 1], v[i + 0]); 230 w[(i << 1) + 2] = MUL15(u[i + 0], v[i + 2]) 231 + MUL15(u[i + 1], v[i + 1]) 232 + MUL15(u[i + 2], v[i + 0]); 233 w[(i << 1) + 3] = MUL15(u[i + 0], v[i + 3]) 234 + MUL15(u[i + 1], v[i + 2]) 235 + MUL15(u[i + 2], v[i + 1]) 236 + MUL15(u[i + 3], v[i + 0]); 237 w[(i << 1) + 4] = MUL15(u[i + 0], v[i + 4]) 238 + MUL15(u[i + 1], v[i + 3]) 239 + MUL15(u[i + 2], v[i + 2]) 240 + MUL15(u[i + 3], v[i + 1]) 241 + MUL15(u[i + 4], v[i + 0]); 242 w[(i << 1) + 5] = MUL15(u[i + 1], v[i + 4]) 243 + MUL15(u[i + 2], v[i + 3]) 244 + MUL15(u[i + 3], v[i + 2]) 245 + MUL15(u[i + 4], v[i + 1]); 246 w[(i << 1) + 6] = MUL15(u[i + 2], v[i + 4]) 247 + MUL15(u[i + 3], v[i + 3]) 248 + MUL15(u[i + 4], v[i + 2]); 249 w[(i << 1) + 7] = MUL15(u[i + 3], v[i + 4]) 250 + MUL15(u[i + 4], v[i + 3]); 251 w[(i << 1) + 8] = MUL15(u[i + 4], v[i + 4]); 252 w[(i << 1) + 9] = 0; 253 } 254 255 /* 256 * For the 9th multiplication, source words are up to 32764, 257 * so we must do some carry propagation. If we add up to 258 * 4 products and the carry is no more than 524224, then the 259 * result fits in 32 bits, and the next carry will be no more 260 * than 524224 (because 4*(32764^2)+524224 < 8192*524225). 261 * 262 * We thus just skip one of the products in the middle word, 263 * then do a carry propagation (this reduces words to 13 bits 264 * each, except possibly the last, which may use up to 17 bits 265 * or so), then add the missing product. 266 */ 267 w[80 + 0] = MUL15(u[40 + 0], v[40 + 0]); 268 w[80 + 1] = MUL15(u[40 + 0], v[40 + 1]) 269 + MUL15(u[40 + 1], v[40 + 0]); 270 w[80 + 2] = MUL15(u[40 + 0], v[40 + 2]) 271 + MUL15(u[40 + 1], v[40 + 1]) 272 + MUL15(u[40 + 2], v[40 + 0]); 273 w[80 + 3] = MUL15(u[40 + 0], v[40 + 3]) 274 + MUL15(u[40 + 1], v[40 + 2]) 275 + MUL15(u[40 + 2], v[40 + 1]) 276 + MUL15(u[40 + 3], v[40 + 0]); 277 w[80 + 4] = MUL15(u[40 + 0], v[40 + 4]) 278 + MUL15(u[40 + 1], v[40 + 3]) 279 + MUL15(u[40 + 2], v[40 + 2]) 280 + MUL15(u[40 + 3], v[40 + 1]); 281 /* + MUL15(u[40 + 4], v[40 + 0]) */ 282 w[80 + 5] = MUL15(u[40 + 1], v[40 + 4]) 283 + MUL15(u[40 + 2], v[40 + 3]) 284 + MUL15(u[40 + 3], v[40 + 2]) 285 + MUL15(u[40 + 4], v[40 + 1]); 286 w[80 + 6] = MUL15(u[40 + 2], v[40 + 4]) 287 + MUL15(u[40 + 3], v[40 + 3]) 288 + MUL15(u[40 + 4], v[40 + 2]); 289 w[80 + 7] = MUL15(u[40 + 3], v[40 + 4]) 290 + MUL15(u[40 + 4], v[40 + 3]); 291 w[80 + 8] = MUL15(u[40 + 4], v[40 + 4]); 292 293 CPR(w, 80); 294 295 w[80 + 4] += MUL15(u[40 + 4], v[40 + 0]); 296 297 /* 298 * The products on 14-bit words in slots 6 and 7 yield values 299 * up to 5*(16382^2) each, and we need to subtract two such 300 * values from the higher word. We need the subtraction to fit 301 * in a _signed_ 32-bit integer, i.e. 31 bits + a sign bit. 302 * However, 10*(16382^2) does not fit. So we must perform a 303 * bit of reduction here. 304 */ 305 CPR(w, 60); 306 CPR(w, 70); 307 308 /* 309 * Recompose results. 310 */ 311 312 /* 0..1*0..1 into 0..3 */ 313 ZSUB2F(w, 8, w, 0, w, 2); 314 ZSUB2F(w, 9, w, 1, w, 3); 315 ZADDT(w, 1, w, 8); 316 ZADDT(w, 2, w, 9); 317 318 /* 2..3*2..3 into 4..7 */ 319 ZSUB2F(w, 10, w, 4, w, 6); 320 ZSUB2F(w, 11, w, 5, w, 7); 321 ZADDT(w, 5, w, 10); 322 ZADDT(w, 6, w, 11); 323 324 /* (0..1+2..3)*(0..1+2..3) into 12..15 */ 325 ZSUB2F(w, 16, w, 12, w, 14); 326 ZSUB2F(w, 17, w, 13, w, 15); 327 ZADDT(w, 13, w, 16); 328 ZADDT(w, 14, w, 17); 329 330 /* first-level recomposition */ 331 ZSUB2F(w, 12, w, 0, w, 4); 332 ZSUB2F(w, 13, w, 1, w, 5); 333 ZSUB2F(w, 14, w, 2, w, 6); 334 ZSUB2F(w, 15, w, 3, w, 7); 335 ZADDT(w, 2, w, 12); 336 ZADDT(w, 3, w, 13); 337 ZADDT(w, 4, w, 14); 338 ZADDT(w, 5, w, 15); 339 340 /* 341 * Perform carry propagation to bring all words down to 13 bits. 342 */ 343 cc = norm13(d, w, 40); 344 d[39] += (cc << 13); 345 346 #undef ZADD 347 #undef ZADDT 348 #undef ZSUB2F 349 #undef CPR1 350 #undef CPR 351 } 352 353 static inline void 354 square20(uint32_t *d, const uint32_t *a) 355 { 356 mul20(d, a, a); 357 } 358 359 #else 360 361 static void 362 mul20(uint32_t *d, const uint32_t *a, const uint32_t *b) 363 { 364 uint32_t t[39]; 365 366 t[ 0] = MUL15(a[ 0], b[ 0]); 367 t[ 1] = MUL15(a[ 0], b[ 1]) 368 + MUL15(a[ 1], b[ 0]); 369 t[ 2] = MUL15(a[ 0], b[ 2]) 370 + MUL15(a[ 1], b[ 1]) 371 + MUL15(a[ 2], b[ 0]); 372 t[ 3] = MUL15(a[ 0], b[ 3]) 373 + MUL15(a[ 1], b[ 2]) 374 + MUL15(a[ 2], b[ 1]) 375 + MUL15(a[ 3], b[ 0]); 376 t[ 4] = MUL15(a[ 0], b[ 4]) 377 + MUL15(a[ 1], b[ 3]) 378 + MUL15(a[ 2], b[ 2]) 379 + MUL15(a[ 3], b[ 1]) 380 + MUL15(a[ 4], b[ 0]); 381 t[ 5] = MUL15(a[ 0], b[ 5]) 382 + MUL15(a[ 1], b[ 4]) 383 + MUL15(a[ 2], b[ 3]) 384 + MUL15(a[ 3], b[ 2]) 385 + MUL15(a[ 4], b[ 1]) 386 + MUL15(a[ 5], b[ 0]); 387 t[ 6] = MUL15(a[ 0], b[ 6]) 388 + MUL15(a[ 1], b[ 5]) 389 + MUL15(a[ 2], b[ 4]) 390 + MUL15(a[ 3], b[ 3]) 391 + MUL15(a[ 4], b[ 2]) 392 + MUL15(a[ 5], b[ 1]) 393 + MUL15(a[ 6], b[ 0]); 394 t[ 7] = MUL15(a[ 0], b[ 7]) 395 + MUL15(a[ 1], b[ 6]) 396 + MUL15(a[ 2], b[ 5]) 397 + MUL15(a[ 3], b[ 4]) 398 + MUL15(a[ 4], b[ 3]) 399 + MUL15(a[ 5], b[ 2]) 400 + MUL15(a[ 6], b[ 1]) 401 + MUL15(a[ 7], b[ 0]); 402 t[ 8] = MUL15(a[ 0], b[ 8]) 403 + MUL15(a[ 1], b[ 7]) 404 + MUL15(a[ 2], b[ 6]) 405 + MUL15(a[ 3], b[ 5]) 406 + MUL15(a[ 4], b[ 4]) 407 + MUL15(a[ 5], b[ 3]) 408 + MUL15(a[ 6], b[ 2]) 409 + MUL15(a[ 7], b[ 1]) 410 + MUL15(a[ 8], b[ 0]); 411 t[ 9] = MUL15(a[ 0], b[ 9]) 412 + MUL15(a[ 1], b[ 8]) 413 + MUL15(a[ 2], b[ 7]) 414 + MUL15(a[ 3], b[ 6]) 415 + MUL15(a[ 4], b[ 5]) 416 + MUL15(a[ 5], b[ 4]) 417 + MUL15(a[ 6], b[ 3]) 418 + MUL15(a[ 7], b[ 2]) 419 + MUL15(a[ 8], b[ 1]) 420 + MUL15(a[ 9], b[ 0]); 421 t[10] = MUL15(a[ 0], b[10]) 422 + MUL15(a[ 1], b[ 9]) 423 + MUL15(a[ 2], b[ 8]) 424 + MUL15(a[ 3], b[ 7]) 425 + MUL15(a[ 4], b[ 6]) 426 + MUL15(a[ 5], b[ 5]) 427 + MUL15(a[ 6], b[ 4]) 428 + MUL15(a[ 7], b[ 3]) 429 + MUL15(a[ 8], b[ 2]) 430 + MUL15(a[ 9], b[ 1]) 431 + MUL15(a[10], b[ 0]); 432 t[11] = MUL15(a[ 0], b[11]) 433 + MUL15(a[ 1], b[10]) 434 + MUL15(a[ 2], b[ 9]) 435 + MUL15(a[ 3], b[ 8]) 436 + MUL15(a[ 4], b[ 7]) 437 + MUL15(a[ 5], b[ 6]) 438 + MUL15(a[ 6], b[ 5]) 439 + MUL15(a[ 7], b[ 4]) 440 + MUL15(a[ 8], b[ 3]) 441 + MUL15(a[ 9], b[ 2]) 442 + MUL15(a[10], b[ 1]) 443 + MUL15(a[11], b[ 0]); 444 t[12] = MUL15(a[ 0], b[12]) 445 + MUL15(a[ 1], b[11]) 446 + MUL15(a[ 2], b[10]) 447 + MUL15(a[ 3], b[ 9]) 448 + MUL15(a[ 4], b[ 8]) 449 + MUL15(a[ 5], b[ 7]) 450 + MUL15(a[ 6], b[ 6]) 451 + MUL15(a[ 7], b[ 5]) 452 + MUL15(a[ 8], b[ 4]) 453 + MUL15(a[ 9], b[ 3]) 454 + MUL15(a[10], b[ 2]) 455 + MUL15(a[11], b[ 1]) 456 + MUL15(a[12], b[ 0]); 457 t[13] = MUL15(a[ 0], b[13]) 458 + MUL15(a[ 1], b[12]) 459 + MUL15(a[ 2], b[11]) 460 + MUL15(a[ 3], b[10]) 461 + MUL15(a[ 4], b[ 9]) 462 + MUL15(a[ 5], b[ 8]) 463 + MUL15(a[ 6], b[ 7]) 464 + MUL15(a[ 7], b[ 6]) 465 + MUL15(a[ 8], b[ 5]) 466 + MUL15(a[ 9], b[ 4]) 467 + MUL15(a[10], b[ 3]) 468 + MUL15(a[11], b[ 2]) 469 + MUL15(a[12], b[ 1]) 470 + MUL15(a[13], b[ 0]); 471 t[14] = MUL15(a[ 0], b[14]) 472 + MUL15(a[ 1], b[13]) 473 + MUL15(a[ 2], b[12]) 474 + MUL15(a[ 3], b[11]) 475 + MUL15(a[ 4], b[10]) 476 + MUL15(a[ 5], b[ 9]) 477 + MUL15(a[ 6], b[ 8]) 478 + MUL15(a[ 7], b[ 7]) 479 + MUL15(a[ 8], b[ 6]) 480 + MUL15(a[ 9], b[ 5]) 481 + MUL15(a[10], b[ 4]) 482 + MUL15(a[11], b[ 3]) 483 + MUL15(a[12], b[ 2]) 484 + MUL15(a[13], b[ 1]) 485 + MUL15(a[14], b[ 0]); 486 t[15] = MUL15(a[ 0], b[15]) 487 + MUL15(a[ 1], b[14]) 488 + MUL15(a[ 2], b[13]) 489 + MUL15(a[ 3], b[12]) 490 + MUL15(a[ 4], b[11]) 491 + MUL15(a[ 5], b[10]) 492 + MUL15(a[ 6], b[ 9]) 493 + MUL15(a[ 7], b[ 8]) 494 + MUL15(a[ 8], b[ 7]) 495 + MUL15(a[ 9], b[ 6]) 496 + MUL15(a[10], b[ 5]) 497 + MUL15(a[11], b[ 4]) 498 + MUL15(a[12], b[ 3]) 499 + MUL15(a[13], b[ 2]) 500 + MUL15(a[14], b[ 1]) 501 + MUL15(a[15], b[ 0]); 502 t[16] = MUL15(a[ 0], b[16]) 503 + MUL15(a[ 1], b[15]) 504 + MUL15(a[ 2], b[14]) 505 + MUL15(a[ 3], b[13]) 506 + MUL15(a[ 4], b[12]) 507 + MUL15(a[ 5], b[11]) 508 + MUL15(a[ 6], b[10]) 509 + MUL15(a[ 7], b[ 9]) 510 + MUL15(a[ 8], b[ 8]) 511 + MUL15(a[ 9], b[ 7]) 512 + MUL15(a[10], b[ 6]) 513 + MUL15(a[11], b[ 5]) 514 + MUL15(a[12], b[ 4]) 515 + MUL15(a[13], b[ 3]) 516 + MUL15(a[14], b[ 2]) 517 + MUL15(a[15], b[ 1]) 518 + MUL15(a[16], b[ 0]); 519 t[17] = MUL15(a[ 0], b[17]) 520 + MUL15(a[ 1], b[16]) 521 + MUL15(a[ 2], b[15]) 522 + MUL15(a[ 3], b[14]) 523 + MUL15(a[ 4], b[13]) 524 + MUL15(a[ 5], b[12]) 525 + MUL15(a[ 6], b[11]) 526 + MUL15(a[ 7], b[10]) 527 + MUL15(a[ 8], b[ 9]) 528 + MUL15(a[ 9], b[ 8]) 529 + MUL15(a[10], b[ 7]) 530 + MUL15(a[11], b[ 6]) 531 + MUL15(a[12], b[ 5]) 532 + MUL15(a[13], b[ 4]) 533 + MUL15(a[14], b[ 3]) 534 + MUL15(a[15], b[ 2]) 535 + MUL15(a[16], b[ 1]) 536 + MUL15(a[17], b[ 0]); 537 t[18] = MUL15(a[ 0], b[18]) 538 + MUL15(a[ 1], b[17]) 539 + MUL15(a[ 2], b[16]) 540 + MUL15(a[ 3], b[15]) 541 + MUL15(a[ 4], b[14]) 542 + MUL15(a[ 5], b[13]) 543 + MUL15(a[ 6], b[12]) 544 + MUL15(a[ 7], b[11]) 545 + MUL15(a[ 8], b[10]) 546 + MUL15(a[ 9], b[ 9]) 547 + MUL15(a[10], b[ 8]) 548 + MUL15(a[11], b[ 7]) 549 + MUL15(a[12], b[ 6]) 550 + MUL15(a[13], b[ 5]) 551 + MUL15(a[14], b[ 4]) 552 + MUL15(a[15], b[ 3]) 553 + MUL15(a[16], b[ 2]) 554 + MUL15(a[17], b[ 1]) 555 + MUL15(a[18], b[ 0]); 556 t[19] = MUL15(a[ 0], b[19]) 557 + MUL15(a[ 1], b[18]) 558 + MUL15(a[ 2], b[17]) 559 + MUL15(a[ 3], b[16]) 560 + MUL15(a[ 4], b[15]) 561 + MUL15(a[ 5], b[14]) 562 + MUL15(a[ 6], b[13]) 563 + MUL15(a[ 7], b[12]) 564 + MUL15(a[ 8], b[11]) 565 + MUL15(a[ 9], b[10]) 566 + MUL15(a[10], b[ 9]) 567 + MUL15(a[11], b[ 8]) 568 + MUL15(a[12], b[ 7]) 569 + MUL15(a[13], b[ 6]) 570 + MUL15(a[14], b[ 5]) 571 + MUL15(a[15], b[ 4]) 572 + MUL15(a[16], b[ 3]) 573 + MUL15(a[17], b[ 2]) 574 + MUL15(a[18], b[ 1]) 575 + MUL15(a[19], b[ 0]); 576 t[20] = MUL15(a[ 1], b[19]) 577 + MUL15(a[ 2], b[18]) 578 + MUL15(a[ 3], b[17]) 579 + MUL15(a[ 4], b[16]) 580 + MUL15(a[ 5], b[15]) 581 + MUL15(a[ 6], b[14]) 582 + MUL15(a[ 7], b[13]) 583 + MUL15(a[ 8], b[12]) 584 + MUL15(a[ 9], b[11]) 585 + MUL15(a[10], b[10]) 586 + MUL15(a[11], b[ 9]) 587 + MUL15(a[12], b[ 8]) 588 + MUL15(a[13], b[ 7]) 589 + MUL15(a[14], b[ 6]) 590 + MUL15(a[15], b[ 5]) 591 + MUL15(a[16], b[ 4]) 592 + MUL15(a[17], b[ 3]) 593 + MUL15(a[18], b[ 2]) 594 + MUL15(a[19], b[ 1]); 595 t[21] = MUL15(a[ 2], b[19]) 596 + MUL15(a[ 3], b[18]) 597 + MUL15(a[ 4], b[17]) 598 + MUL15(a[ 5], b[16]) 599 + MUL15(a[ 6], b[15]) 600 + MUL15(a[ 7], b[14]) 601 + MUL15(a[ 8], b[13]) 602 + MUL15(a[ 9], b[12]) 603 + MUL15(a[10], b[11]) 604 + MUL15(a[11], b[10]) 605 + MUL15(a[12], b[ 9]) 606 + MUL15(a[13], b[ 8]) 607 + MUL15(a[14], b[ 7]) 608 + MUL15(a[15], b[ 6]) 609 + MUL15(a[16], b[ 5]) 610 + MUL15(a[17], b[ 4]) 611 + MUL15(a[18], b[ 3]) 612 + MUL15(a[19], b[ 2]); 613 t[22] = MUL15(a[ 3], b[19]) 614 + MUL15(a[ 4], b[18]) 615 + MUL15(a[ 5], b[17]) 616 + MUL15(a[ 6], b[16]) 617 + MUL15(a[ 7], b[15]) 618 + MUL15(a[ 8], b[14]) 619 + MUL15(a[ 9], b[13]) 620 + MUL15(a[10], b[12]) 621 + MUL15(a[11], b[11]) 622 + MUL15(a[12], b[10]) 623 + MUL15(a[13], b[ 9]) 624 + MUL15(a[14], b[ 8]) 625 + MUL15(a[15], b[ 7]) 626 + MUL15(a[16], b[ 6]) 627 + MUL15(a[17], b[ 5]) 628 + MUL15(a[18], b[ 4]) 629 + MUL15(a[19], b[ 3]); 630 t[23] = MUL15(a[ 4], b[19]) 631 + MUL15(a[ 5], b[18]) 632 + MUL15(a[ 6], b[17]) 633 + MUL15(a[ 7], b[16]) 634 + MUL15(a[ 8], b[15]) 635 + MUL15(a[ 9], b[14]) 636 + MUL15(a[10], b[13]) 637 + MUL15(a[11], b[12]) 638 + MUL15(a[12], b[11]) 639 + MUL15(a[13], b[10]) 640 + MUL15(a[14], b[ 9]) 641 + MUL15(a[15], b[ 8]) 642 + MUL15(a[16], b[ 7]) 643 + MUL15(a[17], b[ 6]) 644 + MUL15(a[18], b[ 5]) 645 + MUL15(a[19], b[ 4]); 646 t[24] = MUL15(a[ 5], b[19]) 647 + MUL15(a[ 6], b[18]) 648 + MUL15(a[ 7], b[17]) 649 + MUL15(a[ 8], b[16]) 650 + MUL15(a[ 9], b[15]) 651 + MUL15(a[10], b[14]) 652 + MUL15(a[11], b[13]) 653 + MUL15(a[12], b[12]) 654 + MUL15(a[13], b[11]) 655 + MUL15(a[14], b[10]) 656 + MUL15(a[15], b[ 9]) 657 + MUL15(a[16], b[ 8]) 658 + MUL15(a[17], b[ 7]) 659 + MUL15(a[18], b[ 6]) 660 + MUL15(a[19], b[ 5]); 661 t[25] = MUL15(a[ 6], b[19]) 662 + MUL15(a[ 7], b[18]) 663 + MUL15(a[ 8], b[17]) 664 + MUL15(a[ 9], b[16]) 665 + MUL15(a[10], b[15]) 666 + MUL15(a[11], b[14]) 667 + MUL15(a[12], b[13]) 668 + MUL15(a[13], b[12]) 669 + MUL15(a[14], b[11]) 670 + MUL15(a[15], b[10]) 671 + MUL15(a[16], b[ 9]) 672 + MUL15(a[17], b[ 8]) 673 + MUL15(a[18], b[ 7]) 674 + MUL15(a[19], b[ 6]); 675 t[26] = MUL15(a[ 7], b[19]) 676 + MUL15(a[ 8], b[18]) 677 + MUL15(a[ 9], b[17]) 678 + MUL15(a[10], b[16]) 679 + MUL15(a[11], b[15]) 680 + MUL15(a[12], b[14]) 681 + MUL15(a[13], b[13]) 682 + MUL15(a[14], b[12]) 683 + MUL15(a[15], b[11]) 684 + MUL15(a[16], b[10]) 685 + MUL15(a[17], b[ 9]) 686 + MUL15(a[18], b[ 8]) 687 + MUL15(a[19], b[ 7]); 688 t[27] = MUL15(a[ 8], b[19]) 689 + MUL15(a[ 9], b[18]) 690 + MUL15(a[10], b[17]) 691 + MUL15(a[11], b[16]) 692 + MUL15(a[12], b[15]) 693 + MUL15(a[13], b[14]) 694 + MUL15(a[14], b[13]) 695 + MUL15(a[15], b[12]) 696 + MUL15(a[16], b[11]) 697 + MUL15(a[17], b[10]) 698 + MUL15(a[18], b[ 9]) 699 + MUL15(a[19], b[ 8]); 700 t[28] = MUL15(a[ 9], b[19]) 701 + MUL15(a[10], b[18]) 702 + MUL15(a[11], b[17]) 703 + MUL15(a[12], b[16]) 704 + MUL15(a[13], b[15]) 705 + MUL15(a[14], b[14]) 706 + MUL15(a[15], b[13]) 707 + MUL15(a[16], b[12]) 708 + MUL15(a[17], b[11]) 709 + MUL15(a[18], b[10]) 710 + MUL15(a[19], b[ 9]); 711 t[29] = MUL15(a[10], b[19]) 712 + MUL15(a[11], b[18]) 713 + MUL15(a[12], b[17]) 714 + MUL15(a[13], b[16]) 715 + MUL15(a[14], b[15]) 716 + MUL15(a[15], b[14]) 717 + MUL15(a[16], b[13]) 718 + MUL15(a[17], b[12]) 719 + MUL15(a[18], b[11]) 720 + MUL15(a[19], b[10]); 721 t[30] = MUL15(a[11], b[19]) 722 + MUL15(a[12], b[18]) 723 + MUL15(a[13], b[17]) 724 + MUL15(a[14], b[16]) 725 + MUL15(a[15], b[15]) 726 + MUL15(a[16], b[14]) 727 + MUL15(a[17], b[13]) 728 + MUL15(a[18], b[12]) 729 + MUL15(a[19], b[11]); 730 t[31] = MUL15(a[12], b[19]) 731 + MUL15(a[13], b[18]) 732 + MUL15(a[14], b[17]) 733 + MUL15(a[15], b[16]) 734 + MUL15(a[16], b[15]) 735 + MUL15(a[17], b[14]) 736 + MUL15(a[18], b[13]) 737 + MUL15(a[19], b[12]); 738 t[32] = MUL15(a[13], b[19]) 739 + MUL15(a[14], b[18]) 740 + MUL15(a[15], b[17]) 741 + MUL15(a[16], b[16]) 742 + MUL15(a[17], b[15]) 743 + MUL15(a[18], b[14]) 744 + MUL15(a[19], b[13]); 745 t[33] = MUL15(a[14], b[19]) 746 + MUL15(a[15], b[18]) 747 + MUL15(a[16], b[17]) 748 + MUL15(a[17], b[16]) 749 + MUL15(a[18], b[15]) 750 + MUL15(a[19], b[14]); 751 t[34] = MUL15(a[15], b[19]) 752 + MUL15(a[16], b[18]) 753 + MUL15(a[17], b[17]) 754 + MUL15(a[18], b[16]) 755 + MUL15(a[19], b[15]); 756 t[35] = MUL15(a[16], b[19]) 757 + MUL15(a[17], b[18]) 758 + MUL15(a[18], b[17]) 759 + MUL15(a[19], b[16]); 760 t[36] = MUL15(a[17], b[19]) 761 + MUL15(a[18], b[18]) 762 + MUL15(a[19], b[17]); 763 t[37] = MUL15(a[18], b[19]) 764 + MUL15(a[19], b[18]); 765 t[38] = MUL15(a[19], b[19]); 766 d[39] = norm13(d, t, 39); 767 } 768 769 static void 770 square20(uint32_t *d, const uint32_t *a) 771 { 772 uint32_t t[39]; 773 774 t[ 0] = MUL15(a[ 0], a[ 0]); 775 t[ 1] = ((MUL15(a[ 0], a[ 1])) << 1); 776 t[ 2] = MUL15(a[ 1], a[ 1]) 777 + ((MUL15(a[ 0], a[ 2])) << 1); 778 t[ 3] = ((MUL15(a[ 0], a[ 3]) 779 + MUL15(a[ 1], a[ 2])) << 1); 780 t[ 4] = MUL15(a[ 2], a[ 2]) 781 + ((MUL15(a[ 0], a[ 4]) 782 + MUL15(a[ 1], a[ 3])) << 1); 783 t[ 5] = ((MUL15(a[ 0], a[ 5]) 784 + MUL15(a[ 1], a[ 4]) 785 + MUL15(a[ 2], a[ 3])) << 1); 786 t[ 6] = MUL15(a[ 3], a[ 3]) 787 + ((MUL15(a[ 0], a[ 6]) 788 + MUL15(a[ 1], a[ 5]) 789 + MUL15(a[ 2], a[ 4])) << 1); 790 t[ 7] = ((MUL15(a[ 0], a[ 7]) 791 + MUL15(a[ 1], a[ 6]) 792 + MUL15(a[ 2], a[ 5]) 793 + MUL15(a[ 3], a[ 4])) << 1); 794 t[ 8] = MUL15(a[ 4], a[ 4]) 795 + ((MUL15(a[ 0], a[ 8]) 796 + MUL15(a[ 1], a[ 7]) 797 + MUL15(a[ 2], a[ 6]) 798 + MUL15(a[ 3], a[ 5])) << 1); 799 t[ 9] = ((MUL15(a[ 0], a[ 9]) 800 + MUL15(a[ 1], a[ 8]) 801 + MUL15(a[ 2], a[ 7]) 802 + MUL15(a[ 3], a[ 6]) 803 + MUL15(a[ 4], a[ 5])) << 1); 804 t[10] = MUL15(a[ 5], a[ 5]) 805 + ((MUL15(a[ 0], a[10]) 806 + MUL15(a[ 1], a[ 9]) 807 + MUL15(a[ 2], a[ 8]) 808 + MUL15(a[ 3], a[ 7]) 809 + MUL15(a[ 4], a[ 6])) << 1); 810 t[11] = ((MUL15(a[ 0], a[11]) 811 + MUL15(a[ 1], a[10]) 812 + MUL15(a[ 2], a[ 9]) 813 + MUL15(a[ 3], a[ 8]) 814 + MUL15(a[ 4], a[ 7]) 815 + MUL15(a[ 5], a[ 6])) << 1); 816 t[12] = MUL15(a[ 6], a[ 6]) 817 + ((MUL15(a[ 0], a[12]) 818 + MUL15(a[ 1], a[11]) 819 + MUL15(a[ 2], a[10]) 820 + MUL15(a[ 3], a[ 9]) 821 + MUL15(a[ 4], a[ 8]) 822 + MUL15(a[ 5], a[ 7])) << 1); 823 t[13] = ((MUL15(a[ 0], a[13]) 824 + MUL15(a[ 1], a[12]) 825 + MUL15(a[ 2], a[11]) 826 + MUL15(a[ 3], a[10]) 827 + MUL15(a[ 4], a[ 9]) 828 + MUL15(a[ 5], a[ 8]) 829 + MUL15(a[ 6], a[ 7])) << 1); 830 t[14] = MUL15(a[ 7], a[ 7]) 831 + ((MUL15(a[ 0], a[14]) 832 + MUL15(a[ 1], a[13]) 833 + MUL15(a[ 2], a[12]) 834 + MUL15(a[ 3], a[11]) 835 + MUL15(a[ 4], a[10]) 836 + MUL15(a[ 5], a[ 9]) 837 + MUL15(a[ 6], a[ 8])) << 1); 838 t[15] = ((MUL15(a[ 0], a[15]) 839 + MUL15(a[ 1], a[14]) 840 + MUL15(a[ 2], a[13]) 841 + MUL15(a[ 3], a[12]) 842 + MUL15(a[ 4], a[11]) 843 + MUL15(a[ 5], a[10]) 844 + MUL15(a[ 6], a[ 9]) 845 + MUL15(a[ 7], a[ 8])) << 1); 846 t[16] = MUL15(a[ 8], a[ 8]) 847 + ((MUL15(a[ 0], a[16]) 848 + MUL15(a[ 1], a[15]) 849 + MUL15(a[ 2], a[14]) 850 + MUL15(a[ 3], a[13]) 851 + MUL15(a[ 4], a[12]) 852 + MUL15(a[ 5], a[11]) 853 + MUL15(a[ 6], a[10]) 854 + MUL15(a[ 7], a[ 9])) << 1); 855 t[17] = ((MUL15(a[ 0], a[17]) 856 + MUL15(a[ 1], a[16]) 857 + MUL15(a[ 2], a[15]) 858 + MUL15(a[ 3], a[14]) 859 + MUL15(a[ 4], a[13]) 860 + MUL15(a[ 5], a[12]) 861 + MUL15(a[ 6], a[11]) 862 + MUL15(a[ 7], a[10]) 863 + MUL15(a[ 8], a[ 9])) << 1); 864 t[18] = MUL15(a[ 9], a[ 9]) 865 + ((MUL15(a[ 0], a[18]) 866 + MUL15(a[ 1], a[17]) 867 + MUL15(a[ 2], a[16]) 868 + MUL15(a[ 3], a[15]) 869 + MUL15(a[ 4], a[14]) 870 + MUL15(a[ 5], a[13]) 871 + MUL15(a[ 6], a[12]) 872 + MUL15(a[ 7], a[11]) 873 + MUL15(a[ 8], a[10])) << 1); 874 t[19] = ((MUL15(a[ 0], a[19]) 875 + MUL15(a[ 1], a[18]) 876 + MUL15(a[ 2], a[17]) 877 + MUL15(a[ 3], a[16]) 878 + MUL15(a[ 4], a[15]) 879 + MUL15(a[ 5], a[14]) 880 + MUL15(a[ 6], a[13]) 881 + MUL15(a[ 7], a[12]) 882 + MUL15(a[ 8], a[11]) 883 + MUL15(a[ 9], a[10])) << 1); 884 t[20] = MUL15(a[10], a[10]) 885 + ((MUL15(a[ 1], a[19]) 886 + MUL15(a[ 2], a[18]) 887 + MUL15(a[ 3], a[17]) 888 + MUL15(a[ 4], a[16]) 889 + MUL15(a[ 5], a[15]) 890 + MUL15(a[ 6], a[14]) 891 + MUL15(a[ 7], a[13]) 892 + MUL15(a[ 8], a[12]) 893 + MUL15(a[ 9], a[11])) << 1); 894 t[21] = ((MUL15(a[ 2], a[19]) 895 + MUL15(a[ 3], a[18]) 896 + MUL15(a[ 4], a[17]) 897 + MUL15(a[ 5], a[16]) 898 + MUL15(a[ 6], a[15]) 899 + MUL15(a[ 7], a[14]) 900 + MUL15(a[ 8], a[13]) 901 + MUL15(a[ 9], a[12]) 902 + MUL15(a[10], a[11])) << 1); 903 t[22] = MUL15(a[11], a[11]) 904 + ((MUL15(a[ 3], a[19]) 905 + MUL15(a[ 4], a[18]) 906 + MUL15(a[ 5], a[17]) 907 + MUL15(a[ 6], a[16]) 908 + MUL15(a[ 7], a[15]) 909 + MUL15(a[ 8], a[14]) 910 + MUL15(a[ 9], a[13]) 911 + MUL15(a[10], a[12])) << 1); 912 t[23] = ((MUL15(a[ 4], a[19]) 913 + MUL15(a[ 5], a[18]) 914 + MUL15(a[ 6], a[17]) 915 + MUL15(a[ 7], a[16]) 916 + MUL15(a[ 8], a[15]) 917 + MUL15(a[ 9], a[14]) 918 + MUL15(a[10], a[13]) 919 + MUL15(a[11], a[12])) << 1); 920 t[24] = MUL15(a[12], a[12]) 921 + ((MUL15(a[ 5], a[19]) 922 + MUL15(a[ 6], a[18]) 923 + MUL15(a[ 7], a[17]) 924 + MUL15(a[ 8], a[16]) 925 + MUL15(a[ 9], a[15]) 926 + MUL15(a[10], a[14]) 927 + MUL15(a[11], a[13])) << 1); 928 t[25] = ((MUL15(a[ 6], a[19]) 929 + MUL15(a[ 7], a[18]) 930 + MUL15(a[ 8], a[17]) 931 + MUL15(a[ 9], a[16]) 932 + MUL15(a[10], a[15]) 933 + MUL15(a[11], a[14]) 934 + MUL15(a[12], a[13])) << 1); 935 t[26] = MUL15(a[13], a[13]) 936 + ((MUL15(a[ 7], a[19]) 937 + MUL15(a[ 8], a[18]) 938 + MUL15(a[ 9], a[17]) 939 + MUL15(a[10], a[16]) 940 + MUL15(a[11], a[15]) 941 + MUL15(a[12], a[14])) << 1); 942 t[27] = ((MUL15(a[ 8], a[19]) 943 + MUL15(a[ 9], a[18]) 944 + MUL15(a[10], a[17]) 945 + MUL15(a[11], a[16]) 946 + MUL15(a[12], a[15]) 947 + MUL15(a[13], a[14])) << 1); 948 t[28] = MUL15(a[14], a[14]) 949 + ((MUL15(a[ 9], a[19]) 950 + MUL15(a[10], a[18]) 951 + MUL15(a[11], a[17]) 952 + MUL15(a[12], a[16]) 953 + MUL15(a[13], a[15])) << 1); 954 t[29] = ((MUL15(a[10], a[19]) 955 + MUL15(a[11], a[18]) 956 + MUL15(a[12], a[17]) 957 + MUL15(a[13], a[16]) 958 + MUL15(a[14], a[15])) << 1); 959 t[30] = MUL15(a[15], a[15]) 960 + ((MUL15(a[11], a[19]) 961 + MUL15(a[12], a[18]) 962 + MUL15(a[13], a[17]) 963 + MUL15(a[14], a[16])) << 1); 964 t[31] = ((MUL15(a[12], a[19]) 965 + MUL15(a[13], a[18]) 966 + MUL15(a[14], a[17]) 967 + MUL15(a[15], a[16])) << 1); 968 t[32] = MUL15(a[16], a[16]) 969 + ((MUL15(a[13], a[19]) 970 + MUL15(a[14], a[18]) 971 + MUL15(a[15], a[17])) << 1); 972 t[33] = ((MUL15(a[14], a[19]) 973 + MUL15(a[15], a[18]) 974 + MUL15(a[16], a[17])) << 1); 975 t[34] = MUL15(a[17], a[17]) 976 + ((MUL15(a[15], a[19]) 977 + MUL15(a[16], a[18])) << 1); 978 t[35] = ((MUL15(a[16], a[19]) 979 + MUL15(a[17], a[18])) << 1); 980 t[36] = MUL15(a[18], a[18]) 981 + ((MUL15(a[17], a[19])) << 1); 982 t[37] = ((MUL15(a[18], a[19])) << 1); 983 t[38] = MUL15(a[19], a[19]); 984 d[39] = norm13(d, t, 39); 985 } 986 987 #endif 988 989 /* 990 * Modulus for field F256 (field for point coordinates in curve P-256). 991 */ 992 static const uint32_t F256[] = { 993 0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x1FFF, 0x001F, 994 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0400, 0x0000, 995 0x0000, 0x1FF8, 0x1FFF, 0x01FF 996 }; 997 998 /* 999 * The 'b' curve equation coefficient for P-256. 1000 */ 1001 static const uint32_t P256_B[] = { 1002 0x004B, 0x1E93, 0x0F89, 0x1C78, 0x03BC, 0x187B, 0x114E, 0x1619, 1003 0x1D06, 0x0328, 0x01AF, 0x0D31, 0x1557, 0x15DE, 0x1ECF, 0x127C, 1004 0x0A3A, 0x0EC5, 0x118D, 0x00B5 1005 }; 1006 1007 /* 1008 * Perform a "short reduction" in field F256 (field for curve P-256). 1009 * The source value should be less than 262 bits; on output, it will 1010 * be at most 257 bits, and less than twice the modulus. 1011 */ 1012 static void 1013 reduce_f256(uint32_t *d) 1014 { 1015 uint32_t x; 1016 1017 x = d[19] >> 9; 1018 d[19] &= 0x01FF; 1019 d[17] += x << 3; 1020 d[14] -= x << 10; 1021 d[7] -= x << 5; 1022 d[0] += x; 1023 norm13(d, d, 20); 1024 } 1025 1026 /* 1027 * Perform a "final reduction" in field F256 (field for curve P-256). 1028 * The source value must be less than twice the modulus. If the value 1029 * is not lower than the modulus, then the modulus is subtracted and 1030 * this function returns 1; otherwise, it leaves it untouched and it 1031 * returns 0. 1032 */ 1033 static uint32_t 1034 reduce_final_f256(uint32_t *d) 1035 { 1036 uint32_t t[20]; 1037 uint32_t cc; 1038 int i; 1039 1040 memcpy(t, d, sizeof t); 1041 cc = 0; 1042 for (i = 0; i < 20; i ++) { 1043 uint32_t w; 1044 1045 w = t[i] - F256[i] - cc; 1046 cc = w >> 31; 1047 t[i] = w & 0x1FFF; 1048 } 1049 cc ^= 1; 1050 CCOPY(cc, d, t, sizeof t); 1051 return cc; 1052 } 1053 1054 /* 1055 * Perform a multiplication of two integers modulo 1056 * 2^256-2^224+2^192+2^96-1 (for NIST curve P-256). Operands are arrays 1057 * of 20 words, each containing 13 bits of data, in little-endian order. 1058 * On input, upper word may be up to 13 bits (hence value up to 2^260-1); 1059 * on output, value fits on 257 bits and is lower than twice the modulus. 1060 */ 1061 static void 1062 mul_f256(uint32_t *d, const uint32_t *a, const uint32_t *b) 1063 { 1064 uint32_t t[40], cc; 1065 int i; 1066 1067 /* 1068 * Compute raw multiplication. All result words fit in 13 bits 1069 * each. 1070 */ 1071 mul20(t, a, b); 1072 1073 /* 1074 * Modular reduction: each high word in added/subtracted where 1075 * necessary. 1076 * 1077 * The modulus is: 1078 * p = 2^256 - 2^224 + 2^192 + 2^96 - 1 1079 * Therefore: 1080 * 2^256 = 2^224 - 2^192 - 2^96 + 1 mod p 1081 * 1082 * For a word x at bit offset n (n >= 256), we have: 1083 * x*2^n = x*2^(n-32) - x*2^(n-64) 1084 * - x*2^(n - 160) + x*2^(n-256) mod p 1085 * 1086 * Thus, we can nullify the high word if we reinject it at some 1087 * proper emplacements. 1088 */ 1089 for (i = 39; i >= 20; i --) { 1090 uint32_t x; 1091 1092 x = t[i]; 1093 t[i - 2] += ARSH(x, 6); 1094 t[i - 3] += (x << 7) & 0x1FFF; 1095 t[i - 4] -= ARSH(x, 12); 1096 t[i - 5] -= (x << 1) & 0x1FFF; 1097 t[i - 12] -= ARSH(x, 4); 1098 t[i - 13] -= (x << 9) & 0x1FFF; 1099 t[i - 19] += ARSH(x, 9); 1100 t[i - 20] += (x << 4) & 0x1FFF; 1101 } 1102 1103 /* 1104 * Propagate carries. This is a signed propagation, and the 1105 * result may be negative. The loop above may enlarge values, 1106 * but not two much: worst case is the chain involving t[i - 3], 1107 * in which a value may be added to itself up to 7 times. Since 1108 * starting values are 13-bit each, all words fit on 20 bits 1109 * (21 to account for the sign bit). 1110 */ 1111 cc = norm13(t, t, 20); 1112 1113 /* 1114 * Perform modular reduction again for the bits beyond 256 (the carry 1115 * and the bits 256..259). Since the largest shift below is by 10 1116 * bits, and the values fit on 21 bits, values fit in 32-bit words, 1117 * thereby allowing injecting full word values. 1118 */ 1119 cc = (cc << 4) | (t[19] >> 9); 1120 t[19] &= 0x01FF; 1121 t[17] += cc << 3; 1122 t[14] -= cc << 10; 1123 t[7] -= cc << 5; 1124 t[0] += cc; 1125 1126 /* 1127 * If the carry is negative, then after carry propagation, we may 1128 * end up with a value which is negative, and we don't want that. 1129 * Thus, in that case, we add the modulus. Note that the subtraction 1130 * result, when the carry is negative, is always smaller than the 1131 * modulus, so the extra addition will not make the value exceed 1132 * twice the modulus. 1133 */ 1134 cc >>= 31; 1135 t[0] -= cc; 1136 t[7] += cc << 5; 1137 t[14] += cc << 10; 1138 t[17] -= cc << 3; 1139 t[19] += cc << 9; 1140 1141 norm13(d, t, 20); 1142 } 1143 1144 /* 1145 * Square an integer modulo 2^256-2^224+2^192+2^96-1 (for NIST curve 1146 * P-256). Operand is an array of 20 words, each containing 13 bits of 1147 * data, in little-endian order. On input, upper word may be up to 13 1148 * bits (hence value up to 2^260-1); on output, value fits on 257 bits 1149 * and is lower than twice the modulus. 1150 */ 1151 static void 1152 square_f256(uint32_t *d, const uint32_t *a) 1153 { 1154 uint32_t t[40], cc; 1155 int i; 1156 1157 /* 1158 * Compute raw square. All result words fit in 13 bits each. 1159 */ 1160 square20(t, a); 1161 1162 /* 1163 * Modular reduction: each high word in added/subtracted where 1164 * necessary. 1165 * 1166 * The modulus is: 1167 * p = 2^256 - 2^224 + 2^192 + 2^96 - 1 1168 * Therefore: 1169 * 2^256 = 2^224 - 2^192 - 2^96 + 1 mod p 1170 * 1171 * For a word x at bit offset n (n >= 256), we have: 1172 * x*2^n = x*2^(n-32) - x*2^(n-64) 1173 * - x*2^(n - 160) + x*2^(n-256) mod p 1174 * 1175 * Thus, we can nullify the high word if we reinject it at some 1176 * proper emplacements. 1177 */ 1178 for (i = 39; i >= 20; i --) { 1179 uint32_t x; 1180 1181 x = t[i]; 1182 t[i - 2] += ARSH(x, 6); 1183 t[i - 3] += (x << 7) & 0x1FFF; 1184 t[i - 4] -= ARSH(x, 12); 1185 t[i - 5] -= (x << 1) & 0x1FFF; 1186 t[i - 12] -= ARSH(x, 4); 1187 t[i - 13] -= (x << 9) & 0x1FFF; 1188 t[i - 19] += ARSH(x, 9); 1189 t[i - 20] += (x << 4) & 0x1FFF; 1190 } 1191 1192 /* 1193 * Propagate carries. This is a signed propagation, and the 1194 * result may be negative. The loop above may enlarge values, 1195 * but not two much: worst case is the chain involving t[i - 3], 1196 * in which a value may be added to itself up to 7 times. Since 1197 * starting values are 13-bit each, all words fit on 20 bits 1198 * (21 to account for the sign bit). 1199 */ 1200 cc = norm13(t, t, 20); 1201 1202 /* 1203 * Perform modular reduction again for the bits beyond 256 (the carry 1204 * and the bits 256..259). Since the largest shift below is by 10 1205 * bits, and the values fit on 21 bits, values fit in 32-bit words, 1206 * thereby allowing injecting full word values. 1207 */ 1208 cc = (cc << 4) | (t[19] >> 9); 1209 t[19] &= 0x01FF; 1210 t[17] += cc << 3; 1211 t[14] -= cc << 10; 1212 t[7] -= cc << 5; 1213 t[0] += cc; 1214 1215 /* 1216 * If the carry is negative, then after carry propagation, we may 1217 * end up with a value which is negative, and we don't want that. 1218 * Thus, in that case, we add the modulus. Note that the subtraction 1219 * result, when the carry is negative, is always smaller than the 1220 * modulus, so the extra addition will not make the value exceed 1221 * twice the modulus. 1222 */ 1223 cc >>= 31; 1224 t[0] -= cc; 1225 t[7] += cc << 5; 1226 t[14] += cc << 10; 1227 t[17] -= cc << 3; 1228 t[19] += cc << 9; 1229 1230 norm13(d, t, 20); 1231 } 1232 1233 /* 1234 * Jacobian coordinates for a point in P-256: affine coordinates (X,Y) 1235 * are such that: 1236 * X = x / z^2 1237 * Y = y / z^3 1238 * For the point at infinity, z = 0. 1239 * Each point thus admits many possible representations. 1240 * 1241 * Coordinates are represented in arrays of 32-bit integers, each holding 1242 * 13 bits of data. Values may also be slightly greater than the modulus, 1243 * but they will always be lower than twice the modulus. 1244 */ 1245 typedef struct { 1246 uint32_t x[20]; 1247 uint32_t y[20]; 1248 uint32_t z[20]; 1249 } p256_jacobian; 1250 1251 /* 1252 * Convert a point to affine coordinates: 1253 * - If the point is the point at infinity, then all three coordinates 1254 * are set to 0. 1255 * - Otherwise, the 'z' coordinate is set to 1, and the 'x' and 'y' 1256 * coordinates are the 'X' and 'Y' affine coordinates. 1257 * The coordinates are guaranteed to be lower than the modulus. 1258 */ 1259 static void 1260 p256_to_affine(p256_jacobian *P) 1261 { 1262 uint32_t t1[20], t2[20]; 1263 int i; 1264 1265 /* 1266 * Invert z with a modular exponentiation: the modulus is 1267 * p = 2^256 - 2^224 + 2^192 + 2^96 - 1, and the exponent is 1268 * p-2. Exponent bit pattern (from high to low) is: 1269 * - 32 bits of value 1 1270 * - 31 bits of value 0 1271 * - 1 bit of value 1 1272 * - 96 bits of value 0 1273 * - 94 bits of value 1 1274 * - 1 bit of value 0 1275 * - 1 bit of value 1 1276 * Thus, we precompute z^(2^31-1) to speed things up. 1277 * 1278 * If z = 0 (point at infinity) then the modular exponentiation 1279 * will yield 0, which leads to the expected result (all three 1280 * coordinates set to 0). 1281 */ 1282 1283 /* 1284 * A simple square-and-multiply for z^(2^31-1). We could save about 1285 * two dozen multiplications here with an addition chain, but 1286 * this would require a bit more code, and extra stack buffers. 1287 */ 1288 memcpy(t1, P->z, sizeof P->z); 1289 for (i = 0; i < 30; i ++) { 1290 square_f256(t1, t1); 1291 mul_f256(t1, t1, P->z); 1292 } 1293 1294 /* 1295 * Square-and-multiply. Apart from the squarings, we have a few 1296 * multiplications to set bits to 1; we multiply by the original z 1297 * for setting 1 bit, and by t1 for setting 31 bits. 1298 */ 1299 memcpy(t2, P->z, sizeof P->z); 1300 for (i = 1; i < 256; i ++) { 1301 square_f256(t2, t2); 1302 switch (i) { 1303 case 31: 1304 case 190: 1305 case 221: 1306 case 252: 1307 mul_f256(t2, t2, t1); 1308 break; 1309 case 63: 1310 case 253: 1311 case 255: 1312 mul_f256(t2, t2, P->z); 1313 break; 1314 } 1315 } 1316 1317 /* 1318 * Now that we have 1/z, multiply x by 1/z^2 and y by 1/z^3. 1319 */ 1320 mul_f256(t1, t2, t2); 1321 mul_f256(P->x, t1, P->x); 1322 mul_f256(t1, t1, t2); 1323 mul_f256(P->y, t1, P->y); 1324 reduce_final_f256(P->x); 1325 reduce_final_f256(P->y); 1326 1327 /* 1328 * Multiply z by 1/z. If z = 0, then this will yield 0, otherwise 1329 * this will set z to 1. 1330 */ 1331 mul_f256(P->z, P->z, t2); 1332 reduce_final_f256(P->z); 1333 } 1334 1335 /* 1336 * Double a point in P-256. This function works for all valid points, 1337 * including the point at infinity. 1338 */ 1339 static void 1340 p256_double(p256_jacobian *Q) 1341 { 1342 /* 1343 * Doubling formulas are: 1344 * 1345 * s = 4*x*y^2 1346 * m = 3*(x + z^2)*(x - z^2) 1347 * x' = m^2 - 2*s 1348 * y' = m*(s - x') - 8*y^4 1349 * z' = 2*y*z 1350 * 1351 * These formulas work for all points, including points of order 2 1352 * and points at infinity: 1353 * - If y = 0 then z' = 0. But there is no such point in P-256 1354 * anyway. 1355 * - If z = 0 then z' = 0. 1356 */ 1357 uint32_t t1[20], t2[20], t3[20], t4[20]; 1358 int i; 1359 1360 /* 1361 * Compute z^2 in t1. 1362 */ 1363 square_f256(t1, Q->z); 1364 1365 /* 1366 * Compute x-z^2 in t2 and x+z^2 in t1. 1367 */ 1368 for (i = 0; i < 20; i ++) { 1369 t2[i] = (F256[i] << 1) + Q->x[i] - t1[i]; 1370 t1[i] += Q->x[i]; 1371 } 1372 norm13(t1, t1, 20); 1373 norm13(t2, t2, 20); 1374 1375 /* 1376 * Compute 3*(x+z^2)*(x-z^2) in t1. 1377 */ 1378 mul_f256(t3, t1, t2); 1379 for (i = 0; i < 20; i ++) { 1380 t1[i] = MUL15(3, t3[i]); 1381 } 1382 norm13(t1, t1, 20); 1383 1384 /* 1385 * Compute 4*x*y^2 (in t2) and 2*y^2 (in t3). 1386 */ 1387 square_f256(t3, Q->y); 1388 for (i = 0; i < 20; i ++) { 1389 t3[i] <<= 1; 1390 } 1391 norm13(t3, t3, 20); 1392 mul_f256(t2, Q->x, t3); 1393 for (i = 0; i < 20; i ++) { 1394 t2[i] <<= 1; 1395 } 1396 norm13(t2, t2, 20); 1397 reduce_f256(t2); 1398 1399 /* 1400 * Compute x' = m^2 - 2*s. 1401 */ 1402 square_f256(Q->x, t1); 1403 for (i = 0; i < 20; i ++) { 1404 Q->x[i] += (F256[i] << 2) - (t2[i] << 1); 1405 } 1406 norm13(Q->x, Q->x, 20); 1407 reduce_f256(Q->x); 1408 1409 /* 1410 * Compute z' = 2*y*z. 1411 */ 1412 mul_f256(t4, Q->y, Q->z); 1413 for (i = 0; i < 20; i ++) { 1414 Q->z[i] = t4[i] << 1; 1415 } 1416 norm13(Q->z, Q->z, 20); 1417 reduce_f256(Q->z); 1418 1419 /* 1420 * Compute y' = m*(s - x') - 8*y^4. Note that we already have 1421 * 2*y^2 in t3. 1422 */ 1423 for (i = 0; i < 20; i ++) { 1424 t2[i] += (F256[i] << 1) - Q->x[i]; 1425 } 1426 norm13(t2, t2, 20); 1427 mul_f256(Q->y, t1, t2); 1428 square_f256(t4, t3); 1429 for (i = 0; i < 20; i ++) { 1430 Q->y[i] += (F256[i] << 2) - (t4[i] << 1); 1431 } 1432 norm13(Q->y, Q->y, 20); 1433 reduce_f256(Q->y); 1434 } 1435 1436 /* 1437 * Add point P2 to point P1. 1438 * 1439 * This function computes the wrong result in the following cases: 1440 * 1441 * - If P1 == 0 but P2 != 0 1442 * - If P1 != 0 but P2 == 0 1443 * - If P1 == P2 1444 * 1445 * In all three cases, P1 is set to the point at infinity. 1446 * 1447 * Returned value is 0 if one of the following occurs: 1448 * 1449 * - P1 and P2 have the same Y coordinate 1450 * - P1 == 0 and P2 == 0 1451 * - The Y coordinate of one of the points is 0 and the other point is 1452 * the point at infinity. 1453 * 1454 * The third case cannot actually happen with valid points, since a point 1455 * with Y == 0 is a point of order 2, and there is no point of order 2 on 1456 * curve P-256. 1457 * 1458 * Therefore, assuming that P1 != 0 and P2 != 0 on input, then the caller 1459 * can apply the following: 1460 * 1461 * - If the result is not the point at infinity, then it is correct. 1462 * - Otherwise, if the returned value is 1, then this is a case of 1463 * P1+P2 == 0, so the result is indeed the point at infinity. 1464 * - Otherwise, P1 == P2, so a "double" operation should have been 1465 * performed. 1466 */ 1467 static uint32_t 1468 p256_add(p256_jacobian *P1, const p256_jacobian *P2) 1469 { 1470 /* 1471 * Addtions formulas are: 1472 * 1473 * u1 = x1 * z2^2 1474 * u2 = x2 * z1^2 1475 * s1 = y1 * z2^3 1476 * s2 = y2 * z1^3 1477 * h = u2 - u1 1478 * r = s2 - s1 1479 * x3 = r^2 - h^3 - 2 * u1 * h^2 1480 * y3 = r * (u1 * h^2 - x3) - s1 * h^3 1481 * z3 = h * z1 * z2 1482 */ 1483 uint32_t t1[20], t2[20], t3[20], t4[20], t5[20], t6[20], t7[20]; 1484 uint32_t ret; 1485 int i; 1486 1487 /* 1488 * Compute u1 = x1*z2^2 (in t1) and s1 = y1*z2^3 (in t3). 1489 */ 1490 square_f256(t3, P2->z); 1491 mul_f256(t1, P1->x, t3); 1492 mul_f256(t4, P2->z, t3); 1493 mul_f256(t3, P1->y, t4); 1494 1495 /* 1496 * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4). 1497 */ 1498 square_f256(t4, P1->z); 1499 mul_f256(t2, P2->x, t4); 1500 mul_f256(t5, P1->z, t4); 1501 mul_f256(t4, P2->y, t5); 1502 1503 /* 1504 * Compute h = h2 - u1 (in t2) and r = s2 - s1 (in t4). 1505 * We need to test whether r is zero, so we will do some extra 1506 * reduce. 1507 */ 1508 for (i = 0; i < 20; i ++) { 1509 t2[i] += (F256[i] << 1) - t1[i]; 1510 t4[i] += (F256[i] << 1) - t3[i]; 1511 } 1512 norm13(t2, t2, 20); 1513 norm13(t4, t4, 20); 1514 reduce_f256(t4); 1515 reduce_final_f256(t4); 1516 ret = 0; 1517 for (i = 0; i < 20; i ++) { 1518 ret |= t4[i]; 1519 } 1520 ret = (ret | -ret) >> 31; 1521 1522 /* 1523 * Compute u1*h^2 (in t6) and h^3 (in t5); 1524 */ 1525 square_f256(t7, t2); 1526 mul_f256(t6, t1, t7); 1527 mul_f256(t5, t7, t2); 1528 1529 /* 1530 * Compute x3 = r^2 - h^3 - 2*u1*h^2. 1531 */ 1532 square_f256(P1->x, t4); 1533 for (i = 0; i < 20; i ++) { 1534 P1->x[i] += (F256[i] << 3) - t5[i] - (t6[i] << 1); 1535 } 1536 norm13(P1->x, P1->x, 20); 1537 reduce_f256(P1->x); 1538 1539 /* 1540 * Compute y3 = r*(u1*h^2 - x3) - s1*h^3. 1541 */ 1542 for (i = 0; i < 20; i ++) { 1543 t6[i] += (F256[i] << 1) - P1->x[i]; 1544 } 1545 norm13(t6, t6, 20); 1546 mul_f256(P1->y, t4, t6); 1547 mul_f256(t1, t5, t3); 1548 for (i = 0; i < 20; i ++) { 1549 P1->y[i] += (F256[i] << 1) - t1[i]; 1550 } 1551 norm13(P1->y, P1->y, 20); 1552 reduce_f256(P1->y); 1553 1554 /* 1555 * Compute z3 = h*z1*z2. 1556 */ 1557 mul_f256(t1, P1->z, P2->z); 1558 mul_f256(P1->z, t1, t2); 1559 1560 return ret; 1561 } 1562 1563 /* 1564 * Add point P2 to point P1. This is a specialised function for the 1565 * case when P2 is a non-zero point in affine coordinate. 1566 * 1567 * This function computes the wrong result in the following cases: 1568 * 1569 * - If P1 == 0 1570 * - If P1 == P2 1571 * 1572 * In both cases, P1 is set to the point at infinity. 1573 * 1574 * Returned value is 0 if one of the following occurs: 1575 * 1576 * - P1 and P2 have the same Y coordinate 1577 * - The Y coordinate of P2 is 0 and P1 is the point at infinity. 1578 * 1579 * The second case cannot actually happen with valid points, since a point 1580 * with Y == 0 is a point of order 2, and there is no point of order 2 on 1581 * curve P-256. 1582 * 1583 * Therefore, assuming that P1 != 0 on input, then the caller 1584 * can apply the following: 1585 * 1586 * - If the result is not the point at infinity, then it is correct. 1587 * - Otherwise, if the returned value is 1, then this is a case of 1588 * P1+P2 == 0, so the result is indeed the point at infinity. 1589 * - Otherwise, P1 == P2, so a "double" operation should have been 1590 * performed. 1591 */ 1592 static uint32_t 1593 p256_add_mixed(p256_jacobian *P1, const p256_jacobian *P2) 1594 { 1595 /* 1596 * Addtions formulas are: 1597 * 1598 * u1 = x1 1599 * u2 = x2 * z1^2 1600 * s1 = y1 1601 * s2 = y2 * z1^3 1602 * h = u2 - u1 1603 * r = s2 - s1 1604 * x3 = r^2 - h^3 - 2 * u1 * h^2 1605 * y3 = r * (u1 * h^2 - x3) - s1 * h^3 1606 * z3 = h * z1 1607 */ 1608 uint32_t t1[20], t2[20], t3[20], t4[20], t5[20], t6[20], t7[20]; 1609 uint32_t ret; 1610 int i; 1611 1612 /* 1613 * Compute u1 = x1 (in t1) and s1 = y1 (in t3). 1614 */ 1615 memcpy(t1, P1->x, sizeof t1); 1616 memcpy(t3, P1->y, sizeof t3); 1617 1618 /* 1619 * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4). 1620 */ 1621 square_f256(t4, P1->z); 1622 mul_f256(t2, P2->x, t4); 1623 mul_f256(t5, P1->z, t4); 1624 mul_f256(t4, P2->y, t5); 1625 1626 /* 1627 * Compute h = h2 - u1 (in t2) and r = s2 - s1 (in t4). 1628 * We need to test whether r is zero, so we will do some extra 1629 * reduce. 1630 */ 1631 for (i = 0; i < 20; i ++) { 1632 t2[i] += (F256[i] << 1) - t1[i]; 1633 t4[i] += (F256[i] << 1) - t3[i]; 1634 } 1635 norm13(t2, t2, 20); 1636 norm13(t4, t4, 20); 1637 reduce_f256(t4); 1638 reduce_final_f256(t4); 1639 ret = 0; 1640 for (i = 0; i < 20; i ++) { 1641 ret |= t4[i]; 1642 } 1643 ret = (ret | -ret) >> 31; 1644 1645 /* 1646 * Compute u1*h^2 (in t6) and h^3 (in t5); 1647 */ 1648 square_f256(t7, t2); 1649 mul_f256(t6, t1, t7); 1650 mul_f256(t5, t7, t2); 1651 1652 /* 1653 * Compute x3 = r^2 - h^3 - 2*u1*h^2. 1654 */ 1655 square_f256(P1->x, t4); 1656 for (i = 0; i < 20; i ++) { 1657 P1->x[i] += (F256[i] << 3) - t5[i] - (t6[i] << 1); 1658 } 1659 norm13(P1->x, P1->x, 20); 1660 reduce_f256(P1->x); 1661 1662 /* 1663 * Compute y3 = r*(u1*h^2 - x3) - s1*h^3. 1664 */ 1665 for (i = 0; i < 20; i ++) { 1666 t6[i] += (F256[i] << 1) - P1->x[i]; 1667 } 1668 norm13(t6, t6, 20); 1669 mul_f256(P1->y, t4, t6); 1670 mul_f256(t1, t5, t3); 1671 for (i = 0; i < 20; i ++) { 1672 P1->y[i] += (F256[i] << 1) - t1[i]; 1673 } 1674 norm13(P1->y, P1->y, 20); 1675 reduce_f256(P1->y); 1676 1677 /* 1678 * Compute z3 = h*z1*z2. 1679 */ 1680 mul_f256(P1->z, P1->z, t2); 1681 1682 return ret; 1683 } 1684 1685 /* 1686 * Decode a P-256 point. This function does not support the point at 1687 * infinity. Returned value is 0 if the point is invalid, 1 otherwise. 1688 */ 1689 static uint32_t 1690 p256_decode(p256_jacobian *P, const void *src, size_t len) 1691 { 1692 const unsigned char *buf; 1693 uint32_t tx[20], ty[20], t1[20], t2[20]; 1694 uint32_t bad; 1695 int i; 1696 1697 if (len != 65) { 1698 return 0; 1699 } 1700 buf = src; 1701 1702 /* 1703 * First byte must be 0x04 (uncompressed format). We could support 1704 * "hybrid format" (first byte is 0x06 or 0x07, and encodes the 1705 * least significant bit of the Y coordinate), but it is explicitly 1706 * forbidden by RFC 5480 (section 2.2). 1707 */ 1708 bad = NEQ(buf[0], 0x04); 1709 1710 /* 1711 * Decode the coordinates, and check that they are both lower 1712 * than the modulus. 1713 */ 1714 tx[19] = be8_to_le13(tx, buf + 1, 32); 1715 ty[19] = be8_to_le13(ty, buf + 33, 32); 1716 bad |= reduce_final_f256(tx); 1717 bad |= reduce_final_f256(ty); 1718 1719 /* 1720 * Check curve equation. 1721 */ 1722 square_f256(t1, tx); 1723 mul_f256(t1, tx, t1); 1724 square_f256(t2, ty); 1725 for (i = 0; i < 20; i ++) { 1726 t1[i] += (F256[i] << 3) - MUL15(3, tx[i]) + P256_B[i] - t2[i]; 1727 } 1728 norm13(t1, t1, 20); 1729 reduce_f256(t1); 1730 reduce_final_f256(t1); 1731 for (i = 0; i < 20; i ++) { 1732 bad |= t1[i]; 1733 } 1734 1735 /* 1736 * Copy coordinates to the point structure. 1737 */ 1738 memcpy(P->x, tx, sizeof tx); 1739 memcpy(P->y, ty, sizeof ty); 1740 memset(P->z, 0, sizeof P->z); 1741 P->z[0] = 1; 1742 return EQ(bad, 0); 1743 } 1744 1745 /* 1746 * Encode a point into a buffer. This function assumes that the point is 1747 * valid, in affine coordinates, and not the point at infinity. 1748 */ 1749 static void 1750 p256_encode(void *dst, const p256_jacobian *P) 1751 { 1752 unsigned char *buf; 1753 1754 buf = dst; 1755 buf[0] = 0x04; 1756 le13_to_be8(buf + 1, 32, P->x); 1757 le13_to_be8(buf + 33, 32, P->y); 1758 } 1759 1760 /* 1761 * Multiply a curve point by an integer. The integer is assumed to be 1762 * lower than the curve order, and the base point must not be the point 1763 * at infinity. 1764 */ 1765 static void 1766 p256_mul(p256_jacobian *P, const unsigned char *x, size_t xlen) 1767 { 1768 /* 1769 * qz is a flag that is initially 1, and remains equal to 1 1770 * as long as the point is the point at infinity. 1771 * 1772 * We use a 2-bit window to handle multiplier bits by pairs. 1773 * The precomputed window really is the points P2 and P3. 1774 */ 1775 uint32_t qz; 1776 p256_jacobian P2, P3, Q, T, U; 1777 1778 /* 1779 * Compute window values. 1780 */ 1781 P2 = *P; 1782 p256_double(&P2); 1783 P3 = *P; 1784 p256_add(&P3, &P2); 1785 1786 /* 1787 * We start with Q = 0. We process multiplier bits 2 by 2. 1788 */ 1789 memset(&Q, 0, sizeof Q); 1790 qz = 1; 1791 while (xlen -- > 0) { 1792 int k; 1793 1794 for (k = 6; k >= 0; k -= 2) { 1795 uint32_t bits; 1796 uint32_t bnz; 1797 1798 p256_double(&Q); 1799 p256_double(&Q); 1800 T = *P; 1801 U = Q; 1802 bits = (*x >> k) & (uint32_t)3; 1803 bnz = NEQ(bits, 0); 1804 CCOPY(EQ(bits, 2), &T, &P2, sizeof T); 1805 CCOPY(EQ(bits, 3), &T, &P3, sizeof T); 1806 p256_add(&U, &T); 1807 CCOPY(bnz & qz, &Q, &T, sizeof Q); 1808 CCOPY(bnz & ~qz, &Q, &U, sizeof Q); 1809 qz &= ~bnz; 1810 } 1811 x ++; 1812 } 1813 *P = Q; 1814 } 1815 1816 /* 1817 * Precomputed window: k*G points, where G is the curve generator, and k 1818 * is an integer from 1 to 15 (inclusive). The X and Y coordinates of 1819 * the point are encoded as 20 words of 13 bits each (little-endian 1820 * order); 13-bit words are then grouped 2-by-2 into 32-bit words 1821 * (little-endian order within each word). 1822 */ 1823 static const uint32_t Gwin[15][20] = { 1824 1825 { 0x04C60296, 0x02721176, 0x19D00F4A, 0x102517AC, 1826 0x13B8037D, 0x0748103C, 0x1E730E56, 0x08481FE2, 1827 0x0F97012C, 0x00D605F4, 0x1DFA11F5, 0x0C801A0D, 1828 0x0F670CBB, 0x0AED0CC5, 0x115E0E33, 0x181F0785, 1829 0x13F514A7, 0x0FF30E3B, 0x17171E1A, 0x009F18D0 }, 1830 1831 { 0x1B341978, 0x16911F11, 0x0D9A1A60, 0x1C4E1FC8, 1832 0x1E040969, 0x096A06B0, 0x091C0030, 0x09EF1A29, 1833 0x18C40D03, 0x00F91C9E, 0x13C313D1, 0x096F0748, 1834 0x011419E0, 0x1CC713A6, 0x1DD31DAD, 0x1EE80C36, 1835 0x1ECD0C69, 0x1A0800A4, 0x08861B8E, 0x000E1DD5 }, 1836 1837 { 0x173F1D6C, 0x02CC06F1, 0x14C21FB4, 0x043D1EB6, 1838 0x0F3606B7, 0x1A971C59, 0x1BF71951, 0x01481323, 1839 0x068D0633, 0x00BD12F9, 0x13EA1032, 0x136209E8, 1840 0x1C1E19A7, 0x06C7013E, 0x06C10AB0, 0x14C908BB, 1841 0x05830CE1, 0x1FEF18DD, 0x00620998, 0x010E0D19 }, 1842 1843 { 0x18180852, 0x0604111A, 0x0B771509, 0x1B6F0156, 1844 0x00181FE2, 0x1DCC0AF4, 0x16EF0659, 0x11F70E80, 1845 0x11A912D0, 0x01C414D2, 0x027618C6, 0x05840FC6, 1846 0x100215C4, 0x187E0C3B, 0x12771C96, 0x150C0B5D, 1847 0x0FF705FD, 0x07981C67, 0x1AD20C63, 0x01C11C55 }, 1848 1849 { 0x1E8113ED, 0x0A940370, 0x12920215, 0x1FA31D6F, 1850 0x1F7C0C82, 0x10CD03F7, 0x02640560, 0x081A0B5E, 1851 0x1BD21151, 0x00A21642, 0x0D0B0DA4, 0x0176113F, 1852 0x04440D1D, 0x001A1360, 0x1068012F, 0x1F141E49, 1853 0x10DF136B, 0x0E4F162B, 0x0D44104A, 0x01C1105F }, 1854 1855 { 0x011411A9, 0x01551A4F, 0x0ADA0C6B, 0x01BD0EC8, 1856 0x18120C74, 0x112F1778, 0x099202CB, 0x0C05124B, 1857 0x195316A4, 0x01600685, 0x1E3B1FE2, 0x189014E3, 1858 0x0B5E1FD7, 0x0E0311F8, 0x08E000F7, 0x174E00DE, 1859 0x160702DF, 0x1B5A15BF, 0x03A11237, 0x01D01704 }, 1860 1861 { 0x0C3D12A3, 0x0C501C0C, 0x17AD1300, 0x1715003F, 1862 0x03F719F8, 0x18031ED8, 0x1D980667, 0x0F681896, 1863 0x1B7D00BF, 0x011C14CE, 0x0FA000B4, 0x1C3501B0, 1864 0x0D901C55, 0x06790C10, 0x029E0736, 0x0DEB0400, 1865 0x034F183A, 0x030619B4, 0x0DEF0033, 0x00E71AC7 }, 1866 1867 { 0x1B7D1393, 0x1B3B1076, 0x0BED1B4D, 0x13011F3A, 1868 0x0E0E1238, 0x156A132B, 0x013A02D3, 0x160A0D01, 1869 0x1CED1EE9, 0x00C5165D, 0x184C157E, 0x08141A83, 1870 0x153C0DA5, 0x1ED70F9D, 0x05170D51, 0x02CF13B8, 1871 0x18AE1771, 0x1B04113F, 0x05EC11E9, 0x015A16B3 }, 1872 1873 { 0x04A41EE0, 0x1D1412E4, 0x1C591D79, 0x118511B7, 1874 0x14F00ACB, 0x1AE31E1C, 0x049C0D51, 0x016E061E, 1875 0x1DB71EDF, 0x01D41A35, 0x0E8208FA, 0x14441293, 1876 0x011F1E85, 0x1D54137A, 0x026B114F, 0x151D0832, 1877 0x00A50964, 0x1F9C1E1C, 0x064B12C9, 0x005409D1 }, 1878 1879 { 0x062B123F, 0x0C0D0501, 0x183704C3, 0x08E31120, 1880 0x0A2E0A6C, 0x14440FED, 0x090A0D1E, 0x13271964, 1881 0x0B590A3A, 0x019D1D9B, 0x05780773, 0x09770A91, 1882 0x0F770CA3, 0x053F19D4, 0x02C80DED, 0x1A761304, 1883 0x091E0DD9, 0x15D201B8, 0x151109AA, 0x010F0198 }, 1884 1885 { 0x05E101D1, 0x072314DD, 0x045F1433, 0x1A041541, 1886 0x10B3142E, 0x01840736, 0x1C1B19DB, 0x098B0418, 1887 0x1DBC083B, 0x007D1444, 0x01511740, 0x11DD1F3A, 1888 0x04ED0E2F, 0x1B4B1A62, 0x10480D04, 0x09E911A2, 1889 0x04211AFA, 0x19140893, 0x04D60CC4, 0x01210648 }, 1890 1891 { 0x112703C4, 0x018B1BA1, 0x164C1D50, 0x05160BE0, 1892 0x0BCC1830, 0x01CB1554, 0x13291732, 0x1B2B1918, 1893 0x0DED0817, 0x00E80775, 0x0A2401D3, 0x0BFE08B3, 1894 0x0E531199, 0x058616E9, 0x04770B91, 0x110F0C55, 1895 0x19C11554, 0x0BFB1159, 0x03541C38, 0x000E1C2D }, 1896 1897 { 0x10390C01, 0x02BB0751, 0x0AC5098E, 0x096C17AB, 1898 0x03C90E28, 0x10BD18BF, 0x002E1F2D, 0x092B0986, 1899 0x1BD700AC, 0x002E1F20, 0x1E3D1FD8, 0x077718BB, 1900 0x06F919C4, 0x187407ED, 0x11370E14, 0x081E139C, 1901 0x00481ADB, 0x14AB0289, 0x066A0EBE, 0x00C70ED6 }, 1902 1903 { 0x0694120B, 0x124E1CC9, 0x0E2F0570, 0x17CF081A, 1904 0x078906AC, 0x066D17CF, 0x1B3207F4, 0x0C5705E9, 1905 0x10001C38, 0x00A919DE, 0x06851375, 0x0F900BD8, 1906 0x080401BA, 0x0EEE0D42, 0x1B8B11EA, 0x0B4519F0, 1907 0x090F18C0, 0x062E1508, 0x0DD909F4, 0x01EB067C }, 1908 1909 { 0x0CDC1D5F, 0x0D1818F9, 0x07781636, 0x125B18E8, 1910 0x0D7003AF, 0x13110099, 0x1D9B1899, 0x175C1EB7, 1911 0x0E34171A, 0x01E01153, 0x081A0F36, 0x0B391783, 1912 0x1D1F147E, 0x19CE16D7, 0x11511B21, 0x1F2C10F9, 1913 0x12CA0E51, 0x05A31D39, 0x171A192E, 0x016B0E4F } 1914 }; 1915 1916 /* 1917 * Lookup one of the Gwin[] values, by index. This is constant-time. 1918 */ 1919 static void 1920 lookup_Gwin(p256_jacobian *T, uint32_t idx) 1921 { 1922 uint32_t xy[20]; 1923 uint32_t k; 1924 size_t u; 1925 1926 memset(xy, 0, sizeof xy); 1927 for (k = 0; k < 15; k ++) { 1928 uint32_t m; 1929 1930 m = -EQ(idx, k + 1); 1931 for (u = 0; u < 20; u ++) { 1932 xy[u] |= m & Gwin[k][u]; 1933 } 1934 } 1935 for (u = 0; u < 10; u ++) { 1936 T->x[(u << 1) + 0] = xy[u] & 0xFFFF; 1937 T->x[(u << 1) + 1] = xy[u] >> 16; 1938 T->y[(u << 1) + 0] = xy[u + 10] & 0xFFFF; 1939 T->y[(u << 1) + 1] = xy[u + 10] >> 16; 1940 } 1941 memset(T->z, 0, sizeof T->z); 1942 T->z[0] = 1; 1943 } 1944 1945 /* 1946 * Multiply the generator by an integer. The integer is assumed non-zero 1947 * and lower than the curve order. 1948 */ 1949 static void 1950 p256_mulgen(p256_jacobian *P, const unsigned char *x, size_t xlen) 1951 { 1952 /* 1953 * qz is a flag that is initially 1, and remains equal to 1 1954 * as long as the point is the point at infinity. 1955 * 1956 * We use a 4-bit window to handle multiplier bits by groups 1957 * of 4. The precomputed window is constant static data, with 1958 * points in affine coordinates; we use a constant-time lookup. 1959 */ 1960 p256_jacobian Q; 1961 uint32_t qz; 1962 1963 memset(&Q, 0, sizeof Q); 1964 qz = 1; 1965 while (xlen -- > 0) { 1966 int k; 1967 unsigned bx; 1968 1969 bx = *x ++; 1970 for (k = 0; k < 2; k ++) { 1971 uint32_t bits; 1972 uint32_t bnz; 1973 p256_jacobian T, U; 1974 1975 p256_double(&Q); 1976 p256_double(&Q); 1977 p256_double(&Q); 1978 p256_double(&Q); 1979 bits = (bx >> 4) & 0x0F; 1980 bnz = NEQ(bits, 0); 1981 lookup_Gwin(&T, bits); 1982 U = Q; 1983 p256_add_mixed(&U, &T); 1984 CCOPY(bnz & qz, &Q, &T, sizeof Q); 1985 CCOPY(bnz & ~qz, &Q, &U, sizeof Q); 1986 qz &= ~bnz; 1987 bx <<= 4; 1988 } 1989 } 1990 *P = Q; 1991 } 1992 1993 static const unsigned char P256_G[] = { 1994 0x04, 0x6B, 0x17, 0xD1, 0xF2, 0xE1, 0x2C, 0x42, 0x47, 0xF8, 1995 0xBC, 0xE6, 0xE5, 0x63, 0xA4, 0x40, 0xF2, 0x77, 0x03, 0x7D, 1996 0x81, 0x2D, 0xEB, 0x33, 0xA0, 0xF4, 0xA1, 0x39, 0x45, 0xD8, 1997 0x98, 0xC2, 0x96, 0x4F, 0xE3, 0x42, 0xE2, 0xFE, 0x1A, 0x7F, 1998 0x9B, 0x8E, 0xE7, 0xEB, 0x4A, 0x7C, 0x0F, 0x9E, 0x16, 0x2B, 1999 0xCE, 0x33, 0x57, 0x6B, 0x31, 0x5E, 0xCE, 0xCB, 0xB6, 0x40, 2000 0x68, 0x37, 0xBF, 0x51, 0xF5 2001 }; 2002 2003 static const unsigned char P256_N[] = { 2004 0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF, 2005 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xBC, 0xE6, 0xFA, 0xAD, 2006 0xA7, 0x17, 0x9E, 0x84, 0xF3, 0xB9, 0xCA, 0xC2, 0xFC, 0x63, 2007 0x25, 0x51 2008 }; 2009 2010 static const unsigned char * 2011 api_generator(int curve, size_t *len) 2012 { 2013 (void)curve; 2014 *len = sizeof P256_G; 2015 return P256_G; 2016 } 2017 2018 static const unsigned char * 2019 api_order(int curve, size_t *len) 2020 { 2021 (void)curve; 2022 *len = sizeof P256_N; 2023 return P256_N; 2024 } 2025 2026 static size_t 2027 api_xoff(int curve, size_t *len) 2028 { 2029 (void)curve; 2030 *len = 32; 2031 return 1; 2032 } 2033 2034 static uint32_t 2035 api_mul(unsigned char *G, size_t Glen, 2036 const unsigned char *x, size_t xlen, int curve) 2037 { 2038 uint32_t r; 2039 p256_jacobian P; 2040 2041 (void)curve; 2042 if (Glen != 65) { 2043 return 0; 2044 } 2045 r = p256_decode(&P, G, Glen); 2046 p256_mul(&P, x, xlen); 2047 p256_to_affine(&P); 2048 p256_encode(G, &P); 2049 return r; 2050 } 2051 2052 static size_t 2053 api_mulgen(unsigned char *R, 2054 const unsigned char *x, size_t xlen, int curve) 2055 { 2056 p256_jacobian P; 2057 2058 (void)curve; 2059 p256_mulgen(&P, x, xlen); 2060 p256_to_affine(&P); 2061 p256_encode(R, &P); 2062 return 65; 2063 } 2064 2065 static uint32_t 2066 api_muladd(unsigned char *A, const unsigned char *B, size_t len, 2067 const unsigned char *x, size_t xlen, 2068 const unsigned char *y, size_t ylen, int curve) 2069 { 2070 p256_jacobian P, Q; 2071 uint32_t r, t, z; 2072 int i; 2073 2074 (void)curve; 2075 if (len != 65) { 2076 return 0; 2077 } 2078 r = p256_decode(&P, A, len); 2079 p256_mul(&P, x, xlen); 2080 if (B == NULL) { 2081 p256_mulgen(&Q, y, ylen); 2082 } else { 2083 r &= p256_decode(&Q, B, len); 2084 p256_mul(&Q, y, ylen); 2085 } 2086 2087 /* 2088 * The final addition may fail in case both points are equal. 2089 */ 2090 t = p256_add(&P, &Q); 2091 reduce_final_f256(P.z); 2092 z = 0; 2093 for (i = 0; i < 20; i ++) { 2094 z |= P.z[i]; 2095 } 2096 z = EQ(z, 0); 2097 p256_double(&Q); 2098 2099 /* 2100 * If z is 1 then either P+Q = 0 (t = 1) or P = Q (t = 0). So we 2101 * have the following: 2102 * 2103 * z = 0, t = 0 return P (normal addition) 2104 * z = 0, t = 1 return P (normal addition) 2105 * z = 1, t = 0 return Q (a 'double' case) 2106 * z = 1, t = 1 report an error (P+Q = 0) 2107 */ 2108 CCOPY(z & ~t, &P, &Q, sizeof Q); 2109 p256_to_affine(&P); 2110 p256_encode(A, &P); 2111 r &= ~(z & t); 2112 return r; 2113 } 2114 2115 /* see bearssl_ec.h */ 2116 const br_ec_impl br_ec_p256_m15 = { 2117 (uint32_t)0x00800000, 2118 &api_generator, 2119 &api_order, 2120 &api_xoff, 2121 &api_mul, 2122 &api_mulgen, 2123 &api_muladd 2124 }; 2125