/*
 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "inner.h"

/*
 * If BR_NO_ARITH_SHIFT is undefined, or defined to 0, then we _assume_
 * that right-shifting a signed negative integer copies the sign bit
 * (arithmetic right-shift). This is "implementation-defined behaviour",
 * i.e. it is not undefined, but it may differ between compilers. Each
 * compiler is supposed to document its behaviour in that respect. GCC
 * explicitly defines that an arithmetic right shift is used. We expect
 * all other compilers to do the same, because the underlying CPUs offer
 * an arithmetic right-shift opcode that could not be used otherwise.
 */
#if BR_NO_ARITH_SHIFT
#define ARSH(x, n)    (((uint32_t)(x) >> (n)) \
                      | ((-((uint32_t)(x) >> 31)) << (32 - (n))))
#define ARSHW(x, n)   (((uint64_t)(x) >> (n)) \
                      | ((-((uint64_t)(x) >> 63)) << (64 - (n))))
#else
#define ARSH(x, n)    ((*(int32_t *)&(x)) >> (n))
#define ARSHW(x, n)   ((*(int64_t *)&(x)) >> (n))
#endif

/*
 * Convert an integer from unsigned big-endian encoding to a sequence of
 * 30-bit words in little-endian order. The final "partial" word is
 * returned.
 */
static uint32_t
be8_to_le30(uint32_t *dst, const unsigned char *src, size_t len)
{
	uint32_t acc;
	int acc_len;

	acc = 0;
	acc_len = 0;
	while (len -- > 0) {
		uint32_t b;

		b = src[len];
		if (acc_len < 22) {
			acc |= b << acc_len;
			acc_len += 8;
		} else {
			*dst ++ = (acc | (b << acc_len)) & 0x3FFFFFFF;
			acc = b >> (30 - acc_len);
			acc_len -= 22;
		}
	}
	return acc;
}

/*
 * Convert an integer (30-bit words, little-endian) to unsigned
 * big-endian encoding. The total encoding length is provided; all
 * the destination bytes will be filled.
 */
static void
le30_to_be8(unsigned char *dst, size_t len, const uint32_t *src)
{
	uint32_t acc;
	int acc_len;

	acc = 0;
	acc_len = 0;
	while (len -- > 0) {
		if (acc_len < 8) {
			uint32_t w;

			w = *src ++;
			dst[len] = (unsigned char)(acc | (w << acc_len));
			acc = w >> (8 - acc_len);
			acc_len += 22;
		} else {
			dst[len] = (unsigned char)acc;
			acc >>= 8;
			acc_len -= 8;
		}
	}
}
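
/*
 * Illustrative sketch (not compiled by default) of how the two
 * conversion functions above are meant to be paired: a 32-byte
 * big-endian value becomes nine little-endian words of 30 bits each;
 * the first eight words are written by be8_to_le30() and the ninth
 * (only 16 significant bits for a 256-bit input) is its return value,
 * as done in p256_decode() below. The sample bytes are arbitrary and
 * the function name is ours.
 */
#if 0
static int
example_le30_roundtrip(void)
{
	unsigned char buf[32], buf2[32];
	uint32_t w[9];
	size_t u;

	for (u = 0; u < sizeof buf; u ++) {
		buf[u] = (unsigned char)(17 * u + 3);
	}
	w[8] = be8_to_le30(w, buf, sizeof buf);
	le30_to_be8(buf2, sizeof buf2, w);
	return memcmp(buf, buf2, sizeof buf) == 0;
}
#endif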

/*
 * Multiply two integers. Source integers are represented as arrays of
 * nine 30-bit words, for values up to 2^270-1. Result is encoded over
 * 18 words of 30 bits each.
 */
static void
mul9(uint32_t *d, const uint32_t *a, const uint32_t *b)
{
	/*
	 * Maximum intermediate result is no more than
	 * 10376293531797946367, which fits in 64 bits. Reason:
	 *
	 *   10376293531797946367 = 9 * (2^30-1)^2 + 9663676406
	 *   10376293531797946367 < 9663676407 * 2^30
	 *
	 * Thus, adding together 9 products of 30-bit integers, with
	 * a carry of at most 9663676406, yields an integer that fits
	 * on 64 bits and generates a carry of at most 9663676406.
	 */
	uint64_t t[17];
	uint64_t cc;
	int i;

	t[ 0] = MUL31(a[0], b[0]);
	t[ 1] = MUL31(a[0], b[1])
		+ MUL31(a[1], b[0]);
	t[ 2] = MUL31(a[0], b[2])
		+ MUL31(a[1], b[1])
		+ MUL31(a[2], b[0]);
	t[ 3] = MUL31(a[0], b[3])
		+ MUL31(a[1], b[2])
		+ MUL31(a[2], b[1])
		+ MUL31(a[3], b[0]);
	t[ 4] = MUL31(a[0], b[4])
		+ MUL31(a[1], b[3])
		+ MUL31(a[2], b[2])
		+ MUL31(a[3], b[1])
		+ MUL31(a[4], b[0]);
	t[ 5] = MUL31(a[0], b[5])
		+ MUL31(a[1], b[4])
		+ MUL31(a[2], b[3])
		+ MUL31(a[3], b[2])
		+ MUL31(a[4], b[1])
		+ MUL31(a[5], b[0]);
	t[ 6] = MUL31(a[0], b[6])
		+ MUL31(a[1], b[5])
		+ MUL31(a[2], b[4])
		+ MUL31(a[3], b[3])
		+ MUL31(a[4], b[2])
		+ MUL31(a[5], b[1])
		+ MUL31(a[6], b[0]);
	t[ 7] = MUL31(a[0], b[7])
		+ MUL31(a[1], b[6])
		+ MUL31(a[2], b[5])
		+ MUL31(a[3], b[4])
		+ MUL31(a[4], b[3])
		+ MUL31(a[5], b[2])
		+ MUL31(a[6], b[1])
		+ MUL31(a[7], b[0]);
	t[ 8] = MUL31(a[0], b[8])
		+ MUL31(a[1], b[7])
		+ MUL31(a[2], b[6])
		+ MUL31(a[3], b[5])
		+ MUL31(a[4], b[4])
		+ MUL31(a[5], b[3])
		+ MUL31(a[6], b[2])
		+ MUL31(a[7], b[1])
		+ MUL31(a[8], b[0]);
	t[ 9] = MUL31(a[1], b[8])
		+ MUL31(a[2], b[7])
		+ MUL31(a[3], b[6])
		+ MUL31(a[4], b[5])
		+ MUL31(a[5], b[4])
		+ MUL31(a[6], b[3])
		+ MUL31(a[7], b[2])
		+ MUL31(a[8], b[1]);
	t[10] = MUL31(a[2], b[8])
		+ MUL31(a[3], b[7])
		+ MUL31(a[4], b[6])
		+ MUL31(a[5], b[5])
		+ MUL31(a[6], b[4])
		+ MUL31(a[7], b[3])
		+ MUL31(a[8], b[2]);
	t[11] = MUL31(a[3], b[8])
		+ MUL31(a[4], b[7])
		+ MUL31(a[5], b[6])
		+ MUL31(a[6], b[5])
		+ MUL31(a[7], b[4])
		+ MUL31(a[8], b[3]);
	t[12] = MUL31(a[4], b[8])
		+ MUL31(a[5], b[7])
		+ MUL31(a[6], b[6])
		+ MUL31(a[7], b[5])
		+ MUL31(a[8], b[4]);
	t[13] = MUL31(a[5], b[8])
		+ MUL31(a[6], b[7])
		+ MUL31(a[7], b[6])
		+ MUL31(a[8], b[5]);
	t[14] = MUL31(a[6], b[8])
		+ MUL31(a[7], b[7])
		+ MUL31(a[8], b[6]);
	t[15] = MUL31(a[7], b[8])
		+ MUL31(a[8], b[7]);
	t[16] = MUL31(a[8], b[8]);

	/*
	 * Propagate carries.
	 */
	cc = 0;
	for (i = 0; i < 17; i ++) {
		uint64_t w;

		w = t[i] + cc;
		d[i] = (uint32_t)w & 0x3FFFFFFF;
		cc = w >> 30;
	}
	d[17] = (uint32_t)cc;
}
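
/*
 * Quick numeric check (not compiled by default) of the bound quoted in
 * the comment inside mul9(): nine products of 30-bit words plus the
 * largest claimed carry still fits in 64 bits, and its own carry does
 * not exceed that same claimed maximum, so the bound is self-sustaining
 * across all columns. The function name is ours.
 */
#if 0
static int
example_mul9_bound(void)
{
	uint64_t m, acc;

	/* Largest 30-bit word value. */
	m = (uint64_t)0x3FFFFFFF;
	acc = 9 * m * m + 9663676406ULL;
	return acc == 10376293531797946367ULL
		&& (acc >> 30) <= 9663676406ULL;
}
#endif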

/*
 * Square a 270-bit integer, represented as an array of nine 30-bit words.
 * Result uses 18 words of 30 bits each.
 */
static void
square9(uint32_t *d, const uint32_t *a)
{
	uint64_t t[17];
	uint64_t cc;
	int i;

	t[ 0] = MUL31(a[0], a[0]);
	t[ 1] = ((MUL31(a[0], a[1])) << 1);
	t[ 2] = MUL31(a[1], a[1])
		+ ((MUL31(a[0], a[2])) << 1);
	t[ 3] = ((MUL31(a[0], a[3])
		+ MUL31(a[1], a[2])) << 1);
	t[ 4] = MUL31(a[2], a[2])
		+ ((MUL31(a[0], a[4])
		+ MUL31(a[1], a[3])) << 1);
	t[ 5] = ((MUL31(a[0], a[5])
		+ MUL31(a[1], a[4])
		+ MUL31(a[2], a[3])) << 1);
	t[ 6] = MUL31(a[3], a[3])
		+ ((MUL31(a[0], a[6])
		+ MUL31(a[1], a[5])
		+ MUL31(a[2], a[4])) << 1);
	t[ 7] = ((MUL31(a[0], a[7])
		+ MUL31(a[1], a[6])
		+ MUL31(a[2], a[5])
		+ MUL31(a[3], a[4])) << 1);
	t[ 8] = MUL31(a[4], a[4])
		+ ((MUL31(a[0], a[8])
		+ MUL31(a[1], a[7])
		+ MUL31(a[2], a[6])
		+ MUL31(a[3], a[5])) << 1);
	t[ 9] = ((MUL31(a[1], a[8])
		+ MUL31(a[2], a[7])
		+ MUL31(a[3], a[6])
		+ MUL31(a[4], a[5])) << 1);
	t[10] = MUL31(a[5], a[5])
		+ ((MUL31(a[2], a[8])
		+ MUL31(a[3], a[7])
		+ MUL31(a[4], a[6])) << 1);
	t[11] = ((MUL31(a[3], a[8])
		+ MUL31(a[4], a[7])
		+ MUL31(a[5], a[6])) << 1);
	t[12] = MUL31(a[6], a[6])
		+ ((MUL31(a[4], a[8])
		+ MUL31(a[5], a[7])) << 1);
	t[13] = ((MUL31(a[5], a[8])
		+ MUL31(a[6], a[7])) << 1);
	t[14] = MUL31(a[7], a[7])
		+ ((MUL31(a[6], a[8])) << 1);
	t[15] = ((MUL31(a[7], a[8])) << 1);
	t[16] = MUL31(a[8], a[8]);

	/*
	 * Propagate carries.
	 */
	cc = 0;
	for (i = 0; i < 17; i ++) {
		uint64_t w;

		w = t[i] + cc;
		d[i] = (uint32_t)w & 0x3FFFFFFF;
		cc = w >> 30;
	}
	d[17] = (uint32_t)cc;
}

/*
 * Base field modulus for P-256.
 */
static const uint32_t F256[] = {

	0x3FFFFFFF, 0x3FFFFFFF, 0x3FFFFFFF, 0x0000003F, 0x00000000,
	0x00000000, 0x00001000, 0x3FFFC000, 0x0000FFFF
};

/*
 * The 'b' curve equation coefficient for P-256.
 */
static const uint32_t P256_B[] = {

	0x27D2604B, 0x2F38F0F8, 0x053B0F63, 0x0741AC33, 0x1886BC65,
	0x2EF555DA, 0x293E7B3E, 0x0D762A8E, 0x00005AC6
};

/*
 * Addition in the field. Source operands shall fit on 257 bits; output
 * will be lower than twice the modulus.
 */
static void
add_f256(uint32_t *d, const uint32_t *a, const uint32_t *b)
{
	uint32_t w, cc;
	int i;

	cc = 0;
	for (i = 0; i < 9; i ++) {
		w = a[i] + b[i] + cc;
		d[i] = w & 0x3FFFFFFF;
		cc = w >> 30;
	}
	w >>= 16;
	d[8] &= 0xFFFF;
	d[3] -= w << 6;
	d[6] -= w << 12;
	d[7] += w << 14;
	cc = w;
	for (i = 0; i < 9; i ++) {
		w = d[i] + cc;
		d[i] = w & 0x3FFFFFFF;
		cc = ARSH(w, 30);
	}
}
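
/*
 * Illustrative sketch (not compiled by default) of how the F256[]
 * constant above encodes p = 2^256 - 2^224 + 2^192 + 2^96 - 1 over
 * 30-bit limbs: the bits at positions 0..95, 192 and 224..255 are set
 * and all others are clear. The function name is ours.
 */
#if 0
static int
example_f256_layout(void)
{
	uint32_t limbs[9];
	int bit;

	memset(limbs, 0, sizeof limbs);
	for (bit = 0; bit < 256; bit ++) {
		if (bit <= 95 || bit == 192 || bit >= 224) {
			limbs[bit / 30] |= (uint32_t)1 << (bit % 30);
		}
	}
	return memcmp(limbs, F256, sizeof limbs) == 0;
}
#endif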

/*
 * Subtraction in the field. Source operands shall be smaller than twice
 * the modulus; the result will fulfil the same property.
 */
static void
sub_f256(uint32_t *d, const uint32_t *a, const uint32_t *b)
{
	uint32_t w, cc;
	int i;

	/*
	 * We really compute a - b + 2*p to make sure that the result is
	 * positive.
	 */
	w = a[0] - b[0] - 0x00002;
	d[0] = w & 0x3FFFFFFF;
	w = a[1] - b[1] + ARSH(w, 30);
	d[1] = w & 0x3FFFFFFF;
	w = a[2] - b[2] + ARSH(w, 30);
	d[2] = w & 0x3FFFFFFF;
	w = a[3] - b[3] + ARSH(w, 30) + 0x00080;
	d[3] = w & 0x3FFFFFFF;
	w = a[4] - b[4] + ARSH(w, 30);
	d[4] = w & 0x3FFFFFFF;
	w = a[5] - b[5] + ARSH(w, 30);
	d[5] = w & 0x3FFFFFFF;
	w = a[6] - b[6] + ARSH(w, 30) + 0x02000;
	d[6] = w & 0x3FFFFFFF;
	w = a[7] - b[7] + ARSH(w, 30) - 0x08000;
	d[7] = w & 0x3FFFFFFF;
	w = a[8] - b[8] + ARSH(w, 30) + 0x20000;
	d[8] = w & 0xFFFF;
	w >>= 16;
	d[8] &= 0xFFFF;
	d[3] -= w << 6;
	d[6] -= w << 12;
	d[7] += w << 14;
	cc = w;
	for (i = 0; i < 9; i ++) {
		w = d[i] + cc;
		d[i] = w & 0x3FFFFFFF;
		cc = ARSH(w, 30);
	}
}

/*
 * Compute a multiplication in F256. Source operands shall be less than
 * twice the modulus.
 */
static void
mul_f256(uint32_t *d, const uint32_t *a, const uint32_t *b)
{
	uint32_t t[18];
	uint64_t s[18];
	uint64_t cc, x;
	uint32_t z, c;
	int i;

	mul9(t, a, b);

	/*
	 * Modular reduction: each high word is added/subtracted where
	 * necessary.
	 *
	 * The modulus is:
	 *   p = 2^256 - 2^224 + 2^192 + 2^96 - 1
	 * Therefore:
	 *   2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
	 *
	 * For a word x at bit offset n (n >= 256), we have:
	 *   x*2^n = x*2^(n-32) - x*2^(n-64)
	 *           - x*2^(n-160) + x*2^(n-256) mod p
	 *
	 * Thus, we can nullify the high word if we reinject it at the
	 * proper places.
	 *
	 * We use 64-bit intermediate words to allow for carries to
	 * accumulate easily, before performing the final propagation.
	 */
	for (i = 0; i < 18; i ++) {
		s[i] = t[i];
	}

	for (i = 17; i >= 9; i --) {
		uint64_t y;

		y = s[i];
		s[i - 1] += ARSHW(y, 2);
		s[i - 2] += (y << 28) & 0x3FFFFFFF;
		s[i - 2] -= ARSHW(y, 4);
		s[i - 3] -= (y << 26) & 0x3FFFFFFF;
		s[i - 5] -= ARSHW(y, 10);
		s[i - 6] -= (y << 20) & 0x3FFFFFFF;
		s[i - 8] += ARSHW(y, 16);
		s[i - 9] += (y << 14) & 0x3FFFFFFF;
	}

	/*
	 * Carry propagation must be signed. Moreover, we may have
	 * overdone it a bit, and obtained a negative result.
	 *
	 * The loop above ran 9 times; each time, each word was augmented
	 * by at most one extra word (in absolute value). Thus, the top
	 * word must ultimately fit on 39 bits, so the carry below will
	 * fit on 9 bits.
	 */
	cc = 0;
	for (i = 0; i < 9; i ++) {
		x = s[i] + cc;
		d[i] = (uint32_t)x & 0x3FFFFFFF;
		cc = ARSHW(x, 30);
	}

	/*
	 * All nine words fit on 30 bits, but there may be an extra
	 * carry for a few bits (at most 9), and that carry may be
	 * negative. Moreover, we want the result to fit on 257 bits.
	 * The two lines below ensure that the value in d[] fits on
	 * 256 bits, and the (signed) carry (beyond 2^256) is in cc. The
	 * significant length of cc is less than 24 bits, so we will be
	 * able to switch to 32-bit operations.
	 */
	cc = ARSHW(x, 16);
	d[8] &= 0xFFFF;

	/*
	 * One extra round of reduction, for cc*2^256, which means
	 * adding cc*(2^224-2^192-2^96+1) to a 256-bit (nonnegative)
	 * value. If cc is negative, then it may happen (rarely, but
	 * not negligibly so) that the result would be negative. In
	 * order to avoid that, if cc is negative, then we add the
	 * modulus once. Note that if cc is negative, then propagating
	 * that carry must yield a value lower than the modulus, so
	 * adding the modulus once will keep the final result under
	 * twice the modulus.
	 */
	z = (uint32_t)cc;
	d[3] -= z << 6;
	d[6] -= (z << 12) & 0x3FFFFFFF;
	d[7] -= ARSH(z, 18);
	d[7] += (z << 14) & 0x3FFFFFFF;
	d[8] += ARSH(z, 16);
	c = z >> 31;
	d[0] -= c;
	d[3] += c << 6;
	d[6] += c << 12;
	d[7] -= c << 14;
	d[8] += c << 16;
	for (i = 0; i < 9; i ++) {
		uint32_t w;

		w = d[i] + z;
		d[i] = w & 0x3FFFFFFF;
		z = ARSH(w, 30);
	}
}
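
/*
 * Sanity derivation for the shift counts used in the reduction loop of
 * mul_f256() above (and square_f256() below): a word y at limb index
 * i >= 9 sits at bit offset 30*i, and:
 *
 *   y*2^(30*i) = y*2^(30*i-32) - y*2^(30*i-64)
 *                - y*2^(30*i-160) + y*2^(30*i-256)  mod p
 *
 * Rewriting each bit offset relative to a limb boundary:
 *
 *   30*i - 32  = 30*(i-1) - 2  = 30*(i-2) + 28
 *   30*i - 64  = 30*(i-2) - 4  = 30*(i-3) + 26
 *   30*i - 160 = 30*(i-5) - 10 = 30*(i-6) + 20
 *   30*i - 256 = 30*(i-8) - 16 = 30*(i-9) + 14
 *
 * which is exactly the (ARSHW(y, 2), y << 28), (ARSHW(y, 4), y << 26),
 * (ARSHW(y, 10), y << 20) and (ARSHW(y, 16), y << 14) pairs applied to
 * limbs i-1/i-2, i-2/i-3, i-5/i-6 and i-8/i-9 in that loop.
 */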

/*
 * Compute a square in F256. Source operand shall be less than
 * twice the modulus.
 */
static void
square_f256(uint32_t *d, const uint32_t *a)
{
	uint32_t t[18];
	uint64_t s[18];
	uint64_t cc, x;
	uint32_t z, c;
	int i;

	square9(t, a);

	/*
	 * Modular reduction: each high word is added/subtracted where
	 * necessary.
	 *
	 * The modulus is:
	 *   p = 2^256 - 2^224 + 2^192 + 2^96 - 1
	 * Therefore:
	 *   2^256 = 2^224 - 2^192 - 2^96 + 1 mod p
	 *
	 * For a word x at bit offset n (n >= 256), we have:
	 *   x*2^n = x*2^(n-32) - x*2^(n-64)
	 *           - x*2^(n-160) + x*2^(n-256) mod p
	 *
	 * Thus, we can nullify the high word if we reinject it at the
	 * proper places.
	 *
	 * We use 64-bit intermediate words to allow for carries to
	 * accumulate easily, before performing the final propagation.
	 */
	for (i = 0; i < 18; i ++) {
		s[i] = t[i];
	}

	for (i = 17; i >= 9; i --) {
		uint64_t y;

		y = s[i];
		s[i - 1] += ARSHW(y, 2);
		s[i - 2] += (y << 28) & 0x3FFFFFFF;
		s[i - 2] -= ARSHW(y, 4);
		s[i - 3] -= (y << 26) & 0x3FFFFFFF;
		s[i - 5] -= ARSHW(y, 10);
		s[i - 6] -= (y << 20) & 0x3FFFFFFF;
		s[i - 8] += ARSHW(y, 16);
		s[i - 9] += (y << 14) & 0x3FFFFFFF;
	}

	/*
	 * Carry propagation must be signed. Moreover, we may have
	 * overdone it a bit, and obtained a negative result.
	 *
	 * The loop above ran 9 times; each time, each word was augmented
	 * by at most one extra word (in absolute value). Thus, the top
	 * word must ultimately fit on 39 bits, so the carry below will
	 * fit on 9 bits.
	 */
	cc = 0;
	for (i = 0; i < 9; i ++) {
		x = s[i] + cc;
		d[i] = (uint32_t)x & 0x3FFFFFFF;
		cc = ARSHW(x, 30);
	}

	/*
	 * All nine words fit on 30 bits, but there may be an extra
	 * carry for a few bits (at most 9), and that carry may be
	 * negative. Moreover, we want the result to fit on 257 bits.
	 * The two lines below ensure that the value in d[] fits on
	 * 256 bits, and the (signed) carry (beyond 2^256) is in cc. The
	 * significant length of cc is less than 24 bits, so we will be
	 * able to switch to 32-bit operations.
	 */
	cc = ARSHW(x, 16);
	d[8] &= 0xFFFF;

	/*
	 * One extra round of reduction, for cc*2^256, which means
	 * adding cc*(2^224-2^192-2^96+1) to a 256-bit (nonnegative)
	 * value. If cc is negative, then it may happen (rarely, but
	 * not negligibly so) that the result would be negative. In
	 * order to avoid that, if cc is negative, then we add the
	 * modulus once. Note that if cc is negative, then propagating
	 * that carry must yield a value lower than the modulus, so
	 * adding the modulus once will keep the final result under
	 * twice the modulus.
	 */
	z = (uint32_t)cc;
	d[3] -= z << 6;
	d[6] -= (z << 12) & 0x3FFFFFFF;
	d[7] -= ARSH(z, 18);
	d[7] += (z << 14) & 0x3FFFFFFF;
	d[8] += ARSH(z, 16);
	c = z >> 31;
	d[0] -= c;
	d[3] += c << 6;
	d[6] += c << 12;
	d[7] -= c << 14;
	d[8] += c << 16;
	for (i = 0; i < 9; i ++) {
		uint32_t w;

		w = d[i] + z;
		d[i] = w & 0x3FFFFFFF;
		z = ARSH(w, 30);
	}
}
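
/*
 * Illustrative consistency check (not compiled by default): squaring
 * with square_f256() should match the generic mul_f256() on the same
 * operand, since both build the same 18-word product and then run the
 * same reduction, so the outputs are expected to be identical word for
 * word. P256_B is used merely as a convenient in-range sample value,
 * and the function name is ours.
 */
#if 0
static int
example_square_vs_mul(void)
{
	uint32_t u[9], v[9];

	square_f256(u, P256_B);
	mul_f256(v, P256_B, P256_B);
	return memcmp(u, v, sizeof u) == 0;
}
#endif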

/*
 * Perform a "final reduction" in field F256 (field for curve P-256).
 * The source value must be less than twice the modulus. If the value
 * is not lower than the modulus, then the modulus is subtracted and
 * this function returns 1; otherwise, it leaves it untouched and it
 * returns 0.
 */
static uint32_t
reduce_final_f256(uint32_t *d)
{
	uint32_t t[9];
	uint32_t cc;
	int i;

	cc = 0;
	for (i = 0; i < 9; i ++) {
		uint32_t w;

		w = d[i] - F256[i] - cc;
		cc = w >> 31;
		t[i] = w & 0x3FFFFFFF;
	}
	cc ^= 1;
	CCOPY(cc, d, t, sizeof t);
	return cc;
}

/*
 * Jacobian coordinates for a point in P-256: affine coordinates (X,Y)
 * are such that:
 *   X = x / z^2
 *   Y = y / z^3
 * For the point at infinity, z = 0.
 * Each point thus admits many possible representations.
 *
 * Coordinates are represented in arrays of 32-bit integers, each holding
 * 30 bits of data. Values may also be slightly greater than the modulus,
 * but they will always be lower than twice the modulus.
 */
typedef struct {
	uint32_t x[9];
	uint32_t y[9];
	uint32_t z[9];
} p256_jacobian;

/*
 * Convert a point to affine coordinates:
 *  - If the point is the point at infinity, then all three coordinates
 *    are set to 0.
 *  - Otherwise, the 'z' coordinate is set to 1, and the 'x' and 'y'
 *    coordinates are the 'X' and 'Y' affine coordinates.
 * The coordinates are guaranteed to be lower than the modulus.
 */
static void
p256_to_affine(p256_jacobian *P)
{
	uint32_t t1[9], t2[9];
	int i;

	/*
	 * Invert z with a modular exponentiation: the modulus is
	 * p = 2^256 - 2^224 + 2^192 + 2^96 - 1, and the exponent is
	 * p-2. Exponent bit pattern (from high to low) is:
	 *  - 32 bits of value 1
	 *  - 31 bits of value 0
	 *  - 1 bit of value 1
	 *  - 96 bits of value 0
	 *  - 94 bits of value 1
	 *  - 1 bit of value 0
	 *  - 1 bit of value 1
	 * Thus, we precompute z^(2^31-1) to speed things up.
	 *
	 * If z = 0 (point at infinity) then the modular exponentiation
	 * will yield 0, which leads to the expected result (all three
	 * coordinates set to 0).
	 */

	/*
	 * A simple square-and-multiply for z^(2^31-1). We could save about
	 * two dozen multiplications here with an addition chain, but
	 * this would require a bit more code, and extra stack buffers.
	 */
	memcpy(t1, P->z, sizeof P->z);
	for (i = 0; i < 30; i ++) {
		square_f256(t1, t1);
		mul_f256(t1, t1, P->z);
	}

	/*
	 * Square-and-multiply. Apart from the squarings, we have a few
	 * multiplications to set bits to 1; we multiply by the original z
	 * for setting 1 bit, and by t1 for setting 31 bits.
	 */
	memcpy(t2, P->z, sizeof P->z);
	for (i = 1; i < 256; i ++) {
		square_f256(t2, t2);
		switch (i) {
		case 31:
		case 190:
		case 221:
		case 252:
			mul_f256(t2, t2, t1);
			break;
		case 63:
		case 253:
		case 255:
			mul_f256(t2, t2, P->z);
			break;
		}
	}

	/*
	 * Now that we have 1/z, multiply x by 1/z^2 and y by 1/z^3.
	 */
	mul_f256(t1, t2, t2);
	mul_f256(P->x, t1, P->x);
	mul_f256(t1, t1, t2);
	mul_f256(P->y, t1, P->y);
	reduce_final_f256(P->x);
	reduce_final_f256(P->y);

	/*
	 * Multiply z by 1/z. If z = 0, then this will yield 0, otherwise
	 * this will set z to 1.
	 */
	mul_f256(P->z, P->z, t2);
	reduce_final_f256(P->z);
}
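
/*
 * Exponent accounting for the inversion in p256_to_affine(): the
 * accumulator starts at z and is squared 255 times, so a multiply by z
 * at iteration i contributes 2^(255-i) to the final exponent, and a
 * multiply by t1 = z^(2^31-1) contributes (2^31-1)*2^(255-i). With the
 * iterations used above:
 *
 *   2^255 + (2^31-1)*(2^224 + 2^65 + 2^34 + 2^3) + 2^192 + 2^2 + 2^0
 *     = 2^256 - 2^224 + 2^192 + 2^96 - 3
 *     = p - 2
 *
 * which is the expected Fermat exponent for inversion modulo p.
 */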

/*
 * Double a point in P-256. This function works for all valid points,
 * including the point at infinity.
 */
static void
p256_double(p256_jacobian *Q)
{
	/*
	 * Doubling formulas are:
	 *
	 *   s = 4*x*y^2
	 *   m = 3*(x + z^2)*(x - z^2)
	 *   x' = m^2 - 2*s
	 *   y' = m*(s - x') - 8*y^4
	 *   z' = 2*y*z
	 *
	 * These formulas work for all points, including points of order 2
	 * and points at infinity:
	 *  - If y = 0 then z' = 0. But there is no such point in P-256
	 *    anyway.
	 *  - If z = 0 then z' = 0.
	 */
	uint32_t t1[9], t2[9], t3[9], t4[9];

	/*
	 * Compute z^2 in t1.
	 */
	square_f256(t1, Q->z);

	/*
	 * Compute x+z^2 in t2 and x-z^2 in t1.
	 */
	add_f256(t2, Q->x, t1);
	sub_f256(t1, Q->x, t1);

	/*
	 * Compute 3*(x+z^2)*(x-z^2) in t1.
	 */
	mul_f256(t3, t1, t2);
	add_f256(t1, t3, t3);
	add_f256(t1, t3, t1);

	/*
	 * Compute 4*x*y^2 (in t2) and 2*y^2 (in t3).
	 */
	square_f256(t3, Q->y);
	add_f256(t3, t3, t3);
	mul_f256(t2, Q->x, t3);
	add_f256(t2, t2, t2);

	/*
	 * Compute x' = m^2 - 2*s.
	 */
	square_f256(Q->x, t1);
	sub_f256(Q->x, Q->x, t2);
	sub_f256(Q->x, Q->x, t2);

	/*
	 * Compute z' = 2*y*z.
	 */
	mul_f256(t4, Q->y, Q->z);
	add_f256(Q->z, t4, t4);

	/*
	 * Compute y' = m*(s - x') - 8*y^4. Note that we already have
	 * 2*y^2 in t3.
	 */
	sub_f256(t2, t2, Q->x);
	mul_f256(Q->y, t1, t2);
	square_f256(t4, t3);
	add_f256(t4, t4, t4);
	sub_f256(Q->y, Q->y, t4);
}
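
/*
 * Worked identity behind the 'm' used in p256_double(): the generic
 * Jacobian doubling uses m = 3*x^2 + a*z^4, and P-256 has a = -3, so
 *
 *   3*(x + z^2)*(x - z^2) = 3*x^2 - 3*z^4 = 3*x^2 + a*z^4
 *
 * which is why the (x + z^2)*(x - z^2) factorisation is valid here; it
 * would not be for a curve with a != -3.
 */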

/*
 * Add point P2 to point P1.
 *
 * This function computes the wrong result in the following cases:
 *
 *  - If P1 == 0 but P2 != 0
 *  - If P1 != 0 but P2 == 0
 *  - If P1 == P2
 *
 * In all three cases, P1 is set to the point at infinity.
 *
 * Returned value is 0 if one of the following occurs:
 *
 *  - P1 and P2 have the same Y coordinate
 *  - P1 == 0 and P2 == 0
 *  - The Y coordinate of one of the points is 0 and the other point is
 *    the point at infinity.
 *
 * The third case cannot actually happen with valid points, since a point
 * with Y == 0 is a point of order 2, and there is no point of order 2 on
 * curve P-256.
 *
 * Therefore, assuming that P1 != 0 and P2 != 0 on input, then the caller
 * can apply the following:
 *
 *  - If the result is not the point at infinity, then it is correct.
 *  - Otherwise, if the returned value is 1, then this is a case of
 *    P1+P2 == 0, so the result is indeed the point at infinity.
 *  - Otherwise, P1 == P2, so a "double" operation should have been
 *    performed.
 */
static uint32_t
p256_add(p256_jacobian *P1, const p256_jacobian *P2)
{
	/*
	 * Addition formulas are:
	 *
	 *   u1 = x1 * z2^2
	 *   u2 = x2 * z1^2
	 *   s1 = y1 * z2^3
	 *   s2 = y2 * z1^3
	 *   h = u2 - u1
	 *   r = s2 - s1
	 *   x3 = r^2 - h^3 - 2 * u1 * h^2
	 *   y3 = r * (u1 * h^2 - x3) - s1 * h^3
	 *   z3 = h * z1 * z2
	 */
	uint32_t t1[9], t2[9], t3[9], t4[9], t5[9], t6[9], t7[9];
	uint32_t ret;
	int i;

	/*
	 * Compute u1 = x1*z2^2 (in t1) and s1 = y1*z2^3 (in t3).
	 */
	square_f256(t3, P2->z);
	mul_f256(t1, P1->x, t3);
	mul_f256(t4, P2->z, t3);
	mul_f256(t3, P1->y, t4);

	/*
	 * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
	 */
	square_f256(t4, P1->z);
	mul_f256(t2, P2->x, t4);
	mul_f256(t5, P1->z, t4);
	mul_f256(t4, P2->y, t5);

	/*
	 * Compute h = u2 - u1 (in t2) and r = s2 - s1 (in t4).
	 * We need to test whether r is zero, so we perform an extra
	 * reduction.
	 */
	sub_f256(t2, t2, t1);
	sub_f256(t4, t4, t3);
	reduce_final_f256(t4);
	ret = 0;
	for (i = 0; i < 9; i ++) {
		ret |= t4[i];
	}
	ret = (ret | -ret) >> 31;

	/*
	 * Compute u1*h^2 (in t6) and h^3 (in t5).
	 */
	square_f256(t7, t2);
	mul_f256(t6, t1, t7);
	mul_f256(t5, t7, t2);

	/*
	 * Compute x3 = r^2 - h^3 - 2*u1*h^2.
	 */
	square_f256(P1->x, t4);
	sub_f256(P1->x, P1->x, t5);
	sub_f256(P1->x, P1->x, t6);
	sub_f256(P1->x, P1->x, t6);

	/*
	 * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
	 */
	sub_f256(t6, t6, P1->x);
	mul_f256(P1->y, t4, t6);
	mul_f256(t1, t5, t3);
	sub_f256(P1->y, P1->y, t1);

	/*
	 * Compute z3 = h*z1*z2.
	 */
	mul_f256(t1, P1->z, P2->z);
	mul_f256(P1->z, t1, t2);

	return ret;
}
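
/*
 * Note on the returned value of p256_add() (and p256_add_mixed()
 * below): after the accumulation loop, 'ret' is the OR of all words of
 * the fully reduced r, so it is zero exactly when r is zero. The
 * expression (ret | -ret) >> 31 then normalises it to a strict 0/1
 * flag in constant time: for any non-zero 32-bit value, either the
 * value or its two's complement negation has the top bit set, while
 * for zero both are zero.
 */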

/*
 * Add point P2 to point P1. This is a specialised function for the
 * case when P2 is a non-zero point in affine coordinates.
 *
 * This function computes the wrong result in the following cases:
 *
 *  - If P1 == 0
 *  - If P1 == P2
 *
 * In both cases, P1 is set to the point at infinity.
 *
 * Returned value is 0 if one of the following occurs:
 *
 *  - P1 and P2 have the same Y coordinate
 *  - The Y coordinate of P2 is 0 and P1 is the point at infinity.
 *
 * The second case cannot actually happen with valid points, since a point
 * with Y == 0 is a point of order 2, and there is no point of order 2 on
 * curve P-256.
 *
 * Therefore, assuming that P1 != 0 on input, then the caller
 * can apply the following:
 *
 *  - If the result is not the point at infinity, then it is correct.
 *  - Otherwise, if the returned value is 1, then this is a case of
 *    P1+P2 == 0, so the result is indeed the point at infinity.
 *  - Otherwise, P1 == P2, so a "double" operation should have been
 *    performed.
 */
static uint32_t
p256_add_mixed(p256_jacobian *P1, const p256_jacobian *P2)
{
	/*
	 * Addition formulas are:
	 *
	 *   u1 = x1
	 *   u2 = x2 * z1^2
	 *   s1 = y1
	 *   s2 = y2 * z1^3
	 *   h = u2 - u1
	 *   r = s2 - s1
	 *   x3 = r^2 - h^3 - 2 * u1 * h^2
	 *   y3 = r * (u1 * h^2 - x3) - s1 * h^3
	 *   z3 = h * z1
	 */
	uint32_t t1[9], t2[9], t3[9], t4[9], t5[9], t6[9], t7[9];
	uint32_t ret;
	int i;

	/*
	 * Compute u1 = x1 (in t1) and s1 = y1 (in t3).
	 */
	memcpy(t1, P1->x, sizeof t1);
	memcpy(t3, P1->y, sizeof t3);

	/*
	 * Compute u2 = x2*z1^2 (in t2) and s2 = y2*z1^3 (in t4).
	 */
	square_f256(t4, P1->z);
	mul_f256(t2, P2->x, t4);
	mul_f256(t5, P1->z, t4);
	mul_f256(t4, P2->y, t5);

	/*
	 * Compute h = u2 - u1 (in t2) and r = s2 - s1 (in t4).
	 * We need to test whether r is zero, so we perform an extra
	 * reduction.
	 */
	sub_f256(t2, t2, t1);
	sub_f256(t4, t4, t3);
	reduce_final_f256(t4);
	ret = 0;
	for (i = 0; i < 9; i ++) {
		ret |= t4[i];
	}
	ret = (ret | -ret) >> 31;

	/*
	 * Compute u1*h^2 (in t6) and h^3 (in t5).
	 */
	square_f256(t7, t2);
	mul_f256(t6, t1, t7);
	mul_f256(t5, t7, t2);

	/*
	 * Compute x3 = r^2 - h^3 - 2*u1*h^2.
	 */
	square_f256(P1->x, t4);
	sub_f256(P1->x, P1->x, t5);
	sub_f256(P1->x, P1->x, t6);
	sub_f256(P1->x, P1->x, t6);

	/*
	 * Compute y3 = r*(u1*h^2 - x3) - s1*h^3.
	 */
	sub_f256(t6, t6, P1->x);
	mul_f256(P1->y, t4, t6);
	mul_f256(t1, t5, t3);
	sub_f256(P1->y, P1->y, t1);

	/*
	 * Compute z3 = h*z1.
	 */
	mul_f256(P1->z, P1->z, t2);

	return ret;
}

/*
 * Decode a P-256 point. This function does not support the point at
 * infinity. Returned value is 0 if the point is invalid, 1 otherwise.
 */
static uint32_t
p256_decode(p256_jacobian *P, const void *src, size_t len)
{
	const unsigned char *buf;
	uint32_t tx[9], ty[9], t1[9], t2[9];
	uint32_t bad;
	int i;

	if (len != 65) {
		return 0;
	}
	buf = src;

	/*
	 * First byte must be 0x04 (uncompressed format). We could support
	 * "hybrid format" (first byte is 0x06 or 0x07, and encodes the
	 * least significant bit of the Y coordinate), but it is explicitly
	 * forbidden by RFC 5480 (section 2.2).
	 */
	bad = NEQ(buf[0], 0x04);

	/*
	 * Decode the coordinates, and check that they are both lower
	 * than the modulus.
	 */
	tx[8] = be8_to_le30(tx, buf + 1, 32);
	ty[8] = be8_to_le30(ty, buf + 33, 32);
	bad |= reduce_final_f256(tx);
	bad |= reduce_final_f256(ty);

	/*
	 * Check curve equation.
	 */
	square_f256(t1, tx);
	mul_f256(t1, tx, t1);
	square_f256(t2, ty);
	sub_f256(t1, t1, tx);
	sub_f256(t1, t1, tx);
	sub_f256(t1, t1, tx);
	add_f256(t1, t1, P256_B);
	sub_f256(t1, t1, t2);
	reduce_final_f256(t1);
	for (i = 0; i < 9; i ++) {
		bad |= t1[i];
	}

	/*
	 * Copy coordinates to the point structure.
	 */
	memcpy(P->x, tx, sizeof tx);
	memcpy(P->y, ty, sizeof ty);
	memset(P->z, 0, sizeof P->z);
	P->z[0] = 1;
	return EQ(bad, 0);
}

/*
 * Encode a point into a buffer. This function assumes that the point is
 * valid, in affine coordinates, and not the point at infinity.
 */
static void
p256_encode(void *dst, const p256_jacobian *P)
{
	unsigned char *buf;

	buf = dst;
	buf[0] = 0x04;
	le30_to_be8(buf + 1, 32, P->x);
	le30_to_be8(buf + 33, 32, P->y);
}

/*
 * Multiply a curve point by an integer. The integer is assumed to be
 * lower than the curve order, and the base point must not be the point
 * at infinity.
 */
static void
p256_mul(p256_jacobian *P, const unsigned char *x, size_t xlen)
{
	/*
	 * qz is a flag that is initially 1, and remains equal to 1
	 * as long as the point is the point at infinity.
	 *
	 * We use a 2-bit window to handle multiplier bits by pairs.
	 * The precomputed window consists of the points P2 = 2*P and
	 * P3 = 3*P.
	 */
	uint32_t qz;
	p256_jacobian P2, P3, Q, T, U;

	/*
	 * Compute window values.
	 */
	P2 = *P;
	p256_double(&P2);
	P3 = *P;
	p256_add(&P3, &P2);

	/*
	 * We start with Q = 0. We process multiplier bits 2 by 2.
	 */
	memset(&Q, 0, sizeof Q);
	qz = 1;
	while (xlen -- > 0) {
		int k;

		for (k = 6; k >= 0; k -= 2) {
			uint32_t bits;
			uint32_t bnz;

			p256_double(&Q);
			p256_double(&Q);
			T = *P;
			U = Q;
			bits = (*x >> k) & (uint32_t)3;
			bnz = NEQ(bits, 0);
			CCOPY(EQ(bits, 2), &T, &P2, sizeof T);
			CCOPY(EQ(bits, 3), &T, &P3, sizeof T);
			p256_add(&U, &T);
			CCOPY(bnz & qz, &Q, &T, sizeof Q);
			CCOPY(bnz & ~qz, &Q, &U, sizeof Q);
			qz &= ~bnz;
		}
		x ++;
	}
	*P = Q;
}
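
/*
 * Worked example of the 2-bit window in p256_mul(): for a multiplier
 * byte 0xB4 = binary 10 11 01 00, the pairs are consumed most
 * significant first, i.e. bits = 2, 3, 1, 0, so the accumulator goes
 * through
 *
 *   Q <- 4*Q + 2*P
 *   Q <- 4*Q + 3*P
 *   Q <- 4*Q + P
 *   Q <- 4*Q
 *
 * which multiplies the previous contents of Q by 256 and adds 0xB4
 * times P. The CCOPY() calls perform those selections in constant
 * time, and qz tracks whether Q is still the point at infinity (in
 * which case the "addition" is a plain copy of T).
 */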

/*
 * Precomputed window: k*G points, where G is the curve generator, and k
 * is an integer from 1 to 15 (inclusive). The X and Y coordinates of
 * the point are encoded as 9 words of 30 bits each (little-endian
 * order).
 */
static const uint32_t Gwin[15][18] = {

	{ 0x1898C296, 0x1284E517, 0x1EB33A0F, 0x00DF604B,
	  0x2440F277, 0x339B958E, 0x04247F8B, 0x347CB84B,
	  0x00006B17, 0x37BF51F5, 0x2ED901A0, 0x3315ECEC,
	  0x338CD5DA, 0x0F9E162B, 0x1FAD29F0, 0x27F9B8EE,
	  0x10B8BF86, 0x00004FE3 },

	{ 0x07669978, 0x182D23F1, 0x3F21B35A, 0x225A789D,
	  0x351AC3C0, 0x08E00C12, 0x34F7E8A5, 0x1EC62340,
	  0x00007CF2, 0x227873D1, 0x3812DE74, 0x0E982299,
	  0x1F6B798F, 0x3430DBBA, 0x366B1A7D, 0x2D040293,
	  0x154436E3, 0x00000777 },

	{ 0x06E7FD6C, 0x2D05986F, 0x3ADA985F, 0x31ADC87B,
	  0x0BF165E6, 0x1FBE5475, 0x30A44C8F, 0x3934698C,
	  0x00005ECB, 0x227D5032, 0x29E6C49E, 0x04FB83D9,
	  0x0AAC0D8E, 0x24A2ECD8, 0x2C1B3869, 0x0FF7E374,
	  0x19031266, 0x00008734 },

	{ 0x2B030852, 0x024C0911, 0x05596EF5, 0x07F8B6DE,
	  0x262BD003, 0x3779967B, 0x08FBBA02, 0x128D4CB4,
	  0x0000E253, 0x184ED8C6, 0x310B08FC, 0x30EE0055,
	  0x3F25B0FC, 0x062D764E, 0x3FB97F6A, 0x33CC719D,
	  0x15D69318, 0x0000E0F1 },

	{ 0x03D033ED, 0x05552837, 0x35BE5242, 0x2320BF47,
	  0x268FDFEF, 0x13215821, 0x140D2D78, 0x02DE9454,
	  0x00005159, 0x3DA16DA4, 0x0742ED13, 0x0D80888D,
	  0x004BC035, 0x0A79260D, 0x06FCDAFE, 0x2727D8AE,
	  0x1F6A2412, 0x0000E0C1 },

	{ 0x3C2291A9, 0x1AC2ABA4, 0x3B215B4C, 0x131D037A,
	  0x17DDE302, 0x0C90B2E2, 0x0602C92D, 0x05CA9DA9,
	  0x0000B01A, 0x0FC77FE2, 0x35F1214E, 0x07E16BDF,
	  0x003DDC07, 0x2703791C, 0x3038B7EE, 0x3DAD56FE,
	  0x041D0C8D, 0x0000E85C },

	{ 0x3187B2A3, 0x0018A1C0, 0x00FEF5B3, 0x3E7E2E2A,
	  0x01FB607E, 0x2CC199F0, 0x37B4625B, 0x0EDBE82F,
	  0x00008E53, 0x01F400B4, 0x15786A1B, 0x3041B21C,
	  0x31CD8CF2, 0x35900053, 0x1A7E0E9B, 0x318366D0,
	  0x076F780C, 0x000073EB },

	{ 0x1B6FB393, 0x13767707, 0x3CE97DBB, 0x348E2603,
	  0x354CADC1, 0x09D0B4EA, 0x1B053404, 0x1DE76FBA,
	  0x000062D9, 0x0F09957E, 0x295029A8, 0x3E76A78D,
	  0x3B547DAE, 0x27CEE0A2, 0x0575DC45, 0x1D8244FF,
	  0x332F647A, 0x0000AD5A },

	{ 0x10949EE0, 0x1E7A292E, 0x06DF8B3D, 0x02B2E30B,
	  0x31F8729E, 0x24E35475, 0x30B71878, 0x35EDBFB7,
	  0x0000EA68, 0x0DD048FA, 0x21688929, 0x0DE823FE,
	  0x1C53FAA9, 0x0EA0C84D, 0x052A592A, 0x1FCE7870,
	  0x11325CB2, 0x00002A27 },

	{ 0x04C5723F, 0x30D81A50, 0x048306E4, 0x329B11C7,
	  0x223FB545, 0x085347A8, 0x2993E591, 0x1B5ACA8E,
	  0x0000CEF6, 0x04AF0773, 0x28D2EEA9, 0x2751EEEC,
	  0x037B4A7F, 0x3B4C1059, 0x08F37674, 0x2AE906E1,
	  0x18A88A6A, 0x00008786 },

	{ 0x34BC21D1, 0x0CCE474D, 0x15048BF4, 0x1D0BB409,
	  0x021CDA16, 0x20DE76C3, 0x34C59063, 0x04EDE20E,
	  0x00003ED1, 0x282A3740, 0x0BE3BBF3, 0x29889DAE,
	  0x03413697, 0x34C68A09, 0x210EBE93, 0x0C8A224C,
	  0x0826B331, 0x00009099 },

	{ 0x0624E3C4, 0x140317BA, 0x2F82C99D, 0x260C0A2C,
	  0x25D55179, 0x194DCC83, 0x3D95E462, 0x356F6A05,
	  0x0000741D, 0x0D4481D3, 0x2657FC8B, 0x1BA5CA71,
	  0x3AE44B0D, 0x07B1548E, 0x0E0D5522, 0x05FDC567,
	  0x2D1AA70E, 0x00000770 },

	{ 0x06072C01, 0x23857675, 0x1EAD58A9, 0x0B8A12D9,
	  0x1EE2FC79, 0x0177CB61, 0x0495A618, 0x20DEB82B,
	  0x0000177C, 0x2FC7BFD8, 0x310EEF8B, 0x1FB4DF39,
	  0x3B8530E8, 0x0F4E7226, 0x0246B6D0, 0x2A558A24,
	  0x163353AF, 0x000063BB },

	{ 0x24D2920B, 0x1C249DCC, 0x2069C5E5, 0x09AB2F9E,
	  0x36DF3CF1, 0x1991FD0C, 0x062B97A7, 0x1E80070E,
	  0x000054E7, 0x20D0B375, 0x2E9F20BD, 0x35090081,
	  0x1C7A9DDC, 0x22E7C371, 0x087E3016, 0x03175421,
	  0x3C6ECA7D, 0x0000F599 },

	{ 0x259B9D5F, 0x0D9A318F, 0x23A0EF16, 0x00EBE4B7,
	  0x088265AE, 0x2CDE2666, 0x2BAE7ADF, 0x1371A5C6,
	  0x0000F045, 0x0D034F36, 0x1F967378, 0x1B5FA3F4,
	  0x0EC8739D, 0x1643E62A, 0x1653947E, 0x22D1F4E6,
	  0x0FB8D64B, 0x0000B5B9 }
};

/*
 * Lookup one of the Gwin[] values, by index. This is constant-time.
 */
static void
lookup_Gwin(p256_jacobian *T, uint32_t idx)
{
	uint32_t xy[18];
	uint32_t k;
	size_t u;

	memset(xy, 0, sizeof xy);
	for (k = 0; k < 15; k ++) {
		uint32_t m;

		m = -EQ(idx, k + 1);
		for (u = 0; u < 18; u ++) {
			xy[u] |= m & Gwin[k][u];
		}
	}
	memcpy(T->x, &xy[0], sizeof T->x);
	memcpy(T->y, &xy[9], sizeof T->y);
	memset(T->z, 0, sizeof T->z);
	T->z[0] = 1;
}
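
/*
 * Minimal sketch of the masked selection used by lookup_Gwin() above
 * (not compiled by default): EQ(idx, k+1) is 1 for the single matching
 * index and 0 otherwise, so -EQ(idx, k+1) is an all-ones or all-zero
 * 32-bit mask, and OR-ing the masked entries accumulates exactly one
 * table row without any secret-dependent branch or memory access
 * pattern. This simplified version selects tab[idx] from a generic
 * array; the name is ours.
 */
#if 0
static uint32_t
example_ct_select(const uint32_t *tab, size_t num, uint32_t idx)
{
	uint32_t r;
	size_t k;

	r = 0;
	for (k = 0; k < num; k ++) {
		uint32_t m;

		m = -EQ(idx, (uint32_t)k);
		r |= m & tab[k];
	}
	return r;
}
#endif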

/*
 * Multiply the generator by an integer. The integer is assumed non-zero
 * and lower than the curve order.
 */
static void
p256_mulgen(p256_jacobian *P, const unsigned char *x, size_t xlen)
{
	/*
	 * qz is a flag that is initially 1, and remains equal to 1
	 * as long as the point is the point at infinity.
	 *
	 * We use a 4-bit window to handle multiplier bits by groups
	 * of 4. The precomputed window is constant static data, with
	 * points in affine coordinates; we use a constant-time lookup.
	 */
	p256_jacobian Q;
	uint32_t qz;

	memset(&Q, 0, sizeof Q);
	qz = 1;
	while (xlen -- > 0) {
		int k;
		unsigned bx;

		bx = *x ++;
		for (k = 0; k < 2; k ++) {
			uint32_t bits;
			uint32_t bnz;
			p256_jacobian T, U;

			p256_double(&Q);
			p256_double(&Q);
			p256_double(&Q);
			p256_double(&Q);
			bits = (bx >> 4) & 0x0F;
			bnz = NEQ(bits, 0);
			lookup_Gwin(&T, bits);
			U = Q;
			p256_add_mixed(&U, &T);
			CCOPY(bnz & qz, &Q, &T, sizeof Q);
			CCOPY(bnz & ~qz, &Q, &U, sizeof Q);
			qz &= ~bnz;
			bx <<= 4;
		}
	}
	*P = Q;
}

static const unsigned char P256_G[] = {
	0x04, 0x6B, 0x17, 0xD1, 0xF2, 0xE1, 0x2C, 0x42, 0x47, 0xF8,
	0xBC, 0xE6, 0xE5, 0x63, 0xA4, 0x40, 0xF2, 0x77, 0x03, 0x7D,
	0x81, 0x2D, 0xEB, 0x33, 0xA0, 0xF4, 0xA1, 0x39, 0x45, 0xD8,
	0x98, 0xC2, 0x96, 0x4F, 0xE3, 0x42, 0xE2, 0xFE, 0x1A, 0x7F,
	0x9B, 0x8E, 0xE7, 0xEB, 0x4A, 0x7C, 0x0F, 0x9E, 0x16, 0x2B,
	0xCE, 0x33, 0x57, 0x6B, 0x31, 0x5E, 0xCE, 0xCB, 0xB6, 0x40,
	0x68, 0x37, 0xBF, 0x51, 0xF5
};

static const unsigned char P256_N[] = {
	0xFF, 0xFF, 0xFF, 0xFF, 0x00, 0x00, 0x00, 0x00, 0xFF, 0xFF,
	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xBC, 0xE6, 0xFA, 0xAD,
	0xA7, 0x17, 0x9E, 0x84, 0xF3, 0xB9, 0xCA, 0xC2, 0xFC, 0x63,
	0x25, 0x51
};

static const unsigned char *
api_generator(int curve, size_t *len)
{
	(void)curve;
	*len = sizeof P256_G;
	return P256_G;
}

static const unsigned char *
api_order(int curve, size_t *len)
{
	(void)curve;
	*len = sizeof P256_N;
	return P256_N;
}

static size_t
api_xoff(int curve, size_t *len)
{
	(void)curve;
	*len = 32;
	return 1;
}

static uint32_t
api_mul(unsigned char *G, size_t Glen,
	const unsigned char *x, size_t xlen, int curve)
{
	uint32_t r;
	p256_jacobian P;

	(void)curve;
	r = p256_decode(&P, G, Glen);
	p256_mul(&P, x, xlen);
	if (Glen >= 65) {
		p256_to_affine(&P);
		p256_encode(G, &P);
	}
	return r;
}

static size_t
api_mulgen(unsigned char *R,
	const unsigned char *x, size_t xlen, int curve)
{
	p256_jacobian P;

	(void)curve;
	p256_mulgen(&P, x, xlen);
	p256_to_affine(&P);
	p256_encode(R, &P);
	return 65;

	/*
	const unsigned char *G;
	size_t Glen;

	G = api_generator(curve, &Glen);
	memcpy(R, G, Glen);
	api_mul(R, Glen, x, xlen, curve);
	return Glen;
	*/
}

static uint32_t
api_muladd(unsigned char *A, const unsigned char *B, size_t len,
	const unsigned char *x, size_t xlen,
	const unsigned char *y, size_t ylen, int curve)
{
	p256_jacobian P, Q;
	uint32_t r, t, z;
	int i;

	(void)curve;
	r = p256_decode(&P, A, len);
	p256_mul(&P, x, xlen);
	if (B == NULL) {
		p256_mulgen(&Q, y, ylen);
	} else {
		r &= p256_decode(&Q, B, len);
		p256_mul(&Q, y, ylen);
	}

	/*
	 * The final addition may fail in case both points are equal.
	 */
	t = p256_add(&P, &Q);
	reduce_final_f256(P.z);
	z = 0;
	for (i = 0; i < 9; i ++) {
		z |= P.z[i];
	}
	z = EQ(z, 0);
	p256_double(&Q);

	/*
	 * If z is 1 then either P+Q = 0 (t = 1) or P = Q (t = 0). So we
	 * have the following:
	 *
	 *   z = 0, t = 0   return P (normal addition)
	 *   z = 0, t = 1   return P (normal addition)
	 *   z = 1, t = 0   return Q (a 'double' case)
	 *   z = 1, t = 1   report an error (P+Q = 0)
	 */
	CCOPY(z & ~t, &P, &Q, sizeof Q);
	p256_to_affine(&P);
	p256_encode(A, &P);
	r &= ~(z & t);
	return r;
}

/* see bearssl_ec.h */
const br_ec_impl br_ec_p256_m31 = {
	(uint32_t)0x00800000,
	&api_generator,
	&api_order,
	&api_xoff,
	&api_mul,
	&api_mulgen,
	&api_muladd
};
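
/*
 * Illustrative sketch (not compiled by default) of how this
 * implementation is meant to be driven through the br_ec_impl
 * interface for an ECDH-style exchange. It assumes the public
 * declarations from bearssl_ec.h, in particular the BR_EC_secp256r1
 * curve identifier and the mulgen/mul/xoff method pointers; 'priv' is
 * a placeholder for a 32-byte non-zero scalar lower than the curve
 * order, 'shared_x' must point to 32 bytes, and the function name is
 * ours.
 */
#if 0
static int
example_p256_ecdh(const unsigned char *priv,
	const unsigned char *peer_pub, size_t peer_pub_len,
	unsigned char *shared_x)
{
	const br_ec_impl *impl;
	unsigned char pub[65], tmp[65];
	size_t xoff, xlen;

	impl = &br_ec_p256_m31;

	/* Compute our public key: an uncompressed point (65 bytes). */
	if (impl->mulgen(pub, priv, 32, BR_EC_secp256r1) != 65) {
		return 0;
	}

	/* Multiply the peer's public point by our private scalar. */
	if (peer_pub_len != sizeof tmp) {
		return 0;
	}
	memcpy(tmp, peer_pub, sizeof tmp);
	if (impl->mul(tmp, sizeof tmp, priv, 32, BR_EC_secp256r1) != 1) {
		return 0;
	}

	/* The shared secret is the X coordinate of the result. */
	xoff = impl->xoff(BR_EC_secp256r1, &xlen);
	memcpy(shared_x, tmp + xoff, xlen);
	return 1;
}
#endif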