1 /* 2 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org> 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining 5 * a copy of this software and associated documentation files (the 6 * "Software"), to deal in the Software without restriction, including 7 * without limitation the rights to use, copy, modify, merge, publish, 8 * distribute, sublicense, and/or sell copies of the Software, and to 9 * permit persons to whom the Software is furnished to do so, subject to 10 * the following conditions: 11 * 12 * The above copyright notice and this permission notice shall be 13 * included in all copies or substantial portions of the Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 16 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 17 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 18 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 19 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 20 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 21 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 * SOFTWARE. 23 */ 24 25 #include "inner.h" 26 27 /* obsolete 28 #include <stdio.h> 29 #include <stdlib.h> 30 static void 31 print_int(const char *name, const uint32_t *x) 32 { 33 size_t u; 34 unsigned char tmp[40]; 35 36 printf("%s = ", name); 37 for (u = 0; u < 9; u ++) { 38 if (x[u] > 0x3FFFFFFF) { 39 printf("INVALID:"); 40 for (u = 0; u < 9; u ++) { 41 printf(" %08X", x[u]); 42 } 43 printf("\n"); 44 return; 45 } 46 } 47 memset(tmp, 0, sizeof tmp); 48 for (u = 0; u < 9; u ++) { 49 uint64_t w; 50 int j, k; 51 52 w = x[u]; 53 j = 30 * (int)u; 54 k = j & 7; 55 if (k != 0) { 56 w <<= k; 57 j -= k; 58 } 59 k = j >> 3; 60 for (j = 0; j < 8; j ++) { 61 tmp[39 - k - j] |= (unsigned char)w; 62 w >>= 8; 63 } 64 } 65 for (u = 8; u < 40; u ++) { 66 printf("%02X", tmp[u]); 67 } 68 printf("\n"); 69 } 70 */ 71 72 /* 73 * If BR_NO_ARITH_SHIFT is undefined, or defined to 0, then we _assume_ 74 * that right-shifting a signed negative integer copies the sign bit 75 * (arithmetic right-shift). This is "implementation-defined behaviour", 76 * i.e. it is not undefined, but it may differ between compilers. Each 77 * compiler is supposed to document its behaviour in that respect. GCC 78 * explicitly defines that an arithmetic right shift is used. We expect 79 * all other compilers to do the same, because underlying CPU offer an 80 * arithmetic right shift opcode that could not be used otherwise. 81 */ 82 #if BR_NO_ARITH_SHIFT 83 #define ARSH(x, n) (((uint32_t)(x) >> (n)) \ 84 | ((-((uint32_t)(x) >> 31)) << (32 - (n)))) 85 #else 86 #define ARSH(x, n) ((*(int32_t *)&(x)) >> (n)) 87 #endif 88 89 /* 90 * Convert an integer from unsigned little-endian encoding to a sequence of 91 * 30-bit words in little-endian order. The final "partial" word is 92 * returned. 93 */ 94 static uint32_t 95 le8_to_le30(uint32_t *dst, const unsigned char *src, size_t len) 96 { 97 uint32_t acc; 98 int acc_len; 99 100 acc = 0; 101 acc_len = 0; 102 while (len -- > 0) { 103 uint32_t b; 104 105 b = *src ++; 106 if (acc_len < 22) { 107 acc |= b << acc_len; 108 acc_len += 8; 109 } else { 110 *dst ++ = (acc | (b << acc_len)) & 0x3FFFFFFF; 111 acc = b >> (30 - acc_len); 112 acc_len -= 22; 113 } 114 } 115 return acc; 116 } 117 118 /* 119 * Convert an integer (30-bit words, little-endian) to unsigned 120 * little-endian encoding. The total encoding length is provided; all 121 * the destination bytes will be filled. 122 */ 123 static void 124 le30_to_le8(unsigned char *dst, size_t len, const uint32_t *src) 125 { 126 uint32_t acc; 127 int acc_len; 128 129 acc = 0; 130 acc_len = 0; 131 while (len -- > 0) { 132 if (acc_len < 8) { 133 uint32_t w; 134 135 w = *src ++; 136 *dst ++ = (unsigned char)(acc | (w << acc_len)); 137 acc = w >> (8 - acc_len); 138 acc_len += 22; 139 } else { 140 *dst ++ = (unsigned char)acc; 141 acc >>= 8; 142 acc_len -= 8; 143 } 144 } 145 } 146 147 /* 148 * Multiply two integers. Source integers are represented as arrays of 149 * nine 30-bit words, for values up to 2^270-1. Result is encoded over 150 * 18 words of 30 bits each. 151 */ 152 static void 153 mul9(uint32_t *d, const uint32_t *a, const uint32_t *b) 154 { 155 /* 156 * Maximum intermediate result is no more than 157 * 10376293531797946367, which fits in 64 bits. Reason: 158 * 159 * 10376293531797946367 = 9 * (2^30-1)^2 + 9663676406 160 * 10376293531797946367 < 9663676407 * 2^30 161 * 162 * Thus, adding together 9 products of 30-bit integers, with 163 * a carry of at most 9663676406, yields an integer that fits 164 * on 64 bits and generates a carry of at most 9663676406. 165 */ 166 uint64_t t[17]; 167 uint64_t cc; 168 int i; 169 170 t[ 0] = MUL31(a[0], b[0]); 171 t[ 1] = MUL31(a[0], b[1]) 172 + MUL31(a[1], b[0]); 173 t[ 2] = MUL31(a[0], b[2]) 174 + MUL31(a[1], b[1]) 175 + MUL31(a[2], b[0]); 176 t[ 3] = MUL31(a[0], b[3]) 177 + MUL31(a[1], b[2]) 178 + MUL31(a[2], b[1]) 179 + MUL31(a[3], b[0]); 180 t[ 4] = MUL31(a[0], b[4]) 181 + MUL31(a[1], b[3]) 182 + MUL31(a[2], b[2]) 183 + MUL31(a[3], b[1]) 184 + MUL31(a[4], b[0]); 185 t[ 5] = MUL31(a[0], b[5]) 186 + MUL31(a[1], b[4]) 187 + MUL31(a[2], b[3]) 188 + MUL31(a[3], b[2]) 189 + MUL31(a[4], b[1]) 190 + MUL31(a[5], b[0]); 191 t[ 6] = MUL31(a[0], b[6]) 192 + MUL31(a[1], b[5]) 193 + MUL31(a[2], b[4]) 194 + MUL31(a[3], b[3]) 195 + MUL31(a[4], b[2]) 196 + MUL31(a[5], b[1]) 197 + MUL31(a[6], b[0]); 198 t[ 7] = MUL31(a[0], b[7]) 199 + MUL31(a[1], b[6]) 200 + MUL31(a[2], b[5]) 201 + MUL31(a[3], b[4]) 202 + MUL31(a[4], b[3]) 203 + MUL31(a[5], b[2]) 204 + MUL31(a[6], b[1]) 205 + MUL31(a[7], b[0]); 206 t[ 8] = MUL31(a[0], b[8]) 207 + MUL31(a[1], b[7]) 208 + MUL31(a[2], b[6]) 209 + MUL31(a[3], b[5]) 210 + MUL31(a[4], b[4]) 211 + MUL31(a[5], b[3]) 212 + MUL31(a[6], b[2]) 213 + MUL31(a[7], b[1]) 214 + MUL31(a[8], b[0]); 215 t[ 9] = MUL31(a[1], b[8]) 216 + MUL31(a[2], b[7]) 217 + MUL31(a[3], b[6]) 218 + MUL31(a[4], b[5]) 219 + MUL31(a[5], b[4]) 220 + MUL31(a[6], b[3]) 221 + MUL31(a[7], b[2]) 222 + MUL31(a[8], b[1]); 223 t[10] = MUL31(a[2], b[8]) 224 + MUL31(a[3], b[7]) 225 + MUL31(a[4], b[6]) 226 + MUL31(a[5], b[5]) 227 + MUL31(a[6], b[4]) 228 + MUL31(a[7], b[3]) 229 + MUL31(a[8], b[2]); 230 t[11] = MUL31(a[3], b[8]) 231 + MUL31(a[4], b[7]) 232 + MUL31(a[5], b[6]) 233 + MUL31(a[6], b[5]) 234 + MUL31(a[7], b[4]) 235 + MUL31(a[8], b[3]); 236 t[12] = MUL31(a[4], b[8]) 237 + MUL31(a[5], b[7]) 238 + MUL31(a[6], b[6]) 239 + MUL31(a[7], b[5]) 240 + MUL31(a[8], b[4]); 241 t[13] = MUL31(a[5], b[8]) 242 + MUL31(a[6], b[7]) 243 + MUL31(a[7], b[6]) 244 + MUL31(a[8], b[5]); 245 t[14] = MUL31(a[6], b[8]) 246 + MUL31(a[7], b[7]) 247 + MUL31(a[8], b[6]); 248 t[15] = MUL31(a[7], b[8]) 249 + MUL31(a[8], b[7]); 250 t[16] = MUL31(a[8], b[8]); 251 252 /* 253 * Propagate carries. 254 */ 255 cc = 0; 256 for (i = 0; i < 17; i ++) { 257 uint64_t w; 258 259 w = t[i] + cc; 260 d[i] = (uint32_t)w & 0x3FFFFFFF; 261 cc = w >> 30; 262 } 263 d[17] = (uint32_t)cc; 264 } 265 266 /* 267 * Square a 270-bit integer, represented as an array of nine 30-bit words. 268 * Result uses 18 words of 30 bits each. 269 */ 270 static void 271 square9(uint32_t *d, const uint32_t *a) 272 { 273 uint64_t t[17]; 274 uint64_t cc; 275 int i; 276 277 t[ 0] = MUL31(a[0], a[0]); 278 t[ 1] = ((MUL31(a[0], a[1])) << 1); 279 t[ 2] = MUL31(a[1], a[1]) 280 + ((MUL31(a[0], a[2])) << 1); 281 t[ 3] = ((MUL31(a[0], a[3]) 282 + MUL31(a[1], a[2])) << 1); 283 t[ 4] = MUL31(a[2], a[2]) 284 + ((MUL31(a[0], a[4]) 285 + MUL31(a[1], a[3])) << 1); 286 t[ 5] = ((MUL31(a[0], a[5]) 287 + MUL31(a[1], a[4]) 288 + MUL31(a[2], a[3])) << 1); 289 t[ 6] = MUL31(a[3], a[3]) 290 + ((MUL31(a[0], a[6]) 291 + MUL31(a[1], a[5]) 292 + MUL31(a[2], a[4])) << 1); 293 t[ 7] = ((MUL31(a[0], a[7]) 294 + MUL31(a[1], a[6]) 295 + MUL31(a[2], a[5]) 296 + MUL31(a[3], a[4])) << 1); 297 t[ 8] = MUL31(a[4], a[4]) 298 + ((MUL31(a[0], a[8]) 299 + MUL31(a[1], a[7]) 300 + MUL31(a[2], a[6]) 301 + MUL31(a[3], a[5])) << 1); 302 t[ 9] = ((MUL31(a[1], a[8]) 303 + MUL31(a[2], a[7]) 304 + MUL31(a[3], a[6]) 305 + MUL31(a[4], a[5])) << 1); 306 t[10] = MUL31(a[5], a[5]) 307 + ((MUL31(a[2], a[8]) 308 + MUL31(a[3], a[7]) 309 + MUL31(a[4], a[6])) << 1); 310 t[11] = ((MUL31(a[3], a[8]) 311 + MUL31(a[4], a[7]) 312 + MUL31(a[5], a[6])) << 1); 313 t[12] = MUL31(a[6], a[6]) 314 + ((MUL31(a[4], a[8]) 315 + MUL31(a[5], a[7])) << 1); 316 t[13] = ((MUL31(a[5], a[8]) 317 + MUL31(a[6], a[7])) << 1); 318 t[14] = MUL31(a[7], a[7]) 319 + ((MUL31(a[6], a[8])) << 1); 320 t[15] = ((MUL31(a[7], a[8])) << 1); 321 t[16] = MUL31(a[8], a[8]); 322 323 /* 324 * Propagate carries. 325 */ 326 cc = 0; 327 for (i = 0; i < 17; i ++) { 328 uint64_t w; 329 330 w = t[i] + cc; 331 d[i] = (uint32_t)w & 0x3FFFFFFF; 332 cc = w >> 30; 333 } 334 d[17] = (uint32_t)cc; 335 } 336 337 /* 338 * Perform a "final reduction" in field F255 (field for Curve25519) 339 * The source value must be less than twice the modulus. If the value 340 * is not lower than the modulus, then the modulus is subtracted and 341 * this function returns 1; otherwise, it leaves it untouched and it 342 * returns 0. 343 */ 344 static uint32_t 345 reduce_final_f255(uint32_t *d) 346 { 347 uint32_t t[9]; 348 uint32_t cc; 349 int i; 350 351 memcpy(t, d, sizeof t); 352 cc = 19; 353 for (i = 0; i < 9; i ++) { 354 uint32_t w; 355 356 w = t[i] + cc; 357 cc = w >> 30; 358 t[i] = w & 0x3FFFFFFF; 359 } 360 cc = t[8] >> 15; 361 t[8] &= 0x7FFF; 362 CCOPY(cc, d, t, sizeof t); 363 return cc; 364 } 365 366 /* 367 * Perform a multiplication of two integers modulo 2^255-19. 368 * Operands are arrays of 9 words, each containing 30 bits of data, in 369 * little-endian order. Input value may be up to 2^256-1; on output, value 370 * fits on 256 bits and is lower than twice the modulus. 371 */ 372 static void 373 f255_mul(uint32_t *d, const uint32_t *a, const uint32_t *b) 374 { 375 uint32_t t[18], cc; 376 int i; 377 378 /* 379 * Compute raw multiplication. All result words fit in 30 bits 380 * each; upper word (t[17]) must fit on 2 bits, since the product 381 * of two 256-bit integers must fit on 512 bits. 382 */ 383 mul9(t, a, b); 384 385 /* 386 * Modular reduction: each high word is added where necessary. 387 * Since the modulus is 2^255-19 and word 9 corresponds to 388 * offset 9*30 = 270, word 9+k must be added to word k with 389 * a factor of 19*2^15 = 622592. The extra bits in word 8 are also 390 * added that way. 391 * 392 * Keeping the carry on 32 bits helps with 32-bit architectures, 393 * and does not noticeably impact performance on 64-bit systems. 394 */ 395 cc = MUL15(t[8] >> 15, 19); /* at most 19*(2^15-1) = 622573 */ 396 t[8] &= 0x7FFF; 397 for (i = 0; i < 9; i ++) { 398 uint64_t w; 399 400 w = (uint64_t)t[i] + (uint64_t)cc + MUL31(t[i + 9], 622592); 401 t[i] = (uint32_t)w & 0x3FFFFFFF; 402 cc = (uint32_t)(w >> 30); /* at most 622592 */ 403 } 404 405 /* 406 * Original product was up to (2^256-1)^2, i.e. a 512-bit integer. 407 * This was split into two parts (upper of 257 bits, lower of 255 408 * bits), and the upper was added to the lower with a factor 19, 409 * which means that the intermediate value is less than 77*2^255 410 * (19*2^257 + 2^255). Therefore, the extra bits "t[8] >> 15" are 411 * less than 77, and the initial carry cc is at most 76*19 = 1444. 412 */ 413 cc = MUL15(t[8] >> 15, 19); 414 t[8] &= 0x7FFF; 415 for (i = 0; i < 9; i ++) { 416 uint32_t z; 417 418 z = t[i] + cc; 419 d[i] = z & 0x3FFFFFFF; 420 cc = z >> 30; 421 } 422 423 /* 424 * Final result is at most 2^255 + 1443. In particular, the last 425 * carry is necessarily 0, since t[8] was truncated to 15 bits. 426 */ 427 } 428 429 /* 430 * Perform a squaring of an integer modulo 2^255-19. 431 * Operands are arrays of 9 words, each containing 30 bits of data, in 432 * little-endian order. Input value may be up to 2^256-1; on output, value 433 * fits on 256 bits and is lower than twice the modulus. 434 */ 435 static void 436 f255_square(uint32_t *d, const uint32_t *a) 437 { 438 uint32_t t[18], cc; 439 int i; 440 441 /* 442 * Compute raw squaring. All result words fit in 30 bits 443 * each; upper word (t[17]) must fit on 2 bits, since the square 444 * of a 256-bit integers must fit on 512 bits. 445 */ 446 square9(t, a); 447 448 /* 449 * Modular reduction: each high word is added where necessary. 450 * See f255_mul() for details on the reduction and carry limits. 451 */ 452 cc = MUL15(t[8] >> 15, 19); 453 t[8] &= 0x7FFF; 454 for (i = 0; i < 9; i ++) { 455 uint64_t w; 456 457 w = (uint64_t)t[i] + (uint64_t)cc + MUL31(t[i + 9], 622592); 458 t[i] = (uint32_t)w & 0x3FFFFFFF; 459 cc = (uint32_t)(w >> 30); 460 } 461 cc = MUL15(t[8] >> 15, 19); 462 t[8] &= 0x7FFF; 463 for (i = 0; i < 9; i ++) { 464 uint32_t z; 465 466 z = t[i] + cc; 467 d[i] = z & 0x3FFFFFFF; 468 cc = z >> 30; 469 } 470 } 471 472 /* 473 * Add two values in F255. Partial reduction is performed (down to less 474 * than twice the modulus). 475 */ 476 static void 477 f255_add(uint32_t *d, const uint32_t *a, const uint32_t *b) 478 { 479 /* 480 * Since operand words fit on 30 bits, we can use 32-bit 481 * variables throughout. 482 */ 483 int i; 484 uint32_t cc, w; 485 486 cc = 0; 487 for (i = 0; i < 9; i ++) { 488 w = a[i] + b[i] + cc; 489 d[i] = w & 0x3FFFFFFF; 490 cc = w >> 30; 491 } 492 cc = MUL15(w >> 15, 19); 493 d[8] &= 0x7FFF; 494 for (i = 0; i < 9; i ++) { 495 w = d[i] + cc; 496 d[i] = w & 0x3FFFFFFF; 497 cc = w >> 30; 498 } 499 } 500 501 /* 502 * Subtract one value from another in F255. Partial reduction is 503 * performed (down to less than twice the modulus). 504 */ 505 static void 506 f255_sub(uint32_t *d, const uint32_t *a, const uint32_t *b) 507 { 508 /* 509 * We actually compute a - b + 2*p, so that the final value is 510 * necessarily positive. 511 */ 512 int i; 513 uint32_t cc, w; 514 515 cc = (uint32_t)-38; 516 for (i = 0; i < 9; i ++) { 517 w = a[i] - b[i] + cc; 518 d[i] = w & 0x3FFFFFFF; 519 cc = ARSH(w, 30); 520 } 521 cc = MUL15((w + 0x10000) >> 15, 19); 522 d[8] &= 0x7FFF; 523 for (i = 0; i < 9; i ++) { 524 w = d[i] + cc; 525 d[i] = w & 0x3FFFFFFF; 526 cc = w >> 30; 527 } 528 } 529 530 /* 531 * Multiply an integer by the 'A24' constant (121665). Partial reduction 532 * is performed (down to less than twice the modulus). 533 */ 534 static void 535 f255_mul_a24(uint32_t *d, const uint32_t *a) 536 { 537 int i; 538 uint64_t w; 539 uint32_t cc; 540 541 /* 542 * a[] is over 256 bits, thus a[8] has length at most 16 bits. 543 * We single out the processing of the last word: intermediate 544 * value w is up to 121665*2^16, yielding a carry for the next 545 * loop of at most 19*(121665*2^16/2^15) = 4623289. 546 */ 547 cc = 0; 548 for (i = 0; i < 8; i ++) { 549 w = MUL31(a[i], 121665) + (uint64_t)cc; 550 d[i] = (uint32_t)w & 0x3FFFFFFF; 551 cc = (uint32_t)(w >> 30); 552 } 553 w = MUL31(a[8], 121665) + (uint64_t)cc; 554 d[8] = (uint32_t)w & 0x7FFF; 555 cc = MUL15((uint32_t)(w >> 15), 19); 556 557 for (i = 0; i < 9; i ++) { 558 uint32_t z; 559 560 z = d[i] + cc; 561 d[i] = z & 0x3FFFFFFF; 562 cc = z >> 30; 563 } 564 } 565 566 static const unsigned char GEN[] = { 567 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 568 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 569 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 570 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 571 }; 572 573 static const unsigned char ORDER[] = { 574 0x7F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 575 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 576 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 577 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF 578 }; 579 580 static const unsigned char * 581 api_generator(int curve, size_t *len) 582 { 583 (void)curve; 584 *len = 32; 585 return GEN; 586 } 587 588 static const unsigned char * 589 api_order(int curve, size_t *len) 590 { 591 (void)curve; 592 *len = 32; 593 return ORDER; 594 } 595 596 static size_t 597 api_xoff(int curve, size_t *len) 598 { 599 (void)curve; 600 *len = 32; 601 return 0; 602 } 603 604 static void 605 cswap(uint32_t *a, uint32_t *b, uint32_t ctl) 606 { 607 int i; 608 609 ctl = -ctl; 610 for (i = 0; i < 9; i ++) { 611 uint32_t aw, bw, tw; 612 613 aw = a[i]; 614 bw = b[i]; 615 tw = ctl & (aw ^ bw); 616 a[i] = aw ^ tw; 617 b[i] = bw ^ tw; 618 } 619 } 620 621 static uint32_t 622 api_mul(unsigned char *G, size_t Glen, 623 const unsigned char *kb, size_t kblen, int curve) 624 { 625 uint32_t x1[9], x2[9], x3[9], z2[9], z3[9]; 626 uint32_t a[9], aa[9], b[9], bb[9]; 627 uint32_t c[9], d[9], e[9], da[9], cb[9]; 628 unsigned char k[32]; 629 uint32_t swap; 630 int i; 631 632 (void)curve; 633 634 /* 635 * Points are encoded over exactly 32 bytes. Multipliers must fit 636 * in 32 bytes as well. 637 * RFC 7748 mandates that the high bit of the last point byte must 638 * be ignored/cleared. 639 */ 640 if (Glen != 32 || kblen > 32) { 641 return 0; 642 } 643 G[31] &= 0x7F; 644 645 /* 646 * Initialise variables x1, x2, z2, x3 and z3. We set all of them 647 * into Montgomery representation. 648 */ 649 x1[8] = le8_to_le30(x1, G, 32); 650 memcpy(x3, x1, sizeof x1); 651 memset(z2, 0, sizeof z2); 652 memset(x2, 0, sizeof x2); 653 x2[0] = 1; 654 memset(z3, 0, sizeof z3); 655 z3[0] = 1; 656 657 memset(k, 0, (sizeof k) - kblen); 658 memcpy(k + (sizeof k) - kblen, kb, kblen); 659 k[31] &= 0xF8; 660 k[0] &= 0x7F; 661 k[0] |= 0x40; 662 663 /* obsolete 664 print_int("x1", x1); 665 */ 666 667 swap = 0; 668 for (i = 254; i >= 0; i --) { 669 uint32_t kt; 670 671 kt = (k[31 - (i >> 3)] >> (i & 7)) & 1; 672 swap ^= kt; 673 cswap(x2, x3, swap); 674 cswap(z2, z3, swap); 675 swap = kt; 676 677 /* obsolete 678 print_int("x2", x2); 679 print_int("z2", z2); 680 print_int("x3", x3); 681 print_int("z3", z3); 682 */ 683 684 f255_add(a, x2, z2); 685 f255_square(aa, a); 686 f255_sub(b, x2, z2); 687 f255_square(bb, b); 688 f255_sub(e, aa, bb); 689 f255_add(c, x3, z3); 690 f255_sub(d, x3, z3); 691 f255_mul(da, d, a); 692 f255_mul(cb, c, b); 693 694 /* obsolete 695 print_int("a ", a); 696 print_int("aa", aa); 697 print_int("b ", b); 698 print_int("bb", bb); 699 print_int("e ", e); 700 print_int("c ", c); 701 print_int("d ", d); 702 print_int("da", da); 703 print_int("cb", cb); 704 */ 705 706 f255_add(x3, da, cb); 707 f255_square(x3, x3); 708 f255_sub(z3, da, cb); 709 f255_square(z3, z3); 710 f255_mul(z3, z3, x1); 711 f255_mul(x2, aa, bb); 712 f255_mul_a24(z2, e); 713 f255_add(z2, z2, aa); 714 f255_mul(z2, e, z2); 715 716 /* obsolete 717 print_int("x2", x2); 718 print_int("z2", z2); 719 print_int("x3", x3); 720 print_int("z3", z3); 721 */ 722 } 723 cswap(x2, x3, swap); 724 cswap(z2, z3, swap); 725 726 /* 727 * Inverse z2 with a modular exponentiation. This is a simple 728 * square-and-multiply algorithm; we mutualise most non-squarings 729 * since the exponent contains almost only ones. 730 */ 731 memcpy(a, z2, sizeof z2); 732 for (i = 0; i < 15; i ++) { 733 f255_square(a, a); 734 f255_mul(a, a, z2); 735 } 736 memcpy(b, a, sizeof a); 737 for (i = 0; i < 14; i ++) { 738 int j; 739 740 for (j = 0; j < 16; j ++) { 741 f255_square(b, b); 742 } 743 f255_mul(b, b, a); 744 } 745 for (i = 14; i >= 0; i --) { 746 f255_square(b, b); 747 if ((0xFFEB >> i) & 1) { 748 f255_mul(b, z2, b); 749 } 750 } 751 f255_mul(x2, x2, b); 752 reduce_final_f255(x2); 753 le30_to_le8(G, 32, x2); 754 return 1; 755 } 756 757 static size_t 758 api_mulgen(unsigned char *R, 759 const unsigned char *x, size_t xlen, int curve) 760 { 761 const unsigned char *G; 762 size_t Glen; 763 764 G = api_generator(curve, &Glen); 765 memcpy(R, G, Glen); 766 api_mul(R, Glen, x, xlen, curve); 767 return Glen; 768 } 769 770 static uint32_t 771 api_muladd(unsigned char *A, const unsigned char *B, size_t len, 772 const unsigned char *x, size_t xlen, 773 const unsigned char *y, size_t ylen, int curve) 774 { 775 /* 776 * We don't implement this method, since it is used for ECDSA 777 * only, and there is no ECDSA over Curve25519 (which instead 778 * uses EdDSA). 779 */ 780 (void)A; 781 (void)B; 782 (void)len; 783 (void)x; 784 (void)xlen; 785 (void)y; 786 (void)ylen; 787 (void)curve; 788 return 0; 789 } 790 791 /* see bearssl_ec.h */ 792 const br_ec_impl br_ec_c25519_m31 = { 793 (uint32_t)0x20000000, 794 &api_generator, 795 &api_order, 796 &api_xoff, 797 &api_mul, 798 &api_mulgen, 799 &api_muladd 800 }; 801