/*
 * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "inner.h"

#if BR_INT128 || BR_UMUL128

#if BR_UMUL128
#include <intrin.h>
#endif

static const unsigned char GEN[] = {
	0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
};

static const unsigned char ORDER[] = {
	0x7F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
};

static const unsigned char *
api_generator(int curve, size_t *len)
{
	(void)curve;
	*len = 32;
	return GEN;
}

static const unsigned char *
api_order(int curve, size_t *len)
{
	(void)curve;
	*len = 32;
	return ORDER;
}

static size_t
api_xoff(int curve, size_t *len)
{
	(void)curve;
	*len = 32;
	return 0;
}

/*
 * A field element is encoded as four 64-bit integers, in basis 2^64.
 * Operations return partially reduced values, which may range up to
 * 2^255+37.
 */

#define MASK63   (((uint64_t)1 << 63) - (uint64_t)1)

/*
 * Swap two field elements, conditionally on a flag.
 */
static inline void
f255_cswap(uint64_t *a, uint64_t *b, uint32_t ctl)
{
	uint64_t m, w;

	m = -(uint64_t)ctl;
	w = m & (a[0] ^ b[0]); a[0] ^= w; b[0] ^= w;
	w = m & (a[1] ^ b[1]); a[1] ^= w; b[1] ^= w;
	w = m & (a[2] ^ b[2]); a[2] ^= w; b[2] ^= w;
	w = m & (a[3] ^ b[3]); a[3] ^= w; b[3] ^= w;
}

/*
 * Addition in the field.
 */
static inline void
f255_add(uint64_t *d, const uint64_t *a, const uint64_t *b)
{
#if BR_INT128

	uint64_t t0, t1, t2, t3, cc;
	unsigned __int128 z;

	z = (unsigned __int128)a[0] + (unsigned __int128)b[0];
	t0 = (uint64_t)z;
	z = (unsigned __int128)a[1] + (unsigned __int128)b[1] + (z >> 64);
	t1 = (uint64_t)z;
	z = (unsigned __int128)a[2] + (unsigned __int128)b[2] + (z >> 64);
	t2 = (uint64_t)z;
	z = (unsigned __int128)a[3] + (unsigned __int128)b[3] + (z >> 64);
	t3 = (uint64_t)z & MASK63;
	cc = (uint64_t)(z >> 63);

	/*
	 * Since operands are at most 2^255+37, the sum is at most
	 * 2^256+74; thus, the carry cc is equal to 0, 1 or 2.
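	 *
	 * For instance, if the sum were exactly 2^256 + 74, then cc = 2
	 * and t0..t3 would hold 74; adding 19*cc = 38 yields 112, which
	 * is congruent to the sum modulo p, since 2^256 = 2*2^255 = 38
	 * mod p.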
	 *
	 * We use: 2^255 = 19 mod p.
	 * Since we add 0, 19 or 38 to a value that fits on 255 bits,
	 * the result is at most 2^255+37.
	 */
	z = (unsigned __int128)t0 + (unsigned __int128)(19 * cc);
	d[0] = (uint64_t)z;
	z = (unsigned __int128)t1 + (z >> 64);
	d[1] = (uint64_t)z;
	z = (unsigned __int128)t2 + (z >> 64);
	d[2] = (uint64_t)z;
	d[3] = t3 + (uint64_t)(z >> 64);

#elif BR_UMUL128

	uint64_t t0, t1, t2, t3, cc;
	unsigned char k;

	k = _addcarry_u64(0, a[0], b[0], &t0);
	k = _addcarry_u64(k, a[1], b[1], &t1);
	k = _addcarry_u64(k, a[2], b[2], &t2);
	k = _addcarry_u64(k, a[3], b[3], &t3);
	cc = (k << 1) + (t3 >> 63);
	t3 &= MASK63;

	/*
	 * Since operands are at most 2^255+37, the sum is at most
	 * 2^256+74; thus, the carry cc is equal to 0, 1 or 2.
	 *
	 * We use: 2^255 = 19 mod p.
	 * Since we add 0, 19 or 38 to a value that fits on 255 bits,
	 * the result is at most 2^255+37.
	 */
	k = _addcarry_u64(0, t0, 19 * cc, &d[0]);
	k = _addcarry_u64(k, t1, 0, &d[1]);
	k = _addcarry_u64(k, t2, 0, &d[2]);
	(void)_addcarry_u64(k, t3, 0, &d[3]);

#endif
}

/*
 * Subtraction.
 * On input, both operands may be up to 2^255+37; on output, the
 * result is partially reduced, with max value 2^255+37.
 */
static inline void
f255_sub(uint64_t *d, const uint64_t *a, const uint64_t *b)
{
#if BR_INT128

	/*
	 * We compute t = 2^256 - 38 + a - b, which is necessarily
	 * positive but lower than 2^256 + 2^255, since a <= 2^255 + 37
	 * and b <= 2^255 + 37. We then subtract 0, p or 2*p, depending
	 * on the two upper bits of t (bits 255 and 256).
	 */

	uint64_t t0, t1, t2, t3, t4, cc;
	unsigned __int128 z;

	z = (unsigned __int128)a[0] - (unsigned __int128)b[0] - 38;
	t0 = (uint64_t)z;
	cc = -(uint64_t)(z >> 64);
	z = (unsigned __int128)a[1] - (unsigned __int128)b[1]
		- (unsigned __int128)cc;
	t1 = (uint64_t)z;
	cc = -(uint64_t)(z >> 64);
	z = (unsigned __int128)a[2] - (unsigned __int128)b[2]
		- (unsigned __int128)cc;
	t2 = (uint64_t)z;
	cc = -(uint64_t)(z >> 64);
	z = (unsigned __int128)a[3] - (unsigned __int128)b[3]
		- (unsigned __int128)cc;
	t3 = (uint64_t)z;
	t4 = 1 + (uint64_t)(z >> 64);

	/*
	 * We have a 257-bit result. The two top bits can be 00, 01 or 10,
	 * but not 11 (value t <= 2^256 - 38 + 2^255 + 37 = 2^256 + 2^255 - 1).
	 * Therefore, we can truncate to 255 bits, and add 0, 19 or 38.
	 * This guarantees that the result is at most 2^255+37.
	 */
	cc = (38 & -t4) + (19 & -(t3 >> 63));
	t3 &= MASK63;
	z = (unsigned __int128)t0 + (unsigned __int128)cc;
	d[0] = (uint64_t)z;
	z = (unsigned __int128)t1 + (z >> 64);
	d[1] = (uint64_t)z;
	z = (unsigned __int128)t2 + (z >> 64);
	d[2] = (uint64_t)z;
	d[3] = t3 + (uint64_t)(z >> 64);

#elif BR_UMUL128

	/*
	 * We compute t = 2^256 - 38 + a - b, which is necessarily
	 * positive but lower than 2^256 + 2^255, since a <= 2^255 + 37
	 * and b <= 2^255 + 37. We then subtract 0, p or 2*p, depending
	 * on the two upper bits of t (bits 255 and 256).
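	 *
	 * For instance, with a = 0 and b = 1, we get t = 2^256 - 39:
	 * bit 256 is 0 and bit 255 is 1, so we truncate to 255 bits
	 * (2^255 - 39) and add 19, yielding 2^255 - 20 = p - 1, which
	 * is indeed a - b modulo p.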
	 */

	uint64_t t0, t1, t2, t3, t4;
	unsigned char k;

	k = _subborrow_u64(0, a[0], b[0], &t0);
	k = _subborrow_u64(k, a[1], b[1], &t1);
	k = _subborrow_u64(k, a[2], b[2], &t2);
	k = _subborrow_u64(k, a[3], b[3], &t3);
	(void)_subborrow_u64(k, 1, 0, &t4);

	k = _subborrow_u64(0, t0, 38, &t0);
	k = _subborrow_u64(k, t1, 0, &t1);
	k = _subborrow_u64(k, t2, 0, &t2);
	k = _subborrow_u64(k, t3, 0, &t3);
	(void)_subborrow_u64(k, t4, 0, &t4);

	/*
	 * We have a 257-bit result. The two top bits can be 00, 01 or 10,
	 * but not 11 (value t <= 2^256 - 38 + 2^255 + 37 = 2^256 + 2^255 - 1).
	 * Therefore, we can truncate to 255 bits, and add 0, 19 or 38.
	 * This guarantees that the result is at most 2^255+37.
	 */
	t4 = (38 & -t4) + (19 & -(t3 >> 63));
	t3 &= MASK63;
	k = _addcarry_u64(0, t0, t4, &d[0]);
	k = _addcarry_u64(k, t1, 0, &d[1]);
	k = _addcarry_u64(k, t2, 0, &d[2]);
	(void)_addcarry_u64(k, t3, 0, &d[3]);

#endif
}

/*
 * Multiplication.
 */
static inline void
f255_mul(uint64_t *d, uint64_t *a, uint64_t *b)
{
#if BR_INT128

	unsigned __int128 z;
	uint64_t t0, t1, t2, t3, t4, t5, t6, t7, th;

	/*
	 * Compute the product a*b over plain integers.
	 */
	z = (unsigned __int128)a[0] * (unsigned __int128)b[0];
	t0 = (uint64_t)z;
	z = (unsigned __int128)a[0] * (unsigned __int128)b[1] + (z >> 64);
	t1 = (uint64_t)z;
	z = (unsigned __int128)a[0] * (unsigned __int128)b[2] + (z >> 64);
	t2 = (uint64_t)z;
	z = (unsigned __int128)a[0] * (unsigned __int128)b[3] + (z >> 64);
	t3 = (uint64_t)z;
	t4 = (uint64_t)(z >> 64);

	z = (unsigned __int128)a[1] * (unsigned __int128)b[0]
		+ (unsigned __int128)t1;
	t1 = (uint64_t)z;
	z = (unsigned __int128)a[1] * (unsigned __int128)b[1]
		+ (unsigned __int128)t2 + (z >> 64);
	t2 = (uint64_t)z;
	z = (unsigned __int128)a[1] * (unsigned __int128)b[2]
		+ (unsigned __int128)t3 + (z >> 64);
	t3 = (uint64_t)z;
	z = (unsigned __int128)a[1] * (unsigned __int128)b[3]
		+ (unsigned __int128)t4 + (z >> 64);
	t4 = (uint64_t)z;
	t5 = (uint64_t)(z >> 64);

	z = (unsigned __int128)a[2] * (unsigned __int128)b[0]
		+ (unsigned __int128)t2;
	t2 = (uint64_t)z;
	z = (unsigned __int128)a[2] * (unsigned __int128)b[1]
		+ (unsigned __int128)t3 + (z >> 64);
	t3 = (uint64_t)z;
	z = (unsigned __int128)a[2] * (unsigned __int128)b[2]
		+ (unsigned __int128)t4 + (z >> 64);
	t4 = (uint64_t)z;
	z = (unsigned __int128)a[2] * (unsigned __int128)b[3]
		+ (unsigned __int128)t5 + (z >> 64);
	t5 = (uint64_t)z;
	t6 = (uint64_t)(z >> 64);

	z = (unsigned __int128)a[3] * (unsigned __int128)b[0]
		+ (unsigned __int128)t3;
	t3 = (uint64_t)z;
	z = (unsigned __int128)a[3] * (unsigned __int128)b[1]
		+ (unsigned __int128)t4 + (z >> 64);
	t4 = (uint64_t)z;
	z = (unsigned __int128)a[3] * (unsigned __int128)b[2]
		+ (unsigned __int128)t5 + (z >> 64);
	t5 = (uint64_t)z;
	z = (unsigned __int128)a[3] * (unsigned __int128)b[3]
		+ (unsigned __int128)t6 + (z >> 64);
	t6 = (uint64_t)z;
	t7 = (uint64_t)(z >> 64);

	/*
	 * Modulo p, we have:
	 *
	 *   2^255 = 19
	 *   2^510 = 19*19 = 361
	 *
	 * We split the intermediate t into three parts, in basis
	 * 2^255. The low one will be in t0..t3; the middle one in t4..t7.
	 * The upper one can only be a single bit (th), since the
	 * multiplication operands are at most 2^255+37 each.
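	 *
	 * In other words, writing the 512-bit product as
	 * lo + 2^255*mid + 2^510*th, with lo and mid each fitting on
	 * 255 bits, the product is congruent to lo + 19*mid + 361*th
	 * modulo p.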
	 */
	th = t7 >> 62;
	t7 = ((t7 << 1) | (t6 >> 63)) & MASK63;
	t6 = (t6 << 1) | (t5 >> 63);
	t5 = (t5 << 1) | (t4 >> 63);
	t4 = (t4 << 1) | (t3 >> 63);
	t3 &= MASK63;

	/*
	 * Multiply the middle part (t4..t7) by 19. We truncate it to
	 * 255 bits; the extra bits will go along with th.
	 */
	z = (unsigned __int128)t4 * 19;
	t4 = (uint64_t)z;
	z = (unsigned __int128)t5 * 19 + (z >> 64);
	t5 = (uint64_t)z;
	z = (unsigned __int128)t6 * 19 + (z >> 64);
	t6 = (uint64_t)z;
	z = (unsigned __int128)t7 * 19 + (z >> 64);
	t7 = (uint64_t)z & MASK63;

	th = (361 & -th) + (19 * (uint64_t)(z >> 63));

	/*
	 * Add elements together.
	 * At this point:
	 *   t0..t3 fits on 255 bits.
	 *   t4..t7 fits on 255 bits.
	 *   th <= 361 + 342 = 703.
	 */
	z = (unsigned __int128)t0 + (unsigned __int128)t4
		+ (unsigned __int128)th;
	t0 = (uint64_t)z;
	z = (unsigned __int128)t1 + (unsigned __int128)t5 + (z >> 64);
	t1 = (uint64_t)z;
	z = (unsigned __int128)t2 + (unsigned __int128)t6 + (z >> 64);
	t2 = (uint64_t)z;
	z = (unsigned __int128)t3 + (unsigned __int128)t7 + (z >> 64);
	t3 = (uint64_t)z & MASK63;
	th = (uint64_t)(z >> 63);

	/*
	 * Since the sum is at most 2^256 + 703, the two upper bits, in th,
	 * can only have value 0, 1 or 2. We just add th*19, which
	 * guarantees a result of at most 2^255+37.
	 */
	z = (unsigned __int128)t0 + (19 * th);
	d[0] = (uint64_t)z;
	z = (unsigned __int128)t1 + (z >> 64);
	d[1] = (uint64_t)z;
	z = (unsigned __int128)t2 + (z >> 64);
	d[2] = (uint64_t)z;
	d[3] = t3 + (uint64_t)(z >> 64);

#elif BR_UMUL128

	uint64_t t0, t1, t2, t3, t4, t5, t6, t7, th;
	uint64_t h0, h1, h2, h3;
	unsigned char k;

	/*
	 * Compute the product a*b over plain integers.
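	 *
	 * Each _umul128(x, y, &hi) call yields the low 64 bits of x*y
	 * and writes the high 64 bits into hi; the partial products are
	 * then combined with _addcarry_u64 carry chains that add the
	 * low and high halves into the proper result limbs.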
	 */
	t0 = _umul128(a[0], b[0], &h0);
	t1 = _umul128(a[0], b[1], &h1);
	k = _addcarry_u64(0, t1, h0, &t1);
	t2 = _umul128(a[0], b[2], &h2);
	k = _addcarry_u64(k, t2, h1, &t2);
	t3 = _umul128(a[0], b[3], &h3);
	k = _addcarry_u64(k, t3, h2, &t3);
	(void)_addcarry_u64(k, h3, 0, &t4);

	k = _addcarry_u64(0, _umul128(a[1], b[0], &h0), t1, &t1);
	k = _addcarry_u64(k, _umul128(a[1], b[1], &h1), t2, &t2);
	k = _addcarry_u64(k, _umul128(a[1], b[2], &h2), t3, &t3);
	k = _addcarry_u64(k, _umul128(a[1], b[3], &h3), t4, &t4);
	t5 = k;
	k = _addcarry_u64(0, t2, h0, &t2);
	k = _addcarry_u64(k, t3, h1, &t3);
	k = _addcarry_u64(k, t4, h2, &t4);
	(void)_addcarry_u64(k, t5, h3, &t5);

	k = _addcarry_u64(0, _umul128(a[2], b[0], &h0), t2, &t2);
	k = _addcarry_u64(k, _umul128(a[2], b[1], &h1), t3, &t3);
	k = _addcarry_u64(k, _umul128(a[2], b[2], &h2), t4, &t4);
	k = _addcarry_u64(k, _umul128(a[2], b[3], &h3), t5, &t5);
	t6 = k;
	k = _addcarry_u64(0, t3, h0, &t3);
	k = _addcarry_u64(k, t4, h1, &t4);
	k = _addcarry_u64(k, t5, h2, &t5);
	(void)_addcarry_u64(k, t6, h3, &t6);

	k = _addcarry_u64(0, _umul128(a[3], b[0], &h0), t3, &t3);
	k = _addcarry_u64(k, _umul128(a[3], b[1], &h1), t4, &t4);
	k = _addcarry_u64(k, _umul128(a[3], b[2], &h2), t5, &t5);
	k = _addcarry_u64(k, _umul128(a[3], b[3], &h3), t6, &t6);
	t7 = k;
	k = _addcarry_u64(0, t4, h0, &t4);
	k = _addcarry_u64(k, t5, h1, &t5);
	k = _addcarry_u64(k, t6, h2, &t6);
	(void)_addcarry_u64(k, t7, h3, &t7);

	/*
	 * Modulo p, we have:
	 *
	 *   2^255 = 19
	 *   2^510 = 19*19 = 361
	 *
	 * We split the intermediate t into three parts, in basis
	 * 2^255. The low one will be in t0..t3; the middle one in t4..t7.
	 * The upper one can only be a single bit (th), since the
	 * multiplication operands are at most 2^255+37 each.
	 */
	th = t7 >> 62;
	t7 = ((t7 << 1) | (t6 >> 63)) & MASK63;
	t6 = (t6 << 1) | (t5 >> 63);
	t5 = (t5 << 1) | (t4 >> 63);
	t4 = (t4 << 1) | (t3 >> 63);
	t3 &= MASK63;

	/*
	 * Multiply the middle part (t4..t7) by 19. We truncate it to
	 * 255 bits; the extra bits will go along with th.
	 */
	t4 = _umul128(t4, 19, &h0);
	t5 = _umul128(t5, 19, &h1);
	t6 = _umul128(t6, 19, &h2);
	t7 = _umul128(t7, 19, &h3);
	k = _addcarry_u64(0, t5, h0, &t5);
	k = _addcarry_u64(k, t6, h1, &t6);
	k = _addcarry_u64(k, t7, h2, &t7);
	(void)_addcarry_u64(k, h3, 0, &h3);
	th = (361 & -th) + (19 * ((h3 << 1) + (t7 >> 63)));
	t7 &= MASK63;

	/*
	 * Add elements together.
	 * At this point:
	 *   t0..t3 fits on 255 bits.
	 *   t4..t7 fits on 255 bits.
	 *   th <= 361 + 342 = 703.
	 */
	k = _addcarry_u64(0, t0, t4, &t0);
	k = _addcarry_u64(k, t1, t5, &t1);
	k = _addcarry_u64(k, t2, t6, &t2);
	k = _addcarry_u64(k, t3, t7, &t3);
	t4 = k;
	k = _addcarry_u64(0, t0, th, &t0);
	k = _addcarry_u64(k, t1, 0, &t1);
	k = _addcarry_u64(k, t2, 0, &t2);
	k = _addcarry_u64(k, t3, 0, &t3);
	(void)_addcarry_u64(k, t4, 0, &t4);

	th = (t4 << 1) + (t3 >> 63);
	t3 &= MASK63;

	/*
	 * Since the sum is at most 2^256 + 703, the two upper bits, in th,
	 * can only have value 0, 1 or 2. We just add th*19, which
	 * guarantees a result of at most 2^255+37.
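	 *
	 * Indeed, if th = 2 then the low 255 bits are at most 703 and
	 * we add 38; if th = 1 they are below 2^255 and we add 19; in
	 * all cases the result is at most 2^255+37.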
	 */
	k = _addcarry_u64(0, t0, 19 * th, &d[0]);
	k = _addcarry_u64(k, t1, 0, &d[1]);
	k = _addcarry_u64(k, t2, 0, &d[2]);
	(void)_addcarry_u64(k, t3, 0, &d[3]);

#endif
}

/*
 * Multiplication by A24 = 121665.
 */
static inline void
f255_mul_a24(uint64_t *d, const uint64_t *a)
{
#if BR_INT128

	uint64_t t0, t1, t2, t3;
	unsigned __int128 z;

	z = (unsigned __int128)a[0] * 121665;
	t0 = (uint64_t)z;
	z = (unsigned __int128)a[1] * 121665 + (z >> 64);
	t1 = (uint64_t)z;
	z = (unsigned __int128)a[2] * 121665 + (z >> 64);
	t2 = (uint64_t)z;
	z = (unsigned __int128)a[3] * 121665 + (z >> 64);
	t3 = (uint64_t)z & MASK63;

	z = (unsigned __int128)t0 + (19 * (uint64_t)(z >> 63));
	t0 = (uint64_t)z;
	z = (unsigned __int128)t1 + (z >> 64);
	t1 = (uint64_t)z;
	z = (unsigned __int128)t2 + (z >> 64);
	t2 = (uint64_t)z;
	t3 = t3 + (uint64_t)(z >> 64);

	z = (unsigned __int128)t0 + (19 & -(t3 >> 63));
	d[0] = (uint64_t)z;
	z = (unsigned __int128)t1 + (z >> 64);
	d[1] = (uint64_t)z;
	z = (unsigned __int128)t2 + (z >> 64);
	d[2] = (uint64_t)z;
	d[3] = (t3 & MASK63) + (uint64_t)(z >> 64);

#elif BR_UMUL128

	uint64_t t0, t1, t2, t3, t4, h0, h1, h2, h3;
	unsigned char k;

	t0 = _umul128(a[0], 121665, &h0);
	t1 = _umul128(a[1], 121665, &h1);
	k = _addcarry_u64(0, t1, h0, &t1);
	t2 = _umul128(a[2], 121665, &h2);
	k = _addcarry_u64(k, t2, h1, &t2);
	t3 = _umul128(a[3], 121665, &h3);
	k = _addcarry_u64(k, t3, h2, &t3);
	(void)_addcarry_u64(k, h3, 0, &t4);

	t4 = (t4 << 1) + (t3 >> 63);
	t3 &= MASK63;
	k = _addcarry_u64(0, t0, 19 * t4, &t0);
	k = _addcarry_u64(k, t1, 0, &t1);
	k = _addcarry_u64(k, t2, 0, &t2);
	(void)_addcarry_u64(k, t3, 0, &t3);

	t4 = 19 & -(t3 >> 63);
	t3 &= MASK63;
	k = _addcarry_u64(0, t0, t4, &d[0]);
	k = _addcarry_u64(k, t1, 0, &d[1]);
	k = _addcarry_u64(k, t2, 0, &d[2]);
	(void)_addcarry_u64(k, t3, 0, &d[3]);

#endif
}

/*
 * Finalize reduction.
 */
static inline void
f255_final_reduce(uint64_t *a)
{
#if BR_INT128

	uint64_t t0, t1, t2, t3, m;
	unsigned __int128 z;

	/*
	 * We add 19. If the result (in t) is below 2^255, then a[]
	 * is already less than 2^255-19, thus already reduced.
	 * Otherwise, we subtract 2^255 from t[], in which case we
	 * have t = a - (2^255-19), and that's our result.
	 */
	z = (unsigned __int128)a[0] + 19;
	t0 = (uint64_t)z;
	z = (unsigned __int128)a[1] + (z >> 64);
	t1 = (uint64_t)z;
	z = (unsigned __int128)a[2] + (z >> 64);
	t2 = (uint64_t)z;
	t3 = a[3] + (uint64_t)(z >> 64);

	m = -(t3 >> 63);
	t3 &= MASK63;
	a[0] ^= m & (a[0] ^ t0);
	a[1] ^= m & (a[1] ^ t1);
	a[2] ^= m & (a[2] ^ t2);
	a[3] ^= m & (a[3] ^ t3);

#elif BR_UMUL128

	uint64_t t0, t1, t2, t3, m;
	unsigned char k;

	/*
	 * We add 19. If the result (in t) is below 2^255, then a[]
	 * is already less than 2^255-19, thus already reduced.
	 * Otherwise, we subtract 2^255 from t[], in which case we
	 * have t = a - (2^255-19), and that's our result.
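	 *
	 * The selection below is branchless: m is an all-ones mask
	 * exactly when bit 255 of t is set (i.e. when a >= p), so the
	 * reduced value is copied without any secret-dependent branch.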
	 */
	k = _addcarry_u64(0, a[0], 19, &t0);
	k = _addcarry_u64(k, a[1], 0, &t1);
	k = _addcarry_u64(k, a[2], 0, &t2);
	(void)_addcarry_u64(k, a[3], 0, &t3);

	m = -(t3 >> 63);
	t3 &= MASK63;
	a[0] ^= m & (a[0] ^ t0);
	a[1] ^= m & (a[1] ^ t1);
	a[2] ^= m & (a[2] ^ t2);
	a[3] ^= m & (a[3] ^ t3);

#endif
}

static uint32_t
api_mul(unsigned char *G, size_t Glen,
	const unsigned char *kb, size_t kblen, int curve)
{
	unsigned char k[32];
	uint64_t x1[4], x2[4], z2[4], x3[4], z3[4];
	uint32_t swap;
	int i;

	(void)curve;

	/*
	 * Points are encoded over exactly 32 bytes. Multipliers must fit
	 * in 32 bytes as well.
	 */
	if (Glen != 32 || kblen > 32) {
		return 0;
	}

	/*
	 * RFC 7748 mandates that the high bit of the last point byte must
	 * be ignored/cleared.
	 */
	x1[0] = br_dec64le(&G[ 0]);
	x1[1] = br_dec64le(&G[ 8]);
	x1[2] = br_dec64le(&G[16]);
	x1[3] = br_dec64le(&G[24]) & MASK63;

	/*
	 * We can use memset() to clear values, because exact-width types
	 * like uint64_t are guaranteed to have no padding bits or
	 * trap representations.
	 */
	memset(x2, 0, sizeof x2);
	x2[0] = 1;
	memset(z2, 0, sizeof z2);
	memcpy(x3, x1, sizeof x1);
	memcpy(z3, x2, sizeof x2);

	/*
	 * The multiplier is provided in big-endian notation, and
	 * possibly shorter than 32 bytes.
	 */
	memset(k, 0, (sizeof k) - kblen);
	memcpy(k + (sizeof k) - kblen, kb, kblen);
	k[31] &= 0xF8;
	k[0] &= 0x7F;
	k[0] |= 0x40;

	swap = 0;

	for (i = 254; i >= 0; i --) {
		uint64_t a[4], aa[4], b[4], bb[4], e[4];
		uint64_t c[4], d[4], da[4], cb[4];
		uint32_t kt;

		kt = (k[31 - (i >> 3)] >> (i & 7)) & 1;
		swap ^= kt;
		f255_cswap(x2, x3, swap);
		f255_cswap(z2, z3, swap);
		swap = kt;

		/* A = x_2 + z_2 */
		f255_add(a, x2, z2);

		/* AA = A^2 */
		f255_mul(aa, a, a);

		/* B = x_2 - z_2 */
		f255_sub(b, x2, z2);

		/* BB = B^2 */
		f255_mul(bb, b, b);

		/* E = AA - BB */
		f255_sub(e, aa, bb);

		/* C = x_3 + z_3 */
		f255_add(c, x3, z3);

		/* D = x_3 - z_3 */
		f255_sub(d, x3, z3);

		/* DA = D * A */
		f255_mul(da, d, a);

		/* CB = C * B */
		f255_mul(cb, c, b);

		/* x_3 = (DA + CB)^2 */
		f255_add(x3, da, cb);
		f255_mul(x3, x3, x3);

		/* z_3 = x_1 * (DA - CB)^2 */
		f255_sub(z3, da, cb);
		f255_mul(z3, z3, z3);
		f255_mul(z3, x1, z3);

		/* x_2 = AA * BB */
		f255_mul(x2, aa, bb);

		/* z_2 = E * (AA + a24 * E) */
		f255_mul_a24(z2, e);
		f255_add(z2, aa, z2);
		f255_mul(z2, e, z2);
	}

	f255_cswap(x2, x3, swap);
	f255_cswap(z2, z3, swap);

	/*
	 * Compute 1/z2 = z2^(p-2). Since p = 2^255-19, we can mutualize
	 * most non-squarings. We use x1 and x3, now useless, as temporaries.
	 */
	memcpy(x1, z2, sizeof z2);
	for (i = 0; i < 15; i ++) {
		f255_mul(x1, x1, x1);
		f255_mul(x1, x1, z2);
	}
	memcpy(x3, x1, sizeof x1);
	for (i = 0; i < 14; i ++) {
		int j;

		for (j = 0; j < 16; j ++) {
			f255_mul(x3, x3, x3);
		}
		f255_mul(x3, x3, x1);
	}
	for (i = 14; i >= 0; i --) {
		f255_mul(x3, x3, x3);
		if ((0xFFEB >> i) & 1) {
			f255_mul(x3, z2, x3);
		}
	}

	/*
	 * Compute x2/z2. We have 1/z2 in x3.
	 */
	f255_mul(x2, x2, x3);
	f255_final_reduce(x2);

	/*
	 * Encode the final x2 value in little-endian.
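	 *
	 * Since f255_final_reduce() was applied, the limbs hold the
	 * unique representative in the 0..p-1 range, so this is the
	 * canonical 32-byte output expected by RFC 7748.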
	 */
	br_enc64le(G, x2[0]);
	br_enc64le(G + 8, x2[1]);
	br_enc64le(G + 16, x2[2]);
	br_enc64le(G + 24, x2[3]);
	return 1;
}

static size_t
api_mulgen(unsigned char *R,
	const unsigned char *x, size_t xlen, int curve)
{
	const unsigned char *G;
	size_t Glen;

	G = api_generator(curve, &Glen);
	memcpy(R, G, Glen);
	api_mul(R, Glen, x, xlen, curve);
	return Glen;
}

static uint32_t
api_muladd(unsigned char *A, const unsigned char *B, size_t len,
	const unsigned char *x, size_t xlen,
	const unsigned char *y, size_t ylen, int curve)
{
	/*
	 * We don't implement this method, since it is used for ECDSA
	 * only, and there is no ECDSA over Curve25519 (which instead
	 * uses EdDSA).
	 */
	(void)A;
	(void)B;
	(void)len;
	(void)x;
	(void)xlen;
	(void)y;
	(void)ylen;
	(void)curve;
	return 0;
}

/* see bearssl_ec.h */
const br_ec_impl br_ec_c25519_m64 = {
	(uint32_t)0x20000000,
	&api_generator,
	&api_order,
	&api_xoff,
	&api_mul,
	&api_mulgen,
	&api_muladd
};

/* see bearssl_ec.h */
const br_ec_impl *
br_ec_c25519_m64_get(void)
{
	return &br_ec_c25519_m64;
}

#else

/* see bearssl_ec.h */
const br_ec_impl *
br_ec_c25519_m64_get(void)
{
	return 0;
}

#endif
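
/*
 * Illustrative usage sketch (hedged; the buffers and their contents are
 * assumptions of the example, not part of this file): a raw X25519
 * point multiplication through this implementation's vtable.
 * BR_EC_curve25519 is the curve identifier declared in bearssl_ec.h.
 *
 *	const br_ec_impl *impl;
 *	unsigned char point[32];   // peer u-coordinate (little-endian)
 *	unsigned char scalar[32];  // private scalar (big-endian, as the
 *	                           // BearSSL EC API expects)
 *
 *	impl = br_ec_c25519_m64_get();
 *	if (impl != NULL && impl->mul(point, sizeof point,
 *		scalar, sizeof scalar, BR_EC_curve25519))
 *	{
 *		// point[] now holds the resulting u-coordinate
 *	}
 */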