/*
 * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org>
 *
 * Permission is hereby granted, free of charge, to any person obtaining
 * a copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sublicense, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice shall be
 * included in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#include "inner.h"

#if BR_INT128 || BR_UMUL128

#if BR_UMUL128
#include <intrin.h>
#endif

static const unsigned char GEN[] = {
	0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00
};

static const unsigned char ORDER[] = {
	0x7F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
	0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF
};

static const unsigned char *
api_generator(int curve, size_t *len)
{
	(void)curve;
	*len = 32;
	return GEN;
}

static const unsigned char *
api_order(int curve, size_t *len)
{
	(void)curve;
	*len = 32;
	return ORDER;
}

static size_t
api_xoff(int curve, size_t *len)
{
	(void)curve;
	*len = 32;
	return 0;
}

/*
 * A field element is encoded as four 64-bit integers, in basis 2^63.
 * Operations return partially reduced values, which may range up to
 * 2^255+37.
 */

#define MASK63   (((uint64_t)1 << 63) - (uint64_t)1)

/*
 * Swap two field elements, conditionally on a flag.
 */
static inline void
f255_cswap(uint64_t *a, uint64_t *b, uint32_t ctl)
{
	uint64_t m, w;

	m = -(uint64_t)ctl;
	w = m & (a[0] ^ b[0]); a[0] ^= w; b[0] ^= w;
	w = m & (a[1] ^ b[1]); a[1] ^= w; b[1] ^= w;
	w = m & (a[2] ^ b[2]); a[2] ^= w; b[2] ^= w;
	w = m & (a[3] ^ b[3]); a[3] ^= w; b[3] ^= w;
}
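
/*
 * How the conditional swap stays constant-time: when ctl is 1, m is
 * all-ones, so w = a[i] ^ b[i] and the two XOR assignments exchange
 * the limbs; when ctl is 0, m and w are zero and both operands are
 * rewritten with their own values. The same loads, stores and
 * operations happen in either case. Illustrative sketch (not part of
 * the build):
 *
 *	uint64_t x[4] = { 1, 2, 3, 4 }, y[4] = { 5, 6, 7, 8 };
 *	f255_cswap(x, y, 1);    // x is now {5,6,7,8}, y is {1,2,3,4}
 *	f255_cswap(x, y, 0);    // no change
 */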

/*
 * Addition in the field.
 */
static inline void
f255_add(uint64_t *d, const uint64_t *a, const uint64_t *b)
{
#if BR_INT128

	uint64_t t0, t1, t2, t3, cc;
	unsigned __int128 z;

	z = (unsigned __int128)a[0] + (unsigned __int128)b[0];
	t0 = (uint64_t)z;
	z = (unsigned __int128)a[1] + (unsigned __int128)b[1] + (z >> 64);
	t1 = (uint64_t)z;
	z = (unsigned __int128)a[2] + (unsigned __int128)b[2] + (z >> 64);
	t2 = (uint64_t)z;
	z = (unsigned __int128)a[3] + (unsigned __int128)b[3] + (z >> 64);
	t3 = (uint64_t)z & MASK63;
	cc = (uint64_t)(z >> 63);

	/*
	 * Since operands are at most 2^255+37, the sum is at most
	 * 2^256+74; thus, the carry cc is equal to 0, 1 or 2.
	 *
	 * We use: 2^255 = 19 mod p.
	 * Since we add 0, 19 or 38 to a value that fits on 255 bits,
	 * the result is at most 2^255+37.
	 */
	z = (unsigned __int128)t0 + (unsigned __int128)(19 * cc);
	d[0] = (uint64_t)z;
	z = (unsigned __int128)t1 + (z >> 64);
	d[1] = (uint64_t)z;
	z = (unsigned __int128)t2 + (z >> 64);
	d[2] = (uint64_t)z;
	d[3] = t3 + (uint64_t)(z >> 64);

#elif BR_UMUL128

	uint64_t t0, t1, t2, t3, cc;
	unsigned char k;

	k = _addcarry_u64(0, a[0], b[0], &t0);
	k = _addcarry_u64(k, a[1], b[1], &t1);
	k = _addcarry_u64(k, a[2], b[2], &t2);
	k = _addcarry_u64(k, a[3], b[3], &t3);
	cc = (k << 1) + (t3 >> 63);
	t3 &= MASK63;

	/*
	 * Since operands are at most 2^255+37, the sum is at most
	 * 2^256+74; thus, the carry cc is equal to 0, 1 or 2.
	 *
	 * We use: 2^255 = 19 mod p.
	 * Since we add 0, 19 or 38 to a value that fits on 255 bits,
	 * the result is at most 2^255+37.
	 */
	k = _addcarry_u64(0, t0, 19 * cc, &d[0]);
	k = _addcarry_u64(k, t1, 0, &d[1]);
	k = _addcarry_u64(k, t2, 0, &d[2]);
	(void)_addcarry_u64(k, t3, 0, &d[3]);

#endif
}
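
/*
 * Worked example for the folding above: the raw sum is t + cc*2^255,
 * where t is the truncated 255-bit value and cc holds bits 255 and
 * 256. Since 2^255 = 19 mod p, the sum is congruent to t + 19*cc;
 * e.g. with cc = 2 (bit 256 set), the correction is +38, which is
 * exactly 2^256 mod p.
 */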

/*
 * Subtraction.
 */
static inline void
f255_sub(uint64_t *d, const uint64_t *a, const uint64_t *b)
{
#if BR_INT128

	/*
	 * We compute t = 2^256 - 38 + a - b, which is necessarily
	 * positive but lower than 2^256 + 2^255, since a <= 2^255 + 37
	 * and b <= 2^255 + 37. We then subtract 0, p or 2*p, depending
	 * on the two upper bits of t (bits 255 and 256).
	 */

	uint64_t t0, t1, t2, t3, t4, cc;
	unsigned __int128 z;

	z = (unsigned __int128)a[0] - (unsigned __int128)b[0] - 38;
	t0 = (uint64_t)z;
	cc = -(uint64_t)(z >> 64);
	z = (unsigned __int128)a[1] - (unsigned __int128)b[1]
		- (unsigned __int128)cc;
	t1 = (uint64_t)z;
	cc = -(uint64_t)(z >> 64);
	z = (unsigned __int128)a[2] - (unsigned __int128)b[2]
		- (unsigned __int128)cc;
	t2 = (uint64_t)z;
	cc = -(uint64_t)(z >> 64);
	z = (unsigned __int128)a[3] - (unsigned __int128)b[3]
		- (unsigned __int128)cc;
	t3 = (uint64_t)z;
	t4 = 1 + (uint64_t)(z >> 64);

	/*
	 * We have a 257-bit result. The two top bits can be 00, 01 or 10,
	 * but not 11 (value t <= 2^256 - 38 + 2^255 + 37 = 2^256 + 2^255 - 1).
	 * Therefore, we can truncate to 255 bits, and add 0, 19 or 38.
	 * This guarantees that the result is at most 2^255+37.
	 */
	cc = (38 & -t4) + (19 & -(t3 >> 63));
	t3 &= MASK63;
	z = (unsigned __int128)t0 + (unsigned __int128)cc;
	d[0] = (uint64_t)z;
	z = (unsigned __int128)t1 + (z >> 64);
	d[1] = (uint64_t)z;
	z = (unsigned __int128)t2 + (z >> 64);
	d[2] = (uint64_t)z;
	d[3] = t3 + (uint64_t)(z >> 64);

#elif BR_UMUL128

	/*
	 * We compute t = 2^256 - 38 + a - b, which is necessarily
	 * positive but lower than 2^256 + 2^255, since a <= 2^255 + 37
	 * and b <= 2^255 + 37. We then subtract 0, p or 2*p, depending
	 * on the two upper bits of t (bits 255 and 256).
	 */

	uint64_t t0, t1, t2, t3, t4;
	unsigned char k;

	k = _subborrow_u64(0, a[0], b[0], &t0);
	k = _subborrow_u64(k, a[1], b[1], &t1);
	k = _subborrow_u64(k, a[2], b[2], &t2);
	k = _subborrow_u64(k, a[3], b[3], &t3);
	(void)_subborrow_u64(k, 1, 0, &t4);

	k = _subborrow_u64(0, t0, 38, &t0);
	k = _subborrow_u64(k, t1, 0, &t1);
	k = _subborrow_u64(k, t2, 0, &t2);
	k = _subborrow_u64(k, t3, 0, &t3);
	(void)_subborrow_u64(k, t4, 0, &t4);

	/*
	 * We have a 257-bit result. The two top bits can be 00, 01 or 10,
	 * but not 11 (value t <= 2^256 - 38 + 2^255 + 37 = 2^256 + 2^255 - 1).
	 * Therefore, we can truncate to 255 bits, and add 0, 19 or 38.
	 * This guarantees that the result is at most 2^255+37.
	 */
	t4 = (38 & -t4) + (19 & -(t3 >> 63));
	t3 &= MASK63;
	k = _addcarry_u64(0, t0, t4, &d[0]);
	k = _addcarry_u64(k, t1, 0, &d[1]);
	k = _addcarry_u64(k, t2, 0, &d[2]);
	(void)_addcarry_u64(k, t3, 0, &d[3]);

#endif
}
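
/*
 * Note that the initial bias 2^256 - 38 is exactly 2*p, so it does
 * not change the value modulo p; it only makes the intermediate t
 * nonnegative. Worked example: subtracting 1 from 0 gives
 * t = 2^256 - 39, whose bit 255 is set but bit 256 is not, so 19 is
 * added to the truncated value, yielding 2^255 - 20 = p - 1, which
 * is indeed -1 modulo p.
 */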

/*
 * Multiplication.
 */
static inline void
f255_mul(uint64_t *d, uint64_t *a, uint64_t *b)
{
#if BR_INT128

	unsigned __int128 z;
	uint64_t t0, t1, t2, t3, t4, t5, t6, t7, th;

	/*
	 * Compute the product a*b over plain integers.
	 */
	z = (unsigned __int128)a[0] * (unsigned __int128)b[0];
	t0 = (uint64_t)z;
	z = (unsigned __int128)a[0] * (unsigned __int128)b[1] + (z >> 64);
	t1 = (uint64_t)z;
	z = (unsigned __int128)a[0] * (unsigned __int128)b[2] + (z >> 64);
	t2 = (uint64_t)z;
	z = (unsigned __int128)a[0] * (unsigned __int128)b[3] + (z >> 64);
	t3 = (uint64_t)z;
	t4 = (uint64_t)(z >> 64);

	z = (unsigned __int128)a[1] * (unsigned __int128)b[0]
		+ (unsigned __int128)t1;
	t1 = (uint64_t)z;
	z = (unsigned __int128)a[1] * (unsigned __int128)b[1]
		+ (unsigned __int128)t2 + (z >> 64);
	t2 = (uint64_t)z;
	z = (unsigned __int128)a[1] * (unsigned __int128)b[2]
		+ (unsigned __int128)t3 + (z >> 64);
	t3 = (uint64_t)z;
	z = (unsigned __int128)a[1] * (unsigned __int128)b[3]
		+ (unsigned __int128)t4 + (z >> 64);
	t4 = (uint64_t)z;
	t5 = (uint64_t)(z >> 64);

	z = (unsigned __int128)a[2] * (unsigned __int128)b[0]
		+ (unsigned __int128)t2;
	t2 = (uint64_t)z;
	z = (unsigned __int128)a[2] * (unsigned __int128)b[1]
		+ (unsigned __int128)t3 + (z >> 64);
	t3 = (uint64_t)z;
	z = (unsigned __int128)a[2] * (unsigned __int128)b[2]
		+ (unsigned __int128)t4 + (z >> 64);
	t4 = (uint64_t)z;
	z = (unsigned __int128)a[2] * (unsigned __int128)b[3]
		+ (unsigned __int128)t5 + (z >> 64);
	t5 = (uint64_t)z;
	t6 = (uint64_t)(z >> 64);

	z = (unsigned __int128)a[3] * (unsigned __int128)b[0]
		+ (unsigned __int128)t3;
	t3 = (uint64_t)z;
	z = (unsigned __int128)a[3] * (unsigned __int128)b[1]
		+ (unsigned __int128)t4 + (z >> 64);
	t4 = (uint64_t)z;
	z = (unsigned __int128)a[3] * (unsigned __int128)b[2]
		+ (unsigned __int128)t5 + (z >> 64);
	t5 = (uint64_t)z;
	z = (unsigned __int128)a[3] * (unsigned __int128)b[3]
		+ (unsigned __int128)t6 + (z >> 64);
	t6 = (uint64_t)z;
	t7 = (uint64_t)(z >> 64);

	/*
	 * Modulo p, we have:
	 *
	 *   2^255 = 19
	 *   2^510 = 19*19 = 361
	 *
	 * We split the intermediate t into three parts, in basis
	 * 2^255. The low one will be in t0..t3; the middle one in t4..t7.
	 * The upper one can only be a single bit (th), since the
	 * multiplication operands are at most 2^255+37 each.
	 */
	th = t7 >> 62;
	t7 = ((t7 << 1) | (t6 >> 63)) & MASK63;
	t6 = (t6 << 1) | (t5 >> 63);
	t5 = (t5 << 1) | (t4 >> 63);
	t4 = (t4 << 1) | (t3 >> 63);
	t3 &= MASK63;

	/*
	 * Multiply the middle part (t4..t7) by 19. We truncate it to
	 * 255 bits; the extra bits will go along with th.
	 */
	z = (unsigned __int128)t4 * 19;
	t4 = (uint64_t)z;
	z = (unsigned __int128)t5 * 19 + (z >> 64);
	t5 = (uint64_t)z;
	z = (unsigned __int128)t6 * 19 + (z >> 64);
	t6 = (uint64_t)z;
	z = (unsigned __int128)t7 * 19 + (z >> 64);
	t7 = (uint64_t)z & MASK63;

	th = (361 & -th) + (19 * (uint64_t)(z >> 63));

	/*
	 * Add elements together.
	 * At this point:
	 *   t0..t3 fits on 255 bits.
	 *   t4..t7 fits on 255 bits.
	 *   th <= 361 + 342 = 703.
	 */
	z = (unsigned __int128)t0 + (unsigned __int128)t4
		+ (unsigned __int128)th;
	t0 = (uint64_t)z;
	z = (unsigned __int128)t1 + (unsigned __int128)t5 + (z >> 64);
	t1 = (uint64_t)z;
	z = (unsigned __int128)t2 + (unsigned __int128)t6 + (z >> 64);
	t2 = (uint64_t)z;
	z = (unsigned __int128)t3 + (unsigned __int128)t7 + (z >> 64);
	t3 = (uint64_t)z & MASK63;
	th = (uint64_t)(z >> 63);

	/*
	 * Since the sum is at most 2^256 + 703, the two upper bits, in th,
	 * can only have value 0, 1 or 2. We just add th*19, which
	 * guarantees a result of at most 2^255+37.
	 */
	z = (unsigned __int128)t0 + (19 * th);
	d[0] = (uint64_t)z;
	z = (unsigned __int128)t1 + (z >> 64);
	d[1] = (uint64_t)z;
	z = (unsigned __int128)t2 + (z >> 64);
	d[2] = (uint64_t)z;
	d[3] = t3 + (uint64_t)(z >> 64);

#elif BR_UMUL128

	uint64_t t0, t1, t2, t3, t4, t5, t6, t7, th;
	uint64_t h0, h1, h2, h3;
	unsigned char k;

	/*
	 * Compute the product a*b over plain integers.
	 */
	t0 = _umul128(a[0], b[0], &h0);
	t1 = _umul128(a[0], b[1], &h1);
	k = _addcarry_u64(0, t1, h0, &t1);
	t2 = _umul128(a[0], b[2], &h2);
	k = _addcarry_u64(k, t2, h1, &t2);
	t3 = _umul128(a[0], b[3], &h3);
	k = _addcarry_u64(k, t3, h2, &t3);
	(void)_addcarry_u64(k, h3, 0, &t4);

	k = _addcarry_u64(0, _umul128(a[1], b[0], &h0), t1, &t1);
	k = _addcarry_u64(k, _umul128(a[1], b[1], &h1), t2, &t2);
	k = _addcarry_u64(k, _umul128(a[1], b[2], &h2), t3, &t3);
	k = _addcarry_u64(k, _umul128(a[1], b[3], &h3), t4, &t4);
	t5 = k;
	k = _addcarry_u64(0, t2, h0, &t2);
	k = _addcarry_u64(k, t3, h1, &t3);
	k = _addcarry_u64(k, t4, h2, &t4);
	(void)_addcarry_u64(k, t5, h3, &t5);

	k = _addcarry_u64(0, _umul128(a[2], b[0], &h0), t2, &t2);
	k = _addcarry_u64(k, _umul128(a[2], b[1], &h1), t3, &t3);
	k = _addcarry_u64(k, _umul128(a[2], b[2], &h2), t4, &t4);
	k = _addcarry_u64(k, _umul128(a[2], b[3], &h3), t5, &t5);
	t6 = k;
	k = _addcarry_u64(0, t3, h0, &t3);
	k = _addcarry_u64(k, t4, h1, &t4);
	k = _addcarry_u64(k, t5, h2, &t5);
	(void)_addcarry_u64(k, t6, h3, &t6);

	k = _addcarry_u64(0, _umul128(a[3], b[0], &h0), t3, &t3);
	k = _addcarry_u64(k, _umul128(a[3], b[1], &h1), t4, &t4);
	k = _addcarry_u64(k, _umul128(a[3], b[2], &h2), t5, &t5);
	k = _addcarry_u64(k, _umul128(a[3], b[3], &h3), t6, &t6);
	t7 = k;
	k = _addcarry_u64(0, t4, h0, &t4);
	k = _addcarry_u64(k, t5, h1, &t5);
	k = _addcarry_u64(k, t6, h2, &t6);
	(void)_addcarry_u64(k, t7, h3, &t7);

	/*
	 * Modulo p, we have:
	 *
	 *   2^255 = 19
	 *   2^510 = 19*19 = 361
	 *
	 * We split the intermediate t into three parts, in basis
	 * 2^255. The low one will be in t0..t3; the middle one in t4..t7.
	 * The upper one can only be a single bit (th), since the
	 * multiplication operands are at most 2^255+37 each.
	 */
	th = t7 >> 62;
	t7 = ((t7 << 1) | (t6 >> 63)) & MASK63;
	t6 = (t6 << 1) | (t5 >> 63);
	t5 = (t5 << 1) | (t4 >> 63);
	t4 = (t4 << 1) | (t3 >> 63);
	t3 &= MASK63;

	/*
	 * Multiply the middle part (t4..t7) by 19. We truncate it to
	 * 255 bits; the extra bits will go along with th.
	 */
	t4 = _umul128(t4, 19, &h0);
	t5 = _umul128(t5, 19, &h1);
	t6 = _umul128(t6, 19, &h2);
	t7 = _umul128(t7, 19, &h3);
	k = _addcarry_u64(0, t5, h0, &t5);
	k = _addcarry_u64(k, t6, h1, &t6);
	k = _addcarry_u64(k, t7, h2, &t7);
	(void)_addcarry_u64(k, h3, 0, &h3);
	th = (361 & -th) + (19 * ((h3 << 1) + (t7 >> 63)));
	t7 &= MASK63;

	/*
	 * Add elements together.
	 * At this point:
	 *   t0..t3 fits on 255 bits.
	 *   t4..t7 fits on 255 bits.
	 *   th <= 361 + 342 = 703.
	 */
	k = _addcarry_u64(0, t0, t4, &t0);
	k = _addcarry_u64(k, t1, t5, &t1);
	k = _addcarry_u64(k, t2, t6, &t2);
	k = _addcarry_u64(k, t3, t7, &t3);
	t4 = k;
	k = _addcarry_u64(0, t0, th, &t0);
	k = _addcarry_u64(k, t1, 0, &t1);
	k = _addcarry_u64(k, t2, 0, &t2);
	k = _addcarry_u64(k, t3, 0, &t3);
	(void)_addcarry_u64(k, t4, 0, &t4);

	th = (t4 << 1) + (t3 >> 63);
	t3 &= MASK63;

	/*
	 * Since the sum is at most 2^256 + 703, the two upper bits, in th,
	 * can only have value 0, 1 or 2. We just add th*19, which
	 * guarantees a result of at most 2^255+37.
	 */
	k = _addcarry_u64(0, t0, 19 * th, &d[0]);
	k = _addcarry_u64(k, t1, 0, &d[1]);
	k = _addcarry_u64(k, t2, 0, &d[2]);
	(void)_addcarry_u64(k, t3, 0, &d[3]);

#endif
}
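
/*
 * Squarings in this file are performed by calling f255_mul() with the
 * same operand twice (e.g. f255_mul(aa, a, a) in the ladder below); a
 * dedicated squaring routine could share some of the cross products,
 * but none is provided here.
 */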

/*
 * Multiplication by A24 = 121665.
 */
static inline void
f255_mul_a24(uint64_t *d, const uint64_t *a)
{
#if BR_INT128

	uint64_t t0, t1, t2, t3;
	unsigned __int128 z;

	z = (unsigned __int128)a[0] * 121665;
	t0 = (uint64_t)z;
	z = (unsigned __int128)a[1] * 121665 + (z >> 64);
	t1 = (uint64_t)z;
	z = (unsigned __int128)a[2] * 121665 + (z >> 64);
	t2 = (uint64_t)z;
	z = (unsigned __int128)a[3] * 121665 + (z >> 64);
	t3 = (uint64_t)z & MASK63;

	z = (unsigned __int128)t0 + (19 * (uint64_t)(z >> 63));
	t0 = (uint64_t)z;
	z = (unsigned __int128)t1 + (z >> 64);
	t1 = (uint64_t)z;
	z = (unsigned __int128)t2 + (z >> 64);
	t2 = (uint64_t)z;
	t3 = t3 + (uint64_t)(z >> 64);

	z = (unsigned __int128)t0 + (19 & -(t3 >> 63));
	d[0] = (uint64_t)z;
	z = (unsigned __int128)t1 + (z >> 64);
	d[1] = (uint64_t)z;
	z = (unsigned __int128)t2 + (z >> 64);
	d[2] = (uint64_t)z;
	d[3] = (t3 & MASK63) + (uint64_t)(z >> 64);

#elif BR_UMUL128

	uint64_t t0, t1, t2, t3, t4, h0, h1, h2, h3;
	unsigned char k;

	t0 = _umul128(a[0], 121665, &h0);
	t1 = _umul128(a[1], 121665, &h1);
	k = _addcarry_u64(0, t1, h0, &t1);
	t2 = _umul128(a[2], 121665, &h2);
	k = _addcarry_u64(k, t2, h1, &t2);
	t3 = _umul128(a[3], 121665, &h3);
	k = _addcarry_u64(k, t3, h2, &t3);
	(void)_addcarry_u64(k, h3, 0, &t4);

	t4 = (t4 << 1) + (t3 >> 63);
	t3 &= MASK63;
	k = _addcarry_u64(0, t0, 19 * t4, &t0);
	k = _addcarry_u64(k, t1, 0, &t1);
	k = _addcarry_u64(k, t2, 0, &t2);
	(void)_addcarry_u64(k, t3, 0, &t3);

	t4 = 19 & -(t3 >> 63);
	t3 &= MASK63;
	k = _addcarry_u64(0, t0, t4, &d[0]);
	k = _addcarry_u64(k, t1, 0, &d[1]);
	k = _addcarry_u64(k, t2, 0, &d[2]);
	(void)_addcarry_u64(k, t3, 0, &d[3]);

#endif
}

/*
 * Finalize reduction.
 */
static inline void
f255_final_reduce(uint64_t *a)
{
#if BR_INT128

	uint64_t t0, t1, t2, t3, m;
	unsigned __int128 z;

	/*
	 * We add 19. If the result (in t) is below 2^255, then a[]
	 * is already less than 2^255-19, thus already reduced.
	 * Otherwise, we subtract 2^255 from t[], in which case we
	 * have t = a - (2^255-19), and that's our result.
	 */
	z = (unsigned __int128)a[0] + 19;
	t0 = (uint64_t)z;
	z = (unsigned __int128)a[1] + (z >> 64);
	t1 = (uint64_t)z;
	z = (unsigned __int128)a[2] + (z >> 64);
	t2 = (uint64_t)z;
	t3 = a[3] + (uint64_t)(z >> 64);

	m = -(t3 >> 63);
	t3 &= MASK63;
	a[0] ^= m & (a[0] ^ t0);
	a[1] ^= m & (a[1] ^ t1);
	a[2] ^= m & (a[2] ^ t2);
	a[3] ^= m & (a[3] ^ t3);

#elif BR_UMUL128

	uint64_t t0, t1, t2, t3, m;
	unsigned char k;

	/*
	 * We add 19. If the result (in t) is below 2^255, then a[]
	 * is already less than 2^255-19, thus already reduced.
	 * Otherwise, we subtract 2^255 from t[], in which case we
	 * have t = a - (2^255-19), and that's our result.
	 */
	k = _addcarry_u64(0, a[0], 19, &t0);
	k = _addcarry_u64(k, a[1], 0, &t1);
	k = _addcarry_u64(k, a[2], 0, &t2);
	(void)_addcarry_u64(k, a[3], 0, &t3);

	m = -(t3 >> 63);
	t3 &= MASK63;
	a[0] ^= m & (a[0] ^ t0);
	a[1] ^= m & (a[1] ^ t1);
	a[2] ^= m & (a[2] ^ t2);
	a[3] ^= m & (a[3] ^ t3);

#endif
}
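
/*
 * Worked example for the final reduction: if a == p (i.e. 2^255 - 19,
 * a partially reduced but not fully reduced value), then t = a + 19 =
 * 2^255, whose top bit is set; m is then all-ones and a[] is replaced
 * with t truncated to 255 bits, i.e. 0, the fully reduced value.
 */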

static uint32_t
api_mul(unsigned char *G, size_t Glen,
	const unsigned char *kb, size_t kblen, int curve)
{
	unsigned char k[32];
	uint64_t x1[4], x2[4], z2[4], x3[4], z3[4];
	uint32_t swap;
	int i;

	(void)curve;

	/*
	 * Points are encoded over exactly 32 bytes. Multipliers must fit
	 * in 32 bytes as well.
	 */
	if (Glen != 32 || kblen > 32) {
		return 0;
	}

	/*
	 * RFC 7748 mandates that the high bit of the last point byte must
	 * be ignored/cleared.
	 */
	x1[0] = br_dec64le(&G[ 0]);
	x1[1] = br_dec64le(&G[ 8]);
	x1[2] = br_dec64le(&G[16]);
	x1[3] = br_dec64le(&G[24]) & MASK63;

	/*
	 * We can use memset() to clear values, because exact-width types
	 * like uint64_t are guaranteed to have no padding bits or
	 * trap representations.
	 */
	memset(x2, 0, sizeof x2);
	x2[0] = 1;
	memset(z2, 0, sizeof z2);
	memcpy(x3, x1, sizeof x1);
	memcpy(z3, x2, sizeof x2);

	/*
	 * The multiplier is provided in big-endian notation, and
	 * possibly shorter than 32 bytes.
	 */
	memset(k, 0, (sizeof k) - kblen);
	memcpy(k + (sizeof k) - kblen, kb, kblen);
	k[31] &= 0xF8;
	k[0] &= 0x7F;
	k[0] |= 0x40;

	swap = 0;

	for (i = 254; i >= 0; i --) {
		uint64_t a[4], aa[4], b[4], bb[4], e[4];
		uint64_t c[4], d[4], da[4], cb[4];
		uint32_t kt;

		kt = (k[31 - (i >> 3)] >> (i & 7)) & 1;
		swap ^= kt;
		f255_cswap(x2, x3, swap);
		f255_cswap(z2, z3, swap);
		swap = kt;

		/* A = x_2 + z_2 */
		f255_add(a, x2, z2);

		/* AA = A^2 */
		f255_mul(aa, a, a);

		/* B = x_2 - z_2 */
		f255_sub(b, x2, z2);

		/* BB = B^2 */
		f255_mul(bb, b, b);

		/* E = AA - BB */
		f255_sub(e, aa, bb);

		/* C = x_3 + z_3 */
		f255_add(c, x3, z3);

		/* D = x_3 - z_3 */
		f255_sub(d, x3, z3);

		/* DA = D * A */
		f255_mul(da, d, a);

		/* CB = C * B */
		f255_mul(cb, c, b);

		/* x_3 = (DA + CB)^2 */
		f255_add(x3, da, cb);
		f255_mul(x3, x3, x3);

		/* z_3 = x_1 * (DA - CB)^2 */
		f255_sub(z3, da, cb);
		f255_mul(z3, z3, z3);
		f255_mul(z3, x1, z3);

		/* x_2 = AA * BB */
		f255_mul(x2, aa, bb);

		/* z_2 = E * (AA + a24 * E) */
		f255_mul_a24(z2, e);
		f255_add(z2, aa, z2);
		f255_mul(z2, e, z2);
	}

	f255_cswap(x2, x3, swap);
	f255_cswap(z2, z3, swap);

	/*
	 * Compute 1/z2 = z2^(p-2). Since p = 2^255-19, the exponent is
	 * 2^255-21, i.e. 250 bits set to 1 followed by the low bits
	 * 01011. We can thus mutualize most non-squarings: x1 is first
	 * set to z2^(2^16-1) with a square-and-multiply chain, then x3
	 * is raised to z2^(2^240-1) by reusing x1, and the last 15
	 * exponent bits are handled by the final loop, whose multiply
	 * pattern is encoded in the constant 0xFFEB. We use x1 and x3,
	 * now useless, as temporaries.
	 */
	memcpy(x1, z2, sizeof z2);
	for (i = 0; i < 15; i ++) {
		f255_mul(x1, x1, x1);
		f255_mul(x1, x1, z2);
	}
	memcpy(x3, x1, sizeof x1);
	for (i = 0; i < 14; i ++) {
		int j;

		for (j = 0; j < 16; j ++) {
			f255_mul(x3, x3, x3);
		}
		f255_mul(x3, x3, x1);
	}
	for (i = 14; i >= 0; i --) {
		f255_mul(x3, x3, x3);
		if ((0xFFEB >> i) & 1) {
			f255_mul(x3, z2, x3);
		}
	}

	/*
	 * Compute x2/z2. We have 1/z2 in x3.
	 */
	f255_mul(x2, x2, x3);
	f255_final_reduce(x2);

	/*
	 * Encode the final x2 value in little-endian.
	 */
	br_enc64le(G, x2[0]);
	br_enc64le(G + 8, x2[1]);
	br_enc64le(G + 16, x2[2]);
	br_enc64le(G + 24, x2[3]);
	return 1;
}
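
/*
 * Note on the ladder above: the loop always runs exactly 255
 * iterations and performs the same sequence of field operations
 * regardless of the scalar; the only secret-dependent values are the
 * swap flags, which are consumed by the constant-time f255_cswap().
 */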

static size_t
api_mulgen(unsigned char *R,
	const unsigned char *x, size_t xlen, int curve)
{
	const unsigned char *G;
	size_t Glen;

	G = api_generator(curve, &Glen);
	memcpy(R, G, Glen);
	api_mul(R, Glen, x, xlen, curve);
	return Glen;
}

static uint32_t
api_muladd(unsigned char *A, const unsigned char *B, size_t len,
	const unsigned char *x, size_t xlen,
	const unsigned char *y, size_t ylen, int curve)
{
	/*
	 * We don't implement this method, since it is used for ECDSA
	 * only, and there is no ECDSA over Curve25519 (which instead
	 * uses EdDSA).
	 */
	(void)A;
	(void)B;
	(void)len;
	(void)x;
	(void)xlen;
	(void)y;
	(void)ylen;
	(void)curve;
	return 0;
}

/* see bearssl_ec.h */
const br_ec_impl br_ec_c25519_m64 = {
	(uint32_t)0x20000000,
	&api_generator,
	&api_order,
	&api_xoff,
	&api_mul,
	&api_mulgen,
	&api_muladd
};

/* see bearssl_ec.h */
const br_ec_impl *
br_ec_c25519_m64_get(void)
{
	return &br_ec_c25519_m64;
}

#else

/* see bearssl_ec.h */
const br_ec_impl *
br_ec_c25519_m64_get(void)
{
	return 0;
}

#endif
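
/*
 * Usage sketch (hypothetical caller code, not part of this file): a
 * caller would obtain the vtable and go through the generic EC API,
 * e.g.:
 *
 *	const br_ec_impl *ec = br_ec_c25519_m64_get();
 *	unsigned char u[32];   // peer's Curve25519 point (little-endian)
 *	unsigned char s[32];   // secret scalar (unsigned big-endian)
 *	if (ec != NULL
 *		&& ec->mul(u, sizeof u, s, sizeof s, BR_EC_curve25519))
 *	{
 *		// u now contains the X25519 shared output
 *	}
 *
 * br_ec_c25519_m64_get() returns NULL when neither BR_INT128 nor
 * BR_UMUL128 is available; callers can then fall back to another
 * implementation such as br_ec_c25519_i31.
 */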