// SPDX-License-Identifier: GPL-2.0 OR MIT
/*
 * Copyright (C) 2016-2017 INRIA and Microsoft Corporation.
 * Copyright (C) 2018-2019 Jason A. Donenfeld <Jason@zx2c4.com>. All Rights Reserved.
 *
 * This is a machine-generated formally verified implementation of Curve25519
 * ECDH from: <https://github.com/mitls/hacl-star>. Though originally machine
 * generated, it has been tweaked to be suitable for use in the kernel. It is
 * optimized for 64-bit machines that can efficiently work with 128-bit
 * integer types.
 */

#include <asm/unaligned.h>
#include <crypto/curve25519.h>
#include <linux/string.h>

/* Constant-time mask: all ones if a == b, all zeros otherwise. */
static __always_inline u64 u64_eq_mask(u64 a, u64 b)
{
	u64 x = a ^ b;
	u64 minus_x = ~x + (u64)1U;
	u64 x_or_minus_x = x | minus_x;
	u64 xnx = x_or_minus_x >> (u32)63U;
	u64 c = xnx - (u64)1U;
	return c;
}

/* Constant-time mask: all ones if a >= b, all zeros otherwise. */
static __always_inline u64 u64_gte_mask(u64 a, u64 b)
{
	u64 x = a;
	u64 y = b;
	u64 x_xor_y = x ^ y;
	u64 x_sub_y = x - y;
	u64 x_sub_y_xor_y = x_sub_y ^ y;
	u64 q = x_xor_y | x_sub_y_xor_y;
	u64 x_xor_q = x ^ q;
	u64 x_xor_q_ = x_xor_q >> (u32)63U;
	u64 c = x_xor_q_ - (u64)1U;
	return c;
}

/*
 * Fold the overflow of the top radix-2^51 limb back into the bottom limb,
 * using 2^255 == 19 (mod 2^255 - 19).
 */
static __always_inline void modulo_carry_top(u64 *b)
{
	u64 b4 = b[4];
	u64 b0 = b[0];
	u64 b4_ = b4 & 0x7ffffffffffffLLU;
	u64 b0_ = b0 + 19 * (b4 >> 51);
	b[4] = b4_;
	b[0] = b0_;
}

/* Truncate each 128-bit accumulator limb to its low 64 bits. */
static __always_inline void fproduct_copy_from_wide_(u64 *output, u128 *input)
{
	{
		u128 xi = input[0];
		output[0] = ((u64)(xi));
	}
	{
		u128 xi = input[1];
		output[1] = ((u64)(xi));
	}
	{
		u128 xi = input[2];
		output[2] = ((u64)(xi));
	}
	{
		u128 xi = input[3];
		output[3] = ((u64)(xi));
	}
	{
		u128 xi = input[4];
		output[4] = ((u64)(xi));
	}
}

/* output += input * s, with each limb product widened to 128 bits. */
static __always_inline void
fproduct_sum_scalar_multiplication_(u128 *output, u64 *input, u64 s)
{
	output[0] += (u128)input[0] * s;
	output[1] += (u128)input[1] * s;
	output[2] += (u128)input[2] * s;
	output[3] += (u128)input[3] * s;
	output[4] += (u128)input[4] * s;
}

/* Propagate carries from wide limbs 0..3 into their successors. */
static __always_inline void fproduct_carry_wide_(u128 *tmp)
{
	{
		u32 ctr = 0;
		u128 tctr = tmp[ctr];
		u128 tctrp1 = tmp[ctr + 1];
		u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU;
		u128 c = ((tctr) >> (51));
		tmp[ctr] = ((u128)(r0));
		tmp[ctr + 1] = ((tctrp1) + (c));
	}
	{
		u32 ctr = 1;
		u128 tctr = tmp[ctr];
		u128 tctrp1 = tmp[ctr + 1];
		u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU;
		u128 c = ((tctr) >> (51));
		tmp[ctr] = ((u128)(r0));
		tmp[ctr + 1] = ((tctrp1) + (c));
	}
	{
		u32 ctr = 2;
		u128 tctr = tmp[ctr];
		u128 tctrp1 = tmp[ctr + 1];
		u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU;
		u128 c = ((tctr) >> (51));
		tmp[ctr] = ((u128)(r0));
		tmp[ctr + 1] = ((tctrp1) + (c));
	}
	{
		u32 ctr = 3;
		u128 tctr = tmp[ctr];
		u128 tctrp1 = tmp[ctr + 1];
		u64 r0 = ((u64)(tctr)) & 0x7ffffffffffffLLU;
		u128 c = ((tctr) >> (51));
		tmp[ctr] = ((u128)(r0));
		tmp[ctr + 1] = ((tctrp1) + (c));
	}
}

/*
 * Multiply by the radix: rotate the limbs up by one position and fold the
 * limb that wraps around back in as 19 * limb.
 */
static __always_inline void fmul_shift_reduce(u64 *output)
{
	u64 tmp = output[4];
	u64 b0;
	{
		u32 ctr = 5 - 0 - 1;
		u64 z = output[ctr - 1];
		output[ctr] = z;
	}
	{
		u32 ctr = 5 - 1 - 1;
		u64 z = output[ctr - 1];
		output[ctr] = z;
	}
	{
		u32 ctr = 5 - 2 - 1;
		u64 z = output[ctr - 1];
		output[ctr] = z;
	}
	{
		u32 ctr = 5 - 3 - 1;
		u64 z = output[ctr - 1];
		output[ctr] = z;
	}
	output[0] = tmp;
	b0 = output[0];
	output[0] = 19 * b0;
}

/* Accumulate the schoolbook product of input and input21 into the wide accumulator. */
static __always_inline void fmul_mul_shift_reduce_(u128 *output, u64 *input,
						   u64 *input21)
{
	u32 i;
	u64 input2i;
	{
		u64 input2i = input21[0];
		fproduct_sum_scalar_multiplication_(output, input, input2i);
		fmul_shift_reduce(input);
	}
	{
		u64 input2i = input21[1];
		fproduct_sum_scalar_multiplication_(output, input, input2i);
		fmul_shift_reduce(input);
	}
	{
		u64 input2i = input21[2];
		fproduct_sum_scalar_multiplication_(output, input, input2i);
		fmul_shift_reduce(input);
	}
	{
		u64 input2i = input21[3];
		fproduct_sum_scalar_multiplication_(output, input, input2i);
		fmul_shift_reduce(input);
	}
	i = 4;
	input2i = input21[i];
	fproduct_sum_scalar_multiplication_(output, input, input2i);
}

/* Field multiplication modulo 2^255 - 19: output = input * input21. */
static __always_inline void fmul_fmul(u64 *output, u64 *input, u64 *input21)
{
	u64 tmp[5] = { input[0], input[1], input[2], input[3], input[4] };
	{
		u128 b4;
		u128 b0;
		u128 b4_;
		u128 b0_;
		u64 i0;
		u64 i1;
		u64 i0_;
		u64 i1_;
		u128 t[5] = { 0 };
		fmul_mul_shift_reduce_(t, tmp, input21);
		fproduct_carry_wide_(t);
		b4 = t[4];
		b0 = t[0];
		b4_ = ((b4) & (((u128)(0x7ffffffffffffLLU))));
		b0_ = ((b0) + (((u128)(19) * (((u64)(((b4) >> (51))))))));
		t[4] = b4_;
		t[0] = b0_;
		fproduct_copy_from_wide_(output, t);
		i0 = output[0];
		i1 = output[1];
		i0_ = i0 & 0x7ffffffffffffLLU;
		i1_ = i1 + (i0 >> 51);
		output[0] = i0_;
		output[1] = i1_;
	}
}

/* Core of field squaring: compute the wide limb products of output^2. */
static __always_inline void fsquare_fsquare__(u128 *tmp, u64 *output)
{
	u64 r0 = output[0];
	u64 r1 = output[1];
	u64 r2 = output[2];
	u64 r3 = output[3];
	u64 r4 = output[4];
	u64 d0 = r0 * 2;
	u64 d1 = r1 * 2;
	u64 d2 = r2 * 2 * 19;
	u64 d419 = r4 * 19;
	u64 d4 = d419 * 2;
	u128 s0 = ((((((u128)(r0) * (r0))) + (((u128)(d4) * (r1))))) +
		   (((u128)(d2) * (r3))));
	u128 s1 = ((((((u128)(d0) * (r1))) + (((u128)(d4) * (r2))))) +
		   (((u128)(r3 * 19) * (r3))));
	u128 s2 = ((((((u128)(d0) * (r2))) + (((u128)(r1) * (r1))))) +
		   (((u128)(d4) * (r3))));
	u128 s3 = ((((((u128)(d0) * (r3))) + (((u128)(d1) * (r2))))) +
		   (((u128)(r4) * (d419))));
	u128 s4 = ((((((u128)(d0) * (r4))) + (((u128)(d1) * (r3))))) +
		   (((u128)(r2) * (r2))));
	tmp[0] = s0;
	tmp[1] = s1;
	tmp[2] = s2;
	tmp[3] = s3;
	tmp[4] = s4;
}

/* Field squaring modulo 2^255 - 19: output = output^2, using tmp as scratch. */
static __always_inline void fsquare_fsquare_(u128 *tmp, u64 *output)
{
	u128 b4;
	u128 b0;
	u128 b4_;
	u128 b0_;
	u64 i0;
	u64 i1;
	u64 i0_;
	u64 i1_;
	fsquare_fsquare__(tmp, output);
	fproduct_carry_wide_(tmp);
	b4 = tmp[4];
	b0 = tmp[0];
	b4_ = ((b4) & (((u128)(0x7ffffffffffffLLU))));
	b0_ = ((b0) + (((u128)(19) * (((u64)(((b4) >> (51))))))));
	tmp[4] = b4_;
	tmp[0] = b0_;
	fproduct_copy_from_wide_(output, tmp);
	i0 = output[0];
	i1 = output[1];
	i0_ = i0 & 0x7ffffffffffffLLU;
	i1_ = i1 + (i0 >> 51);
	output[0] = i0_;
	output[1] = i1_;
}

/* Square the value count1 times in place. */
static __always_inline void fsquare_fsquare_times_(u64 *output, u128 *tmp,
						   u32 count1)
{
	u32 i;
	fsquare_fsquare_(tmp, output);
	for (i = 1; i < count1; ++i)
		fsquare_fsquare_(tmp, output);
}

/* output = input^(2^count1). */
static __always_inline void fsquare_fsquare_times(u64 *output, u64 *input,
						  u32 count1)
{
	u128 t[5];
	memcpy(output, input, 5 * sizeof(*input));
	fsquare_fsquare_times_(output, t, count1);
}

/* output = output^(2^count1), squaring in place. */
static __always_inline void fsquare_fsquare_times_inplace(u64 *output,
							   u32 count1)
{
	u128 t[5];
	fsquare_fsquare_times_(output, t, count1);
}

/*
 * Field inversion via Fermat's little theorem: out = z^(p - 2), computed
 * with the standard Curve25519 addition chain.
 */
static __always_inline void crecip_crecip(u64 *out, u64 *z)
{
	u64 buf[20] = { 0 };
	u64 *a0 = buf;
	u64 *t00 = buf + 5;
	u64 *b0 = buf + 10;
	u64 *t01;
	u64 *b1;
	u64 *c0;
	u64 *a;
	u64 *t0;
	u64 *b;
	u64 *c;
	fsquare_fsquare_times(a0, z, 1);
	fsquare_fsquare_times(t00, a0, 2);
	fmul_fmul(b0, t00, z);
	fmul_fmul(a0, b0, a0);
	fsquare_fsquare_times(t00, a0, 1);
	fmul_fmul(b0, t00, b0);
	fsquare_fsquare_times(t00, b0, 5);
	t01 = buf + 5;
	b1 = buf + 10;
	c0 = buf + 15;
	fmul_fmul(b1, t01, b1);
	fsquare_fsquare_times(t01, b1, 10);
	fmul_fmul(c0, t01, b1);
	fsquare_fsquare_times(t01, c0, 20);
	fmul_fmul(t01, t01, c0);
	fsquare_fsquare_times_inplace(t01, 10);
	fmul_fmul(b1, t01, b1);
	fsquare_fsquare_times(t01, b1, 50);
	a = buf;
	t0 = buf + 5;
	b = buf + 10;
	c = buf + 15;
	fmul_fmul(c, t0, b);
	fsquare_fsquare_times(t0, c, 100);
	fmul_fmul(t0, t0, c);
	fsquare_fsquare_times_inplace(t0, 50);
	fmul_fmul(t0, t0, b);
	fsquare_fsquare_times_inplace(t0, 5);
	fmul_fmul(out, t0, a);
}

/* a += b, limb-wise. */
static __always_inline void fsum(u64 *a, u64 *b)
{
	a[0] += b[0];
	a[1] += b[1];
	a[2] += b[2];
	a[3] += b[3];
	a[4] += b[4];
}

/*
 * a = b - a. A multiple of p (8*p, spread across the limbs) is first added
 * to b so that every limb subtraction stays non-negative.
 */
static __always_inline void fdifference(u64 *a, u64 *b)
{
	u64 tmp[5] = { 0 };
	u64 b0;
	u64 b1;
	u64 b2;
	u64 b3;
	u64 b4;
	memcpy(tmp, b, 5 * sizeof(*b));
	b0 = tmp[0];
	b1 = tmp[1];
	b2 = tmp[2];
	b3 = tmp[3];
	b4 = tmp[4];
	tmp[0] = b0 + 0x3fffffffffff68LLU;
	tmp[1] = b1 + 0x3ffffffffffff8LLU;
	tmp[2] = b2 + 0x3ffffffffffff8LLU;
	tmp[3] = b3 + 0x3ffffffffffff8LLU;
	tmp[4] = b4 + 0x3ffffffffffff8LLU;
	{
		u64 xi = a[0];
		u64 yi = tmp[0];
		a[0] = yi - xi;
	}
	{
		u64 xi = a[1];
		u64 yi = tmp[1];
		a[1] = yi - xi;
	}
	{
		u64 xi = a[2];
		u64 yi = tmp[2];
		a[2] = yi - xi;
	}
	{
		u64 xi = a[3];
		u64 yi = tmp[3];
		a[3] = yi - xi;
	}
	{
		u64 xi = a[4];
		u64 yi = tmp[4];
		a[4] = yi - xi;
	}
}

/* output = b * s for a small scalar s, followed by a carry pass and top-limb fold. */
static __always_inline void fscalar(u64 *output, u64 *b, u64 s)
{
	u128 tmp[5];
	u128 b4;
	u128 b0;
	u128 b4_;
	u128 b0_;
	{
		u64 xi = b[0];
		tmp[0] = ((u128)(xi) * (s));
	}
	{
		u64 xi = b[1];
		tmp[1] = ((u128)(xi) * (s));
	}
	{
		u64 xi = b[2];
		tmp[2] = ((u128)(xi) * (s));
	}
	{
		u64 xi = b[3];
		tmp[3] = ((u128)(xi) * (s));
	}
	{
		u64 xi = b[4];
		tmp[4] = ((u128)(xi) * (s));
	}
	fproduct_carry_wide_(tmp);
	b4 = tmp[4];
	b0 = tmp[0];
	b4_ = ((b4) & (((u128)(0x7ffffffffffffLLU))));
	b0_ = ((b0) + (((u128)(19) * (((u64)(((b4) >> (51))))))));
	tmp[4] = b4_;
	tmp[0] = b0_;
	fproduct_copy_from_wide_(output, tmp);
}

static __always_inline void fmul(u64 *output, u64 *a, u64 *b)
{
	fmul_fmul(output, a, b);
}

static __always_inline void crecip(u64 *output, u64 *input)
{
	crecip_crecip(output, input);
}

/* Constant-time conditional swap of limb ctr - 1 of a and b, masked by swap1. */
static __always_inline void point_swap_conditional_step(u64 *a, u64 *b,
							u64 swap1, u32 ctr)
{
	u32 i = ctr - 1;
	u64 ai = a[i];
	u64 bi = b[i];
	u64 x = swap1 & (ai ^ bi);
	u64 ai1 = ai ^ x;
	u64 bi1 = bi ^ x;
	a[i] = ai1;
	b[i] = bi1;
}

static __always_inline void point_swap_conditional5(u64 *a, u64 *b, u64 swap1)
{
	point_swap_conditional_step(a, b, swap1, 5);
	point_swap_conditional_step(a, b, swap1, 4);
	point_swap_conditional_step(a, b, swap1, 3);
	point_swap_conditional_step(a, b, swap1, 2);
	point_swap_conditional_step(a, b, swap1, 1);
}

/* Constant-time swap of two (x, z) points when iswap is 1; no-op when it is 0. */
static __always_inline void point_swap_conditional(u64 *a, u64 *b, u64 iswap)
{
	u64 swap1 = 0 - iswap;
	point_swap_conditional5(a, b, swap1);
	point_swap_conditional5(a + 5, b + 5, swap1);
}

static __always_inline void point_copy(u64 *output, u64 *input)
{
	memcpy(output, input, 5 * sizeof(*input));
	memcpy(output + 5, input + 5, 5 * sizeof(*input));
}

/*
 * Combined Montgomery differential addition and doubling:
 * (pp, ppq) = (2 * p, p + pq), where qmqp is the x-coordinate of the fixed
 * difference between the two ladder points.
 */
static __always_inline void addanddouble_fmonty(u64 *pp, u64 *ppq, u64 *p,
						u64 *pq, u64 *qmqp)
{
	u64 *qx = qmqp;
	u64 *x2 = pp;
	u64 *z2 = pp + 5;
	u64 *x3 = ppq;
	u64 *z3 = ppq + 5;
	u64 *x = p;
	u64 *z = p + 5;
	u64 *xprime = pq;
	u64 *zprime = pq + 5;
	u64 buf[40] = { 0 };
	u64 *origx = buf;
	u64 *origxprime0 = buf + 5;
	u64 *xxprime0;
	u64 *zzprime0;
	u64 *origxprime;
	xxprime0 = buf + 25;
	zzprime0 = buf + 30;
	memcpy(origx, x, 5 * sizeof(*x));
	fsum(x, z);
	fdifference(z, origx);
	memcpy(origxprime0, xprime, 5 * sizeof(*xprime));
	fsum(xprime, zprime);
	fdifference(zprime, origxprime0);
	fmul(xxprime0, xprime, z);
	fmul(zzprime0, x, zprime);
	origxprime = buf + 5;
	{
		u64 *xx0;
		u64 *zz0;
		u64 *xxprime;
		u64 *zzprime;
		u64 *zzzprime;
		xx0 = buf + 15;
		zz0 = buf + 20;
		xxprime = buf + 25;
		zzprime = buf + 30;
		zzzprime = buf + 35;
		memcpy(origxprime, xxprime, 5 * sizeof(*xxprime));
		fsum(xxprime, zzprime);
		fdifference(zzprime, origxprime);
		fsquare_fsquare_times(x3, xxprime, 1);
		fsquare_fsquare_times(zzzprime, zzprime, 1);
		fmul(z3, zzzprime, qx);
		fsquare_fsquare_times(xx0, x, 1);
		fsquare_fsquare_times(zz0, z, 1);
		{
			u64 *zzz;
			u64 *xx;
			u64 *zz;
			u64 scalar;
			zzz = buf + 10;
			xx = buf + 15;
			zz = buf + 20;
			fmul(x2, xx, zz);
			fdifference(zz, xx);
			scalar = 121665;
			fscalar(zzz, zz, scalar);
			fsum(zzz, xx);
			fmul(z2, zzz, zz);
		}
	}
}

/*
 * One ladder step driven by the top bit of byt: conditionally swap the
 * working points, do the combined add/double, then swap the results back.
 */
static __always_inline void
ladder_smallloop_cmult_small_loop_step(u64 *nq, u64 *nqpq, u64 *nq2, u64 *nqpq2,
				       u64 *q, u8 byt)
{
	u64 bit0 = (u64)(byt >> 7);
	u64 bit;
	point_swap_conditional(nq, nqpq, bit0);
	addanddouble_fmonty(nq2, nqpq2, nq, nqpq, q);
	bit = (u64)(byt >> 7);
	point_swap_conditional(nq2, nqpq2, bit);
}

static __always_inline void
ladder_smallloop_cmult_small_loop_double_step(u64 *nq, u64 *nqpq, u64 *nq2,
					      u64 *nqpq2, u64 *q, u8 byt)
{
	u8 byt1;
	ladder_smallloop_cmult_small_loop_step(nq, nqpq, nq2, nqpq2, q, byt);
	byt1 = byt << 1;
	ladder_smallloop_cmult_small_loop_step(nq2, nqpq2, nq, nqpq, q, byt1);
}

/* Process 2 * i bits of byt, most significant bit first. */
static __always_inline void
ladder_smallloop_cmult_small_loop(u64 *nq, u64 *nqpq, u64 *nq2, u64 *nqpq2,
				  u64 *q, u8 byt, u32 i)
{
	while (i--) {
		ladder_smallloop_cmult_small_loop_double_step(nq, nqpq, nq2,
							      nqpq2, q, byt);
		byt <<= 2;
	}
}

/* Walk the i scalar bytes from most significant to least significant. */
static __always_inline void ladder_bigloop_cmult_big_loop(u8 *n1, u64 *nq,
							   u64 *nqpq, u64 *nq2,
							   u64 *nqpq2, u64 *q,
							   u32 i)
{
	while (i--) {
		u8 byte = n1[i];
		ladder_smallloop_cmult_small_loop(nq, nqpq, nq2, nqpq2, q,
						  byte, 4);
	}
}

/* Montgomery ladder: result = n1 * q, with q given as a projective (x, z) point. */
static void ladder_cmult(u64 *result, u8 *n1, u64 *q)
{
	u64 point_buf[40] = { 0 };
	u64 *nq = point_buf;
	u64 *nqpq = point_buf + 10;
	u64 *nq2 = point_buf + 20;
	u64 *nqpq2 = point_buf + 30;
	point_copy(nqpq, q);
	nq[0] = 1;
	ladder_bigloop_cmult_big_loop(n1, nq, nqpq, nq2, nqpq2, q, 32);
	point_copy(result, nq);
}

/* Load a 32-byte little-endian field element into five 51-bit limbs. */
static __always_inline void format_fexpand(u64 *output, const u8 *input)
{
	const u8 *x00 = input + 6;
	const u8 *x01 = input + 12;
	const u8 *x02 = input + 19;
	const u8 *x0 = input + 24;
	u64 i0, i1, i2, i3, i4, output0, output1, output2, output3, output4;
	i0 = get_unaligned_le64(input);
	i1 = get_unaligned_le64(x00);
	i2 = get_unaligned_le64(x01);
	i3 = get_unaligned_le64(x02);
	i4 = get_unaligned_le64(x0);
	output0 = i0 & 0x7ffffffffffffLLU;
	output1 = i1 >> 3 & 0x7ffffffffffffLLU;
	output2 = i2 >> 6 & 0x7ffffffffffffLLU;
	output3 = i3 >> 1 & 0x7ffffffffffffLLU;
	output4 = i4 >> 12 & 0x7ffffffffffffLLU;
	output[0] = output0;
	output[1] = output1;
	output[2] = output2;
	output[3] = output3;
	output[4] = output4;
}

static __always_inline void format_fcontract_first_carry_pass(u64 *input)
{
	u64 t0 = input[0];
	u64 t1 = input[1];
	u64 t2 = input[2];
	u64 t3 = input[3];
	u64 t4 = input[4];
	u64 t1_ = t1 + (t0 >> 51);
	u64 t0_ = t0 & 0x7ffffffffffffLLU;
	u64 t2_ = t2 + (t1_ >> 51);
	u64 t1__ = t1_ & 0x7ffffffffffffLLU;
	u64 t3_ = t3 + (t2_ >> 51);
	u64 t2__ = t2_ & 0x7ffffffffffffLLU;
	u64 t4_ = t4 + (t3_ >> 51);
	u64 t3__ = t3_ & 0x7ffffffffffffLLU;
	input[0] = t0_;
	input[1] = t1__;
	input[2] = t2__;
	input[3] = t3__;
	input[4] = t4_;
}

static __always_inline void format_fcontract_first_carry_full(u64 *input)
{
	format_fcontract_first_carry_pass(input);
	modulo_carry_top(input);
}

static __always_inline void format_fcontract_second_carry_pass(u64 *input)
{
	u64 t0 = input[0];
	u64 t1 = input[1];
	u64 t2 = input[2];
	u64 t3 = input[3];
	u64 t4 = input[4];
	u64 t1_ = t1 + (t0 >> 51);
	u64 t0_ = t0 & 0x7ffffffffffffLLU;
	u64 t2_ = t2 + (t1_ >> 51);
	u64 t1__ = t1_ & 0x7ffffffffffffLLU;
	u64 t3_ = t3 + (t2_ >> 51);
	u64 t2__ = t2_ & 0x7ffffffffffffLLU;
	u64 t4_ = t4 + (t3_ >> 51);
	u64 t3__ = t3_ & 0x7ffffffffffffLLU;
	input[0] = t0_;
	input[1] = t1__;
	input[2] = t2__;
	input[3] = t3__;
	input[4] = t4_;
}

static __always_inline void format_fcontract_second_carry_full(u64 *input)
{
	u64 i0;
	u64 i1;
	u64 i0_;
	u64 i1_;
	format_fcontract_second_carry_pass(input);
	modulo_carry_top(input);
	i0 = input[0];
	i1 = input[1];
	i0_ = i0 & 0x7ffffffffffffLLU;
	i1_ = i1 + (i0 >> 51);
	input[0] = i0_;
	input[1] = i1_;
}

/* Constant-time final subtraction of p = 2^255 - 19 when the value is >= p. */
static __always_inline void format_fcontract_trim(u64 *input)
{
	u64 a0 = input[0];
	u64 a1 = input[1];
	u64 a2 = input[2];
	u64 a3 = input[3];
	u64 a4 = input[4];
	u64 mask0 = u64_gte_mask(a0, 0x7ffffffffffedLLU);
	u64 mask1 = u64_eq_mask(a1, 0x7ffffffffffffLLU);
	u64 mask2 = u64_eq_mask(a2, 0x7ffffffffffffLLU);
	u64 mask3 = u64_eq_mask(a3, 0x7ffffffffffffLLU);
	u64 mask4 = u64_eq_mask(a4, 0x7ffffffffffffLLU);
	u64 mask = (((mask0 & mask1) & mask2) & mask3) & mask4;
	u64 a0_ = a0 - (0x7ffffffffffedLLU & mask);
	u64 a1_ = a1 - (0x7ffffffffffffLLU & mask);
	u64 a2_ = a2 - (0x7ffffffffffffLLU & mask);
	u64 a3_ = a3 - (0x7ffffffffffffLLU & mask);
	u64 a4_ = a4 - (0x7ffffffffffffLLU & mask);
	input[0] = a0_;
	input[1] = a1_;
	input[2] = a2_;
	input[3] = a3_;
	input[4] = a4_;
}

/* Pack five 51-bit limbs into the 32-byte little-endian wire format. */
static __always_inline void format_fcontract_store(u8 *output, u64 *input)
{
	u64 t0 = input[0];
	u64 t1 = input[1];
	u64 t2 = input[2];
	u64 t3 = input[3];
	u64 t4 = input[4];
	u64 o0 = t1 << 51 | t0;
	u64 o1 = t2 << 38 | t1 >> 13;
	u64 o2 = t3 << 25 | t2 >> 26;
	u64 o3 = t4 << 12 | t3 >> 39;
	u8 *b0 = output;
	u8 *b1 = output + 8;
	u8 *b2 = output + 16;
	u8 *b3 = output + 24;
	put_unaligned_le64(o0, b0);
	put_unaligned_le64(o1, b1);
	put_unaligned_le64(o2, b2);
	put_unaligned_le64(o3, b3);
}

/* Fully reduce a field element modulo p and serialize it to 32 bytes. */
static __always_inline void format_fcontract(u8 *output, u64 *input)
{
	format_fcontract_first_carry_full(input);
	format_fcontract_second_carry_full(input);
	format_fcontract_trim(input);
	format_fcontract_store(output, input);
}

/* Convert a projective (x, z) ladder result to affine x = x/z and serialize it. */
static __always_inline void format_scalar_of_point(u8 *scalar, u64 *point)
{
	u64 *x = point;
	u64 *z = point + 5;
	u64 buf[10] __aligned(32) = { 0 };
	u64 *zmone = buf;
	u64 *sc = buf + 5;
	crecip(zmone, z);
	fmul(sc, x, zmone);
	format_fcontract(scalar, sc);
}

void curve25519_generic(u8 mypublic[CURVE25519_KEY_SIZE],
			const u8 secret[CURVE25519_KEY_SIZE],
			const u8 basepoint[CURVE25519_KEY_SIZE])
{
	u64 buf0[10] __aligned(32) = { 0 };
	u64 *x0 = buf0;
	u64 *z = buf0 + 5;
	u64 *q;
	format_fexpand(x0, basepoint);
	z[0] = 1;
	q = buf0;
	{
		u8 e[32] __aligned(32) = { 0 };
		u8 *scalar;
		memcpy(e, secret, 32);
		curve25519_clamp_secret(e);
		scalar = e;
		{
			u64 buf[15] = { 0 };
			u64 *nq = buf;
			u64 *x = nq;
			x[0] = 1;
			ladder_cmult(nq, scalar, q);
			format_scalar_of_point(mypublic, nq);
			memzero_explicit(buf, sizeof(buf));
		}
		memzero_explicit(e, sizeof(e));
	}
	memzero_explicit(buf0, sizeof(buf0));
}
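
/*
 * Usage sketch (illustrative only): a caller computes an X25519 public key
 * by multiplying a 32-byte secret by the standard base point x = 9. The
 * secret is clamped inside curve25519_generic(), so the caller only has to
 * supply random bytes. In-kernel users would normally go through the
 * curve25519() wrapper declared in <crypto/curve25519.h>, which selects an
 * arch-optimized implementation when one is available; the direct call
 * below just makes the parameter roles of this file's entry point concrete.
 *
 *	static const u8 basepoint[CURVE25519_KEY_SIZE] = { 9 };
 *	u8 secret[CURVE25519_KEY_SIZE];
 *	u8 mypublic[CURVE25519_KEY_SIZE];
 *
 *	get_random_bytes(secret, CURVE25519_KEY_SIZE);
 *	curve25519_generic(mypublic, secret, basepoint);
 */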