1 /* 2 * Copyright (c) 2017 Thomas Pornin <pornin@bolet.org> 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining 5 * a copy of this software and associated documentation files (the 6 * "Software"), to deal in the Software without restriction, including 7 * without limitation the rights to use, copy, modify, merge, publish, 8 * distribute, sublicense, and/or sell copies of the Software, and to 9 * permit persons to whom the Software is furnished to do so, subject to 10 * the following conditions: 11 * 12 * The above copyright notice and this permission notice shall be 13 * included in all copies or substantial portions of the Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 16 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 17 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 18 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 19 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 20 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 21 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 * SOFTWARE. 23 */ 24 25 #include "inner.h" 26 27 /* obsolete 28 #include <stdio.h> 29 #include <stdlib.h> 30 static void 31 print_int(const char *name, const uint32_t *x) 32 { 33 size_t u; 34 unsigned char tmp[36]; 35 36 printf("%s = ", name); 37 for (u = 0; u < 20; u ++) { 38 if (x[u] > 0x1FFF) { 39 printf("INVALID:"); 40 for (u = 0; u < 20; u ++) { 41 printf(" %04X", x[u]); 42 } 43 printf("\n"); 44 return; 45 } 46 } 47 memset(tmp, 0, sizeof tmp); 48 for (u = 0; u < 20; u ++) { 49 uint32_t w; 50 int j, k; 51 52 w = x[u]; 53 j = 13 * (int)u; 54 k = j & 7; 55 if (k != 0) { 56 w <<= k; 57 j -= k; 58 } 59 k = j >> 3; 60 tmp[35 - k] |= (unsigned char)w; 61 tmp[34 - k] |= (unsigned char)(w >> 8); 62 tmp[33 - k] |= (unsigned char)(w >> 16); 63 tmp[32 - k] |= (unsigned char)(w >> 24); 64 } 65 for (u = 4; u < 36; u ++) { 66 printf("%02X", tmp[u]); 67 } 68 printf("\n"); 69 } 70 */ 71 72 /* 73 * If BR_NO_ARITH_SHIFT is undefined, or defined to 0, then we _assume_ 74 * that right-shifting a signed negative integer copies the sign bit 75 * (arithmetic right-shift). This is "implementation-defined behaviour", 76 * i.e. it is not undefined, but it may differ between compilers. Each 77 * compiler is supposed to document its behaviour in that respect. GCC 78 * explicitly defines that an arithmetic right shift is used. We expect 79 * all other compilers to do the same, because underlying CPU offer an 80 * arithmetic right shift opcode that could not be used otherwise. 81 */ 82 #if BR_NO_ARITH_SHIFT 83 #define ARSH(x, n) (((uint32_t)(x) >> (n)) \ 84 | ((-((uint32_t)(x) >> 31)) << (32 - (n)))) 85 #else 86 #define ARSH(x, n) ((*(int32_t *)&(x)) >> (n)) 87 #endif 88 89 /* 90 * Convert an integer from unsigned little-endian encoding to a sequence of 91 * 13-bit words in little-endian order. The final "partial" word is 92 * returned. 93 */ 94 static uint32_t 95 le8_to_le13(uint32_t *dst, const unsigned char *src, size_t len) 96 { 97 uint32_t acc; 98 int acc_len; 99 100 acc = 0; 101 acc_len = 0; 102 while (len -- > 0) { 103 acc |= (uint32_t)(*src ++) << acc_len; 104 acc_len += 8; 105 if (acc_len >= 13) { 106 *dst ++ = acc & 0x1FFF; 107 acc >>= 13; 108 acc_len -= 13; 109 } 110 } 111 return acc; 112 } 113 114 /* 115 * Convert an integer (13-bit words, little-endian) to unsigned 116 * little-endian encoding. The total encoding length is provided; all 117 * the destination bytes will be filled. 118 */ 119 static void 120 le13_to_le8(unsigned char *dst, size_t len, const uint32_t *src) 121 { 122 uint32_t acc; 123 int acc_len; 124 125 acc = 0; 126 acc_len = 0; 127 while (len -- > 0) { 128 if (acc_len < 8) { 129 acc |= (*src ++) << acc_len; 130 acc_len += 13; 131 } 132 *dst ++ = (unsigned char)acc; 133 acc >>= 8; 134 acc_len -= 8; 135 } 136 } 137 138 /* 139 * Normalise an array of words to a strict 13 bits per word. Returned 140 * value is the resulting carry. The source (w) and destination (d) 141 * arrays may be identical, but shall not overlap partially. 142 */ 143 static inline uint32_t 144 norm13(uint32_t *d, const uint32_t *w, size_t len) 145 { 146 size_t u; 147 uint32_t cc; 148 149 cc = 0; 150 for (u = 0; u < len; u ++) { 151 int32_t z; 152 153 z = w[u] + cc; 154 d[u] = z & 0x1FFF; 155 cc = ARSH(z, 13); 156 } 157 return cc; 158 } 159 160 /* 161 * mul20() multiplies two 260-bit integers together. Each word must fit 162 * on 13 bits; source operands use 20 words, destination operand 163 * receives 40 words. All overlaps allowed. 164 * 165 * square20() computes the square of a 260-bit integer. Each word must 166 * fit on 13 bits; source operand uses 20 words, destination operand 167 * receives 40 words. All overlaps allowed. 168 */ 169 170 #if BR_SLOW_MUL15 171 172 static void 173 mul20(uint32_t *d, const uint32_t *a, const uint32_t *b) 174 { 175 /* 176 * Two-level Karatsuba: turns a 20x20 multiplication into 177 * nine 5x5 multiplications. We use 13-bit words but do not 178 * propagate carries immediately, so words may expand: 179 * 180 * - First Karatsuba decomposition turns the 20x20 mul on 181 * 13-bit words into three 10x10 muls, two on 13-bit words 182 * and one on 14-bit words. 183 * 184 * - Second Karatsuba decomposition further splits these into: 185 * 186 * * four 5x5 muls on 13-bit words 187 * * four 5x5 muls on 14-bit words 188 * * one 5x5 mul on 15-bit words 189 * 190 * Highest word value is 8191, 16382 or 32764, for 13-bit, 14-bit 191 * or 15-bit words, respectively. 192 */ 193 uint32_t u[45], v[45], w[90]; 194 uint32_t cc; 195 int i; 196 197 #define ZADD(dw, d_off, s1w, s1_off, s2w, s2_off) do { \ 198 (dw)[5 * (d_off) + 0] = (s1w)[5 * (s1_off) + 0] \ 199 + (s2w)[5 * (s2_off) + 0]; \ 200 (dw)[5 * (d_off) + 1] = (s1w)[5 * (s1_off) + 1] \ 201 + (s2w)[5 * (s2_off) + 1]; \ 202 (dw)[5 * (d_off) + 2] = (s1w)[5 * (s1_off) + 2] \ 203 + (s2w)[5 * (s2_off) + 2]; \ 204 (dw)[5 * (d_off) + 3] = (s1w)[5 * (s1_off) + 3] \ 205 + (s2w)[5 * (s2_off) + 3]; \ 206 (dw)[5 * (d_off) + 4] = (s1w)[5 * (s1_off) + 4] \ 207 + (s2w)[5 * (s2_off) + 4]; \ 208 } while (0) 209 210 #define ZADDT(dw, d_off, sw, s_off) do { \ 211 (dw)[5 * (d_off) + 0] += (sw)[5 * (s_off) + 0]; \ 212 (dw)[5 * (d_off) + 1] += (sw)[5 * (s_off) + 1]; \ 213 (dw)[5 * (d_off) + 2] += (sw)[5 * (s_off) + 2]; \ 214 (dw)[5 * (d_off) + 3] += (sw)[5 * (s_off) + 3]; \ 215 (dw)[5 * (d_off) + 4] += (sw)[5 * (s_off) + 4]; \ 216 } while (0) 217 218 #define ZSUB2F(dw, d_off, s1w, s1_off, s2w, s2_off) do { \ 219 (dw)[5 * (d_off) + 0] -= (s1w)[5 * (s1_off) + 0] \ 220 + (s2w)[5 * (s2_off) + 0]; \ 221 (dw)[5 * (d_off) + 1] -= (s1w)[5 * (s1_off) + 1] \ 222 + (s2w)[5 * (s2_off) + 1]; \ 223 (dw)[5 * (d_off) + 2] -= (s1w)[5 * (s1_off) + 2] \ 224 + (s2w)[5 * (s2_off) + 2]; \ 225 (dw)[5 * (d_off) + 3] -= (s1w)[5 * (s1_off) + 3] \ 226 + (s2w)[5 * (s2_off) + 3]; \ 227 (dw)[5 * (d_off) + 4] -= (s1w)[5 * (s1_off) + 4] \ 228 + (s2w)[5 * (s2_off) + 4]; \ 229 } while (0) 230 231 #define CPR1(w, cprcc) do { \ 232 uint32_t cprz = (w) + cprcc; \ 233 (w) = cprz & 0x1FFF; \ 234 cprcc = cprz >> 13; \ 235 } while (0) 236 237 #define CPR(dw, d_off) do { \ 238 uint32_t cprcc; \ 239 cprcc = 0; \ 240 CPR1((dw)[(d_off) + 0], cprcc); \ 241 CPR1((dw)[(d_off) + 1], cprcc); \ 242 CPR1((dw)[(d_off) + 2], cprcc); \ 243 CPR1((dw)[(d_off) + 3], cprcc); \ 244 CPR1((dw)[(d_off) + 4], cprcc); \ 245 CPR1((dw)[(d_off) + 5], cprcc); \ 246 CPR1((dw)[(d_off) + 6], cprcc); \ 247 CPR1((dw)[(d_off) + 7], cprcc); \ 248 CPR1((dw)[(d_off) + 8], cprcc); \ 249 (dw)[(d_off) + 9] = cprcc; \ 250 } while (0) 251 252 memcpy(u, a, 20 * sizeof *a); 253 ZADD(u, 4, a, 0, a, 1); 254 ZADD(u, 5, a, 2, a, 3); 255 ZADD(u, 6, a, 0, a, 2); 256 ZADD(u, 7, a, 1, a, 3); 257 ZADD(u, 8, u, 6, u, 7); 258 259 memcpy(v, b, 20 * sizeof *b); 260 ZADD(v, 4, b, 0, b, 1); 261 ZADD(v, 5, b, 2, b, 3); 262 ZADD(v, 6, b, 0, b, 2); 263 ZADD(v, 7, b, 1, b, 3); 264 ZADD(v, 8, v, 6, v, 7); 265 266 /* 267 * Do the eight first 8x8 muls. Source words are at most 16382 268 * each, so we can add product results together "as is" in 32-bit 269 * words. 270 */ 271 for (i = 0; i < 40; i += 5) { 272 w[(i << 1) + 0] = MUL15(u[i + 0], v[i + 0]); 273 w[(i << 1) + 1] = MUL15(u[i + 0], v[i + 1]) 274 + MUL15(u[i + 1], v[i + 0]); 275 w[(i << 1) + 2] = MUL15(u[i + 0], v[i + 2]) 276 + MUL15(u[i + 1], v[i + 1]) 277 + MUL15(u[i + 2], v[i + 0]); 278 w[(i << 1) + 3] = MUL15(u[i + 0], v[i + 3]) 279 + MUL15(u[i + 1], v[i + 2]) 280 + MUL15(u[i + 2], v[i + 1]) 281 + MUL15(u[i + 3], v[i + 0]); 282 w[(i << 1) + 4] = MUL15(u[i + 0], v[i + 4]) 283 + MUL15(u[i + 1], v[i + 3]) 284 + MUL15(u[i + 2], v[i + 2]) 285 + MUL15(u[i + 3], v[i + 1]) 286 + MUL15(u[i + 4], v[i + 0]); 287 w[(i << 1) + 5] = MUL15(u[i + 1], v[i + 4]) 288 + MUL15(u[i + 2], v[i + 3]) 289 + MUL15(u[i + 3], v[i + 2]) 290 + MUL15(u[i + 4], v[i + 1]); 291 w[(i << 1) + 6] = MUL15(u[i + 2], v[i + 4]) 292 + MUL15(u[i + 3], v[i + 3]) 293 + MUL15(u[i + 4], v[i + 2]); 294 w[(i << 1) + 7] = MUL15(u[i + 3], v[i + 4]) 295 + MUL15(u[i + 4], v[i + 3]); 296 w[(i << 1) + 8] = MUL15(u[i + 4], v[i + 4]); 297 w[(i << 1) + 9] = 0; 298 } 299 300 /* 301 * For the 9th multiplication, source words are up to 32764, 302 * so we must do some carry propagation. If we add up to 303 * 4 products and the carry is no more than 524224, then the 304 * result fits in 32 bits, and the next carry will be no more 305 * than 524224 (because 4*(32764^2)+524224 < 8192*524225). 306 * 307 * We thus just skip one of the products in the middle word, 308 * then do a carry propagation (this reduces words to 13 bits 309 * each, except possibly the last, which may use up to 17 bits 310 * or so), then add the missing product. 311 */ 312 w[80 + 0] = MUL15(u[40 + 0], v[40 + 0]); 313 w[80 + 1] = MUL15(u[40 + 0], v[40 + 1]) 314 + MUL15(u[40 + 1], v[40 + 0]); 315 w[80 + 2] = MUL15(u[40 + 0], v[40 + 2]) 316 + MUL15(u[40 + 1], v[40 + 1]) 317 + MUL15(u[40 + 2], v[40 + 0]); 318 w[80 + 3] = MUL15(u[40 + 0], v[40 + 3]) 319 + MUL15(u[40 + 1], v[40 + 2]) 320 + MUL15(u[40 + 2], v[40 + 1]) 321 + MUL15(u[40 + 3], v[40 + 0]); 322 w[80 + 4] = MUL15(u[40 + 0], v[40 + 4]) 323 + MUL15(u[40 + 1], v[40 + 3]) 324 + MUL15(u[40 + 2], v[40 + 2]) 325 + MUL15(u[40 + 3], v[40 + 1]); 326 /* + MUL15(u[40 + 4], v[40 + 0]) */ 327 w[80 + 5] = MUL15(u[40 + 1], v[40 + 4]) 328 + MUL15(u[40 + 2], v[40 + 3]) 329 + MUL15(u[40 + 3], v[40 + 2]) 330 + MUL15(u[40 + 4], v[40 + 1]); 331 w[80 + 6] = MUL15(u[40 + 2], v[40 + 4]) 332 + MUL15(u[40 + 3], v[40 + 3]) 333 + MUL15(u[40 + 4], v[40 + 2]); 334 w[80 + 7] = MUL15(u[40 + 3], v[40 + 4]) 335 + MUL15(u[40 + 4], v[40 + 3]); 336 w[80 + 8] = MUL15(u[40 + 4], v[40 + 4]); 337 338 CPR(w, 80); 339 340 w[80 + 4] += MUL15(u[40 + 4], v[40 + 0]); 341 342 /* 343 * The products on 14-bit words in slots 6 and 7 yield values 344 * up to 5*(16382^2) each, and we need to subtract two such 345 * values from the higher word. We need the subtraction to fit 346 * in a _signed_ 32-bit integer, i.e. 31 bits + a sign bit. 347 * However, 10*(16382^2) does not fit. So we must perform a 348 * bit of reduction here. 349 */ 350 CPR(w, 60); 351 CPR(w, 70); 352 353 /* 354 * Recompose results. 355 */ 356 357 /* 0..1*0..1 into 0..3 */ 358 ZSUB2F(w, 8, w, 0, w, 2); 359 ZSUB2F(w, 9, w, 1, w, 3); 360 ZADDT(w, 1, w, 8); 361 ZADDT(w, 2, w, 9); 362 363 /* 2..3*2..3 into 4..7 */ 364 ZSUB2F(w, 10, w, 4, w, 6); 365 ZSUB2F(w, 11, w, 5, w, 7); 366 ZADDT(w, 5, w, 10); 367 ZADDT(w, 6, w, 11); 368 369 /* (0..1+2..3)*(0..1+2..3) into 12..15 */ 370 ZSUB2F(w, 16, w, 12, w, 14); 371 ZSUB2F(w, 17, w, 13, w, 15); 372 ZADDT(w, 13, w, 16); 373 ZADDT(w, 14, w, 17); 374 375 /* first-level recomposition */ 376 ZSUB2F(w, 12, w, 0, w, 4); 377 ZSUB2F(w, 13, w, 1, w, 5); 378 ZSUB2F(w, 14, w, 2, w, 6); 379 ZSUB2F(w, 15, w, 3, w, 7); 380 ZADDT(w, 2, w, 12); 381 ZADDT(w, 3, w, 13); 382 ZADDT(w, 4, w, 14); 383 ZADDT(w, 5, w, 15); 384 385 /* 386 * Perform carry propagation to bring all words down to 13 bits. 387 */ 388 cc = norm13(d, w, 40); 389 d[39] += (cc << 13); 390 391 #undef ZADD 392 #undef ZADDT 393 #undef ZSUB2F 394 #undef CPR1 395 #undef CPR 396 } 397 398 static inline void 399 square20(uint32_t *d, const uint32_t *a) 400 { 401 mul20(d, a, a); 402 } 403 404 #else 405 406 static void 407 mul20(uint32_t *d, const uint32_t *a, const uint32_t *b) 408 { 409 uint32_t t[39]; 410 411 t[ 0] = MUL15(a[ 0], b[ 0]); 412 t[ 1] = MUL15(a[ 0], b[ 1]) 413 + MUL15(a[ 1], b[ 0]); 414 t[ 2] = MUL15(a[ 0], b[ 2]) 415 + MUL15(a[ 1], b[ 1]) 416 + MUL15(a[ 2], b[ 0]); 417 t[ 3] = MUL15(a[ 0], b[ 3]) 418 + MUL15(a[ 1], b[ 2]) 419 + MUL15(a[ 2], b[ 1]) 420 + MUL15(a[ 3], b[ 0]); 421 t[ 4] = MUL15(a[ 0], b[ 4]) 422 + MUL15(a[ 1], b[ 3]) 423 + MUL15(a[ 2], b[ 2]) 424 + MUL15(a[ 3], b[ 1]) 425 + MUL15(a[ 4], b[ 0]); 426 t[ 5] = MUL15(a[ 0], b[ 5]) 427 + MUL15(a[ 1], b[ 4]) 428 + MUL15(a[ 2], b[ 3]) 429 + MUL15(a[ 3], b[ 2]) 430 + MUL15(a[ 4], b[ 1]) 431 + MUL15(a[ 5], b[ 0]); 432 t[ 6] = MUL15(a[ 0], b[ 6]) 433 + MUL15(a[ 1], b[ 5]) 434 + MUL15(a[ 2], b[ 4]) 435 + MUL15(a[ 3], b[ 3]) 436 + MUL15(a[ 4], b[ 2]) 437 + MUL15(a[ 5], b[ 1]) 438 + MUL15(a[ 6], b[ 0]); 439 t[ 7] = MUL15(a[ 0], b[ 7]) 440 + MUL15(a[ 1], b[ 6]) 441 + MUL15(a[ 2], b[ 5]) 442 + MUL15(a[ 3], b[ 4]) 443 + MUL15(a[ 4], b[ 3]) 444 + MUL15(a[ 5], b[ 2]) 445 + MUL15(a[ 6], b[ 1]) 446 + MUL15(a[ 7], b[ 0]); 447 t[ 8] = MUL15(a[ 0], b[ 8]) 448 + MUL15(a[ 1], b[ 7]) 449 + MUL15(a[ 2], b[ 6]) 450 + MUL15(a[ 3], b[ 5]) 451 + MUL15(a[ 4], b[ 4]) 452 + MUL15(a[ 5], b[ 3]) 453 + MUL15(a[ 6], b[ 2]) 454 + MUL15(a[ 7], b[ 1]) 455 + MUL15(a[ 8], b[ 0]); 456 t[ 9] = MUL15(a[ 0], b[ 9]) 457 + MUL15(a[ 1], b[ 8]) 458 + MUL15(a[ 2], b[ 7]) 459 + MUL15(a[ 3], b[ 6]) 460 + MUL15(a[ 4], b[ 5]) 461 + MUL15(a[ 5], b[ 4]) 462 + MUL15(a[ 6], b[ 3]) 463 + MUL15(a[ 7], b[ 2]) 464 + MUL15(a[ 8], b[ 1]) 465 + MUL15(a[ 9], b[ 0]); 466 t[10] = MUL15(a[ 0], b[10]) 467 + MUL15(a[ 1], b[ 9]) 468 + MUL15(a[ 2], b[ 8]) 469 + MUL15(a[ 3], b[ 7]) 470 + MUL15(a[ 4], b[ 6]) 471 + MUL15(a[ 5], b[ 5]) 472 + MUL15(a[ 6], b[ 4]) 473 + MUL15(a[ 7], b[ 3]) 474 + MUL15(a[ 8], b[ 2]) 475 + MUL15(a[ 9], b[ 1]) 476 + MUL15(a[10], b[ 0]); 477 t[11] = MUL15(a[ 0], b[11]) 478 + MUL15(a[ 1], b[10]) 479 + MUL15(a[ 2], b[ 9]) 480 + MUL15(a[ 3], b[ 8]) 481 + MUL15(a[ 4], b[ 7]) 482 + MUL15(a[ 5], b[ 6]) 483 + MUL15(a[ 6], b[ 5]) 484 + MUL15(a[ 7], b[ 4]) 485 + MUL15(a[ 8], b[ 3]) 486 + MUL15(a[ 9], b[ 2]) 487 + MUL15(a[10], b[ 1]) 488 + MUL15(a[11], b[ 0]); 489 t[12] = MUL15(a[ 0], b[12]) 490 + MUL15(a[ 1], b[11]) 491 + MUL15(a[ 2], b[10]) 492 + MUL15(a[ 3], b[ 9]) 493 + MUL15(a[ 4], b[ 8]) 494 + MUL15(a[ 5], b[ 7]) 495 + MUL15(a[ 6], b[ 6]) 496 + MUL15(a[ 7], b[ 5]) 497 + MUL15(a[ 8], b[ 4]) 498 + MUL15(a[ 9], b[ 3]) 499 + MUL15(a[10], b[ 2]) 500 + MUL15(a[11], b[ 1]) 501 + MUL15(a[12], b[ 0]); 502 t[13] = MUL15(a[ 0], b[13]) 503 + MUL15(a[ 1], b[12]) 504 + MUL15(a[ 2], b[11]) 505 + MUL15(a[ 3], b[10]) 506 + MUL15(a[ 4], b[ 9]) 507 + MUL15(a[ 5], b[ 8]) 508 + MUL15(a[ 6], b[ 7]) 509 + MUL15(a[ 7], b[ 6]) 510 + MUL15(a[ 8], b[ 5]) 511 + MUL15(a[ 9], b[ 4]) 512 + MUL15(a[10], b[ 3]) 513 + MUL15(a[11], b[ 2]) 514 + MUL15(a[12], b[ 1]) 515 + MUL15(a[13], b[ 0]); 516 t[14] = MUL15(a[ 0], b[14]) 517 + MUL15(a[ 1], b[13]) 518 + MUL15(a[ 2], b[12]) 519 + MUL15(a[ 3], b[11]) 520 + MUL15(a[ 4], b[10]) 521 + MUL15(a[ 5], b[ 9]) 522 + MUL15(a[ 6], b[ 8]) 523 + MUL15(a[ 7], b[ 7]) 524 + MUL15(a[ 8], b[ 6]) 525 + MUL15(a[ 9], b[ 5]) 526 + MUL15(a[10], b[ 4]) 527 + MUL15(a[11], b[ 3]) 528 + MUL15(a[12], b[ 2]) 529 + MUL15(a[13], b[ 1]) 530 + MUL15(a[14], b[ 0]); 531 t[15] = MUL15(a[ 0], b[15]) 532 + MUL15(a[ 1], b[14]) 533 + MUL15(a[ 2], b[13]) 534 + MUL15(a[ 3], b[12]) 535 + MUL15(a[ 4], b[11]) 536 + MUL15(a[ 5], b[10]) 537 + MUL15(a[ 6], b[ 9]) 538 + MUL15(a[ 7], b[ 8]) 539 + MUL15(a[ 8], b[ 7]) 540 + MUL15(a[ 9], b[ 6]) 541 + MUL15(a[10], b[ 5]) 542 + MUL15(a[11], b[ 4]) 543 + MUL15(a[12], b[ 3]) 544 + MUL15(a[13], b[ 2]) 545 + MUL15(a[14], b[ 1]) 546 + MUL15(a[15], b[ 0]); 547 t[16] = MUL15(a[ 0], b[16]) 548 + MUL15(a[ 1], b[15]) 549 + MUL15(a[ 2], b[14]) 550 + MUL15(a[ 3], b[13]) 551 + MUL15(a[ 4], b[12]) 552 + MUL15(a[ 5], b[11]) 553 + MUL15(a[ 6], b[10]) 554 + MUL15(a[ 7], b[ 9]) 555 + MUL15(a[ 8], b[ 8]) 556 + MUL15(a[ 9], b[ 7]) 557 + MUL15(a[10], b[ 6]) 558 + MUL15(a[11], b[ 5]) 559 + MUL15(a[12], b[ 4]) 560 + MUL15(a[13], b[ 3]) 561 + MUL15(a[14], b[ 2]) 562 + MUL15(a[15], b[ 1]) 563 + MUL15(a[16], b[ 0]); 564 t[17] = MUL15(a[ 0], b[17]) 565 + MUL15(a[ 1], b[16]) 566 + MUL15(a[ 2], b[15]) 567 + MUL15(a[ 3], b[14]) 568 + MUL15(a[ 4], b[13]) 569 + MUL15(a[ 5], b[12]) 570 + MUL15(a[ 6], b[11]) 571 + MUL15(a[ 7], b[10]) 572 + MUL15(a[ 8], b[ 9]) 573 + MUL15(a[ 9], b[ 8]) 574 + MUL15(a[10], b[ 7]) 575 + MUL15(a[11], b[ 6]) 576 + MUL15(a[12], b[ 5]) 577 + MUL15(a[13], b[ 4]) 578 + MUL15(a[14], b[ 3]) 579 + MUL15(a[15], b[ 2]) 580 + MUL15(a[16], b[ 1]) 581 + MUL15(a[17], b[ 0]); 582 t[18] = MUL15(a[ 0], b[18]) 583 + MUL15(a[ 1], b[17]) 584 + MUL15(a[ 2], b[16]) 585 + MUL15(a[ 3], b[15]) 586 + MUL15(a[ 4], b[14]) 587 + MUL15(a[ 5], b[13]) 588 + MUL15(a[ 6], b[12]) 589 + MUL15(a[ 7], b[11]) 590 + MUL15(a[ 8], b[10]) 591 + MUL15(a[ 9], b[ 9]) 592 + MUL15(a[10], b[ 8]) 593 + MUL15(a[11], b[ 7]) 594 + MUL15(a[12], b[ 6]) 595 + MUL15(a[13], b[ 5]) 596 + MUL15(a[14], b[ 4]) 597 + MUL15(a[15], b[ 3]) 598 + MUL15(a[16], b[ 2]) 599 + MUL15(a[17], b[ 1]) 600 + MUL15(a[18], b[ 0]); 601 t[19] = MUL15(a[ 0], b[19]) 602 + MUL15(a[ 1], b[18]) 603 + MUL15(a[ 2], b[17]) 604 + MUL15(a[ 3], b[16]) 605 + MUL15(a[ 4], b[15]) 606 + MUL15(a[ 5], b[14]) 607 + MUL15(a[ 6], b[13]) 608 + MUL15(a[ 7], b[12]) 609 + MUL15(a[ 8], b[11]) 610 + MUL15(a[ 9], b[10]) 611 + MUL15(a[10], b[ 9]) 612 + MUL15(a[11], b[ 8]) 613 + MUL15(a[12], b[ 7]) 614 + MUL15(a[13], b[ 6]) 615 + MUL15(a[14], b[ 5]) 616 + MUL15(a[15], b[ 4]) 617 + MUL15(a[16], b[ 3]) 618 + MUL15(a[17], b[ 2]) 619 + MUL15(a[18], b[ 1]) 620 + MUL15(a[19], b[ 0]); 621 t[20] = MUL15(a[ 1], b[19]) 622 + MUL15(a[ 2], b[18]) 623 + MUL15(a[ 3], b[17]) 624 + MUL15(a[ 4], b[16]) 625 + MUL15(a[ 5], b[15]) 626 + MUL15(a[ 6], b[14]) 627 + MUL15(a[ 7], b[13]) 628 + MUL15(a[ 8], b[12]) 629 + MUL15(a[ 9], b[11]) 630 + MUL15(a[10], b[10]) 631 + MUL15(a[11], b[ 9]) 632 + MUL15(a[12], b[ 8]) 633 + MUL15(a[13], b[ 7]) 634 + MUL15(a[14], b[ 6]) 635 + MUL15(a[15], b[ 5]) 636 + MUL15(a[16], b[ 4]) 637 + MUL15(a[17], b[ 3]) 638 + MUL15(a[18], b[ 2]) 639 + MUL15(a[19], b[ 1]); 640 t[21] = MUL15(a[ 2], b[19]) 641 + MUL15(a[ 3], b[18]) 642 + MUL15(a[ 4], b[17]) 643 + MUL15(a[ 5], b[16]) 644 + MUL15(a[ 6], b[15]) 645 + MUL15(a[ 7], b[14]) 646 + MUL15(a[ 8], b[13]) 647 + MUL15(a[ 9], b[12]) 648 + MUL15(a[10], b[11]) 649 + MUL15(a[11], b[10]) 650 + MUL15(a[12], b[ 9]) 651 + MUL15(a[13], b[ 8]) 652 + MUL15(a[14], b[ 7]) 653 + MUL15(a[15], b[ 6]) 654 + MUL15(a[16], b[ 5]) 655 + MUL15(a[17], b[ 4]) 656 + MUL15(a[18], b[ 3]) 657 + MUL15(a[19], b[ 2]); 658 t[22] = MUL15(a[ 3], b[19]) 659 + MUL15(a[ 4], b[18]) 660 + MUL15(a[ 5], b[17]) 661 + MUL15(a[ 6], b[16]) 662 + MUL15(a[ 7], b[15]) 663 + MUL15(a[ 8], b[14]) 664 + MUL15(a[ 9], b[13]) 665 + MUL15(a[10], b[12]) 666 + MUL15(a[11], b[11]) 667 + MUL15(a[12], b[10]) 668 + MUL15(a[13], b[ 9]) 669 + MUL15(a[14], b[ 8]) 670 + MUL15(a[15], b[ 7]) 671 + MUL15(a[16], b[ 6]) 672 + MUL15(a[17], b[ 5]) 673 + MUL15(a[18], b[ 4]) 674 + MUL15(a[19], b[ 3]); 675 t[23] = MUL15(a[ 4], b[19]) 676 + MUL15(a[ 5], b[18]) 677 + MUL15(a[ 6], b[17]) 678 + MUL15(a[ 7], b[16]) 679 + MUL15(a[ 8], b[15]) 680 + MUL15(a[ 9], b[14]) 681 + MUL15(a[10], b[13]) 682 + MUL15(a[11], b[12]) 683 + MUL15(a[12], b[11]) 684 + MUL15(a[13], b[10]) 685 + MUL15(a[14], b[ 9]) 686 + MUL15(a[15], b[ 8]) 687 + MUL15(a[16], b[ 7]) 688 + MUL15(a[17], b[ 6]) 689 + MUL15(a[18], b[ 5]) 690 + MUL15(a[19], b[ 4]); 691 t[24] = MUL15(a[ 5], b[19]) 692 + MUL15(a[ 6], b[18]) 693 + MUL15(a[ 7], b[17]) 694 + MUL15(a[ 8], b[16]) 695 + MUL15(a[ 9], b[15]) 696 + MUL15(a[10], b[14]) 697 + MUL15(a[11], b[13]) 698 + MUL15(a[12], b[12]) 699 + MUL15(a[13], b[11]) 700 + MUL15(a[14], b[10]) 701 + MUL15(a[15], b[ 9]) 702 + MUL15(a[16], b[ 8]) 703 + MUL15(a[17], b[ 7]) 704 + MUL15(a[18], b[ 6]) 705 + MUL15(a[19], b[ 5]); 706 t[25] = MUL15(a[ 6], b[19]) 707 + MUL15(a[ 7], b[18]) 708 + MUL15(a[ 8], b[17]) 709 + MUL15(a[ 9], b[16]) 710 + MUL15(a[10], b[15]) 711 + MUL15(a[11], b[14]) 712 + MUL15(a[12], b[13]) 713 + MUL15(a[13], b[12]) 714 + MUL15(a[14], b[11]) 715 + MUL15(a[15], b[10]) 716 + MUL15(a[16], b[ 9]) 717 + MUL15(a[17], b[ 8]) 718 + MUL15(a[18], b[ 7]) 719 + MUL15(a[19], b[ 6]); 720 t[26] = MUL15(a[ 7], b[19]) 721 + MUL15(a[ 8], b[18]) 722 + MUL15(a[ 9], b[17]) 723 + MUL15(a[10], b[16]) 724 + MUL15(a[11], b[15]) 725 + MUL15(a[12], b[14]) 726 + MUL15(a[13], b[13]) 727 + MUL15(a[14], b[12]) 728 + MUL15(a[15], b[11]) 729 + MUL15(a[16], b[10]) 730 + MUL15(a[17], b[ 9]) 731 + MUL15(a[18], b[ 8]) 732 + MUL15(a[19], b[ 7]); 733 t[27] = MUL15(a[ 8], b[19]) 734 + MUL15(a[ 9], b[18]) 735 + MUL15(a[10], b[17]) 736 + MUL15(a[11], b[16]) 737 + MUL15(a[12], b[15]) 738 + MUL15(a[13], b[14]) 739 + MUL15(a[14], b[13]) 740 + MUL15(a[15], b[12]) 741 + MUL15(a[16], b[11]) 742 + MUL15(a[17], b[10]) 743 + MUL15(a[18], b[ 9]) 744 + MUL15(a[19], b[ 8]); 745 t[28] = MUL15(a[ 9], b[19]) 746 + MUL15(a[10], b[18]) 747 + MUL15(a[11], b[17]) 748 + MUL15(a[12], b[16]) 749 + MUL15(a[13], b[15]) 750 + MUL15(a[14], b[14]) 751 + MUL15(a[15], b[13]) 752 + MUL15(a[16], b[12]) 753 + MUL15(a[17], b[11]) 754 + MUL15(a[18], b[10]) 755 + MUL15(a[19], b[ 9]); 756 t[29] = MUL15(a[10], b[19]) 757 + MUL15(a[11], b[18]) 758 + MUL15(a[12], b[17]) 759 + MUL15(a[13], b[16]) 760 + MUL15(a[14], b[15]) 761 + MUL15(a[15], b[14]) 762 + MUL15(a[16], b[13]) 763 + MUL15(a[17], b[12]) 764 + MUL15(a[18], b[11]) 765 + MUL15(a[19], b[10]); 766 t[30] = MUL15(a[11], b[19]) 767 + MUL15(a[12], b[18]) 768 + MUL15(a[13], b[17]) 769 + MUL15(a[14], b[16]) 770 + MUL15(a[15], b[15]) 771 + MUL15(a[16], b[14]) 772 + MUL15(a[17], b[13]) 773 + MUL15(a[18], b[12]) 774 + MUL15(a[19], b[11]); 775 t[31] = MUL15(a[12], b[19]) 776 + MUL15(a[13], b[18]) 777 + MUL15(a[14], b[17]) 778 + MUL15(a[15], b[16]) 779 + MUL15(a[16], b[15]) 780 + MUL15(a[17], b[14]) 781 + MUL15(a[18], b[13]) 782 + MUL15(a[19], b[12]); 783 t[32] = MUL15(a[13], b[19]) 784 + MUL15(a[14], b[18]) 785 + MUL15(a[15], b[17]) 786 + MUL15(a[16], b[16]) 787 + MUL15(a[17], b[15]) 788 + MUL15(a[18], b[14]) 789 + MUL15(a[19], b[13]); 790 t[33] = MUL15(a[14], b[19]) 791 + MUL15(a[15], b[18]) 792 + MUL15(a[16], b[17]) 793 + MUL15(a[17], b[16]) 794 + MUL15(a[18], b[15]) 795 + MUL15(a[19], b[14]); 796 t[34] = MUL15(a[15], b[19]) 797 + MUL15(a[16], b[18]) 798 + MUL15(a[17], b[17]) 799 + MUL15(a[18], b[16]) 800 + MUL15(a[19], b[15]); 801 t[35] = MUL15(a[16], b[19]) 802 + MUL15(a[17], b[18]) 803 + MUL15(a[18], b[17]) 804 + MUL15(a[19], b[16]); 805 t[36] = MUL15(a[17], b[19]) 806 + MUL15(a[18], b[18]) 807 + MUL15(a[19], b[17]); 808 t[37] = MUL15(a[18], b[19]) 809 + MUL15(a[19], b[18]); 810 t[38] = MUL15(a[19], b[19]); 811 812 d[39] = norm13(d, t, 39); 813 } 814 815 static void 816 square20(uint32_t *d, const uint32_t *a) 817 { 818 uint32_t t[39]; 819 820 t[ 0] = MUL15(a[ 0], a[ 0]); 821 t[ 1] = ((MUL15(a[ 0], a[ 1])) << 1); 822 t[ 2] = MUL15(a[ 1], a[ 1]) 823 + ((MUL15(a[ 0], a[ 2])) << 1); 824 t[ 3] = ((MUL15(a[ 0], a[ 3]) 825 + MUL15(a[ 1], a[ 2])) << 1); 826 t[ 4] = MUL15(a[ 2], a[ 2]) 827 + ((MUL15(a[ 0], a[ 4]) 828 + MUL15(a[ 1], a[ 3])) << 1); 829 t[ 5] = ((MUL15(a[ 0], a[ 5]) 830 + MUL15(a[ 1], a[ 4]) 831 + MUL15(a[ 2], a[ 3])) << 1); 832 t[ 6] = MUL15(a[ 3], a[ 3]) 833 + ((MUL15(a[ 0], a[ 6]) 834 + MUL15(a[ 1], a[ 5]) 835 + MUL15(a[ 2], a[ 4])) << 1); 836 t[ 7] = ((MUL15(a[ 0], a[ 7]) 837 + MUL15(a[ 1], a[ 6]) 838 + MUL15(a[ 2], a[ 5]) 839 + MUL15(a[ 3], a[ 4])) << 1); 840 t[ 8] = MUL15(a[ 4], a[ 4]) 841 + ((MUL15(a[ 0], a[ 8]) 842 + MUL15(a[ 1], a[ 7]) 843 + MUL15(a[ 2], a[ 6]) 844 + MUL15(a[ 3], a[ 5])) << 1); 845 t[ 9] = ((MUL15(a[ 0], a[ 9]) 846 + MUL15(a[ 1], a[ 8]) 847 + MUL15(a[ 2], a[ 7]) 848 + MUL15(a[ 3], a[ 6]) 849 + MUL15(a[ 4], a[ 5])) << 1); 850 t[10] = MUL15(a[ 5], a[ 5]) 851 + ((MUL15(a[ 0], a[10]) 852 + MUL15(a[ 1], a[ 9]) 853 + MUL15(a[ 2], a[ 8]) 854 + MUL15(a[ 3], a[ 7]) 855 + MUL15(a[ 4], a[ 6])) << 1); 856 t[11] = ((MUL15(a[ 0], a[11]) 857 + MUL15(a[ 1], a[10]) 858 + MUL15(a[ 2], a[ 9]) 859 + MUL15(a[ 3], a[ 8]) 860 + MUL15(a[ 4], a[ 7]) 861 + MUL15(a[ 5], a[ 6])) << 1); 862 t[12] = MUL15(a[ 6], a[ 6]) 863 + ((MUL15(a[ 0], a[12]) 864 + MUL15(a[ 1], a[11]) 865 + MUL15(a[ 2], a[10]) 866 + MUL15(a[ 3], a[ 9]) 867 + MUL15(a[ 4], a[ 8]) 868 + MUL15(a[ 5], a[ 7])) << 1); 869 t[13] = ((MUL15(a[ 0], a[13]) 870 + MUL15(a[ 1], a[12]) 871 + MUL15(a[ 2], a[11]) 872 + MUL15(a[ 3], a[10]) 873 + MUL15(a[ 4], a[ 9]) 874 + MUL15(a[ 5], a[ 8]) 875 + MUL15(a[ 6], a[ 7])) << 1); 876 t[14] = MUL15(a[ 7], a[ 7]) 877 + ((MUL15(a[ 0], a[14]) 878 + MUL15(a[ 1], a[13]) 879 + MUL15(a[ 2], a[12]) 880 + MUL15(a[ 3], a[11]) 881 + MUL15(a[ 4], a[10]) 882 + MUL15(a[ 5], a[ 9]) 883 + MUL15(a[ 6], a[ 8])) << 1); 884 t[15] = ((MUL15(a[ 0], a[15]) 885 + MUL15(a[ 1], a[14]) 886 + MUL15(a[ 2], a[13]) 887 + MUL15(a[ 3], a[12]) 888 + MUL15(a[ 4], a[11]) 889 + MUL15(a[ 5], a[10]) 890 + MUL15(a[ 6], a[ 9]) 891 + MUL15(a[ 7], a[ 8])) << 1); 892 t[16] = MUL15(a[ 8], a[ 8]) 893 + ((MUL15(a[ 0], a[16]) 894 + MUL15(a[ 1], a[15]) 895 + MUL15(a[ 2], a[14]) 896 + MUL15(a[ 3], a[13]) 897 + MUL15(a[ 4], a[12]) 898 + MUL15(a[ 5], a[11]) 899 + MUL15(a[ 6], a[10]) 900 + MUL15(a[ 7], a[ 9])) << 1); 901 t[17] = ((MUL15(a[ 0], a[17]) 902 + MUL15(a[ 1], a[16]) 903 + MUL15(a[ 2], a[15]) 904 + MUL15(a[ 3], a[14]) 905 + MUL15(a[ 4], a[13]) 906 + MUL15(a[ 5], a[12]) 907 + MUL15(a[ 6], a[11]) 908 + MUL15(a[ 7], a[10]) 909 + MUL15(a[ 8], a[ 9])) << 1); 910 t[18] = MUL15(a[ 9], a[ 9]) 911 + ((MUL15(a[ 0], a[18]) 912 + MUL15(a[ 1], a[17]) 913 + MUL15(a[ 2], a[16]) 914 + MUL15(a[ 3], a[15]) 915 + MUL15(a[ 4], a[14]) 916 + MUL15(a[ 5], a[13]) 917 + MUL15(a[ 6], a[12]) 918 + MUL15(a[ 7], a[11]) 919 + MUL15(a[ 8], a[10])) << 1); 920 t[19] = ((MUL15(a[ 0], a[19]) 921 + MUL15(a[ 1], a[18]) 922 + MUL15(a[ 2], a[17]) 923 + MUL15(a[ 3], a[16]) 924 + MUL15(a[ 4], a[15]) 925 + MUL15(a[ 5], a[14]) 926 + MUL15(a[ 6], a[13]) 927 + MUL15(a[ 7], a[12]) 928 + MUL15(a[ 8], a[11]) 929 + MUL15(a[ 9], a[10])) << 1); 930 t[20] = MUL15(a[10], a[10]) 931 + ((MUL15(a[ 1], a[19]) 932 + MUL15(a[ 2], a[18]) 933 + MUL15(a[ 3], a[17]) 934 + MUL15(a[ 4], a[16]) 935 + MUL15(a[ 5], a[15]) 936 + MUL15(a[ 6], a[14]) 937 + MUL15(a[ 7], a[13]) 938 + MUL15(a[ 8], a[12]) 939 + MUL15(a[ 9], a[11])) << 1); 940 t[21] = ((MUL15(a[ 2], a[19]) 941 + MUL15(a[ 3], a[18]) 942 + MUL15(a[ 4], a[17]) 943 + MUL15(a[ 5], a[16]) 944 + MUL15(a[ 6], a[15]) 945 + MUL15(a[ 7], a[14]) 946 + MUL15(a[ 8], a[13]) 947 + MUL15(a[ 9], a[12]) 948 + MUL15(a[10], a[11])) << 1); 949 t[22] = MUL15(a[11], a[11]) 950 + ((MUL15(a[ 3], a[19]) 951 + MUL15(a[ 4], a[18]) 952 + MUL15(a[ 5], a[17]) 953 + MUL15(a[ 6], a[16]) 954 + MUL15(a[ 7], a[15]) 955 + MUL15(a[ 8], a[14]) 956 + MUL15(a[ 9], a[13]) 957 + MUL15(a[10], a[12])) << 1); 958 t[23] = ((MUL15(a[ 4], a[19]) 959 + MUL15(a[ 5], a[18]) 960 + MUL15(a[ 6], a[17]) 961 + MUL15(a[ 7], a[16]) 962 + MUL15(a[ 8], a[15]) 963 + MUL15(a[ 9], a[14]) 964 + MUL15(a[10], a[13]) 965 + MUL15(a[11], a[12])) << 1); 966 t[24] = MUL15(a[12], a[12]) 967 + ((MUL15(a[ 5], a[19]) 968 + MUL15(a[ 6], a[18]) 969 + MUL15(a[ 7], a[17]) 970 + MUL15(a[ 8], a[16]) 971 + MUL15(a[ 9], a[15]) 972 + MUL15(a[10], a[14]) 973 + MUL15(a[11], a[13])) << 1); 974 t[25] = ((MUL15(a[ 6], a[19]) 975 + MUL15(a[ 7], a[18]) 976 + MUL15(a[ 8], a[17]) 977 + MUL15(a[ 9], a[16]) 978 + MUL15(a[10], a[15]) 979 + MUL15(a[11], a[14]) 980 + MUL15(a[12], a[13])) << 1); 981 t[26] = MUL15(a[13], a[13]) 982 + ((MUL15(a[ 7], a[19]) 983 + MUL15(a[ 8], a[18]) 984 + MUL15(a[ 9], a[17]) 985 + MUL15(a[10], a[16]) 986 + MUL15(a[11], a[15]) 987 + MUL15(a[12], a[14])) << 1); 988 t[27] = ((MUL15(a[ 8], a[19]) 989 + MUL15(a[ 9], a[18]) 990 + MUL15(a[10], a[17]) 991 + MUL15(a[11], a[16]) 992 + MUL15(a[12], a[15]) 993 + MUL15(a[13], a[14])) << 1); 994 t[28] = MUL15(a[14], a[14]) 995 + ((MUL15(a[ 9], a[19]) 996 + MUL15(a[10], a[18]) 997 + MUL15(a[11], a[17]) 998 + MUL15(a[12], a[16]) 999 + MUL15(a[13], a[15])) << 1); 1000 t[29] = ((MUL15(a[10], a[19]) 1001 + MUL15(a[11], a[18]) 1002 + MUL15(a[12], a[17]) 1003 + MUL15(a[13], a[16]) 1004 + MUL15(a[14], a[15])) << 1); 1005 t[30] = MUL15(a[15], a[15]) 1006 + ((MUL15(a[11], a[19]) 1007 + MUL15(a[12], a[18]) 1008 + MUL15(a[13], a[17]) 1009 + MUL15(a[14], a[16])) << 1); 1010 t[31] = ((MUL15(a[12], a[19]) 1011 + MUL15(a[13], a[18]) 1012 + MUL15(a[14], a[17]) 1013 + MUL15(a[15], a[16])) << 1); 1014 t[32] = MUL15(a[16], a[16]) 1015 + ((MUL15(a[13], a[19]) 1016 + MUL15(a[14], a[18]) 1017 + MUL15(a[15], a[17])) << 1); 1018 t[33] = ((MUL15(a[14], a[19]) 1019 + MUL15(a[15], a[18]) 1020 + MUL15(a[16], a[17])) << 1); 1021 t[34] = MUL15(a[17], a[17]) 1022 + ((MUL15(a[15], a[19]) 1023 + MUL15(a[16], a[18])) << 1); 1024 t[35] = ((MUL15(a[16], a[19]) 1025 + MUL15(a[17], a[18])) << 1); 1026 t[36] = MUL15(a[18], a[18]) 1027 + ((MUL15(a[17], a[19])) << 1); 1028 t[37] = ((MUL15(a[18], a[19])) << 1); 1029 t[38] = MUL15(a[19], a[19]); 1030 1031 d[39] = norm13(d, t, 39); 1032 } 1033 1034 #endif 1035 1036 /* 1037 * Perform a "final reduction" in field F255 (field for Curve25519) 1038 * The source value must be less than twice the modulus. If the value 1039 * is not lower than the modulus, then the modulus is subtracted and 1040 * this function returns 1; otherwise, it leaves it untouched and it 1041 * returns 0. 1042 */ 1043 static uint32_t 1044 reduce_final_f255(uint32_t *d) 1045 { 1046 uint32_t t[20]; 1047 uint32_t cc; 1048 int i; 1049 1050 memcpy(t, d, sizeof t); 1051 cc = 19; 1052 for (i = 0; i < 20; i ++) { 1053 uint32_t w; 1054 1055 w = t[i] + cc; 1056 cc = w >> 13; 1057 t[i] = w & 0x1FFF; 1058 } 1059 cc = t[19] >> 8; 1060 t[19] &= 0xFF; 1061 CCOPY(cc, d, t, sizeof t); 1062 return cc; 1063 } 1064 1065 static void 1066 f255_mulgen(uint32_t *d, const uint32_t *a, const uint32_t *b, int square) 1067 { 1068 uint32_t t[40], cc, w; 1069 1070 /* 1071 * Compute raw multiplication. All result words fit in 13 bits 1072 * each; upper word (t[39]) must fit on 5 bits, since the product 1073 * of two 256-bit integers must fit on 512 bits. 1074 */ 1075 if (square) { 1076 square20(t, a); 1077 } else { 1078 mul20(t, a, b); 1079 } 1080 1081 /* 1082 * Modular reduction: each high word is added where necessary. 1083 * Since the modulus is 2^255-19 and word 20 corresponds to 1084 * offset 20*13 = 260, word 20+k must be added to word k with 1085 * a factor of 19*2^5 = 608. The extra bits in word 19 are also 1086 * added that way. 1087 */ 1088 cc = MUL15(t[19] >> 8, 19); 1089 t[19] &= 0xFF; 1090 1091 #define MM1(x) do { \ 1092 w = t[x] + cc + MUL15(t[(x) + 20], 608); \ 1093 t[x] = w & 0x1FFF; \ 1094 cc = w >> 13; \ 1095 } while (0) 1096 1097 MM1( 0); 1098 MM1( 1); 1099 MM1( 2); 1100 MM1( 3); 1101 MM1( 4); 1102 MM1( 5); 1103 MM1( 6); 1104 MM1( 7); 1105 MM1( 8); 1106 MM1( 9); 1107 MM1(10); 1108 MM1(11); 1109 MM1(12); 1110 MM1(13); 1111 MM1(14); 1112 MM1(15); 1113 MM1(16); 1114 MM1(17); 1115 MM1(18); 1116 MM1(19); 1117 1118 #undef MM1 1119 1120 cc = MUL15(w >> 8, 19); 1121 t[19] &= 0xFF; 1122 1123 #define MM2(x) do { \ 1124 w = t[x] + cc; \ 1125 d[x] = w & 0x1FFF; \ 1126 cc = w >> 13; \ 1127 } while (0) 1128 1129 MM2( 0); 1130 MM2( 1); 1131 MM2( 2); 1132 MM2( 3); 1133 MM2( 4); 1134 MM2( 5); 1135 MM2( 6); 1136 MM2( 7); 1137 MM2( 8); 1138 MM2( 9); 1139 MM2(10); 1140 MM2(11); 1141 MM2(12); 1142 MM2(13); 1143 MM2(14); 1144 MM2(15); 1145 MM2(16); 1146 MM2(17); 1147 MM2(18); 1148 MM2(19); 1149 1150 #undef MM2 1151 } 1152 1153 /* 1154 * Perform a multiplication of two integers modulo 2^255-19. 1155 * Operands are arrays of 20 words, each containing 13 bits of data, in 1156 * little-endian order. Input value may be up to 2^256-1; on output, value 1157 * fits on 256 bits and is lower than twice the modulus. 1158 * 1159 * f255_mul() is the general multiplication, f255_square() is specialised 1160 * for squarings. 1161 */ 1162 #define f255_mul(d, a, b) f255_mulgen(d, a, b, 0) 1163 #define f255_square(d, a) f255_mulgen(d, a, a, 1) 1164 1165 /* 1166 * Add two values in F255. Partial reduction is performed (down to less 1167 * than twice the modulus). 1168 */ 1169 static void 1170 f255_add(uint32_t *d, const uint32_t *a, const uint32_t *b) 1171 { 1172 int i; 1173 uint32_t cc, w; 1174 1175 cc = 0; 1176 for (i = 0; i < 20; i ++) { 1177 w = a[i] + b[i] + cc; 1178 d[i] = w & 0x1FFF; 1179 cc = w >> 13; 1180 } 1181 cc = MUL15(w >> 8, 19); 1182 d[19] &= 0xFF; 1183 for (i = 0; i < 20; i ++) { 1184 w = d[i] + cc; 1185 d[i] = w & 0x1FFF; 1186 cc = w >> 13; 1187 } 1188 } 1189 1190 /* 1191 * Subtract one value from another in F255. Partial reduction is 1192 * performed (down to less than twice the modulus). 1193 */ 1194 static void 1195 f255_sub(uint32_t *d, const uint32_t *a, const uint32_t *b) 1196 { 1197 /* 1198 * We actually compute a - b + 2*p, so that the final value is 1199 * necessarily positive. 1200 */ 1201 int i; 1202 uint32_t cc, w; 1203 1204 cc = (uint32_t)-38; 1205 for (i = 0; i < 20; i ++) { 1206 w = a[i] - b[i] + cc; 1207 d[i] = w & 0x1FFF; 1208 cc = ARSH(w, 13); 1209 } 1210 cc = MUL15((w + 0x200) >> 8, 19); 1211 d[19] &= 0xFF; 1212 for (i = 0; i < 20; i ++) { 1213 w = d[i] + cc; 1214 d[i] = w & 0x1FFF; 1215 cc = w >> 13; 1216 } 1217 } 1218 1219 /* 1220 * Multiply an integer by the 'A24' constant (121665). Partial reduction 1221 * is performed (down to less than twice the modulus). 1222 */ 1223 static void 1224 f255_mul_a24(uint32_t *d, const uint32_t *a) 1225 { 1226 int i; 1227 uint32_t cc, w; 1228 1229 cc = 0; 1230 for (i = 0; i < 20; i ++) { 1231 w = MUL15(a[i], 121665) + cc; 1232 d[i] = w & 0x1FFF; 1233 cc = w >> 13; 1234 } 1235 cc = MUL15(w >> 8, 19); 1236 d[19] &= 0xFF; 1237 for (i = 0; i < 20; i ++) { 1238 w = d[i] + cc; 1239 d[i] = w & 0x1FFF; 1240 cc = w >> 13; 1241 } 1242 } 1243 1244 static const unsigned char GEN[] = { 1245 0x09, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 1246 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 1247 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 1248 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 1249 }; 1250 1251 static const unsigned char ORDER[] = { 1252 0x7F, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 1253 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 1254 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 1255 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF 1256 }; 1257 1258 static const unsigned char * 1259 api_generator(int curve, size_t *len) 1260 { 1261 (void)curve; 1262 *len = 32; 1263 return GEN; 1264 } 1265 1266 static const unsigned char * 1267 api_order(int curve, size_t *len) 1268 { 1269 (void)curve; 1270 *len = 32; 1271 return ORDER; 1272 } 1273 1274 static size_t 1275 api_xoff(int curve, size_t *len) 1276 { 1277 (void)curve; 1278 *len = 32; 1279 return 0; 1280 } 1281 1282 static void 1283 cswap(uint32_t *a, uint32_t *b, uint32_t ctl) 1284 { 1285 int i; 1286 1287 ctl = -ctl; 1288 for (i = 0; i < 20; i ++) { 1289 uint32_t aw, bw, tw; 1290 1291 aw = a[i]; 1292 bw = b[i]; 1293 tw = ctl & (aw ^ bw); 1294 a[i] = aw ^ tw; 1295 b[i] = bw ^ tw; 1296 } 1297 } 1298 1299 static uint32_t 1300 api_mul(unsigned char *G, size_t Glen, 1301 const unsigned char *kb, size_t kblen, int curve) 1302 { 1303 uint32_t x1[20], x2[20], x3[20], z2[20], z3[20]; 1304 uint32_t a[20], aa[20], b[20], bb[20]; 1305 uint32_t c[20], d[20], e[20], da[20], cb[20]; 1306 unsigned char k[32]; 1307 uint32_t swap; 1308 int i; 1309 1310 (void)curve; 1311 1312 /* 1313 * Points are encoded over exactly 32 bytes. Multipliers must fit 1314 * in 32 bytes as well. 1315 * RFC 7748 mandates that the high bit of the last point byte must 1316 * be ignored/cleared. 1317 */ 1318 if (Glen != 32 || kblen > 32) { 1319 return 0; 1320 } 1321 G[31] &= 0x7F; 1322 1323 /* 1324 * Initialise variables x1, x2, z2, x3 and z3. We set all of them 1325 * into Montgomery representation. 1326 */ 1327 x1[19] = le8_to_le13(x1, G, 32); 1328 memcpy(x3, x1, sizeof x1); 1329 memset(z2, 0, sizeof z2); 1330 memset(x2, 0, sizeof x2); 1331 x2[0] = 1; 1332 memset(z3, 0, sizeof z3); 1333 z3[0] = 1; 1334 1335 memset(k, 0, (sizeof k) - kblen); 1336 memcpy(k + (sizeof k) - kblen, kb, kblen); 1337 k[31] &= 0xF8; 1338 k[0] &= 0x7F; 1339 k[0] |= 0x40; 1340 1341 /* obsolete 1342 print_int("x1", x1); 1343 */ 1344 1345 swap = 0; 1346 for (i = 254; i >= 0; i --) { 1347 uint32_t kt; 1348 1349 kt = (k[31 - (i >> 3)] >> (i & 7)) & 1; 1350 swap ^= kt; 1351 cswap(x2, x3, swap); 1352 cswap(z2, z3, swap); 1353 swap = kt; 1354 1355 /* obsolete 1356 print_int("x2", x2); 1357 print_int("z2", z2); 1358 print_int("x3", x3); 1359 print_int("z3", z3); 1360 */ 1361 1362 f255_add(a, x2, z2); 1363 f255_square(aa, a); 1364 f255_sub(b, x2, z2); 1365 f255_square(bb, b); 1366 f255_sub(e, aa, bb); 1367 f255_add(c, x3, z3); 1368 f255_sub(d, x3, z3); 1369 f255_mul(da, d, a); 1370 f255_mul(cb, c, b); 1371 1372 /* obsolete 1373 print_int("a ", a); 1374 print_int("aa", aa); 1375 print_int("b ", b); 1376 print_int("bb", bb); 1377 print_int("e ", e); 1378 print_int("c ", c); 1379 print_int("d ", d); 1380 print_int("da", da); 1381 print_int("cb", cb); 1382 */ 1383 1384 f255_add(x3, da, cb); 1385 f255_square(x3, x3); 1386 f255_sub(z3, da, cb); 1387 f255_square(z3, z3); 1388 f255_mul(z3, z3, x1); 1389 f255_mul(x2, aa, bb); 1390 f255_mul_a24(z2, e); 1391 f255_add(z2, z2, aa); 1392 f255_mul(z2, e, z2); 1393 1394 /* obsolete 1395 print_int("x2", x2); 1396 print_int("z2", z2); 1397 print_int("x3", x3); 1398 print_int("z3", z3); 1399 */ 1400 } 1401 cswap(x2, x3, swap); 1402 cswap(z2, z3, swap); 1403 1404 /* 1405 * Inverse z2 with a modular exponentiation. This is a simple 1406 * square-and-multiply algorithm; we mutualise most non-squarings 1407 * since the exponent contains almost only ones. 1408 */ 1409 memcpy(a, z2, sizeof z2); 1410 for (i = 0; i < 15; i ++) { 1411 f255_square(a, a); 1412 f255_mul(a, a, z2); 1413 } 1414 memcpy(b, a, sizeof a); 1415 for (i = 0; i < 14; i ++) { 1416 int j; 1417 1418 for (j = 0; j < 16; j ++) { 1419 f255_square(b, b); 1420 } 1421 f255_mul(b, b, a); 1422 } 1423 for (i = 14; i >= 0; i --) { 1424 f255_square(b, b); 1425 if ((0xFFEB >> i) & 1) { 1426 f255_mul(b, z2, b); 1427 } 1428 } 1429 f255_mul(x2, x2, b); 1430 reduce_final_f255(x2); 1431 le13_to_le8(G, 32, x2); 1432 return 1; 1433 } 1434 1435 static size_t 1436 api_mulgen(unsigned char *R, 1437 const unsigned char *x, size_t xlen, int curve) 1438 { 1439 const unsigned char *G; 1440 size_t Glen; 1441 1442 G = api_generator(curve, &Glen); 1443 memcpy(R, G, Glen); 1444 api_mul(R, Glen, x, xlen, curve); 1445 return Glen; 1446 } 1447 1448 static uint32_t 1449 api_muladd(unsigned char *A, const unsigned char *B, size_t len, 1450 const unsigned char *x, size_t xlen, 1451 const unsigned char *y, size_t ylen, int curve) 1452 { 1453 /* 1454 * We don't implement this method, since it is used for ECDSA 1455 * only, and there is no ECDSA over Curve25519 (which instead 1456 * uses EdDSA). 1457 */ 1458 (void)A; 1459 (void)B; 1460 (void)len; 1461 (void)x; 1462 (void)xlen; 1463 (void)y; 1464 (void)ylen; 1465 (void)curve; 1466 return 0; 1467 } 1468 1469 /* see bearssl_ec.h */ 1470 const br_ec_impl br_ec_c25519_m15 = { 1471 (uint32_t)0x20000000, 1472 &api_generator, 1473 &api_order, 1474 &api_xoff, 1475 &api_mul, 1476 &api_mulgen, 1477 &api_muladd 1478 }; 1479