1 /* 2 * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org> 3 * 4 * Permission is hereby granted, free of charge, to any person obtaining 5 * a copy of this software and associated documentation files (the 6 * "Software"), to deal in the Software without restriction, including 7 * without limitation the rights to use, copy, modify, merge, publish, 8 * distribute, sublicense, and/or sell copies of the Software, and to 9 * permit persons to whom the Software is furnished to do so, subject to 10 * the following conditions: 11 * 12 * The above copyright notice and this permission notice shall be 13 * included in all copies or substantial portions of the Software. 14 * 15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 16 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 17 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 18 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 19 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 20 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 21 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22 * SOFTWARE. 23 */ 24 25 #include "inner.h" 26 27 /* 28 * In this file, we handle big integers with a custom format, i.e. 29 * without the usual one-word header. Value is split into 15-bit words, 30 * each stored in a 16-bit slot (top bit is zero) in little-endian 31 * order. The length (in words) is provided explicitly. In some cases, 32 * the value can be negative (using two's complement representation). In 33 * some cases, the top word is allowed to have a 16th bit. 34 */ 35 36 /* 37 * Negate big integer conditionally. The value consists of 'len' words, 38 * with 15 bits in each word (the top bit of each word should be 0, 39 * except possibly for the last word). If 'ctl' is 1, the negation is 40 * computed; otherwise, if 'ctl' is 0, then the value is unchanged. 41 */ 42 static void 43 cond_negate(uint16_t *a, size_t len, uint32_t ctl) 44 { 45 size_t k; 46 uint32_t cc, xm; 47 48 cc = ctl; 49 xm = 0x7FFF & -ctl; 50 for (k = 0; k < len; k ++) { 51 uint32_t aw; 52 53 aw = a[k]; 54 aw = (aw ^ xm) + cc; 55 a[k] = aw & 0x7FFF; 56 cc = (aw >> 15) & 1; 57 } 58 } 59 60 /* 61 * Finish modular reduction. Rules on input parameters: 62 * 63 * if neg = 1, then -m <= a < 0 64 * if neg = 0, then 0 <= a < 2*m 65 * 66 * If neg = 0, then the top word of a[] may use 16 bits. 67 * 68 * Also, modulus m must be odd. 69 */ 70 static void 71 finish_mod(uint16_t *a, size_t len, const uint16_t *m, uint32_t neg) 72 { 73 size_t k; 74 uint32_t cc, xm, ym; 75 76 /* 77 * First pass: compare a (assumed nonnegative) with m. 78 */ 79 cc = 0; 80 for (k = 0; k < len; k ++) { 81 uint32_t aw, mw; 82 83 aw = a[k]; 84 mw = m[k]; 85 cc = (aw - mw - cc) >> 31; 86 } 87 88 /* 89 * At this point: 90 * if neg = 1, then we must add m (regardless of cc) 91 * if neg = 0 and cc = 0, then we must subtract m 92 * if neg = 0 and cc = 1, then we must do nothing 93 */ 94 xm = 0x7FFF & -neg; 95 ym = -(neg | (1 - cc)); 96 cc = neg; 97 for (k = 0; k < len; k ++) { 98 uint32_t aw, mw; 99 100 aw = a[k]; 101 mw = (m[k] ^ xm) & ym; 102 aw = aw - mw - cc; 103 a[k] = aw & 0x7FFF; 104 cc = aw >> 31; 105 } 106 } 107 108 /* 109 * Compute: 110 * a <- (a*pa+b*pb)/(2^15) 111 * b <- (a*qa+b*qb)/(2^15) 112 * The division is assumed to be exact (i.e. the low word is dropped). 113 * If the final a is negative, then it is negated. Similarly for b. 114 * Returned value is the combination of two bits: 115 * bit 0: 1 if a had to be negated, 0 otherwise 116 * bit 1: 1 if b had to be negated, 0 otherwise 117 * 118 * Factors pa, pb, qa and qb must be at most 2^15 in absolute value. 119 * Source integers a and b must be nonnegative; top word is not allowed 120 * to contain an extra 16th bit. 121 */ 122 static uint32_t 123 co_reduce(uint16_t *a, uint16_t *b, size_t len, 124 int32_t pa, int32_t pb, int32_t qa, int32_t qb) 125 { 126 size_t k; 127 int32_t cca, ccb; 128 uint32_t nega, negb; 129 130 cca = 0; 131 ccb = 0; 132 for (k = 0; k < len; k ++) { 133 uint32_t wa, wb, za, zb; 134 uint16_t tta, ttb; 135 136 /* 137 * Since: 138 * |pa| <= 2^15 139 * |pb| <= 2^15 140 * 0 <= wa <= 2^15 - 1 141 * 0 <= wb <= 2^15 - 1 142 * |cca| <= 2^16 - 1 143 * Then: 144 * |za| <= (2^15-1)*(2^16) + (2^16-1) = 2^31 - 1 145 * 146 * Thus, the new value of cca is such that |cca| <= 2^16 - 1. 147 * The same applies to ccb. 148 */ 149 wa = a[k]; 150 wb = b[k]; 151 za = wa * (uint32_t)pa + wb * (uint32_t)pb + (uint32_t)cca; 152 zb = wa * (uint32_t)qa + wb * (uint32_t)qb + (uint32_t)ccb; 153 if (k > 0) { 154 a[k - 1] = za & 0x7FFF; 155 b[k - 1] = zb & 0x7FFF; 156 } 157 tta = za >> 15; 158 ttb = zb >> 15; 159 cca = *(int16_t *)&tta; 160 ccb = *(int16_t *)&ttb; 161 } 162 a[len - 1] = (uint16_t)cca; 163 b[len - 1] = (uint16_t)ccb; 164 nega = (uint32_t)cca >> 31; 165 negb = (uint32_t)ccb >> 31; 166 cond_negate(a, len, nega); 167 cond_negate(b, len, negb); 168 return nega | (negb << 1); 169 } 170 171 /* 172 * Compute: 173 * a <- (a*pa+b*pb)/(2^15) mod m 174 * b <- (a*qa+b*qb)/(2^15) mod m 175 * 176 * m0i is equal to -1/m[0] mod 2^15. 177 * 178 * Factors pa, pb, qa and qb must be at most 2^15 in absolute value. 179 * Source integers a and b must be nonnegative; top word is not allowed 180 * to contain an extra 16th bit. 181 */ 182 static void 183 co_reduce_mod(uint16_t *a, uint16_t *b, size_t len, 184 int32_t pa, int32_t pb, int32_t qa, int32_t qb, 185 const uint16_t *m, uint16_t m0i) 186 { 187 size_t k; 188 int32_t cca, ccb, fa, fb; 189 190 cca = 0; 191 ccb = 0; 192 fa = ((a[0] * (uint32_t)pa + b[0] * (uint32_t)pb) * m0i) & 0x7FFF; 193 fb = ((a[0] * (uint32_t)qa + b[0] * (uint32_t)qb) * m0i) & 0x7FFF; 194 for (k = 0; k < len; k ++) { 195 uint32_t wa, wb, za, zb; 196 uint32_t tta, ttb; 197 198 /* 199 * In this loop, carries 'cca' and 'ccb' always fit on 200 * 17 bits (in absolute value). 201 */ 202 wa = a[k]; 203 wb = b[k]; 204 za = wa * (uint32_t)pa + wb * (uint32_t)pb 205 + m[k] * (uint32_t)fa + (uint32_t)cca; 206 zb = wa * (uint32_t)qa + wb * (uint32_t)qb 207 + m[k] * (uint32_t)fb + (uint32_t)ccb; 208 if (k > 0) { 209 a[k - 1] = za & 0x7FFF; 210 b[k - 1] = zb & 0x7FFF; 211 } 212 213 /* 214 * The XOR-and-sub construction below does an arithmetic 215 * right shift in a portable way (technically, right-shifting 216 * a negative signed value is implementation-defined in C). 217 */ 218 #define M ((uint32_t)1 << 16) 219 tta = za >> 15; 220 ttb = zb >> 15; 221 tta = (tta ^ M) - M; 222 ttb = (ttb ^ M) - M; 223 cca = *(int32_t *)&tta; 224 ccb = *(int32_t *)&ttb; 225 #undef M 226 } 227 a[len - 1] = (uint32_t)cca; 228 b[len - 1] = (uint32_t)ccb; 229 230 /* 231 * At this point: 232 * -m <= a < 2*m 233 * -m <= b < 2*m 234 * (this is a case of Montgomery reduction) 235 * The top word of 'a' and 'b' may have a 16-th bit set. 236 * We may have to add or subtract the modulus. 237 */ 238 finish_mod(a, len, m, (uint32_t)cca >> 31); 239 finish_mod(b, len, m, (uint32_t)ccb >> 31); 240 } 241 242 /* see inner.h */ 243 uint32_t 244 br_i15_moddiv(uint16_t *x, const uint16_t *y, const uint16_t *m, uint16_t m0i, 245 uint16_t *t) 246 { 247 /* 248 * Algorithm is an extended binary GCD. We maintain four values 249 * a, b, u and v, with the following invariants: 250 * 251 * a * x = y * u mod m 252 * b * x = y * v mod m 253 * 254 * Starting values are: 255 * 256 * a = y 257 * b = m 258 * u = x 259 * v = 0 260 * 261 * The formal definition of the algorithm is a sequence of steps: 262 * 263 * - If a is even, then a <- a/2 and u <- u/2 mod m. 264 * - Otherwise, if b is even, then b <- b/2 and v <- v/2 mod m. 265 * - Otherwise, if a > b, then a <- (a-b)/2 and u <- (u-v)/2 mod m. 266 * - Otherwise, b <- (b-a)/2 and v <- (v-u)/2 mod m. 267 * 268 * Algorithm stops when a = b. At that point, they both are equal 269 * to GCD(y,m); the modular division succeeds if that value is 1. 270 * The result of the modular division is then u (or v: both are 271 * equal at that point). 272 * 273 * Each step makes either a or b shrink by at least one bit; hence, 274 * if m has bit length k bits, then 2k-2 steps are sufficient. 275 * 276 * 277 * Though complexity is quadratic in the size of m, the bit-by-bit 278 * processing is not very efficient. We can speed up processing by 279 * remarking that the decisions are taken based only on observation 280 * of the top and low bits of a and b. 281 * 282 * In the loop below, at each iteration, we use the two top words 283 * of a and b, and the low words of a and b, to compute reduction 284 * parameters pa, pb, qa and qb such that the new values for a 285 * and b are: 286 * 287 * a' = (a*pa + b*pb) / (2^15) 288 * b' = (a*qa + b*qb) / (2^15) 289 * 290 * the division being exact. 291 * 292 * Since the choices are based on the top words, they may be slightly 293 * off, requiring an optional correction: if a' < 0, then we replace 294 * pa with -pa, and pb with -pb. The total length of a and b is 295 * thus reduced by at least 14 bits at each iteration. 296 * 297 * The stopping conditions are still the same, though: when a 298 * and b become equal, they must be both odd (since m is odd, 299 * the GCD cannot be even), therefore the next operation is a 300 * subtraction, and one of the values becomes 0. At that point, 301 * nothing else happens, i.e. one value is stuck at 0, and the 302 * other one is the GCD. 303 */ 304 size_t len, k; 305 uint16_t *a, *b, *u, *v; 306 uint32_t num, r; 307 308 len = (m[0] + 15) >> 4; 309 a = t; 310 b = a + len; 311 u = x + 1; 312 v = b + len; 313 memcpy(a, y + 1, len * sizeof *y); 314 memcpy(b, m + 1, len * sizeof *m); 315 memset(v, 0, len * sizeof *v); 316 317 /* 318 * Loop below ensures that a and b are reduced by some bits each, 319 * for a total of at least 14 bits. 320 */ 321 for (num = ((m[0] - (m[0] >> 4)) << 1) + 14; num >= 14; num -= 14) { 322 size_t j; 323 uint32_t c0, c1; 324 uint32_t a0, a1, b0, b1; 325 uint32_t a_hi, b_hi, a_lo, b_lo; 326 int32_t pa, pb, qa, qb; 327 int i; 328 329 /* 330 * Extract top words of a and b. If j is the highest 331 * index >= 1 such that a[j] != 0 or b[j] != 0, then we want 332 * (a[j] << 15) + a[j - 1], and (b[j] << 15) + b[j - 1]. 333 * If a and b are down to one word each, then we use a[0] 334 * and b[0]. 335 */ 336 c0 = (uint32_t)-1; 337 c1 = (uint32_t)-1; 338 a0 = 0; 339 a1 = 0; 340 b0 = 0; 341 b1 = 0; 342 j = len; 343 while (j -- > 0) { 344 uint32_t aw, bw; 345 346 aw = a[j]; 347 bw = b[j]; 348 a0 ^= (a0 ^ aw) & c0; 349 a1 ^= (a1 ^ aw) & c1; 350 b0 ^= (b0 ^ bw) & c0; 351 b1 ^= (b1 ^ bw) & c1; 352 c1 = c0; 353 c0 &= (((aw | bw) + 0xFFFF) >> 16) - (uint32_t)1; 354 } 355 356 /* 357 * If c1 = 0, then we grabbed two words for a and b. 358 * If c1 != 0 but c0 = 0, then we grabbed one word. It 359 * is not possible that c1 != 0 and c0 != 0, because that 360 * would mean that both integers are zero. 361 */ 362 a1 |= a0 & c1; 363 a0 &= ~c1; 364 b1 |= b0 & c1; 365 b0 &= ~c1; 366 a_hi = (a0 << 15) + a1; 367 b_hi = (b0 << 15) + b1; 368 a_lo = a[0]; 369 b_lo = b[0]; 370 371 /* 372 * Compute reduction factors: 373 * 374 * a' = a*pa + b*pb 375 * b' = a*qa + b*qb 376 * 377 * such that a' and b' are both multiple of 2^15, but are 378 * only marginally larger than a and b. 379 */ 380 pa = 1; 381 pb = 0; 382 qa = 0; 383 qb = 1; 384 for (i = 0; i < 15; i ++) { 385 /* 386 * At each iteration: 387 * 388 * a <- (a-b)/2 if: a is odd, b is odd, a_hi > b_hi 389 * b <- (b-a)/2 if: a is odd, b is odd, a_hi <= b_hi 390 * a <- a/2 if: a is even 391 * b <- b/2 if: a is odd, b is even 392 * 393 * We multiply a_lo and b_lo by 2 at each 394 * iteration, thus a division by 2 really is a 395 * non-multiplication by 2. 396 */ 397 uint32_t r, oa, ob, cAB, cBA, cA; 398 399 /* 400 * cAB = 1 if b must be subtracted from a 401 * cBA = 1 if a must be subtracted from b 402 * cA = 1 if a is divided by 2, 0 otherwise 403 * 404 * Rules: 405 * 406 * cAB and cBA cannot be both 1. 407 * if a is not divided by 2, b is. 408 */ 409 r = GT(a_hi, b_hi); 410 oa = (a_lo >> i) & 1; 411 ob = (b_lo >> i) & 1; 412 cAB = oa & ob & r; 413 cBA = oa & ob & NOT(r); 414 cA = cAB | NOT(oa); 415 416 /* 417 * Conditional subtractions. 418 */ 419 a_lo -= b_lo & -cAB; 420 a_hi -= b_hi & -cAB; 421 pa -= qa & -(int32_t)cAB; 422 pb -= qb & -(int32_t)cAB; 423 b_lo -= a_lo & -cBA; 424 b_hi -= a_hi & -cBA; 425 qa -= pa & -(int32_t)cBA; 426 qb -= pb & -(int32_t)cBA; 427 428 /* 429 * Shifting. 430 */ 431 a_lo += a_lo & (cA - 1); 432 pa += pa & ((int32_t)cA - 1); 433 pb += pb & ((int32_t)cA - 1); 434 a_hi ^= (a_hi ^ (a_hi >> 1)) & -cA; 435 b_lo += b_lo & -cA; 436 qa += qa & -(int32_t)cA; 437 qb += qb & -(int32_t)cA; 438 b_hi ^= (b_hi ^ (b_hi >> 1)) & (cA - 1); 439 } 440 441 /* 442 * Replace a and b with new values a' and b'. 443 */ 444 r = co_reduce(a, b, len, pa, pb, qa, qb); 445 pa -= pa * ((r & 1) << 1); 446 pb -= pb * ((r & 1) << 1); 447 qa -= qa * (r & 2); 448 qb -= qb * (r & 2); 449 co_reduce_mod(u, v, len, pa, pb, qa, qb, m + 1, m0i); 450 } 451 452 /* 453 * Now one of the arrays should be 0, and the other contains 454 * the GCD. If a is 0, then u is 0 as well, and v contains 455 * the division result. 456 * Result is correct if and only if GCD is 1. 457 */ 458 r = (a[0] | b[0]) ^ 1; 459 u[0] |= v[0]; 460 for (k = 1; k < len; k ++) { 461 r |= a[k] | b[k]; 462 u[k] |= v[k]; 463 } 464 return EQ0(r); 465 } 466