1*0957b409SSimon J. Gerraty /* 2*0957b409SSimon J. Gerraty * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org> 3*0957b409SSimon J. Gerraty * 4*0957b409SSimon J. Gerraty * Permission is hereby granted, free of charge, to any person obtaining 5*0957b409SSimon J. Gerraty * a copy of this software and associated documentation files (the 6*0957b409SSimon J. Gerraty * "Software"), to deal in the Software without restriction, including 7*0957b409SSimon J. Gerraty * without limitation the rights to use, copy, modify, merge, publish, 8*0957b409SSimon J. Gerraty * distribute, sublicense, and/or sell copies of the Software, and to 9*0957b409SSimon J. Gerraty * permit persons to whom the Software is furnished to do so, subject to 10*0957b409SSimon J. Gerraty * the following conditions: 11*0957b409SSimon J. Gerraty * 12*0957b409SSimon J. Gerraty * The above copyright notice and this permission notice shall be 13*0957b409SSimon J. Gerraty * included in all copies or substantial portions of the Software. 14*0957b409SSimon J. Gerraty * 15*0957b409SSimon J. Gerraty * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, 16*0957b409SSimon J. Gerraty * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 17*0957b409SSimon J. Gerraty * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND 18*0957b409SSimon J. Gerraty * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS 19*0957b409SSimon J. Gerraty * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN 20*0957b409SSimon J. Gerraty * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN 21*0957b409SSimon J. Gerraty * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 22*0957b409SSimon J. Gerraty * SOFTWARE. 23*0957b409SSimon J. Gerraty */ 24*0957b409SSimon J. Gerraty 25*0957b409SSimon J. Gerraty #include "inner.h" 26*0957b409SSimon J. Gerraty 27*0957b409SSimon J. Gerraty /* 28*0957b409SSimon J. Gerraty * In this file, we handle big integers with a custom format, i.e. 29*0957b409SSimon J. Gerraty * without the usual one-word header. Value is split into 31-bit words, 30*0957b409SSimon J. Gerraty * each stored in a 32-bit slot (top bit is zero) in little-endian 31*0957b409SSimon J. Gerraty * order. The length (in words) is provided explicitly. In some cases, 32*0957b409SSimon J. Gerraty * the value can be negative (using two's complement representation). In 33*0957b409SSimon J. Gerraty * some cases, the top word is allowed to have a 32th bit. 34*0957b409SSimon J. Gerraty */ 35*0957b409SSimon J. Gerraty 36*0957b409SSimon J. Gerraty /* 37*0957b409SSimon J. Gerraty * Negate big integer conditionally. The value consists of 'len' words, 38*0957b409SSimon J. Gerraty * with 31 bits in each word (the top bit of each word should be 0, 39*0957b409SSimon J. Gerraty * except possibly for the last word). If 'ctl' is 1, the negation is 40*0957b409SSimon J. Gerraty * computed; otherwise, if 'ctl' is 0, then the value is unchanged. 41*0957b409SSimon J. Gerraty */ 42*0957b409SSimon J. Gerraty static void 43*0957b409SSimon J. Gerraty cond_negate(uint32_t *a, size_t len, uint32_t ctl) 44*0957b409SSimon J. Gerraty { 45*0957b409SSimon J. Gerraty size_t k; 46*0957b409SSimon J. Gerraty uint32_t cc, xm; 47*0957b409SSimon J. Gerraty 48*0957b409SSimon J. Gerraty cc = ctl; 49*0957b409SSimon J. Gerraty xm = -ctl >> 1; 50*0957b409SSimon J. Gerraty for (k = 0; k < len; k ++) { 51*0957b409SSimon J. Gerraty uint32_t aw; 52*0957b409SSimon J. Gerraty 53*0957b409SSimon J. Gerraty aw = a[k]; 54*0957b409SSimon J. Gerraty aw = (aw ^ xm) + cc; 55*0957b409SSimon J. Gerraty a[k] = aw & 0x7FFFFFFF; 56*0957b409SSimon J. Gerraty cc = aw >> 31; 57*0957b409SSimon J. Gerraty } 58*0957b409SSimon J. Gerraty } 59*0957b409SSimon J. Gerraty 60*0957b409SSimon J. Gerraty /* 61*0957b409SSimon J. Gerraty * Finish modular reduction. Rules on input parameters: 62*0957b409SSimon J. Gerraty * 63*0957b409SSimon J. Gerraty * if neg = 1, then -m <= a < 0 64*0957b409SSimon J. Gerraty * if neg = 0, then 0 <= a < 2*m 65*0957b409SSimon J. Gerraty * 66*0957b409SSimon J. Gerraty * If neg = 0, then the top word of a[] may use 32 bits. 67*0957b409SSimon J. Gerraty * 68*0957b409SSimon J. Gerraty * Also, modulus m must be odd. 69*0957b409SSimon J. Gerraty */ 70*0957b409SSimon J. Gerraty static void 71*0957b409SSimon J. Gerraty finish_mod(uint32_t *a, size_t len, const uint32_t *m, uint32_t neg) 72*0957b409SSimon J. Gerraty { 73*0957b409SSimon J. Gerraty size_t k; 74*0957b409SSimon J. Gerraty uint32_t cc, xm, ym; 75*0957b409SSimon J. Gerraty 76*0957b409SSimon J. Gerraty /* 77*0957b409SSimon J. Gerraty * First pass: compare a (assumed nonnegative) with m. 78*0957b409SSimon J. Gerraty * Note that if the final word uses the top extra bit, then 79*0957b409SSimon J. Gerraty * subtracting m must yield a value less than 2^31, since we 80*0957b409SSimon J. Gerraty * assumed that a < 2*m. 81*0957b409SSimon J. Gerraty */ 82*0957b409SSimon J. Gerraty cc = 0; 83*0957b409SSimon J. Gerraty for (k = 0; k < len; k ++) { 84*0957b409SSimon J. Gerraty uint32_t aw, mw; 85*0957b409SSimon J. Gerraty 86*0957b409SSimon J. Gerraty aw = a[k]; 87*0957b409SSimon J. Gerraty mw = m[k]; 88*0957b409SSimon J. Gerraty cc = (aw - mw - cc) >> 31; 89*0957b409SSimon J. Gerraty } 90*0957b409SSimon J. Gerraty 91*0957b409SSimon J. Gerraty /* 92*0957b409SSimon J. Gerraty * At this point: 93*0957b409SSimon J. Gerraty * if neg = 1, then we must add m (regardless of cc) 94*0957b409SSimon J. Gerraty * if neg = 0 and cc = 0, then we must subtract m 95*0957b409SSimon J. Gerraty * if neg = 0 and cc = 1, then we must do nothing 96*0957b409SSimon J. Gerraty */ 97*0957b409SSimon J. Gerraty xm = -neg >> 1; 98*0957b409SSimon J. Gerraty ym = -(neg | (1 - cc)); 99*0957b409SSimon J. Gerraty cc = neg; 100*0957b409SSimon J. Gerraty for (k = 0; k < len; k ++) { 101*0957b409SSimon J. Gerraty uint32_t aw, mw; 102*0957b409SSimon J. Gerraty 103*0957b409SSimon J. Gerraty aw = a[k]; 104*0957b409SSimon J. Gerraty mw = (m[k] ^ xm) & ym; 105*0957b409SSimon J. Gerraty aw = aw - mw - cc; 106*0957b409SSimon J. Gerraty a[k] = aw & 0x7FFFFFFF; 107*0957b409SSimon J. Gerraty cc = aw >> 31; 108*0957b409SSimon J. Gerraty } 109*0957b409SSimon J. Gerraty } 110*0957b409SSimon J. Gerraty 111*0957b409SSimon J. Gerraty /* 112*0957b409SSimon J. Gerraty * Compute: 113*0957b409SSimon J. Gerraty * a <- (a*pa+b*pb)/(2^31) 114*0957b409SSimon J. Gerraty * b <- (a*qa+b*qb)/(2^31) 115*0957b409SSimon J. Gerraty * The division is assumed to be exact (i.e. the low word is dropped). 116*0957b409SSimon J. Gerraty * If the final a is negative, then it is negated. Similarly for b. 117*0957b409SSimon J. Gerraty * Returned value is the combination of two bits: 118*0957b409SSimon J. Gerraty * bit 0: 1 if a had to be negated, 0 otherwise 119*0957b409SSimon J. Gerraty * bit 1: 1 if b had to be negated, 0 otherwise 120*0957b409SSimon J. Gerraty * 121*0957b409SSimon J. Gerraty * Factors pa, pb, qa and qb must be at most 2^31 in absolute value. 122*0957b409SSimon J. Gerraty * Source integers a and b must be nonnegative; top word is not allowed 123*0957b409SSimon J. Gerraty * to contain an extra 32th bit. 124*0957b409SSimon J. Gerraty */ 125*0957b409SSimon J. Gerraty static uint32_t 126*0957b409SSimon J. Gerraty co_reduce(uint32_t *a, uint32_t *b, size_t len, 127*0957b409SSimon J. Gerraty int64_t pa, int64_t pb, int64_t qa, int64_t qb) 128*0957b409SSimon J. Gerraty { 129*0957b409SSimon J. Gerraty size_t k; 130*0957b409SSimon J. Gerraty int64_t cca, ccb; 131*0957b409SSimon J. Gerraty uint32_t nega, negb; 132*0957b409SSimon J. Gerraty 133*0957b409SSimon J. Gerraty cca = 0; 134*0957b409SSimon J. Gerraty ccb = 0; 135*0957b409SSimon J. Gerraty for (k = 0; k < len; k ++) { 136*0957b409SSimon J. Gerraty uint32_t wa, wb; 137*0957b409SSimon J. Gerraty uint64_t za, zb; 138*0957b409SSimon J. Gerraty uint64_t tta, ttb; 139*0957b409SSimon J. Gerraty 140*0957b409SSimon J. Gerraty /* 141*0957b409SSimon J. Gerraty * Since: 142*0957b409SSimon J. Gerraty * |pa| <= 2^31 143*0957b409SSimon J. Gerraty * |pb| <= 2^31 144*0957b409SSimon J. Gerraty * 0 <= wa <= 2^31 - 1 145*0957b409SSimon J. Gerraty * 0 <= wb <= 2^31 - 1 146*0957b409SSimon J. Gerraty * |cca| <= 2^32 - 1 147*0957b409SSimon J. Gerraty * Then: 148*0957b409SSimon J. Gerraty * |za| <= (2^31-1)*(2^32) + (2^32-1) = 2^63 - 1 149*0957b409SSimon J. Gerraty * 150*0957b409SSimon J. Gerraty * Thus, the new value of cca is such that |cca| <= 2^32 - 1. 151*0957b409SSimon J. Gerraty * The same applies to ccb. 152*0957b409SSimon J. Gerraty */ 153*0957b409SSimon J. Gerraty wa = a[k]; 154*0957b409SSimon J. Gerraty wb = b[k]; 155*0957b409SSimon J. Gerraty za = wa * (uint64_t)pa + wb * (uint64_t)pb + (uint64_t)cca; 156*0957b409SSimon J. Gerraty zb = wa * (uint64_t)qa + wb * (uint64_t)qb + (uint64_t)ccb; 157*0957b409SSimon J. Gerraty if (k > 0) { 158*0957b409SSimon J. Gerraty a[k - 1] = za & 0x7FFFFFFF; 159*0957b409SSimon J. Gerraty b[k - 1] = zb & 0x7FFFFFFF; 160*0957b409SSimon J. Gerraty } 161*0957b409SSimon J. Gerraty 162*0957b409SSimon J. Gerraty /* 163*0957b409SSimon J. Gerraty * For the new values of cca and ccb, we need a signed 164*0957b409SSimon J. Gerraty * right-shift; since, in C, right-shifting a signed 165*0957b409SSimon J. Gerraty * negative value is implementation-defined, we use a 166*0957b409SSimon J. Gerraty * custom portable sign extension expression. 167*0957b409SSimon J. Gerraty */ 168*0957b409SSimon J. Gerraty #define M ((uint64_t)1 << 32) 169*0957b409SSimon J. Gerraty tta = za >> 31; 170*0957b409SSimon J. Gerraty ttb = zb >> 31; 171*0957b409SSimon J. Gerraty tta = (tta ^ M) - M; 172*0957b409SSimon J. Gerraty ttb = (ttb ^ M) - M; 173*0957b409SSimon J. Gerraty cca = *(int64_t *)&tta; 174*0957b409SSimon J. Gerraty ccb = *(int64_t *)&ttb; 175*0957b409SSimon J. Gerraty #undef M 176*0957b409SSimon J. Gerraty } 177*0957b409SSimon J. Gerraty a[len - 1] = (uint32_t)cca; 178*0957b409SSimon J. Gerraty b[len - 1] = (uint32_t)ccb; 179*0957b409SSimon J. Gerraty 180*0957b409SSimon J. Gerraty nega = (uint32_t)((uint64_t)cca >> 63); 181*0957b409SSimon J. Gerraty negb = (uint32_t)((uint64_t)ccb >> 63); 182*0957b409SSimon J. Gerraty cond_negate(a, len, nega); 183*0957b409SSimon J. Gerraty cond_negate(b, len, negb); 184*0957b409SSimon J. Gerraty return nega | (negb << 1); 185*0957b409SSimon J. Gerraty } 186*0957b409SSimon J. Gerraty 187*0957b409SSimon J. Gerraty /* 188*0957b409SSimon J. Gerraty * Compute: 189*0957b409SSimon J. Gerraty * a <- (a*pa+b*pb)/(2^31) mod m 190*0957b409SSimon J. Gerraty * b <- (a*qa+b*qb)/(2^31) mod m 191*0957b409SSimon J. Gerraty * 192*0957b409SSimon J. Gerraty * m0i is equal to -1/m[0] mod 2^31. 193*0957b409SSimon J. Gerraty * 194*0957b409SSimon J. Gerraty * Factors pa, pb, qa and qb must be at most 2^31 in absolute value. 195*0957b409SSimon J. Gerraty * Source integers a and b must be nonnegative; top word is not allowed 196*0957b409SSimon J. Gerraty * to contain an extra 32th bit. 197*0957b409SSimon J. Gerraty */ 198*0957b409SSimon J. Gerraty static void 199*0957b409SSimon J. Gerraty co_reduce_mod(uint32_t *a, uint32_t *b, size_t len, 200*0957b409SSimon J. Gerraty int64_t pa, int64_t pb, int64_t qa, int64_t qb, 201*0957b409SSimon J. Gerraty const uint32_t *m, uint32_t m0i) 202*0957b409SSimon J. Gerraty { 203*0957b409SSimon J. Gerraty size_t k; 204*0957b409SSimon J. Gerraty int64_t cca, ccb; 205*0957b409SSimon J. Gerraty uint32_t fa, fb; 206*0957b409SSimon J. Gerraty 207*0957b409SSimon J. Gerraty cca = 0; 208*0957b409SSimon J. Gerraty ccb = 0; 209*0957b409SSimon J. Gerraty fa = ((a[0] * (uint32_t)pa + b[0] * (uint32_t)pb) * m0i) & 0x7FFFFFFF; 210*0957b409SSimon J. Gerraty fb = ((a[0] * (uint32_t)qa + b[0] * (uint32_t)qb) * m0i) & 0x7FFFFFFF; 211*0957b409SSimon J. Gerraty for (k = 0; k < len; k ++) { 212*0957b409SSimon J. Gerraty uint32_t wa, wb; 213*0957b409SSimon J. Gerraty uint64_t za, zb; 214*0957b409SSimon J. Gerraty uint64_t tta, ttb; 215*0957b409SSimon J. Gerraty 216*0957b409SSimon J. Gerraty /* 217*0957b409SSimon J. Gerraty * In this loop, carries 'cca' and 'ccb' always fit on 218*0957b409SSimon J. Gerraty * 33 bits (in absolute value). 219*0957b409SSimon J. Gerraty */ 220*0957b409SSimon J. Gerraty wa = a[k]; 221*0957b409SSimon J. Gerraty wb = b[k]; 222*0957b409SSimon J. Gerraty za = wa * (uint64_t)pa + wb * (uint64_t)pb 223*0957b409SSimon J. Gerraty + m[k] * (uint64_t)fa + (uint64_t)cca; 224*0957b409SSimon J. Gerraty zb = wa * (uint64_t)qa + wb * (uint64_t)qb 225*0957b409SSimon J. Gerraty + m[k] * (uint64_t)fb + (uint64_t)ccb; 226*0957b409SSimon J. Gerraty if (k > 0) { 227*0957b409SSimon J. Gerraty a[k - 1] = (uint32_t)za & 0x7FFFFFFF; 228*0957b409SSimon J. Gerraty b[k - 1] = (uint32_t)zb & 0x7FFFFFFF; 229*0957b409SSimon J. Gerraty } 230*0957b409SSimon J. Gerraty 231*0957b409SSimon J. Gerraty #define M ((uint64_t)1 << 32) 232*0957b409SSimon J. Gerraty tta = za >> 31; 233*0957b409SSimon J. Gerraty ttb = zb >> 31; 234*0957b409SSimon J. Gerraty tta = (tta ^ M) - M; 235*0957b409SSimon J. Gerraty ttb = (ttb ^ M) - M; 236*0957b409SSimon J. Gerraty cca = *(int64_t *)&tta; 237*0957b409SSimon J. Gerraty ccb = *(int64_t *)&ttb; 238*0957b409SSimon J. Gerraty #undef M 239*0957b409SSimon J. Gerraty } 240*0957b409SSimon J. Gerraty a[len - 1] = (uint32_t)cca; 241*0957b409SSimon J. Gerraty b[len - 1] = (uint32_t)ccb; 242*0957b409SSimon J. Gerraty 243*0957b409SSimon J. Gerraty /* 244*0957b409SSimon J. Gerraty * At this point: 245*0957b409SSimon J. Gerraty * -m <= a < 2*m 246*0957b409SSimon J. Gerraty * -m <= b < 2*m 247*0957b409SSimon J. Gerraty * (this is a case of Montgomery reduction) 248*0957b409SSimon J. Gerraty * The top word of 'a' and 'b' may have a 32-th bit set. 249*0957b409SSimon J. Gerraty * We may have to add or subtract the modulus. 250*0957b409SSimon J. Gerraty */ 251*0957b409SSimon J. Gerraty finish_mod(a, len, m, (uint32_t)((uint64_t)cca >> 63)); 252*0957b409SSimon J. Gerraty finish_mod(b, len, m, (uint32_t)((uint64_t)ccb >> 63)); 253*0957b409SSimon J. Gerraty } 254*0957b409SSimon J. Gerraty 255*0957b409SSimon J. Gerraty /* see inner.h */ 256*0957b409SSimon J. Gerraty uint32_t 257*0957b409SSimon J. Gerraty br_i31_moddiv(uint32_t *x, const uint32_t *y, const uint32_t *m, uint32_t m0i, 258*0957b409SSimon J. Gerraty uint32_t *t) 259*0957b409SSimon J. Gerraty { 260*0957b409SSimon J. Gerraty /* 261*0957b409SSimon J. Gerraty * Algorithm is an extended binary GCD. We maintain four values 262*0957b409SSimon J. Gerraty * a, b, u and v, with the following invariants: 263*0957b409SSimon J. Gerraty * 264*0957b409SSimon J. Gerraty * a * x = y * u mod m 265*0957b409SSimon J. Gerraty * b * x = y * v mod m 266*0957b409SSimon J. Gerraty * 267*0957b409SSimon J. Gerraty * Starting values are: 268*0957b409SSimon J. Gerraty * 269*0957b409SSimon J. Gerraty * a = y 270*0957b409SSimon J. Gerraty * b = m 271*0957b409SSimon J. Gerraty * u = x 272*0957b409SSimon J. Gerraty * v = 0 273*0957b409SSimon J. Gerraty * 274*0957b409SSimon J. Gerraty * The formal definition of the algorithm is a sequence of steps: 275*0957b409SSimon J. Gerraty * 276*0957b409SSimon J. Gerraty * - If a is even, then a <- a/2 and u <- u/2 mod m. 277*0957b409SSimon J. Gerraty * - Otherwise, if b is even, then b <- b/2 and v <- v/2 mod m. 278*0957b409SSimon J. Gerraty * - Otherwise, if a > b, then a <- (a-b)/2 and u <- (u-v)/2 mod m. 279*0957b409SSimon J. Gerraty * - Otherwise, b <- (b-a)/2 and v <- (v-u)/2 mod m. 280*0957b409SSimon J. Gerraty * 281*0957b409SSimon J. Gerraty * Algorithm stops when a = b. At that point, they both are equal 282*0957b409SSimon J. Gerraty * to GCD(y,m); the modular division succeeds if that value is 1. 283*0957b409SSimon J. Gerraty * The result of the modular division is then u (or v: both are 284*0957b409SSimon J. Gerraty * equal at that point). 285*0957b409SSimon J. Gerraty * 286*0957b409SSimon J. Gerraty * Each step makes either a or b shrink by at least one bit; hence, 287*0957b409SSimon J. Gerraty * if m has bit length k bits, then 2k-2 steps are sufficient. 288*0957b409SSimon J. Gerraty * 289*0957b409SSimon J. Gerraty * 290*0957b409SSimon J. Gerraty * Though complexity is quadratic in the size of m, the bit-by-bit 291*0957b409SSimon J. Gerraty * processing is not very efficient. We can speed up processing by 292*0957b409SSimon J. Gerraty * remarking that the decisions are taken based only on observation 293*0957b409SSimon J. Gerraty * of the top and low bits of a and b. 294*0957b409SSimon J. Gerraty * 295*0957b409SSimon J. Gerraty * In the loop below, at each iteration, we use the two top words 296*0957b409SSimon J. Gerraty * of a and b, and the low words of a and b, to compute reduction 297*0957b409SSimon J. Gerraty * parameters pa, pb, qa and qb such that the new values for a 298*0957b409SSimon J. Gerraty * and b are: 299*0957b409SSimon J. Gerraty * 300*0957b409SSimon J. Gerraty * a' = (a*pa + b*pb) / (2^31) 301*0957b409SSimon J. Gerraty * b' = (a*qa + b*qb) / (2^31) 302*0957b409SSimon J. Gerraty * 303*0957b409SSimon J. Gerraty * the division being exact. 304*0957b409SSimon J. Gerraty * 305*0957b409SSimon J. Gerraty * Since the choices are based on the top words, they may be slightly 306*0957b409SSimon J. Gerraty * off, requiring an optional correction: if a' < 0, then we replace 307*0957b409SSimon J. Gerraty * pa with -pa, and pb with -pb. The total length of a and b is 308*0957b409SSimon J. Gerraty * thus reduced by at least 30 bits at each iteration. 309*0957b409SSimon J. Gerraty * 310*0957b409SSimon J. Gerraty * The stopping conditions are still the same, though: when a 311*0957b409SSimon J. Gerraty * and b become equal, they must be both odd (since m is odd, 312*0957b409SSimon J. Gerraty * the GCD cannot be even), therefore the next operation is a 313*0957b409SSimon J. Gerraty * subtraction, and one of the values becomes 0. At that point, 314*0957b409SSimon J. Gerraty * nothing else happens, i.e. one value is stuck at 0, and the 315*0957b409SSimon J. Gerraty * other one is the GCD. 316*0957b409SSimon J. Gerraty */ 317*0957b409SSimon J. Gerraty size_t len, k; 318*0957b409SSimon J. Gerraty uint32_t *a, *b, *u, *v; 319*0957b409SSimon J. Gerraty uint32_t num, r; 320*0957b409SSimon J. Gerraty 321*0957b409SSimon J. Gerraty len = (m[0] + 31) >> 5; 322*0957b409SSimon J. Gerraty a = t; 323*0957b409SSimon J. Gerraty b = a + len; 324*0957b409SSimon J. Gerraty u = x + 1; 325*0957b409SSimon J. Gerraty v = b + len; 326*0957b409SSimon J. Gerraty memcpy(a, y + 1, len * sizeof *y); 327*0957b409SSimon J. Gerraty memcpy(b, m + 1, len * sizeof *m); 328*0957b409SSimon J. Gerraty memset(v, 0, len * sizeof *v); 329*0957b409SSimon J. Gerraty 330*0957b409SSimon J. Gerraty /* 331*0957b409SSimon J. Gerraty * Loop below ensures that a and b are reduced by some bits each, 332*0957b409SSimon J. Gerraty * for a total of at least 30 bits. 333*0957b409SSimon J. Gerraty */ 334*0957b409SSimon J. Gerraty for (num = ((m[0] - (m[0] >> 5)) << 1) + 30; num >= 30; num -= 30) { 335*0957b409SSimon J. Gerraty size_t j; 336*0957b409SSimon J. Gerraty uint32_t c0, c1; 337*0957b409SSimon J. Gerraty uint32_t a0, a1, b0, b1; 338*0957b409SSimon J. Gerraty uint64_t a_hi, b_hi; 339*0957b409SSimon J. Gerraty uint32_t a_lo, b_lo; 340*0957b409SSimon J. Gerraty int64_t pa, pb, qa, qb; 341*0957b409SSimon J. Gerraty int i; 342*0957b409SSimon J. Gerraty 343*0957b409SSimon J. Gerraty /* 344*0957b409SSimon J. Gerraty * Extract top words of a and b. If j is the highest 345*0957b409SSimon J. Gerraty * index >= 1 such that a[j] != 0 or b[j] != 0, then we want 346*0957b409SSimon J. Gerraty * (a[j] << 31) + a[j - 1], and (b[j] << 31) + b[j - 1]. 347*0957b409SSimon J. Gerraty * If a and b are down to one word each, then we use a[0] 348*0957b409SSimon J. Gerraty * and b[0]. 349*0957b409SSimon J. Gerraty */ 350*0957b409SSimon J. Gerraty c0 = (uint32_t)-1; 351*0957b409SSimon J. Gerraty c1 = (uint32_t)-1; 352*0957b409SSimon J. Gerraty a0 = 0; 353*0957b409SSimon J. Gerraty a1 = 0; 354*0957b409SSimon J. Gerraty b0 = 0; 355*0957b409SSimon J. Gerraty b1 = 0; 356*0957b409SSimon J. Gerraty j = len; 357*0957b409SSimon J. Gerraty while (j -- > 0) { 358*0957b409SSimon J. Gerraty uint32_t aw, bw; 359*0957b409SSimon J. Gerraty 360*0957b409SSimon J. Gerraty aw = a[j]; 361*0957b409SSimon J. Gerraty bw = b[j]; 362*0957b409SSimon J. Gerraty a0 ^= (a0 ^ aw) & c0; 363*0957b409SSimon J. Gerraty a1 ^= (a1 ^ aw) & c1; 364*0957b409SSimon J. Gerraty b0 ^= (b0 ^ bw) & c0; 365*0957b409SSimon J. Gerraty b1 ^= (b1 ^ bw) & c1; 366*0957b409SSimon J. Gerraty c1 = c0; 367*0957b409SSimon J. Gerraty c0 &= (((aw | bw) + 0x7FFFFFFF) >> 31) - (uint32_t)1; 368*0957b409SSimon J. Gerraty } 369*0957b409SSimon J. Gerraty 370*0957b409SSimon J. Gerraty /* 371*0957b409SSimon J. Gerraty * If c1 = 0, then we grabbed two words for a and b. 372*0957b409SSimon J. Gerraty * If c1 != 0 but c0 = 0, then we grabbed one word. It 373*0957b409SSimon J. Gerraty * is not possible that c1 != 0 and c0 != 0, because that 374*0957b409SSimon J. Gerraty * would mean that both integers are zero. 375*0957b409SSimon J. Gerraty */ 376*0957b409SSimon J. Gerraty a1 |= a0 & c1; 377*0957b409SSimon J. Gerraty a0 &= ~c1; 378*0957b409SSimon J. Gerraty b1 |= b0 & c1; 379*0957b409SSimon J. Gerraty b0 &= ~c1; 380*0957b409SSimon J. Gerraty a_hi = ((uint64_t)a0 << 31) + a1; 381*0957b409SSimon J. Gerraty b_hi = ((uint64_t)b0 << 31) + b1; 382*0957b409SSimon J. Gerraty a_lo = a[0]; 383*0957b409SSimon J. Gerraty b_lo = b[0]; 384*0957b409SSimon J. Gerraty 385*0957b409SSimon J. Gerraty /* 386*0957b409SSimon J. Gerraty * Compute reduction factors: 387*0957b409SSimon J. Gerraty * 388*0957b409SSimon J. Gerraty * a' = a*pa + b*pb 389*0957b409SSimon J. Gerraty * b' = a*qa + b*qb 390*0957b409SSimon J. Gerraty * 391*0957b409SSimon J. Gerraty * such that a' and b' are both multiple of 2^31, but are 392*0957b409SSimon J. Gerraty * only marginally larger than a and b. 393*0957b409SSimon J. Gerraty */ 394*0957b409SSimon J. Gerraty pa = 1; 395*0957b409SSimon J. Gerraty pb = 0; 396*0957b409SSimon J. Gerraty qa = 0; 397*0957b409SSimon J. Gerraty qb = 1; 398*0957b409SSimon J. Gerraty for (i = 0; i < 31; i ++) { 399*0957b409SSimon J. Gerraty /* 400*0957b409SSimon J. Gerraty * At each iteration: 401*0957b409SSimon J. Gerraty * 402*0957b409SSimon J. Gerraty * a <- (a-b)/2 if: a is odd, b is odd, a_hi > b_hi 403*0957b409SSimon J. Gerraty * b <- (b-a)/2 if: a is odd, b is odd, a_hi <= b_hi 404*0957b409SSimon J. Gerraty * a <- a/2 if: a is even 405*0957b409SSimon J. Gerraty * b <- b/2 if: a is odd, b is even 406*0957b409SSimon J. Gerraty * 407*0957b409SSimon J. Gerraty * We multiply a_lo and b_lo by 2 at each 408*0957b409SSimon J. Gerraty * iteration, thus a division by 2 really is a 409*0957b409SSimon J. Gerraty * non-multiplication by 2. 410*0957b409SSimon J. Gerraty */ 411*0957b409SSimon J. Gerraty uint32_t r, oa, ob, cAB, cBA, cA; 412*0957b409SSimon J. Gerraty uint64_t rz; 413*0957b409SSimon J. Gerraty 414*0957b409SSimon J. Gerraty /* 415*0957b409SSimon J. Gerraty * r = GT(a_hi, b_hi) 416*0957b409SSimon J. Gerraty * But the GT() function works on uint32_t operands, 417*0957b409SSimon J. Gerraty * so we inline a 64-bit version here. 418*0957b409SSimon J. Gerraty */ 419*0957b409SSimon J. Gerraty rz = b_hi - a_hi; 420*0957b409SSimon J. Gerraty r = (uint32_t)((rz ^ ((a_hi ^ b_hi) 421*0957b409SSimon J. Gerraty & (a_hi ^ rz))) >> 63); 422*0957b409SSimon J. Gerraty 423*0957b409SSimon J. Gerraty /* 424*0957b409SSimon J. Gerraty * cAB = 1 if b must be subtracted from a 425*0957b409SSimon J. Gerraty * cBA = 1 if a must be subtracted from b 426*0957b409SSimon J. Gerraty * cA = 1 if a is divided by 2, 0 otherwise 427*0957b409SSimon J. Gerraty * 428*0957b409SSimon J. Gerraty * Rules: 429*0957b409SSimon J. Gerraty * 430*0957b409SSimon J. Gerraty * cAB and cBA cannot be both 1. 431*0957b409SSimon J. Gerraty * if a is not divided by 2, b is. 432*0957b409SSimon J. Gerraty */ 433*0957b409SSimon J. Gerraty oa = (a_lo >> i) & 1; 434*0957b409SSimon J. Gerraty ob = (b_lo >> i) & 1; 435*0957b409SSimon J. Gerraty cAB = oa & ob & r; 436*0957b409SSimon J. Gerraty cBA = oa & ob & NOT(r); 437*0957b409SSimon J. Gerraty cA = cAB | NOT(oa); 438*0957b409SSimon J. Gerraty 439*0957b409SSimon J. Gerraty /* 440*0957b409SSimon J. Gerraty * Conditional subtractions. 441*0957b409SSimon J. Gerraty */ 442*0957b409SSimon J. Gerraty a_lo -= b_lo & -cAB; 443*0957b409SSimon J. Gerraty a_hi -= b_hi & -(uint64_t)cAB; 444*0957b409SSimon J. Gerraty pa -= qa & -(int64_t)cAB; 445*0957b409SSimon J. Gerraty pb -= qb & -(int64_t)cAB; 446*0957b409SSimon J. Gerraty b_lo -= a_lo & -cBA; 447*0957b409SSimon J. Gerraty b_hi -= a_hi & -(uint64_t)cBA; 448*0957b409SSimon J. Gerraty qa -= pa & -(int64_t)cBA; 449*0957b409SSimon J. Gerraty qb -= pb & -(int64_t)cBA; 450*0957b409SSimon J. Gerraty 451*0957b409SSimon J. Gerraty /* 452*0957b409SSimon J. Gerraty * Shifting. 453*0957b409SSimon J. Gerraty */ 454*0957b409SSimon J. Gerraty a_lo += a_lo & (cA - 1); 455*0957b409SSimon J. Gerraty pa += pa & ((int64_t)cA - 1); 456*0957b409SSimon J. Gerraty pb += pb & ((int64_t)cA - 1); 457*0957b409SSimon J. Gerraty a_hi ^= (a_hi ^ (a_hi >> 1)) & -(uint64_t)cA; 458*0957b409SSimon J. Gerraty b_lo += b_lo & -cA; 459*0957b409SSimon J. Gerraty qa += qa & -(int64_t)cA; 460*0957b409SSimon J. Gerraty qb += qb & -(int64_t)cA; 461*0957b409SSimon J. Gerraty b_hi ^= (b_hi ^ (b_hi >> 1)) & ((uint64_t)cA - 1); 462*0957b409SSimon J. Gerraty } 463*0957b409SSimon J. Gerraty 464*0957b409SSimon J. Gerraty /* 465*0957b409SSimon J. Gerraty * Replace a and b with new values a' and b'. 466*0957b409SSimon J. Gerraty */ 467*0957b409SSimon J. Gerraty r = co_reduce(a, b, len, pa, pb, qa, qb); 468*0957b409SSimon J. Gerraty pa -= pa * ((r & 1) << 1); 469*0957b409SSimon J. Gerraty pb -= pb * ((r & 1) << 1); 470*0957b409SSimon J. Gerraty qa -= qa * (r & 2); 471*0957b409SSimon J. Gerraty qb -= qb * (r & 2); 472*0957b409SSimon J. Gerraty co_reduce_mod(u, v, len, pa, pb, qa, qb, m + 1, m0i); 473*0957b409SSimon J. Gerraty } 474*0957b409SSimon J. Gerraty 475*0957b409SSimon J. Gerraty /* 476*0957b409SSimon J. Gerraty * Now one of the arrays should be 0, and the other contains 477*0957b409SSimon J. Gerraty * the GCD. If a is 0, then u is 0 as well, and v contains 478*0957b409SSimon J. Gerraty * the division result. 479*0957b409SSimon J. Gerraty * Result is correct if and only if GCD is 1. 480*0957b409SSimon J. Gerraty */ 481*0957b409SSimon J. Gerraty r = (a[0] | b[0]) ^ 1; 482*0957b409SSimon J. Gerraty u[0] |= v[0]; 483*0957b409SSimon J. Gerraty for (k = 1; k < len; k ++) { 484*0957b409SSimon J. Gerraty r |= a[k] | b[k]; 485*0957b409SSimon J. Gerraty u[k] |= v[k]; 486*0957b409SSimon J. Gerraty } 487*0957b409SSimon J. Gerraty return EQ0(r); 488*0957b409SSimon J. Gerraty } 489