xref: /freebsd/contrib/bearssl/src/int/i15_moddiv.c (revision 2aaf9152a852aba9eb2036b95f4948ee77988826)
1*0957b409SSimon J. Gerraty /*
2*0957b409SSimon J. Gerraty  * Copyright (c) 2018 Thomas Pornin <pornin@bolet.org>
3*0957b409SSimon J. Gerraty  *
4*0957b409SSimon J. Gerraty  * Permission is hereby granted, free of charge, to any person obtaining
5*0957b409SSimon J. Gerraty  * a copy of this software and associated documentation files (the
6*0957b409SSimon J. Gerraty  * "Software"), to deal in the Software without restriction, including
7*0957b409SSimon J. Gerraty  * without limitation the rights to use, copy, modify, merge, publish,
8*0957b409SSimon J. Gerraty  * distribute, sublicense, and/or sell copies of the Software, and to
9*0957b409SSimon J. Gerraty  * permit persons to whom the Software is furnished to do so, subject to
10*0957b409SSimon J. Gerraty  * the following conditions:
11*0957b409SSimon J. Gerraty  *
12*0957b409SSimon J. Gerraty  * The above copyright notice and this permission notice shall be
13*0957b409SSimon J. Gerraty  * included in all copies or substantial portions of the Software.
14*0957b409SSimon J. Gerraty  *
15*0957b409SSimon J. Gerraty  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
16*0957b409SSimon J. Gerraty  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
17*0957b409SSimon J. Gerraty  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
18*0957b409SSimon J. Gerraty  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
19*0957b409SSimon J. Gerraty  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
20*0957b409SSimon J. Gerraty  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
21*0957b409SSimon J. Gerraty  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22*0957b409SSimon J. Gerraty  * SOFTWARE.
23*0957b409SSimon J. Gerraty  */
24*0957b409SSimon J. Gerraty 
25*0957b409SSimon J. Gerraty #include "inner.h"
26*0957b409SSimon J. Gerraty 
27*0957b409SSimon J. Gerraty /*
28*0957b409SSimon J. Gerraty  * In this file, we handle big integers with a custom format, i.e.
29*0957b409SSimon J. Gerraty  * without the usual one-word header. Value is split into 15-bit words,
30*0957b409SSimon J. Gerraty  * each stored in a 16-bit slot (top bit is zero) in little-endian
31*0957b409SSimon J. Gerraty  * order. The length (in words) is provided explicitly. In some cases,
32*0957b409SSimon J. Gerraty  * the value can be negative (using two's complement representation). In
33*0957b409SSimon J. Gerraty  * some cases, the top word is allowed to have a 16th bit.
34*0957b409SSimon J. Gerraty  */
35*0957b409SSimon J. Gerraty 
36*0957b409SSimon J. Gerraty /*
37*0957b409SSimon J. Gerraty  * Negate big integer conditionally. The value consists of 'len' words,
38*0957b409SSimon J. Gerraty  * with 15 bits in each word (the top bit of each word should be 0,
39*0957b409SSimon J. Gerraty  * except possibly for the last word). If 'ctl' is 1, the negation is
40*0957b409SSimon J. Gerraty  * computed; otherwise, if 'ctl' is 0, then the value is unchanged.
41*0957b409SSimon J. Gerraty  */
42*0957b409SSimon J. Gerraty static void
cond_negate(uint16_t * a,size_t len,uint32_t ctl)43*0957b409SSimon J. Gerraty cond_negate(uint16_t *a, size_t len, uint32_t ctl)
44*0957b409SSimon J. Gerraty {
45*0957b409SSimon J. Gerraty 	size_t k;
46*0957b409SSimon J. Gerraty 	uint32_t cc, xm;
47*0957b409SSimon J. Gerraty 
48*0957b409SSimon J. Gerraty 	cc = ctl;
49*0957b409SSimon J. Gerraty 	xm = 0x7FFF & -ctl;
50*0957b409SSimon J. Gerraty 	for (k = 0; k < len; k ++) {
51*0957b409SSimon J. Gerraty 		uint32_t aw;
52*0957b409SSimon J. Gerraty 
53*0957b409SSimon J. Gerraty 		aw = a[k];
54*0957b409SSimon J. Gerraty 		aw = (aw ^ xm) + cc;
55*0957b409SSimon J. Gerraty 		a[k] = aw & 0x7FFF;
56*0957b409SSimon J. Gerraty 		cc = (aw >> 15) & 1;
57*0957b409SSimon J. Gerraty 	}
58*0957b409SSimon J. Gerraty }
59*0957b409SSimon J. Gerraty 
60*0957b409SSimon J. Gerraty /*
61*0957b409SSimon J. Gerraty  * Finish modular reduction. Rules on input parameters:
62*0957b409SSimon J. Gerraty  *
63*0957b409SSimon J. Gerraty  *   if neg = 1, then -m <= a < 0
64*0957b409SSimon J. Gerraty  *   if neg = 0, then 0 <= a < 2*m
65*0957b409SSimon J. Gerraty  *
66*0957b409SSimon J. Gerraty  * If neg = 0, then the top word of a[] may use 16 bits.
67*0957b409SSimon J. Gerraty  *
68*0957b409SSimon J. Gerraty  * Also, modulus m must be odd.
69*0957b409SSimon J. Gerraty  */
70*0957b409SSimon J. Gerraty static void
finish_mod(uint16_t * a,size_t len,const uint16_t * m,uint32_t neg)71*0957b409SSimon J. Gerraty finish_mod(uint16_t *a, size_t len, const uint16_t *m, uint32_t neg)
72*0957b409SSimon J. Gerraty {
73*0957b409SSimon J. Gerraty 	size_t k;
74*0957b409SSimon J. Gerraty 	uint32_t cc, xm, ym;
75*0957b409SSimon J. Gerraty 
76*0957b409SSimon J. Gerraty 	/*
77*0957b409SSimon J. Gerraty 	 * First pass: compare a (assumed nonnegative) with m.
78*0957b409SSimon J. Gerraty 	 */
79*0957b409SSimon J. Gerraty 	cc = 0;
80*0957b409SSimon J. Gerraty 	for (k = 0; k < len; k ++) {
81*0957b409SSimon J. Gerraty 		uint32_t aw, mw;
82*0957b409SSimon J. Gerraty 
83*0957b409SSimon J. Gerraty 		aw = a[k];
84*0957b409SSimon J. Gerraty 		mw = m[k];
85*0957b409SSimon J. Gerraty 		cc = (aw - mw - cc) >> 31;
86*0957b409SSimon J. Gerraty 	}
87*0957b409SSimon J. Gerraty 
88*0957b409SSimon J. Gerraty 	/*
89*0957b409SSimon J. Gerraty 	 * At this point:
90*0957b409SSimon J. Gerraty 	 *   if neg = 1, then we must add m (regardless of cc)
91*0957b409SSimon J. Gerraty 	 *   if neg = 0 and cc = 0, then we must subtract m
92*0957b409SSimon J. Gerraty 	 *   if neg = 0 and cc = 1, then we must do nothing
93*0957b409SSimon J. Gerraty 	 */
94*0957b409SSimon J. Gerraty 	xm = 0x7FFF & -neg;
95*0957b409SSimon J. Gerraty 	ym = -(neg | (1 - cc));
96*0957b409SSimon J. Gerraty 	cc = neg;
97*0957b409SSimon J. Gerraty 	for (k = 0; k < len; k ++) {
98*0957b409SSimon J. Gerraty 		uint32_t aw, mw;
99*0957b409SSimon J. Gerraty 
100*0957b409SSimon J. Gerraty 		aw = a[k];
101*0957b409SSimon J. Gerraty 		mw = (m[k] ^ xm) & ym;
102*0957b409SSimon J. Gerraty 		aw = aw - mw - cc;
103*0957b409SSimon J. Gerraty 		a[k] = aw & 0x7FFF;
104*0957b409SSimon J. Gerraty 		cc = aw >> 31;
105*0957b409SSimon J. Gerraty 	}
106*0957b409SSimon J. Gerraty }
107*0957b409SSimon J. Gerraty 
108*0957b409SSimon J. Gerraty /*
109*0957b409SSimon J. Gerraty  * Compute:
110*0957b409SSimon J. Gerraty  *   a <- (a*pa+b*pb)/(2^15)
111*0957b409SSimon J. Gerraty  *   b <- (a*qa+b*qb)/(2^15)
112*0957b409SSimon J. Gerraty  * The division is assumed to be exact (i.e. the low word is dropped).
113*0957b409SSimon J. Gerraty  * If the final a is negative, then it is negated. Similarly for b.
114*0957b409SSimon J. Gerraty  * Returned value is the combination of two bits:
115*0957b409SSimon J. Gerraty  *   bit 0: 1 if a had to be negated, 0 otherwise
116*0957b409SSimon J. Gerraty  *   bit 1: 1 if b had to be negated, 0 otherwise
117*0957b409SSimon J. Gerraty  *
118*0957b409SSimon J. Gerraty  * Factors pa, pb, qa and qb must be at most 2^15 in absolute value.
119*0957b409SSimon J. Gerraty  * Source integers a and b must be nonnegative; top word is not allowed
120*0957b409SSimon J. Gerraty  * to contain an extra 16th bit.
121*0957b409SSimon J. Gerraty  */
122*0957b409SSimon J. Gerraty static uint32_t
co_reduce(uint16_t * a,uint16_t * b,size_t len,int32_t pa,int32_t pb,int32_t qa,int32_t qb)123*0957b409SSimon J. Gerraty co_reduce(uint16_t *a, uint16_t *b, size_t len,
124*0957b409SSimon J. Gerraty 	int32_t pa, int32_t pb, int32_t qa, int32_t qb)
125*0957b409SSimon J. Gerraty {
126*0957b409SSimon J. Gerraty 	size_t k;
127*0957b409SSimon J. Gerraty 	int32_t cca, ccb;
128*0957b409SSimon J. Gerraty 	uint32_t nega, negb;
129*0957b409SSimon J. Gerraty 
130*0957b409SSimon J. Gerraty 	cca = 0;
131*0957b409SSimon J. Gerraty 	ccb = 0;
132*0957b409SSimon J. Gerraty 	for (k = 0; k < len; k ++) {
133*0957b409SSimon J. Gerraty 		uint32_t wa, wb, za, zb;
134*0957b409SSimon J. Gerraty 		uint16_t tta, ttb;
135*0957b409SSimon J. Gerraty 
136*0957b409SSimon J. Gerraty 		/*
137*0957b409SSimon J. Gerraty 		 * Since:
138*0957b409SSimon J. Gerraty 		 *   |pa| <= 2^15
139*0957b409SSimon J. Gerraty 		 *   |pb| <= 2^15
140*0957b409SSimon J. Gerraty 		 *   0 <= wa <= 2^15 - 1
141*0957b409SSimon J. Gerraty 		 *   0 <= wb <= 2^15 - 1
142*0957b409SSimon J. Gerraty 		 *   |cca| <= 2^16 - 1
143*0957b409SSimon J. Gerraty 		 * Then:
144*0957b409SSimon J. Gerraty 		 *   |za| <= (2^15-1)*(2^16) + (2^16-1) = 2^31 - 1
145*0957b409SSimon J. Gerraty 		 *
146*0957b409SSimon J. Gerraty 		 * Thus, the new value of cca is such that |cca| <= 2^16 - 1.
147*0957b409SSimon J. Gerraty 		 * The same applies to ccb.
148*0957b409SSimon J. Gerraty 		 */
149*0957b409SSimon J. Gerraty 		wa = a[k];
150*0957b409SSimon J. Gerraty 		wb = b[k];
151*0957b409SSimon J. Gerraty 		za = wa * (uint32_t)pa + wb * (uint32_t)pb + (uint32_t)cca;
152*0957b409SSimon J. Gerraty 		zb = wa * (uint32_t)qa + wb * (uint32_t)qb + (uint32_t)ccb;
153*0957b409SSimon J. Gerraty 		if (k > 0) {
154*0957b409SSimon J. Gerraty 			a[k - 1] = za & 0x7FFF;
155*0957b409SSimon J. Gerraty 			b[k - 1] = zb & 0x7FFF;
156*0957b409SSimon J. Gerraty 		}
157*0957b409SSimon J. Gerraty 		tta = za >> 15;
158*0957b409SSimon J. Gerraty 		ttb = zb >> 15;
159*0957b409SSimon J. Gerraty 		cca = *(int16_t *)&tta;
160*0957b409SSimon J. Gerraty 		ccb = *(int16_t *)&ttb;
161*0957b409SSimon J. Gerraty 	}
162*0957b409SSimon J. Gerraty 	a[len - 1] = (uint16_t)cca;
163*0957b409SSimon J. Gerraty 	b[len - 1] = (uint16_t)ccb;
164*0957b409SSimon J. Gerraty 	nega = (uint32_t)cca >> 31;
165*0957b409SSimon J. Gerraty 	negb = (uint32_t)ccb >> 31;
166*0957b409SSimon J. Gerraty 	cond_negate(a, len, nega);
167*0957b409SSimon J. Gerraty 	cond_negate(b, len, negb);
168*0957b409SSimon J. Gerraty 	return nega | (negb << 1);
169*0957b409SSimon J. Gerraty }
170*0957b409SSimon J. Gerraty 
171*0957b409SSimon J. Gerraty /*
172*0957b409SSimon J. Gerraty  * Compute:
173*0957b409SSimon J. Gerraty  *   a <- (a*pa+b*pb)/(2^15) mod m
174*0957b409SSimon J. Gerraty  *   b <- (a*qa+b*qb)/(2^15) mod m
175*0957b409SSimon J. Gerraty  *
176*0957b409SSimon J. Gerraty  * m0i is equal to -1/m[0] mod 2^15.
177*0957b409SSimon J. Gerraty  *
178*0957b409SSimon J. Gerraty  * Factors pa, pb, qa and qb must be at most 2^15 in absolute value.
179*0957b409SSimon J. Gerraty  * Source integers a and b must be nonnegative; top word is not allowed
180*0957b409SSimon J. Gerraty  * to contain an extra 16th bit.
181*0957b409SSimon J. Gerraty  */
182*0957b409SSimon J. Gerraty static void
co_reduce_mod(uint16_t * a,uint16_t * b,size_t len,int32_t pa,int32_t pb,int32_t qa,int32_t qb,const uint16_t * m,uint16_t m0i)183*0957b409SSimon J. Gerraty co_reduce_mod(uint16_t *a, uint16_t *b, size_t len,
184*0957b409SSimon J. Gerraty 	int32_t pa, int32_t pb, int32_t qa, int32_t qb,
185*0957b409SSimon J. Gerraty 	const uint16_t *m, uint16_t m0i)
186*0957b409SSimon J. Gerraty {
187*0957b409SSimon J. Gerraty 	size_t k;
188*0957b409SSimon J. Gerraty 	int32_t cca, ccb, fa, fb;
189*0957b409SSimon J. Gerraty 
190*0957b409SSimon J. Gerraty 	cca = 0;
191*0957b409SSimon J. Gerraty 	ccb = 0;
192*0957b409SSimon J. Gerraty 	fa = ((a[0] * (uint32_t)pa + b[0] * (uint32_t)pb) * m0i) & 0x7FFF;
193*0957b409SSimon J. Gerraty 	fb = ((a[0] * (uint32_t)qa + b[0] * (uint32_t)qb) * m0i) & 0x7FFF;
194*0957b409SSimon J. Gerraty 	for (k = 0; k < len; k ++) {
195*0957b409SSimon J. Gerraty 		uint32_t wa, wb, za, zb;
196*0957b409SSimon J. Gerraty 		uint32_t tta, ttb;
197*0957b409SSimon J. Gerraty 
198*0957b409SSimon J. Gerraty 		/*
199*0957b409SSimon J. Gerraty 		 * In this loop, carries 'cca' and 'ccb' always fit on
200*0957b409SSimon J. Gerraty 		 * 17 bits (in absolute value).
201*0957b409SSimon J. Gerraty 		 */
202*0957b409SSimon J. Gerraty 		wa = a[k];
203*0957b409SSimon J. Gerraty 		wb = b[k];
204*0957b409SSimon J. Gerraty 		za = wa * (uint32_t)pa + wb * (uint32_t)pb
205*0957b409SSimon J. Gerraty 			+ m[k] * (uint32_t)fa + (uint32_t)cca;
206*0957b409SSimon J. Gerraty 		zb = wa * (uint32_t)qa + wb * (uint32_t)qb
207*0957b409SSimon J. Gerraty 			+ m[k] * (uint32_t)fb + (uint32_t)ccb;
208*0957b409SSimon J. Gerraty 		if (k > 0) {
209*0957b409SSimon J. Gerraty 			a[k - 1] = za & 0x7FFF;
210*0957b409SSimon J. Gerraty 			b[k - 1] = zb & 0x7FFF;
211*0957b409SSimon J. Gerraty 		}
212*0957b409SSimon J. Gerraty 
213*0957b409SSimon J. Gerraty 		/*
214*0957b409SSimon J. Gerraty 		 * The XOR-and-sub construction below does an arithmetic
215*0957b409SSimon J. Gerraty 		 * right shift in a portable way (technically, right-shifting
216*0957b409SSimon J. Gerraty 		 * a negative signed value is implementation-defined in C).
217*0957b409SSimon J. Gerraty 		 */
218*0957b409SSimon J. Gerraty #define M   ((uint32_t)1 << 16)
219*0957b409SSimon J. Gerraty 		tta = za >> 15;
220*0957b409SSimon J. Gerraty 		ttb = zb >> 15;
221*0957b409SSimon J. Gerraty 		tta = (tta ^ M) - M;
222*0957b409SSimon J. Gerraty 		ttb = (ttb ^ M) - M;
223*0957b409SSimon J. Gerraty 		cca = *(int32_t *)&tta;
224*0957b409SSimon J. Gerraty 		ccb = *(int32_t *)&ttb;
225*0957b409SSimon J. Gerraty #undef M
226*0957b409SSimon J. Gerraty 	}
227*0957b409SSimon J. Gerraty 	a[len - 1] = (uint32_t)cca;
228*0957b409SSimon J. Gerraty 	b[len - 1] = (uint32_t)ccb;
229*0957b409SSimon J. Gerraty 
230*0957b409SSimon J. Gerraty 	/*
231*0957b409SSimon J. Gerraty 	 * At this point:
232*0957b409SSimon J. Gerraty 	 *   -m <= a < 2*m
233*0957b409SSimon J. Gerraty 	 *   -m <= b < 2*m
234*0957b409SSimon J. Gerraty 	 * (this is a case of Montgomery reduction)
235*0957b409SSimon J. Gerraty 	 * The top word of 'a' and 'b' may have a 16-th bit set.
236*0957b409SSimon J. Gerraty 	 * We may have to add or subtract the modulus.
237*0957b409SSimon J. Gerraty 	 */
238*0957b409SSimon J. Gerraty 	finish_mod(a, len, m, (uint32_t)cca >> 31);
239*0957b409SSimon J. Gerraty 	finish_mod(b, len, m, (uint32_t)ccb >> 31);
240*0957b409SSimon J. Gerraty }
241*0957b409SSimon J. Gerraty 
242*0957b409SSimon J. Gerraty /* see inner.h */
243*0957b409SSimon J. Gerraty uint32_t
br_i15_moddiv(uint16_t * x,const uint16_t * y,const uint16_t * m,uint16_t m0i,uint16_t * t)244*0957b409SSimon J. Gerraty br_i15_moddiv(uint16_t *x, const uint16_t *y, const uint16_t *m, uint16_t m0i,
245*0957b409SSimon J. Gerraty 	uint16_t *t)
246*0957b409SSimon J. Gerraty {
247*0957b409SSimon J. Gerraty 	/*
248*0957b409SSimon J. Gerraty 	 * Algorithm is an extended binary GCD. We maintain four values
249*0957b409SSimon J. Gerraty 	 * a, b, u and v, with the following invariants:
250*0957b409SSimon J. Gerraty 	 *
251*0957b409SSimon J. Gerraty 	 *   a * x = y * u mod m
252*0957b409SSimon J. Gerraty 	 *   b * x = y * v mod m
253*0957b409SSimon J. Gerraty 	 *
254*0957b409SSimon J. Gerraty 	 * Starting values are:
255*0957b409SSimon J. Gerraty 	 *
256*0957b409SSimon J. Gerraty 	 *   a = y
257*0957b409SSimon J. Gerraty 	 *   b = m
258*0957b409SSimon J. Gerraty 	 *   u = x
259*0957b409SSimon J. Gerraty 	 *   v = 0
260*0957b409SSimon J. Gerraty 	 *
261*0957b409SSimon J. Gerraty 	 * The formal definition of the algorithm is a sequence of steps:
262*0957b409SSimon J. Gerraty 	 *
263*0957b409SSimon J. Gerraty 	 *   - If a is even, then a <- a/2 and u <- u/2 mod m.
264*0957b409SSimon J. Gerraty 	 *   - Otherwise, if b is even, then b <- b/2 and v <- v/2 mod m.
265*0957b409SSimon J. Gerraty 	 *   - Otherwise, if a > b, then a <- (a-b)/2 and u <- (u-v)/2 mod m.
266*0957b409SSimon J. Gerraty 	 *   - Otherwise, b <- (b-a)/2 and v <- (v-u)/2 mod m.
267*0957b409SSimon J. Gerraty 	 *
268*0957b409SSimon J. Gerraty 	 * Algorithm stops when a = b. At that point, they both are equal
269*0957b409SSimon J. Gerraty 	 * to GCD(y,m); the modular division succeeds if that value is 1.
270*0957b409SSimon J. Gerraty 	 * The result of the modular division is then u (or v: both are
271*0957b409SSimon J. Gerraty 	 * equal at that point).
272*0957b409SSimon J. Gerraty 	 *
273*0957b409SSimon J. Gerraty 	 * Each step makes either a or b shrink by at least one bit; hence,
274*0957b409SSimon J. Gerraty 	 * if m has bit length k bits, then 2k-2 steps are sufficient.
275*0957b409SSimon J. Gerraty 	 *
276*0957b409SSimon J. Gerraty 	 *
277*0957b409SSimon J. Gerraty 	 * Though complexity is quadratic in the size of m, the bit-by-bit
278*0957b409SSimon J. Gerraty 	 * processing is not very efficient. We can speed up processing by
279*0957b409SSimon J. Gerraty 	 * remarking that the decisions are taken based only on observation
280*0957b409SSimon J. Gerraty 	 * of the top and low bits of a and b.
281*0957b409SSimon J. Gerraty 	 *
282*0957b409SSimon J. Gerraty 	 * In the loop below, at each iteration, we use the two top words
283*0957b409SSimon J. Gerraty 	 * of a and b, and the low words of a and b, to compute reduction
284*0957b409SSimon J. Gerraty 	 * parameters pa, pb, qa and qb such that the new values for a
285*0957b409SSimon J. Gerraty 	 * and b are:
286*0957b409SSimon J. Gerraty 	 *
287*0957b409SSimon J. Gerraty 	 *   a' = (a*pa + b*pb) / (2^15)
288*0957b409SSimon J. Gerraty 	 *   b' = (a*qa + b*qb) / (2^15)
289*0957b409SSimon J. Gerraty 	 *
290*0957b409SSimon J. Gerraty 	 * the division being exact.
291*0957b409SSimon J. Gerraty 	 *
292*0957b409SSimon J. Gerraty 	 * Since the choices are based on the top words, they may be slightly
293*0957b409SSimon J. Gerraty 	 * off, requiring an optional correction: if a' < 0, then we replace
294*0957b409SSimon J. Gerraty 	 * pa with -pa, and pb with -pb. The total length of a and b is
295*0957b409SSimon J. Gerraty 	 * thus reduced by at least 14 bits at each iteration.
296*0957b409SSimon J. Gerraty 	 *
297*0957b409SSimon J. Gerraty 	 * The stopping conditions are still the same, though: when a
298*0957b409SSimon J. Gerraty 	 * and b become equal, they must be both odd (since m is odd,
299*0957b409SSimon J. Gerraty 	 * the GCD cannot be even), therefore the next operation is a
300*0957b409SSimon J. Gerraty 	 * subtraction, and one of the values becomes 0. At that point,
301*0957b409SSimon J. Gerraty 	 * nothing else happens, i.e. one value is stuck at 0, and the
302*0957b409SSimon J. Gerraty 	 * other one is the GCD.
303*0957b409SSimon J. Gerraty 	 */
304*0957b409SSimon J. Gerraty 	size_t len, k;
305*0957b409SSimon J. Gerraty 	uint16_t *a, *b, *u, *v;
306*0957b409SSimon J. Gerraty 	uint32_t num, r;
307*0957b409SSimon J. Gerraty 
308*0957b409SSimon J. Gerraty 	len = (m[0] + 15) >> 4;
309*0957b409SSimon J. Gerraty 	a = t;
310*0957b409SSimon J. Gerraty 	b = a + len;
311*0957b409SSimon J. Gerraty 	u = x + 1;
312*0957b409SSimon J. Gerraty 	v = b + len;
313*0957b409SSimon J. Gerraty 	memcpy(a, y + 1, len * sizeof *y);
314*0957b409SSimon J. Gerraty 	memcpy(b, m + 1, len * sizeof *m);
315*0957b409SSimon J. Gerraty 	memset(v, 0, len * sizeof *v);
316*0957b409SSimon J. Gerraty 
317*0957b409SSimon J. Gerraty 	/*
318*0957b409SSimon J. Gerraty 	 * Loop below ensures that a and b are reduced by some bits each,
319*0957b409SSimon J. Gerraty 	 * for a total of at least 14 bits.
320*0957b409SSimon J. Gerraty 	 */
321*0957b409SSimon J. Gerraty 	for (num = ((m[0] - (m[0] >> 4)) << 1) + 14; num >= 14; num -= 14) {
322*0957b409SSimon J. Gerraty 		size_t j;
323*0957b409SSimon J. Gerraty 		uint32_t c0, c1;
324*0957b409SSimon J. Gerraty 		uint32_t a0, a1, b0, b1;
325*0957b409SSimon J. Gerraty 		uint32_t a_hi, b_hi, a_lo, b_lo;
326*0957b409SSimon J. Gerraty 		int32_t pa, pb, qa, qb;
327*0957b409SSimon J. Gerraty 		int i;
328*0957b409SSimon J. Gerraty 
329*0957b409SSimon J. Gerraty 		/*
330*0957b409SSimon J. Gerraty 		 * Extract top words of a and b. If j is the highest
331*0957b409SSimon J. Gerraty 		 * index >= 1 such that a[j] != 0 or b[j] != 0, then we want
332*0957b409SSimon J. Gerraty 		 * (a[j] << 15) + a[j - 1], and (b[j] << 15) + b[j - 1].
333*0957b409SSimon J. Gerraty 		 * If a and b are down to one word each, then we use a[0]
334*0957b409SSimon J. Gerraty 		 * and b[0].
335*0957b409SSimon J. Gerraty 		 */
336*0957b409SSimon J. Gerraty 		c0 = (uint32_t)-1;
337*0957b409SSimon J. Gerraty 		c1 = (uint32_t)-1;
338*0957b409SSimon J. Gerraty 		a0 = 0;
339*0957b409SSimon J. Gerraty 		a1 = 0;
340*0957b409SSimon J. Gerraty 		b0 = 0;
341*0957b409SSimon J. Gerraty 		b1 = 0;
342*0957b409SSimon J. Gerraty 		j = len;
343*0957b409SSimon J. Gerraty 		while (j -- > 0) {
344*0957b409SSimon J. Gerraty 			uint32_t aw, bw;
345*0957b409SSimon J. Gerraty 
346*0957b409SSimon J. Gerraty 			aw = a[j];
347*0957b409SSimon J. Gerraty 			bw = b[j];
348*0957b409SSimon J. Gerraty 			a0 ^= (a0 ^ aw) & c0;
349*0957b409SSimon J. Gerraty 			a1 ^= (a1 ^ aw) & c1;
350*0957b409SSimon J. Gerraty 			b0 ^= (b0 ^ bw) & c0;
351*0957b409SSimon J. Gerraty 			b1 ^= (b1 ^ bw) & c1;
352*0957b409SSimon J. Gerraty 			c1 = c0;
353*0957b409SSimon J. Gerraty 			c0 &= (((aw | bw) + 0xFFFF) >> 16) - (uint32_t)1;
354*0957b409SSimon J. Gerraty 		}
355*0957b409SSimon J. Gerraty 
356*0957b409SSimon J. Gerraty 		/*
357*0957b409SSimon J. Gerraty 		 * If c1 = 0, then we grabbed two words for a and b.
358*0957b409SSimon J. Gerraty 		 * If c1 != 0 but c0 = 0, then we grabbed one word. It
359*0957b409SSimon J. Gerraty 		 * is not possible that c1 != 0 and c0 != 0, because that
360*0957b409SSimon J. Gerraty 		 * would mean that both integers are zero.
361*0957b409SSimon J. Gerraty 		 */
362*0957b409SSimon J. Gerraty 		a1 |= a0 & c1;
363*0957b409SSimon J. Gerraty 		a0 &= ~c1;
364*0957b409SSimon J. Gerraty 		b1 |= b0 & c1;
365*0957b409SSimon J. Gerraty 		b0 &= ~c1;
366*0957b409SSimon J. Gerraty 		a_hi = (a0 << 15) + a1;
367*0957b409SSimon J. Gerraty 		b_hi = (b0 << 15) + b1;
368*0957b409SSimon J. Gerraty 		a_lo = a[0];
369*0957b409SSimon J. Gerraty 		b_lo = b[0];
370*0957b409SSimon J. Gerraty 
371*0957b409SSimon J. Gerraty 		/*
372*0957b409SSimon J. Gerraty 		 * Compute reduction factors:
373*0957b409SSimon J. Gerraty 		 *
374*0957b409SSimon J. Gerraty 		 *   a' = a*pa + b*pb
375*0957b409SSimon J. Gerraty 		 *   b' = a*qa + b*qb
376*0957b409SSimon J. Gerraty 		 *
377*0957b409SSimon J. Gerraty 		 * such that a' and b' are both multiple of 2^15, but are
378*0957b409SSimon J. Gerraty 		 * only marginally larger than a and b.
379*0957b409SSimon J. Gerraty 		 */
380*0957b409SSimon J. Gerraty 		pa = 1;
381*0957b409SSimon J. Gerraty 		pb = 0;
382*0957b409SSimon J. Gerraty 		qa = 0;
383*0957b409SSimon J. Gerraty 		qb = 1;
384*0957b409SSimon J. Gerraty 		for (i = 0; i < 15; i ++) {
385*0957b409SSimon J. Gerraty 			/*
386*0957b409SSimon J. Gerraty 			 * At each iteration:
387*0957b409SSimon J. Gerraty 			 *
388*0957b409SSimon J. Gerraty 			 *   a <- (a-b)/2 if: a is odd, b is odd, a_hi > b_hi
389*0957b409SSimon J. Gerraty 			 *   b <- (b-a)/2 if: a is odd, b is odd, a_hi <= b_hi
390*0957b409SSimon J. Gerraty 			 *   a <- a/2 if: a is even
391*0957b409SSimon J. Gerraty 			 *   b <- b/2 if: a is odd, b is even
392*0957b409SSimon J. Gerraty 			 *
393*0957b409SSimon J. Gerraty 			 * We multiply a_lo and b_lo by 2 at each
394*0957b409SSimon J. Gerraty 			 * iteration, thus a division by 2 really is a
395*0957b409SSimon J. Gerraty 			 * non-multiplication by 2.
396*0957b409SSimon J. Gerraty 			 */
397*0957b409SSimon J. Gerraty 			uint32_t r, oa, ob, cAB, cBA, cA;
398*0957b409SSimon J. Gerraty 
399*0957b409SSimon J. Gerraty 			/*
400*0957b409SSimon J. Gerraty 			 * cAB = 1 if b must be subtracted from a
401*0957b409SSimon J. Gerraty 			 * cBA = 1 if a must be subtracted from b
402*0957b409SSimon J. Gerraty 			 * cA = 1 if a is divided by 2, 0 otherwise
403*0957b409SSimon J. Gerraty 			 *
404*0957b409SSimon J. Gerraty 			 * Rules:
405*0957b409SSimon J. Gerraty 			 *
406*0957b409SSimon J. Gerraty 			 *   cAB and cBA cannot be both 1.
407*0957b409SSimon J. Gerraty 			 *   if a is not divided by 2, b is.
408*0957b409SSimon J. Gerraty 			 */
409*0957b409SSimon J. Gerraty 			r = GT(a_hi, b_hi);
410*0957b409SSimon J. Gerraty 			oa = (a_lo >> i) & 1;
411*0957b409SSimon J. Gerraty 			ob = (b_lo >> i) & 1;
412*0957b409SSimon J. Gerraty 			cAB = oa & ob & r;
413*0957b409SSimon J. Gerraty 			cBA = oa & ob & NOT(r);
414*0957b409SSimon J. Gerraty 			cA = cAB | NOT(oa);
415*0957b409SSimon J. Gerraty 
416*0957b409SSimon J. Gerraty 			/*
417*0957b409SSimon J. Gerraty 			 * Conditional subtractions.
418*0957b409SSimon J. Gerraty 			 */
419*0957b409SSimon J. Gerraty 			a_lo -= b_lo & -cAB;
420*0957b409SSimon J. Gerraty 			a_hi -= b_hi & -cAB;
421*0957b409SSimon J. Gerraty 			pa -= qa & -(int32_t)cAB;
422*0957b409SSimon J. Gerraty 			pb -= qb & -(int32_t)cAB;
423*0957b409SSimon J. Gerraty 			b_lo -= a_lo & -cBA;
424*0957b409SSimon J. Gerraty 			b_hi -= a_hi & -cBA;
425*0957b409SSimon J. Gerraty 			qa -= pa & -(int32_t)cBA;
426*0957b409SSimon J. Gerraty 			qb -= pb & -(int32_t)cBA;
427*0957b409SSimon J. Gerraty 
428*0957b409SSimon J. Gerraty 			/*
429*0957b409SSimon J. Gerraty 			 * Shifting.
430*0957b409SSimon J. Gerraty 			 */
431*0957b409SSimon J. Gerraty 			a_lo += a_lo & (cA - 1);
432*0957b409SSimon J. Gerraty 			pa += pa & ((int32_t)cA - 1);
433*0957b409SSimon J. Gerraty 			pb += pb & ((int32_t)cA - 1);
434*0957b409SSimon J. Gerraty 			a_hi ^= (a_hi ^ (a_hi >> 1)) & -cA;
435*0957b409SSimon J. Gerraty 			b_lo += b_lo & -cA;
436*0957b409SSimon J. Gerraty 			qa += qa & -(int32_t)cA;
437*0957b409SSimon J. Gerraty 			qb += qb & -(int32_t)cA;
438*0957b409SSimon J. Gerraty 			b_hi ^= (b_hi ^ (b_hi >> 1)) & (cA - 1);
439*0957b409SSimon J. Gerraty 		}
440*0957b409SSimon J. Gerraty 
441*0957b409SSimon J. Gerraty 		/*
442*0957b409SSimon J. Gerraty 		 * Replace a and b with new values a' and b'.
443*0957b409SSimon J. Gerraty 		 */
444*0957b409SSimon J. Gerraty 		r = co_reduce(a, b, len, pa, pb, qa, qb);
445*0957b409SSimon J. Gerraty 		pa -= pa * ((r & 1) << 1);
446*0957b409SSimon J. Gerraty 		pb -= pb * ((r & 1) << 1);
447*0957b409SSimon J. Gerraty 		qa -= qa * (r & 2);
448*0957b409SSimon J. Gerraty 		qb -= qb * (r & 2);
449*0957b409SSimon J. Gerraty 		co_reduce_mod(u, v, len, pa, pb, qa, qb, m + 1, m0i);
450*0957b409SSimon J. Gerraty 	}
451*0957b409SSimon J. Gerraty 
452*0957b409SSimon J. Gerraty 	/*
453*0957b409SSimon J. Gerraty 	 * Now one of the arrays should be 0, and the other contains
454*0957b409SSimon J. Gerraty 	 * the GCD. If a is 0, then u is 0 as well, and v contains
455*0957b409SSimon J. Gerraty 	 * the division result.
456*0957b409SSimon J. Gerraty 	 * Result is correct if and only if GCD is 1.
457*0957b409SSimon J. Gerraty 	 */
458*0957b409SSimon J. Gerraty 	r = (a[0] | b[0]) ^ 1;
459*0957b409SSimon J. Gerraty 	u[0] |= v[0];
460*0957b409SSimon J. Gerraty 	for (k = 1; k < len; k ++) {
461*0957b409SSimon J. Gerraty 		r |= a[k] | b[k];
462*0957b409SSimon J. Gerraty 		u[k] |= v[k];
463*0957b409SSimon J. Gerraty 	}
464*0957b409SSimon J. Gerraty 	return EQ0(r);
465*0957b409SSimon J. Gerraty }
466