xref: /titanic_51/usr/src/cmd/sgs/rtld.4.x/umultiply.s (revision 1e49577a7fcde812700ded04431b49d67cc57d6d)
1/*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License, Version 1.0 only
6 * (the "License").  You may not use this file except in compliance
7 * with the License.
8 *
9 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10 * or http://www.opensolaris.org/os/licensing.
11 * See the License for the specific language governing permissions
12 * and limitations under the License.
13 *
14 * When distributing Covered Code, include this CDDL HEADER in each
15 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16 * If applicable, add the following below this CDDL HEADER, with the
17 * fields enclosed by brackets "[]" replaced with your own identifying
18 * information: Portions Copyright [yyyy] [name of copyright owner]
19 *
20 * CDDL HEADER END
21 */
22/*
23 *	.seg	"data"
24 *	.asciz	"Copyr 1987 Sun Micro"
25 *	.align	4
26 */
27	.seg	"text"
28
29#ident	"%Z%%M%	%I%	%E% SMI"
30
31!	Copyright (c) 1987 by Sun Microsystems, Inc.
32
33
34#include <sys/asm_linkage.h>
35
36/*
37 * procedure to perform a 32 by 32 unsigned integer multiply.
38 * pass the multiplier into %o0, and the multiplicand into %o1
39 * the least significant 32 bits of the result will be returned in %o0,
40 * and the most significant in %o1
41 *
42 * Most unsigned integer multiplies involve small numbers, so it is
43 * worthwhile to optimize for short multiplies at the expense of long
44 * multiplies.  This code checks the size of the multiplier, and has
45 * special cases for the following:
46 *
47 *	4 or fewer bit multipliers:	19 or 21 instruction cycles
48 *	8 or fewer bit multipliers:	26 or 28 instruction cycles
49 *	12 or fewer bit multipliers:	34 or 36 instruction cycles
50 *	16 or fewer bit multipliers:	42 or 44 instruction cycles
51 *
52 * Long multipliers require 58 or 60 instruction cycles.
53 *
54 * This code indicates that overflow has occurred by leaving the Z condition
55 * code clear. The following call sequence would be used if you wish to
56 * deal with overflow:
57 *
58 *	 	call	.umul
59 *		nop		( or set up last parameter here )
60 *		bnz	overflow_code	(or tnz to overflow handler)
61 */
62
63!	RTENTRY(.umul)
64	.global	.umul
65.umul:
66	wr	%o0, %y			! multiplier to Y register
67
68	andncc	%o0, 0xf, %o4		! mask out lower 4 bits; if branch
69					! taken, %o4, N and V have been cleared
70
71	be	umul_4bit		! 4-bit multiplier
72	sethi	%hi(0xffff0000), %o5	! mask for 16-bit case; have to
73					! wait 3 instructions after wr
74					! before %y has stabilized anyway
75
76	andncc	%o0, 0xff, %o4		! any multiplier bits above the low 8?
77	be,a	umul_8bit		! 8-bit multiplier
78	mulscc	%o4, %o1, %o4		! first iteration of 9
79
80	andncc	%o0, 0xfff, %o4		! any multiplier bits above the low 12?
81	be,a	umul_12bit		! 12-bit multiplier
82	mulscc	%o4, %o1, %o4		! first iteration of 13
83
84	andcc	%o0, %o5, %o4		! any multiplier bits in the upper 16?
85	be,a	umul_16bit		! 16-bit multiplier
86	mulscc	%o4, %o1, %o4		! first iteration of 17
87
88	andcc	%g0, %g0, %o4		! zero the partial product
89					! and clear N and V conditions
90	!
91	! long multiply
92	!
93	mulscc	%o4, %o1, %o4		! first iteration of 33
94	mulscc	%o4, %o1, %o4
95	mulscc	%o4, %o1, %o4
96	mulscc	%o4, %o1, %o4
97	mulscc	%o4, %o1, %o4
98	mulscc	%o4, %o1, %o4
99	mulscc	%o4, %o1, %o4
100	mulscc	%o4, %o1, %o4
101	mulscc	%o4, %o1, %o4
102	mulscc	%o4, %o1, %o4
103	mulscc	%o4, %o1, %o4
104	mulscc	%o4, %o1, %o4
105	mulscc	%o4, %o1, %o4
106	mulscc	%o4, %o1, %o4
107	mulscc	%o4, %o1, %o4
108	mulscc	%o4, %o1, %o4
109	mulscc	%o4, %o1, %o4
110	mulscc	%o4, %o1, %o4
111	mulscc	%o4, %o1, %o4
112	mulscc	%o4, %o1, %o4
113	mulscc	%o4, %o1, %o4
114	mulscc	%o4, %o1, %o4
115	mulscc	%o4, %o1, %o4
116	mulscc	%o4, %o1, %o4
117	mulscc	%o4, %o1, %o4
118	mulscc	%o4, %o1, %o4
119	mulscc	%o4, %o1, %o4
120	mulscc	%o4, %o1, %o4
121	mulscc	%o4, %o1, %o4
122	mulscc	%o4, %o1, %o4
123	mulscc	%o4, %o1, %o4
124	mulscc	%o4, %o1, %o4		! 32nd iteration
125	mulscc	%o4, %g0, %o4		! last iteration only shifts
126	!
127	! For unsigned multiplies, a pure shift-and-add approach yields the
128	! correct result.  Signed multiplies introduce complications.
129	!
130	! With 32-bit twos-complement numbers, -x can be represented as
131	!
132	!	((2 - (x/(2**32))) mod 2) * 2**32.
133	!
134	! To simplify the equations, the radix point can be moved to just
135	! to the left of the sign bit.  So:
136	!
137	! 	 x *  y	= (xy) mod 2
138	!	-x *  y	= (2 - x) mod 2 * y = (2y - xy) mod 2
139	!	 x * -y	= x * (2 - y) mod 2 = (2x - xy) mod 2
140	!	-x * -y = (2 - x) * (2 - y) = (4 - 2x - 2y + xy) mod 2
141	!
142	! Because of the way the shift into the partial product is calculated
143	! (N xor V), the extra term is automagically removed for negative
144	! multiplicands, so no adjustment is necessary.
145	!
146	! But for unsigned multiplies, the high-order bit of the multiplicand
147	! is incorrectly treated as a sign bit.  For unsigned multiplies where
148	! the high-order bit of the multiplicand is one, the result is
149	!
150	!	xy - y * (2**32)
151	!
152	! we fix that here
153	!
154	tst	%o1			! is the multiplicand's high bit set?
155	bge	1f			! no: the product needs no correction
156	nop
157
158	add	%o4, %o0, %o4		! add (2**32) * %o0; bits 63-32
159					! of the product are in %o4
160	!
161	! The multiply hasn't overflowed if the high-order bits are 0
162	!
163	! if you are not interested in detecting overflow,
164	! replace the following code with:
165	!
166	!	1:
167	!		rd	%y, %o0
168	!		retl
169	!		mov	%o4, %o1
170	!
1711:
172	rd	%y, %o0			! low-order 32 bits of the product
173	retl				! leaf routine return
174	addcc	%o4, %g0, %o1		! return high-order bits and set Z if
175					! high order bits are 0
176	!
177	! 4-bit multiply (entered with %o4 = 0 and the N and V conditions clear)
178	!
179umul_4bit:
180	mulscc	%o4, %o1, %o4		! first iteration of 5
181	mulscc	%o4, %o1, %o4
182	mulscc	%o4, %o1, %o4
183	mulscc	%o4, %o1, %o4		! 4th iteration
184	mulscc	%o4, %g0, %o4		! last iteration only shifts
185
186	rd	%y, %o5			! top 4 bits of %y are the low product bits
187	!
188	! The following code adds (2**32) * %o0 to the product if the
189	! multiplicand had its high bit set (see 32-bit case for explanation)
190	!
191	tst	%o1			! is the multiplicand's high bit set?
192	bge	2f			! no: the product needs no correction
193	sra	%o4, 28, %o1		! right shift high bits by 28 bits
194
195	add	%o1, %o0, %o1		! yes: add the multiplier to the high word
196	!
197	! The multiply hasn't overflowed if high-order bits are 0
198	!
199	! if you are not interested in detecting overflow,
200	! replace the following code with:
201	!
202	!	2:
203	!		sll	%o4, 4, %o0
204	!		srl	%o5, 28, %o5
205	!		retl
206	!		or	%o5, %o0, %o0
207	!
2082:
209	sll	%o4, 4, %o0		! left shift middle bits by 4 bits
210	srl	%o5, 28, %o5		! right shift low bits by 28 bits
211	or	%o5, %o0, %o0		! merge for true product
212	retl				! leaf routine return
213	tst	%o1			! set Z if high order bits are 0
214	!
215	! 8-bit multiply (first iteration already done in the branch delay slot)
216	!
217umul_8bit:
218	mulscc	%o4, %o1, %o4		! second iteration of 9
219	mulscc	%o4, %o1, %o4
220	mulscc	%o4, %o1, %o4
221	mulscc	%o4, %o1, %o4
222	mulscc	%o4, %o1, %o4
223	mulscc	%o4, %o1, %o4
224	mulscc	%o4, %o1, %o4		! 8th iteration
225	mulscc	%o4, %g0, %o4		! last iteration only shifts
226
227	rd	%y, %o5			! top 8 bits of %y are the low product bits
228	!
229	! The following code adds (2**32) * %o0 to the product if the
230	! multiplicand had its high bit set (see 32-bit case for explanation)
231	!
232	tst	%o1			! is the multiplicand's high bit set?
233	bge	3f			! no: the product needs no correction
234	sra	%o4, 24, %o1		! right shift high bits by 24 bits
235
236	add	%o1, %o0, %o1		! yes: add the multiplier to the high word
237	!
238	! The multiply hasn't overflowed if high-order bits are 0
239	!
240	! if you are not interested in detecting overflow,
241	! replace the following code with:
242	!
243	!	3:
244	!		sll	%o4, 8, %o0
245	!		srl	%o5, 24, %o5
246	!		retl
247	!		or	%o5, %o0, %o0
248	!
2493:
250	sll	%o4, 8, %o0		! left shift middle bits by 8 bits
251	srl	%o5, 24, %o5		! right shift low bits by 24 bits
252	or	%o5, %o0, %o0		! merge for true product
253	retl				! leaf routine return
254	tst	%o1			! set Z if high order bits are 0
255	!
256	! 12-bit multiply (first iteration already done in the branch delay slot)
257	!
258umul_12bit:
259	mulscc	%o4, %o1, %o4		! second iteration of 13
260	mulscc	%o4, %o1, %o4
261	mulscc	%o4, %o1, %o4
262	mulscc	%o4, %o1, %o4
263	mulscc	%o4, %o1, %o4
264	mulscc	%o4, %o1, %o4
265	mulscc	%o4, %o1, %o4
266	mulscc	%o4, %o1, %o4
267	mulscc	%o4, %o1, %o4
268	mulscc	%o4, %o1, %o4
269	mulscc	%o4, %o1, %o4		! 12th iteration
270	mulscc	%o4, %g0, %o4		! last iteration only shifts
271
272	rd	%y, %o5			! top 12 bits of %y are the low product bits
273	!
274	! The following code adds (2**32) * %o0 to the product if the
275	! multiplicand had its high bit set (see 32-bit case for explanation)
276	!
277	tst	%o1			! is the multiplicand's high bit set?
278	bge	4f			! no: the product needs no correction
279	sra	%o4, 20, %o1		! right shift high bits by 20 bits
280
281	add	%o1, %o0, %o1		! yes: add the multiplier to the high word
282	!
283	! The multiply hasn't overflowed if high-order bits are 0
284	!
285	! if you are not interested in detecting overflow,
286	! replace the following code with:
287	!
288	!	4:
289	!		sll	%o4, 12, %o0
290	!		srl	%o5, 20, %o5
291	!		retl
292	!		or	%o5, %o0, %o0
293	!
2944:
295	sll	%o4, 12, %o0		! left shift middle bits by 12 bits
296	srl	%o5, 20, %o5		! right shift low bits by 20 bits
297	or	%o5, %o0, %o0		! merge for true product
298	retl				! leaf routine return
299	tst	%o1			! set Z if high order bits are 0
300	!
301	! 16-bit multiply (first iteration already done in the branch delay slot)
302	!
303umul_16bit:
304	mulscc	%o4, %o1, %o4		! second iteration of 17
305	mulscc	%o4, %o1, %o4
306	mulscc	%o4, %o1, %o4
307	mulscc	%o4, %o1, %o4
308	mulscc	%o4, %o1, %o4
309	mulscc	%o4, %o1, %o4
310	mulscc	%o4, %o1, %o4
311	mulscc	%o4, %o1, %o4
312	mulscc	%o4, %o1, %o4
313	mulscc	%o4, %o1, %o4
314	mulscc	%o4, %o1, %o4
315	mulscc	%o4, %o1, %o4
316	mulscc	%o4, %o1, %o4
317	mulscc	%o4, %o1, %o4
318	mulscc	%o4, %o1, %o4		! 16th iteration
319	mulscc	%o4, %g0, %o4		! last iteration only shifts
320
321	rd	%y, %o5			! top 16 bits of %y are the low product bits
322	!
323	! The following code adds (2**32) * %o0 to the product if the
324	! multiplicand had its high bit set (see 32-bit case for explanation)
325	!
326	tst	%o1			! is the multiplicand's high bit set?
327	bge	5f			! no: the product needs no correction
328	sra	%o4, 16, %o1		! right shift high bits by 16 bits
329
330	add	%o1, %o0, %o1		! yes: add the multiplier to the high word
331	!
332	! The multiply hasn't overflowed if high-order bits are 0
333	!
334	! if you are not interested in detecting overflow,
335	! replace the following code with:
336	!
337	!	5:
338	!		sll	%o4, 16, %o0
339	!		srl	%o5, 16, %o5
340	!		retl
341	!		or	%o5, %o0, %o0
342	!
3435:
344	sll	%o4, 16, %o0		! left shift middle bits by 16 bits
345	srl	%o5, 16, %o5		! right shift low bits by 16 bits
346	or	%o5, %o0, %o0		! merge for true product
347	retl				! leaf routine return
348	tst	%o1			! set Z if high order bits are 0
349