xref: /linux/arch/alpha/lib/ev6-divide.S (revision 00fc0e0dda6286407f3854cd71a125f519a5689c)
1/*
2 * arch/alpha/lib/ev6-divide.S
3 *
4 * 21264 version contributed by Rick Gorton <rick.gorton@alpha-processor.com>
5 *
6 * Alpha division..
7 */
8
9/*
10 * The alpha chip doesn't provide hardware division, so we have to do it
11 * by hand.  The compiler expects the functions
12 *
13 *	__divqu: 64-bit unsigned long divide
14 *	__remqu: 64-bit unsigned long remainder
15 *	__divq/__remq: signed 64-bit
16 *	__divlu/__remlu: unsigned 32-bit
17 *	__divl/__reml: signed 32-bit
18 *
19 * These are not normal C functions: instead of the normal
20 * calling sequence, these expect their arguments in registers
21 * $24 and $25, and return the result in $27. Register $28 may
22 * be clobbered (assembly temporary), anything else must be saved.
23 *
24 * In short: painful.
25 *
26 * This is a rather simple bit-at-a-time algorithm: it's very good
27 * at dividing random 64-bit numbers, but the more usual case where
28 * the divisor is small is handled better by the DEC algorithm
29 * using lookup tables. This uses much less memory, though, and is
30 * nicer on the cache.. Besides, I don't know the copyright status
31 * of the DEC code.
32 */
33
34/*
35 * My temporaries:
36 *	$0 - current bit
37 *	$1 - shifted divisor
38 *	$2 - modulus/quotient
39 *
40 *	$23 - return address
41 *	$24 - dividend
42 *	$25 - divisor
43 *
44 *	$27 - quotient/modulus
45 *	$28 - compare status
46 *
47 * Much of the information about 21264 scheduling/coding comes from:
48 *	Compiler Writer's Guide for the Alpha 21264
49 *	abbreviated as 'CWG' in other comments here
50 *	ftp.digital.com/pub/Digital/info/semiconductor/literature/dsc-library.html
51 * Scheduling notation:
52 *	E	- either cluster
53 *	U	- upper subcluster; U0 - subcluster U0; U1 - subcluster U1
54 *	L	- lower subcluster; L0 - subcluster L0; L1 - subcluster L1
55 * Try not to change the actual algorithm if possible for consistency.
56 */
57
58#include <asm/export.h>
/*
 * "halt" expands to a zero longword.  Nothing in the visible part of
 * this file references it.
 */
59#define halt .long 0
60
61/*
62 * Select function type and registers
63 */
/*
 * Working registers.  mask ($0), divisor ($1), tmp1 ($3) and tmp2 ($4)
 * are saved and restored by the unsigned routine ($4 only in DIV
 * builds); compare ($28) is used as scratch and is never restored.
 */
64#define mask	$0
65#define divisor	$1
66#define compare $28
67#define tmp1	$3
68#define tmp2	$4
69
/*
 * The same source is built twice: with DIV defined it yields the
 * __div* entry points, otherwise the __rem* ones.
 *
 *   DIV_ONLY(insn) / MOD_ONLY(insn)
 *		expand to the instruction in the matching build and to
 *		nothing in the other.  They are variadic because the
 *		instruction text itself contains commas.  MOD_ONLY is not
 *		used in the visible source.
 *   modulus / quotient
 *		whichever value is the routine's result is mapped onto $27
 *		(the return register); the other one lives in $2.
 *   GETSIGN(x)
 *		puts a value in x whose sign bit is the sign of the signed
 *		result: $24 ^ $25 for a quotient, a copy of $24 (the
 *		dividend) for a remainder.
 *   STACK	frame size in bytes.  The DIV build stores tmp2 at offset 32
 *		in addition to the four slots at offsets 0-24.
 */
70#ifdef DIV
71#define DIV_ONLY(x,y...) x,##y
72#define MOD_ONLY(x,y...)
73#define func(x) __div##x
74#define modulus $2
75#define quotient $27
76#define GETSIGN(x) xor $24,$25,x
77#define STACK 48
78#else
79#define DIV_ONLY(x,y...)
80#define MOD_ONLY(x,y...) x,##y
81#define func(x) __rem##x
82#define modulus $27
83#define quotient $2
84#define GETSIGN(x) bis $24,$24,x
85#define STACK 32
86#endif
87
88/*
89 * For 32-bit operations, we need to extend to 64-bit
90 */
/*
 * With INTSIZE defined the 32-bit entry points are generated:
 * ufunction is func(lu) and sfunction is func(l), i.e. __divlu/__divl
 * or __remlu/__reml depending on DIV.  Otherwise the 64-bit names
 * func(qu)/func(q) are used (__divqu/__divq or __remqu/__remq).
 *
 *   LONGIFY(x)	zapnot with byte mask 15 keeps the low four bytes of x
 *		and clears the rest (zero-extension of a 32-bit value).
 *   SLONGIFY(x)	addl x,0,x replaces x with its low 32 bits sign-extended.
 *
 * Both expand to nothing in the 64-bit build.
 */
91#ifdef INTSIZE
92#define ufunction func(lu)
93#define sfunction func(l)
94#define LONGIFY(x) zapnot x,15,x
95#define SLONGIFY(x) addl x,0,x
96#else
97#define ufunction func(qu)
98#define sfunction func(q)
99#define LONGIFY(x)
100#define SLONGIFY(x)
101#endif
102
/*
 * ufunction - unsigned divide (DIV build) or unsigned remainder
 * (non-DIV build) of $24 by $25.
 *
 * In:	$24 = dividend, $25 = divisor, $23 = return address.
 * Out:	$27 = quotient (DIV build) or remainder (non-DIV build).
 *
 * $24 and $25 are only read.  $0, $1, $2, tmp1 and (DIV build only)
 * tmp2 are saved in a STACK-byte frame and restored before returning.
 * $28 ("compare") is overwritten and not restored.
 *
 * With INTSIZE defined, the working copies of both operands are
 * zero-extended from 32 bits (LONGIFY) before the algorithm runs.
 *
 * A zero divisor does not trap: the code branches straight to the
 * restore/return path at label 9, so $27 holds its initial value
 * (0 in the DIV build, the LONGIFY'd dividend in the remainder build).
 *
 * Method: shift the divisor and a one-bit mask left until the divisor
 * is no longer below the dividend (loop at label 1), then walk both
 * back to the right, subtracting the divisor from the running modulus
 * wherever it fits and, in the DIV build, accumulating the
 * corresponding mask bit into the quotient (loop at label 2).
 */
103.set noat
104.align	4
105.globl	ufunction
106.ent	ufunction
107ufunction:
108	subq	$30,STACK,$30		# E :
109	.frame	$30,STACK,$23
110	.prologue 0
111
/*
 * Label 7 is a second entry point: the signed routine branches here
 * (bge $28,7b) after it has already allocated the same STACK-byte
 * frame, when neither operand is negative.
 */
1127:	stq	$1, 0($30)		# L :
113	bis	$25,$25,divisor		# E :
114	stq	$2, 8($30)		# L : L U L U
115
116	bis	$24,$24,modulus		# E :
117	stq	$0,16($30)		# L :
118	bis	$31,$31,quotient	# E :
119	LONGIFY(divisor)		# E : U L L U
120
121	stq	tmp1,24($30)		# L :
122	LONGIFY(modulus)		# E :
123	bis	$31,1,mask		# E :
124	DIV_ONLY(stq tmp2,32($30))	# L : L U U L
125
126	beq	divisor, 9f			/* div by zero */
127	/*
128	 * In spite of the DIV_ONLY being either a non-instruction
129	 * or an actual stq, the addition of the .align directive
130	 * below ensures that label 1 is going to be nicely aligned
131	 */
132
133	.align	4
134#ifdef INTSIZE
135	/*
136	 * shift divisor left, using 3-bit shifts for
137	 * 32-bit divides as we can't overflow. Three-bit
138	 * shifts will result in looping three times less
139	 * here, but can result in two loops more later.
140	 * Thus using a large shift isn't worth it (and
141	 * s8add pairs better than a sll..)
142	 */
/*
 * The comparison is taken before the shift and the branch after it,
 * so divisor and mask are always multiplied by 8 at least once.
 */
1431:	cmpult	divisor,modulus,compare	# E :
144	s8addq	divisor,$31,divisor	# E :
145	s8addq	mask,$31,mask		# E :
146	bne	compare,1b		# U : U L U L
147#else
/*
 * 64-bit variant: shift by one bit per iteration.  The blt leaves the
 * loop before shifting when bit 63 of the divisor is already set, so
 * the doubling below cannot shift a set bit out of the register.
 */
1481:	cmpult	divisor,modulus,compare	# E :
149	nop				# E :
150	nop				# E :
151	blt     divisor, 2f		# U : U L U L
152
153	addq	divisor,divisor,divisor	# E :
154	addq	mask,mask,mask		# E :
155	unop				# E :
156	bne	compare,1b		# U : U L U L
157#endif
158
159	/* ok, start to go right again.. */
/*
 * One pass per mask position, high to low:
 *   tmp2    = quotient + mask		(DIV build only)
 *   compare = (divisor <= modulus), unsigned
 *   tmp1    = modulus - divisor
 * then, if compare is set, commit tmp1 to modulus (and tmp2 to
 * quotient).  mask and divisor are shifted right by one each pass;
 * the loop ends once the shifted mask reaches zero.
 */
1602:
161	/*
162	 * Keep things nicely bundled... use a nop instead of not
163	 * having an instruction for DIV_ONLY
164	 */
165#ifdef DIV
166	DIV_ONLY(addq quotient,mask,tmp2) # E :
167#else
168	nop				# E :
169#endif
170	srl	mask,1,mask		# U :
171	cmpule	divisor,modulus,compare	# E :
172	subq	modulus,divisor,tmp1	# E :
173
174#ifdef DIV
175	DIV_ONLY(cmovne compare,tmp2,quotient)	# E : Latency 2, extra map slot
176	nop				# E : as part of the cmovne
177	srl	divisor,1,divisor	# U :
178	nop				# E : L U L U
179
180	nop				# E :
181	cmovne	compare,tmp1,modulus	# E : Latency 2, extra map slot
182	nop				# E : as part of the cmovne
183	bne	mask,2b			# U : U L U L
184#else
185	srl	divisor,1,divisor	# U :
186	cmovne	compare,tmp1,modulus	# E : Latency 2, extra map slot
187	nop				# E : as part of the cmovne
188	bne	mask,2b			# U : U L L U
189#endif
190
/*
 * Common exit (also the divide-by-zero target): reload the saved
 * registers, release the frame and return through $23.
 */
1919:	ldq	$1, 0($30)		# L :
192	ldq	$2, 8($30)		# L :
193	nop				# E :
194	nop				# E : U U L L
195
196	ldq	$0,16($30)		# L :
197	ldq	tmp1,24($30)		# L :
198	nop				# E :
199	nop				# E :
200
201#ifdef DIV
202	DIV_ONLY(ldq tmp2,32($30))	# L :
203#else
204	nop				# E :
205#endif
206	addq	$30,STACK,$30		# E :
207	ret	$31,($23),1		# L0 : L U U L
208	.end	ufunction
209EXPORT_SYMBOL(ufunction)
210
211/*
212 * Uhh.. Ugly signed division. I'd rather not have it at all, but
213 * it's needed in some circumstances. There are different ways to
214 * handle this, really. This does:
215 * 	-a / b = a / -b = -(a / b)
216 *	-a % b = -(a % b)
217 *	a % -b = a % b
218 * which is probably not the best solution, but at least should
219 * have the property that (x/y)*y + (x%y) = x.
220 */
/*
 * sfunction - signed divide (DIV build) or signed remainder (non-DIV
 * build) of $24 by $25, implemented on top of ufunction.
 *
 * In:	$24 = dividend, $25 = divisor, $23 = return address.
 * Out:	$27 = quotient (DIV build) or remainder (non-DIV build).
 *
 * $28 is overwritten.  On the slow path $24, $25, $23 and tmp1 are
 * saved in this routine's STACK-byte frame and reloaded before
 * returning.
 *
 * Fast path: when the sign bit of ($24 | $25) is clear (after SLONGIFY
 * in the 32-bit build), control transfers to label 7 inside ufunction,
 * which reuses the frame allocated here and returns directly to the
 * caller.
 *
 * Slow path: both operands are replaced by their negation if negative,
 * ufunction is called with bsr (which overwrites $23, hence the save),
 * and the unsigned result in $27 is negated when GETSIGN yields a
 * negative value.
 */
221.align 4
222.globl	sfunction
223.ent	sfunction
224sfunction:
225	subq	$30,STACK,$30		# E :
226	.frame	$30,STACK,$23
227	.prologue 0
228	bis	$24,$25,$28		# E :
229	SLONGIFY($28)			# E :
230	bge	$28,7b			# U :
231
/* At least one operand is negative: keep the originals for GETSIGN. */
232	stq	$24,0($30)		# L :
233	subq	$31,$24,$28		# E :
234	stq	$25,8($30)		# L :
235	nop				# E : U L U L
236
237	cmovlt	$24,$28,$24	/* abs($24) */ # E : Latency 2, extra map slot
238	nop				# E : as part of the cmov
239	stq	$23,16($30)		# L :
240	subq	$31,$25,$28		# E : U L U L
241
242	stq	tmp1,24($30)		# L :
243	cmovlt	$25,$28,$25	/* abs($25) */ # E : Latency 2, extra map slot
244	nop				# E :
245	bsr	$23,ufunction		# L0: L U L U
246
/*
 * Back from the unsigned routine: reload the original operands,
 * compute the result's sign into $28 and the negated result into
 * tmp1, then select the negated value if the sign is negative.
 */
247	ldq	$24,0($30)		# L :
248	ldq	$25,8($30)		# L :
249	GETSIGN($28)			# E :
250	subq	$31,$27,tmp1		# E : U U L L
251
252	SLONGIFY($28)			# E :
253	ldq	$23,16($30)		# L :
254	cmovlt	$28,tmp1,$27		# E : Latency 2, extra map slot
255	nop				# E : U L L U : as part of the cmov
256
257	ldq	tmp1,24($30)		# L :
258	nop				# E : as part of the cmov
259	addq	$30,STACK,$30		# E :
260	ret	$31,($23),1		# L0 : L U U L
261	.end	sfunction
262EXPORT_SYMBOL(sfunction)
263