xref: /freebsd/contrib/llvm-project/compiler-rt/lib/builtins/hexagon/dfmul.S (revision b4af4f93c682e445bf159f0d1ec90b636296c946)
1//===----------------------Hexagon builtin routine ------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9// Double Precision Multiply
10#define A r1:0
11#define AH r1
12#define AL r0
13#define B r3:2
14#define BH r3
15#define BL r2
16
17#define BTMP r5:4
18#define BTMPH r5
19#define BTMPL r4
20
21#define PP_ODD r7:6
22#define PP_ODD_H r7
23#define PP_ODD_L r6
24
25#define ONE r9:8
26#define S_ONE r8
27#define S_ZERO r9
28
29#define PP_HH r11:10
30#define PP_HH_H r11
31#define PP_HH_L r10
32
33#define ATMP r13:12
34#define ATMPH r13
35#define ATMPL r12
36
37#define PP_LL r15:14
38#define PP_LL_H r15
39#define PP_LL_L r14
40
41#define TMP r28
42
43#define MANTBITS 52
44#define HI_MANTBITS 20
45#define EXPBITS 11
46#define BIAS 1024
47#define MANTISSA_TO_INT_BIAS 52
48
49// Some constant to adjust normalization amount in error code
50// Amount to right shift the partial product to get to a denorm
51#define FUDGE 5
52
53#define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG
54#define FAST_ALIAS(TAG) .global __hexagon_fast_##TAG ; .set __hexagon_fast_##TAG, __hexagon_##TAG
55#define FAST2_ALIAS(TAG) .global __hexagon_fast2_##TAG ; .set __hexagon_fast2_##TAG, __hexagon_##TAG
56#define END(TAG) .size TAG,.-TAG
57
58#define SR_ROUND_OFF 22
59	.text
60	.global __hexagon_muldf3
61	.type __hexagon_muldf3,@function
62	Q6_ALIAS(muldf3)
63  FAST_ALIAS(muldf3)
64  FAST2_ALIAS(muldf3)
65	.p2align 5
66__hexagon_muldf3:
67	{
68		p0 = dfclass(A,#2)
69		p0 = dfclass(B,#2)
70		ATMP = combine(##0x40000000,#0)
71	}
72	{
73		ATMP = insert(A,#MANTBITS,#EXPBITS-1)
74		BTMP = asl(B,#EXPBITS-1)
75		TMP = #-BIAS
76		ONE = #1
77	}
78	{
79		PP_ODD = mpyu(BTMPL,ATMPH)
80		BTMP = insert(ONE,#2,#62)
81	}
82	// since we know that the MSB of the H registers is zero, we should never carry
83	// H <= 2^31-1.  L <= 2^32-1.  Therefore, HL <= 2^63-2^32-2^31+1
84	// Adding 2 HLs, we get 2^64-3*2^32+2 maximum.
85	// Therefore, we can add 3 2^32-1 values safely without carry.  We only need one.
86	{
87		PP_LL = mpyu(ATMPL,BTMPL)
88		PP_ODD += mpyu(ATMPL,BTMPH)
89	}
90	{
91		PP_ODD += lsr(PP_LL,#32)
92		PP_HH = mpyu(ATMPH,BTMPH)
93		BTMP = combine(##BIAS+BIAS-4,#0)
94	}
95	{
96		PP_HH += lsr(PP_ODD,#32)
97		if (!p0) jump .Lmul_abnormal
98		p1 = cmp.eq(PP_LL_L,#0)		// 64 lsb's 0?
99		p1 = cmp.eq(PP_ODD_L,#0)	// 64 lsb's 0?
100	}
101
102	// PP_HH can have a maximum of 0x3FFF_FFFF_FFFF_FFFF or thereabouts
103	// PP_HH can have a minimum of 0x1000_0000_0000_0000 or so
104
105#undef PP_ODD
106#undef PP_ODD_H
107#undef PP_ODD_L
108#define EXP10 r7:6
109#define EXP1 r7
110#define EXP0 r6
111	{
112		if (!p1) PP_HH_L = or(PP_HH_L,S_ONE)
113		EXP0 = extractu(AH,#EXPBITS,#HI_MANTBITS)
114		EXP1 = extractu(BH,#EXPBITS,#HI_MANTBITS)
115	}
116	{
117		PP_LL = neg(PP_HH)
118		EXP0 += add(TMP,EXP1)
119		TMP = xor(AH,BH)
120	}
121	{
122		if (!p2.new) PP_HH = PP_LL
123		p2 = cmp.gt(TMP,#-1)
124		p0 = !cmp.gt(EXP0,BTMPH)
125		p0 = cmp.gt(EXP0,BTMPL)
126		if (!p0.new) jump:nt .Lmul_ovf_unf
127	}
128	{
129		A = convert_d2df(PP_HH)
130		EXP0 = add(EXP0,#-BIAS-58)
131	}
132	{
133		AH += asl(EXP0,#HI_MANTBITS)
134		jumpr r31
135	}
136
137	.falign
138.Lpossible_unf:
139	// We end up with a positive exponent
140	// But we may have rounded up to an exponent of 1.
141	// If the exponent is 1, if we rounded up to it
142	// we need to also raise underflow
143	// Fortunately, this is pretty easy to detect, we must have +/- 0x0010_0000_0000_0000
144	// And the PP should also have more than one bit set
145	//
146	// Note: ATMP should have abs(PP_HH)
147	// Note: BTMPL should have 0x7FEFFFFF
148	{
149		p0 = cmp.eq(AL,#0)
150		p0 = bitsclr(AH,BTMPL)
151		if (!p0.new) jumpr:t r31
152		BTMPH = #0x7fff
153	}
154	{
155		p0 = bitsset(ATMPH,BTMPH)
156		BTMPL = USR
157		BTMPH = #0x030
158	}
159	{
160		if (p0) BTMPL = or(BTMPL,BTMPH)
161	}
162	{
163		USR = BTMPL
164	}
165	{
166		p0 = dfcmp.eq(A,A)
167		jumpr r31
168	}
169	.falign
170.Lmul_ovf_unf:
171	{
172		A = convert_d2df(PP_HH)
173		ATMP = abs(PP_HH)			// take absolute value
174		EXP1 = add(EXP0,#-BIAS-58)
175	}
176	{
177		AH += asl(EXP1,#HI_MANTBITS)
178		EXP1 = extractu(AH,#EXPBITS,#HI_MANTBITS)
179		BTMPL = ##0x7FEFFFFF
180	}
181	{
182		EXP1 += add(EXP0,##-BIAS-58)
183		//BTMPH = add(clb(ATMP),#-2)
184		BTMPH = #0
185	}
186	{
187		p0 = cmp.gt(EXP1,##BIAS+BIAS-2)	// overflow
188		if (p0.new) jump:nt .Lmul_ovf
189	}
190	{
191		p0 = cmp.gt(EXP1,#0)
192		if (p0.new) jump:nt .Lpossible_unf
193		BTMPH = sub(EXP0,BTMPH)
194		TMP = #63				// max amount to shift
195	}
196	// Underflow
197	//
198	// PP_HH has the partial product with sticky LSB.
199	// PP_HH can have a maximum of 0x3FFF_FFFF_FFFF_FFFF or thereabouts
200	// PP_HH can have a minimum of 0x1000_0000_0000_0000 or so
201	// The exponent of PP_HH is in  EXP1, which is non-positive (0 or negative)
202	// That's the exponent that happens after the normalization
203	//
204	// EXP0 has the exponent that, when added to the normalized value, is out of range.
205	//
206	// Strategy:
207	//
208	// * Shift down bits, with sticky bit, such that the bits are aligned according
209	//   to the LZ count and appropriate exponent, but not all the way to mantissa
210	//   field, keep around the last few bits.
211	// * Put a 1 near the MSB
212	// * Check the LSBs for inexact; if inexact also set underflow
213	// * Convert [u]d2df -- will correctly round according to rounding mode
214	// * Replace exponent field with zero
215
216	{
217		BTMPL = #0	 			// offset for extract
218		BTMPH = sub(#FUDGE,BTMPH)		// amount to right shift
219	}
220	{
221		p3 = cmp.gt(PP_HH_H,#-1)		// is it positive?
222		BTMPH = min(BTMPH,TMP)			// Don't shift more than 63
223		PP_HH = ATMP
224	}
225	{
226		TMP = USR
227		PP_LL = extractu(PP_HH,BTMP)
228	}
229	{
230		PP_HH = asr(PP_HH,BTMPH)
231		BTMPL = #0x0030					// underflow flag
232		AH = insert(S_ZERO,#EXPBITS,#HI_MANTBITS)
233	}
234	{
235		p0 = cmp.gtu(ONE,PP_LL)				// Did we extract all zeros?
236		if (!p0.new) PP_HH_L = or(PP_HH_L,S_ONE)	// add sticky bit
237		PP_HH_H = setbit(PP_HH_H,#HI_MANTBITS+3)	// Add back in a bit so we can use convert instruction
238	}
239	{
240		PP_LL = neg(PP_HH)
241		p1 = bitsclr(PP_HH_L,#0x7)		// Are the LSB's clear?
242		if (!p1.new) TMP = or(BTMPL,TMP)	// If not, Inexact+Underflow
243	}
244	{
245		if (!p3) PP_HH = PP_LL
246		USR = TMP
247	}
248	{
249		A = convert_d2df(PP_HH)			// Do rounding
250		p0 = dfcmp.eq(A,A)			// realize exception
251	}
252	{
253		AH = insert(S_ZERO,#EXPBITS-1,#HI_MANTBITS+1)		// Insert correct exponent
254		jumpr r31
255	}
256	.falign
257.Lmul_ovf:
258	// We get either max finite value or infinity.  Either way, overflow+inexact
259	{
260		TMP = USR
261		ATMP = combine(##0x7fefffff,#-1)	// positive max finite
262		A = PP_HH
263	}
264	{
265		PP_LL_L = extractu(TMP,#2,#SR_ROUND_OFF)	// rounding bits
266		TMP = or(TMP,#0x28)			// inexact + overflow
267		BTMP = combine(##0x7ff00000,#0)		// positive infinity
268	}
269	{
270		USR = TMP
271		PP_LL_L ^= lsr(AH,#31)			// Does sign match rounding?
272		TMP = PP_LL_L				// unmodified rounding mode
273	}
274	{
275		p0 = !cmp.eq(TMP,#1)			// If not round-to-zero and
276		p0 = !cmp.eq(PP_LL_L,#2)		// Not rounding the other way,
277		if (p0.new) ATMP = BTMP			// we should get infinity
278		p0 = dfcmp.eq(A,A)			// Realize FP exception if enabled
279	}
280	{
281		A = insert(ATMP,#63,#0)			// insert inf/maxfinite, leave sign
282		jumpr r31
283	}
284
285.Lmul_abnormal:
286	{
287		ATMP = extractu(A,#63,#0)		// strip off sign
288		BTMP = extractu(B,#63,#0)		// strip off sign
289	}
290	{
291		p3 = cmp.gtu(ATMP,BTMP)
292		if (!p3.new) A = B			// sort values
293		if (!p3.new) B = A			// sort values
294	}
295	{
296		// Any NaN --> NaN, possibly raise invalid if sNaN
297		p0 = dfclass(A,#0x0f)		// A not NaN?
298		if (!p0.new) jump:nt .Linvalid_nan
299		if (!p3) ATMP = BTMP
300		if (!p3) BTMP = ATMP
301	}
302	{
303		// Infinity * nonzero number is infinity
304		p1 = dfclass(A,#0x08)		// A is infinity
305		p1 = dfclass(B,#0x0e)		// B is nonzero
306	}
307	{
308		// Infinity * zero --> NaN, raise invalid
309		// Other zeros return zero
310		p0 = dfclass(A,#0x08)		// A is infinity
311		p0 = dfclass(B,#0x01)		// B is zero
312	}
313	{
314		if (p1) jump .Ltrue_inf
315		p2 = dfclass(B,#0x01)
316	}
317	{
318		if (p0) jump .Linvalid_zeroinf
319		if (p2) jump .Ltrue_zero		// so return zero
320		TMP = ##0x7c000000
321	}
322	// We are left with a normal or subnormal times a subnormal. A > B
323	// If A and B are both very small (exp(a) < BIAS-MANTBITS),
324	// we go to a single sticky bit, which we can round easily.
325	// If A and B might multiply to something bigger, decrease A exponent and increase
326	// B exponent and try again
327	{
328		p0 = bitsclr(AH,TMP)
329		if (p0.new) jump:nt .Lmul_tiny
330	}
331	{
332		TMP = cl0(BTMP)
333	}
334	{
335		TMP = add(TMP,#-EXPBITS)
336	}
337	{
338		BTMP = asl(BTMP,TMP)
339	}
340	{
341		B = insert(BTMP,#63,#0)
342		AH -= asl(TMP,#HI_MANTBITS)
343	}
344	jump __hexagon_muldf3
345.Lmul_tiny:
346	{
347		TMP = USR
348		A = xor(A,B)				// get sign bit
349	}
350	{
351		TMP = or(TMP,#0x30)			// Inexact + Underflow
352		A = insert(ONE,#63,#0)			// put in rounded up value
353		BTMPH = extractu(TMP,#2,#SR_ROUND_OFF)	// get rounding mode
354	}
355	{
356		USR = TMP
357		p0 = cmp.gt(BTMPH,#1)			// Round towards pos/neg inf?
358		if (!p0.new) AL = #0			// If not, zero
359		BTMPH ^= lsr(AH,#31)			// rounding my way --> set LSB
360	}
361	{
362		p0 = cmp.eq(BTMPH,#3)			// if rounding towards right inf
363		if (!p0.new) AL = #0			// don't go to zero
364		jumpr r31
365	}
366.Linvalid_zeroinf:
367	{
368		TMP = USR
369	}
370	{
371		A = #-1
372		TMP = or(TMP,#2)
373	}
374	{
375		USR = TMP
376	}
377	{
378		p0 = dfcmp.uo(A,A)			// force exception if enabled
379		jumpr r31
380	}
381.Linvalid_nan:
382	{
383		p0 = dfclass(B,#0x0f)			// if B is not NaN
384		TMP = convert_df2sf(A)			// will generate invalid if sNaN
385		if (p0.new) B = A 			// make it whatever A is
386	}
387	{
388		BL = convert_df2sf(B)			// will generate invalid if sNaN
389		A = #-1
390		jumpr r31
391	}
392	.falign
393.Ltrue_zero:
394	{
395		A = B
396		B = A
397	}
398.Ltrue_inf:
399	{
400		BH = extract(BH,#1,#31)
401	}
402	{
403		AH ^= asl(BH,#31)
404		jumpr r31
405	}
406END(__hexagon_muldf3)
407
408#undef ATMP
409#undef ATMPL
410#undef ATMPH
411#undef BTMP
412#undef BTMPL
413#undef BTMPH
414