xref: /freebsd/contrib/llvm-project/compiler-rt/lib/builtins/hexagon/dffma.S (revision e64fe029e9d3ce476e77a478318e0c3cd201ff08)
1//===----------------------Hexagon builtin routine ------------------------===//
2//
3// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4// See https://llvm.org/LICENSE.txt for license information.
5// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6//
7//===----------------------------------------------------------------------===//
8
9#define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG
10#define END(TAG) .size TAG,.-TAG
11
12// Double Precision Fused Multiply-Add
13
14
// Operand register pairs.  A, B and C are the three double-precision inputs
// that the code below classifies and unpacks; the H/L suffixes name the high
// and low 32-bit halves of each pair.  The result is produced in A.
15#define A r1:0
16#define AH r1
17#define AL r0
18#define B r3:2
19#define BH r3
20#define BL r2
21#define C r5:4
22#define CH r5
23#define CL r4
24
25
26
// Scratch pairs that receive the unpacked mantissas of B, A and C.
// Several of these names are #undef'd and re-#define'd onto other registers
// further down, so a name's meaning depends on its position in the file.
27#define BTMP r15:14
28#define BTMPH r15
29#define BTMPL r14
30
31#define ATMP r13:12
32#define ATMPH r13
33#define ATMPL r12
34
35#define CTMP r11:10
36#define CTMPH r11
37#define CTMPL r10
38
// Partial products of the mantissa multiply: low x low (PP_LL), the
// accumulated cross terms (PP_ODD) and high x high (PP_HH).
39#define PP_LL r9:8
40#define PP_LL_H r9
41#define PP_LL_L r8
42
43#define PP_ODD r7:6
44#define PP_ODD_H r7
45#define PP_ODD_L r6
46
47
// r17:16 is stored to the stack frame before its first use and reloaded
// before every dealloc_return on the main path.
48#define PP_HH r17:16
49#define PP_HH_H r17
50#define PP_HH_L r16
51
// Exponent fields of A and B.  r19:18 is likewise stored to and reloaded from
// the stack frame.
52#define EXPA r18
53#define EXPB r19
54#define EXPBA r19:18
55
// General 32-bit scratch register.
56#define TMP r28
57
// Predicate roles used on the main path.
58#define P_TMP p0
59#define PROD_NEG p3
60#define EXACT p2
61#define SWAP p1
62
// Field widths used with insert/extractu when unpacking operands, the
// exponent bias, and the allocframe size in bytes.
63#define MANTBITS 52
64#define HI_MANTBITS 20
65#define EXPBITS 11
66#define BIAS 1023
67#define STACKSPACE 32
68
// Added to the product exponent together with -BIAS.
69#define ADJUST 4
70
// FUDGE is not referenced anywhere in the code visible in this file; FUDGE2
// is used by the underflow path.
71#define FUDGE 7
72#define FUDGE2 3
73
// Bit offset of the 2-bit field extracted from USR to select the rounding
// mode; may be overridden from the build command line.
74#ifndef SR_ROUND_OFF
75#define SR_ROUND_OFF 22
76#endif
77
78	// First, classify for normal values, and abort if abnormal
79	//
80	// Next, unpack mantissa into 0x1000_0000_0000_0000 + mant<<8
81	//
82	// Since we know that the 2 MSBs of the H registers are zero, we should never carry
83	// the partial products that involve the H registers
84	//
85	// Try to buy X slots, at the expense of latency if needed
86	//
87	// We will have PP_HH with the upper bits of the product, PP_LL with the lower
88	// PP_HH can have a maximum of 0x03FF_FFFF_FFFF_FFFF or thereabouts
89	// PP_HH can have a minimum of 0x0100_0000_0000_0000
90	//
91	// 0x0100_0000_0000_0000 has EXP of EXPA+EXPB-BIAS
92	//
93	// We need to align CTMP.
94	// If CTMP >> PP, convert PP to 64 bit with sticky, align CTMP, and follow normal add
95	// If CTMP << PP align CTMP and add 128 bits.  Then compute sticky
96	// If CTMP ~= PP, align CTMP and add 128 bits.  May have massive cancellation.
97	//
98	// Convert partial product and CTMP to 2's complement prior to addition
99	//
100	// After we add, we need to normalize into upper 64 bits, then compute sticky.
101
	// Entry point.  Operands arrive in A (r1:0), B (r3:2) and C (r5:4); the
	// code that follows forms the product A*B, adds C, and leaves the result
	// in A.  Both global names label the same address.
102	.text
103	.global __hexagon_fmadf4
104        .type __hexagon_fmadf4,@function
105	.global __hexagon_fmadf5
106        .type __hexagon_fmadf5,@function
107	Q6_ALIAS(fmadf5)
108	.p2align 5
109__hexagon_fmadf4:
110__hexagon_fmadf5:
	// .Lfma_begin is also the restart target used by the abnormal A/B paths
	// after they have rewritten A and B.
111.Lfma_begin:
112	{
		// Class mask #2 selects "normal" (see the comment block above).
		// NOTE(review): two writes to P_TMP in one packet are understood to
		// be ANDed by the hardware, giving "A and B are both normal" --
		// confirm against the Hexagon ISA manual.
113		P_TMP = dfclass(A,#2)
114		P_TMP = dfclass(B,#2)
115		ATMP = #0
116		BTMP = #0
117	}
118	{
		// Place the 52 mantissa bits at bit offset EXPBITS-3 (= 8).
119		ATMP = insert(A,#MANTBITS,#EXPBITS-3)
120		BTMP = insert(B,#MANTBITS,#EXPBITS-3)
		// High-word constant for the leading one (0x1000_0000_0000_0000).
121		PP_ODD_H = ##0x10000000
122		allocframe(#STACKSPACE)
123	}
124	{
		// Low x low partial product; the abnormal branch is taken when A or B
		// failed the classification above.
125		PP_LL = mpyu(ATMPL,BTMPL)
126		if (!P_TMP) jump .Lfma_abnormal_ab
		// OR in the leading one for both unpacked mantissas.
127		ATMPH = or(ATMPH,PP_ODD_H)
128		BTMPH = or(BTMPH,PP_ODD_H)
129	}
130	{
131		P_TMP = dfclass(C,#2)
132		if (!P_TMP.new) jump:nt .Lfma_abnormal_c
		// Seed CTMP with the leading one; the mantissa of C is inserted in
		// the next packet.
133		CTMP = combine(PP_ODD_H,#0)
		// Start the cross-term accumulator from the high word of PP_LL.
134		PP_ODD = combine(#0,PP_LL_H)
135	}
	// Continuation of the main path, and the re-entry point used by
	// .Lfma_abnormal_c once it has patched up a subnormal C.
	// Finishes the mantissa product, saves the registers that are reloaded
	// before returning, conditionally negates the C mantissa, and extracts the
	// exponent fields.
136.Lfma_abnormal_c_restart:
137	{
138		PP_ODD += mpyu(BTMPL,ATMPH)
139		CTMP = insert(C,#MANTBITS,#EXPBITS-3)
		// Save r17:16 and r19:18 in the frame; they are reloaded before each
		// dealloc_return on this path.
140		memd(r29+#0) = PP_HH
141		memd(r29+#8) = EXPBA
142	}
143	{
144		PP_ODD += mpyu(ATMPL,BTMPH)
		// EXPBA is borrowed here as a temporary holding -CTMP.
145		EXPBA = neg(CTMP)
		// P_TMP: sign bit of C is clear.
146		P_TMP = cmp.gt(CH,#-1)
		// Bit 31 of TMP is the XOR of the sign bits of A and B.
147		TMP = xor(AH,BH)
148	}
149	{
		// The conditional move below reads the old EXPBA (-CTMP) even though
		// EXPA/EXPB are written in this packet (reads see pre-packet values).
150		EXPA = extractu(AH,#EXPBITS,#HI_MANTBITS)
151		EXPB = extractu(BH,#EXPBITS,#HI_MANTBITS)
152		PP_HH = combine(#0,PP_ODD_H)
153		if (!P_TMP) CTMP = EXPBA
154	}
155	{
156		PP_HH += mpyu(ATMPH,BTMPH)
157		PP_LL = combine(PP_ODD_L,PP_LL_L)
158#undef PP_ODD
159#undef PP_ODD_H
160#undef PP_ODD_L
161#undef ATMP
162#undef ATMPL
163#undef ATMPH
164#undef BTMP
165#undef BTMPL
166#undef BTMPH
167#define RIGHTLEFTSHIFT r13:12
168#define RIGHTSHIFT r13
169#define LEFTSHIFT r12
170
		// Sum of the two exponent fields; r19 is then reused for C's exponent.
171		EXPA = add(EXPA,EXPB)
172#undef EXPB
173#undef EXPBA
174#define EXPC r19
175#define EXPCA r19:18
176		EXPC = extractu(CH,#EXPBITS,#HI_MANTBITS)
177	}
	// Align-and-add stage: conditionally negate the 128-bit product, order the
	// two addends by exponent, shift the smaller one right (collecting sticky
	// bits), and add.
178	// PP_HH:PP_LL now has product
179	// CTMP is negated
180	// EXPA,B,C are extracted
181	// We need to negate PP
182	// Since we will be adding with carry later, if we need to negate,
183	// just invert all bits now, which we can do conditionally and in parallel
184#define PP_HH_TMP r15:14
185#define PP_LL_TMP r7:6
186	{
187		EXPA = add(EXPA,#-BIAS+(ADJUST))
		// PROD_NEG: bit 31 of TMP (sign of A xor sign of B) is set.
188		PROD_NEG = !cmp.gt(TMP,#-1)
189		PP_LL_TMP = #0
190		PP_HH_TMP = #0
191	}
192	{
		// NOTE(review): sub(...):carry is understood to take PROD_NEG as
		// carry-in and write carry-out back to it, so this and the sub in the
		// next packet form a 128-bit 0 - PP -- confirm against the ISA manual.
193		PP_LL_TMP = sub(PP_LL_TMP,PP_LL,PROD_NEG):carry
		// P_TMP keeps a copy of the "product negative" test, since PROD_NEG is
		// also the carry register of the subtraction.
194		P_TMP = !cmp.gt(TMP,#-1)
195		SWAP = cmp.gt(EXPC,EXPA)	// If C >> PP
		// combine(EXPA,EXPC) puts EXPA in r19 and EXPC in r18: exponents swap.
196		if (SWAP.new) EXPCA = combine(EXPA,EXPC)
197	}
198	{
199		PP_HH_TMP = sub(PP_HH_TMP,PP_HH,PROD_NEG):carry
200		if (P_TMP) PP_LL = PP_LL_TMP
201#undef PP_LL_TMP
202#define CTMP2 r7:6
203#define CTMP2H r7
204#define CTMP2L r6
		// CTMP2 becomes the low 64 bits of the addend paired with CTMP.
205		CTMP2 = #0
		// Exponent difference between the two addends.
206		EXPC = sub(EXPA,EXPC)
207	}
208	{
		// Uses the old P_TMP (product negative); P_TMP is then redefined as
		// "exponent difference exceeds 63".
209		if (P_TMP) PP_HH = PP_HH_TMP
210		P_TMP = cmp.gt(EXPC,#63)
		// Exchange the low halves when SWAP is set.
211		if (SWAP) PP_LL = CTMP2
212		if (SWAP) CTMP2 = PP_LL
213	}
214#undef PP_HH_TMP
215//#define ONE r15:14
216//#define S_ONE r14
217#define ZERO r15:14
218#define S_ZERO r15
219#undef PROD_NEG
220#define P_CARRY p3
221	{
222		if (SWAP) PP_HH = CTMP	// Swap C and PP
223		if (SWAP) CTMP = PP_HH
224		if (P_TMP) EXPC = add(EXPC,#-64)
225		TMP = #63
226	}
227	{
228		// If diff > 63, pre-shift-right by 64...
229		if (P_TMP) CTMP2 = CTMP
		// min() below reads the pre-packet TMP (63), so RIGHTSHIFT is the
		// remaining shift count clamped to 63; TMP then becomes the sign fill.
230		TMP = asr(CTMPH,#31)
231		RIGHTSHIFT = min(EXPC,TMP)
232		LEFTSHIFT = #0
233	}
234#undef C
235#undef CH
236#undef CL
237#define STICKIES r5:4
238#define STICKIESH r5
239#define STICKIESL r4
240	{
241		if (P_TMP) CTMP = combine(TMP,TMP)	// sign extension of pre-shift-right-64
		// NOTE(review): the register form of extract presumably takes its
		// width from the high word (RIGHTSHIFT) and its offset from the low
		// word (LEFTSHIFT, zero here), i.e. the bits about to be shifted out
		// -- confirm against the ISA manual.
242		STICKIES = extract(CTMP2,RIGHTLEFTSHIFT)
243		CTMP2 = lsr(CTMP2,RIGHTSHIFT)
244		LEFTSHIFT = sub(#64,RIGHTSHIFT)
245	}
246	{
247		ZERO = #0
		// Mask with bit 0 clear, used below.
248		TMP = #-2
		// Bring the bits shifted out of the high half into the low half.
249		CTMP2 |= lsl(CTMP,LEFTSHIFT)
250		CTMP = asr(CTMP,RIGHTSHIFT)
251	}
252	{
253		P_CARRY = cmp.gtu(STICKIES,ZERO)	// If we have sticky bits from C shift
254		if (P_CARRY.new) CTMP2L = and(CTMP2L,TMP) // make sure adding 1 == OR
255#undef ZERO
256#define ONE r15:14
257#define S_ONE r14
258		ONE = #1
259		STICKIES = #0
260	}
261	{
262		PP_LL = add(CTMP2,PP_LL,P_CARRY):carry	// use the carry to add the sticky
263	}
264	{
		// High half of the 128-bit add, consuming the carry from the low half.
265		PP_HH = add(CTMP,PP_HH,P_CARRY):carry
		// Reference value for the "all sign bits" test that follows.
266		TMP = #62
267	}
	// Normalize the 128-bit sum into CTMP, fold the remaining low bits into a
	// sticky bit, range-check the exponent, and build the result with
	// convert_d2df plus an exponent adjustment.
268	// PP_HH:PP_LL now holds the sum
269	// We may need to normalize left, up to ??? bits.
270	//
271	// I think that if we have massive cancellation, the range we normalize by
272	// is still limited
273	{
		// TMP is 62 here, so the branch is skipped only when clb(PP_HH) is 64.
274		LEFTSHIFT = add(clb(PP_HH),#-2)
275		if (!cmp.eq(LEFTSHIFT.new,TMP)) jump:t 1f	// all sign bits?
276	}
277	// We had all sign bits, shift left by 62.
278	{
		// CTMP takes the upper 62 bits of PP_LL; PP_LL keeps its low 2 bits,
		// moved to the top.
279		CTMP = extractu(PP_LL,#62,#2)
280		PP_LL = asl(PP_LL,#62)
281		EXPA = add(EXPA,#-62)			// And adjust exponent of result
282	}
283	{
284		PP_HH = insert(CTMP,#62,#0)		// Then shift 63
285	}
286	{
		// Recompute the normalization shift for the shifted value.
287		LEFTSHIFT = add(clb(PP_HH),#-2)
288	}
289	.falign
2901:
291	{
292		CTMP = asl(PP_HH,LEFTSHIFT)
		// Low-half bits not moved into CTMP below are collected as sticky.
293		STICKIES |= asl(PP_LL,LEFTSHIFT)
294		RIGHTSHIFT = sub(#64,LEFTSHIFT)
295		EXPA = sub(EXPA,LEFTSHIFT)
296	}
297	{
298		CTMP |= lsr(PP_LL,RIGHTSHIFT)
		// ONE is 1, so this unsigned compare is true exactly when STICKIES is 0.
299		EXACT = cmp.gtu(ONE,STICKIES)
300		TMP = #BIAS+BIAS-2
301	}
302	{
		// Inexact: force the lowest bit of CTMP on as a sticky bit.
303		if (!EXACT) CTMPL = or(CTMPL,S_ONE)
304		// If EXPA is overflow/underflow, jump to ovf_unf
		// NOTE(review): the two P_TMP writes are understood to be ANDed,
		// giving 1 < EXPA <= 2*BIAS-2 -- confirm against the ISA manual.
305		P_TMP = !cmp.gt(EXPA,TMP)
306		P_TMP = cmp.gt(EXPA,#1)
307		if (!P_TMP.new) jump:nt .Lfma_ovf_unf
308	}
309	{
310		// XXX: FIXME: should PP_HH for check of zero be CTMP?
311		P_TMP = cmp.gtu(ONE,CTMP)		// is result true zero?
312		A = convert_d2df(CTMP)
313		EXPA = add(EXPA,#-BIAS-60)
		// Reload the r17:16 value saved in the frame.
314		PP_HH = memd(r29+#0)
315	}
316	{
		// Add the exponent adjustment into the exponent field of the result.
		// This reads EXPA (r18) before the reload of r19:18 in this packet
		// takes effect.
317		AH += asl(EXPA,#HI_MANTBITS)
318		EXPCA = memd(r29+#8)
319		if (!P_TMP) dealloc_return		// not zero, return
320	}
	// Reached by fall-through from the main path and by a jump from
	// .Lfma_ovf_unf when the normalized sum in CTMP is zero.
321.Ladd_yields_zero:
322	// We had full cancellation.  Return +/- zero (-0 when round-down)
323	{
324		TMP = USR
325		A = #0
326	}
327	{
		// Isolate the 2-bit rounding-mode field of USR.
328		TMP = extractu(TMP,#2,#SR_ROUND_OFF)
		// Reload the registers saved in the frame.  On the fall-through path
		// they have already been reloaded once; the loads are repeated here.
329		PP_HH = memd(r29+#0)
330		EXPCA = memd(r29+#8)
331	}
332	{
		// Field value 2 selects the negative zero.
333		p0 = cmp.eq(TMP,#2)
334		if (p0.new) AH = ##0x80000000
335		dealloc_return
336	}
337
338#undef RIGHTLEFTSHIFT
339#undef RIGHTSHIFT
340#undef LEFTSHIFT
341#undef CTMP2
342#undef CTMP2H
343#undef CTMP2L
344
	// Reached from the main path when the exponent range check fails.
	// Dispatches to true zero, overflow, "possible underflow", or falls through
	// into the underflow (denormalizing) path below.
345.Lfma_ovf_unf:
346	{
		// ONE is 1, so p0 is true exactly when CTMP is zero.
347		p0 = cmp.gtu(ONE,CTMP)
348		if (p0.new) jump:nt .Ladd_yields_zero
349	}
350	{
351		A = convert_d2df(CTMP)
352		EXPA = add(EXPA,#-BIAS-60)
		// TMP receives EXPA as it was before the add in this packet.
353		TMP = EXPA
354	}
355#define NEW_EXPB r7
356#define NEW_EXPA r6
357	{
358		AH += asl(EXPA,#HI_MANTBITS)
		// Exponent field of the converted value, read before the adjustment
		// in this packet is applied.
359		NEW_EXPB = extractu(AH,#EXPBITS,#HI_MANTBITS)
360	}
361	{
		// Sum of the adjustment and the converted value's exponent field.
362		NEW_EXPA = add(EXPA,NEW_EXPB)
		// Reload the registers saved in the frame; every exit below uses
		// dealloc_return without reloading them again.
363		PP_HH = memd(r29+#0)
364		EXPCA = memd(r29+#8)
365#undef PP_HH
366#undef PP_HH_H
367#undef PP_HH_L
368#undef EXPCA
369#undef EXPC
370#undef EXPA
371#undef PP_LL
372#undef PP_LL_H
373#undef PP_LL_L
		// From here on EXPA/EXPB name r6/r7 (EXPA holds the NEW_EXPA value
		// just computed) and ATMP names r9:8.
374#define EXPA r6
375#define EXPB r7
376#define EXPBA r7:6
377#define ATMP r9:8
378#define ATMPH r9
379#define ATMPL r8
380#undef NEW_EXPB
381#undef NEW_EXPA
382		ATMP = abs(CTMP)
383	}
384	{
385		p0 = cmp.gt(EXPA,##BIAS+BIAS)
386		if (p0.new) jump:nt .Lfma_ovf
387	}
388	{
389		p0 = cmp.gt(EXPA,#0)
390		if (p0.new) jump:nt .Lpossible_unf
391	}
392	{
393		// TMP has original EXPA.
394		// ATMP is corresponding value
395		// Normalize ATMP and shift right to correct location
396		EXPB = add(clb(ATMP),#-2)		// Amount to left shift to normalize
397		EXPA = sub(#1+5,TMP)			// Amount to right shift to denormalize
		// p3: CTMP (two's complement) is non-negative.
398		p3 = cmp.gt(CTMPH,#-1)
399	}
400	// Underflow
401	// We know that the infinite range exponent should be EXPA
402	// CTMP is 2's complement, ATMP is abs(CTMP)
403	{
404		EXPA = add(EXPA,EXPB)		// how much to shift back right
405		ATMP = asl(ATMP,EXPB)		// shift left
		// AH/AL are used as scratch for the USR update until A is rebuilt.
406		AH = USR
407		TMP = #63
408	}
409	{
		// Clamp the right-shift count to 63.
410		EXPB = min(EXPA,TMP)
411		EXPA = #0
		// NOTE(review): 0x30 is presumably a pair of USR sticky flag bits
		// (underflow/inexact?) -- confirm against the USR layout.
412		AL = #0x0030
413	}
414	{
		// NOTE(review): with EXPBA = (width EXPB, offset 0) this presumably
		// captures the bits that the shift below discards -- confirm.
415		B = extractu(ATMP,EXPBA)
416		ATMP = asr(ATMP,EXPB)
417	}
418	{
		// If any discarded bit was set, force the lowest bit on as sticky.
419		p0 = cmp.gtu(ONE,B)
420		if (!p0.new) ATMPL = or(ATMPL,S_ONE)
421		ATMPH = setbit(ATMPH,#HI_MANTBITS+FUDGE2)
422	}
423	{
424		CTMP = neg(ATMP)
		// OR the 0x30 bits into the USR copy when any of the low FUDGE2 bits
		// of ATMPL is set.
425		p1 = bitsclr(ATMPL,#(1<<FUDGE2)-1)
426		if (!p1.new) AH = or(AH,AL)
427		B = #0
428	}
429	{
		// Restore the sign: keep the negated value unless CTMP was
		// non-negative.
430		if (p3) CTMP = ATMP
431		USR = AH
432		TMP = #-BIAS-(MANTBITS+FUDGE2)
433	}
434	{
435		A = convert_d2df(CTMP)
436	}
437	{
		// Apply the exponent adjustment held in TMP to the result.
438		AH += asl(TMP,#HI_MANTBITS)
439		dealloc_return
440	}
	// Reached from .Lfma_ovf_unf when the computed exponent is positive but the
	// main-path range check failed.  A already holds the result; this path only
	// decides whether to set the USR bits 0x30 before returning it.
441.Lpossible_unf:
442	{
443		TMP = ##0x7fefffff
444		ATMP = abs(CTMP)
445	}
446	{
		// NOTE(review): the two p0 writes are understood to be ANDed, so p0
		// means "AL is 0 and AH has no bits in 0x7fefffff set", i.e. only the
		// sign bit and bit 20 of AH may be set -- confirm against the ISA
		// manual.
447		p0 = cmp.eq(AL,#0)
448		p0 = bitsclr(AH,TMP)
		// Any other result is returned unchanged.
449		if (!p0.new) dealloc_return:t
450		TMP = #0x7fff
451	}
452	{
		// p0: the low 15 bits of ATMPH are all set.
453		p0 = bitsset(ATMPH,TMP)
454		BH = USR
455		BL = #0x0030
456	}
457	{
458		if (p0) BH = or(BH,BL)
459	}
460	{
461		USR = BH
462	}
463	{
		// NOTE(review): the comparison result is not used afterwards;
		// presumably executed for its effect on floating-point exception
		// state after the USR write -- confirm.
464		p0 = dfcmp.eq(A,A)
465		dealloc_return
466	}
467.Lfma_ovf:
468	{
469		TMP = USR
470		CTMP = combine(##0x7fefffff,#-1)
471		A = CTMP
472	}
473	{
474		ATMP = combine(##0x7ff00000,#0)
475		BH = extractu(TMP,#2,#SR_ROUND_OFF)
476		TMP = or(TMP,#0x28)
477	}
478	{
479		USR = TMP
480		BH ^= lsr(AH,#31)
481		BL = BH
482	}
483	{
484		p0 = !cmp.eq(BL,#1)
485		p0 = !cmp.eq(BH,#2)
486	}
487	{
488		p0 = dfcmp.eq(ATMP,ATMP)
489		if (p0.new) CTMP = ATMP
490	}
491	{
492		A = insert(CTMP,#63,#0)
493		dealloc_return
494	}
495#undef CTMP
496#undef CTMPH
497#undef CTMPL
498#define BTMP r11:10
499#define BTMPH r11
500#define BTMPL r10
501
502#undef STICKIES
503#undef STICKIESH
504#undef STICKIESL
505#define C r5:4
506#define CH r5
507#define CL r4
508
	// Reached from the entry path when A or B is not a normal number.
	// In this part of the file ATMP names r9:8 and BTMP names r11:10 (see the
	// redefinitions above), not the pairs used at entry.
	// The frame allocated at entry is released first: the exits below either
	// return with jumpr r31 or jump back to .Lfma_begin, which allocates again.
509.Lfma_abnormal_ab:
510	{
		// Magnitudes of A and B with the sign bit removed.
511		ATMP = extractu(A,#63,#0)
512		BTMP = extractu(B,#63,#0)
513		deallocframe
514	}
515	{
		// Exchange A and B unless |A| > |B|, so A has the larger magnitude.
516		p3 = cmp.gtu(ATMP,BTMP)
517		if (!p3.new) A = B		// sort values
518		if (!p3.new) B = A
519	}
520	{
521		p0 = dfclass(A,#0x0f)		// A NaN?
522		if (!p0.new) jump:nt .Lnan
		// Keep ATMP/BTMP in step with the exchange of A and B.
523		if (!p3) ATMP = BTMP
524		if (!p3) BTMP = ATMP
525	}
526	{
		// NOTE(review): same-packet predicate writes are understood to be
		// ANDed: p1 = infinity times nonzero, p0 = infinity times zero --
		// confirm against the ISA manual.
527		p1 = dfclass(A,#0x08)		// A is infinity
528		p1 = dfclass(B,#0x0e)		// B is nonzero
529	}
530	{
531		p0 = dfclass(A,#0x08)		// a is inf
532		p0 = dfclass(B,#0x01)		// b is zero
533	}
534	{
535		if (p1) jump .Lab_inf
		// p2: B is zero.
536		p2 = dfclass(B,#0x01)
537	}
538	{
539		if (p0) jump .Linvalid
540		if (p2) jump .Lab_true_zero
		// Mask over the upper bits of the exponent field of AH.
541		TMP = ##0x7c000000
542	}
543	// We are left with a normal or subnormal times a subnormal, A > B
544	// If A and B are both very small, we will go to a single sticky bit; replace
545	// A and B lower 63 bits with 0x0010_0000_0000_0000, which yields equivalent results
546	// if A and B might multiply to something bigger, decrease A exp and increase B exp
547	// and start over
548	{
		// Taken when none of the masked exponent bits of A is set.
549		p0 = bitsclr(AH,TMP)
550		if (p0.new) jump:nt .Lfma_ab_tiny
551	}
552	{
		// Shift count that moves B's leading set bit up to bit 52
		// (count of leading bits minus EXPBITS).
553		TMP = add(clb(BTMP),#-EXPBITS)
554	}
555	{
556		BTMP = asl(BTMP,TMP)
557	}
558	{
		// Write the rescaled magnitude back into B, leaving B's sign bit in
		// place, and lower A's exponent field by the same shift count.
559		B = insert(BTMP,#63,#0)
560		AH -= asl(TMP,#HI_MANTBITS)
561	}
562	jump .Lfma_begin
563
	// Reached from .Lfma_abnormal_ab when A's exponent is small (see the
	// comment before the branch).  Replaces the low 63 bits of both A and B
	// with 0x0010_0000_0000_0000, leaving their sign bits in place, and
	// restarts.
564.Lfma_ab_tiny:
565	ATMP = combine(##0x00100000,#0)
566	{
567		A = insert(ATMP,#63,#0)
568		B = insert(ATMP,#63,#0)
569	}
570	jump .Lfma_begin
571
	// Reached from .Lfma_abnormal_ab when A is infinity and B is nonzero.
	// No stack frame is live here, so returns use jumpr r31.
572.Lab_inf:
573	{
		// Reduce B to its sign bit (0 or 1).
574		B = lsr(B,#63)
		// p0: C is NaN (class mask 0x10, as in .Lfma_abnormal_c).
575		p0 = dfclass(C,#0x10)
576	}
577	{
		// Fold B's sign into A, giving the signed infinite product.
578		A ^= asl(B,#63)
579		if (p0) jump .Lnan
580	}
581	{
		// p1: C is infinity.
582		p1 = dfclass(C,#0x08)
583		if (p1.new) jump:nt .Lfma_inf_plus_inf
584	}
585	// A*B is +/- inf, C is finite.  Return A
586	{
587		jumpr r31
588	}
589	.falign
590.Lfma_inf_plus_inf:
591	{	// adding infinities of different signs is invalid
		// Both A and C are infinite here, so they compare equal only when
		// their signs match.
592		p0 = dfcmp.eq(A,C)
593		if (!p0.new) jump:nt .Linvalid
594	}
595	{
		// Same-signed infinities: return A.
596		jumpr r31
597	}
598
	// NaN result path, entered with no stack frame live.
	// Any of B and C that is not itself a NaN is replaced by A, then all three
	// are converted to single precision and the converted values discarded;
	// the value returned is always the all-ones bit pattern.
	// NOTE(review): the conversions are presumably there so that a signaling
	// NaN operand raises the invalid exception -- confirm.
599.Lnan:
600	{
601		p0 = dfclass(B,#0x10)
602		p1 = dfclass(C,#0x10)
603		if (!p0.new) B = A
604		if (!p1.new) C = A
605	}
606	{	// find sNaNs
607		BH = convert_df2sf(B)
608		BL = convert_df2sf(C)
609	}
610	{
		// BH is scratch here; only the side effects of the conversion matter.
611		BH = convert_df2sf(A)
612		A = #-1
613		jumpr r31
614	}
615
	// Invalid-operation path (infinity times zero, or infinities of opposite
	// sign added).  Entered with no stack frame live.
	// Returns the result of converting a single-precision signaling NaN to
	// double precision.
	// NOTE(review): the conversion presumably raises the invalid exception and
	// yields a quiet NaN -- confirm.
616.Linvalid:
617	{
618		TMP = ##0x7f800001		// sp snan
619	}
620	{
621		A = convert_sf2df(TMP)
622		jumpr r31
623	}
624
	// Reached from .Lfma_abnormal_ab when B is zero, so the product is a zero
	// whose sign is the XOR of the operand signs.  Entered with no stack frame
	// live.
625.Lab_true_zero:
626	// B is zero, A is finite number
627	{
		// A NaN in C is moved into A and handled by .Lnan.
628		p0 = dfclass(C,#0x10)
629		if (p0.new) jump:nt .Lnan
630		if (p0.new) A = C
631	}
632	{
633		p0 = dfcmp.eq(B,C)		// is C also zero?
634		AH = lsr(AH,#31)		// get sign
635	}
636	{
637		BH ^= asl(AH,#31)		// form correctly signed zero in B
638		if (!p0) A = C			// If C is not zero, return C
639		if (!p0) jumpr r31
640	}
641	// B has correctly signed zero, C is also zero
642.Lzero_plus_zero:
643	{
		// Identical bit patterns: both zeros have the same sign, return it.
644		p0 = cmp.eq(B,C)		// yes, scalar equals.  +0++0 or -0+-0
645		if (p0.new) jumpr:t r31
646		A = B
647	}
	// Zeros of opposite sign: the result sign depends on the rounding mode.
648	{
649		TMP = USR
650	}
651	{
652		TMP = extractu(TMP,#2,#SR_ROUND_OFF)
653		A = #0
654	}
655	{
		// Rounding-mode field value 2 yields -0 (the same test as
		// .Ladd_yields_zero); every other mode yields +0.
656		p0 = cmp.eq(TMP,#2)
657		if (p0.new) AH = ##0x80000000
658		jumpr r31
659	}
660#undef BTMP
661#undef BTMPH
662#undef BTMPL
663#define CTMP r11:10
664	.falign
	// Reached from the entry path when A and B are normal but C is not.
	// The frame allocated at entry is released in the first packet, so the NaN,
	// infinity and zero exits leave with no frame live; only the subnormal
	// case re-allocates it before rejoining the main path.
665.Lfma_abnormal_c:
666	// We know that AB is normal * normal
667	// C is not normal: zero, subnormal, inf, or NaN.
668	{
669		p0 = dfclass(C,#0x10)		// is C NaN?
670		if (p0.new) jump:nt .Lnan
671		if (p0.new) A = C		// move NaN to A
672		deallocframe
673	}
674	{
675		p0 = dfclass(C,#0x08)		// is C inf?
676		if (p0.new) A = C		// return C
677		if (p0.new) jumpr:nt r31
678	}
679	// zero or subnormal
680	// If we have a zero, and we know AB is normal*normal, we can just call normal multiply
681	{
682		p0 = dfclass(C,#0x01)		// is C zero?
		// Tail jump: A and B are still the original operands.
683		if (p0.new) jump:nt __hexagon_muldf3
		// Exponent field value to give the subnormal C below.
684		TMP = #1
685	}
686	// Left with: subnormal
687	// Adjust C and jump back to restart
688	{
689		allocframe(#STACKSPACE)		// oops, deallocated above, re-allocate frame
		// Clear CTMP so the restart path inserts C's mantissa without the
		// leading one that the normal path seeds.
690		CTMP = #0
		// Set C's exponent field to 1.
691		CH = insert(TMP,#EXPBITS,#HI_MANTBITS)
692		jump .Lfma_abnormal_c_restart
693	}
// Record the size of the routine on its real entry symbols.  No symbol named
// "fma" is defined in this file, so END(fma) attached the size to nothing.
694END(__hexagon_fmadf4)
END(__hexagon_fmadf5)
695