//===----------------------Hexagon builtin routine ------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#define Q6_ALIAS(TAG) .global __qdsp_##TAG ; .set __qdsp_##TAG, __hexagon_##TAG
#define END(TAG) .size TAG,.-TAG

// Double Precision Fused Multiply-Add


#define A r1:0
#define AH r1
#define AL r0
#define B r3:2
#define BH r3
#define BL r2
#define C r5:4
#define CH r5
#define CL r4



#define BTMP r15:14
#define BTMPH r15
#define BTMPL r14

#define ATMP r13:12
#define ATMPH r13
#define ATMPL r12

#define CTMP r11:10
#define CTMPH r11
#define CTMPL r10

#define PP_LL r9:8
#define PP_LL_H r9
#define PP_LL_L r8

#define PP_ODD r7:6
#define PP_ODD_H r7
#define PP_ODD_L r6


#define PP_HH r17:16
#define PP_HH_H r17
#define PP_HH_L r16

#define EXPA r18
#define EXPB r19
#define EXPBA r19:18

#define TMP r28

#define P_TMP p0
#define PROD_NEG p3
#define EXACT p2
#define SWAP p1

#define MANTBITS 52
#define HI_MANTBITS 20
#define EXPBITS 11
#define BIAS 1023
#define STACKSPACE 32

#define ADJUST 4

#define FUDGE 7
#define FUDGE2 3

#ifndef SR_ROUND_OFF
#define SR_ROUND_OFF 22
#endif

	// First, classify for normal values, and abort if abnormal
	//
	// Next, unpack mantissa into 0x1000_0000_0000_0000 + mant<<8
	//
	// Since we know that the 2 MSBs of the H registers are zero, the partial
	// products that involve the H registers should never generate a carry
	//
	// Try to buy X slots, at the expense of latency if needed
	//
	// We will have PP_HH with the upper bits of the product, PP_LL with the lower
	// PP_HH can have a maximum of 0x03FF_FFFF_FFFF_FFFF or thereabouts
	// PP_HH can have a minimum of 0x0100_0000_0000_0000
	//
	// 0x0100_0000_0000_0000 has EXP of EXPA+EXPB-BIAS
	//
	// We need to align CTMP.
	// If CTMP >> PP, convert PP to 64 bit with sticky, align CTMP, and follow the normal add
	// If CTMP << PP, align CTMP and add 128 bits.  Then compute sticky
	// If CTMP ~= PP, align CTMP and add 128 bits.  May have massive cancellation.
	//
	// Convert the partial product and CTMP to 2's complement prior to addition
	//
	// After we add, we need to normalize into the upper 64 bits, then compute sticky.

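	// As a hedged illustration of the unpack step and the PP_HH bounds
	// claimed above, here is a minimal C model (the helper names and the
	// use of unsigned __int128 are ours, not part of this file):
	//
	//   #include <stdint.h>
	//   #include <string.h>
	//
	//   static uint64_t unpack_mant(double x) {   // normal x only
	//       uint64_t bits; memcpy(&bits, &x, sizeof bits);
	//       // implicit one at bit 60: 0x1000_0000_0000_0000 + mant<<8
	//       return (1ull << 60) | ((bits & ((1ull << 52) - 1)) << 8);
	//   }
	//
	//   static uint64_t product_high(double a, double b) {
	//       // PP_HH analog: high 64 bits of the 128-bit mantissa product,
	//       // in [0x0100_0000_0000_0000, 0x0400_0000_0000_0000)
	//       unsigned __int128 pp =
	//           (unsigned __int128)unpack_mant(a) * unpack_mant(b);
	//       return (uint64_t)(pp >> 64);
	//   }
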
	.text
	.global __hexagon_fmadf4
	.type __hexagon_fmadf4,@function
	.global __hexagon_fmadf5
	.type __hexagon_fmadf5,@function
	.global fma
	.type fma,@function
	Q6_ALIAS(fmadf5)
	.p2align 5
__hexagon_fmadf4:
__hexagon_fmadf5:
fma:
	{
		P_TMP = dfclass(A,#2)
		P_TMP = dfclass(B,#2)
		ATMP = #0
		BTMP = #0
	}
	{
		ATMP = insert(A,#MANTBITS,#EXPBITS-3)
		BTMP = insert(B,#MANTBITS,#EXPBITS-3)
		PP_ODD_H = ##0x10000000
		allocframe(#STACKSPACE)
	}
	{
		PP_LL = mpyu(ATMPL,BTMPL)
		if (!P_TMP) jump .Lfma_abnormal_ab
		ATMPH = or(ATMPH,PP_ODD_H)
		BTMPH = or(BTMPH,PP_ODD_H)
	}
	{
		P_TMP = dfclass(C,#2)
		if (!P_TMP.new) jump:nt .Lfma_abnormal_c
		CTMP = combine(PP_ODD_H,#0)
		PP_ODD = combine(#0,PP_LL_H)
	}
.Lfma_abnormal_c_restart:
	{
		PP_ODD += mpyu(BTMPL,ATMPH)
		CTMP = insert(C,#MANTBITS,#EXPBITS-3)
		memd(r29+#0) = PP_HH
		memd(r29+#8) = EXPBA
	}
	{
		PP_ODD += mpyu(ATMPL,BTMPH)
		EXPBA = neg(CTMP)
		P_TMP = cmp.gt(CH,#-1)
		TMP = xor(AH,BH)
	}
	{
		EXPA = extractu(AH,#EXPBITS,#HI_MANTBITS)
		EXPB = extractu(BH,#EXPBITS,#HI_MANTBITS)
		PP_HH = combine(#0,PP_ODD_H)
		if (!P_TMP) CTMP = EXPBA
	}
	{
		PP_HH += mpyu(ATMPH,BTMPH)
		PP_LL = combine(PP_ODD_L,PP_LL_L)
#undef PP_ODD
#undef PP_ODD_H
#undef PP_ODD_L
#undef ATMP
#undef ATMPL
#undef ATMPH
#undef BTMP
#undef BTMPL
#undef BTMPH
#define RIGHTLEFTSHIFT r13:12
#define RIGHTSHIFT r13
#define LEFTSHIFT r12

		EXPA = add(EXPA,EXPB)
#undef EXPB
#undef EXPBA
#define EXPC r19
#define EXPCA r19:18
		EXPC = extractu(CH,#EXPBITS,#HI_MANTBITS)
	}
	// PP_HH:PP_LL now has product
	// CTMP is negated
	// EXPA,B,C are extracted
	// We need to negate PP
	// Since we will be adding with carry later, if we need to negate,
	// just invert all bits now, which we can do conditionally and in parallel
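	// A hedged C model of the conditional negate (names are ours): the
	// temporary receives 0 - PP computed per 64-bit half with the borrow
	// chained through the predicate, and it is only committed when the
	// product sign requires it.
	//
	//   #include <stdint.h>
	//   typedef struct { uint64_t lo, hi; } u128;
	//
	//   static u128 cond_neg(u128 pp, int prod_neg) {
	//       u128 t;
	//       t.lo = 0 - pp.lo;                  // low half of 0 - pp
	//       t.hi = 0 - pp.hi - (pp.lo != 0);   // propagate the borrow
	//       return prod_neg ? t : pp;          // commit only if negating
	//   }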
#define PP_HH_TMP r15:14
#define PP_LL_TMP r7:6
	{
		EXPA = add(EXPA,#-BIAS+(ADJUST))
		PROD_NEG = !cmp.gt(TMP,#-1)
		PP_LL_TMP = #0
		PP_HH_TMP = #0
	}
	{
		PP_LL_TMP = sub(PP_LL_TMP,PP_LL,PROD_NEG):carry
		P_TMP = !cmp.gt(TMP,#-1)
		SWAP = cmp.gt(EXPC,EXPA)	// If C >> PP
		if (SWAP.new) EXPCA = combine(EXPA,EXPC)
	}
	{
		PP_HH_TMP = sub(PP_HH_TMP,PP_HH,PROD_NEG):carry
		if (P_TMP) PP_LL = PP_LL_TMP
#undef PP_LL_TMP
#define CTMP2 r7:6
#define CTMP2H r7
#define CTMP2L r6
		CTMP2 = #0
		EXPC = sub(EXPA,EXPC)
	}
	{
		if (P_TMP) PP_HH = PP_HH_TMP
		P_TMP = cmp.gt(EXPC,#63)
		if (SWAP) PP_LL = CTMP2
		if (SWAP) CTMP2 = PP_LL
	}
#undef PP_HH_TMP
//#define ONE r15:14
//#define S_ONE r14
#define ZERO r15:14
#define S_ZERO r15
#undef PROD_NEG
#define P_CARRY p3
	{
		if (SWAP) PP_HH = CTMP	// Swap C and PP
		if (SWAP) CTMP = PP_HH
		if (P_TMP) EXPC = add(EXPC,#-64)
		TMP = #63
	}
	{
		// If diff > 63, pre-shift-right by 64...
		if (P_TMP) CTMP2 = CTMP
		TMP = asr(CTMPH,#31)
		RIGHTSHIFT = min(EXPC,TMP)
		LEFTSHIFT = #0
	}
#undef C
#undef CH
#undef CL
#define STICKIES r5:4
#define STICKIESH r5
#define STICKIESL r4
	{
		if (P_TMP) CTMP = combine(TMP,TMP)	// sign extension of pre-shift-right-64
		STICKIES = extract(CTMP2,RIGHTLEFTSHIFT)
		CTMP2 = lsr(CTMP2,RIGHTSHIFT)
		LEFTSHIFT = sub(#64,RIGHTSHIFT)
	}
	{
		ZERO = #0
		TMP = #-2
		CTMP2 |= lsl(CTMP,LEFTSHIFT)
		CTMP = asr(CTMP,RIGHTSHIFT)
	}
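	// Hedged C model of this align step (names are ours): shift the
	// 128-bit C right arithmetically by 0..63 and capture the bits that
	// fall off the bottom as a sticky value.
	//
	//   #include <stdint.h>
	//   typedef struct { uint64_t lo, hi; } s128;  // hi holds the sign
	//
	//   static uint64_t asr128_sticky(s128 *c, unsigned rshift) {
	//       uint64_t sticky = 0;
	//       if (rshift) {                     // rshift in 1..63 here
	//           sticky = c->lo & ((1ull << rshift) - 1);
	//           c->lo  = (c->lo >> rshift) | (c->hi << (64 - rshift));
	//           c->hi  = (uint64_t)((int64_t)c->hi >> rshift);
	//       }
	//       return sticky;                    // nonzero => bits were lost
	//   }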
	{
		P_CARRY = cmp.gtu(STICKIES,ZERO)	// If we have sticky bits from C shift
		if (P_CARRY.new) CTMP2L = and(CTMP2L,TMP) // make sure adding 1 == OR
#undef ZERO
#define ONE r15:14
#define S_ONE r14
		ONE = #1
		STICKIES = #0
	}
	{
		PP_LL = add(CTMP2,PP_LL,P_CARRY):carry	// use the carry to add the sticky
	}
	{
		PP_HH = add(CTMP,PP_HH,P_CARRY):carry
		TMP = #62
	}
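	// Hedged C model of the sticky-as-carry trick just used (names are
	// ours): bit 0 of the aligned addend was cleared above, so feeding
	// "sticky != 0" in as the carry behaves exactly like OR-ing a sticky
	// bit into the sum.
	//
	//   #include <stdint.h>
	//   typedef struct { uint64_t lo, hi; } u128;
	//
	//   static u128 add_with_sticky(u128 pp, u128 c2, uint64_t sticky) {
	//       unsigned cin = (sticky != 0);
	//       if (cin) c2.lo &= ~1ull;       // now "+1" is the same as "|1"
	//       u128 r;
	//       uint64_t t = c2.lo + pp.lo;
	//       unsigned carry = (t < pp.lo);
	//       r.lo   = t + cin;
	//       carry |= (r.lo < t);
	//       r.hi   = c2.hi + pp.hi + carry;
	//       return r;
	//   }
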
	// PP_HH:PP_LL now holds the sum
	// We may need to normalize left, up to ??? bits.
	//
	// I think that if we have massive cancellation, the range we normalize by
	// is still limited
	{
		LEFTSHIFT = add(clb(PP_HH),#-2)
		if (!cmp.eq(LEFTSHIFT.new,TMP)) jump:t 1f	// all sign bits?
	}
	// We had all sign bits, shift left by 62.
	{
		CTMP = extractu(PP_LL,#62,#2)
		PP_LL = asl(PP_LL,#62)
		EXPA = add(EXPA,#-62)			// And adjust exponent of result
	}
	{
		PP_HH = insert(CTMP,#62,#0)		// completes the 62-bit left shift
	}
	{
		LEFTSHIFT = add(clb(PP_HH),#-2)
	}
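	// Hedged C model of this normalize step (names are ours): count the
	// redundant sign bits of the high word, shift the 128-bit sum left so
	// only two sign bits remain, and fold the leftover low bits into the
	// sticky word.
	//
	//   #include <stdint.h>
	//
	//   static int clb64(int64_t x) {   // leading sign bits, like clb
	//       uint64_t v = (uint64_t)(x ^ (x >> 63));
	//       return v ? __builtin_clzll(v) : 64;
	//   }
	//
	//   static void normalize128(uint64_t *hi, uint64_t *lo,
	//                            uint64_t *sticky, int *exp) {
	//       int ls = clb64((int64_t)*hi) - 2;
	//       if (ls == 62) {              // all sign bits: pre-shift by 62
	//           *hi = (*hi << 62) | (*lo >> 2);
	//           *lo <<= 62;
	//           *exp -= 62;
	//           ls = clb64((int64_t)*hi) - 2;
	//       }
	//       if (ls > 0) {
	//           *sticky |= *lo << ls;    // bits that stay below the window
	//           *hi = (*hi << ls) | (*lo >> (64 - ls));
	//           *exp -= ls;
	//       }
	//   }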
	.falign
1:
	{
		CTMP = asl(PP_HH,LEFTSHIFT)
		STICKIES |= asl(PP_LL,LEFTSHIFT)
		RIGHTSHIFT = sub(#64,LEFTSHIFT)
		EXPA = sub(EXPA,LEFTSHIFT)
	}
	{
		CTMP |= lsr(PP_LL,RIGHTSHIFT)
		EXACT = cmp.gtu(ONE,STICKIES)
		TMP = #BIAS+BIAS-2
	}
	{
		if (!EXACT) CTMPL = or(CTMPL,S_ONE)
		// If EXPA is overflow/underflow, jump to ovf_unf
		P_TMP = !cmp.gt(EXPA,TMP)
		P_TMP = cmp.gt(EXPA,#1)
		if (!P_TMP.new) jump:nt .Lfma_ovf_unf
	}
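	// Hedged C restatement of the window test above: the fast path needs
	// the biased result exponent to satisfy 1 < EXPA <= 2*BIAS-2, and the
	// two predicate writes AND together, as in
	//
	//   static int exp_in_range(int expa) {   // name is ours
	//       return expa > 1 && expa <= 2 * 1023 - 2;
	//   }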
	{
		// XXX: FIXME: should PP_HH for check of zero be CTMP?
		P_TMP = cmp.gtu(ONE,CTMP)		// is result true zero?
		A = convert_d2df(CTMP)
		EXPA = add(EXPA,#-BIAS-60)
		PP_HH = memd(r29+#0)
	}
	{
		AH += asl(EXPA,#HI_MANTBITS)
		EXPCA = memd(r29+#8)
		if (!P_TMP) dealloc_return		// not zero, return
	}
.Ladd_yields_zero:
	// We had full cancellation.  Return +/- zero (-0 when round-down)
	{
		TMP = USR
		A = #0
	}
	{
		TMP = extractu(TMP,#2,#SR_ROUND_OFF)
		PP_HH = memd(r29+#0)
		EXPCA = memd(r29+#8)
	}
	{
		p0 = cmp.eq(TMP,#2)
		if (p0.new) AH = ##0x80000000
		dealloc_return
	}
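	// This matches the IEEE 754 rule for exact cancellation: a sum that
	// is exactly zero is +0 in every rounding mode except round toward
	// negative infinity, which yields -0. A hedged C restatement, using
	// the USR rounding encoding this file tests (2 = round down):
	//
	//   #include <stdint.h>
	//   static uint64_t cancelled_zero_bits(unsigned sr_round) {
	//       return (sr_round == 2) ? (1ull << 63) : 0;   // -0.0 : +0.0
	//   }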

#undef RIGHTLEFTSHIFT
#undef RIGHTSHIFT
#undef LEFTSHIFT
#undef CTMP2
#undef CTMP2H
#undef CTMP2L

.Lfma_ovf_unf:
	{
		p0 = cmp.gtu(ONE,CTMP)
		if (p0.new) jump:nt .Ladd_yields_zero
	}
	{
		A = convert_d2df(CTMP)
		EXPA = add(EXPA,#-BIAS-60)
		TMP = EXPA
	}
#define NEW_EXPB r7
#define NEW_EXPA r6
	{
		AH += asl(EXPA,#HI_MANTBITS)
		NEW_EXPB = extractu(AH,#EXPBITS,#HI_MANTBITS)
	}
	{
		NEW_EXPA = add(EXPA,NEW_EXPB)
		PP_HH = memd(r29+#0)
		EXPCA = memd(r29+#8)
#undef PP_HH
#undef PP_HH_H
#undef PP_HH_L
#undef EXPCA
#undef EXPC
#undef EXPA
#undef PP_LL
#undef PP_LL_H
#undef PP_LL_L
#define EXPA r6
#define EXPB r7
#define EXPBA r7:6
#define ATMP r9:8
#define ATMPH r9
#define ATMPL r8
#undef NEW_EXPB
#undef NEW_EXPA
		ATMP = abs(CTMP)
	}
	{
		p0 = cmp.gt(EXPA,##BIAS+BIAS)
		if (p0.new) jump:nt .Lfma_ovf
	}
	{
		p0 = cmp.gt(EXPA,#0)
		if (p0.new) jump:nt .Lpossible_unf
	}
	{
		// TMP has original EXPA.
		// ATMP is corresponding value
		// Normalize ATMP and shift right to correct location
		EXPB = add(clb(ATMP),#-2)		// Amount to left shift to normalize
		EXPA = sub(#1+5,TMP)			// Amount to right shift to denormalize
		p3 = cmp.gt(CTMPH,#-1)
	}
	// Underflow
	// We know that the infinite-range exponent should be EXPA
	// CTMP is 2's complement, ATMP is abs(CTMP)
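	// Hedged C sketch of the denormalization idea used below (names are
	// ours; the real code also sets a FUDGE bit and lets convert_d2df do
	// the rounding): shift the magnitude right to its subnormal position,
	// keep any lost bits as a sticky LSB, and raise underflow|inexact
	// when bits were lost, as the 0x0030 OR into USR does here.
	//
	//   #include <stdint.h>
	//   #include <fenv.h>
	//
	//   static uint64_t denorm_shift(uint64_t mant, unsigned shift) {
	//       if (shift >= 64) {
	//           if (mant) feraiseexcept(FE_UNDERFLOW | FE_INEXACT);
	//           return mant ? 1 : 0;     // everything became sticky
	//       }
	//       uint64_t out = mant >> shift;
	//       if (mant & ((1ull << shift) - 1)) {
	//           out |= 1;                // sticky LSB
	//           feraiseexcept(FE_UNDERFLOW | FE_INEXACT);
	//       }
	//       return out;
	//   }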
	{
		EXPA = add(EXPA,EXPB)		// how much to shift back right
		ATMP = asl(ATMP,EXPB)		// shift left
		AH = USR
		TMP = #63
	}
	{
		EXPB = min(EXPA,TMP)
		EXPA = #0
		AL = #0x0030
	}
	{
		B = extractu(ATMP,EXPBA)
		ATMP = asr(ATMP,EXPB)
	}
	{
		p0 = cmp.gtu(ONE,B)
		if (!p0.new) ATMPL = or(ATMPL,S_ONE)
		ATMPH = setbit(ATMPH,#HI_MANTBITS+FUDGE2)
	}
	{
		CTMP = neg(ATMP)
		p1 = bitsclr(ATMPL,#(1<<FUDGE2)-1)
		if (!p1.new) AH = or(AH,AL)
		B = #0
	}
	{
		if (p3) CTMP = ATMP
		USR = AH
		TMP = #-BIAS-(MANTBITS+FUDGE2)
	}
	{
		A = convert_d2df(CTMP)
	}
	{
		AH += asl(TMP,#HI_MANTBITS)
		dealloc_return
	}
.Lpossible_unf:
	{
		TMP = ##0x7fefffff
		ATMP = abs(CTMP)
	}
	{
		p0 = cmp.eq(AL,#0)
		p0 = bitsclr(AH,TMP)
		if (!p0.new) dealloc_return:t
		TMP = #0x7fff
	}
	{
		p0 = bitsset(ATMPH,TMP)
		BH = USR
		BL = #0x0030
	}
	{
		if (p0) BH = or(BH,BL)
	}
	{
		USR = BH
	}
	{
		p0 = dfcmp.eq(A,A)
		dealloc_return
	}
.Lfma_ovf:
	{
		TMP = USR
		CTMP = combine(##0x7fefffff,#-1)
		A = CTMP
	}
	{
		ATMP = combine(##0x7ff00000,#0)
		BH = extractu(TMP,#2,#SR_ROUND_OFF)
		TMP = or(TMP,#0x28)
	}
	{
		USR = TMP
		BH ^= lsr(AH,#31)
		BL = BH
	}
	{
		p0 = !cmp.eq(BL,#1)
		p0 = !cmp.eq(BH,#2)
	}
	{
		p0 = dfcmp.eq(ATMP,ATMP)
		if (p0.new) CTMP = ATMP
	}
	{
		A = insert(CTMP,#63,#0)
		dealloc_return
	}
#undef CTMP
#undef CTMPH
#undef CTMPL
#define BTMP r11:10
#define BTMPH r11
#define BTMPL r10

#undef STICKIES
#undef STICKIESH
#undef STICKIESL
#define C r5:4
#define CH r5
#define CL r4

.Lfma_abnormal_ab:
	{
		ATMP = extractu(A,#63,#0)
		BTMP = extractu(B,#63,#0)
		deallocframe
	}
	{
		p3 = cmp.gtu(ATMP,BTMP)
		if (!p3.new) A = B		// sort values
		if (!p3.new) B = A
	}
	{
		p0 = dfclass(A,#0x0f)		// A NaN?
		if (!p0.new) jump:nt .Lnan
		if (!p3) ATMP = BTMP
		if (!p3) BTMP = ATMP
	}
	{
		p1 = dfclass(A,#0x08)		// A is infinity
		p1 = dfclass(B,#0x0e)		// B is nonzero
	}
	{
		p0 = dfclass(A,#0x08)		// a is inf
		p0 = dfclass(B,#0x01)		// b is zero
	}
	{
		if (p1) jump .Lab_inf
		p2 = dfclass(B,#0x01)
	}
	{
		if (p0) jump .Linvalid
		if (p2) jump .Lab_true_zero
		TMP = ##0x7c000000
	}
	// We are left with a normal or subnormal times a subnormal, A > B.
	// If A and B are both very small, we will collapse to a single sticky bit; replace the
	// lower 63 bits of both A and B with 0x0010_0000_0000_0000, which yields equivalent results.
	// If A and B might multiply to something bigger, decrease A's exponent, increase B's exponent,
	// and start over.
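	// Hedged C sketch of the rescale path below (names are ours): shift
	// B's leading mantissa bit up to the implicit-one position, which
	// re-encodes the same significand as a normal number, and take the
	// same shift off A's exponent field so the product is unchanged.
	// The guard above ensures A's exponent is large enough to absorb it.
	//
	//   #include <stdint.h>
	//
	//   static void rescale(uint64_t *abits, uint64_t *bbits) {
	//       uint64_t bmag = *bbits & ~(1ull << 63);       // B subnormal
	//       int k = __builtin_clzll(bmag) - 11;           // EXPBITS
	//       *bbits = (*bbits & (1ull << 63)) | (bmag << k);
	//       *abits -= (uint64_t)k << 52;                  // scale A down
	//   }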
	{
		p0 = bitsclr(AH,TMP)
		if (p0.new) jump:nt .Lfma_ab_tiny
	}
	{
		TMP = add(clb(BTMP),#-EXPBITS)
	}
	{
		BTMP = asl(BTMP,TMP)
	}
	{
		B = insert(BTMP,#63,#0)
		AH -= asl(TMP,#HI_MANTBITS)
	}
	jump fma

.Lfma_ab_tiny:
	ATMP = combine(##0x00100000,#0)
	{
		A = insert(ATMP,#63,#0)
		B = insert(ATMP,#63,#0)
	}
	jump fma

.Lab_inf:
	{
		B = lsr(B,#63)
		p0 = dfclass(C,#0x10)
	}
	{
		A ^= asl(B,#63)
		if (p0) jump .Lnan
	}
	{
		p1 = dfclass(C,#0x08)
		if (p1.new) jump:nt .Lfma_inf_plus_inf
	}
	// A*B is +/- inf, C is finite.  Return A
	{
		jumpr r31
	}
	.falign
.Lfma_inf_plus_inf:
	{	// adding infinities of different signs is invalid
		p0 = dfcmp.eq(A,C)
		if (!p0.new) jump:nt .Linvalid
	}
	{
		jumpr r31
	}

.Lnan:
	{
		p0 = dfclass(B,#0x10)
		p1 = dfclass(C,#0x10)
		if (!p0.new) B = A
		if (!p1.new) C = A
	}
	{	// find sNaNs
		BH = convert_df2sf(B)
		BL = convert_df2sf(C)
	}
	{
		BH = convert_df2sf(A)
		A = #-1
		jumpr r31
	}
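	// Hedged C analog of the NaN path above (name is ours, and A is
	// already a NaN on entry here): converting each operand raises
	// "invalid" for any signaling NaN, and A = #-1 produces the
	// canonical all-ones quiet NaN pattern.
	//
	//   #include <math.h>
	//   #include <stdint.h>
	//   #include <string.h>
	//
	//   static double fma_nan(double a, double b, double c) {
	//       if (!isnan(b)) b = a;            // non-NaN operands become A
	//       if (!isnan(c)) c = a;
	//       volatile float sink;
	//       sink = (float)b; sink = (float)c; sink = (float)a;
	//       (void)sink;                      // sNaN inputs -> FE_INVALID
	//       uint64_t qnan = ~(uint64_t)0;    // all-ones quiet NaN
	//       double r; memcpy(&r, &qnan, sizeof r);
	//       return r;
	//   }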

.Linvalid:
	{
		TMP = ##0x7f800001		// single-precision sNaN
	}
	{
		A = convert_sf2df(TMP)
		jumpr r31
	}

.Lab_true_zero:
	// B is zero, A is a finite number
	{
		p0 = dfclass(C,#0x10)
		if (p0.new) jump:nt .Lnan
		if (p0.new) A = C
	}
	{
		p0 = dfcmp.eq(B,C)		// is C also zero?
		AH = lsr(AH,#31)		// get sign
	}
	{
		BH ^= asl(AH,#31)		// form correctly signed zero in B
		if (!p0) A = C			// If C is not zero, return C
		if (!p0) jumpr r31
	}
	// B has correctly signed zero, C is also zero
.Lzero_plus_zero:
	{
		p0 = cmp.eq(B,C)		// yes, scalar equals.  +0++0 or -0+-0
		if (p0.new) jumpr:t r31
		A = B
	}
	{
		TMP = USR
	}
	{
		TMP = extractu(TMP,#2,#SR_ROUND_OFF)
		A = #0
	}
	{
		p0 = cmp.eq(TMP,#2)
		if (p0.new) AH = ##0x80000000
		jumpr r31
	}
#undef BTMP
#undef BTMPH
#undef BTMPL
#define CTMP r11:10
	.falign
.Lfma_abnormal_c:
	// We know that AB is normal * normal
	// C is not normal: zero, subnormal, inf, or NaN.
	{
		p0 = dfclass(C,#0x10)		// is C NaN?
		if (p0.new) jump:nt .Lnan
		if (p0.new) A = C		// move NaN to A
		deallocframe
	}
	{
		p0 = dfclass(C,#0x08)		// is C inf?
		if (p0.new) A = C		// return C
		if (p0.new) jumpr:nt r31
	}
	// zero or subnormal
	// If we have a zero, and we know AB is normal*normal, we can just call normal multiply
	{
		p0 = dfclass(C,#0x01)		// is C zero?
		if (p0.new) jump:nt __hexagon_muldf3
		TMP = #1
	}
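	// Hedged C analog of that shortcut (name is ours): with a and b
	// normal, the exact product is never zero, so a zero addend can
	// change neither the rounded value nor the sign of the result.
	//
	//   static double fma_zero_addend(double a, double b, double c) {
	//       (void)c;         // c is +0.0 or -0.0 here
	//       return a * b;    // plays the role of the __hexagon_muldf3 call
	//   }
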
	// Left with: subnormal
	// Adjust C and jump back to restart
	{
		allocframe(#STACKSPACE)		// oops, deallocated above, re-allocate frame
		CTMP = #0
		CH = insert(TMP,#EXPBITS,#HI_MANTBITS)
		jump .Lfma_abnormal_c_restart
	}
END(fma)
