xref: /titanic_51/usr/src/lib/libmvec/common/vis/__vatan.S (revision 25c28e83beb90e7c80452a7c818c5e6f73a07dc8)
1*25c28e83SPiotr Jasiukajtis/*
2*25c28e83SPiotr Jasiukajtis * CDDL HEADER START
3*25c28e83SPiotr Jasiukajtis *
4*25c28e83SPiotr Jasiukajtis * The contents of this file are subject to the terms of the
5*25c28e83SPiotr Jasiukajtis * Common Development and Distribution License (the "License").
6*25c28e83SPiotr Jasiukajtis * You may not use this file except in compliance with the License.
7*25c28e83SPiotr Jasiukajtis *
8*25c28e83SPiotr Jasiukajtis * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9*25c28e83SPiotr Jasiukajtis * or http://www.opensolaris.org/os/licensing.
10*25c28e83SPiotr Jasiukajtis * See the License for the specific language governing permissions
11*25c28e83SPiotr Jasiukajtis * and limitations under the License.
12*25c28e83SPiotr Jasiukajtis *
13*25c28e83SPiotr Jasiukajtis * When distributing Covered Code, include this CDDL HEADER in each
14*25c28e83SPiotr Jasiukajtis * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15*25c28e83SPiotr Jasiukajtis * If applicable, add the following below this CDDL HEADER, with the
16*25c28e83SPiotr Jasiukajtis * fields enclosed by brackets "[]" replaced with your own identifying
17*25c28e83SPiotr Jasiukajtis * information: Portions Copyright [yyyy] [name of copyright owner]
18*25c28e83SPiotr Jasiukajtis *
19*25c28e83SPiotr Jasiukajtis * CDDL HEADER END
20*25c28e83SPiotr Jasiukajtis */
21*25c28e83SPiotr Jasiukajtis/*
22*25c28e83SPiotr Jasiukajtis * Copyright 2011 Nexenta Systems, Inc.  All rights reserved.
23*25c28e83SPiotr Jasiukajtis */
24*25c28e83SPiotr Jasiukajtis/*
25*25c28e83SPiotr Jasiukajtis * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
26*25c28e83SPiotr Jasiukajtis * Use is subject to license terms.
27*25c28e83SPiotr Jasiukajtis */
28*25c28e83SPiotr Jasiukajtis
29*25c28e83SPiotr Jasiukajtis	.file	"__vatan.S"
30*25c28e83SPiotr Jasiukajtis
31*25c28e83SPiotr Jasiukajtis#include "libm.h"
32*25c28e83SPiotr Jasiukajtis
33*25c28e83SPiotr Jasiukajtis	RO_DATA
34*25c28e83SPiotr Jasiukajtis
35*25c28e83SPiotr Jasiukajtis! following is the C version of the ATAN algorithm
36*25c28e83SPiotr Jasiukajtis!  #include <math.h>
37*25c28e83SPiotr Jasiukajtis!  #include <stdio.h>
38*25c28e83SPiotr Jasiukajtis!  double jkatan(double *x)
39*25c28e83SPiotr Jasiukajtis!  {
40*25c28e83SPiotr Jasiukajtis!    double  f, z, ans, ansu, ansl, tmp, poly, conup, conlo, dummy;
41*25c28e83SPiotr Jasiukajtis!    int index, sign, intf, intz;
42*25c28e83SPiotr Jasiukajtis!    extern const double __vlibm_TBL_atan1[];
43*25c28e83SPiotr Jasiukajtis!    long *pf = (long *) &f, *pz = (long *) &z;
44*25c28e83SPiotr Jasiukajtis!
45*25c28e83SPiotr Jasiukajtis!  /*    Power series  atan(x) = x + p1*x**3 + p2*x**5 + p3*x**7
46*25c28e83SPiotr Jasiukajtis!   *    Error =  -3.08254E-18   On the interval  |x| < 1/64 */
47*25c28e83SPiotr Jasiukajtis!
48*25c28e83SPiotr Jasiukajtis!  /* define dummy names for readability.  Use parray to help compiler optimize loads */
49*25c28e83SPiotr Jasiukajtis!  #define p3    parray[0]
50*25c28e83SPiotr Jasiukajtis!  #define p2    parray[1]
51*25c28e83SPiotr Jasiukajtis!  #define p1    parray[2]
52*25c28e83SPiotr Jasiukajtis!  #define soffset      3
53*25c28e83SPiotr Jasiukajtis!
54*25c28e83SPiotr Jasiukajtis!    static const double parray[] = {
55*25c28e83SPiotr Jasiukajtis!     -1.428029046844299722E-01,		/* p[3]		*/
56*25c28e83SPiotr Jasiukajtis!      1.999999917247000615E-01, 		/* p[2]		*/
57*25c28e83SPiotr Jasiukajtis!     -3.333333333329292858E-01, 		/* p[1]		*/
58*25c28e83SPiotr Jasiukajtis!      1.0, 				/* not used for p[0], though		*/
59*25c28e83SPiotr Jasiukajtis!     -1.0,				/* used to flip sign of answer 		*/
60*25c28e83SPiotr Jasiukajtis!    };
61*25c28e83SPiotr Jasiukajtis!
62*25c28e83SPiotr Jasiukajtis!    f        = *x;				/* fetch argument		*/
63*25c28e83SPiotr Jasiukajtis!    intf     = pf[0];				/* grab upper half		*/
64*25c28e83SPiotr Jasiukajtis!    sign     = intf & 0x80000000;			/* sign of argument		*/
65*25c28e83SPiotr Jasiukajtis!    intf    ^= sign;				/* abs(upper argument)		*/
66*25c28e83SPiotr Jasiukajtis!    sign     = (unsigned) sign >> 31;		/* sign bit = 0 or 1		*/
67*25c28e83SPiotr Jasiukajtis!    pf[0]    = intf;
68*25c28e83SPiotr Jasiukajtis!
69*25c28e83SPiotr Jasiukajtis!    if( (intf > 0x43600000) || (intf < 0x3e300000) ) /* filter out special cases */
70*25c28e83SPiotr Jasiukajtis!    {
71*25c28e83SPiotr Jasiukajtis!      if(  (intf > 0x7ff00000) ||
72*25c28e83SPiotr Jasiukajtis!        ((intf == 0x7ff00000) &&  (pf[1] !=0)) ) return (*x-*x);/* return NaN if x=NaN*/
73*25c28e83SPiotr Jasiukajtis!      if( intf < 0x3e300000 ) 			/* avoid underflow for small arg */
74*25c28e83SPiotr Jasiukajtis!      {
75*25c28e83SPiotr Jasiukajtis!        dummy = 1.0e37 + f;
76*25c28e83SPiotr Jasiukajtis!        dummy = dummy;
77*25c28e83SPiotr Jasiukajtis!        return (*x);
78*25c28e83SPiotr Jasiukajtis!      }
79*25c28e83SPiotr Jasiukajtis!      if( intf > 0x43600000 )			/* avoid underflow for big arg  */
80*25c28e83SPiotr Jasiukajtis!      {
81*25c28e83SPiotr Jasiukajtis!        index = 2;
82*25c28e83SPiotr Jasiukajtis!        f     = __vlibm_TBL_atan1[index] + __vlibm_TBL_atan1[index+1];/* pi/2 up + pi/2 low   */
83*25c28e83SPiotr Jasiukajtis!        f     = parray[soffset + sign] * f;	/* put sign bit on ans		*/
84*25c28e83SPiotr Jasiukajtis!        return (f);
85*25c28e83SPiotr Jasiukajtis!      }
86*25c28e83SPiotr Jasiukajtis!    }
87*25c28e83SPiotr Jasiukajtis!
88*25c28e83SPiotr Jasiukajtis!    index    = 0;				/* points to 0,0 in table	*/
89*25c28e83SPiotr Jasiukajtis!    if (intf > 0x40500000)			/* if(|x| > 64               	*/
90*25c28e83SPiotr Jasiukajtis!    { f = -1.0/f;
91*25c28e83SPiotr Jasiukajtis!      index  = 2; 				/* point to pi/2 upper, lower	*/
92*25c28e83SPiotr Jasiukajtis!    }
93*25c28e83SPiotr Jasiukajtis!    else if( intf >= 0x3f900000 )		/* if |x| >= (1/64)... 		*/
94*25c28e83SPiotr Jasiukajtis!    {
95*25c28e83SPiotr Jasiukajtis!      intz   = (intf + 0x00008000) & 0x7fff0000;/* round arg, keep upper	*/
96*25c28e83SPiotr Jasiukajtis!      pz[0]  = intz;				/* store as a double (z)	*/
97*25c28e83SPiotr Jasiukajtis!      pz[1]  = 0;				/* ...lower			*/
98*25c28e83SPiotr Jasiukajtis!      f      = (f - z)/(1.0 + f*z); 		/* get reduced argument		*/
99*25c28e83SPiotr Jasiukajtis!      index  = (intz - 0x3f900000) >> 15;	/* (index >> 16) << 1)		*/
100*25c28e83SPiotr Jasiukajtis!      index += 4;				/* skip over 0,0,pi/2,pi/2	*/
101*25c28e83SPiotr Jasiukajtis!    }
102*25c28e83SPiotr Jasiukajtis!    conup    = __vlibm_TBL_atan1[index];	/* upper table 			*/
103*25c28e83SPiotr Jasiukajtis!    conlo    = __vlibm_TBL_atan1[index+1];	/* lower table 			*/
104*25c28e83SPiotr Jasiukajtis!    tmp      = f*f;
105*25c28e83SPiotr Jasiukajtis!    poly     = (f*tmp)*((p3*tmp + p2)*tmp + p1);
106*25c28e83SPiotr Jasiukajtis!    ansu     = conup + f;     			/* compute atan(f)  upper 	*/
107*25c28e83SPiotr Jasiukajtis!    ansl     = (((conup - ansu) + f) + poly) + conlo;
108*25c28e83SPiotr Jasiukajtis!    ans      = ansu + ansl;
109*25c28e83SPiotr Jasiukajtis!    ans      = parray[soffset + sign] * ans;
110*25c28e83SPiotr Jasiukajtis!    return ans;
111*25c28e83SPiotr Jasiukajtis!  }
112*25c28e83SPiotr Jasiukajtis
113*25c28e83SPiotr Jasiukajtis/* 8 bytes = 1 double f.p. word */
114*25c28e83SPiotr Jasiukajtis#define WSIZE	8
115*25c28e83SPiotr Jasiukajtis
116*25c28e83SPiotr Jasiukajtis	.align	32			!align with full D-cache line
117*25c28e83SPiotr Jasiukajtis.COEFFS:
118*25c28e83SPiotr Jasiukajtis	.double	0r-1.428029046844299722E-01 	!p[3]
119*25c28e83SPiotr Jasiukajtis        .double 0r1.999999917247000615E-01 	!p[2]
120*25c28e83SPiotr Jasiukajtis        .double 0r-3.333333333329292858E-01	!p[1]
121*25c28e83SPiotr Jasiukajtis        .double 0r-1.0, 			!constant -1.0
122*25c28e83SPiotr Jasiukajtis	.word   0x00008000,0x0			!for fp rounding of reduced arg
123*25c28e83SPiotr Jasiukajtis	.word   0x7fff0000,0x0			!for fp truncation
124*25c28e83SPiotr Jasiukajtis	.word	0x47900000,0			!a number close to 1.0E37
125*25c28e83SPiotr Jasiukajtis	.word   0x80000000,0x0			!mask for fp sign bit
126*25c28e83SPiotr Jasiukajtis	.word   0x3f800000,0x0			!1.0/128.0 dummy "safe" argument
127*25c28e83SPiotr Jasiukajtis	.type	.COEFFS,#object
128*25c28e83SPiotr Jasiukajtis
129*25c28e83SPiotr Jasiukajtis	ENTRY(__vatan)
130*25c28e83SPiotr Jasiukajtis	save	%sp,-SA(MINFRAME)-16,%sp
131*25c28e83SPiotr Jasiukajtis	PIC_SETUP(g5)
132*25c28e83SPiotr Jasiukajtis	PIC_SET(g5,__vlibm_TBL_atan1,o4)
133*25c28e83SPiotr Jasiukajtis	PIC_SET(g5,.COEFFS,o0)
134*25c28e83SPiotr Jasiukajtis/*
135*25c28e83SPiotr Jasiukajtis   __vatan(int n, double *x, int stridex, double *y, stridey)
136*25c28e83SPiotr Jasiukajtis   computes y(i) = atan( x(i) ), for 1=1,n.  Stridex, stridey
137*25c28e83SPiotr Jasiukajtis   are the distance between x and y elements
138*25c28e83SPiotr Jasiukajtis
139*25c28e83SPiotr Jasiukajtis	%i0    n
140*25c28e83SPiotr Jasiukajtis	%i1    address of x
141*25c28e83SPiotr Jasiukajtis	%i2    stride x
142*25c28e83SPiotr Jasiukajtis	%i3    address of y
143*25c28e83SPiotr Jasiukajtis	%i4    stride y
144*25c28e83SPiotr Jasiukajtis*/
145*25c28e83SPiotr Jasiukajtis	cmp	%i0,0			!if n <=0,
146*25c28e83SPiotr Jasiukajtis	ble,pn	%icc,.RETURN		!....then do nothing
147*25c28e83SPiotr Jasiukajtis	sll	%i2,3,%i2		!convert stride to byte count
148*25c28e83SPiotr Jasiukajtis	sll	%i4,3,%i4		!convert stride to byte count
149*25c28e83SPiotr Jasiukajtis
150*25c28e83SPiotr Jasiukajtis/*  pre-load constants before beginning main loop */
151*25c28e83SPiotr Jasiukajtis
152*25c28e83SPiotr Jasiukajtis	ldd	[%o0],%f58		!load p[3]
153*25c28e83SPiotr Jasiukajtis	mov	2,%i5			!argcount = 3
154*25c28e83SPiotr Jasiukajtis
155*25c28e83SPiotr Jasiukajtis	ldd	[%o0+WSIZE],%f60	!load p[2]
156*25c28e83SPiotr Jasiukajtis	add	%fp,STACK_BIAS-8,%l1		!yaddr1 = &dummy
157*25c28e83SPiotr Jasiukajtis	fzero   %f18			!ansu1 = 0
158*25c28e83SPiotr Jasiukajtis
159*25c28e83SPiotr Jasiukajtis	ldd	[%o0+2*WSIZE],%f62	!load p[1]
160*25c28e83SPiotr Jasiukajtis	add	%fp,STACK_BIAS-8,%l2		!yaddr2 = &dummy
161*25c28e83SPiotr Jasiukajtis	fzero   %f12			!(poly1) = 0
162*25c28e83SPiotr Jasiukajtis
163*25c28e83SPiotr Jasiukajtis	ldd 	[%o0+3*WSIZE],%f56	!-1.0
164*25c28e83SPiotr Jasiukajtis	fzero   %f14			!tmp1 = 0
165*25c28e83SPiotr Jasiukajtis
166*25c28e83SPiotr Jasiukajtis	ldd     [%o0+4*WSIZE],%f52	!load rounding mask
167*25c28e83SPiotr Jasiukajtis	fzero   %f16			!conup1 = 0
168*25c28e83SPiotr Jasiukajtis
169*25c28e83SPiotr Jasiukajtis	ldd     [%o0+5*WSIZE],%f54	!load truncation mask
170*25c28e83SPiotr Jasiukajtis	fzero   %f36			!f1 = 0
171*25c28e83SPiotr Jasiukajtis
172*25c28e83SPiotr Jasiukajtis	ldd	[%o0+6*WSIZE],%f50	!1.0e37
173*25c28e83SPiotr Jasiukajtis	fzero   %f38			!f2 = 0
174*25c28e83SPiotr Jasiukajtis
175*25c28e83SPiotr Jasiukajtis	ldd	[%o0+7*WSIZE],%f32	!mask for sign bit
176*25c28e83SPiotr Jasiukajtis
177*25c28e83SPiotr Jasiukajtis	ldd	[%o4+2*WSIZE],%f46	!pi/2 upper
178*25c28e83SPiotr Jasiukajtis	ldd	[%o4+(2*WSIZE+8)],%f48	!pi/2 lower
179*25c28e83SPiotr Jasiukajtis	sethi	%hi(0x40500000),%l6	!64.0
180*25c28e83SPiotr Jasiukajtis	sethi	%hi(0x3f900000),%l7	!1/64.0
181*25c28e83SPiotr Jasiukajtis	mov     0,%l4			!index1 = 0
182*25c28e83SPiotr Jasiukajtis	mov     0,%l5			!index2 = 0
183*25c28e83SPiotr Jasiukajtis
184*25c28e83SPiotr Jasiukajtis.MAINLOOP:
185*25c28e83SPiotr Jasiukajtis
186*25c28e83SPiotr Jasiukajtis    /*--------------------------------------------------------------------------*/
187*25c28e83SPiotr Jasiukajtis    /*--------------------------------------------------------------------------*/
188*25c28e83SPiotr Jasiukajtis    /*--------------------------------------------------------------------------*/
189*25c28e83SPiotr Jasiukajtis
190*25c28e83SPiotr Jasiukajtis.LOOP0:
191*25c28e83SPiotr Jasiukajtis        deccc   %i0                     !--n
192*25c28e83SPiotr Jasiukajtis	bneg	1f
193*25c28e83SPiotr Jasiukajtis        mov	%i1,%o5			!xuse = x (delay slot)
194*25c28e83SPiotr Jasiukajtis
195*25c28e83SPiotr Jasiukajtis	ba	2f
196*25c28e83SPiotr Jasiukajtis	nop				!delay slot
197*25c28e83SPiotr Jasiukajtis1:
198*25c28e83SPiotr Jasiukajtis	PIC_SET(g5,.COEFFS+8*WSIZE,o5)
199*25c28e83SPiotr Jasiukajtis	dec	%i5			!argcount--
200*25c28e83SPiotr Jasiukajtis2:
201*25c28e83SPiotr Jasiukajtis	sethi	%hi(0x80000000),%o7	!mask for sign bit
202*25c28e83SPiotr Jasiukajtis/*2 */  sethi	%hi(0x43600000),%o1	!big = 0x43600000,0
203*25c28e83SPiotr Jasiukajtis	ld	[%o5],%o0		!intf = pf[0] = f upper
204*25c28e83SPiotr Jasiukajtis	ldd     [%o4+%l5],%f26		!conup2 = __vlibm_TBL_atan1[index2]
205*25c28e83SPiotr Jasiukajtis
206*25c28e83SPiotr Jasiukajtis	sethi	%hi(0x3e300000),%o2	!small = 0x3e300000,0
207*25c28e83SPiotr Jasiukajtis/*4 */  andn	%o0,%o7,%o0		!intf = fabs(intf)
208*25c28e83SPiotr Jasiukajtis	ldd 	[%o5],%f34		!f = *x into f34
209*25c28e83SPiotr Jasiukajtis
210*25c28e83SPiotr Jasiukajtis	sub	%o1,%o0,%o1		!(-) if intf > big
211*25c28e83SPiotr Jasiukajtis/*6 */  sub	%o0,%o2,%o2		!(-) if intf < small
212*25c28e83SPiotr Jasiukajtis	fand 	%f34,%f32,%f40		!sign0 = sign bit
213*25c28e83SPiotr Jasiukajtis            fmuld   %f38,%f38,%f24      !tmp2= f2*f2
214*25c28e83SPiotr Jasiukajtis
215*25c28e83SPiotr Jasiukajtis/*7 */  orcc	%o1,%o2,%g0		!(-) if either true
216*25c28e83SPiotr Jasiukajtis 	bneg,pn  %icc,.SPECIAL0		!if (-) goto special cases below
217*25c28e83SPiotr Jasiukajtis	fabsd	%f34,%f34		!abs(f)  (delay slot)
218*25c28e83SPiotr Jasiukajtis	!----------------------
219*25c28e83SPiotr Jasiukajtis
220*25c28e83SPiotr Jasiukajtis
221*25c28e83SPiotr Jasiukajtis	sethi   %hi(0x8000),%o7		!rounding bit
222*25c28e83SPiotr Jasiukajtis/*8 */  fpadd32 %f34,%f52,%f0		!intf + 0x00008000 (again)
223*25c28e83SPiotr Jasiukajtis            faddd   %f26,%f38,%f28      !ansu2 = conup2 + f2
224*25c28e83SPiotr Jasiukajtis
225*25c28e83SPiotr Jasiukajtis 	add	%o0,%o7,%o0		!intf + 0x00008000  (delay slot)
226*25c28e83SPiotr Jasiukajtis/*9*/   fand    %f0,%f54,%f0		!pz[0] = intz = (intf + 0x00008000) & 0x7fff0000 (again)
227*25c28e83SPiotr Jasiukajtis            fmuld   %f58,%f24,%f22      !p[3]*tmp2
228*25c28e83SPiotr Jasiukajtis
229*25c28e83SPiotr Jasiukajtis/*10 */ sethi   %hi(0x7fff0000),%o7     !mask for rounding argument
230*25c28e83SPiotr Jasiukajtis	fmuld	%f34,%f0,%f10		!f*z
231*25c28e83SPiotr Jasiukajtis	fsubd	%f34,%f0,%f20		!f - z
232*25c28e83SPiotr Jasiukajtis          add     %o4,%l4,%l4           !base addr + index1
233*25c28e83SPiotr Jasiukajtis          fmuld   %f14,%f12,%f12        !poly1 = (f1*tmp1)*((p3*tmp1 + p2)*tmp1 + p1)
234*25c28e83SPiotr Jasiukajtis          faddd   %f16,%f36,%f16        !(conup1 - ansu1) + f1
235*25c28e83SPiotr Jasiukajtis
236*25c28e83SPiotr Jasiukajtis/*12 */ and	%o0,%o7,%o0		!intz = (intf + 0x00008000) & 0x7fff0000
237*25c28e83SPiotr Jasiukajtis            faddd   %f22,%f60,%f22      !p[3]*tmp2 + p[2]
238*25c28e83SPiotr Jasiukajtis          ldd     [%l4+WSIZE],%f14      !conlo1 = __vlibm_TBL_atan1[index+1]
239*25c28e83SPiotr Jasiukajtis
240*25c28e83SPiotr Jasiukajtis/*13 */ sub	%o0,%l7,%o2		!intz - 0x3f900000
241*25c28e83SPiotr Jasiukajtis	fsubd	%f10,%f56,%f10		!(f*z - (-1.0))
242*25c28e83SPiotr Jasiukajtis          faddd   %f16,%f12,%f12        !((conup1 - ansu1) + f1) + poly1
243*25c28e83SPiotr Jasiukajtis
244*25c28e83SPiotr Jasiukajtis	cmp	%o0,%l6			!(|f| > 64)
245*25c28e83SPiotr Jasiukajtis        ble	.ELSE0			!if(|f| > 64) then
246*25c28e83SPiotr Jasiukajtis/*15 */ sra	  %o2,15,%o3		!index  = (intz - 0x3f900000) >> 15
247*25c28e83SPiotr Jasiukajtis	  mov	  2,%o1			!index == 2, point to conup, conlo = pi/2 upper, lower
248*25c28e83SPiotr Jasiukajtis          ba	  .ENDIF0		!continue
249*25c28e83SPiotr Jasiukajtis/*16 */   fdivd	  %f56,%f34,%f34	!f = -1.0/f (delay slot)
250*25c28e83SPiotr Jasiukajtis        .ELSE0:				!else f( |x| >= (1/64))
251*25c28e83SPiotr Jasiukajtis	cmp	%o0,%l7			!if intf >= 1/64
252*25c28e83SPiotr Jasiukajtis	bl  	.ENDIF0			!if( |x| >= (1/64) ) then...
253*25c28e83SPiotr Jasiukajtis	mov	0,%o1			!index == 0 , point to conup,conlo = 0,0
254*25c28e83SPiotr Jasiukajtis          add	  %o3,4,%o1		!index = index + 4
255*25c28e83SPiotr Jasiukajtis/*16 */   fdivd	  %f20,%f10,%f34	!f = (f - z)/(1.0 + f*z), reduced argument
256*25c28e83SPiotr Jasiukajtis	.ENDIF0:
257*25c28e83SPiotr Jasiukajtis
258*25c28e83SPiotr Jasiukajtis/*17*/  sll	%o1,3,%l3		!index0 = index
259*25c28e83SPiotr Jasiukajtis	mov	%i3,%l0			!yaddr0 = address of y
260*25c28e83SPiotr Jasiukajtis          faddd   %f12,%f14,%f12        !ansl1 = (((conup1 - ansu)1 + f1) + poly1) + conlo1
261*25c28e83SPiotr Jasiukajtis            fmuld   %f22,%f24,%f22      !(p3*tmp2 + p2)*tmp2
262*25c28e83SPiotr Jasiukajtis            fsubd   %f26,%f28,%f26      !conup2 - ansu2
263*25c28e83SPiotr Jasiukajtis
264*25c28e83SPiotr Jasiukajtis/*20*/  add	%i1,%i2,%i1		!x     += stridex
265*25c28e83SPiotr Jasiukajtis        add	%i3,%i4,%i3		!y     += stridey
266*25c28e83SPiotr Jasiukajtis          faddd   %f18,%f12,%f36        !ans1 = ansu1 + ansl1
267*25c28e83SPiotr Jasiukajtis            fmuld   %f38,%f24,%f24      !f*tmp2
268*25c28e83SPiotr Jasiukajtis            faddd   %f22,%f62,%f22      !(p3*tmp2 + p2)*tmp2 + p1
269*25c28e83SPiotr Jasiukajtis
270*25c28e83SPiotr Jasiukajtis/*23*/    for     %f36,%f42,%f36        !sign(ans1) = sign of argument
271*25c28e83SPiotr Jasiukajtis          std     %f36,[%l1]            !*yaddr1 = ans1
272*25c28e83SPiotr Jasiukajtis            add     %o4,%l5,%l5         !base addr + index2
273*25c28e83SPiotr Jasiukajtis            fmuld   %f24,%f22,%f22      !poly2 = (f2*tmp2)*((p3*tmp2 + p2)*tmp2 + p1)
274*25c28e83SPiotr Jasiukajtis            faddd   %f26,%f38,%f26      !(conup2 - ansu2) + f2
275*25c28e83SPiotr Jasiukajtis	cmp	%i5,0			!if argcount =0, we are done
276*25c28e83SPiotr Jasiukajtis	be	.RETURN
277*25c28e83SPiotr Jasiukajtis	  nop
278*25c28e83SPiotr Jasiukajtis
279*25c28e83SPiotr Jasiukajtis    /*--------------------------------------------------------------------------*/
280*25c28e83SPiotr Jasiukajtis    /*--------------------------------------------------------------------------*/
281*25c28e83SPiotr Jasiukajtis    /*--------------------------------------------------------------------------*/
282*25c28e83SPiotr Jasiukajtis
283*25c28e83SPiotr Jasiukajtis.LOOP1:
284*25c28e83SPiotr Jasiukajtis/*25*/  deccc   %i0                     !--n
285*25c28e83SPiotr Jasiukajtis        bneg    1f
286*25c28e83SPiotr Jasiukajtis        mov     %i1,%o5                 !xuse = x (delay slot)
287*25c28e83SPiotr Jasiukajtis        ba      2f
288*25c28e83SPiotr Jasiukajtis        nop                             !delay slot
289*25c28e83SPiotr Jasiukajtis1:
290*25c28e83SPiotr Jasiukajtis	PIC_SET(g5,.COEFFS+8*WSIZE,o5)
291*25c28e83SPiotr Jasiukajtis        dec     %i5                     !argcount--
292*25c28e83SPiotr Jasiukajtis2:
293*25c28e83SPiotr Jasiukajtis
294*25c28e83SPiotr Jasiukajtis/*26*/  sethi	%hi(0x80000000),%o7	!mask for sign bit
295*25c28e83SPiotr Jasiukajtis	sethi	%hi(0x43600000),%o1	!big = 0x43600000,0
296*25c28e83SPiotr Jasiukajtis	ld	[%o5],%o0		!intf = pf[0] = f upper
297*25c28e83SPiotr Jasiukajtis
298*25c28e83SPiotr Jasiukajtis/*28*/  sethi	%hi(0x3e300000),%o2	!small = 0x3e300000,0
299*25c28e83SPiotr Jasiukajtis	andn	%o0,%o7,%o0		!intf = fabs(intf)
300*25c28e83SPiotr Jasiukajtis	ldd 	[%o5],%f36		!f = *x into f36
301*25c28e83SPiotr Jasiukajtis
302*25c28e83SPiotr Jasiukajtis/*30*/  sub	%o1,%o0,%o1		!(-) if intf > big
303*25c28e83SPiotr Jasiukajtis	sub	%o0,%o2,%o2		!(-) if intf < small
304*25c28e83SPiotr Jasiukajtis	fand 	%f36,%f32,%f42		!sign1 = sign bit
305*25c28e83SPiotr Jasiukajtis
306*25c28e83SPiotr Jasiukajtis/*31*/  orcc	%o1,%o2,%g0		!(-) if either true
307*25c28e83SPiotr Jasiukajtis 	bneg,pn  %icc,.SPECIAL1		!if (-) goto special cases below
308*25c28e83SPiotr Jasiukajtis	fabsd	%f36,%f36		!abs(f)  (delay slot)
309*25c28e83SPiotr Jasiukajtis	!----------------------
310*25c28e83SPiotr Jasiukajtis
311*25c28e83SPiotr Jasiukajtis/*32*/  fpadd32 %f36,%f52,%f0		!intf + 0x00008000 (again)
312*25c28e83SPiotr Jasiukajtis          ldd     [%l5+WSIZE],%f24      !conlo2 = __vlibm_TBL_atan1[index2+1]
313*25c28e83SPiotr Jasiukajtis
314*25c28e83SPiotr Jasiukajtis/*33*/  fand    %f0,%f54,%f0		!pz[0] = intz = (intf + 0x00008000) & 0x7fff0000 (again)
315*25c28e83SPiotr Jasiukajtis        sethi   %hi(0x8000),%o7		!rounding bit
316*25c28e83SPiotr Jasiukajtis          faddd   %f26,%f22,%f22        !((conup2 - ansu2) + f2) + poly2
317*25c28e83SPiotr Jasiukajtis
318*25c28e83SPiotr Jasiukajtis/*34*/  add	%o0,%o7,%o0		!intf + 0x00008000  (delay slot)
319*25c28e83SPiotr Jasiukajtis	sethi   %hi(0x7fff0000),%o7	!mask for rounding argument
320*25c28e83SPiotr Jasiukajtis	fmuld	%f36,%f0,%f10		!f*z
321*25c28e83SPiotr Jasiukajtis	fsubd	%f36,%f0,%f20		!f - z
322*25c28e83SPiotr Jasiukajtis
323*25c28e83SPiotr Jasiukajtis/*35*/  and	%o0,%o7,%o0		!intz = (intf + 0x00008000) & 0x7fff0000
324*25c28e83SPiotr Jasiukajtis          faddd   %f22,%f24,%f22        !ansl2 = (((conup2 - ansu2) + f2) + poly2) + conlo2
325*25c28e83SPiotr Jasiukajtis
326*25c28e83SPiotr Jasiukajtis/*37*/  sub	%o0,%l7,%o2		!intz - 0x3f900000
327*25c28e83SPiotr Jasiukajtis	fsubd	%f10,%f56,%f10		!(f*z - (-1.0))
328*25c28e83SPiotr Jasiukajtis      ldd     [%o4+%l3],%f6             !conup0 = __vlibm_TBL_atan1[index0]
329*25c28e83SPiotr Jasiukajtis
330*25c28e83SPiotr Jasiukajtis        cmp	%o0,%l6			!(|f| > 64)
331*25c28e83SPiotr Jasiukajtis	ble	.ELSE1			!if(|f| > 64) then
332*25c28e83SPiotr Jasiukajtis/*38*/  sra	%o2,15,%o3		!index  = (intz - 0x3f900000) >> 15
333*25c28e83SPiotr Jasiukajtis          mov	  2,%o1			!index == 2, point to conup, conlo = pi/2 upper, lower
334*25c28e83SPiotr Jasiukajtis	  ba	  .ENDIF1		!continue
335*25c28e83SPiotr Jasiukajtis/*40*/    fdivd	  %f56,%f36,%f36	!f = -1.0/f (delay slot)
336*25c28e83SPiotr Jasiukajtis        .ELSE1:				!else f( |x| >= (1/64))
337*25c28e83SPiotr Jasiukajtis	cmp	%o0,%l7			!if intf >= 1/64
338*25c28e83SPiotr Jasiukajtis	bl  	.ENDIF1			!if( |x| >= (1/64) ) then...
339*25c28e83SPiotr Jasiukajtis	mov	0,%o1			!index == 0 , point to conup,conlo = 0,0
340*25c28e83SPiotr Jasiukajtis          add	  %o3,4,%o1		!index = index + 4
341*25c28e83SPiotr Jasiukajtis/*40*/    fdivd	  %f20,%f10,%f36	!f = (f - z)/(1.0 + f*z), reduced argument
342*25c28e83SPiotr Jasiukajtis	.ENDIF1:
343*25c28e83SPiotr Jasiukajtis
344*25c28e83SPiotr Jasiukajtis/*41*/sll     %o1,3,%l4		 	!index1 = index
345*25c28e83SPiotr Jasiukajtis      mov     %i3,%l1			!yaddr1 = address of y
346*25c28e83SPiotr Jasiukajtis      fmuld   %f34,%f34,%f4             !tmp0= f0*f0
347*25c28e83SPiotr Jasiukajtis          faddd   %f28,%f22,%f38        !ans2 = ansu2 + ansl2
348*25c28e83SPiotr Jasiukajtis
349*25c28e83SPiotr Jasiukajtis/*44*/add	%i1,%i2,%i1		!x     += stridex
350*25c28e83SPiotr Jasiukajtis      add	%i3,%i4,%i3		!y     += stridey
351*25c28e83SPiotr Jasiukajtis    fmuld   %f58,%f4,%f2                !p[3]*tmp0
352*25c28e83SPiotr Jasiukajtis    faddd   %f6,%f34,%f8                !ansu0 = conup0 + f0
353*25c28e83SPiotr Jasiukajtis          for     %f38,%f44,%f38        !sign(ans2) = sign of argument
354*25c28e83SPiotr Jasiukajtis          std     %f38,[%l2]            !*yaddr2 = ans2
355*25c28e83SPiotr Jasiukajtis          cmp	%i5,0			!if argcount =0, we are done
356*25c28e83SPiotr Jasiukajtis	  be	.RETURN
357*25c28e83SPiotr Jasiukajtis	  nop
358*25c28e83SPiotr Jasiukajtis
359*25c28e83SPiotr Jasiukajtis    /*--------------------------------------------------------------------------*/
360*25c28e83SPiotr Jasiukajtis    /*--------------------------------------------------------------------------*/
361*25c28e83SPiotr Jasiukajtis    /*--------------------------------------------------------------------------*/
362*25c28e83SPiotr Jasiukajtis
363*25c28e83SPiotr Jasiukajtis.LOOP2:
364*25c28e83SPiotr Jasiukajtis/*46*/  deccc   %i0                     !--n
365*25c28e83SPiotr Jasiukajtis        bneg    1f
366*25c28e83SPiotr Jasiukajtis        mov     %i1,%o5                 !xuse = x (delay slot)
367*25c28e83SPiotr Jasiukajtis        ba      2f
368*25c28e83SPiotr Jasiukajtis        nop                             !delay slot
369*25c28e83SPiotr Jasiukajtis1:
370*25c28e83SPiotr Jasiukajtis	PIC_SET(g5,.COEFFS+8*WSIZE,o5)
371*25c28e83SPiotr Jasiukajtis        dec     %i5                     !argcount--
372*25c28e83SPiotr Jasiukajtis2:
373*25c28e83SPiotr Jasiukajtis
374*25c28e83SPiotr Jasiukajtis/*47*/  sethi	%hi(0x80000000),%o7	!mask for sign bit
375*25c28e83SPiotr Jasiukajtis	sethi	%hi(0x43600000),%o1	!big = 0x43600000,0
376*25c28e83SPiotr Jasiukajtis	ld	[%o5],%o0		!intf = pf[0] = f upper
377*25c28e83SPiotr Jasiukajtis
378*25c28e83SPiotr Jasiukajtis/*49*/  sethi	%hi(0x3e300000),%o2	!small = 0x3e300000,0
379*25c28e83SPiotr Jasiukajtis	andn	%o0,%o7,%o0		!intf = fabs(intf)
380*25c28e83SPiotr Jasiukajtis	ldd 	[%o5],%f38		!f = *x into f38
381*25c28e83SPiotr Jasiukajtis
382*25c28e83SPiotr Jasiukajtis/*51*/  sub	%o1,%o0,%o1		!(-) if intf > big
383*25c28e83SPiotr Jasiukajtis	sub	%o0,%o2,%o2		!(-) if intf < small
384*25c28e83SPiotr Jasiukajtis	fand 	%f38,%f32,%f44		!sign2 = sign bit
385*25c28e83SPiotr Jasiukajtis
386*25c28e83SPiotr Jasiukajtis/*52*/  orcc	%o1,%o2,%g0		!(-) if either true
387*25c28e83SPiotr Jasiukajtis 	bneg,pn  %icc,.SPECIAL2		!if (-) goto special cases below
388*25c28e83SPiotr Jasiukajtis	fabsd	%f38,%f38		!abs(f)  (delay slot)
389*25c28e83SPiotr Jasiukajtis	!----------------------
390*25c28e83SPiotr Jasiukajtis
391*25c28e83SPiotr Jasiukajtis/*53*/  fpadd32 %f38,%f52,%f0		!intf + 0x00008000 (again)
392*25c28e83SPiotr Jasiukajtis      faddd   %f2,%f60,%f2              !p[3]*tmp0 + p[2]
393*25c28e83SPiotr Jasiukajtis
394*25c28e83SPiotr Jasiukajtis/*54*/  sethi   %hi(0x8000),%o7		!rounding bit
395*25c28e83SPiotr Jasiukajtis	fand    %f0,%f54,%f0		!pz[0] = intz = (intf + 0x00008000) & 0x7fff0000 (again)
396*25c28e83SPiotr Jasiukajtis
397*25c28e83SPiotr Jasiukajtis/*55*/  add	%o0,%o7,%o0		!intf + 0x00008000  (delay slot)
398*25c28e83SPiotr Jasiukajtis	sethi   %hi(0x7fff0000),%o7     !mask for rounding argument
399*25c28e83SPiotr Jasiukajtis	fmuld	%f38,%f0,%f10		!f*z
400*25c28e83SPiotr Jasiukajtis	fsubd	%f38,%f0,%f20		!f - z
401*25c28e83SPiotr Jasiukajtis
402*25c28e83SPiotr Jasiukajtis/*56*/  and	%o0,%o7,%o0		!intz = (intf + 0x00008000) & 0x7fff0000
403*25c28e83SPiotr Jasiukajtis      fmuld   %f2,%f4,%f2               !(p3*tmp0 + p2)*tmp0
404*25c28e83SPiotr Jasiukajtis      fsubd   %f6,%f8,%f6               !conup0 - ansu0
405*25c28e83SPiotr Jasiukajtis
406*25c28e83SPiotr Jasiukajtis/*58*/  sub	%o0,%l7,%o2		!intz - 0x3f900000
407*25c28e83SPiotr Jasiukajtis	fsubd	%f10,%f56,%f10		!(f*z - (-1.0))
408*25c28e83SPiotr Jasiukajtis            ldd     [%o4+%l4],%f16      !conup1 = __vlibm_TBL_atan1[index1]
409*25c28e83SPiotr Jasiukajtis
410*25c28e83SPiotr Jasiukajtis	cmp	%o0,%l6			!(|f| > 64)
411*25c28e83SPiotr Jasiukajtis	ble	.ELSE2			!if(|f| > 64) then
412*25c28e83SPiotr Jasiukajtis/*60*/  sra	%o2,15,%o3		!index  = (intz - 0x3f900000) >> 15
413*25c28e83SPiotr Jasiukajtis	  mov	  2,%o1			!index == 2, point to conup, conlo = pi/2 upper, lower
414*25c28e83SPiotr Jasiukajtis	  ba	  .ENDIF2		!continue
415*25c28e83SPiotr Jasiukajtis/*61*/    fdivd	  %f56,%f38,%f38	!f = -1.0/f (delay slot)
416*25c28e83SPiotr Jasiukajtis        .ELSE2:				!else f( |x| >= (1/64))
417*25c28e83SPiotr Jasiukajtis	cmp	%o0,%l7			!if intf >= 1/64
418*25c28e83SPiotr Jasiukajtis	bl  	.ENDIF2			!if( |x| >= (1/64) ) then...
419*25c28e83SPiotr Jasiukajtis	mov	0,%o1			!index == 0 , point to conup,conlo = 0,0
420*25c28e83SPiotr Jasiukajtis          add	  %o3,4,%o1		!index = index + 4
421*25c28e83SPiotr Jasiukajtis/*61*/    fdivd	  %f20,%f10,%f38	!f = (f - z)/(1.0 + f*z), reduced argument
422*25c28e83SPiotr Jasiukajtis	.ENDIF2:
423*25c28e83SPiotr Jasiukajtis
424*25c28e83SPiotr Jasiukajtis
425*25c28e83SPiotr Jasiukajtis/*62*/  sll	%o1,3,%l5		!index2 = index
426*25c28e83SPiotr Jasiukajtis	mov	%i3,%l2			!yaddr2 = address of y
427*25c28e83SPiotr Jasiukajtis      fmuld   %f34,%f4,%f4              !f0*tmp0
428*25c28e83SPiotr Jasiukajtis      faddd   %f2,%f62,%f2              !(p3*tmp0 + p2)*tmp0 + p1
429*25c28e83SPiotr Jasiukajtis            fmuld   %f36,%f36,%f14      !tmp1= f1*f1
430*25c28e83SPiotr Jasiukajtis
431*25c28e83SPiotr Jasiukajtis/*65*/add     %o4,%l3,%l3               !base addr + index0
432*25c28e83SPiotr Jasiukajtis      fmuld   %f4,%f2,%f2               !poly0 = (f0*tmp0)*((p3*tmp0 + p2)*tmp0 + p1)
433*25c28e83SPiotr Jasiukajtis      faddd   %f6,%f34,%f6              !(conup0 - ansu0) + f0
434*25c28e83SPiotr Jasiukajtis            fmuld   %f58,%f14,%f12      !p[3]*tmp1
435*25c28e83SPiotr Jasiukajtis            faddd   %f16,%f36,%f18      !ansu1 = conup1 + f1
436*25c28e83SPiotr Jasiukajtis      ldd     [%l3+WSIZE],%f4           !conlo0 = __vlibm_TBL_atan1[index0+1]
437*25c28e83SPiotr Jasiukajtis
438*25c28e83SPiotr Jasiukajtis/*68*/  add	%i1,%i2,%i1		!x     += stridex
439*25c28e83SPiotr Jasiukajtis	add	%i3,%i4,%i3		!y     += stridey
440*25c28e83SPiotr Jasiukajtis      faddd   %f6,%f2,%f2               !((conup0 - ansu0) + f0) + poly0
441*25c28e83SPiotr Jasiukajtis            faddd   %f12,%f60,%f12      !p[3]*tmp1 + p[2]
442*25c28e83SPiotr Jasiukajtis
443*25c28e83SPiotr Jasiukajtis/*71*/faddd   %f2,%f4,%f2               !ansl0 = (((conup0 - ansu)0 + f0) + poly0) + conlo0
444*25c28e83SPiotr Jasiukajtis            fmuld   %f12,%f14,%f12      !(p3*tmp1 + p2)*tmp1
445*25c28e83SPiotr Jasiukajtis            fsubd   %f16,%f18,%f16      !conup1 - ansu1
446*25c28e83SPiotr Jasiukajtis
447*25c28e83SPiotr Jasiukajtis/*74*/faddd   %f8,%f2,%f34              !ans0 = ansu0 + ansl0
448*25c28e83SPiotr Jasiukajtis          fmuld   %f36,%f14,%f14        !f1*tmp1
449*25c28e83SPiotr Jasiukajtis          faddd   %f12,%f62,%f12        !(p3*tmp1 + p2)*tmp1 + p1
450*25c28e83SPiotr Jasiukajtis
451*25c28e83SPiotr Jasiukajtis/*77*/  for     %f34,%f40,%f34          !sign(ans0) = sign of argument
452*25c28e83SPiotr Jasiukajtis        std     %f34,[%l0]              !*yaddr0 = ans, always gets stored (delay slot)
453*25c28e83SPiotr Jasiukajtis        cmp	%i5,0			!if argcount =0, we are done
454*25c28e83SPiotr Jasiukajtis	bg	.MAINLOOP
455*25c28e83SPiotr Jasiukajtis	  nop
456*25c28e83SPiotr Jasiukajtis
457*25c28e83SPiotr Jasiukajtis    /*--------------------------------------------------------------------------*/
458*25c28e83SPiotr Jasiukajtis    /*--------------------------------------------------------------------------*/
459*25c28e83SPiotr Jasiukajtis    /*--------------------------------------------------------------------------*/
460*25c28e83SPiotr Jasiukajtis
461*25c28e83SPiotr Jasiukajtis.RETURN:
462*25c28e83SPiotr Jasiukajtis	ret
463*25c28e83SPiotr Jasiukajtis	restore	%g0,%g0,%g0
464*25c28e83SPiotr Jasiukajtis
465*25c28e83SPiotr Jasiukajtis    /*--------------------------------------------------------------------------*/
466*25c28e83SPiotr Jasiukajtis    /*------------SPECIAL CASE HANDLING FOR LOOP0 ------------------------------*/
467*25c28e83SPiotr Jasiukajtis    /*--------------------------------------------------------------------------*/
468*25c28e83SPiotr Jasiukajtis
469*25c28e83SPiotr Jasiukajtis/* at this point
470*25c28e83SPiotr Jasiukajtis   %i1     x address
471*25c28e83SPiotr Jasiukajtis   %o0     intf
472*25c28e83SPiotr Jasiukajtis   %o2     intf - 0x3e300000
473*25c28e83SPiotr Jasiukajtis   %f34,36,38     f0,f1,f2
474*25c28e83SPiotr Jasiukajtis   %f40,42,44     sign0,sign1,sign2
475*25c28e83SPiotr Jasiukajtis*/
476*25c28e83SPiotr Jasiukajtis
477*25c28e83SPiotr Jasiukajtis 	.align	32				!align on I-cache boundary
478*25c28e83SPiotr Jasiukajtis.SPECIAL0:
479*25c28e83SPiotr Jasiukajtis	orcc	%o2,%g0,%g0			!(-) if intf < 0x3e300000
480*25c28e83SPiotr Jasiukajtis	bpos	1f  				!if >=...continue
481*25c28e83SPiotr Jasiukajtis	sethi	%hi(0x7ff00000),%g1	!upper word of Inf (we use 64-bit wide int for this)
482*25c28e83SPiotr Jasiukajtis	  ba	  3f
483*25c28e83SPiotr Jasiukajtis	  faddd	  %f34,%f50,%f30		!dummy op just to generate exception (delay slot)
484*25c28e83SPiotr Jasiukajtis1:
485*25c28e83SPiotr Jasiukajtis	ld	[%o5+4],%o5			!load x lower word
486*25c28e83SPiotr Jasiukajtis	sllx	%o0,32,%o0			!left justify intf
487*25c28e83SPiotr Jasiukajtis	sllx	%g1,32,%g1		!left justify Inf
488*25c28e83SPiotr Jasiukajtis	or	%o0,%o5,%o0			!merge in lower intf
489*25c28e83SPiotr Jasiukajtis	cmp	%o0,%g1				!if intf > 0x7ff00000 00000000
490*25c28e83SPiotr Jasiukajtis	ble,pt	%xcc,2f				!pass thru if NaN
491*25c28e83SPiotr Jasiukajtis	  nop
492*25c28e83SPiotr Jasiukajtis	  fmuld	  %f34,%f34,%f34		!...... (x*x) trigger invalid exception
493*25c28e83SPiotr Jasiukajtis	  ba      3f
494*25c28e83SPiotr Jasiukajtis	  nop
495*25c28e83SPiotr Jasiukajtis2:
496*25c28e83SPiotr Jasiukajtis	faddd	%f46,%f48,%f34			!ans = pi/2 upper + pi/2 lower
497*25c28e83SPiotr Jasiukajtis3:
498*25c28e83SPiotr Jasiukajtis	add	%i1,%i2,%i1			!x += stridex
499*25c28e83SPiotr Jasiukajtis	for     %f34,%f40,%f34			!sign(ans) = sign of argument
500*25c28e83SPiotr Jasiukajtis	std	%f34,[%i3]			!*y = ans
501*25c28e83SPiotr Jasiukajtis	ba	.LOOP0				!keep looping
502*25c28e83SPiotr Jasiukajtis	add	%i3,%i4,%i3			!y += stridey (delay slot)
503*25c28e83SPiotr Jasiukajtis
504*25c28e83SPiotr Jasiukajtis    /*--------------------------------------------------------------------------*/
505*25c28e83SPiotr Jasiukajtis    /*-----------SPECIAL CASE HANDLING FOR LOOP1 -------------------------------*/
506*25c28e83SPiotr Jasiukajtis    /*--------------------------------------------------------------------------*/
507*25c28e83SPiotr Jasiukajtis
508*25c28e83SPiotr Jasiukajtis 	.align	32				!align on I-cache boundary
509*25c28e83SPiotr Jasiukajtis.SPECIAL1:
510*25c28e83SPiotr Jasiukajtis	orcc	%o2,%g0,%g0			!(-) if intf < 0x3e300000
511*25c28e83SPiotr Jasiukajtis	bpos	1f  				!if >=...continue
512*25c28e83SPiotr Jasiukajtis	sethi	%hi(0x7ff00000),%g1	!upper word of Inf (we use 64-bit wide int for this)
513*25c28e83SPiotr Jasiukajtis	  ba	  3f
514*25c28e83SPiotr Jasiukajtis	  faddd	  %f36,%f50,%f30		!dummy op just to generate exception (delay slot)
515*25c28e83SPiotr Jasiukajtis1:
516*25c28e83SPiotr Jasiukajtis	ld	[%o5+4],%o5			!load x lower word
517*25c28e83SPiotr Jasiukajtis	sllx	%o0,32,%o0			!left justify intf
518*25c28e83SPiotr Jasiukajtis	sllx	%g1,32,%g1		!left justify Inf
519*25c28e83SPiotr Jasiukajtis	or	%o0,%o5,%o0			!merge in lower intf
520*25c28e83SPiotr Jasiukajtis	cmp	%o0,%g1				!if intf > 0x7ff00000 00000000
521*25c28e83SPiotr Jasiukajtis	ble,pt	%xcc,2f				!pass thru if NaN
522*25c28e83SPiotr Jasiukajtis	  nop
523*25c28e83SPiotr Jasiukajtis	  fmuld	  %f36,%f36,%f36		!...... (x*x) trigger invalid exception
524*25c28e83SPiotr Jasiukajtis	  ba      3f
525*25c28e83SPiotr Jasiukajtis	  nop
526*25c28e83SPiotr Jasiukajtis2:
527*25c28e83SPiotr Jasiukajtis	faddd	%f46,%f48,%f36			!ans = pi/2 upper + pi/2 lower
528*25c28e83SPiotr Jasiukajtis3:
529*25c28e83SPiotr Jasiukajtis	add	%i1,%i2,%i1			!x += stridex
530*25c28e83SPiotr Jasiukajtis	for     %f36,%f42,%f36			!sign(ans) = sign of argument
531*25c28e83SPiotr Jasiukajtis	std	%f36,[%i3]			!*y = ans
532*25c28e83SPiotr Jasiukajtis	ba      .LOOP1 				!keep looping
533*25c28e83SPiotr Jasiukajtis	add	%i3,%i4,%i3			!y += stridey (delay slot)
534*25c28e83SPiotr Jasiukajtis
535*25c28e83SPiotr Jasiukajtis    /*--------------------------------------------------------------------------*/
536*25c28e83SPiotr Jasiukajtis    /*------------SPECIAL CASE HANDLING FOR LOOP2 ------------------------------*/
537*25c28e83SPiotr Jasiukajtis    /*--------------------------------------------------------------------------*/
538*25c28e83SPiotr Jasiukajtis
539*25c28e83SPiotr Jasiukajtis 	.align	32				!align on I-cache boundary
540*25c28e83SPiotr Jasiukajtis.SPECIAL2:
541*25c28e83SPiotr Jasiukajtis	orcc	%o2,%g0,%g0			!(-) if intf < 0x3e300000
542*25c28e83SPiotr Jasiukajtis	bpos	1f  				!if >=...continue
543*25c28e83SPiotr Jasiukajtis	sethi	%hi(0x7ff00000),%g1	!upper word of Inf (we use 64-bit wide int for this)
544*25c28e83SPiotr Jasiukajtis	  ba	  3f
545*25c28e83SPiotr Jasiukajtis	  faddd	  %f38,%f50,%f30		!dummy op just to generate exception (delay slot)
546*25c28e83SPiotr Jasiukajtis1:
547*25c28e83SPiotr Jasiukajtis	ld	[%o5+4],%o5			!load x lower word
548*25c28e83SPiotr Jasiukajtis	sllx	%o0,32,%o0			!left justify intf
549*25c28e83SPiotr Jasiukajtis	sllx	%g1,32,%g1		!left justify Inf
550*25c28e83SPiotr Jasiukajtis	or	%o0,%o5,%o0			!merge in lower intf
551*25c28e83SPiotr Jasiukajtis	cmp	%o0,%g1				!if intf > 0x7ff00000 00000000
552*25c28e83SPiotr Jasiukajtis	ble,pt	%xcc,2f				!pass thru if NaN
553*25c28e83SPiotr Jasiukajtis	  nop
554*25c28e83SPiotr Jasiukajtis	  fmuld	  %f38,%f38,%f38		!...... (x*x) trigger invalid exception
555*25c28e83SPiotr Jasiukajtis	  ba      3f
556*25c28e83SPiotr Jasiukajtis	  nop
557*25c28e83SPiotr Jasiukajtis2:
558*25c28e83SPiotr Jasiukajtis	faddd	%f46,%f48,%f38			!ans = pi/2 upper + pi/2 lower
559*25c28e83SPiotr Jasiukajtis3:
560*25c28e83SPiotr Jasiukajtis	add	%i1,%i2,%i1			!x += stridex
561*25c28e83SPiotr Jasiukajtis	for     %f38,%f44,%f38			!sign(ans) = sign of argument
562*25c28e83SPiotr Jasiukajtis	std	%f38,[%i3]			!*y = ans
563*25c28e83SPiotr Jasiukajtis	ba      .LOOP2 				!keep looping
564*25c28e83SPiotr Jasiukajtis	add	%i3,%i4,%i3			!y += stridey
565*25c28e83SPiotr Jasiukajtis
566*25c28e83SPiotr Jasiukajtis    /*--------------------------------------------------------------------------*/
567*25c28e83SPiotr Jasiukajtis    /*--------------------------------------------------------------------------*/
568*25c28e83SPiotr Jasiukajtis    /*--------------------------------------------------------------------------*/
569*25c28e83SPiotr Jasiukajtis
570*25c28e83SPiotr Jasiukajtis	SET_SIZE(__vatan)
571*25c28e83SPiotr Jasiukajtis
572*25c28e83SPiotr Jasiukajtis!	.ident	"03-20-96 Sparc V9 3-way-unrolled version"
573