1*25c28e83SPiotr Jasiukajtis/* 2*25c28e83SPiotr Jasiukajtis * CDDL HEADER START 3*25c28e83SPiotr Jasiukajtis * 4*25c28e83SPiotr Jasiukajtis * The contents of this file are subject to the terms of the 5*25c28e83SPiotr Jasiukajtis * Common Development and Distribution License (the "License"). 6*25c28e83SPiotr Jasiukajtis * You may not use this file except in compliance with the License. 7*25c28e83SPiotr Jasiukajtis * 8*25c28e83SPiotr Jasiukajtis * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9*25c28e83SPiotr Jasiukajtis * or http://www.opensolaris.org/os/licensing. 10*25c28e83SPiotr Jasiukajtis * See the License for the specific language governing permissions 11*25c28e83SPiotr Jasiukajtis * and limitations under the License. 12*25c28e83SPiotr Jasiukajtis * 13*25c28e83SPiotr Jasiukajtis * When distributing Covered Code, include this CDDL HEADER in each 14*25c28e83SPiotr Jasiukajtis * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15*25c28e83SPiotr Jasiukajtis * If applicable, add the following below this CDDL HEADER, with the 16*25c28e83SPiotr Jasiukajtis * fields enclosed by brackets "[]" replaced with your own identifying 17*25c28e83SPiotr Jasiukajtis * information: Portions Copyright [yyyy] [name of copyright owner] 18*25c28e83SPiotr Jasiukajtis * 19*25c28e83SPiotr Jasiukajtis * CDDL HEADER END 20*25c28e83SPiotr Jasiukajtis */ 21*25c28e83SPiotr Jasiukajtis/* 22*25c28e83SPiotr Jasiukajtis * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 23*25c28e83SPiotr Jasiukajtis */ 24*25c28e83SPiotr Jasiukajtis/* 25*25c28e83SPiotr Jasiukajtis * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 26*25c28e83SPiotr Jasiukajtis * Use is subject to license terms. 27*25c28e83SPiotr Jasiukajtis */ 28*25c28e83SPiotr Jasiukajtis 29*25c28e83SPiotr Jasiukajtis .file "__vatan.S" 30*25c28e83SPiotr Jasiukajtis 31*25c28e83SPiotr Jasiukajtis#include "libm.h" 32*25c28e83SPiotr Jasiukajtis 33*25c28e83SPiotr Jasiukajtis RO_DATA 34*25c28e83SPiotr Jasiukajtis 35*25c28e83SPiotr Jasiukajtis! following is the C version of the ATAN algorithm 36*25c28e83SPiotr Jasiukajtis! #include <math.h> 37*25c28e83SPiotr Jasiukajtis! #include <stdio.h> 38*25c28e83SPiotr Jasiukajtis! double jkatan(double *x) 39*25c28e83SPiotr Jasiukajtis! { 40*25c28e83SPiotr Jasiukajtis! double f, z, ans, ansu, ansl, tmp, poly, conup, conlo, dummy; 41*25c28e83SPiotr Jasiukajtis! int index, sign, intf, intz; 42*25c28e83SPiotr Jasiukajtis! extern const double __vlibm_TBL_atan1[]; 43*25c28e83SPiotr Jasiukajtis! long *pf = (long *) &f, *pz = (long *) &z; 44*25c28e83SPiotr Jasiukajtis! 45*25c28e83SPiotr Jasiukajtis! /* Power series atan(x) = x + p1*x**3 + p2*x**5 + p3*x**7 46*25c28e83SPiotr Jasiukajtis! * Error = -3.08254E-18 On the interval |x| < 1/64 */ 47*25c28e83SPiotr Jasiukajtis! 48*25c28e83SPiotr Jasiukajtis! /* define dummy names for readability. Use parray to help compiler optimize loads */ 49*25c28e83SPiotr Jasiukajtis! #define p3 parray[0] 50*25c28e83SPiotr Jasiukajtis! #define p2 parray[1] 51*25c28e83SPiotr Jasiukajtis! #define p1 parray[2] 52*25c28e83SPiotr Jasiukajtis! #define soffset 3 53*25c28e83SPiotr Jasiukajtis! 54*25c28e83SPiotr Jasiukajtis! static const double parray[] = { 55*25c28e83SPiotr Jasiukajtis! -1.428029046844299722E-01, /* p[3] */ 56*25c28e83SPiotr Jasiukajtis! 1.999999917247000615E-01, /* p[2] */ 57*25c28e83SPiotr Jasiukajtis! -3.333333333329292858E-01, /* p[1] */ 58*25c28e83SPiotr Jasiukajtis! 1.0, /* not used for p[0], though */ 59*25c28e83SPiotr Jasiukajtis! -1.0, /* used to flip sign of answer */ 60*25c28e83SPiotr Jasiukajtis! }; 61*25c28e83SPiotr Jasiukajtis! 62*25c28e83SPiotr Jasiukajtis! f = *x; /* fetch argument */ 63*25c28e83SPiotr Jasiukajtis! intf = pf[0]; /* grab upper half */ 64*25c28e83SPiotr Jasiukajtis! sign = intf & 0x80000000; /* sign of argument */ 65*25c28e83SPiotr Jasiukajtis! intf ^= sign; /* abs(upper argument) */ 66*25c28e83SPiotr Jasiukajtis! sign = (unsigned) sign >> 31; /* sign bit = 0 or 1 */ 67*25c28e83SPiotr Jasiukajtis! pf[0] = intf; 68*25c28e83SPiotr Jasiukajtis! 69*25c28e83SPiotr Jasiukajtis! if( (intf > 0x43600000) || (intf < 0x3e300000) ) /* filter out special cases */ 70*25c28e83SPiotr Jasiukajtis! { 71*25c28e83SPiotr Jasiukajtis! if( (intf > 0x7ff00000) || 72*25c28e83SPiotr Jasiukajtis! ((intf == 0x7ff00000) && (pf[1] !=0)) ) return (*x-*x);/* return NaN if x=NaN*/ 73*25c28e83SPiotr Jasiukajtis! if( intf < 0x3e300000 ) /* avoid underflow for small arg */ 74*25c28e83SPiotr Jasiukajtis! { 75*25c28e83SPiotr Jasiukajtis! dummy = 1.0e37 + f; 76*25c28e83SPiotr Jasiukajtis! dummy = dummy; 77*25c28e83SPiotr Jasiukajtis! return (*x); 78*25c28e83SPiotr Jasiukajtis! } 79*25c28e83SPiotr Jasiukajtis! if( intf > 0x43600000 ) /* avoid underflow for big arg */ 80*25c28e83SPiotr Jasiukajtis! { 81*25c28e83SPiotr Jasiukajtis! index = 2; 82*25c28e83SPiotr Jasiukajtis! f = __vlibm_TBL_atan1[index] + __vlibm_TBL_atan1[index+1];/* pi/2 up + pi/2 low */ 83*25c28e83SPiotr Jasiukajtis! f = parray[soffset + sign] * f; /* put sign bit on ans */ 84*25c28e83SPiotr Jasiukajtis! return (f); 85*25c28e83SPiotr Jasiukajtis! } 86*25c28e83SPiotr Jasiukajtis! } 87*25c28e83SPiotr Jasiukajtis! 88*25c28e83SPiotr Jasiukajtis! index = 0; /* points to 0,0 in table */ 89*25c28e83SPiotr Jasiukajtis! if (intf > 0x40500000) /* if(|x| > 64 */ 90*25c28e83SPiotr Jasiukajtis! { f = -1.0/f; 91*25c28e83SPiotr Jasiukajtis! index = 2; /* point to pi/2 upper, lower */ 92*25c28e83SPiotr Jasiukajtis! } 93*25c28e83SPiotr Jasiukajtis! else if( intf >= 0x3f900000 ) /* if |x| >= (1/64)... */ 94*25c28e83SPiotr Jasiukajtis! { 95*25c28e83SPiotr Jasiukajtis! intz = (intf + 0x00008000) & 0x7fff0000;/* round arg, keep upper */ 96*25c28e83SPiotr Jasiukajtis! pz[0] = intz; /* store as a double (z) */ 97*25c28e83SPiotr Jasiukajtis! pz[1] = 0; /* ...lower */ 98*25c28e83SPiotr Jasiukajtis! f = (f - z)/(1.0 + f*z); /* get reduced argument */ 99*25c28e83SPiotr Jasiukajtis! index = (intz - 0x3f900000) >> 15; /* (index >> 16) << 1) */ 100*25c28e83SPiotr Jasiukajtis! index += 4; /* skip over 0,0,pi/2,pi/2 */ 101*25c28e83SPiotr Jasiukajtis! } 102*25c28e83SPiotr Jasiukajtis! conup = __vlibm_TBL_atan1[index]; /* upper table */ 103*25c28e83SPiotr Jasiukajtis! conlo = __vlibm_TBL_atan1[index+1]; /* lower table */ 104*25c28e83SPiotr Jasiukajtis! tmp = f*f; 105*25c28e83SPiotr Jasiukajtis! poly = (f*tmp)*((p3*tmp + p2)*tmp + p1); 106*25c28e83SPiotr Jasiukajtis! ansu = conup + f; /* compute atan(f) upper */ 107*25c28e83SPiotr Jasiukajtis! ansl = (((conup - ansu) + f) + poly) + conlo; 108*25c28e83SPiotr Jasiukajtis! ans = ansu + ansl; 109*25c28e83SPiotr Jasiukajtis! ans = parray[soffset + sign] * ans; 110*25c28e83SPiotr Jasiukajtis! return ans; 111*25c28e83SPiotr Jasiukajtis! } 112*25c28e83SPiotr Jasiukajtis 113*25c28e83SPiotr Jasiukajtis/* 8 bytes = 1 double f.p. word */ 114*25c28e83SPiotr Jasiukajtis#define WSIZE 8 115*25c28e83SPiotr Jasiukajtis 116*25c28e83SPiotr Jasiukajtis .align 32 !align with full D-cache line 117*25c28e83SPiotr Jasiukajtis.COEFFS: 118*25c28e83SPiotr Jasiukajtis .double 0r-1.428029046844299722E-01 !p[3] 119*25c28e83SPiotr Jasiukajtis .double 0r1.999999917247000615E-01 !p[2] 120*25c28e83SPiotr Jasiukajtis .double 0r-3.333333333329292858E-01 !p[1] 121*25c28e83SPiotr Jasiukajtis .double 0r-1.0, !constant -1.0 122*25c28e83SPiotr Jasiukajtis .word 0x00008000,0x0 !for fp rounding of reduced arg 123*25c28e83SPiotr Jasiukajtis .word 0x7fff0000,0x0 !for fp truncation 124*25c28e83SPiotr Jasiukajtis .word 0x47900000,0 !a number close to 1.0E37 125*25c28e83SPiotr Jasiukajtis .word 0x80000000,0x0 !mask for fp sign bit 126*25c28e83SPiotr Jasiukajtis .word 0x3f800000,0x0 !1.0/128.0 dummy "safe" argument 127*25c28e83SPiotr Jasiukajtis .type .COEFFS,#object 128*25c28e83SPiotr Jasiukajtis 129*25c28e83SPiotr Jasiukajtis ENTRY(__vatan) 130*25c28e83SPiotr Jasiukajtis save %sp,-SA(MINFRAME)-16,%sp 131*25c28e83SPiotr Jasiukajtis PIC_SETUP(g5) 132*25c28e83SPiotr Jasiukajtis PIC_SET(g5,__vlibm_TBL_atan1,o4) 133*25c28e83SPiotr Jasiukajtis PIC_SET(g5,.COEFFS,o0) 134*25c28e83SPiotr Jasiukajtis/* 135*25c28e83SPiotr Jasiukajtis __vatan(int n, double *x, int stridex, double *y, stridey) 136*25c28e83SPiotr Jasiukajtis computes y(i) = atan( x(i) ), for 1=1,n. Stridex, stridey 137*25c28e83SPiotr Jasiukajtis are the distance between x and y elements 138*25c28e83SPiotr Jasiukajtis 139*25c28e83SPiotr Jasiukajtis %i0 n 140*25c28e83SPiotr Jasiukajtis %i1 address of x 141*25c28e83SPiotr Jasiukajtis %i2 stride x 142*25c28e83SPiotr Jasiukajtis %i3 address of y 143*25c28e83SPiotr Jasiukajtis %i4 stride y 144*25c28e83SPiotr Jasiukajtis*/ 145*25c28e83SPiotr Jasiukajtis cmp %i0,0 !if n <=0, 146*25c28e83SPiotr Jasiukajtis ble,pn %icc,.RETURN !....then do nothing 147*25c28e83SPiotr Jasiukajtis sll %i2,3,%i2 !convert stride to byte count 148*25c28e83SPiotr Jasiukajtis sll %i4,3,%i4 !convert stride to byte count 149*25c28e83SPiotr Jasiukajtis 150*25c28e83SPiotr Jasiukajtis/* pre-load constants before beginning main loop */ 151*25c28e83SPiotr Jasiukajtis 152*25c28e83SPiotr Jasiukajtis ldd [%o0],%f58 !load p[3] 153*25c28e83SPiotr Jasiukajtis mov 2,%i5 !argcount = 3 154*25c28e83SPiotr Jasiukajtis 155*25c28e83SPiotr Jasiukajtis ldd [%o0+WSIZE],%f60 !load p[2] 156*25c28e83SPiotr Jasiukajtis add %fp,STACK_BIAS-8,%l1 !yaddr1 = &dummy 157*25c28e83SPiotr Jasiukajtis fzero %f18 !ansu1 = 0 158*25c28e83SPiotr Jasiukajtis 159*25c28e83SPiotr Jasiukajtis ldd [%o0+2*WSIZE],%f62 !load p[1] 160*25c28e83SPiotr Jasiukajtis add %fp,STACK_BIAS-8,%l2 !yaddr2 = &dummy 161*25c28e83SPiotr Jasiukajtis fzero %f12 !(poly1) = 0 162*25c28e83SPiotr Jasiukajtis 163*25c28e83SPiotr Jasiukajtis ldd [%o0+3*WSIZE],%f56 !-1.0 164*25c28e83SPiotr Jasiukajtis fzero %f14 !tmp1 = 0 165*25c28e83SPiotr Jasiukajtis 166*25c28e83SPiotr Jasiukajtis ldd [%o0+4*WSIZE],%f52 !load rounding mask 167*25c28e83SPiotr Jasiukajtis fzero %f16 !conup1 = 0 168*25c28e83SPiotr Jasiukajtis 169*25c28e83SPiotr Jasiukajtis ldd [%o0+5*WSIZE],%f54 !load truncation mask 170*25c28e83SPiotr Jasiukajtis fzero %f36 !f1 = 0 171*25c28e83SPiotr Jasiukajtis 172*25c28e83SPiotr Jasiukajtis ldd [%o0+6*WSIZE],%f50 !1.0e37 173*25c28e83SPiotr Jasiukajtis fzero %f38 !f2 = 0 174*25c28e83SPiotr Jasiukajtis 175*25c28e83SPiotr Jasiukajtis ldd [%o0+7*WSIZE],%f32 !mask for sign bit 176*25c28e83SPiotr Jasiukajtis 177*25c28e83SPiotr Jasiukajtis ldd [%o4+2*WSIZE],%f46 !pi/2 upper 178*25c28e83SPiotr Jasiukajtis ldd [%o4+(2*WSIZE+8)],%f48 !pi/2 lower 179*25c28e83SPiotr Jasiukajtis sethi %hi(0x40500000),%l6 !64.0 180*25c28e83SPiotr Jasiukajtis sethi %hi(0x3f900000),%l7 !1/64.0 181*25c28e83SPiotr Jasiukajtis mov 0,%l4 !index1 = 0 182*25c28e83SPiotr Jasiukajtis mov 0,%l5 !index2 = 0 183*25c28e83SPiotr Jasiukajtis 184*25c28e83SPiotr Jasiukajtis.MAINLOOP: 185*25c28e83SPiotr Jasiukajtis 186*25c28e83SPiotr Jasiukajtis /*--------------------------------------------------------------------------*/ 187*25c28e83SPiotr Jasiukajtis /*--------------------------------------------------------------------------*/ 188*25c28e83SPiotr Jasiukajtis /*--------------------------------------------------------------------------*/ 189*25c28e83SPiotr Jasiukajtis 190*25c28e83SPiotr Jasiukajtis.LOOP0: 191*25c28e83SPiotr Jasiukajtis deccc %i0 !--n 192*25c28e83SPiotr Jasiukajtis bneg 1f 193*25c28e83SPiotr Jasiukajtis mov %i1,%o5 !xuse = x (delay slot) 194*25c28e83SPiotr Jasiukajtis 195*25c28e83SPiotr Jasiukajtis ba 2f 196*25c28e83SPiotr Jasiukajtis nop !delay slot 197*25c28e83SPiotr Jasiukajtis1: 198*25c28e83SPiotr Jasiukajtis PIC_SET(g5,.COEFFS+8*WSIZE,o5) 199*25c28e83SPiotr Jasiukajtis dec %i5 !argcount-- 200*25c28e83SPiotr Jasiukajtis2: 201*25c28e83SPiotr Jasiukajtis sethi %hi(0x80000000),%o7 !mask for sign bit 202*25c28e83SPiotr Jasiukajtis/*2 */ sethi %hi(0x43600000),%o1 !big = 0x43600000,0 203*25c28e83SPiotr Jasiukajtis ld [%o5],%o0 !intf = pf[0] = f upper 204*25c28e83SPiotr Jasiukajtis ldd [%o4+%l5],%f26 !conup2 = __vlibm_TBL_atan1[index2] 205*25c28e83SPiotr Jasiukajtis 206*25c28e83SPiotr Jasiukajtis sethi %hi(0x3e300000),%o2 !small = 0x3e300000,0 207*25c28e83SPiotr Jasiukajtis/*4 */ andn %o0,%o7,%o0 !intf = fabs(intf) 208*25c28e83SPiotr Jasiukajtis ldd [%o5],%f34 !f = *x into f34 209*25c28e83SPiotr Jasiukajtis 210*25c28e83SPiotr Jasiukajtis sub %o1,%o0,%o1 !(-) if intf > big 211*25c28e83SPiotr Jasiukajtis/*6 */ sub %o0,%o2,%o2 !(-) if intf < small 212*25c28e83SPiotr Jasiukajtis fand %f34,%f32,%f40 !sign0 = sign bit 213*25c28e83SPiotr Jasiukajtis fmuld %f38,%f38,%f24 !tmp2= f2*f2 214*25c28e83SPiotr Jasiukajtis 215*25c28e83SPiotr Jasiukajtis/*7 */ orcc %o1,%o2,%g0 !(-) if either true 216*25c28e83SPiotr Jasiukajtis bneg,pn %icc,.SPECIAL0 !if (-) goto special cases below 217*25c28e83SPiotr Jasiukajtis fabsd %f34,%f34 !abs(f) (delay slot) 218*25c28e83SPiotr Jasiukajtis !---------------------- 219*25c28e83SPiotr Jasiukajtis 220*25c28e83SPiotr Jasiukajtis 221*25c28e83SPiotr Jasiukajtis sethi %hi(0x8000),%o7 !rounding bit 222*25c28e83SPiotr Jasiukajtis/*8 */ fpadd32 %f34,%f52,%f0 !intf + 0x00008000 (again) 223*25c28e83SPiotr Jasiukajtis faddd %f26,%f38,%f28 !ansu2 = conup2 + f2 224*25c28e83SPiotr Jasiukajtis 225*25c28e83SPiotr Jasiukajtis add %o0,%o7,%o0 !intf + 0x00008000 (delay slot) 226*25c28e83SPiotr Jasiukajtis/*9*/ fand %f0,%f54,%f0 !pz[0] = intz = (intf + 0x00008000) & 0x7fff0000 (again) 227*25c28e83SPiotr Jasiukajtis fmuld %f58,%f24,%f22 !p[3]*tmp2 228*25c28e83SPiotr Jasiukajtis 229*25c28e83SPiotr Jasiukajtis/*10 */ sethi %hi(0x7fff0000),%o7 !mask for rounding argument 230*25c28e83SPiotr Jasiukajtis fmuld %f34,%f0,%f10 !f*z 231*25c28e83SPiotr Jasiukajtis fsubd %f34,%f0,%f20 !f - z 232*25c28e83SPiotr Jasiukajtis add %o4,%l4,%l4 !base addr + index1 233*25c28e83SPiotr Jasiukajtis fmuld %f14,%f12,%f12 !poly1 = (f1*tmp1)*((p3*tmp1 + p2)*tmp1 + p1) 234*25c28e83SPiotr Jasiukajtis faddd %f16,%f36,%f16 !(conup1 - ansu1) + f1 235*25c28e83SPiotr Jasiukajtis 236*25c28e83SPiotr Jasiukajtis/*12 */ and %o0,%o7,%o0 !intz = (intf + 0x00008000) & 0x7fff0000 237*25c28e83SPiotr Jasiukajtis faddd %f22,%f60,%f22 !p[3]*tmp2 + p[2] 238*25c28e83SPiotr Jasiukajtis ldd [%l4+WSIZE],%f14 !conlo1 = __vlibm_TBL_atan1[index+1] 239*25c28e83SPiotr Jasiukajtis 240*25c28e83SPiotr Jasiukajtis/*13 */ sub %o0,%l7,%o2 !intz - 0x3f900000 241*25c28e83SPiotr Jasiukajtis fsubd %f10,%f56,%f10 !(f*z - (-1.0)) 242*25c28e83SPiotr Jasiukajtis faddd %f16,%f12,%f12 !((conup1 - ansu1) + f1) + poly1 243*25c28e83SPiotr Jasiukajtis 244*25c28e83SPiotr Jasiukajtis cmp %o0,%l6 !(|f| > 64) 245*25c28e83SPiotr Jasiukajtis ble .ELSE0 !if(|f| > 64) then 246*25c28e83SPiotr Jasiukajtis/*15 */ sra %o2,15,%o3 !index = (intz - 0x3f900000) >> 15 247*25c28e83SPiotr Jasiukajtis mov 2,%o1 !index == 2, point to conup, conlo = pi/2 upper, lower 248*25c28e83SPiotr Jasiukajtis ba .ENDIF0 !continue 249*25c28e83SPiotr Jasiukajtis/*16 */ fdivd %f56,%f34,%f34 !f = -1.0/f (delay slot) 250*25c28e83SPiotr Jasiukajtis .ELSE0: !else f( |x| >= (1/64)) 251*25c28e83SPiotr Jasiukajtis cmp %o0,%l7 !if intf >= 1/64 252*25c28e83SPiotr Jasiukajtis bl .ENDIF0 !if( |x| >= (1/64) ) then... 253*25c28e83SPiotr Jasiukajtis mov 0,%o1 !index == 0 , point to conup,conlo = 0,0 254*25c28e83SPiotr Jasiukajtis add %o3,4,%o1 !index = index + 4 255*25c28e83SPiotr Jasiukajtis/*16 */ fdivd %f20,%f10,%f34 !f = (f - z)/(1.0 + f*z), reduced argument 256*25c28e83SPiotr Jasiukajtis .ENDIF0: 257*25c28e83SPiotr Jasiukajtis 258*25c28e83SPiotr Jasiukajtis/*17*/ sll %o1,3,%l3 !index0 = index 259*25c28e83SPiotr Jasiukajtis mov %i3,%l0 !yaddr0 = address of y 260*25c28e83SPiotr Jasiukajtis faddd %f12,%f14,%f12 !ansl1 = (((conup1 - ansu)1 + f1) + poly1) + conlo1 261*25c28e83SPiotr Jasiukajtis fmuld %f22,%f24,%f22 !(p3*tmp2 + p2)*tmp2 262*25c28e83SPiotr Jasiukajtis fsubd %f26,%f28,%f26 !conup2 - ansu2 263*25c28e83SPiotr Jasiukajtis 264*25c28e83SPiotr Jasiukajtis/*20*/ add %i1,%i2,%i1 !x += stridex 265*25c28e83SPiotr Jasiukajtis add %i3,%i4,%i3 !y += stridey 266*25c28e83SPiotr Jasiukajtis faddd %f18,%f12,%f36 !ans1 = ansu1 + ansl1 267*25c28e83SPiotr Jasiukajtis fmuld %f38,%f24,%f24 !f*tmp2 268*25c28e83SPiotr Jasiukajtis faddd %f22,%f62,%f22 !(p3*tmp2 + p2)*tmp2 + p1 269*25c28e83SPiotr Jasiukajtis 270*25c28e83SPiotr Jasiukajtis/*23*/ for %f36,%f42,%f36 !sign(ans1) = sign of argument 271*25c28e83SPiotr Jasiukajtis std %f36,[%l1] !*yaddr1 = ans1 272*25c28e83SPiotr Jasiukajtis add %o4,%l5,%l5 !base addr + index2 273*25c28e83SPiotr Jasiukajtis fmuld %f24,%f22,%f22 !poly2 = (f2*tmp2)*((p3*tmp2 + p2)*tmp2 + p1) 274*25c28e83SPiotr Jasiukajtis faddd %f26,%f38,%f26 !(conup2 - ansu2) + f2 275*25c28e83SPiotr Jasiukajtis cmp %i5,0 !if argcount =0, we are done 276*25c28e83SPiotr Jasiukajtis be .RETURN 277*25c28e83SPiotr Jasiukajtis nop 278*25c28e83SPiotr Jasiukajtis 279*25c28e83SPiotr Jasiukajtis /*--------------------------------------------------------------------------*/ 280*25c28e83SPiotr Jasiukajtis /*--------------------------------------------------------------------------*/ 281*25c28e83SPiotr Jasiukajtis /*--------------------------------------------------------------------------*/ 282*25c28e83SPiotr Jasiukajtis 283*25c28e83SPiotr Jasiukajtis.LOOP1: 284*25c28e83SPiotr Jasiukajtis/*25*/ deccc %i0 !--n 285*25c28e83SPiotr Jasiukajtis bneg 1f 286*25c28e83SPiotr Jasiukajtis mov %i1,%o5 !xuse = x (delay slot) 287*25c28e83SPiotr Jasiukajtis ba 2f 288*25c28e83SPiotr Jasiukajtis nop !delay slot 289*25c28e83SPiotr Jasiukajtis1: 290*25c28e83SPiotr Jasiukajtis PIC_SET(g5,.COEFFS+8*WSIZE,o5) 291*25c28e83SPiotr Jasiukajtis dec %i5 !argcount-- 292*25c28e83SPiotr Jasiukajtis2: 293*25c28e83SPiotr Jasiukajtis 294*25c28e83SPiotr Jasiukajtis/*26*/ sethi %hi(0x80000000),%o7 !mask for sign bit 295*25c28e83SPiotr Jasiukajtis sethi %hi(0x43600000),%o1 !big = 0x43600000,0 296*25c28e83SPiotr Jasiukajtis ld [%o5],%o0 !intf = pf[0] = f upper 297*25c28e83SPiotr Jasiukajtis 298*25c28e83SPiotr Jasiukajtis/*28*/ sethi %hi(0x3e300000),%o2 !small = 0x3e300000,0 299*25c28e83SPiotr Jasiukajtis andn %o0,%o7,%o0 !intf = fabs(intf) 300*25c28e83SPiotr Jasiukajtis ldd [%o5],%f36 !f = *x into f36 301*25c28e83SPiotr Jasiukajtis 302*25c28e83SPiotr Jasiukajtis/*30*/ sub %o1,%o0,%o1 !(-) if intf > big 303*25c28e83SPiotr Jasiukajtis sub %o0,%o2,%o2 !(-) if intf < small 304*25c28e83SPiotr Jasiukajtis fand %f36,%f32,%f42 !sign1 = sign bit 305*25c28e83SPiotr Jasiukajtis 306*25c28e83SPiotr Jasiukajtis/*31*/ orcc %o1,%o2,%g0 !(-) if either true 307*25c28e83SPiotr Jasiukajtis bneg,pn %icc,.SPECIAL1 !if (-) goto special cases below 308*25c28e83SPiotr Jasiukajtis fabsd %f36,%f36 !abs(f) (delay slot) 309*25c28e83SPiotr Jasiukajtis !---------------------- 310*25c28e83SPiotr Jasiukajtis 311*25c28e83SPiotr Jasiukajtis/*32*/ fpadd32 %f36,%f52,%f0 !intf + 0x00008000 (again) 312*25c28e83SPiotr Jasiukajtis ldd [%l5+WSIZE],%f24 !conlo2 = __vlibm_TBL_atan1[index2+1] 313*25c28e83SPiotr Jasiukajtis 314*25c28e83SPiotr Jasiukajtis/*33*/ fand %f0,%f54,%f0 !pz[0] = intz = (intf + 0x00008000) & 0x7fff0000 (again) 315*25c28e83SPiotr Jasiukajtis sethi %hi(0x8000),%o7 !rounding bit 316*25c28e83SPiotr Jasiukajtis faddd %f26,%f22,%f22 !((conup2 - ansu2) + f2) + poly2 317*25c28e83SPiotr Jasiukajtis 318*25c28e83SPiotr Jasiukajtis/*34*/ add %o0,%o7,%o0 !intf + 0x00008000 (delay slot) 319*25c28e83SPiotr Jasiukajtis sethi %hi(0x7fff0000),%o7 !mask for rounding argument 320*25c28e83SPiotr Jasiukajtis fmuld %f36,%f0,%f10 !f*z 321*25c28e83SPiotr Jasiukajtis fsubd %f36,%f0,%f20 !f - z 322*25c28e83SPiotr Jasiukajtis 323*25c28e83SPiotr Jasiukajtis/*35*/ and %o0,%o7,%o0 !intz = (intf + 0x00008000) & 0x7fff0000 324*25c28e83SPiotr Jasiukajtis faddd %f22,%f24,%f22 !ansl2 = (((conup2 - ansu2) + f2) + poly2) + conlo2 325*25c28e83SPiotr Jasiukajtis 326*25c28e83SPiotr Jasiukajtis/*37*/ sub %o0,%l7,%o2 !intz - 0x3f900000 327*25c28e83SPiotr Jasiukajtis fsubd %f10,%f56,%f10 !(f*z - (-1.0)) 328*25c28e83SPiotr Jasiukajtis ldd [%o4+%l3],%f6 !conup0 = __vlibm_TBL_atan1[index0] 329*25c28e83SPiotr Jasiukajtis 330*25c28e83SPiotr Jasiukajtis cmp %o0,%l6 !(|f| > 64) 331*25c28e83SPiotr Jasiukajtis ble .ELSE1 !if(|f| > 64) then 332*25c28e83SPiotr Jasiukajtis/*38*/ sra %o2,15,%o3 !index = (intz - 0x3f900000) >> 15 333*25c28e83SPiotr Jasiukajtis mov 2,%o1 !index == 2, point to conup, conlo = pi/2 upper, lower 334*25c28e83SPiotr Jasiukajtis ba .ENDIF1 !continue 335*25c28e83SPiotr Jasiukajtis/*40*/ fdivd %f56,%f36,%f36 !f = -1.0/f (delay slot) 336*25c28e83SPiotr Jasiukajtis .ELSE1: !else f( |x| >= (1/64)) 337*25c28e83SPiotr Jasiukajtis cmp %o0,%l7 !if intf >= 1/64 338*25c28e83SPiotr Jasiukajtis bl .ENDIF1 !if( |x| >= (1/64) ) then... 339*25c28e83SPiotr Jasiukajtis mov 0,%o1 !index == 0 , point to conup,conlo = 0,0 340*25c28e83SPiotr Jasiukajtis add %o3,4,%o1 !index = index + 4 341*25c28e83SPiotr Jasiukajtis/*40*/ fdivd %f20,%f10,%f36 !f = (f - z)/(1.0 + f*z), reduced argument 342*25c28e83SPiotr Jasiukajtis .ENDIF1: 343*25c28e83SPiotr Jasiukajtis 344*25c28e83SPiotr Jasiukajtis/*41*/sll %o1,3,%l4 !index1 = index 345*25c28e83SPiotr Jasiukajtis mov %i3,%l1 !yaddr1 = address of y 346*25c28e83SPiotr Jasiukajtis fmuld %f34,%f34,%f4 !tmp0= f0*f0 347*25c28e83SPiotr Jasiukajtis faddd %f28,%f22,%f38 !ans2 = ansu2 + ansl2 348*25c28e83SPiotr Jasiukajtis 349*25c28e83SPiotr Jasiukajtis/*44*/add %i1,%i2,%i1 !x += stridex 350*25c28e83SPiotr Jasiukajtis add %i3,%i4,%i3 !y += stridey 351*25c28e83SPiotr Jasiukajtis fmuld %f58,%f4,%f2 !p[3]*tmp0 352*25c28e83SPiotr Jasiukajtis faddd %f6,%f34,%f8 !ansu0 = conup0 + f0 353*25c28e83SPiotr Jasiukajtis for %f38,%f44,%f38 !sign(ans2) = sign of argument 354*25c28e83SPiotr Jasiukajtis std %f38,[%l2] !*yaddr2 = ans2 355*25c28e83SPiotr Jasiukajtis cmp %i5,0 !if argcount =0, we are done 356*25c28e83SPiotr Jasiukajtis be .RETURN 357*25c28e83SPiotr Jasiukajtis nop 358*25c28e83SPiotr Jasiukajtis 359*25c28e83SPiotr Jasiukajtis /*--------------------------------------------------------------------------*/ 360*25c28e83SPiotr Jasiukajtis /*--------------------------------------------------------------------------*/ 361*25c28e83SPiotr Jasiukajtis /*--------------------------------------------------------------------------*/ 362*25c28e83SPiotr Jasiukajtis 363*25c28e83SPiotr Jasiukajtis.LOOP2: 364*25c28e83SPiotr Jasiukajtis/*46*/ deccc %i0 !--n 365*25c28e83SPiotr Jasiukajtis bneg 1f 366*25c28e83SPiotr Jasiukajtis mov %i1,%o5 !xuse = x (delay slot) 367*25c28e83SPiotr Jasiukajtis ba 2f 368*25c28e83SPiotr Jasiukajtis nop !delay slot 369*25c28e83SPiotr Jasiukajtis1: 370*25c28e83SPiotr Jasiukajtis PIC_SET(g5,.COEFFS+8*WSIZE,o5) 371*25c28e83SPiotr Jasiukajtis dec %i5 !argcount-- 372*25c28e83SPiotr Jasiukajtis2: 373*25c28e83SPiotr Jasiukajtis 374*25c28e83SPiotr Jasiukajtis/*47*/ sethi %hi(0x80000000),%o7 !mask for sign bit 375*25c28e83SPiotr Jasiukajtis sethi %hi(0x43600000),%o1 !big = 0x43600000,0 376*25c28e83SPiotr Jasiukajtis ld [%o5],%o0 !intf = pf[0] = f upper 377*25c28e83SPiotr Jasiukajtis 378*25c28e83SPiotr Jasiukajtis/*49*/ sethi %hi(0x3e300000),%o2 !small = 0x3e300000,0 379*25c28e83SPiotr Jasiukajtis andn %o0,%o7,%o0 !intf = fabs(intf) 380*25c28e83SPiotr Jasiukajtis ldd [%o5],%f38 !f = *x into f38 381*25c28e83SPiotr Jasiukajtis 382*25c28e83SPiotr Jasiukajtis/*51*/ sub %o1,%o0,%o1 !(-) if intf > big 383*25c28e83SPiotr Jasiukajtis sub %o0,%o2,%o2 !(-) if intf < small 384*25c28e83SPiotr Jasiukajtis fand %f38,%f32,%f44 !sign2 = sign bit 385*25c28e83SPiotr Jasiukajtis 386*25c28e83SPiotr Jasiukajtis/*52*/ orcc %o1,%o2,%g0 !(-) if either true 387*25c28e83SPiotr Jasiukajtis bneg,pn %icc,.SPECIAL2 !if (-) goto special cases below 388*25c28e83SPiotr Jasiukajtis fabsd %f38,%f38 !abs(f) (delay slot) 389*25c28e83SPiotr Jasiukajtis !---------------------- 390*25c28e83SPiotr Jasiukajtis 391*25c28e83SPiotr Jasiukajtis/*53*/ fpadd32 %f38,%f52,%f0 !intf + 0x00008000 (again) 392*25c28e83SPiotr Jasiukajtis faddd %f2,%f60,%f2 !p[3]*tmp0 + p[2] 393*25c28e83SPiotr Jasiukajtis 394*25c28e83SPiotr Jasiukajtis/*54*/ sethi %hi(0x8000),%o7 !rounding bit 395*25c28e83SPiotr Jasiukajtis fand %f0,%f54,%f0 !pz[0] = intz = (intf + 0x00008000) & 0x7fff0000 (again) 396*25c28e83SPiotr Jasiukajtis 397*25c28e83SPiotr Jasiukajtis/*55*/ add %o0,%o7,%o0 !intf + 0x00008000 (delay slot) 398*25c28e83SPiotr Jasiukajtis sethi %hi(0x7fff0000),%o7 !mask for rounding argument 399*25c28e83SPiotr Jasiukajtis fmuld %f38,%f0,%f10 !f*z 400*25c28e83SPiotr Jasiukajtis fsubd %f38,%f0,%f20 !f - z 401*25c28e83SPiotr Jasiukajtis 402*25c28e83SPiotr Jasiukajtis/*56*/ and %o0,%o7,%o0 !intz = (intf + 0x00008000) & 0x7fff0000 403*25c28e83SPiotr Jasiukajtis fmuld %f2,%f4,%f2 !(p3*tmp0 + p2)*tmp0 404*25c28e83SPiotr Jasiukajtis fsubd %f6,%f8,%f6 !conup0 - ansu0 405*25c28e83SPiotr Jasiukajtis 406*25c28e83SPiotr Jasiukajtis/*58*/ sub %o0,%l7,%o2 !intz - 0x3f900000 407*25c28e83SPiotr Jasiukajtis fsubd %f10,%f56,%f10 !(f*z - (-1.0)) 408*25c28e83SPiotr Jasiukajtis ldd [%o4+%l4],%f16 !conup1 = __vlibm_TBL_atan1[index1] 409*25c28e83SPiotr Jasiukajtis 410*25c28e83SPiotr Jasiukajtis cmp %o0,%l6 !(|f| > 64) 411*25c28e83SPiotr Jasiukajtis ble .ELSE2 !if(|f| > 64) then 412*25c28e83SPiotr Jasiukajtis/*60*/ sra %o2,15,%o3 !index = (intz - 0x3f900000) >> 15 413*25c28e83SPiotr Jasiukajtis mov 2,%o1 !index == 2, point to conup, conlo = pi/2 upper, lower 414*25c28e83SPiotr Jasiukajtis ba .ENDIF2 !continue 415*25c28e83SPiotr Jasiukajtis/*61*/ fdivd %f56,%f38,%f38 !f = -1.0/f (delay slot) 416*25c28e83SPiotr Jasiukajtis .ELSE2: !else f( |x| >= (1/64)) 417*25c28e83SPiotr Jasiukajtis cmp %o0,%l7 !if intf >= 1/64 418*25c28e83SPiotr Jasiukajtis bl .ENDIF2 !if( |x| >= (1/64) ) then... 419*25c28e83SPiotr Jasiukajtis mov 0,%o1 !index == 0 , point to conup,conlo = 0,0 420*25c28e83SPiotr Jasiukajtis add %o3,4,%o1 !index = index + 4 421*25c28e83SPiotr Jasiukajtis/*61*/ fdivd %f20,%f10,%f38 !f = (f - z)/(1.0 + f*z), reduced argument 422*25c28e83SPiotr Jasiukajtis .ENDIF2: 423*25c28e83SPiotr Jasiukajtis 424*25c28e83SPiotr Jasiukajtis 425*25c28e83SPiotr Jasiukajtis/*62*/ sll %o1,3,%l5 !index2 = index 426*25c28e83SPiotr Jasiukajtis mov %i3,%l2 !yaddr2 = address of y 427*25c28e83SPiotr Jasiukajtis fmuld %f34,%f4,%f4 !f0*tmp0 428*25c28e83SPiotr Jasiukajtis faddd %f2,%f62,%f2 !(p3*tmp0 + p2)*tmp0 + p1 429*25c28e83SPiotr Jasiukajtis fmuld %f36,%f36,%f14 !tmp1= f1*f1 430*25c28e83SPiotr Jasiukajtis 431*25c28e83SPiotr Jasiukajtis/*65*/add %o4,%l3,%l3 !base addr + index0 432*25c28e83SPiotr Jasiukajtis fmuld %f4,%f2,%f2 !poly0 = (f0*tmp0)*((p3*tmp0 + p2)*tmp0 + p1) 433*25c28e83SPiotr Jasiukajtis faddd %f6,%f34,%f6 !(conup0 - ansu0) + f0 434*25c28e83SPiotr Jasiukajtis fmuld %f58,%f14,%f12 !p[3]*tmp1 435*25c28e83SPiotr Jasiukajtis faddd %f16,%f36,%f18 !ansu1 = conup1 + f1 436*25c28e83SPiotr Jasiukajtis ldd [%l3+WSIZE],%f4 !conlo0 = __vlibm_TBL_atan1[index0+1] 437*25c28e83SPiotr Jasiukajtis 438*25c28e83SPiotr Jasiukajtis/*68*/ add %i1,%i2,%i1 !x += stridex 439*25c28e83SPiotr Jasiukajtis add %i3,%i4,%i3 !y += stridey 440*25c28e83SPiotr Jasiukajtis faddd %f6,%f2,%f2 !((conup0 - ansu0) + f0) + poly0 441*25c28e83SPiotr Jasiukajtis faddd %f12,%f60,%f12 !p[3]*tmp1 + p[2] 442*25c28e83SPiotr Jasiukajtis 443*25c28e83SPiotr Jasiukajtis/*71*/faddd %f2,%f4,%f2 !ansl0 = (((conup0 - ansu)0 + f0) + poly0) + conlo0 444*25c28e83SPiotr Jasiukajtis fmuld %f12,%f14,%f12 !(p3*tmp1 + p2)*tmp1 445*25c28e83SPiotr Jasiukajtis fsubd %f16,%f18,%f16 !conup1 - ansu1 446*25c28e83SPiotr Jasiukajtis 447*25c28e83SPiotr Jasiukajtis/*74*/faddd %f8,%f2,%f34 !ans0 = ansu0 + ansl0 448*25c28e83SPiotr Jasiukajtis fmuld %f36,%f14,%f14 !f1*tmp1 449*25c28e83SPiotr Jasiukajtis faddd %f12,%f62,%f12 !(p3*tmp1 + p2)*tmp1 + p1 450*25c28e83SPiotr Jasiukajtis 451*25c28e83SPiotr Jasiukajtis/*77*/ for %f34,%f40,%f34 !sign(ans0) = sign of argument 452*25c28e83SPiotr Jasiukajtis std %f34,[%l0] !*yaddr0 = ans, always gets stored (delay slot) 453*25c28e83SPiotr Jasiukajtis cmp %i5,0 !if argcount =0, we are done 454*25c28e83SPiotr Jasiukajtis bg .MAINLOOP 455*25c28e83SPiotr Jasiukajtis nop 456*25c28e83SPiotr Jasiukajtis 457*25c28e83SPiotr Jasiukajtis /*--------------------------------------------------------------------------*/ 458*25c28e83SPiotr Jasiukajtis /*--------------------------------------------------------------------------*/ 459*25c28e83SPiotr Jasiukajtis /*--------------------------------------------------------------------------*/ 460*25c28e83SPiotr Jasiukajtis 461*25c28e83SPiotr Jasiukajtis.RETURN: 462*25c28e83SPiotr Jasiukajtis ret 463*25c28e83SPiotr Jasiukajtis restore %g0,%g0,%g0 464*25c28e83SPiotr Jasiukajtis 465*25c28e83SPiotr Jasiukajtis /*--------------------------------------------------------------------------*/ 466*25c28e83SPiotr Jasiukajtis /*------------SPECIAL CASE HANDLING FOR LOOP0 ------------------------------*/ 467*25c28e83SPiotr Jasiukajtis /*--------------------------------------------------------------------------*/ 468*25c28e83SPiotr Jasiukajtis 469*25c28e83SPiotr Jasiukajtis/* at this point 470*25c28e83SPiotr Jasiukajtis %i1 x address 471*25c28e83SPiotr Jasiukajtis %o0 intf 472*25c28e83SPiotr Jasiukajtis %o2 intf - 0x3e300000 473*25c28e83SPiotr Jasiukajtis %f34,36,38 f0,f1,f2 474*25c28e83SPiotr Jasiukajtis %f40,42,44 sign0,sign1,sign2 475*25c28e83SPiotr Jasiukajtis*/ 476*25c28e83SPiotr Jasiukajtis 477*25c28e83SPiotr Jasiukajtis .align 32 !align on I-cache boundary 478*25c28e83SPiotr Jasiukajtis.SPECIAL0: 479*25c28e83SPiotr Jasiukajtis orcc %o2,%g0,%g0 !(-) if intf < 0x3e300000 480*25c28e83SPiotr Jasiukajtis bpos 1f !if >=...continue 481*25c28e83SPiotr Jasiukajtis sethi %hi(0x7ff00000),%g1 !upper word of Inf (we use 64-bit wide int for this) 482*25c28e83SPiotr Jasiukajtis ba 3f 483*25c28e83SPiotr Jasiukajtis faddd %f34,%f50,%f30 !dummy op just to generate exception (delay slot) 484*25c28e83SPiotr Jasiukajtis1: 485*25c28e83SPiotr Jasiukajtis ld [%o5+4],%o5 !load x lower word 486*25c28e83SPiotr Jasiukajtis sllx %o0,32,%o0 !left justify intf 487*25c28e83SPiotr Jasiukajtis sllx %g1,32,%g1 !left justify Inf 488*25c28e83SPiotr Jasiukajtis or %o0,%o5,%o0 !merge in lower intf 489*25c28e83SPiotr Jasiukajtis cmp %o0,%g1 !if intf > 0x7ff00000 00000000 490*25c28e83SPiotr Jasiukajtis ble,pt %xcc,2f !pass thru if NaN 491*25c28e83SPiotr Jasiukajtis nop 492*25c28e83SPiotr Jasiukajtis fmuld %f34,%f34,%f34 !...... (x*x) trigger invalid exception 493*25c28e83SPiotr Jasiukajtis ba 3f 494*25c28e83SPiotr Jasiukajtis nop 495*25c28e83SPiotr Jasiukajtis2: 496*25c28e83SPiotr Jasiukajtis faddd %f46,%f48,%f34 !ans = pi/2 upper + pi/2 lower 497*25c28e83SPiotr Jasiukajtis3: 498*25c28e83SPiotr Jasiukajtis add %i1,%i2,%i1 !x += stridex 499*25c28e83SPiotr Jasiukajtis for %f34,%f40,%f34 !sign(ans) = sign of argument 500*25c28e83SPiotr Jasiukajtis std %f34,[%i3] !*y = ans 501*25c28e83SPiotr Jasiukajtis ba .LOOP0 !keep looping 502*25c28e83SPiotr Jasiukajtis add %i3,%i4,%i3 !y += stridey (delay slot) 503*25c28e83SPiotr Jasiukajtis 504*25c28e83SPiotr Jasiukajtis /*--------------------------------------------------------------------------*/ 505*25c28e83SPiotr Jasiukajtis /*-----------SPECIAL CASE HANDLING FOR LOOP1 -------------------------------*/ 506*25c28e83SPiotr Jasiukajtis /*--------------------------------------------------------------------------*/ 507*25c28e83SPiotr Jasiukajtis 508*25c28e83SPiotr Jasiukajtis .align 32 !align on I-cache boundary 509*25c28e83SPiotr Jasiukajtis.SPECIAL1: 510*25c28e83SPiotr Jasiukajtis orcc %o2,%g0,%g0 !(-) if intf < 0x3e300000 511*25c28e83SPiotr Jasiukajtis bpos 1f !if >=...continue 512*25c28e83SPiotr Jasiukajtis sethi %hi(0x7ff00000),%g1 !upper word of Inf (we use 64-bit wide int for this) 513*25c28e83SPiotr Jasiukajtis ba 3f 514*25c28e83SPiotr Jasiukajtis faddd %f36,%f50,%f30 !dummy op just to generate exception (delay slot) 515*25c28e83SPiotr Jasiukajtis1: 516*25c28e83SPiotr Jasiukajtis ld [%o5+4],%o5 !load x lower word 517*25c28e83SPiotr Jasiukajtis sllx %o0,32,%o0 !left justify intf 518*25c28e83SPiotr Jasiukajtis sllx %g1,32,%g1 !left justify Inf 519*25c28e83SPiotr Jasiukajtis or %o0,%o5,%o0 !merge in lower intf 520*25c28e83SPiotr Jasiukajtis cmp %o0,%g1 !if intf > 0x7ff00000 00000000 521*25c28e83SPiotr Jasiukajtis ble,pt %xcc,2f !pass thru if NaN 522*25c28e83SPiotr Jasiukajtis nop 523*25c28e83SPiotr Jasiukajtis fmuld %f36,%f36,%f36 !...... (x*x) trigger invalid exception 524*25c28e83SPiotr Jasiukajtis ba 3f 525*25c28e83SPiotr Jasiukajtis nop 526*25c28e83SPiotr Jasiukajtis2: 527*25c28e83SPiotr Jasiukajtis faddd %f46,%f48,%f36 !ans = pi/2 upper + pi/2 lower 528*25c28e83SPiotr Jasiukajtis3: 529*25c28e83SPiotr Jasiukajtis add %i1,%i2,%i1 !x += stridex 530*25c28e83SPiotr Jasiukajtis for %f36,%f42,%f36 !sign(ans) = sign of argument 531*25c28e83SPiotr Jasiukajtis std %f36,[%i3] !*y = ans 532*25c28e83SPiotr Jasiukajtis ba .LOOP1 !keep looping 533*25c28e83SPiotr Jasiukajtis add %i3,%i4,%i3 !y += stridey (delay slot) 534*25c28e83SPiotr Jasiukajtis 535*25c28e83SPiotr Jasiukajtis /*--------------------------------------------------------------------------*/ 536*25c28e83SPiotr Jasiukajtis /*------------SPECIAL CASE HANDLING FOR LOOP2 ------------------------------*/ 537*25c28e83SPiotr Jasiukajtis /*--------------------------------------------------------------------------*/ 538*25c28e83SPiotr Jasiukajtis 539*25c28e83SPiotr Jasiukajtis .align 32 !align on I-cache boundary 540*25c28e83SPiotr Jasiukajtis.SPECIAL2: 541*25c28e83SPiotr Jasiukajtis orcc %o2,%g0,%g0 !(-) if intf < 0x3e300000 542*25c28e83SPiotr Jasiukajtis bpos 1f !if >=...continue 543*25c28e83SPiotr Jasiukajtis sethi %hi(0x7ff00000),%g1 !upper word of Inf (we use 64-bit wide int for this) 544*25c28e83SPiotr Jasiukajtis ba 3f 545*25c28e83SPiotr Jasiukajtis faddd %f38,%f50,%f30 !dummy op just to generate exception (delay slot) 546*25c28e83SPiotr Jasiukajtis1: 547*25c28e83SPiotr Jasiukajtis ld [%o5+4],%o5 !load x lower word 548*25c28e83SPiotr Jasiukajtis sllx %o0,32,%o0 !left justify intf 549*25c28e83SPiotr Jasiukajtis sllx %g1,32,%g1 !left justify Inf 550*25c28e83SPiotr Jasiukajtis or %o0,%o5,%o0 !merge in lower intf 551*25c28e83SPiotr Jasiukajtis cmp %o0,%g1 !if intf > 0x7ff00000 00000000 552*25c28e83SPiotr Jasiukajtis ble,pt %xcc,2f !pass thru if NaN 553*25c28e83SPiotr Jasiukajtis nop 554*25c28e83SPiotr Jasiukajtis fmuld %f38,%f38,%f38 !...... (x*x) trigger invalid exception 555*25c28e83SPiotr Jasiukajtis ba 3f 556*25c28e83SPiotr Jasiukajtis nop 557*25c28e83SPiotr Jasiukajtis2: 558*25c28e83SPiotr Jasiukajtis faddd %f46,%f48,%f38 !ans = pi/2 upper + pi/2 lower 559*25c28e83SPiotr Jasiukajtis3: 560*25c28e83SPiotr Jasiukajtis add %i1,%i2,%i1 !x += stridex 561*25c28e83SPiotr Jasiukajtis for %f38,%f44,%f38 !sign(ans) = sign of argument 562*25c28e83SPiotr Jasiukajtis std %f38,[%i3] !*y = ans 563*25c28e83SPiotr Jasiukajtis ba .LOOP2 !keep looping 564*25c28e83SPiotr Jasiukajtis add %i3,%i4,%i3 !y += stridey 565*25c28e83SPiotr Jasiukajtis 566*25c28e83SPiotr Jasiukajtis /*--------------------------------------------------------------------------*/ 567*25c28e83SPiotr Jasiukajtis /*--------------------------------------------------------------------------*/ 568*25c28e83SPiotr Jasiukajtis /*--------------------------------------------------------------------------*/ 569*25c28e83SPiotr Jasiukajtis 570*25c28e83SPiotr Jasiukajtis SET_SIZE(__vatan) 571*25c28e83SPiotr Jasiukajtis 572*25c28e83SPiotr Jasiukajtis! .ident "03-20-96 Sparc V9 3-way-unrolled version" 573