1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 23 */ 24/* 25 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 26 * Use is subject to license terms. 27 */ 28 29 .file "__vatan.S" 30 31#include "libm.h" 32 33 RO_DATA 34 35! following is the C version of the ATAN algorithm 36! #include <math.h> 37! #include <stdio.h> 38! double jkatan(double *x) 39! { 40! double f, z, ans, ansu, ansl, tmp, poly, conup, conlo, dummy; 41! int index, sign, intf, intz; 42! extern const double __vlibm_TBL_atan1[]; 43! long *pf = (long *) &f, *pz = (long *) &z; 44! 45! /* Power series atan(x) = x + p1*x**3 + p2*x**5 + p3*x**7 46! * Error = -3.08254E-18 On the interval |x| < 1/64 */ 47! 48! /* define dummy names for readability. Use parray to help compiler optimize loads */ 49! #define p3 parray[0] 50! #define p2 parray[1] 51! #define p1 parray[2] 52! #define soffset 3 53! 54! static const double parray[] = { 55! -1.428029046844299722E-01, /* p[3] */ 56! 1.999999917247000615E-01, /* p[2] */ 57! -3.333333333329292858E-01, /* p[1] */ 58! 1.0, /* not used for p[0], though */ 59! -1.0, /* used to flip sign of answer */ 60! }; 61! 62! f = *x; /* fetch argument */ 63! intf = pf[0]; /* grab upper half */ 64! sign = intf & 0x80000000; /* sign of argument */ 65! intf ^= sign; /* abs(upper argument) */ 66! sign = (unsigned) sign >> 31; /* sign bit = 0 or 1 */ 67! pf[0] = intf; 68! 69! if( (intf > 0x43600000) || (intf < 0x3e300000) ) /* filter out special cases */ 70! { 71! if( (intf > 0x7ff00000) || 72! ((intf == 0x7ff00000) && (pf[1] !=0)) ) return (*x-*x);/* return NaN if x=NaN*/ 73! if( intf < 0x3e300000 ) /* avoid underflow for small arg */ 74! { 75! dummy = 1.0e37 + f; 76! dummy = dummy; 77! return (*x); 78! } 79! if( intf > 0x43600000 ) /* avoid underflow for big arg */ 80! { 81! index = 2; 82! f = __vlibm_TBL_atan1[index] + __vlibm_TBL_atan1[index+1];/* pi/2 up + pi/2 low */ 83! f = parray[soffset + sign] * f; /* put sign bit on ans */ 84! return (f); 85! } 86! } 87! 88! index = 0; /* points to 0,0 in table */ 89! if (intf > 0x40500000) /* if(|x| > 64 */ 90! { f = -1.0/f; 91! index = 2; /* point to pi/2 upper, lower */ 92! } 93! else if( intf >= 0x3f900000 ) /* if |x| >= (1/64)... */ 94! { 95! intz = (intf + 0x00008000) & 0x7fff0000;/* round arg, keep upper */ 96! pz[0] = intz; /* store as a double (z) */ 97! pz[1] = 0; /* ...lower */ 98! f = (f - z)/(1.0 + f*z); /* get reduced argument */ 99! index = (intz - 0x3f900000) >> 15; /* (index >> 16) << 1) */ 100! index += 4; /* skip over 0,0,pi/2,pi/2 */ 101! } 102! conup = __vlibm_TBL_atan1[index]; /* upper table */ 103! conlo = __vlibm_TBL_atan1[index+1]; /* lower table */ 104! tmp = f*f; 105! poly = (f*tmp)*((p3*tmp + p2)*tmp + p1); 106! ansu = conup + f; /* compute atan(f) upper */ 107! ansl = (((conup - ansu) + f) + poly) + conlo; 108! ans = ansu + ansl; 109! ans = parray[soffset + sign] * ans; 110! return ans; 111! } 112 113/* 8 bytes = 1 double f.p. word */ 114#define WSIZE 8 115 116 .align 32 !align with full D-cache line 117.COEFFS: 118 .double 0r-1.428029046844299722E-01 !p[3] 119 .double 0r1.999999917247000615E-01 !p[2] 120 .double 0r-3.333333333329292858E-01 !p[1] 121 .double 0r-1.0, !constant -1.0 122 .word 0x00008000,0x0 !for fp rounding of reduced arg 123 .word 0x7fff0000,0x0 !for fp truncation 124 .word 0x47900000,0 !a number close to 1.0E37 125 .word 0x80000000,0x0 !mask for fp sign bit 126 .word 0x3f800000,0x0 !1.0/128.0 dummy "safe" argument 127 .type .COEFFS,#object 128 129 ENTRY(__vatan) 130 save %sp,-SA(MINFRAME)-16,%sp 131 PIC_SETUP(g5) 132 PIC_SET(g5,__vlibm_TBL_atan1,o4) 133 PIC_SET(g5,.COEFFS,o0) 134/* 135 __vatan(int n, double *x, int stridex, double *y, stridey) 136 computes y(i) = atan( x(i) ), for 1=1,n. Stridex, stridey 137 are the distance between x and y elements 138 139 %i0 n 140 %i1 address of x 141 %i2 stride x 142 %i3 address of y 143 %i4 stride y 144*/ 145 cmp %i0,0 !if n <=0, 146 ble,pn %icc,.RETURN !....then do nothing 147 sll %i2,3,%i2 !convert stride to byte count 148 sll %i4,3,%i4 !convert stride to byte count 149 150/* pre-load constants before beginning main loop */ 151 152 ldd [%o0],%f58 !load p[3] 153 mov 2,%i5 !argcount = 3 154 155 ldd [%o0+WSIZE],%f60 !load p[2] 156 add %fp,STACK_BIAS-8,%l1 !yaddr1 = &dummy 157 fzero %f18 !ansu1 = 0 158 159 ldd [%o0+2*WSIZE],%f62 !load p[1] 160 add %fp,STACK_BIAS-8,%l2 !yaddr2 = &dummy 161 fzero %f12 !(poly1) = 0 162 163 ldd [%o0+3*WSIZE],%f56 !-1.0 164 fzero %f14 !tmp1 = 0 165 166 ldd [%o0+4*WSIZE],%f52 !load rounding mask 167 fzero %f16 !conup1 = 0 168 169 ldd [%o0+5*WSIZE],%f54 !load truncation mask 170 fzero %f36 !f1 = 0 171 172 ldd [%o0+6*WSIZE],%f50 !1.0e37 173 fzero %f38 !f2 = 0 174 175 ldd [%o0+7*WSIZE],%f32 !mask for sign bit 176 177 ldd [%o4+2*WSIZE],%f46 !pi/2 upper 178 ldd [%o4+(2*WSIZE+8)],%f48 !pi/2 lower 179 sethi %hi(0x40500000),%l6 !64.0 180 sethi %hi(0x3f900000),%l7 !1/64.0 181 mov 0,%l4 !index1 = 0 182 mov 0,%l5 !index2 = 0 183 184.MAINLOOP: 185 186 /*--------------------------------------------------------------------------*/ 187 /*--------------------------------------------------------------------------*/ 188 /*--------------------------------------------------------------------------*/ 189 190.LOOP0: 191 deccc %i0 !--n 192 bneg 1f 193 mov %i1,%o5 !xuse = x (delay slot) 194 195 ba 2f 196 nop !delay slot 1971: 198 PIC_SET(g5,.COEFFS+8*WSIZE,o5) 199 dec %i5 !argcount-- 2002: 201 sethi %hi(0x80000000),%o7 !mask for sign bit 202/*2 */ sethi %hi(0x43600000),%o1 !big = 0x43600000,0 203 ld [%o5],%o0 !intf = pf[0] = f upper 204 ldd [%o4+%l5],%f26 !conup2 = __vlibm_TBL_atan1[index2] 205 206 sethi %hi(0x3e300000),%o2 !small = 0x3e300000,0 207/*4 */ andn %o0,%o7,%o0 !intf = fabs(intf) 208 ldd [%o5],%f34 !f = *x into f34 209 210 sub %o1,%o0,%o1 !(-) if intf > big 211/*6 */ sub %o0,%o2,%o2 !(-) if intf < small 212 fand %f34,%f32,%f40 !sign0 = sign bit 213 fmuld %f38,%f38,%f24 !tmp2= f2*f2 214 215/*7 */ orcc %o1,%o2,%g0 !(-) if either true 216 bneg,pn %icc,.SPECIAL0 !if (-) goto special cases below 217 fabsd %f34,%f34 !abs(f) (delay slot) 218 !---------------------- 219 220 221 sethi %hi(0x8000),%o7 !rounding bit 222/*8 */ fpadd32 %f34,%f52,%f0 !intf + 0x00008000 (again) 223 faddd %f26,%f38,%f28 !ansu2 = conup2 + f2 224 225 add %o0,%o7,%o0 !intf + 0x00008000 (delay slot) 226/*9*/ fand %f0,%f54,%f0 !pz[0] = intz = (intf + 0x00008000) & 0x7fff0000 (again) 227 fmuld %f58,%f24,%f22 !p[3]*tmp2 228 229/*10 */ sethi %hi(0x7fff0000),%o7 !mask for rounding argument 230 fmuld %f34,%f0,%f10 !f*z 231 fsubd %f34,%f0,%f20 !f - z 232 add %o4,%l4,%l4 !base addr + index1 233 fmuld %f14,%f12,%f12 !poly1 = (f1*tmp1)*((p3*tmp1 + p2)*tmp1 + p1) 234 faddd %f16,%f36,%f16 !(conup1 - ansu1) + f1 235 236/*12 */ and %o0,%o7,%o0 !intz = (intf + 0x00008000) & 0x7fff0000 237 faddd %f22,%f60,%f22 !p[3]*tmp2 + p[2] 238 ldd [%l4+WSIZE],%f14 !conlo1 = __vlibm_TBL_atan1[index+1] 239 240/*13 */ sub %o0,%l7,%o2 !intz - 0x3f900000 241 fsubd %f10,%f56,%f10 !(f*z - (-1.0)) 242 faddd %f16,%f12,%f12 !((conup1 - ansu1) + f1) + poly1 243 244 cmp %o0,%l6 !(|f| > 64) 245 ble .ELSE0 !if(|f| > 64) then 246/*15 */ sra %o2,15,%o3 !index = (intz - 0x3f900000) >> 15 247 mov 2,%o1 !index == 2, point to conup, conlo = pi/2 upper, lower 248 ba .ENDIF0 !continue 249/*16 */ fdivd %f56,%f34,%f34 !f = -1.0/f (delay slot) 250 .ELSE0: !else f( |x| >= (1/64)) 251 cmp %o0,%l7 !if intf >= 1/64 252 bl .ENDIF0 !if( |x| >= (1/64) ) then... 253 mov 0,%o1 !index == 0 , point to conup,conlo = 0,0 254 add %o3,4,%o1 !index = index + 4 255/*16 */ fdivd %f20,%f10,%f34 !f = (f - z)/(1.0 + f*z), reduced argument 256 .ENDIF0: 257 258/*17*/ sll %o1,3,%l3 !index0 = index 259 mov %i3,%l0 !yaddr0 = address of y 260 faddd %f12,%f14,%f12 !ansl1 = (((conup1 - ansu)1 + f1) + poly1) + conlo1 261 fmuld %f22,%f24,%f22 !(p3*tmp2 + p2)*tmp2 262 fsubd %f26,%f28,%f26 !conup2 - ansu2 263 264/*20*/ add %i1,%i2,%i1 !x += stridex 265 add %i3,%i4,%i3 !y += stridey 266 faddd %f18,%f12,%f36 !ans1 = ansu1 + ansl1 267 fmuld %f38,%f24,%f24 !f*tmp2 268 faddd %f22,%f62,%f22 !(p3*tmp2 + p2)*tmp2 + p1 269 270/*23*/ for %f36,%f42,%f36 !sign(ans1) = sign of argument 271 std %f36,[%l1] !*yaddr1 = ans1 272 add %o4,%l5,%l5 !base addr + index2 273 fmuld %f24,%f22,%f22 !poly2 = (f2*tmp2)*((p3*tmp2 + p2)*tmp2 + p1) 274 faddd %f26,%f38,%f26 !(conup2 - ansu2) + f2 275 cmp %i5,0 !if argcount =0, we are done 276 be .RETURN 277 nop 278 279 /*--------------------------------------------------------------------------*/ 280 /*--------------------------------------------------------------------------*/ 281 /*--------------------------------------------------------------------------*/ 282 283.LOOP1: 284/*25*/ deccc %i0 !--n 285 bneg 1f 286 mov %i1,%o5 !xuse = x (delay slot) 287 ba 2f 288 nop !delay slot 2891: 290 PIC_SET(g5,.COEFFS+8*WSIZE,o5) 291 dec %i5 !argcount-- 2922: 293 294/*26*/ sethi %hi(0x80000000),%o7 !mask for sign bit 295 sethi %hi(0x43600000),%o1 !big = 0x43600000,0 296 ld [%o5],%o0 !intf = pf[0] = f upper 297 298/*28*/ sethi %hi(0x3e300000),%o2 !small = 0x3e300000,0 299 andn %o0,%o7,%o0 !intf = fabs(intf) 300 ldd [%o5],%f36 !f = *x into f36 301 302/*30*/ sub %o1,%o0,%o1 !(-) if intf > big 303 sub %o0,%o2,%o2 !(-) if intf < small 304 fand %f36,%f32,%f42 !sign1 = sign bit 305 306/*31*/ orcc %o1,%o2,%g0 !(-) if either true 307 bneg,pn %icc,.SPECIAL1 !if (-) goto special cases below 308 fabsd %f36,%f36 !abs(f) (delay slot) 309 !---------------------- 310 311/*32*/ fpadd32 %f36,%f52,%f0 !intf + 0x00008000 (again) 312 ldd [%l5+WSIZE],%f24 !conlo2 = __vlibm_TBL_atan1[index2+1] 313 314/*33*/ fand %f0,%f54,%f0 !pz[0] = intz = (intf + 0x00008000) & 0x7fff0000 (again) 315 sethi %hi(0x8000),%o7 !rounding bit 316 faddd %f26,%f22,%f22 !((conup2 - ansu2) + f2) + poly2 317 318/*34*/ add %o0,%o7,%o0 !intf + 0x00008000 (delay slot) 319 sethi %hi(0x7fff0000),%o7 !mask for rounding argument 320 fmuld %f36,%f0,%f10 !f*z 321 fsubd %f36,%f0,%f20 !f - z 322 323/*35*/ and %o0,%o7,%o0 !intz = (intf + 0x00008000) & 0x7fff0000 324 faddd %f22,%f24,%f22 !ansl2 = (((conup2 - ansu2) + f2) + poly2) + conlo2 325 326/*37*/ sub %o0,%l7,%o2 !intz - 0x3f900000 327 fsubd %f10,%f56,%f10 !(f*z - (-1.0)) 328 ldd [%o4+%l3],%f6 !conup0 = __vlibm_TBL_atan1[index0] 329 330 cmp %o0,%l6 !(|f| > 64) 331 ble .ELSE1 !if(|f| > 64) then 332/*38*/ sra %o2,15,%o3 !index = (intz - 0x3f900000) >> 15 333 mov 2,%o1 !index == 2, point to conup, conlo = pi/2 upper, lower 334 ba .ENDIF1 !continue 335/*40*/ fdivd %f56,%f36,%f36 !f = -1.0/f (delay slot) 336 .ELSE1: !else f( |x| >= (1/64)) 337 cmp %o0,%l7 !if intf >= 1/64 338 bl .ENDIF1 !if( |x| >= (1/64) ) then... 339 mov 0,%o1 !index == 0 , point to conup,conlo = 0,0 340 add %o3,4,%o1 !index = index + 4 341/*40*/ fdivd %f20,%f10,%f36 !f = (f - z)/(1.0 + f*z), reduced argument 342 .ENDIF1: 343 344/*41*/sll %o1,3,%l4 !index1 = index 345 mov %i3,%l1 !yaddr1 = address of y 346 fmuld %f34,%f34,%f4 !tmp0= f0*f0 347 faddd %f28,%f22,%f38 !ans2 = ansu2 + ansl2 348 349/*44*/add %i1,%i2,%i1 !x += stridex 350 add %i3,%i4,%i3 !y += stridey 351 fmuld %f58,%f4,%f2 !p[3]*tmp0 352 faddd %f6,%f34,%f8 !ansu0 = conup0 + f0 353 for %f38,%f44,%f38 !sign(ans2) = sign of argument 354 std %f38,[%l2] !*yaddr2 = ans2 355 cmp %i5,0 !if argcount =0, we are done 356 be .RETURN 357 nop 358 359 /*--------------------------------------------------------------------------*/ 360 /*--------------------------------------------------------------------------*/ 361 /*--------------------------------------------------------------------------*/ 362 363.LOOP2: 364/*46*/ deccc %i0 !--n 365 bneg 1f 366 mov %i1,%o5 !xuse = x (delay slot) 367 ba 2f 368 nop !delay slot 3691: 370 PIC_SET(g5,.COEFFS+8*WSIZE,o5) 371 dec %i5 !argcount-- 3722: 373 374/*47*/ sethi %hi(0x80000000),%o7 !mask for sign bit 375 sethi %hi(0x43600000),%o1 !big = 0x43600000,0 376 ld [%o5],%o0 !intf = pf[0] = f upper 377 378/*49*/ sethi %hi(0x3e300000),%o2 !small = 0x3e300000,0 379 andn %o0,%o7,%o0 !intf = fabs(intf) 380 ldd [%o5],%f38 !f = *x into f38 381 382/*51*/ sub %o1,%o0,%o1 !(-) if intf > big 383 sub %o0,%o2,%o2 !(-) if intf < small 384 fand %f38,%f32,%f44 !sign2 = sign bit 385 386/*52*/ orcc %o1,%o2,%g0 !(-) if either true 387 bneg,pn %icc,.SPECIAL2 !if (-) goto special cases below 388 fabsd %f38,%f38 !abs(f) (delay slot) 389 !---------------------- 390 391/*53*/ fpadd32 %f38,%f52,%f0 !intf + 0x00008000 (again) 392 faddd %f2,%f60,%f2 !p[3]*tmp0 + p[2] 393 394/*54*/ sethi %hi(0x8000),%o7 !rounding bit 395 fand %f0,%f54,%f0 !pz[0] = intz = (intf + 0x00008000) & 0x7fff0000 (again) 396 397/*55*/ add %o0,%o7,%o0 !intf + 0x00008000 (delay slot) 398 sethi %hi(0x7fff0000),%o7 !mask for rounding argument 399 fmuld %f38,%f0,%f10 !f*z 400 fsubd %f38,%f0,%f20 !f - z 401 402/*56*/ and %o0,%o7,%o0 !intz = (intf + 0x00008000) & 0x7fff0000 403 fmuld %f2,%f4,%f2 !(p3*tmp0 + p2)*tmp0 404 fsubd %f6,%f8,%f6 !conup0 - ansu0 405 406/*58*/ sub %o0,%l7,%o2 !intz - 0x3f900000 407 fsubd %f10,%f56,%f10 !(f*z - (-1.0)) 408 ldd [%o4+%l4],%f16 !conup1 = __vlibm_TBL_atan1[index1] 409 410 cmp %o0,%l6 !(|f| > 64) 411 ble .ELSE2 !if(|f| > 64) then 412/*60*/ sra %o2,15,%o3 !index = (intz - 0x3f900000) >> 15 413 mov 2,%o1 !index == 2, point to conup, conlo = pi/2 upper, lower 414 ba .ENDIF2 !continue 415/*61*/ fdivd %f56,%f38,%f38 !f = -1.0/f (delay slot) 416 .ELSE2: !else f( |x| >= (1/64)) 417 cmp %o0,%l7 !if intf >= 1/64 418 bl .ENDIF2 !if( |x| >= (1/64) ) then... 419 mov 0,%o1 !index == 0 , point to conup,conlo = 0,0 420 add %o3,4,%o1 !index = index + 4 421/*61*/ fdivd %f20,%f10,%f38 !f = (f - z)/(1.0 + f*z), reduced argument 422 .ENDIF2: 423 424 425/*62*/ sll %o1,3,%l5 !index2 = index 426 mov %i3,%l2 !yaddr2 = address of y 427 fmuld %f34,%f4,%f4 !f0*tmp0 428 faddd %f2,%f62,%f2 !(p3*tmp0 + p2)*tmp0 + p1 429 fmuld %f36,%f36,%f14 !tmp1= f1*f1 430 431/*65*/add %o4,%l3,%l3 !base addr + index0 432 fmuld %f4,%f2,%f2 !poly0 = (f0*tmp0)*((p3*tmp0 + p2)*tmp0 + p1) 433 faddd %f6,%f34,%f6 !(conup0 - ansu0) + f0 434 fmuld %f58,%f14,%f12 !p[3]*tmp1 435 faddd %f16,%f36,%f18 !ansu1 = conup1 + f1 436 ldd [%l3+WSIZE],%f4 !conlo0 = __vlibm_TBL_atan1[index0+1] 437 438/*68*/ add %i1,%i2,%i1 !x += stridex 439 add %i3,%i4,%i3 !y += stridey 440 faddd %f6,%f2,%f2 !((conup0 - ansu0) + f0) + poly0 441 faddd %f12,%f60,%f12 !p[3]*tmp1 + p[2] 442 443/*71*/faddd %f2,%f4,%f2 !ansl0 = (((conup0 - ansu)0 + f0) + poly0) + conlo0 444 fmuld %f12,%f14,%f12 !(p3*tmp1 + p2)*tmp1 445 fsubd %f16,%f18,%f16 !conup1 - ansu1 446 447/*74*/faddd %f8,%f2,%f34 !ans0 = ansu0 + ansl0 448 fmuld %f36,%f14,%f14 !f1*tmp1 449 faddd %f12,%f62,%f12 !(p3*tmp1 + p2)*tmp1 + p1 450 451/*77*/ for %f34,%f40,%f34 !sign(ans0) = sign of argument 452 std %f34,[%l0] !*yaddr0 = ans, always gets stored (delay slot) 453 cmp %i5,0 !if argcount =0, we are done 454 bg .MAINLOOP 455 nop 456 457 /*--------------------------------------------------------------------------*/ 458 /*--------------------------------------------------------------------------*/ 459 /*--------------------------------------------------------------------------*/ 460 461.RETURN: 462 ret 463 restore %g0,%g0,%g0 464 465 /*--------------------------------------------------------------------------*/ 466 /*------------SPECIAL CASE HANDLING FOR LOOP0 ------------------------------*/ 467 /*--------------------------------------------------------------------------*/ 468 469/* at this point 470 %i1 x address 471 %o0 intf 472 %o2 intf - 0x3e300000 473 %f34,36,38 f0,f1,f2 474 %f40,42,44 sign0,sign1,sign2 475*/ 476 477 .align 32 !align on I-cache boundary 478.SPECIAL0: 479 orcc %o2,%g0,%g0 !(-) if intf < 0x3e300000 480 bpos 1f !if >=...continue 481 sethi %hi(0x7ff00000),%g1 !upper word of Inf (we use 64-bit wide int for this) 482 ba 3f 483 faddd %f34,%f50,%f30 !dummy op just to generate exception (delay slot) 4841: 485 ld [%o5+4],%o5 !load x lower word 486 sllx %o0,32,%o0 !left justify intf 487 sllx %g1,32,%g1 !left justify Inf 488 or %o0,%o5,%o0 !merge in lower intf 489 cmp %o0,%g1 !if intf > 0x7ff00000 00000000 490 ble,pt %xcc,2f !pass thru if NaN 491 nop 492 fmuld %f34,%f34,%f34 !...... (x*x) trigger invalid exception 493 ba 3f 494 nop 4952: 496 faddd %f46,%f48,%f34 !ans = pi/2 upper + pi/2 lower 4973: 498 add %i1,%i2,%i1 !x += stridex 499 for %f34,%f40,%f34 !sign(ans) = sign of argument 500 std %f34,[%i3] !*y = ans 501 ba .LOOP0 !keep looping 502 add %i3,%i4,%i3 !y += stridey (delay slot) 503 504 /*--------------------------------------------------------------------------*/ 505 /*-----------SPECIAL CASE HANDLING FOR LOOP1 -------------------------------*/ 506 /*--------------------------------------------------------------------------*/ 507 508 .align 32 !align on I-cache boundary 509.SPECIAL1: 510 orcc %o2,%g0,%g0 !(-) if intf < 0x3e300000 511 bpos 1f !if >=...continue 512 sethi %hi(0x7ff00000),%g1 !upper word of Inf (we use 64-bit wide int for this) 513 ba 3f 514 faddd %f36,%f50,%f30 !dummy op just to generate exception (delay slot) 5151: 516 ld [%o5+4],%o5 !load x lower word 517 sllx %o0,32,%o0 !left justify intf 518 sllx %g1,32,%g1 !left justify Inf 519 or %o0,%o5,%o0 !merge in lower intf 520 cmp %o0,%g1 !if intf > 0x7ff00000 00000000 521 ble,pt %xcc,2f !pass thru if NaN 522 nop 523 fmuld %f36,%f36,%f36 !...... (x*x) trigger invalid exception 524 ba 3f 525 nop 5262: 527 faddd %f46,%f48,%f36 !ans = pi/2 upper + pi/2 lower 5283: 529 add %i1,%i2,%i1 !x += stridex 530 for %f36,%f42,%f36 !sign(ans) = sign of argument 531 std %f36,[%i3] !*y = ans 532 ba .LOOP1 !keep looping 533 add %i3,%i4,%i3 !y += stridey (delay slot) 534 535 /*--------------------------------------------------------------------------*/ 536 /*------------SPECIAL CASE HANDLING FOR LOOP2 ------------------------------*/ 537 /*--------------------------------------------------------------------------*/ 538 539 .align 32 !align on I-cache boundary 540.SPECIAL2: 541 orcc %o2,%g0,%g0 !(-) if intf < 0x3e300000 542 bpos 1f !if >=...continue 543 sethi %hi(0x7ff00000),%g1 !upper word of Inf (we use 64-bit wide int for this) 544 ba 3f 545 faddd %f38,%f50,%f30 !dummy op just to generate exception (delay slot) 5461: 547 ld [%o5+4],%o5 !load x lower word 548 sllx %o0,32,%o0 !left justify intf 549 sllx %g1,32,%g1 !left justify Inf 550 or %o0,%o5,%o0 !merge in lower intf 551 cmp %o0,%g1 !if intf > 0x7ff00000 00000000 552 ble,pt %xcc,2f !pass thru if NaN 553 nop 554 fmuld %f38,%f38,%f38 !...... (x*x) trigger invalid exception 555 ba 3f 556 nop 5572: 558 faddd %f46,%f48,%f38 !ans = pi/2 upper + pi/2 lower 5593: 560 add %i1,%i2,%i1 !x += stridex 561 for %f38,%f44,%f38 !sign(ans) = sign of argument 562 std %f38,[%i3] !*y = ans 563 ba .LOOP2 !keep looping 564 add %i3,%i4,%i3 !y += stridey 565 566 /*--------------------------------------------------------------------------*/ 567 /*--------------------------------------------------------------------------*/ 568 /*--------------------------------------------------------------------------*/ 569 570 SET_SIZE(__vatan) 571 572! .ident "03-20-96 Sparc V9 3-way-unrolled version" 573