1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 23 */ 24/* 25 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 26 * Use is subject to license terms. 27 */ 28 29 .file "__vhypot.S" 30 31#include "libm.h" 32 33 RO_DATA 34 .align 64 35 36.CONST_TBL: 37 .word 0x7ff00000, 0 ! DC0 38 .word 0x7fe00000, 0 ! DC1 39 .word 0x00100000, 0 ! DC2 40 .word 0x41b00000, 0 ! D2ON28 = 268435456.0 41 .word 0x7fd00000, 0 ! DC3 42 43#define counter %i0 44#define tmp_counter %l3 45#define tmp_px %l5 46#define tmp_py %o7 47#define stridex %i2 48#define stridey %i4 49#define stridez %l0 50 51#define DC0 %f8 52#define DC0_HI %f8 53#define DC0_LO %f9 54#define DC1 %f46 55#define DC2 %f48 56#define DC3 %f0 57#define D2ON28 %f62 58 59!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 60! !!!!! algorithm !!!!! 61! ((float*)&x)[0] = ((float*)px)[0]; 62! ((float*)&x)[1] = ((float*)px)[1]; 63! 64! ((float*)&y)[0] = ((float*)py)[0]; 65! ((float*)&y)[1] = ((float*)py)[1]; 66! 67! x = fabs(x); 68! y = fabs(y); 69! 70! c0 = vis_fcmple32(DC1,x); 71! c2 = vis_fcmple32(DC1,y); 72! c1 = vis_fcmpgt32(DC2,x); 73! 
c3 = vis_fcmpgt32(DC2,y); 74! 75! c0 |= c2; 76! c1 &= c3; 77! if ( (c0 & 2) != 0 ) 78! { 79! lx = ((int*)px)[1]; 80! ly = ((int*)py)[1]; 81! hx = *(int*)px; 82! hy = *(int*)py; 83! 84! hx &= 0x7fffffff; 85! hy &= 0x7fffffff; 86! 87! j0 = hx; 88! if ( j0 < hy ) j0 = hy; 89! j0 &= 0x7ff00000; 90! if ( j0 >= 0x7ff00000 ) 91! { 92! if ( hx == 0x7ff00000 && lx == 0 ) res = x == y ? y : x; 93! else if ( hy == 0x7ff00000 && ly == 0 ) res = x == y ? x : y; 94! else res = x * y; 95! 96! ((float*)pz)[0] = ((float*)&res)[0]; 97! ((float*)pz)[1] = ((float*)&res)[1]; 98! } 99! else 100! { 101! diff = hy - hx; 102! j0 = diff >> 31; 103! if ( ((diff ^ j0) - j0) < 0x03600000 ) 104! {! 105! x *= D2ONM1022; 106! y *= D2ONM1022; 107! 108! x_hi = ( x + two28 ) - two28; 109! x_lo = x - x_hi; 110! y_hi = ( y + two28 ) - two28; 111! y_lo = y - y_hi; 112! res = (x_hi * x_hi + y_hi * y_hi); 113! res += ((x + x_hi) * x_lo + (y + y_hi) * y_lo); 114! 115! res = sqrt(res); 116! 117! res = D2ONP1022 * res; 118! ((float*)pz)[0] = ((float*)&res)[0]; 119! ((float*)pz)[1] = ((float*)&res)[1]; 120! } 121! else 122! { 123! res = x + y; 124! ((float*)pz)[0] = ((float*)&res)[0]; 125! ((float*)pz)[1] = ((float*)&res)[1]; 126! } 127! } 128! px += stridex; 129! py += stridey; 130! pz += stridez; 131! continue; 132! } 133! if ( (c1 & 2) != 0 ) 134! { 135! x *= D2ONP1022; 136! y *= D2ONP1022; 137! 138! x_hi = ( x + two28 ) - two28; 139! x_lo = x - x_hi; 140! y_hi = ( y + two28 ) - two28; 141! y_lo = y - y_hi; 142! res = (x_hi * x_hi + y_hi * y_hi); 143! res += ((x + x_hi) * x_lo + (y + y_hi) * y_lo); 144! 145! res = sqrt(res); 146! 147! res = D2ONM1022 * res; 148! ((float*)pz)[0] = ((float*)&res)[0]; 149! ((float*)pz)[1] = ((float*)&res)[1]; 150! px += stridex; 151! py += stridey; 152! pz += stridez; 153! continue; 154! } 155! 156! dmax = x; 157! if ( dmax < y ) dmax = y; 158! 159! dmax = vis_fand(dmax,DC0); 160! dnorm = vis_fpsub32(DC1,dmax); 161! 162! x *= dnorm; 163! y *= dnorm; 164! 165! 
! ===================================================================
! NOTE(review): this is a collapsed listing of the original file (the
! original source line numbers are embedded in the text), so review
! comments are added on separate lines only; no instruction text below
! is altered.
!
! __vhypot -- vectorized hypot() over strided arrays of doubles.
! Prologue + pipeline warm-up:
!   * loads the constant table via PIC_SETUP/PIC_SET and scales the
!     element strides by 8 (sll ...,3 -- doubles);
!   * .begin/.begin1 restart the vector after a special-case element,
!     using tmp_px/tmp_py/tmp_counter saved by the .updateN stubs;
!   * the first element's exponent is range-checked with integer
!     compares (masked with 0x7ffffc00): >= 0x7fe00000 goes to .spec0,
!     < 0x00100000 goes to .spec1;
!   * subsequent elements are checked with VIS partitioned compares
!     fcmple32(DC1,v)/fcmpgt32(DC2,v); a hit branches to the .updateN
!     stub for that pipeline stage, which resumes at the matching
!     .contN label;
!   * per element: dmax = max(|x|,|y|), its exponent is isolated with
!     fand(dmax,DC0), and dnorm = fpsub32(DC1,dmax) -- presumably a
!     power of two that rescales x,y near 1 before squaring (TODO
!     confirm against the VIS fpsub32 semantics).
! ===================================================================
x_hi = x + D2ON28; 166! x_hi -= D2ON28; 167! x_lo = x - x_hi; 168! 169! y_hi = y + D2ON28; 170! y_hi -= D2ON28; 171! y_lo = y - y_hi; 172! 173! res = x_hi * x_hi; 174! dtmp1 = x + x_hi; 175! dtmp0 = y_hi * y_hi; 176! dtmp2 = y + y_hi; 177! 178! res += dtmp0; 179! dtmp1 *= x_lo; 180! dtmp2 *= y_lo; 181! dtmp1 += dtmp2; 182! res += dtmp1; 183! 184! res = sqrt(res); 185! 186! res = dmax * res; 187! ((float*)pz)[0] = ((float*)&res)[0]; 188! ((float*)pz)[1] = ((float*)&res)[1]; 189! 190! px += stridex; 191! py += stridey; 192! pz += stridez; 193!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 194 195 ENTRY(__vhypot) 196 save %sp,-SA(MINFRAME),%sp 197 PIC_SETUP(l7) 198 PIC_SET(l7,.CONST_TBL,o3) 199 wr %g0,0x82,%asi 200 201#ifdef __sparcv9 202 ldx [%fp+STACK_BIAS+176],%l0 203#else 204 ld [%fp+STACK_BIAS+92],%l0 205#endif 206 ldd [%o3],DC0 207 sll %i2,3,stridex 208 mov %i0,tmp_counter 209 210 ldd [%o3+8],DC1 211 sll %i4,3,stridey 212 mov %i1,tmp_px 213 214 ldd [%o3+16],DC2 215 sll %l0,3,stridez 216 mov %i3,tmp_py 217 218 ldd [%o3+24],D2ON28 219 220 ldd [%o3+32],DC3 221 222.begin: 223 mov tmp_counter,counter 224 mov tmp_px,%i1 225 mov tmp_py,%i3 226 clr tmp_counter 227.begin1: 228 cmp counter,0 229 ble,pn %icc,.exit 230 nop 231 232 lda [%i1]%asi,%o0 233 sethi %hi(0x7ffffc00),%o5 234 235 lda [%i3]%asi,%o2 236 add %o5,1023,%o5 237 238 lda [%i1]%asi,%f26 ! (1_0) ((float*)&x)[0] = ((float*)px)[0]; 239 240 lda [%i1+4]%asi,%f27 ! (1_0) ((float*)&x)[1] = ((float*)px)[1]; 241 add %i1,stridex,%o1 ! px += stridex 242 243 lda [%i3]%asi,%f24 ! (1_0) ((float*)&y)[0] = ((float*)py)[0]; 244 sethi %hi(0x00100000),%l7 245 and %o0,%o5,%o0 246 247 lda [%i3+4]%asi,%f25 ! (1_0) ((float*)&y)[1] = ((float*)py)[1]; 248 and %o2,%o5,%o2 249 sethi %hi(0x7fe00000),%l6 250 251 fabsd %f26,%f36 ! (1_0) x = fabs(x); 252 cmp %o0,%o2 253 mov %o2,%l4 254 255 fabsd %f24,%f54 ! (1_0) y = fabs(y); 256 add %i3,stridey,%o5 ! py += stridey 257 movg %icc,%o0,%o2 258 lda [%o5]%asi,%f28 ! 
(2_0) ((float*)&y)[0] = ((float*)py)[0]; 259 260 cmp %o2,%l6 261 sethi %hi(0x7ff00000),%o4 262 bge,pn %icc,.spec0 263 lda [%o5+4]%asi,%f29 ! (2_0) ((float*)&y)[1] = ((float*)py)[1]; 264 265 cmp %o2,%l7 266 bl,pn %icc,.spec1 267 nop 268 lda [%o1]%asi,%f26 ! (2_0) ((float*)&x)[0] = ((float*)px)[0]; 269 270 lda [%o1+4]%asi,%f27 ! (2_0) ((float*)&x)[1] = ((float*)px)[1]; 271 add %i3,stridey,%i3 ! py += stridey 272 273 fabsd %f28,%f34 ! (2_0) y = fabs(y); 274 275 fabsd %f26,%f50 ! (2_0) x = fabs(x); 276 277 fcmple32 DC1,%f50,%o3 ! (2_0) c0 = vis_fcmple32(DC1,x); 278 279 fcmple32 DC1,%f34,%o0 ! (2_0) c2 = vis_fcmple32(DC1,y); 280 281 fcmpgt32 DC2,%f50,%o4 ! (2_0) c1 = vis_fcmpgt32(DC2,x); 282 283 fcmpgt32 DC2,%f34,%o5 ! (2_0) c3 = vis_fcmpgt32(DC2,y); 284 285 or %o3,%o0,%o3 ! (2_0) c0 |= c2; 286 287 andcc %o3,2,%g0 ! (2_0) c0 & 2 288 bnz,pn %icc,.update0 ! (2_0) if ( (c0 & 2) != 0 ) 289 and %o4,%o5,%o4 ! (2_0) c1 &= c3; 290.cont0: 291 add %i3,stridey,%l4 ! py += stridey 292 andcc %o4,2,%g0 ! (2_0) c1 & 2 293 bnz,pn %icc,.update1 ! (2_0) if ( (c1 & 2) != 0 ) 294 fmovd %f36,%f56 ! (1_0) dmax = x; 295.cont1: 296 lda [%l4]%asi,%f30 ! (3_0) ((float*)&y)[0] = ((float*)py)[0]; 297 add %o1,stridex,%l2 ! px += stridex 298 299 lda [%l4+4]%asi,%f31 ! (3_0) ((float*)&y)[1] = ((float*)py)[1]; 300 301 lda [%l2]%asi,%f18 ! (3_1) ((float*)&x)[0] = ((float*)px)[0]; 302 303 lda [%l2+4]%asi,%f19 ! (3_1) ((float*)&x)[1] = ((float*)px)[1]; 304 305 fabsd %f30,%f30 ! (3_1) y = fabs(y); 306 307 fabsd %f18,%f18 ! (3_1) x = fabs(x); 308 309 fcmped %fcc2,%f54,%f56 ! (1_1) dmax ? y 310 311 fmovdg %fcc2,%f54,%f56 ! (1_1) if ( dmax < y ) dmax = y; 312 313 fcmple32 DC1,%f18,%o3 ! (3_1) c0 = vis_fcmple32(DC1,x); 314 315 fcmple32 DC1,%f30,%o0 ! (3_1) c2 = vis_fcmple32(DC1,y); 316 317 fcmpgt32 DC2,%f18,%o4 ! (3_1) c1 = vis_fcmpgt32(DC2,x); 318 319 fcmpgt32 DC2,%f30,%o1 ! (3_1) c3 = vis_fcmpgt32(DC2,y); 320 321 fand %f56,DC0,%f38 ! (1_1) dmax = vis_fand(dmax,DC0); 322 323 or %o3,%o0,%o3 ! 
(3_1) c0 |= c2; 324 325 andcc %o3,2,%g0 ! (3_1) c0 & 2 326 bnz,pn %icc,.update2 ! (3_1) if ( (c0 & 2) != 0 ) 327 and %o4,%o1,%o4 ! (3_1) c1 &= c3; 328.cont2: 329 add %l4,stridey,%i3 ! py += stridey 330 andcc %o4,2,%g0 ! (3_1) c1 & 2 331 bnz,pn %icc,.update3 ! (3_1) if ( (c1 & 2) != 0 ) 332 fmovd %f50,%f32 ! (2_1) dmax = x; 333.cont3: 334 fpsub32 DC1,%f38,%f10 ! (1_1) dnorm = vis_fpsub32(DC1,dmax); 335 lda [%i3]%asi,%f20 ! (0_0) ((float*)&y)[0] = ((float*)py)[0]; 336 337 lda [%i3+4]%asi,%f21 ! (0_0) ((float*)&y)[1] = ((float*)py)[1]; 338 339 add %l2,stridex,%l1 ! px += stridex 340 341 fmuld %f36,%f10,%f36 ! (1_1) x *= dnorm; 342 lda [%l1]%asi,%f22 ! (0_0) ((float*)&x)[0] = ((float*)px)[0] 343 344 lda [%l1+4]%asi,%f23 ! (0_0) ((float*)&x)[1] = ((float*)px)[1]; 345 346 fmuld %f54,%f10,%f56 ! (1_1) y *= dnorm; 347 fabsd %f20,%f40 ! (0_0) y = fabs(y); 348 349 fabsd %f22,%f20 ! (0_0) x = fabs(x); 350 351 fcmped %fcc3,%f34,%f32 ! (2_1) dmax ? y 352 353 354 fmovdg %fcc3,%f34,%f32 ! (2_1) if ( dmax < y ) dmax = y; 355 356 faddd %f36,D2ON28,%f58 ! (1_1) x_hi = x + D2ON28; 357 fcmple32 DC1,%f20,%g5 ! (0_0) c0 = vis_fcmple32(DC1,x); 358 359 faddd %f56,D2ON28,%f22 ! (1_1) y_hi = y + D2ON28; 360 fcmple32 DC1,%f40,%o2 ! (0_0) c2 = vis_fcmple32(DC1,y); 361 362 fcmpgt32 DC2,%f20,%g1 ! (0_0) c1 = vis_fcmpgt32(DC2,x); 363 364 fcmpgt32 DC2,%f40,%o4 ! (0_0) c3 = vis_fcmpgt32(DC2,y); 365 366 fand %f32,DC0,%f52 ! (2_1) dmax = vis_fand(dmax,DC0); 367 368 or %g5,%o2,%g5 ! (0_0) c0 |= c2; 369 fsubd %f58,D2ON28,%f58 ! (1_1) x_hi -= D2ON28; 370 371 andcc %g5,2,%g0 ! (0_0) c0 & 2 372 bnz,pn %icc,.update4 ! (0_0) if ( (c0 & 2) != 0 ) 373 fsubd %f22,D2ON28,%f22 ! (1_1) y_hi -= D2ON28; 374.cont4: 375 and %g1,%o4,%g1 ! (0_0) c1 &= c3; 376 377 add %i3,stridey,%l2 ! py += stridey 378 andcc %g1,2,%g0 ! (0_0) c1 & 2 379 bnz,pn %icc,.update5 ! (0_0) if ( (c1 & 2) != 0 ) 380 fmovd %f18,%f44 ! (3_1) dmax = x; 381.cont5: 382 fpsub32 DC1,%f52,%f10 ! 
(2_1) dnorm = vis_fpsub32(DC1,dmax); 383 lda [%l2]%asi,%f24 ! (1_0) ((float*)&y)[0] = ((float*)py)[0]; 384 385 fmuld %f58,%f58,%f60 ! (1_1) res = x_hi * x_hi; 386 lda [%l2+4]%asi,%f25 ! (1_0) ((float*)&y)[1] = ((float*)py)[1]; 387 add %l1,stridex,%l7 ! px += stridex 388 faddd %f56,%f22,%f28 ! (1_1) dtmp2 = y + y_hi; 389 390 faddd %f36,%f58,%f6 ! (1_1) dtmp1 = x + x_hi; 391 lda [%l7]%asi,%f26 ! (1_0) ((float*)&x)[0] = ((float*)px)[0]; 392 393 fmuld %f50,%f10,%f50 ! (2_1) x *= dnorm; 394 fsubd %f36,%f58,%f58 ! (1_1) x_lo = x - x_hi; 395 lda [%l7+4]%asi,%f27 ! (1_0) ((float*)&x)[1] = ((float*)px)[1]; 396 397 fmuld %f22,%f22,%f2 ! (1_1) dtmp0 = y_hi * y_hi; 398 fsubd %f56,%f22,%f56 ! (1_1) y_lo = y - y_hi; 399 400 fmuld %f34,%f10,%f34 ! (2_1) y *= dnorm; 401 fabsd %f24,%f54 ! (1_0) y = fabs(y); 402 403 fabsd %f26,%f36 ! (1_0) x = fabs(x); 404 405 fmuld %f6,%f58,%f10 ! (1_1) dtmp1 *= x_lo; 406 fcmped %fcc0,%f30,%f44 ! (3_1) dmax ? y 407 408 fmuld %f28,%f56,%f26 ! (1_1) dtmp2 *= y_lo; 409 410 fmovdg %fcc0,%f30,%f44 ! (3_1) if ( dmax < y ) dmax = y; 411 412 faddd %f50,D2ON28,%f58 ! (2_1) x_hi = x + D2ON28; 413 fcmple32 DC1,%f36,%g1 ! (1_0) c0 = vis_fcmple32(DC1,x); 414 415 faddd %f34,D2ON28,%f22 ! (2_1) y_hi = y + D2ON28; 416 fcmple32 DC1,%f54,%g5 ! (1_0) c2 = vis_fcmple32(DC1,y); 417 418 faddd %f60,%f2,%f24 ! (1_1) res += dtmp0; 419 fcmpgt32 DC2,%f36,%o5 ! (1_0) c1 = vis_fcmpgt32(DC2,x); 420 421 faddd %f10,%f26,%f28 ! (1_1) dtmp1 += dtmp2; 422 fcmpgt32 DC2,%f54,%o1 ! (1_0) c3 = vis_fcmpgt32(DC2,y); 423 424 fand %f44,DC0,%f14 ! (3_1) dmax = vis_fand(dmax,DC0); 425 426 or %g1,%g5,%g1 ! (1_0) c0 |= c2; 427 fsubd %f58,D2ON28,%f44 ! (2_1) x_hi -= D2ON28; 428 429 andcc %g1,2,%g0 ! (1_0) c0 & 2 430 bnz,pn %icc,.update6 ! (1_0) if ( (c0 & 2) != 0 ) 431 fsubd %f22,D2ON28,%f58 ! (2_1) y_hi -= D2ON28; 432.cont6: 433 and %o5,%o1,%o5 ! (1_0) c1 &= c3; 434 faddd %f24,%f28,%f26 ! (1_1) res += dtmp1; 435 436 add %l2,stridey,%i3 ! py += stridey 437 andcc %o5,2,%g0 ! 
(1_0) c1 & 2 438 bnz,pn %icc,.update7 ! (1_0) if ( (c1 & 2) != 0 ) 439 fmovd %f20,%f4 ! (0_0) dmax = x; 440.cont7: 441 fpsub32 DC1,%f14,%f10 ! (3_1) dnorm = vis_fpsub32(DC1,dmax); 442 lda [%i3]%asi,%f28 ! (2_0) ((float*)&y)[0] = ((float*)py)[0]; 443 444 fmuld %f44,%f44,%f2 ! (2_1) res = x_hi * x_hi; 445 lda [%i3+4]%asi,%f29 ! (2_0) ((float*)&y)[1] = ((float*)py)[1]; 446 add %l7,stridex,%o1 ! px += stridex 447 faddd %f34,%f58,%f60 ! (2_1) dtmp2 = y + y_hi; 448 449 fsqrtd %f26,%f24 ! (1_1) res = sqrt(res); 450 lda [%o1]%asi,%f26 ! (2_0) ((float*)&x)[0] = ((float*)px)[0]; 451 faddd %f50,%f44,%f56 ! (2_1) dtmp1 = x + x_hi; 452 453 fmuld %f18,%f10,%f6 ! (3_1) x *= dnorm; 454 fsubd %f50,%f44,%f18 ! (2_1) x_lo = x - x_hi; 455 lda [%o1+4]%asi,%f27 ! (2_0) ((float*)&x)[1] = ((float*)px)[1]; 456 457 fmuld %f58,%f58,%f44 ! (2_1) dtmp0 = y_hi * y_hi; 458 fsubd %f34,%f58,%f22 ! (2_1) y_lo = y - y_hi; 459 460 fmuld %f30,%f10,%f58 ! (3_1) y *= dnorm; 461 fabsd %f28,%f34 ! (2_0) y = fabs(y); 462 463 fabsd %f26,%f50 ! (2_0) x = fabs(x); 464 465 fmuld %f56,%f18,%f10 ! (2_1) dtmp1 *= x_lo; 466 fcmped %fcc1,%f40,%f4 ! (0_0) dmax ? y 467 468 fmuld %f60,%f22,%f12 ! (2_1) dtmp2 *= y_lo; 469 470 fmovdg %fcc1,%f40,%f4 ! (0_0) if ( dmax < y ) dmax = y; 471 472 faddd %f6,D2ON28,%f56 ! (3_1) x_hi = x + D2ON28; 473 fcmple32 DC1,%f50,%o3 ! (2_0) c0 = vis_fcmple32(DC1,x); 474 475 faddd %f58,D2ON28,%f28 ! (3_1) y_hi = y + D2ON28; 476 fcmple32 DC1,%f34,%o0 ! (2_0) c2 = vis_fcmple32(DC1,y); 477 478 faddd %f2,%f44,%f30 ! (2_1) res += dtmp0; 479 fcmpgt32 DC2,%f50,%o4 ! (2_0) c1 = vis_fcmpgt32(DC2,x); 480 481 faddd %f10,%f12,%f26 ! (2_1) dtmp1 += dtmp2; 482 fcmpgt32 DC2,%f34,%o5 ! (2_0) c3 = vis_fcmpgt32(DC2,y); 483 484 fand %f4,DC0,%f16 ! (0_0) dmax = vis_fand(dmax,DC0); 485 486 or %o3,%o0,%o3 ! (2_0) c0 |= c2; 487 fsubd %f56,D2ON28,%f18 ! (3_1) x_hi -= D2ON28; 488 489 andcc %o3,2,%g0 ! (2_0) c0 & 2 490 bnz,pn %icc,.update8 ! (2_0) if ( (c0 & 2) != 0 ) 491 fsubd %f28,D2ON28,%f4 ! 
! ===================================================================
! NOTE(review): collapsed listing -- review comments are added on
! separate lines only; no instruction text below is altered.
!
! .main_loop -- steady state: four elements in flight, one result
! stored per "stage" slot per iteration (the (i_j) tags on the
! original comments name pipeline stage i, iteration offset j).
! Per element the sequence is:
!   scale x,y by dnorm; split each into hi/lo halves by adding and
!   subtracting D2ON28 (2^28); accumulate
!   res = x_hi^2 + y_hi^2 + (x+x_hi)*x_lo + (y+y_hi)*y_lo;
!   fsqrtd; rescale by dmax (the exponent bits kept by
!   fand(dmax,DC0)); store the two 32-bit halves to *pz.
! The loop decrements counter by 4 (subcc/bpos) and falls into .tail,
! which drains the up-to-3 partially computed elements, storing each
! and bailing to .begin as soon as counter underflows.
! ===================================================================
(3_1) y_hi -= D2ON28; 492.cont8: 493 and %o4,%o5,%o4 ! (2_0) c1 &= c3; 494 faddd %f30,%f26,%f12 ! (2_1) res += dtmp1; 495 496 add %i3,stridey,%l4 ! py += stridey 497 andcc %o4,2,%g0 ! (2_0) c1 & 2 498 bnz,pn %icc,.update9 ! (2_0) if ( (c1 & 2) != 0 ) 499 fmovd %f36,%f56 ! (1_0) dmax = x; 500.cont9: 501 lda [%l4]%asi,%f30 ! (3_0) ((float*)&y)[0] = ((float*)py)[0]; 502 add %o1,stridex,%l2 ! px += stridex 503 fpsub32 DC1,%f16,%f44 ! (0_0) dnorm = vis_fpsub32(DC1,dmax); 504 505 fmuld %f18,%f18,%f60 ! (3_1) res = x_hi * x_hi; 506 lda [%l4+4]%asi,%f31 ! (3_0) ((float*)&y)[1] = ((float*)py)[1]; 507 faddd %f58,%f4,%f32 ! (3_1) dtmp2 = y + y_hi; 508 509 fsqrtd %f12,%f12 ! (2_1) res = sqrt(res); 510 faddd %f6,%f18,%f28 ! (3_1) dtmp1 = x + x_hi; 511 512 cmp counter,4 513 bl,pn %icc,.tail 514 nop 515 516 ba .main_loop 517 sub counter,4,counter 518 519 .align 16 520.main_loop: 521 fmuld %f20,%f44,%f2 ! (0_1) x *= dnorm; 522 fsubd %f6,%f18,%f20 ! (3_2) x_lo = x - x_hi; 523 lda [%l2]%asi,%f18 ! (3_1) ((float*)&x)[0] = ((float*)px)[0]; 524 525 fmuld %f4,%f4,%f22 ! (3_2) dtmp0 = y_hi * y_hi; 526 lda [%l2+4]%asi,%f19 ! (3_1) ((float*)&x)[1] = ((float*)px)[1]; 527 fsubd %f58,%f4,%f58 ! (3_2) y_lo = y - y_hi; 528 529 fmuld %f40,%f44,%f44 ! (0_1) y *= dnorm; 530 fabsd %f30,%f30 ! (3_1) y = fabs(y); 531 532 fmuld %f38,%f24,%f10 ! (1_2) res = dmax * res; 533 fabsd %f18,%f18 ! (3_1) x = fabs(x); 534 st %f10,[%i5] ! (1_2) ((float*)pz)[0] = ((float*)&res)[0]; 535 536 fmuld %f28,%f20,%f28 ! (3_2) dtmp1 *= x_lo; 537 st %f11,[%i5+4] ! (1_2) ((float*)pz)[1] = ((float*)&res)[1]; 538 fcmped %fcc2,%f54,%f56 ! (1_1) dmax ? y 539 540 fmuld %f32,%f58,%f24 ! (3_2) dtmp2 *= y_lo; 541 542 fmovdg %fcc2,%f54,%f56 ! (1_1) if ( dmax < y ) dmax = y; 543 544 faddd %f2,D2ON28,%f10 ! (0_1) x_hi = x + D2ON28; 545 fcmple32 DC1,%f18,%o3 ! (3_1) c0 = vis_fcmple32(DC1,x); 546 547 faddd %f44,D2ON28,%f20 ! (0_1) y_hi = y + D2ON28; 548 fcmple32 DC1,%f30,%o0 ! 
(3_1) c2 = vis_fcmple32(DC1,y); 549 550 faddd %f60,%f22,%f22 ! (3_2) res += dtmp0; 551 fcmpgt32 DC2,%f18,%o4 ! (3_1) c1 = vis_fcmpgt32(DC2,x); 552 553 faddd %f28,%f24,%f26 ! (3_2) dtmp1 += dtmp2; 554 fcmpgt32 DC2,%f30,%o1 ! (3_1) c3 = vis_fcmpgt32(DC2,y); 555 556 fand %f56,DC0,%f38 ! (1_1) dmax = vis_fand(dmax,DC0); 557 558 or %o3,%o0,%o3 ! (3_1) c0 |= c2; 559 fsubd %f10,D2ON28,%f58 ! (0_1) x_hi -= D2ON28; 560 561 andcc %o3,2,%g0 ! (3_1) c0 & 2 562 bnz,pn %icc,.update10 ! (3_1) if ( (c0 & 2) != 0 ) 563 fsubd %f20,D2ON28,%f56 ! (0_1) y_hi -= D2ON28; 564.cont10: 565 faddd %f22,%f26,%f28 ! (3_2) res += dtmp1; 566 and %o4,%o1,%o4 ! (3_1) c1 &= c3; 567 568 add %l4,stridey,%i3 ! py += stridey 569 andcc %o4,2,%g0 ! (3_1) c1 & 2 570 bnz,pn %icc,.update11 ! (3_1) if ( (c1 & 2) != 0 ) 571 fmovd %f50,%f32 ! (2_1) dmax = x; 572.cont11: 573 fpsub32 DC1,%f38,%f10 ! (1_1) dnorm = vis_fpsub32(DC1,dmax); 574 add %l2,stridex,%l1 ! px += stridex 575 lda [%i3]%asi,%f20 ! (0_0) ((float*)&y)[0] = ((float*)py)[0]; 576 577 fmuld %f58,%f58,%f6 ! (0_1) res = x_hi * x_hi; 578 lda [%i3+4]%asi,%f21 ! (0_0) ((float*)&y)[1] = ((float*)py)[1]; 579 add %i5,stridez,%l6 ! pz += stridez 580 faddd %f44,%f56,%f60 ! (0_1) dtmp2 = y + y_hi; 581 582 fsqrtd %f28,%f4 ! (3_2) res = sqrt(res); 583 lda [%l1]%asi,%f22 ! (0_0) ((float*)&x)[0] = ((float*)px)[0]; 584 faddd %f2,%f58,%f24 ! (0_1) dtmp1 = x + x_hi; 585 586 fmuld %f36,%f10,%f36 ! (1_1) x *= dnorm; 587 fsubd %f2,%f58,%f26 ! (0_1) x_lo = x - x_hi; 588 lda [%l1+4]%asi,%f23 ! (0_0) ((float*)&x)[1] = ((float*)px)[1]; 589 590 fmuld %f56,%f56,%f28 ! (0_1) dtmp0 = y_hi * y_hi; 591 fsubd %f44,%f56,%f44 ! (0_1) y_lo = y - y_hi; 592 593 fmuld %f54,%f10,%f56 ! (1_1) y *= dnorm; 594 fabsd %f20,%f40 ! (0_0) y = fabs(y); 595 596 fmuld %f52,%f12,%f12 ! (2_2) res = dmax * res; 597 fabsd %f22,%f20 ! (0_0) x = fabs(x); 598 st %f12,[%l6] ! (2_2) ((float*)pz)[0] = ((float*)&res)[0]; 599 600 fmuld %f24,%f26,%f10 ! (0_1) dtmp1 *= x_lo; 601 st %f13,[%l6+4] ! 
(2_2) ((float*)pz)[1] = ((float*)&res)[1]; 602 fcmped %fcc3,%f34,%f32 ! (2_1) dmax ? y 603 604 fmuld %f60,%f44,%f12 ! (0_1) dtmp2 *= y_lo; 605 606 fmovdg %fcc3,%f34,%f32 ! (2_1) if ( dmax < y ) dmax = y; 607 608 faddd %f36,D2ON28,%f58 ! (1_1) x_hi = x + D2ON28; 609 fcmple32 DC1,%f20,%g5 ! (0_0) c0 = vis_fcmple32(DC1,x); 610 611 faddd %f56,D2ON28,%f22 ! (1_1) y_hi = y + D2ON28; 612 fcmple32 DC1,%f40,%o2 ! (0_0) c2 = vis_fcmple32(DC1,y); 613 614 faddd %f6,%f28,%f24 ! (0_1) res += dtmp0; 615 fcmpgt32 DC2,%f20,%g1 ! (0_0) c1 = vis_fcmpgt32(DC2,x); 616 617 faddd %f10,%f12,%f26 ! (0_1) dtmp1 += dtmp2; 618 fcmpgt32 DC2,%f40,%o4 ! (0_0) c3 = vis_fcmpgt32(DC2,y); 619 620 fand %f32,DC0,%f52 ! (2_1) dmax = vis_fand(dmax,DC0); 621 622 or %g5,%o2,%g5 ! (0_0) c0 |= c2; 623 fsubd %f58,D2ON28,%f58 ! (1_1) x_hi -= D2ON28; 624 625 andcc %g5,2,%g0 ! (0_0) c0 & 2 626 bnz,pn %icc,.update12 ! (0_0) if ( (c0 & 2) != 0 ) 627 fsubd %f22,D2ON28,%f22 ! (1_1) y_hi -= D2ON28; 628.cont12: 629 and %g1,%o4,%g1 ! (0_0) c1 &= c3; 630 faddd %f24,%f26,%f12 ! (0_1) res += dtmp1; 631 632 add %i3,stridey,%l2 ! py += stridey 633 andcc %g1,2,%g0 ! (0_0) c1 & 2 634 bnz,pn %icc,.update13 ! (0_0) if ( (c1 & 2) != 0 ) 635 fmovd %f18,%f44 ! (3_1) dmax = x; 636.cont13: 637 fpsub32 DC1,%f52,%f10 ! (2_1) dnorm = vis_fpsub32(DC1,dmax); 638 add %l1,stridex,%l7 ! px += stridex 639 lda [%l2]%asi,%f24 ! (1_0) ((float*)&y)[0] = ((float*)py)[0]; 640 641 fmuld %f58,%f58,%f60 ! (1_1) res = x_hi * x_hi; 642 add %l6,stridez,%i5 ! pz += stridez 643 lda [%l2+4]%asi,%f25 ! (1_0) ((float*)&y)[1] = ((float*)py)[1]; 644 faddd %f56,%f22,%f28 ! (1_1) dtmp2 = y + y_hi; 645 646 fsqrtd %f12,%f12 ! (0_1) res = sqrt(res); 647 lda [%l7]%asi,%f26 ! (1_0) ((float*)&x)[0] = ((float*)px)[0]; 648 faddd %f36,%f58,%f6 ! (1_1) dtmp1 = x + x_hi; 649 650 fmuld %f50,%f10,%f50 ! (2_1) x *= dnorm; 651 fsubd %f36,%f58,%f58 ! (1_1) x_lo = x - x_hi; 652 lda [%l7+4]%asi,%f27 ! (1_0) ((float*)&x)[1] = ((float*)px)[1]; 653 654 fmuld %f22,%f22,%f2 ! 
(1_1) dtmp0 = y_hi * y_hi; 655 fsubd %f56,%f22,%f56 ! (1_1) y_lo = y - y_hi; 656 657 fmuld %f34,%f10,%f34 ! (2_1) y *= dnorm; 658 fabsd %f24,%f54 ! (1_0) y = fabs(y); 659 660 fmuld %f14,%f4,%f14 ! (3_2) res = dmax * res; 661 fabsd %f26,%f36 ! (1_0) x = fabs(x); 662 st %f14,[%i5] ! (3_2) ((float*)pz)[0] = ((float*)&res)[0]; 663 664 fmuld %f6,%f58,%f10 ! (1_1) dtmp1 *= x_lo; 665 st %f15,[%i5+4] ! (3_2) ((float*)pz)[1] = ((float*)&res)[1]; 666 fcmped %fcc0,%f30,%f44 ! (3_1) dmax ? y 667 668 fmuld %f28,%f56,%f26 ! (1_1) dtmp2 *= y_lo; 669 670 fmovdg %fcc0,%f30,%f44 ! (3_1) if ( dmax < y ) dmax = y; 671 672 faddd %f50,D2ON28,%f58 ! (2_1) x_hi = x + D2ON28; 673 fcmple32 DC1,%f36,%g1 ! (1_0) c0 = vis_fcmple32(DC1,x); 674 675 faddd %f34,D2ON28,%f22 ! (2_1) y_hi = y + D2ON28; 676 fcmple32 DC1,%f54,%g5 ! (1_0) c2 = vis_fcmple32(DC1,y); 677 678 faddd %f60,%f2,%f24 ! (1_1) res += dtmp0; 679 fcmpgt32 DC2,%f36,%o5 ! (1_0) c1 = vis_fcmpgt32(DC2,x); 680 681 faddd %f10,%f26,%f28 ! (1_1) dtmp1 += dtmp2; 682 fcmpgt32 DC2,%f54,%o1 ! (1_0) c3 = vis_fcmpgt32(DC2,y); 683 684 fand %f44,DC0,%f14 ! (3_1) dmax = vis_fand(dmax,DC0); 685 686 or %g1,%g5,%g1 ! (1_0) c0 |= c2; 687 fsubd %f58,D2ON28,%f44 ! (2_1) x_hi -= D2ON28; 688 689 andcc %g1,2,%g0 ! (1_0) c0 & 2 690 bnz,pn %icc,.update14 ! (1_0) if ( (c0 & 2) != 0 ) 691 fsubd %f22,D2ON28,%f58 ! (2_1) y_hi -= D2ON28; 692.cont14: 693 and %o5,%o1,%o5 ! (1_0) c1 &= c3; 694 faddd %f24,%f28,%f26 ! (1_1) res += dtmp1; 695 696 add %l2,stridey,%i3 ! py += stridey 697 andcc %o5,2,%g0 ! (1_0) c1 & 2 698 bnz,pn %icc,.update15 ! (1_0) if ( (c1 & 2) != 0 ) 699 fmovd %f20,%f4 ! (0_0) dmax = x; 700.cont15: 701 fpsub32 DC1,%f14,%f10 ! (3_1) dnorm = vis_fpsub32(DC1,dmax); 702 add %l7,stridex,%o1 ! px += stridex 703 lda [%i3]%asi,%f28 ! (2_0) ((float*)&y)[0] = ((float*)py)[0]; 704 705 fmuld %f44,%f44,%f2 ! (2_1) res = x_hi * x_hi; 706 add %i5,stridez,%g5 ! pz += stridez 707 lda [%i3+4]%asi,%f29 ! 
(2_0) ((float*)&y)[1] = ((float*)py)[1]; 708 faddd %f34,%f58,%f60 ! (2_1) dtmp2 = y + y_hi; 709 710 fsqrtd %f26,%f24 ! (1_1) res = sqrt(res); 711 lda [%o1]%asi,%f26 ! (2_0) ((float*)&x)[0] = ((float*)px)[0]; 712 faddd %f50,%f44,%f56 ! (2_1) dtmp1 = x + x_hi; 713 714 fmuld %f18,%f10,%f6 ! (3_1) x *= dnorm; 715 fsubd %f50,%f44,%f18 ! (2_1) x_lo = x - x_hi; 716 lda [%o1+4]%asi,%f27 ! (2_0) ((float*)&x)[1] = ((float*)px)[1]; 717 718 fmuld %f58,%f58,%f44 ! (2_1) dtmp0 = y_hi * y_hi; 719 fsubd %f34,%f58,%f22 ! (2_1) y_lo = y - y_hi; 720 721 fmuld %f30,%f10,%f58 ! (3_1) y *= dnorm; 722 fabsd %f28,%f34 ! (2_0) y = fabs(y); 723 724 fmuld %f16,%f12,%f16 ! (0_1) res = dmax * res; 725 fabsd %f26,%f50 ! (2_0) x = fabs(x); 726 st %f16,[%g5] ! (0_1) ((float*)pz)[0] = ((float*)&res)[0]; 727 728 fmuld %f56,%f18,%f10 ! (2_1) dtmp1 *= x_lo; 729 st %f17,[%g5+4] ! (0_1) ((float*)pz)[1] = ((float*)&res)[1]; 730 fcmped %fcc1,%f40,%f4 ! (0_0) dmax ? y 731 732 fmuld %f60,%f22,%f12 ! (2_1) dtmp2 *= y_lo; 733 734 fmovdg %fcc1,%f40,%f4 ! (0_0) if ( dmax < y ) dmax = y; 735 736 faddd %f6,D2ON28,%f56 ! (3_1) x_hi = x + D2ON28; 737 fcmple32 DC1,%f50,%o3 ! (2_0) c0 = vis_fcmple32(DC1,x); 738 739 faddd %f58,D2ON28,%f28 ! (3_1) y_hi = y + D2ON28; 740 fcmple32 DC1,%f34,%o0 ! (2_0) c2 = vis_fcmple32(DC1,y); 741 742 faddd %f2,%f44,%f30 ! (2_1) res += dtmp0; 743 fcmpgt32 DC2,%f50,%o4 ! (2_0) c1 = vis_fcmpgt32(DC2,x); 744 745 faddd %f10,%f12,%f26 ! (2_1) dtmp1 += dtmp2; 746 fcmpgt32 DC2,%f34,%o5 ! (2_0) c3 = vis_fcmpgt32(DC2,y); 747 748 fand %f4,DC0,%f16 ! (0_0) dmax = vis_fand(dmax,DC0); 749 750 or %o3,%o0,%o3 ! (2_0) c0 |= c2; 751 fsubd %f56,D2ON28,%f18 ! (3_1) x_hi -= D2ON28; 752 753 andcc %o3,2,%g0 ! (2_0) c0 & 2 754 bnz,pn %icc,.update16 ! (2_0) if ( (c0 & 2) != 0 ) 755 fsubd %f28,D2ON28,%f4 ! (3_1) y_hi -= D2ON28; 756.cont16: 757 and %o4,%o5,%o4 ! (2_0) c1 &= c3; 758 faddd %f30,%f26,%f12 ! (2_1) res += dtmp1; 759 760 add %i3,stridey,%l4 ! py += stridey 761 andcc %o4,2,%g0 ! 
(2_0) c1 & 2 762 bnz,pn %icc,.update17 ! (2_0) if ( (c1 & 2) != 0 ) 763 fmovd %f36,%f56 ! (1_0) dmax = x; 764.cont17: 765 lda [%l4]%asi,%f30 ! (3_0) ((float*)&y)[0] = ((float*)py)[0]; 766 add %o1,stridex,%l2 ! px += stridex 767 fpsub32 DC1,%f16,%f44 ! (0_0) dnorm = vis_fpsub32(DC1,dmax); 768 769 fmuld %f18,%f18,%f60 ! (3_1) res = x_hi * x_hi; 770 add %g5,stridez,%i5 ! pz += stridez 771 lda [%l4+4]%asi,%f31 ! (3_0) ((float*)&y)[1] = ((float*)py)[1]; 772 faddd %f58,%f4,%f32 ! (3_1) dtmp2 = y + y_hi; 773 774 fsqrtd %f12,%f12 ! (2_1) res = sqrt(res); 775 subcc counter,4,counter ! counter -= 4; 776 bpos,pt %icc,.main_loop 777 faddd %f6,%f18,%f28 ! (3_1) dtmp1 = x + x_hi; 778 779 add counter,4,counter 780 781.tail: 782 subcc counter,1,counter 783 bneg,a .begin 784 nop 785 786 fsubd %f6,%f18,%f20 ! (3_2) x_lo = x - x_hi; 787 788 fmuld %f4,%f4,%f22 ! (3_2) dtmp0 = y_hi * y_hi; 789 fsubd %f58,%f4,%f58 ! (3_2) y_lo = y - y_hi; 790 791 fmuld %f38,%f24,%f10 ! (1_2) res = dmax * res; 792 st %f10,[%i5] ! (1_2) ((float*)pz)[0] = ((float*)&res)[0]; 793 794 st %f11,[%i5+4] ! (1_2) ((float*)pz)[1] = ((float*)&res)[1]; 795 796 subcc counter,1,counter 797 bneg,a .begin 798 add %i5,stridez,%i5 799 800 fmuld %f28,%f20,%f28 ! (3_2) dtmp1 *= x_lo; 801 802 fmuld %f32,%f58,%f24 ! (3_2) dtmp2 *= y_lo; 803 804 faddd %f60,%f22,%f22 ! (3_2) res += dtmp0; 805 806 faddd %f28,%f24,%f26 ! (3_2) dtmp1 += dtmp2; 807 808 faddd %f22,%f26,%f28 ! (3_2) res += dtmp1; 809 810 add %i5,stridez,%l6 ! pz += stridez 811 812 fsqrtd %f28,%f4 ! (3_2) res = sqrt(res); 813 add %l2,stridex,%l1 ! px += stridex 814 815 fmuld %f52,%f12,%f12 ! (2_2) res = dmax * res; 816 st %f12,[%l6] ! (2_2) ((float*)pz)[0] = ((float*)&res)[0]; 817 818 st %f13,[%l6+4] ! (2_2) ((float*)pz)[1] = ((float*)&res)[1]; 819 820 subcc counter,1,counter 821 bneg .begin 822 add %l6,stridez,%i5 823 824 fmuld %f14,%f4,%f14 ! (3_2) res = dmax * res; 825 st %f14,[%i5] ! (3_2) ((float*)pz)[0] = ((float*)&res)[0]; 826 827 st %f15,[%i5+4] ! 
! ===================================================================
! NOTE(review): collapsed listing -- review comments are added on
! separate lines only; no instruction text below is altered.
!
! Scalar special-case paths (one element at a time, looping back to
! .begin1):
!   .spec0 -- at least one operand has exponent >= 0x7fe00000.
!     If the larger exponent field is >= 0x7ff00000 (label 1:), the
!     Inf/NaN rules apply: an exact-Inf operand makes the result Inf
!     (labels 2:, storing DC0 = 0x7ff00000:0), otherwise res = x*y to
!     propagate NaN.  Otherwise, if the exponents differ by >=
!     0x03600000 the smaller operand is negligible and res = x + y;
!     else both are scaled down by DC2 (hi word 0x00100000, i.e.
!     2^-1022), run through the same hi/lo split-square-sum + fsqrtd,
!     and rescaled up by DC3 (hi word 0x7fd00000, i.e. 2^1022).
!   .spec1 -- both operands below 0x00100000 (subnormal range):
!     symmetric to the .spec0 scaled path but with the scale factors
!     swapped: multiply up by DC3, compute, multiply down by DC2.
! ===================================================================
(3_2) ((float*)pz)[1] = ((float*)&res)[1]; 828 829 ba .begin 830 add %i5,stridez,%i5 831 832 .align 16 833.spec0: 834 ld [%i1+4],%l1 ! lx = ((int*)px)[1]; 835 cmp %o2,%o4 ! j0 ? 0x7ff00000 836 bge,pn %icc,1f ! if ( j0 >= 0x7ff00000 ) 837 fabsd %f26,%f26 ! x = fabs(x); 838 839 sub %o0,%l4,%o0 ! diff = hy - hx; 840 fabsd %f24,%f24 ! y = fabs(y); 841 842 sra %o0,31,%l4 ! j0 = diff >> 31; 843 844 xor %o0,%l4,%o0 ! diff ^ j0 845 846 sethi %hi(0x03600000),%l1 847 sub %o0,%l4,%o0 ! (diff ^ j0) - j0 848 849 cmp %o0,%l1 ! ((diff ^ j0) - j0) ? 0x03600000 850 bge,a,pn %icc,2f ! if ( ((diff ^ j0) - j0) >= 0x03600000 ) 851 faddd %f26,%f24,%f24 ! *pz = x + y 852 853 fmuld %f26,DC2,%f36 ! (1_1) x *= dnorm; 854 855 fmuld %f24,DC2,%f56 ! (1_1) y *= dnorm; 856 857 faddd %f36,D2ON28,%f58 ! (1_1) x_hi = x + D2ON28; 858 859 faddd %f56,D2ON28,%f22 ! (1_1) y_hi = y + D2ON28; 860 861 fsubd %f58,D2ON28,%f58 ! (1_1) x_hi -= D2ON28; 862 863 fsubd %f22,D2ON28,%f22 ! (1_1) y_hi -= D2ON28; 864 865 fmuld %f58,%f58,%f60 ! (1_1) res = x_hi * x_hi; 866 faddd %f56,%f22,%f28 ! (1_1) dtmp2 = y + y_hi; 867 868 faddd %f36,%f58,%f6 ! (1_1) dtmp1 = x + x_hi; 869 870 fsubd %f36,%f58,%f58 ! (1_1) x_lo = x - x_hi; 871 872 fmuld %f22,%f22,%f2 ! (1_1) dtmp0 = y_hi * y_hi; 873 fsubd %f56,%f22,%f56 ! (1_1) y_lo = y - y_hi; 874 875 fmuld %f6,%f58,%f10 ! (1_1) dtmp1 *= x_lo; 876 877 fmuld %f28,%f56,%f26 ! (1_1) dtmp2 *= y_lo; 878 879 faddd %f60,%f2,%f24 ! (1_1) res += dtmp0; 880 881 faddd %f10,%f26,%f28 ! (1_1) dtmp1 += dtmp2; 882 883 faddd %f24,%f28,%f26 ! (1_1) res += dtmp1; 884 885 fsqrtd %f26,%f24 ! (1_1) res = sqrt(res); 886 887 fmuld DC3,%f24,%f24 ! (1_2) res = dmax * res; 8882: 889 add %i3,stridey,%i3 890 add %i1,stridex,%i1 891 st %f24,[%i5] ! ((float*)pz)[0] = ((float*)&res)[0]; 892 st %f25,[%i5+4] ! ((float*)pz)[1] = ((float*)&res)[1]; 893 894 add %i5,stridez,%i5 895 ba .begin1 896 sub counter,1,counter 897 8981: 899 ld [%i3+4],%l2 ! ly = ((int*)py)[1]; 900 cmp %o0,%o4 ! hx ? 
0x7ff00000 901 bne,pn %icc,1f ! if ( hx != 0x7ff00000 ) 902 fabsd %f24,%f24 ! y = fabs(y); 903 904 cmp %l1,0 ! lx ? 0 905 be,pn %icc,2f ! if ( lx == 0 ) 906 nop 9071: 908 cmp %l4,%o4 ! hy ? 0x7ff00000 909 bne,pn %icc,1f ! if ( hy != 0x7ff00000 ) 910 nop 911 912 cmp %l2,0 ! ly ? 0 913 be,pn %icc,2f ! if ( ly == 0 ) 914 nop 9151: 916 add %i3,stridey,%i3 917 add %i1,stridex,%i1 918 fmuld %f26,%f24,%f24 ! res = x * y; 919 st %f24,[%i5] ! ((float*)pz)[0] = ((float*)&res)[0]; 920 921 st %f25,[%i5+4] ! ((float*)pz)[1] = ((float*)&res)[1]; 922 923 add %i5,stridez,%i5 924 ba .begin1 925 sub counter,1,counter 926 9272: 928 add %i1,stridex,%i1 929 add %i3,stridey,%i3 930 st DC0_HI,[%i5] ! ((int*)pz)[0] = 0x7ff00000; 931 st DC0_LO,[%i5+4] ! ((int*)pz)[1] = 0; 932 fcmpd %f26,%f24 ! x ? y 933 934 add %i5,stridez,%i5 935 ba .begin1 936 sub counter,1,counter 937 938 .align 16 939.spec1: 940 fmuld %f26,DC3,%f36 ! (1_1) x *= dnorm; 941 942 fmuld %f24,DC3,%f56 ! (1_1) y *= dnorm; 943 944 faddd %f36,D2ON28,%f58 ! (1_1) x_hi = x + D2ON28; 945 946 faddd %f56,D2ON28,%f22 ! (1_1) y_hi = y + D2ON28; 947 948 fsubd %f58,D2ON28,%f58 ! (1_1) x_hi -= D2ON28; 949 950 fsubd %f22,D2ON28,%f22 ! (1_1) y_hi -= D2ON28; 951 952 fmuld %f58,%f58,%f60 ! (1_1) res = x_hi * x_hi; 953 faddd %f56,%f22,%f28 ! (1_1) dtmp2 = y + y_hi; 954 955 faddd %f36,%f58,%f6 ! (1_1) dtmp1 = x + x_hi; 956 957 fsubd %f36,%f58,%f58 ! (1_1) x_lo = x - x_hi; 958 959 fmuld %f22,%f22,%f2 ! (1_1) dtmp0 = y_hi * y_hi; 960 fsubd %f56,%f22,%f56 ! (1_1) y_lo = y - y_hi; 961 962 fmuld %f6,%f58,%f10 ! (1_1) dtmp1 *= x_lo; 963 964 fmuld %f28,%f56,%f26 ! (1_1) dtmp2 *= y_lo; 965 966 faddd %f60,%f2,%f24 ! (1_1) res += dtmp0; 967 968 faddd %f10,%f26,%f28 ! (1_1) dtmp1 += dtmp2; 969 970 faddd %f24,%f28,%f26 ! (1_1) res += dtmp1; 971 972 fsqrtd %f26,%f24 ! (1_1) res = sqrt(res); 973 974 fmuld DC2,%f24,%f24 ! (1_2) res = dmax * res; 975 976 add %i3,stridey,%i3 977 add %i1,stridex,%i1 978 st %f24,[%i5] ! 
! ===================================================================
! NOTE(review): fix in .update2 below.  Its pipeline-restart branch
! went to .cont1 instead of its own continue point .cont2, unlike
! every other .updateN stub here (each branches to its matching
! .contN after saving tmp_px/tmp_py and clipping the counter).
! Via .cont1 the code re-loaded the same out-of-range element,
! re-ran the fcmple32/fcmpgt32 range checks, and re-entered .update2
! a second time before escaping through its "ble .cont2" guard
! (counter having been clipped to 2 on the first pass) -- reaching
! the same register state as a direct branch, but with a redundant
! second dispatch.  Changed "ba .cont1" to "ba .cont2" so the stub
! resumes at its matching continue point in a single pass, consistent
! with .update0/.update1/.update3..update17.  No instruction other
! than that branch target is altered.
! ===================================================================
((float*)pz)[0] = ((float*)&res)[0]; 979 980 st %f25,[%i5+4] ! ((float*)pz)[1] = ((float*)&res)[1]; 981 add %i5,stridez,%i5 982 ba .begin1 983 sub counter,1,counter 984 985 .align 16 986.update0: 987 fzero %f50 988 cmp counter,1 989 ble .cont0 990 fzero %f34 991 992 mov %o1,tmp_px 993 mov %i3,tmp_py 994 995 sub counter,1,tmp_counter 996 ba .cont0 997 mov 1,counter 998 999 .align 16 1000.update1: 1001 fzero %f50 1002 cmp counter,1 1003 ble .cont1 1004 fzero %f34 1005 1006 mov %o1,tmp_px 1007 mov %i3,tmp_py 1008 1009 sub counter,1,tmp_counter 1010 ba .cont1 1011 mov 1,counter 1012 1013 .align 16 1014.update2: 1015 fzero %f18 1016 cmp counter,2 1017 ble .cont2 1018 fzero %f30 1019 1020 mov %l2,tmp_px 1021 mov %l4,tmp_py 1022 1023 sub counter,2,tmp_counter 1024 ba .cont2 1025 mov 2,counter 1026 1027 .align 16 1028.update3: 1029 fzero %f18 1030 cmp counter,2 1031 ble .cont3 1032 fzero %f30 1033 1034 mov %l2,tmp_px 1035 mov %l4,tmp_py 1036 1037 sub counter,2,tmp_counter 1038 ba .cont3 1039 mov 2,counter 1040 1041 .align 16 1042.update4: 1043 fzero %f20 1044 cmp counter,3 1045 ble .cont4 1046 fzero %f40 1047 1048 mov %l1,tmp_px 1049 mov %i3,tmp_py 1050 1051 sub counter,3,tmp_counter 1052 ba .cont4 1053 mov 3,counter 1054 1055 .align 16 1056.update5: 1057 fzero %f20 1058 cmp counter,3 1059 ble .cont5 1060 fzero %f40 1061 1062 mov %l1,tmp_px 1063 mov %i3,tmp_py 1064 1065 sub counter,3,tmp_counter 1066 ba .cont5 1067 mov 3,counter 1068 1069 .align 16 1070.update6: 1071 fzero %f36 1072 cmp counter,4 1073 ble .cont6 1074 fzero %f54 1075 1076 mov %l7,tmp_px 1077 mov %l2,tmp_py 1078 1079 sub counter,4,tmp_counter 1080 ba .cont6 1081 mov 4,counter 1082 1083 .align 16 1084.update7: 1085 fzero %f36 1086 cmp counter,4 1087 ble .cont7 1088 fzero %f54 1089 1090 mov %l7,tmp_px 1091 mov %l2,tmp_py 1092 1093 sub counter,4,tmp_counter 1094 ba .cont7 1095 mov 4,counter 1096 1097 .align 16 1098.update8: 1099 fzero %f50 1100 cmp counter,5 1101 ble .cont8 1102 fzero %f34 1103 1104 mov 
%o1,tmp_px 1105 mov %i3,tmp_py 1106 1107 sub counter,5,tmp_counter 1108 ba .cont8 1109 mov 5,counter 1110 1111 .align 16 1112.update9: 1113 fzero %f50 1114 cmp counter,5 1115 ble .cont9 1116 fzero %f34 1117 1118 mov %o1,tmp_px 1119 mov %i3,tmp_py 1120 1121 sub counter,5,tmp_counter 1122 ba .cont9 1123 mov 5,counter 1124 1125 1126 .align 16 1127.update10: 1128 fzero %f18 1129 cmp counter,2 1130 ble .cont10 1131 fzero %f30 1132 1133 mov %l2,tmp_px 1134 mov %l4,tmp_py 1135 1136 sub counter,2,tmp_counter 1137 ba .cont10 1138 mov 2,counter 1139 1140 .align 16 1141.update11: 1142 fzero %f18 1143 cmp counter,2 1144 ble .cont11 1145 fzero %f30 1146 1147 mov %l2,tmp_px 1148 mov %l4,tmp_py 1149 1150 sub counter,2,tmp_counter 1151 ba .cont11 1152 mov 2,counter 1153 1154 .align 16 1155.update12: 1156 fzero %f20 1157 cmp counter,3 1158 ble .cont12 1159 fzero %f40 1160 1161 mov %l1,tmp_px 1162 mov %i3,tmp_py 1163 1164 sub counter,3,tmp_counter 1165 ba .cont12 1166 mov 3,counter 1167 1168 .align 16 1169.update13: 1170 fzero %f20 1171 cmp counter,3 1172 ble .cont13 1173 fzero %f40 1174 1175 mov %l1,tmp_px 1176 mov %i3,tmp_py 1177 1178 sub counter,3,tmp_counter 1179 ba .cont13 1180 mov 3,counter 1181 1182 .align 16 1183.update14: 1184 fzero %f54 1185 cmp counter,4 1186 ble .cont14 1187 fzero %f36 1188 1189 mov %l7,tmp_px 1190 mov %l2,tmp_py 1191 1192 sub counter,4,tmp_counter 1193 ba .cont14 1194 mov 4,counter 1195 1196 .align 16 1197.update15: 1198 fzero %f54 1199 cmp counter,4 1200 ble .cont15 1201 fzero %f36 1202 1203 mov %l7,tmp_px 1204 mov %l2,tmp_py 1205 1206 sub counter,4,tmp_counter 1207 ba .cont15 1208 mov 4,counter 1209 1210 .align 16 1211.update16: 1212 fzero %f50 1213 cmp counter,5 1214 ble .cont16 1215 fzero %f34 1216 1217 mov %o1,tmp_px 1218 mov %i3,tmp_py 1219 1220 sub counter,5,tmp_counter 1221 ba .cont16 1222 mov 5,counter 1223 1224 .align 16 1225.update17: 1226 fzero %f50 1227 cmp counter,5 1228 ble .cont17 1229 fzero %f34 1230 1231 mov %o1,tmp_px 1232 mov 
%i3,tmp_py 1233 1234 sub counter,5,tmp_counter 1235 ba .cont17 1236 mov 5,counter 1237 1238 .align 16 1239.exit: 1240 ret 1241 restore 1242 SET_SIZE(__vhypot) 1243 1244
! ===================================================================
! NOTE(review): collapsed listing -- comments appended at end only; no
! instruction text above is altered.
! .update9 .. .update17 above are the remaining out-of-range recovery
! stubs for the software pipeline: each zeroes the offending stage's
! x/y registers (fzero), records the restart position in
! tmp_px/tmp_py, clips the in-flight count via tmp_counter/counter,
! and resumes at its matching .contN label; the element itself is
! reprocessed by the scalar special-case paths after the vector loop
! restarts at .begin.  .exit restores the register window and returns.
! ===================================================================