/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
 */
/*
 * Copyright 2006 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

	.file	"__vsqrtf_ultra3.S"

#include "libm.h"
/*
 * When built as part of the shared object, export the generic name
 * __vsqrtf as a weak alias of this implementation.
 */
#if defined(LIBMVEC_SO_BUILD)
	.weak	__vsqrtf
	.type	__vsqrtf,#function
	__vsqrtf = __vsqrtf_ultra3
#endif

	RO_DATA
	.align	64

/*
 * 64-bit constants, each stored as two 32-bit words with the high word
 * first.  __vsqrtf_ultra3 loads them with ldd from byte offsets 0, 8, 16,
 * 24 and 32 of this table, so the order of the entries must not change.
 *
 *	K1, K2	polynomial coefficients (res0 = K2 * xx0; res0 += K1; ...)
 *	DC0	mask that keeps the low 52 bits of a double
 *	DC1	bit pattern OR-ed into the masked value; also added as a
 *		term of the polynomial (res0 += DC1)
 *	DC2	mask that extracts hi0, the leading bits of db0
 */
.CONST_TBL:
	.word	0x3fe00001, 0x80007e00	! K1 = 5.00000715259318464227e-01
	.word	0xbfc00003, 0xc0017a01	! K2 = -1.25000447037521686593e-01
	.word	0x000fffff, 0xffffffff	! DC0 = 0x000fffffffffffff
	.word	0x3ff00000, 0x00000000	! DC1 = 0x3ff0000000000000
	.word	0x7ffff000, 0x00000000	! DC2 = 0x7ffff00000000000

/*
 * Register assignments that stay fixed for the whole function.
 *
 *	DC0, DC1, DC2, K1, K2	double registers holding the table above
 *	TBL			address of __vlibm_TBL_sqrtf
 *	stridex, stridey	input / output strides in bytes
 *	_0x1ff0			mask applied to (ax >> 11) to form si0
 *	counter			elements left in the current batch
 *	_0x00800000		lower bound of the fast-path range of ax
 *	_0x7f800000		upper bound (exclusive) of that range
 */
#define DC0		%f6
#define DC1		%f4
#define DC2		%f2
#define K2		%f38
#define K1		%f36
#define TBL		%l2
#define stridex		%l3
#define stridey		%l4
#define _0x1ff0		%l5
#define counter		%l6
#define _0x00800000	%l7
#define _0x7f800000	%o0

/*
 * Stack temporaries, addressed relative to %fp.
 *
 *	tmp_px		saved input pointer (written with stx)
 *	tmp_counter	saved element count (written with st)
 *	tmp0 - tmp4	spill slots used to move lexp0 from an integer
 *			register into a double register (stx, then ldd)
 */
#define tmp_px		STACK_BIAS-0x40
#define tmp_counter	STACK_BIAS-0x38
#define tmp0		STACK_BIAS-0x30
#define tmp1		STACK_BIAS-0x28
#define tmp2		STACK_BIAS-0x20
#define tmp3		STACK_BIAS-0x18
#define tmp4		STACK_BIAS-0x10

! sizeof temp storage - must be a multiple of 16 for V9
#define tmps		0x40

!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!      !!!!!   algorithm   !!!!!
!
!  x0 = *px;
!  ax = *(int*)px;
!  px += stridex;
!
!  if( ax >= 0x7f800000 )
!  {
!    *py = sqrtf(x0);
!    py += stridey;
!    continue;
!  }
!  if( ax < 0x00800000 )
!  {
!    *py = sqrtf(x0);
!    py += stridey;
!    continue;
!  }
!
!  db0 = (double)x0;
!  iexp0 = ax >> 24;
!  iexp0 += 0x3c0;
!  lexp0 = (long long)iexp0 << 52;
!
!  db0 = vis_fand(db0,DC0);
!  db0 = vis_for(db0,DC1);
!  hi0 = vis_fand(db0,DC2);
!
!  ax >>= 11;
!  si0 = ax & 0x1ff0;
!  dtmp0 = ((double*)((char*)TBL + si0))[0];
!  xx0 = (db0 - hi0);
!  xx0 *= dtmp0;
!  dtmp0 = ((double*)((char*)TBL + si0))[1]
!  res0 = K2 * xx0;
!  res0 += K1;
!  res0 *= xx0;
!  res0 += DC1;
!  res0 = dtmp0 * res0;
!  dtmp1 = *((double*)&lexp0);
!  res0 *= dtmp1;
!  fres0 = (float)res0;
!  *py = fres0;
!  py += stridey;
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

/*
 * __vsqrtf_ultra3: square root of a strided vector of floats.
 *
 * Incoming registers, as consumed by the setup code:
 *	%i0	element count (spilled to tmp_counter)
 *	%i1	px, address of the first input element
 *	%i2	input stride in elements (shifted left by 2 into stridex)
 *	%i3	py, address of the first output element (copied to %g5)
 *	%i4	output stride in elements (shifted left by 2 into stridey)
 * NOTE(review): presumably this is the C interface
 *	void __vsqrtf(int n, float *x, int stridex, float *y, int stridey)
 * -- confirm against the libmvec headers.
 *
 * Layout of the code:
 *	.begin / .begin1   reload the saved count and px, test the first
 *			   element and send it to .spec if it is outside
 *			   the fast-path range
 *	pipeline prologue  starts work on the following elements; the
 *			   (a_b) tags in the comments identify the lane an
 *			   instruction belongs to
 *	.main_loop	   stores five results per iteration
 *	.tail		   stores up to four results left in the pipeline,
 *			   then returns to .begin
 *	.spec		   handles one out-of-range element with fsqrts
 *	.updateN	   reached when a look-ahead load finds an
 *			   out-of-range element (see the comment there)
 *
 * All input loads use lda with ASI 0x82.  NOTE(review): in SPARC V9 this
 * is the primary no-fault ASI, which would explain why the look-ahead
 * loads may run past the last requested element -- confirm.
 */
	ENTRY(__vsqrtf_ultra3)
	save	%sp,-SA(MINFRAME)-tmps,%sp
	PIC_SETUP(l7)
	PIC_SET(l7,.CONST_TBL,o2)
	PIC_SET(l7,__vlibm_TBL_sqrtf,l2)

	st	%i0,[%fp+tmp_counter]	! save the element count
	sll	%i2,2,stridex		! input stride in bytes
	or	%g0,0xff8,%l5

	stx	%i1,[%fp+tmp_px]	! save the input pointer
	sll	%l5,1,_0x1ff0		! 0xff8 << 1 = 0x1ff0

	ldd	[%o2],K1
	sll	%i4,2,stridey		! output stride in bytes

	ldd	[%o2+8],K2
	or	%g0,%i3,%g5		! %g5 = py

	ldd	[%o2+16],DC0
	sethi	%hi(0x7f800000),%o0

	ldd	[%o2+24],DC1
	sethi	%hi(0x00800000),%l7

	ldd	[%o2+32],DC2

! Start (or resume) a batch: pick up the saved count and input pointer
! and clear the saved count, so that a batch which is not cut short by an
! .updateN handler ends at .exit on the next pass through here.
.begin:
	ld	[%fp+tmp_counter],counter
	ldx	[%fp+tmp_px],%i1
	st	%g0,[%fp+tmp_counter]
.begin1:
	cmp	counter,0
	ble,pn	%icc,.exit

	lda	[%i1]0x82,%o2		! (2_0) ax = *(int*)px;

	or	%g0,%i1,%o7
	lda	[%i1]0x82,%f25		! (2_0) x0 = *px;

	cmp	%o2,_0x7f800000		! (2_0) ax ? 0x7f800000
	bge,pn	%icc,.spec		! (2_0) if( ax >= 0x7f800000 )
	nop

	cmp	%o2,_0x00800000		! (2_0) ax ? 0x00800000
	bl,pn	%icc,.spec		! (2_0) if( ax < 0x00800000 )
	nop

	fstod	%f25,%f56		! (2_0) db0 = (double)x0;

	lda	[stridex+%o7]0x82,%o1	! (3_0) ax = *(int*)px;

	sra	%o2,24,%l1		! (2_0) iexp0 = ax >> 24;

	add	%o7,stridex,%i1		! px += stridex
	add	%l1,960,%l0		! (2_0) iexp0 += 0x3c0;
	lda	[stridex+%o7]0x82,%f0	! (3_0) x0 = *px;
	fand	%f56,DC0,%f60		! (2_0) db0 = vis_fand(db0,DC0);

	cmp	%o1,_0x7f800000		! (3_0) ax ? 0x7f800000
	bge,pn	%icc,.update0		! (3_0) if( ax >= 0x7f800000 )
	nop
.cont0:
	sllx	%l0,52,%o3		! (2_0) lexp0 = (long long)iexp0 << 52;

	sra	%o2,11,%i2		! (2_0) ax >>= 11;
	stx	%o3,[%fp+tmp0]		! (2_0) dtmp1 = *((double*)&lexp0);
	for	%f60,DC1,%f40		! (2_0) db0 = vis_for(db0,DC1);

	cmp	%o1,_0x00800000		! (3_0) ax ? 0x00800000
	bl,pn	%icc,.update1		! (3_0) if( ax < 0x00800000 )
	nop
.cont1:
	fstod	%f0,%f48		! (3_0) db0 = (double)x0;

	and	%i2,_0x1ff0,%o3		! (2_0) si0 = ax & 0x1ff0;
	lda	[%i1+stridex]0x82,%o2	! (4_0) ax = *(int*)px;

	add	%i1,stridex,%i1		! px += stridex
	add	%o3,TBL,%i2		! (2_0) (char*)TBL + si0
	fand	%f40,DC2,%f46		! (2_0) hi0 = vis_fand(db0,DC2);

	sra	%o1,24,%o4		! (3_0) iexp0 = ax >> 24;

	lda	[%i1]0x82,%f13		! (4_0) x0 = *px;
	fand	%f48,DC0,%f58		! (3_0) db0 = vis_fand(db0,DC0);

	add	%o4,960,%i0		! (3_0) iexp0 += 0x3c0;

	cmp	%o2,_0x7f800000		! (4_1) ax ? 0x7f800000
	bge,pn	%icc,.update2		! (4_1) if( ax >= 0x7f800000 )
	nop
.cont2:
	fsubd	%f40,%f46,%f44		! (2_1) xx0 = (db0 - hi0);
	sllx	%i0,52,%g1		! (3_1) lexp0 = (long long)iexp0 << 52;
	ldd	[%i2],%f40		! (2_1) dtmp0 = ((double*)((char*)TBL + si0))[0];

	sra	%o1,11,%l0		! (3_1) ax >>= 11;
	stx	%g1,[%fp+tmp1]		! (3_1) dtmp1 = *((double*)&lexp0);
	for	%f58,DC1,%f48		! (3_1) db0 = vis_for(db0,DC1);

	cmp	%o2,_0x00800000		! (4_1) ax ? 0x00800000
	bl,pn	%icc,.update3		! (4_1) if( ax < 0x00800000 )
	nop
.cont3:
	fstod	%f13,%f50		! (4_1) db0 = (double)x0;

	fmuld	%f44,%f40,%f46		! (2_1) xx0 *= dtmp0;
	and	%l0,_0x1ff0,%i0		! (3_1) si0 = ax & 0x1ff0;
	lda	[%i1+stridex]0x82,%l1	! (0_0) ax = *(int*)px;

	add	%i0,TBL,%l0		! (3_1) (char*)TBL + si0
	fand	%f48,DC2,%f62		! (3_1) hi0 = vis_fand(db0,DC2);

	sra	%o2,24,%o7		! (4_1) iexp0 = ax >> 24;

	add	%i1,stridex,%o4		! px += stridex
	add	%o7,960,%o7		! (4_1) iexp0 += 0x3c0;
	lda	[%i1+stridex]0x82,%f17	! (0_0) x0 = *px;
	fand	%f50,DC0,%f54		! (4_1) db0 = vis_fand(db0,DC0);

	fmuld	K2,%f46,%f52		! (2_1) res0 = K2 * xx0;
	cmp	%l1,_0x7f800000		! (0_0) ax ? 0x7f800000
	bge,pn	%icc,.update4		! (0_0) if( ax >= 0x7f800000 )
	fsubd	%f48,%f62,%f42		! (3_1) xx0 = (db0 - hi0);
.cont4:
	sllx	%o7,52,%o1		! (4_1) lexp0 = (long long)iexp0 << 52;
	ldd	[%i0+TBL],%f40		! (3_1) dtmp0 = ((double*)((char*)TBL + si0))[0];

	sra	%o2,11,%i5		! (4_1) ax >>= 11;
	stx	%o1,[%fp+tmp2]		! (4_1) dtmp1 = *((double*)&lexp0);
	for	%f54,DC1,%f34		! (4_1) db0 = vis_for(db0,DC1);

	cmp	%l1,_0x00800000		! (0_0) ax ? 0x00800000
	bl,pn	%icc,.update5		! (0_0) if( ax < 0x00800000 )
	nop
.cont5:
	fstod	%f17,%f56		! (0_0) db0 = (double)x0;

	fmuld	%f42,%f40,%f42		! (3_1) xx0 *= dtmp0;
	lda	[stridex+%o4]0x82,%i0	! (1_0) ax = *(int*)px;
	faddd	%f52,K1,%f52		! (2_1) res0 += K1;

	sra	%l1,24,%g1		! (0_0) iexp0 = ax >> 24;
	and	%i5,_0x1ff0,%i5		! (4_1) si0 = ax & 0x1ff0;
	fand	%f34,DC2,%f62		! (4_1) hi0 = vis_fand(db0,DC2);

	add	%o4,stridex,%i1		! px += stridex

	add	%g1,960,%o5		! (0_0) iexp0 += 0x3c0;
	add	%i5,TBL,%i3		! (4_1) (char*)TBL + si0
	lda	[stridex+%o4]0x82,%f21	! (1_0) x0 = *px;
	fand	%f56,DC0,%f32		! (0_0) db0 = vis_fand(db0,DC0);

	fmuld	K2,%f42,%f50		! (3_1) res0 = K2 * xx0;
	cmp	%i0,_0x7f800000		! (1_0) ax ? 0x7f800000
	bge,pn	%icc,.update6		! (1_0) if( ax >= 0x7f800000 )
	fsubd	%f34,%f62,%f54		! (4_1) xx0 = (db0 - hi0);
.cont6:
	fmuld	%f52,%f46,%f52		! (2_1) res0 *= xx0;
	sllx	%o5,52,%o7		! (0_0) lexp0 = (long long)iexp0 << 52;
	ldd	[TBL+%i5],%f62		! (4_1) dtmp0 = ((double*)((char*)TBL + si0))[0];

	sra	%l1,11,%i4		! (0_0) ax >>= 11;
	stx	%o7,[%fp+tmp3]		! (0_0) dtmp1 = *((double*)&lexp0);
	for	%f32,DC1,%f48		! (0_0) db0 = vis_for(db0,DC1);

	cmp	%i0,_0x00800000		! (1_0) ax ? 0x00800000
	bl,pn	%icc,.update7		! (1_0) if( ax < 0x00800000 )
	nop
.cont7:
	fstod	%f21,%f56		! (1_0) db0 = (double)x0;

	fmuld	%f54,%f62,%f46		! (4_1) xx0 *= dtmp0;
	and	%i4,_0x1ff0,%g1		! (0_0) si0 = ax & 0x1ff0;
	lda	[%i1+stridex]0x82,%o2	! (2_0) ax = *(int*)px;
	faddd	%f50,K1,%f62		! (3_1) res0 += K1;

	add	%g1,TBL,%i5		! (0_0) (double*)((char*)TBL + si0
	fand	%f48,DC2,%f32		! (0_0) hi0 = vis_fand(db0,DC2);

	sra	%i0,24,%o4		! (1_0) iexp0 = ax >> 24;
	ldd	[%i2+8],%f60		! (2_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
	faddd	%f52,DC1,%f58		! (2_1) res0 += DC1;

	add	%i1,stridex,%o7		! px += stridex
	add	%o4,960,%i2		! (1_0) iexp0 += 0x3c0;
	lda	[%i1+stridex]0x82,%f25	! (2_0) x0 = *px;
	fand	%f56,DC0,%f34		! (1_0) db0 = vis_fand(db0,DC0);

	fmuld	K2,%f46,%f50		! (4_1) res0 = K2 * xx0;
	cmp	%o2,_0x7f800000		! (2_0) ax ? 0x7f800000
	bge,pn	%icc,.update8		! (2_0) if( ax >= 0x7f800000 )
	fsubd	%f48,%f32,%f52		! (0_0) xx0 = (db0 - hi0);
.cont8:
	fmuld	%f62,%f42,%f54		! (3_1) res0 *= xx0;
	sllx	%i2,52,%o4		! (1_0) lexp0 = (long long)iexp0 << 52;
	ldd	[TBL+%g1],%f32		! (0_0) dtmp0 = ((double*)((char*)TBL + si0))[0];

	fmuld	%f60,%f58,%f60		! (2_1) res0 = dtmp0 * res0;
	sra	%i0,11,%g1		! (1_0) ax >>= 11;
	stx	%o4,[%fp+tmp4]		! (1_0) dtmp1 = *((double*)&lexp0);
	for	%f34,DC1,%f48		! (1_0) db0 = vis_for(db0,DC1);

	cmp	%o2,_0x00800000		! (2_0) ax ? 0x00800000
	bl,pn	%icc,.update9		! (2_0) if( ax < 0x00800000 )
	ldd	[%fp+tmp0],%f40		! (2_1) dtmp1 = *((double*)&lexp0);
	fstod	%f25,%f56		! (2_0) db0 = (double)x0;
.cont9:
	fmuld	%f52,%f32,%f42		! (0_0) xx0 *= dtmp0;
	and	%g1,_0x1ff0,%o5		! (1_0) si0 = ax & 0x1ff0;
	lda	[stridex+%o7]0x82,%o1	! (3_0) ax = *(int*)px;
	faddd	%f50,K1,%f34		! (4_1) res0 += K1;

	add	%o5,TBL,%i4		! (1_0) (char*)TBL + si0
	fand	%f48,DC2,%f62		! (1_0) hi0 = vis_fand(db0,DC2);

	fmuld	%f60,%f40,%f32		! (2_1) res0 *= dtmp1;
	sra	%o2,24,%l1		! (2_0) iexp0 = ax >> 24;
	ldd	[%l0+8],%f40		! (3_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
	faddd	%f54,DC1,%f58		! (3_1) res0 += DC1;

	add	%o7,stridex,%i1		! px += stridex
	add	%l1,960,%l0		! (2_0) iexp0 += 0x3c0;
	lda	[stridex+%o7]0x82,%f0	! (3_0) x0 = *px;
	fand	%f56,DC0,%f60		! (2_0) db0 = vis_fand(db0,DC0);

	fmuld	K2,%f42,%f50		! (0_0) res0 = K2 * xx0;
	cmp	%o1,_0x7f800000		! (3_0) ax ? 0x7f800000
	bge,pn	%icc,.update10		! (3_0) if( ax >= 0x7f800000 )
	fsubd	%f48,%f62,%f54		! (1_0) xx0 = (db0 - hi0);
.cont10:
	fmuld	%f34,%f46,%f52		! (4_1) res0 *= xx0;
	sllx	%l0,52,%o3		! (2_0) lexp0 = (long long)iexp0 << 52;
	ldd	[TBL+%o5],%f56		! (1_0) dtmp0 = ((double*)((char*)TBL + si0))[0];

	fmuld	%f40,%f58,%f34		! (3_1) res0 = dtmp0 * res0;
	sra	%o2,11,%i2		! (2_0) ax >>= 11;
	stx	%o3,[%fp+tmp0]		! (2_0) dtmp1 = *((double*)&lexp0);
	for	%f60,DC1,%f40		! (2_0) db0 = vis_for(db0,DC1);

	cmp	%o1,_0x00800000		! (3_0) ax ? 0x00800000
	bl,pn	%icc,.update11		! (3_0) if( ax < 0x00800000 )
	ldd	[%fp+tmp1],%f62		! (3_1) dtmp1 = *((double*)&lexp0);
	fstod	%f0,%f48		! (3_0) db0 = (double)x0;
.cont11:
	fmuld	%f54,%f56,%f30		! (1_0) xx0 *= dtmp0;
	and	%i2,_0x1ff0,%o3		! (2_0) si0 = ax & 0x1ff0;
	lda	[%i1+stridex]0x82,%o2	! (4_0) ax = *(int*)px;
	faddd	%f50,K1,%f56		! (0_0) res0 += K1;

	add	%i1,stridex,%i1		! px += stridex
	add	%o3,TBL,%i2		! (2_0) (char*)TBL + si0
	fand	%f40,DC2,%f46		! (2_0) hi0 = vis_fand(db0,DC2);

	fmuld	%f34,%f62,%f28		! (3_1) res0 *= dtmp1;
	sra	%o1,24,%o4		! (3_0) iexp0 = ax >> 24;
	ldd	[%i3+8],%f50		! (4_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
	faddd	%f52,DC1,%f54		! (4_1) res0 += DC1;

	lda	[%i1]0x82,%f13		! (4_0) x0 = *px;
	fand	%f48,DC0,%f58		! (3_0) db0 = vis_fand(db0,DC0);

	! %g5 is about to be reused for iexp0, so park py in %i3 first.
	or	%g0,%g5,%i3
	cmp	counter,5
	bl,pn	%icc,.tail
	add	%o4,960,%g5		! (3_0) iexp0 += 0x3c0;

	ba	.main_loop
	sub	counter,5,counter	! counter

! Main loop: each iteration converts and stores five results and starts
! work on five further elements.  counter is kept 5 below the number of
! results still owed; the loop repeats while it stays non-negative.
	.align	16
.main_loop:
	fmuld	K2,%f30,%f60		! (1_1) res0 = K2 * xx0;
	cmp	%o2,_0x7f800000		! (4_1) ax ? 0x7f800000
	bge,pn	%icc,.update12		! (4_1) if( ax >= 0x7f800000 )
	fsubd	%f40,%f46,%f44		! (2_1) xx0 = (db0 - hi0);
.cont12:
	fmuld	%f56,%f42,%f52		! (0_1) res0 *= xx0;
	sllx	%g5,52,%g5		! (3_1) lexp0 = (long long)iexp0 << 52;
	ldd	[%i2],%f40		! (2_1) dtmp0 = ((double*)((char*)TBL + si0))[0];
	fdtos	%f32,%f15		! (2_2) fres0 = (float)res0;

	fmuld	%f50,%f54,%f42		! (4_2) res0 = dtmp0 * res0;
	sra	%o1,11,%l0		! (3_1) ax >>= 11;
	stx	%g5,[%fp+tmp1]		! (3_1) dtmp1 = *((double*)&lexp0);
	for	%f58,DC1,%f48		! (3_1) db0 = vis_for(db0,DC1);

	cmp	%o2,_0x00800000		! (4_1) ax ? 0x00800000
	bl,pn	%icc,.update13		! (4_1) if( ax < 0x00800000 )
	ldd	[%fp+tmp2],%f56		! (4_2) dtmp1 = *((double*)&lexp0);
	fstod	%f13,%f50		! (4_1) db0 = (double)x0;
.cont13:
	fmuld	%f44,%f40,%f46		! (2_1) xx0 *= dtmp0;
	and	%l0,_0x1ff0,%i0		! (3_1) si0 = ax & 0x1ff0;
	lda	[%i1+stridex]0x82,%l1	! (0_0) ax = *(int*)px;
	faddd	%f60,K1,%f32		! (1_1) res0 += K1;

	add	%i0,TBL,%l0		! (3_1) (char*)TBL + si0
	add	%i3,stridey,%o3		! py += stridey
	st	%f15,[%i3]		! (2_2) *py = fres0;
	fand	%f48,DC2,%f62		! (3_1) hi0 = vis_fand(db0,DC2);

	fmuld	%f42,%f56,%f44		! (4_2) res0 *= dtmp1;
	sra	%o2,24,%o7		! (4_1) iexp0 = ax >> 24;
	ldd	[%i5+8],%f58		! (0_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
	faddd	%f52,DC1,%f34		! (0_1) res0 += DC1;

	add	%i1,stridex,%o4		! px += stridex
	add	%o7,960,%o7		! (4_1) iexp0 += 0x3c0;
	lda	[%i1+stridex]0x82,%f17	! (0_0) x0 = *px;
	fand	%f50,DC0,%f54		! (4_1) db0 = vis_fand(db0,DC0);

	fmuld	K2,%f46,%f52		! (2_1) res0 = K2 * xx0;
	cmp	%l1,_0x7f800000		! (0_0) ax ? 0x7f800000
	bge,pn	%icc,.update14		! (0_0) if( ax >= 0x7f800000 )
	fsubd	%f48,%f62,%f42		! (3_1) xx0 = (db0 - hi0);
.cont14:
	fmuld	%f32,%f30,%f48		! (1_1) res0 *= xx0;
	sllx	%o7,52,%o1		! (4_1) lexp0 = (long long)iexp0 << 52;
	ldd	[%i0+TBL],%f40		! (3_1) dtmp0 = ((double*)((char*)TBL + si0))[0];
	fdtos	%f28,%f19		! (3_2) fres0 = (float)res0;

	fmuld	%f58,%f34,%f32		! (0_1) res0 = dtmp0 * res0;
	sra	%o2,11,%i5		! (4_1) ax >>= 11;
	stx	%o1,[%fp+tmp2]		! (4_1) dtmp1 = *((double*)&lexp0);
	for	%f54,DC1,%f34		! (4_1) db0 = vis_for(db0,DC1);

	cmp	%l1,_0x00800000		! (0_0) ax ? 0x00800000
	bl,pn	%icc,.update15		! (0_0) if( ax < 0x00800000 )
	ldd	[%fp+tmp3],%f60		! (0_1) dtmp1 = *((double*)&lexp0);
	fstod	%f17,%f56		! (0_0) db0 = (double)x0;
.cont15:
	fmuld	%f42,%f40,%f42		! (3_1) xx0 *= dtmp0;
	add	%o3,stridey,%g5		! py += stridey
	lda	[stridex+%o4]0x82,%i0	! (1_0) ax = *(int*)px;
	faddd	%f52,K1,%f52		! (2_1) res0 += K1;

	sra	%l1,24,%g1		! (0_0) iexp0 = ax >> 24;
	and	%i5,_0x1ff0,%i5		! (4_1) si0 = ax & 0x1ff0;
	st	%f19,[%o3]		! (3_2) *py = fres0;
	fand	%f34,DC2,%f62		! (4_1) hi0 = vis_fand(db0,DC2);

	fmuld	%f32,%f60,%f40		! (0_1) res0 *= dtmp1;
	add	%o4,stridex,%i1		! px += stridex
	ldd	[%i4+8],%f60		! (1_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
	faddd	%f48,DC1,%f58		! (1_1) res0 += DC1;

	add	%g1,960,%o5		! (0_0) iexp0 += 0x3c0;
	add	%i5,TBL,%i3		! (4_1) (char*)TBL + si0
	lda	[stridex+%o4]0x82,%f21	! (1_0) x0 = *px;
	fand	%f56,DC0,%f32		! (0_0) db0 = vis_fand(db0,DC0);

	fmuld	K2,%f42,%f50		! (3_1) res0 = K2 * xx0;
	cmp	%i0,_0x7f800000		! (1_0) ax ? 0x7f800000
	bge,pn	%icc,.update16		! (1_0) if( ax >= 0x7f800000 )
	fsubd	%f34,%f62,%f54		! (4_1) xx0 = (db0 - hi0);
.cont16:
	fmuld	%f52,%f46,%f52		! (2_1) res0 *= xx0;
	sllx	%o5,52,%o7		! (0_0) lexp0 = (long long)iexp0 << 52;
	ldd	[TBL+%i5],%f62		! (4_1) dtmp0 = ((double*)((char*)TBL + si0))[0];
	fdtos	%f44,%f23		! (4_2) fres0 = (float)res0;

	fmuld	%f60,%f58,%f44		! (1_1) res0 = dtmp0 * res0;
	sra	%l1,11,%i4		! (0_0) ax >>= 11;
	stx	%o7,[%fp+tmp3]		! (0_0) dtmp1 = *((double*)&lexp0);
	for	%f32,DC1,%f48		! (0_0) db0 = vis_for(db0,DC1);

	cmp	%i0,_0x00800000		! (1_0) ax ? 0x00800000
	bl,pn	%icc,.update17		! (1_0) if( ax < 0x00800000 )
	ldd	[%fp+tmp4],%f34		! (1_1) dtmp1 = *((double*)&lexp0);
	fstod	%f21,%f56		! (1_0) db0 = (double)x0;
.cont17:
	fmuld	%f54,%f62,%f46		! (4_1) xx0 *= dtmp0;
	and	%i4,_0x1ff0,%g1		! (0_0) si0 = ax & 0x1ff0;
	lda	[%i1+stridex]0x82,%o2	! (2_0) ax = *(int*)px;
	faddd	%f50,K1,%f62		! (3_1) res0 += K1;

	add	%g1,TBL,%i5		! (0_0) (double*)((char*)TBL + si0
	add	%g5,stridey,%g5		! py += stridey
	st	%f23,[stridey+%o3]	! (4_2) *py = fres0;
	fand	%f48,DC2,%f32		! (0_0) hi0 = vis_fand(db0,DC2);

	fmuld	%f44,%f34,%f44		! (1_1) res0 *= dtmp1;
	sra	%i0,24,%o4		! (1_0) iexp0 = ax >> 24;
	ldd	[%i2+8],%f60		! (2_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
	faddd	%f52,DC1,%f58		! (2_1) res0 += DC1;

	add	%i1,stridex,%o7		! px += stridex
	add	%o4,960,%i2		! (1_0) iexp0 += 0x3c0;
	lda	[%i1+stridex]0x82,%f25	! (2_0) x0 = *px;
	fand	%f56,DC0,%f34		! (1_0) db0 = vis_fand(db0,DC0);

	fmuld	K2,%f46,%f50		! (4_1) res0 = K2 * xx0;
	cmp	%o2,_0x7f800000		! (2_0) ax ? 0x7f800000
	bge,pn	%icc,.update18		! (2_0) if( ax >= 0x7f800000 )
	fsubd	%f48,%f32,%f52		! (0_0) xx0 = (db0 - hi0);
.cont18:
	fmuld	%f62,%f42,%f54		! (3_1) res0 *= xx0;
	sllx	%i2,52,%o4		! (1_0) lexp0 = (long long)iexp0 << 52;
	ldd	[TBL+%g1],%f32		! (0_0) dtmp0 = ((double*)((char*)TBL + si0))[0];
	fdtos	%f40,%f27		! (0_1) fres0 = (float)res0;

	fmuld	%f60,%f58,%f60		! (2_1) res0 = dtmp0 * res0;
	sra	%i0,11,%g1		! (1_0) ax >>= 11;
	stx	%o4,[%fp+tmp4]		! (1_0) dtmp1 = *((double*)&lexp0);
	for	%f34,DC1,%f48		! (1_0) db0 = vis_for(db0,DC1);

	cmp	%o2,_0x00800000		! (2_0) ax ? 0x00800000
	bl,pn	%icc,.update19		! (2_0) if( ax < 0x00800000 )
	ldd	[%fp+tmp0],%f40		! (2_1) dtmp1 = *((double*)&lexp0);
	fstod	%f25,%f56		! (2_0) db0 = (double)x0;
.cont19:
	fmuld	%f52,%f32,%f42		! (0_0) xx0 *= dtmp0;
	and	%g1,_0x1ff0,%o5		! (1_0) si0 = ax & 0x1ff0;
	lda	[stridex+%o7]0x82,%o1	! (3_0) ax = *(int*)px;
	faddd	%f50,K1,%f34		! (4_1) res0 += K1;

	add	%o5,TBL,%i4		! (1_0) (char*)TBL + si0
	add	%g5,stridey,%g1		! py += stridey
	st	%f27,[%g5]		! (0_1) *py = fres0;
	fand	%f48,DC2,%f62		! (1_0) hi0 = vis_fand(db0,DC2);

	fmuld	%f60,%f40,%f32		! (2_1) res0 *= dtmp1;
	sra	%o2,24,%l1		! (2_0) iexp0 = ax >> 24;
	ldd	[%l0+8],%f40		! (3_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
	faddd	%f54,DC1,%f58		! (3_1) res0 += DC1;

	add	%o7,stridex,%i1		! px += stridex
	add	%l1,960,%l0		! (2_0) iexp0 += 0x3c0;
	lda	[stridex+%o7]0x82,%f0	! (3_0) x0 = *px;
	fand	%f56,DC0,%f60		! (2_0) db0 = vis_fand(db0,DC0);

	fmuld	K2,%f42,%f50		! (0_0) res0 = K2 * xx0;
	cmp	%o1,_0x7f800000		! (3_0) ax ? 0x7f800000
	bge,pn	%icc,.update20		! (3_0) if( ax >= 0x7f800000 )
	fsubd	%f48,%f62,%f54		! (1_0) xx0 = (db0 - hi0);
.cont20:
	fmuld	%f34,%f46,%f52		! (4_1) res0 *= xx0;
	sllx	%l0,52,%o3		! (2_0) lexp0 = (long long)iexp0 << 52;
	ldd	[TBL+%o5],%f56		! (1_0) dtmp0 = ((double*)((char*)TBL + si0))[0];
	fdtos	%f44,%f8		! (1_1) fres0 = (float)res0;

	fmuld	%f40,%f58,%f34		! (3_1) res0 = dtmp0 * res0;
	sra	%o2,11,%i2		! (2_0) ax >>= 11;
	stx	%o3,[%fp+tmp0]		! (2_0) dtmp1 = *((double*)&lexp0);
	for	%f60,DC1,%f40		! (2_0) db0 = vis_for(db0,DC1);

	cmp	%o1,_0x00800000		! (3_0) ax ? 0x00800000
	bl,pn	%icc,.update21		! (3_0) if( ax < 0x00800000 )
	ldd	[%fp+tmp1],%f62		! (3_1) dtmp1 = *((double*)&lexp0);
	fstod	%f0,%f48		! (3_0) db0 = (double)x0;
.cont21:
	fmuld	%f54,%f56,%f30		! (1_0) xx0 *= dtmp0;
	and	%i2,_0x1ff0,%o3		! (2_0) si0 = ax & 0x1ff0;
	lda	[%i1+stridex]0x82,%o2	! (4_0) ax = *(int*)px;
	faddd	%f50,K1,%f56		! (0_0) res0 += K1;

	add	%i1,stridex,%i1		! px += stridex
	add	%o3,TBL,%i2		! (2_0) (char*)TBL + si0
	st	%f8,[stridey+%g5]	! (1_1) *py = fres0;
	fand	%f40,DC2,%f46		! (2_0) hi0 = vis_fand(db0,DC2);

	fmuld	%f34,%f62,%f28		! (3_1) res0 *= dtmp1;
	sra	%o1,24,%o4		! (3_0) iexp0 = ax >> 24;
	ldd	[%i3+8],%f50		! (4_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
	faddd	%f52,DC1,%f54		! (4_1) res0 += DC1;

	add	%g1,stridey,%i3		! py += stridey
	subcc	counter,5,counter	! counter
	lda	[%i1]0x82,%f13		! (4_0) x0 = *px;
	fand	%f48,DC0,%f58		! (3_0) db0 = vis_fand(db0,DC0);

	bpos,pt	%icc,.main_loop
	add	%o4,960,%g5		! (3_0) iexp0 += 0x3c0;

	add	counter,5,counter	! undo the final subtraction of 5

! Tail: finish and store up to four results that are still in flight.
! Before each store counter is decremented; once it goes negative the
! batch is complete and control returns to .begin with py in %g5.
.tail:
	subcc	counter,1,counter
	bneg,a	.begin
	or	%g0,%i3,%g5		! annulled slot: py = %i3 only on exit

	fmuld	%f56,%f42,%f52		! (0_1) res0 *= xx0;
	fdtos	%f32,%f15		! (2_2) fres0 = (float)res0;

	fmuld	%f50,%f54,%f42		! (4_2) res0 = dtmp0 * res0;

	ldd	[%fp+tmp2],%f56		! (4_2) dtmp1 = *((double*)&lexp0);

	add	%i3,stridey,%o3		! py += stridey
	st	%f15,[%i3]		! (2_2) *py = fres0;

	subcc	counter,1,counter
	bneg,a	.begin
	or	%g0,%o3,%g5		! annulled slot: py = %o3 only on exit

	fmuld	%f42,%f56,%f44		! (4_2) res0 *= dtmp1;
	ldd	[%i5+8],%f58		! (0_1) dtmp0 = ((double*)((char*)TBL + si0))[1]
	faddd	%f52,DC1,%f34		! (0_1) res0 += DC1;

	fdtos	%f28,%f19		! (3_2) fres0 = (float)res0;

	fmuld	%f58,%f34,%f32		! (0_1) res0 = dtmp0 * res0;

	ldd	[%fp+tmp3],%f60		! (0_1) dtmp1 = *((double*)&lexp0);

	add	%o3,stridey,%g5		! py += stridey

	st	%f19,[%o3]		! (3_2) *py = fres0;

	subcc	counter,1,counter
	bneg,a	.begin
	nop

	fmuld	%f32,%f60,%f40		! (0_1) res0 *= dtmp1;

	fdtos	%f44,%f23		! (4_2) fres0 = (float)res0;

	add	%g5,stridey,%g5		! py += stridey
	st	%f23,[stridey+%o3]	! (4_2) *py = fres0;

	subcc	counter,1,counter
	bneg,a	.begin
	nop

	fdtos	%f40,%f27		! (0_1) fres0 = (float)res0;

	st	%f27,[%g5]		! (0_1) *py = fres0;

	ba	.begin
	add	%g5,stridey,%g5

! Special case for the element tested at .begin1 (ax >= 0x7f800000 or
! ax < 0x00800000): compute it with fsqrts, store it, advance px and py,
! decrement counter and continue at .begin1 without reloading saved state.
	.align	16
.spec:
	fsqrts	%f25,%f25
	sub	counter,1,counter
	add	%i1,stridex,%i1
	st	%f25,[%g5]
	ba	.begin1
	add	%g5,stridey,%g5

/*
 * .update0 - .update21
 *
 * Reached when a look-ahead load finds an element that must take the
 * .spec path: even-numbered handlers follow a "bge" on 0x7f800000,
 * odd-numbered ones a "bl" on 0x00800000.  .update0 - .update11 belong to
 * the pipeline prologue, .update12 - .update21 to the main loop.
 *
 * Every handler has the same shape, with k the constant in its cmp:
 *   - the fzeros sits in the delay slot of a non-annulled branch, so the
 *     looked-ahead x0 is cleared on both paths;
 *   - counter <= k: nothing else is changed; resume at .contN;
 *   - otherwise: store the address of the offending element in tmp_px and
 *     counter - k in tmp_counter, replace the looked-ahead ax with a
 *     fixed value (0x7f800000 in the even handlers, 0 in the odd ones),
 *     set counter to k and resume at .contN.  When the shortened batch
 *     has been stored, .begin reloads the saved state and .begin1 routes
 *     the offending element to .spec.
 *
 * NOTE(review): .cont9, .cont11, .cont13, .cont15, .cont17, .cont19 and
 * .cont21 are placed after the fstod of the looked-ahead element, so on
 * return from those handlers the conversion is skipped and the lane uses
 * whatever the destination register held.  That lane appears to lie
 * beyond the shortened counter and so is never stored -- confirm.
 */
	.align	16
.update0:
	cmp	counter,1
	ble	.cont0
	fzeros	%f0			! delay slot: x0 = 0

	stx	%i1,[%fp+tmp_px]	! resume later at this element
	sethi	%hi(0x7f800000),%o1	! replacement ax

	sub	counter,1,counter
	st	counter,[%fp+tmp_counter]	! elements left after this batch

	ba	.cont0
	or	%g0,1,counter		! delay slot: shorten this batch to 1

	.align	16
.update1:
	cmp	counter,1
	ble	.cont1
	fzeros	%f0

	stx	%i1,[%fp+tmp_px]
	clr	%o1

	sub	counter,1,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont1
	or	%g0,1,counter

	.align	16
.update2:
	cmp	counter,2
	ble	.cont2
	fzeros	%f13

	stx	%i1,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%o2

	sub	counter,2,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont2
	or	%g0,2,counter

	.align	16
.update3:
	cmp	counter,2
	ble	.cont3
	fzeros	%f13

	stx	%i1,[%fp+tmp_px]
	clr	%o2

	sub	counter,2,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont3
	or	%g0,2,counter

	.align	16
.update4:
	cmp	counter,3
	ble	.cont4
	fzeros	%f17

	stx	%o4,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%l1

	sub	counter,3,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont4
	or	%g0,3,counter

	.align	16
.update5:
	cmp	counter,3
	ble	.cont5
	fzeros	%f17

	stx	%o4,[%fp+tmp_px]
	clr	%l1

	sub	counter,3,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont5
	or	%g0,3,counter

	.align	16
.update6:
	cmp	counter,4
	ble	.cont6
	fzeros	%f21

	stx	%i1,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%i0

	sub	counter,4,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont6
	or	%g0,4,counter

	.align	16
.update7:
	cmp	counter,4
	ble	.cont7
	fzeros	%f21

	stx	%i1,[%fp+tmp_px]
	clr	%i0

	sub	counter,4,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont7
	or	%g0,4,counter

	.align	16
.update8:
	cmp	counter,5
	ble	.cont8
	fzeros	%f25

	stx	%o7,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%o2

	sub	counter,5,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont8
	or	%g0,5,counter

	.align	16
.update9:
	cmp	counter,5
	ble	.cont9
	fzeros	%f25

	stx	%o7,[%fp+tmp_px]
	clr	%o2

	sub	counter,5,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont9
	or	%g0,5,counter

	.align	16
.update10:
	cmp	counter,6
	ble	.cont10
	fzeros	%f0

	stx	%i1,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%o1

	sub	counter,6,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont10
	or	%g0,6,counter

	.align	16
.update11:
	cmp	counter,6
	ble	.cont11
	fzeros	%f0

	stx	%i1,[%fp+tmp_px]
	clr	%o1

	sub	counter,6,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont11
	or	%g0,6,counter

	.align	16
.update12:
	cmp	counter,2
	ble	.cont12
	fzeros	%f13

	stx	%i1,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%o2

	sub	counter,2,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont12
	or	%g0,2,counter

	.align	16
.update13:
	cmp	counter,2
	ble	.cont13
	fzeros	%f13

	stx	%i1,[%fp+tmp_px]
	clr	%o2

	sub	counter,2,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont13
	or	%g0,2,counter

	.align	16
.update14:
	cmp	counter,3
	ble	.cont14
	fzeros	%f17

	stx	%o4,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%l1

	sub	counter,3,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont14
	or	%g0,3,counter

	.align	16
.update15:
	cmp	counter,3
	ble	.cont15
	fzeros	%f17

	stx	%o4,[%fp+tmp_px]
	clr	%l1

	sub	counter,3,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont15
	or	%g0,3,counter

	.align	16
.update16:
	cmp	counter,4
	ble	.cont16
	fzeros	%f21

	stx	%i1,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%i0

	sub	counter,4,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont16
	or	%g0,4,counter

	.align	16
.update17:
	cmp	counter,4
	ble	.cont17
	fzeros	%f21

	stx	%i1,[%fp+tmp_px]
	clr	%i0

	sub	counter,4,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont17
	or	%g0,4,counter

	.align	16
.update18:
	cmp	counter,5
	ble	.cont18
	fzeros	%f25

	stx	%o7,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%o2

	sub	counter,5,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont18
	or	%g0,5,counter

	.align	16
.update19:
	cmp	counter,5
	ble	.cont19
	fzeros	%f25

	stx	%o7,[%fp+tmp_px]
	clr	%o2

	sub	counter,5,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont19
	or	%g0,5,counter

	.align	16
.update20:
	cmp	counter,6
	ble	.cont20
	fzeros	%f0

	stx	%i1,[%fp+tmp_px]
	sethi	%hi(0x7f800000),%o1

	sub	counter,6,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont20
	or	%g0,6,counter

	.align	16
.update21:
	cmp	counter,6
	ble	.cont21
	fzeros	%f0

	stx	%i1,[%fp+tmp_px]
	clr	%o1

	sub	counter,6,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont21
	or	%g0,6,counter

! All elements done: pop the register window and return to the caller.
.exit:
	ret
	restore
	SET_SIZE(__vsqrtf_ultra3)
