1/* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21/* 22 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 23 */ 24/* 25 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 26 * Use is subject to license terms. 27 */ 28 29 .file "__vrsqrtf.S" 30 31#include "libm.h" 32 33 RO_DATA 34 .align 64 35 36! i = [0,63] 37! TBL[2*i ] = 1 / (*(double*)&(0x3fe0000000000000ULL + (i << 46))) * 2**-24; 38! TBL[2*i+1] = 1 / sqrtl(*(double*)&(0x3fe0000000000000ULL + (i << 46))); 39! i = [64,127] 40! TBL[2*i ] = 1 / (*(double*)&(0x3fe0000000000000ULL + (i << 46))) * 2**-23; 41! 
TBL[2*i+1] = 1 / sqrtl(*(double*)&(0x3fe0000000000000ULL + (i << 46))); 42 43.CONST_TBL: 44 .word 0x3e800000, 0x00000000, 0x3ff6a09e, 0x667f3bcd, 45 .word 0x3e7f81f8, 0x1f81f820, 0x3ff673e3, 0x2ef63a03, 46 .word 0x3e7f07c1, 0xf07c1f08, 0x3ff6482d, 0x37a5a3d2, 47 .word 0x3e7e9131, 0xabf0b767, 0x3ff61d72, 0xb7978671, 48 .word 0x3e7e1e1e, 0x1e1e1e1e, 0x3ff5f3aa, 0x673fa911, 49 .word 0x3e7dae60, 0x76b981db, 0x3ff5cacb, 0x7802f342, 50 .word 0x3e7d41d4, 0x1d41d41d, 0x3ff5a2cd, 0x8c69d61a, 51 .word 0x3e7cd856, 0x89039b0b, 0x3ff57ba8, 0xb0ee01b9, 52 .word 0x3e7c71c7, 0x1c71c71c, 0x3ff55555, 0x55555555, 53 .word 0x3e7c0e07, 0x0381c0e0, 0x3ff52fcc, 0x468d6b54, 54 .word 0x3e7bacf9, 0x14c1bad0, 0x3ff50b06, 0xa8fc6b70, 55 .word 0x3e7b4e81, 0xb4e81b4f, 0x3ff4e6fd, 0xf33cf032, 56 .word 0x3e7af286, 0xbca1af28, 0x3ff4c3ab, 0xe93bcf74, 57 .word 0x3e7a98ef, 0x606a63be, 0x3ff4a10a, 0x97af7b92, 58 .word 0x3e7a41a4, 0x1a41a41a, 0x3ff47f14, 0x4fe17f9f, 59 .word 0x3e79ec8e, 0x951033d9, 0x3ff45dc3, 0xa3c34fa3, 60 .word 0x3e799999, 0x9999999a, 0x3ff43d13, 0x6248490f, 61 .word 0x3e7948b0, 0xfcd6e9e0, 0x3ff41cfe, 0x93ff5199, 62 .word 0x3e78f9c1, 0x8f9c18fa, 0x3ff3fd80, 0x77e70577, 63 .word 0x3e78acb9, 0x0f6bf3aa, 0x3ff3de94, 0x8077db58, 64 .word 0x3e786186, 0x18618618, 0x3ff3c036, 0x50e00e03, 65 .word 0x3e781818, 0x18181818, 0x3ff3a261, 0xba6d7a37, 66 .word 0x3e77d05f, 0x417d05f4, 0x3ff38512, 0xba21f51e, 67 .word 0x3e778a4c, 0x8178a4c8, 0x3ff36845, 0x766eec92, 68 .word 0x3e7745d1, 0x745d1746, 0x3ff34bf6, 0x3d156826, 69 .word 0x3e7702e0, 0x5c0b8170, 0x3ff33021, 0x8127c0e0, 70 .word 0x3e76c16c, 0x16c16c17, 0x3ff314c3, 0xd92a9e91, 71 .word 0x3e768168, 0x16816817, 0x3ff2f9d9, 0xfd52fd50, 72 .word 0x3e7642c8, 0x590b2164, 0x3ff2df60, 0xc5df2c9e, 73 .word 0x3e760581, 0x60581606, 0x3ff2c555, 0x2988e428, 74 .word 0x3e75c988, 0x2b931057, 0x3ff2abb4, 0x3c0eb0f4, 75 .word 0x3e758ed2, 0x308158ed, 0x3ff2927b, 0x2cd320f5, 76 .word 0x3e755555, 0x55555555, 0x3ff279a7, 0x4590331c, 77 .word 0x3e751d07, 
0xeae2f815, 0x3ff26135, 0xe91daf55, 78 .word 0x3e74e5e0, 0xa72f0539, 0x3ff24924, 0x92492492, 79 .word 0x3e74afd6, 0xa052bf5b, 0x3ff23170, 0xd2be638a, 80 .word 0x3e747ae1, 0x47ae147b, 0x3ff21a18, 0x51ff630a, 81 .word 0x3e7446f8, 0x6562d9fb, 0x3ff20318, 0xcc6a8f5d, 82 .word 0x3e741414, 0x14141414, 0x3ff1ec70, 0x124e98f9, 83 .word 0x3e73e22c, 0xbce4a902, 0x3ff1d61c, 0x070ae7d3, 84 .word 0x3e73b13b, 0x13b13b14, 0x3ff1c01a, 0xa03be896, 85 .word 0x3e738138, 0x13813814, 0x3ff1aa69, 0xe4f2777f, 86 .word 0x3e73521c, 0xfb2b78c1, 0x3ff19507, 0xecf5b9e9, 87 .word 0x3e7323e3, 0x4a2b10bf, 0x3ff17ff2, 0xe00ec3ee, 88 .word 0x3e72f684, 0xbda12f68, 0x3ff16b28, 0xf55d72d4, 89 .word 0x3e72c9fb, 0x4d812ca0, 0x3ff156a8, 0x72b5ef62, 90 .word 0x3e729e41, 0x29e4129e, 0x3ff1426f, 0xac0654db, 91 .word 0x3e727350, 0xb8812735, 0x3ff12e7d, 0x02c40253, 92 .word 0x3e724924, 0x92492492, 0x3ff11ace, 0xe560242a, 93 .word 0x3e721fb7, 0x8121fb78, 0x3ff10763, 0xcec30b26, 94 .word 0x3e71f704, 0x7dc11f70, 0x3ff0f43a, 0x45cdedad, 95 .word 0x3e71cf06, 0xada2811d, 0x3ff0e150, 0xdce2b60c, 96 .word 0x3e71a7b9, 0x611a7b96, 0x3ff0cea6, 0x317186dc, 97 .word 0x3e718118, 0x11811812, 0x3ff0bc38, 0xeb8ba412, 98 .word 0x3e715b1e, 0x5f75270d, 0x3ff0aa07, 0xbd7b7488, 99 .word 0x3e7135c8, 0x1135c811, 0x3ff09811, 0x63615499, 100 .word 0x3e711111, 0x11111111, 0x3ff08654, 0xa2d4f6db, 101 .word 0x3e70ecf5, 0x6be69c90, 0x3ff074d0, 0x4a8b1438, 102 .word 0x3e70c971, 0x4fbcda3b, 0x3ff06383, 0x31ff307a, 103 .word 0x3e70a681, 0x0a6810a7, 0x3ff0526c, 0x39213bfa, 104 .word 0x3e708421, 0x08421084, 0x3ff0418a, 0x4806de7d, 105 .word 0x3e70624d, 0xd2f1a9fc, 0x3ff030dc, 0x4ea03a72, 106 .word 0x3e704104, 0x10410410, 0x3ff02061, 0x446ffa9a, 107 .word 0x3e702040, 0x81020408, 0x3ff01018, 0x28467ee9, 108 .word 0x3e800000, 0x00000000, 0x3ff00000, 0x00000000, 109 .word 0x3e7f81f8, 0x1f81f820, 0x3fefc0bd, 0x88a0f1d9, 110 .word 0x3e7f07c1, 0xf07c1f08, 0x3fef82ec, 0x882c0f9b, 111 .word 0x3e7e9131, 0xabf0b767, 0x3fef467f, 0x2814b0cc, 112 .word 
0x3e7e1e1e, 0x1e1e1e1e, 0x3fef0b68, 0x48d2af1c, 113 .word 0x3e7dae60, 0x76b981db, 0x3feed19b, 0x75e78957, 114 .word 0x3e7d41d4, 0x1d41d41d, 0x3fee990c, 0xdad55ed2, 115 .word 0x3e7cd856, 0x89039b0b, 0x3fee61b1, 0x38f18adc, 116 .word 0x3e7c71c7, 0x1c71c71c, 0x3fee2b7d, 0xddfefa66, 117 .word 0x3e7c0e07, 0x0381c0e0, 0x3fedf668, 0x9b7e6350, 118 .word 0x3e7bacf9, 0x14c1bad0, 0x3fedc267, 0xbea45549, 119 .word 0x3e7b4e81, 0xb4e81b4f, 0x3fed8f72, 0x08e6b82d, 120 .word 0x3e7af286, 0xbca1af28, 0x3fed5d7e, 0xa914b937, 121 .word 0x3e7a98ef, 0x606a63be, 0x3fed2c85, 0x34ed6d86, 122 .word 0x3e7a41a4, 0x1a41a41a, 0x3fecfc7d, 0xa32a9213, 123 .word 0x3e79ec8e, 0x951033d9, 0x3feccd60, 0x45f5d358, 124 .word 0x3e799999, 0x9999999a, 0x3fec9f25, 0xc5bfedd9, 125 .word 0x3e7948b0, 0xfcd6e9e0, 0x3fec71c7, 0x1c71c71c, 126 .word 0x3e78f9c1, 0x8f9c18fa, 0x3fec453d, 0x90f057a2, 127 .word 0x3e78acb9, 0x0f6bf3aa, 0x3fec1982, 0xb2ece47b, 128 .word 0x3e786186, 0x18618618, 0x3febee90, 0x56fb9c39, 129 .word 0x3e781818, 0x18181818, 0x3febc460, 0x92eb3118, 130 .word 0x3e77d05f, 0x417d05f4, 0x3feb9aed, 0xba588347, 131 .word 0x3e778a4c, 0x8178a4c8, 0x3feb7232, 0x5b79db11, 132 .word 0x3e7745d1, 0x745d1746, 0x3feb4a29, 0x3c1d9550, 133 .word 0x3e7702e0, 0x5c0b8170, 0x3feb22cd, 0x56d87d7e, 134 .word 0x3e76c16c, 0x16c16c17, 0x3feafc19, 0xd8606169, 135 .word 0x3e768168, 0x16816817, 0x3fead60a, 0x1d0fb394, 136 .word 0x3e7642c8, 0x590b2164, 0x3feab099, 0xae8f539a, 137 .word 0x3e760581, 0x60581606, 0x3fea8bc4, 0x41a3d02c, 138 .word 0x3e75c988, 0x2b931057, 0x3fea6785, 0xb41bacf7, 139 .word 0x3e758ed2, 0x308158ed, 0x3fea43da, 0x0adc6899, 140 .word 0x3e755555, 0x55555555, 0x3fea20bd, 0x700c2c3e, 141 .word 0x3e751d07, 0xeae2f815, 0x3fe9fe2c, 0x315637ee, 142 .word 0x3e74e5e0, 0xa72f0539, 0x3fe9dc22, 0xbe484458, 143 .word 0x3e74afd6, 0xa052bf5b, 0x3fe9ba9d, 0xa6c73588, 144 .word 0x3e747ae1, 0x47ae147b, 0x3fe99999, 0x9999999a, 145 .word 0x3e7446f8, 0x6562d9fb, 0x3fe97913, 0x63068b54, 146 .word 0x3e741414, 0x14141414, 
0x3fe95907, 0xeb87ab44, 147 .word 0x3e73e22c, 0xbce4a902, 0x3fe93974, 0x368cfa31, 148 .word 0x3e73b13b, 0x13b13b14, 0x3fe91a55, 0x6151761c, 149 .word 0x3e738138, 0x13813814, 0x3fe8fba8, 0xa1bf6f96, 150 .word 0x3e73521c, 0xfb2b78c1, 0x3fe8dd6b, 0x4563a009, 151 .word 0x3e7323e3, 0x4a2b10bf, 0x3fe8bf9a, 0xb06e1af3, 152 .word 0x3e72f684, 0xbda12f68, 0x3fe8a234, 0x5cc04426, 153 .word 0x3e72c9fb, 0x4d812ca0, 0x3fe88535, 0xd90703c6, 154 .word 0x3e729e41, 0x29e4129e, 0x3fe8689c, 0xc7e07e7d, 155 .word 0x3e727350, 0xb8812735, 0x3fe84c66, 0xdf0ca4c2, 156 .word 0x3e724924, 0x92492492, 0x3fe83091, 0xe6a7f7e7, 157 .word 0x3e721fb7, 0x8121fb78, 0x3fe8151b, 0xb86fee1d, 158 .word 0x3e71f704, 0x7dc11f70, 0x3fe7fa02, 0x3f1068d1, 159 .word 0x3e71cf06, 0xada2811d, 0x3fe7df43, 0x7579b9b5, 160 .word 0x3e71a7b9, 0x611a7b96, 0x3fe7c4dd, 0x663ebb88, 161 .word 0x3e718118, 0x11811812, 0x3fe7aace, 0x2afa8b72, 162 .word 0x3e715b1e, 0x5f75270d, 0x3fe79113, 0xebbd7729, 163 .word 0x3e7135c8, 0x1135c811, 0x3fe777ac, 0xde80baea, 164 .word 0x3e711111, 0x11111111, 0x3fe75e97, 0x46a0b098, 165 .word 0x3e70ecf5, 0x6be69c90, 0x3fe745d1, 0x745d1746, 166 .word 0x3e70c971, 0x4fbcda3b, 0x3fe72d59, 0xc45f1fc5, 167 .word 0x3e70a681, 0x0a6810a7, 0x3fe7152e, 0x9f44f01f, 168 .word 0x3e708421, 0x08421084, 0x3fe6fd4e, 0x79325467, 169 .word 0x3e70624d, 0xd2f1a9fc, 0x3fe6e5b7, 0xd16657e1, 170 .word 0x3e704104, 0x10410410, 0x3fe6ce69, 0x31d5858d, 171 .word 0x3e702040, 0x81020408, 0x3fe6b761, 0x2ec892f6, 172 173 .word 0x3fefffff, 0xfee7f18f ! K0 = 9.99999997962321453275e-01 174 .word 0xbfdfffff, 0xfe07e52f ! K1 = -4.99999998166077580600e-01 175 .word 0x3fd80118, 0x0ca296d9 ! K2 = 3.75066768969515586277e-01 176 .word 0xbfd400fc, 0x0bbb8e78 ! K3 = -3.12560092408808548438e-01 177 .word 0x7ffe0000, 0x7ffe0000 ! DC0 178 .word 0x3f800000, 0x40000000 ! 
FTWO 179 180#define stridex %l4 181#define stridex2 %l1 182#define stridey %l3 183#define stridey2 %i2 184#define TBL %l2 185#define counter %i5 186 187#define K3 %f38 188#define K2 %f36 189#define K1 %f34 190#define K0 %f32 191#define DC0 %f4 192#define FONE %f2 193#define FTWO %f3 194 195#define _0x00800000 %o2 196#define _0x7f800000 %o4 197 198#define tmp0 STACK_BIAS-0x30 199#define tmp1 STACK_BIAS-0x28 200#define tmp2 STACK_BIAS-0x20 201#define tmp3 STACK_BIAS-0x18 202#define tmp_counter STACK_BIAS-0x10 203#define tmp_px STACK_BIAS-0x08 204 205! sizeof temp storage - must be a multiple of 16 for V9 206#define tmps 0x30 207 208!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 209! !!!!! algorithm !!!!! 210! ((float*)&ddx0)[0] = *px; 211! ax0 = *(int*)px; 212! 213! ((float*)&ddx0)[1] = *(px + stridex); 214! ax1 = *(int*)(px + stridex); 215! 216! px += stridex2; 217! 218! if ( ax0 >= 0x7f800000 ) 219! { 220! RETURN ( FONE / ((float*)&dres0)[0] ); 221! } 222! if ( ax0 < 0x00800000 ) 223! { 224! float res = ((float*)&dres0)[0]; 225! 226! if ( (ax0 & 0x7fffffff) == 0 ) /* |X| = zero */ 227! { 228! RETURN ( FONE / res ) 229! } 230! else if ( ax0 >= 0 ) /* X = denormal */ 231! { 232! double res0, xx0, tbl_div0, tbl_sqrt0; 233! float fres0; 234! int iax0, si0, iexp0; 235! 236! res = *(int*)&res; 237! res *= FTWO; 238! ax0 = *(int*)&res; 239! iexp0 = ax0 >> 24; 240! iexp0 = 0x3f + 0x4b - iexp0; 241! iexp0 = iexp0 << 23; 242! 243! si0 = (ax0 >> 13) & 0x7f0; 244! 245! tbl_div0 = ((double*)((char*)__TBL_rsqrtf + si0))[0]; 246! tbl_sqrt0 = ((double*)((char*)__TBL_rsqrtf + si0))[1]; 247! iax0 = ax0 & 0x7ffe0000; 248! iax0 = ax0 - iax0; 249! xx0 = iax0 * tbl_div0; 250! res0 = tbl_sqrt0 * (((A3 * xx0 + A2) * xx0 + A1) * xx0 + A0); 251! 252! fres0 = res0; 253! iexp0 += *(int*)&fres0; 254! RETURN(*(float*)&iexp0) 255! } 256! else /* X = negative */ 257! { 258! RETURN ( sqrtf(res) ) 259! } 260! } 261! if ( ax1 >= 0x7f800000 ) 262! { 263! 
RETURN ( FONE / ((float*)&dres0)[1] ) 264! } 265! if ( ax1 < 0x00800000 ) 266! { 267! float res = ((float*)&dres0)[1]; 268! if ( (ax1 & 0x7fffffff) == 0 ) /* |X| = zero */ 269! { 270! RETURN ( FONE / res ) 271! } 272! else if ( ax1 >= 0 ) /* X = denormal */ 273! { 274! double res0, xx0, tbl_div0, tbl_sqrt0; 275! float fres0; 276! int iax1, si0, iexp0; 277! 278! res = *(int*)&res; 279! res *= FTWO; 280! ax1 = *(int*)&res; 281! iexp0 = ax1 >> 24; 282! iexp0 = 0x3f + 0x4b - iexp0; 283! iexp0 = iexp0 << 23; 284! 285! si0 = (ax1 >> 13) & 0x7f0; 286! 287! tbl_div0 = ((double*)((char*)__TBL_rsqrtf + si0))[0]; 288! tbl_sqrt0 = ((double*)((char*)__TBL_rsqrtf + si0))[1]; 289! iax1 = ax1 & 0x7ffe0000; 290! iax1 = ax1 - iax1; 291! xx0 = iax1 * tbl_div0; 292! res0 = tbl_sqrt0 * (((A3 * xx0 + A2) * xx0 + A1) * xx0 + A0); 293! 294! fres0 = res0; 295! iexp0 += *(int*)&fres0; 296! RETURN(*(float*)&iexp0) 297! } 298! else /* X = negative */ 299! { 300! RETURN ( sqrtf(res) ) 301! } 302! } 303! 304! iexp0 = ax0 >> 24; 305! iexp1 = ax1 >> 24; 306! iexp0 = 0x3f - iexp0; 307! iexp1 = 0x3f - iexp1; 308! iexp1 &= 0x1ff; 309! lexp0 = iexp0 << 55; 310! lexp1 = iexp1 << 23; 311! 312! lexp0 |= lexp1; 313! 314! fdx0 = *((double*)&lexp0); 315! 316! si0 = ax0 >> 13; 317! si1 = ax1 >> 13; 318! si0 &= 0x7f0; 319! si1 &= 0x7f0; 320! 321! addr0 = (char*)TBL + si0; 322! addr1 = (char*)TBL + si1; 323! tbl_div0 = ((double*)((char*)TBL + si0))[0]; 324! tbl_div1 = ((double*)((char*)TBL + si1))[0]; 325! tbl_sqrt0 = ((double*)addr0)[1]; 326! tbl_sqrt1 = ((double*)addr1)[1]; 327! dfx0 = vis_fand(ddx0,DC0); 328! dfx0 = vis_fpsub32(ddx0,dfx0); 329! dtmp0 = (double)(((int*)&dfx0)[0]); 330! dtmp1 = (double)(((int*)&dfx0)[1]); 331! xx0 = dtmp0 * tbl_div0; 332! xx1 = dtmp1 * tbl_div1; 333! res0 = K3 * xx0; 334! res1 = K3 * xx1; 335! res0 += K2; 336! res1 += K2; 337! res0 *= xx0; 338! res1 *= xx1; 339! res0 += K1; 340! res1 += K1; 341! res0 *= xx0; 342! res1 *= xx1; 343! res0 += K0; 344! res1 += K0; 345! 
res0 = tbl_sqrt0 * res0; 346! res1 = tbl_sqrt1 * res1; 347! ((float*)&dres0)[0] = (float)res0; 348! ((float*)&dres0)[1] = (float)res1; 349! dres0 = vis_fpadd32(dres0,fdx0); 350! *py = ((float*)&dres0)[0]; 351! *(py + stridey) = ((float*)&dres0)[1]; 352! py += stridey2; 353! 354!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! 355 356 ENTRY(__vrsqrtf) 357 save %sp,-SA(MINFRAME)-tmps,%sp 358 PIC_SETUP(l7) 359 PIC_SET(l7,.CONST_TBL,l2) 360 361 st %i0,[%fp+tmp_counter] 362 stx %i1,[%fp+tmp_px] 363 364 ldd [TBL+2048],K0 365 sll %i2,2,stridex 366 367 ldd [TBL+2048+8],K1 368 sll %i4,2,stridey 369 mov %i3,%i2 370 371 ldd [TBL+2048+16],K2 372 sethi %hi(0x7f800000),_0x7f800000 373 sll stridex,1,stridex2 374 375 ldd [TBL+2048+24],K3 376 sethi %hi(0x00800000),_0x00800000 377 378 ldd [TBL+2048+32],DC0 379 add %g0,0x3f,%l0 380 381 ldd [TBL+2048+40],FONE 382! ld [TBL+2048+44],FTWO 383.begin: 384 ld [%fp+tmp_counter],counter 385 ldx [%fp+tmp_px],%l7 386 st %g0,[%fp+tmp_counter] 387.begin1: 388 cmp counter,0 389 ble,pn %icc,.exit 390 391 lda [%l7]0x82,%f14 ! (4_0) ((float*)&ddx0)[0] = *px; 392 393 lda [stridex+%l7]0x82,%f15 ! (5_0) ((float*)&ddx0)[1] = *(px + stridex); 394 sethi %hi(0x7ffffc00),%o0 395 396 lda [%l7]0x82,%g1 ! (4_0) ax0 = *(int*)px; 397 add %l7,stridex2,%i1 ! px += stridex2 398 add %o0,0x3ff,%o0 399 400 lda [stridex+%l7]0x82,%g5 ! (5_0) ax1 = *(int*)(px + stridex); 401 fand %f14,DC0,%f16 ! (4_0) dfx0 = vis_fand(ddx0,DC0); 402 403 sra %g1,13,%l5 ! (4_0) si0 = ax0 >> 13; 404 add %i1,stridex2,%o5 ! px += stridex2 405 406 cmp %g1,_0x7f800000 ! (4_1) ax0 ? 0x7f800000 407 bge,pn %icc,.spec0 ! (4_1) if ( ax0 >= 0x7f800000 ) 408 nop 409 410 cmp %g1,_0x00800000 ! (4_1) ax0 ? 0x00800000 411 bl,pn %icc,.spec1 ! (4_1) if ( ax0 < 0x00800000 ) 412 sra %g5,13,%l6 ! (5_0) si1 = ax1 >> 13; 413.cont_spec: 414 and %l5,2032,%l5 ! (4_0) si0 &= 0x7f0; 415 416 ldd [%l5+TBL],%f54 ! (4_0) tbl_div0 = ((double*)((char*)TBL + si0))[0]; 417 sra %g5,24,%l7 ! 
(5_0) iexp1 = ax1 >> 24; 418 and %l6,2032,%l6 ! (5_0) si1 &= 0x7f0; 419 fpsub32 %f14,%f16,%f16 ! (4_0) dfx0 = vis_fpsub32(ddx0,dfx0); 420 421 ldd [%l6+TBL],%f46 ! (5_0) tbl_div1 = ((double*)((char*)TBL + si1))[0]; 422 sra %g1,24,%i3 ! (4_0) iexp0 = ax0 >> 24; 423 sub %l0,%l7,%l7 ! (5_0) iexp1 = 0x3f - iexp1; 424 425 and %l7,511,%l1 ! (5_0) iexp1 = 0x1ff; 426 add %l6,TBL,%l6 ! (5_0) addr1 = (char*)TBL + si1; 427 428 sllx %l1,23,%l1 ! (5_0) lexp1 = iexp1 << 23; 429 sub %l0,%i3,%o0 ! (4_0) iexp0 = 0x3f - iexp0; 430 fitod %f16,%f56 ! (4_0) dtmp0 = (double)(((int*)dfx0)[0]); 431 432 sllx %o0,55,%o0 ! (4_0) lexp0 = iexp0 << 55; 433 fitod %f17,%f44 ! (5_0) dtmp1 = (double)(((int*)dfx0)[1]); 434 435 or %o0,%l1,%o0 ! (4_0) lexp0 |= lexp1; 436 437 stx %o0,[%fp+tmp0] ! (4_0) fdx0 = *((double*)lexp0); 438 439 fmuld %f56,%f54,%f40 ! (4_0) xx0 = dtmp0 * tbl_div0; 440 441 lda [%i1]0x82,%f18 ! (0_0) ((float*)&ddx0)[0] = *px; 442 fmuld %f44,%f46,%f46 ! (5_1) xx1 = dtmp1 * tbl_div1; 443 444 lda [stridex+%i1]0x82,%f19 ! (1_0) ((float*)&ddx0)[1] = *(px + stridex); 445 446 lda [%i1]0x82,%g1 ! (0_0) ax0 = *(int*)px; 447 448 lda [stridex+%i1]0x82,%i4 ! (1_0) ax1 = *(int*)(px + stridex); 449 cmp %g5,_0x7f800000 ! (5_1) ax1 ? 0x7f800000 450 bge,pn %icc,.update0 ! (5_1) if ( ax1 >= 0x7f800000 ) 451 fmuld K3,%f40,%f52 ! (4_1) res0 = K3 * xx0; 452.cont0: 453 fmuld K3,%f46,%f50 ! (5_1) res1 = K3 * xx1; 454 cmp %g5,_0x00800000 ! (5_1) ax1 ? 0x00800000 455 bl,pn %icc,.update1 ! (5_1) if ( ax1 < 0x00800000 ) 456 fand %f18,DC0,%f56 ! (0_0) dfx0 = vis_fand(ddx0,DC0); 457.cont1: 458 sra %g1,13,%o0 ! (0_0) si0 = ax0 >> 13; 459 cmp %g1,_0x7f800000 ! (0_0) ax0 ? 0x7f800000 460 461 sra %i4,13,%g5 ! (1_0) si1 = ax1 >> 13; 462 and %o0,2032,%o0 ! (0_0) si0 &= 0x7f0; 463 464 ldd [%o0+TBL],%f54 ! (0_0) tbl_div0 = ((double*)((char*)TBL + si0))[0]; 465 sra %i4,24,%i1 ! (1_0) iexp1 = ax1 >> 24; 466 and %g5,2032,%o7 ! (1_0) si1 &= 0x7f0; 467 fpsub32 %f18,%f56,%f30 ! 
(0_0) dfx0 = vis_fpsub32(ddx0,dfx0); 468 469 ldd [%o7+TBL],%f44 ! (1_0) tbl_div1 = ((double*)((char*)TBL + si1))[0]; 470 sra %g1,24,%i3 ! (0_0) iexp0 = ax0 >> 24; 471 sub %l0,%i1,%i1 ! (1_0) iexp1 = 0x3f - iexp1; 472 faddd %f52,K2,%f62 ! (4_1) res0 += K2; 473 474 sub %l0,%i3,%g5 ! (0_0) iexp0 = 0x3f - iexp0; 475 bge,pn %icc,.update2 ! (0_0) if ( ax0 >= 0x7f800000 ) 476 faddd %f50,K2,%f60 ! (5_1) res1 += K2; 477.cont2: 478 cmp %g1,_0x00800000 ! (0_0) ax0 ? 0x00800000 479 and %i1,511,%i0 ! (1_0) iexp1 = 0x1ff; 480 fitod %f30,%f56 ! (0_0) dtmp0 = (double)(((int*)dfx0)[0]); 481 482 sllx %i0,23,%i0 ! (1_0) lexp1 = iexp1 << 23; 483 bl,pn %icc,.update3 ! (0_0) if ( ax0 < 0x00800000 ) 484 fitod %f31,%f50 ! (1_0) dtmp0 = (double)(((int*)dfx0)[0]); 485.cont3: 486 fmuld %f62,%f40,%f30 ! (4_1) res0 *= xx0; 487 sllx %g5,55,%g5 ! (0_0) lexp0 = iexp0 << 55; 488 489 fmuld %f60,%f46,%f48 ! (5_1) res1 *= xx1; 490 or %g5,%i0,%g5 ! (0_0) lexp0 |= lexp1; 491 stx %g5,[%fp+tmp1] ! (0_0) fdx0 = *((double*)lexp0); 492 493 fmuld %f56,%f54,%f26 ! (0_0) xx0 = dtmp0 * tbl_div0; 494 sll stridex,1,stridex2 ! stridex2 = stridex * 2; 495 496 lda [%o5]0x82,%f24 ! (2_0) ((float*)&ddx0)[0] = *px; 497 add %o7,TBL,%o7 ! (1_0) addr0 = (char*)TBL + si0; 498 fmuld %f50,%f44,%f44 ! (1_0) xx0 = dtmp0 * tbl_div0; 499 500 lda [stridex+%o5]0x82,%f25 ! (3_0) ((float*)&ddx0)[1] = *(px + stridex); 501 add %l5,TBL,%l5 ! (4_1) addr0 = (char*)TBL + si0; 502 faddd %f30,K1,%f62 ! (4_1) res0 += K1; 503 504 lda [%o5]0x82,%g1 ! (2_0) ax0 = *(int*)px; 505 add %o5,stridex2,%l7 ! px += stridex2 506 faddd %f48,K1,%f42 ! (5_1) res1 += K1; 507 508 lda [stridex+%o5]0x82,%o5 ! (3_0) ax1 = *(int*)(px + stridex); 509 cmp %i4,_0x7f800000 ! (1_0) ax1 ? 0x7f800000 510 bge,pn %icc,.update4 ! (1_0) if ( ax1 >= 0x7f800000 ) 511 fmuld K3,%f26,%f52 ! (0_0) res0 = K3 * xx0; 512.cont4: 513 fmuld K3,%f44,%f50 ! (1_0) res1 = K3 * xx1; 514 cmp %i4,_0x00800000 ! (1_0) ax1 ? 0x00800000 515 bl,pn %icc,.update5 ! 
(1_0) if ( ax1 < 0x00800000 ) 516 fand %f24,DC0,%f54 ! (2_0) dfx0 = vis_fand(ddx0,DC0); 517.cont5: 518 fmuld %f62,%f40,%f48 ! (4_1) res0 *= xx0; 519 sra %g1,13,%i0 ! (2_0) si0 = ax0 >> 13; 520 cmp %g1,_0x7f800000 ! (2_0) ax0 ? 0x7f800000 521 522 fmuld %f42,%f46,%f58 ! (5_1) res1 *= xx1; 523 sra %o5,13,%o1 ! (3_0) si1 = ax1 >> 13; 524 and %i0,2032,%i0 ! (2_0) si0 &= 0x7f0; 525 526 ldd [%i0+TBL],%f30 ! (2_0) tbl_div0 = ((double*)((char*)TBL + si0))[0]; 527 sra %o5,24,%o3 ! (3_0) iexp1 = ax1 >> 24; 528 and %o1,2032,%o1 ! (3_0) si1 &= 0x7f0; 529 fpsub32 %f24,%f54,%f12 ! (2_0) dfx0 = vis_fpsub32(ddx0,dfx0); 530 531 ldd [%o1+TBL],%f46 ! (3_0) tbl_div1 = ((double*)((char*)TBL + si1))[0]; 532 sra %g1,24,%i3 ! (2_0) iexp0 = ax0 >> 24; 533 sub %l0,%o3,%o3 ! (3_0) iexp1 = 0x3f - iexp1; 534 faddd %f52,K2,%f40 ! (0_0) res0 += K2; 535 536 ldd [%l5+8],%f42 ! (4_1) tbl_sqrt0 = ((double*)addr0)[1]; 537 sub %l0,%i3,%g5 ! (2_0) iexp0 = 0x3f - iexp0; 538 and %o3,511,%i3 ! (3_0) iexp1 &= 0x1ff; 539 faddd %f50,K2,%f60 ! (1_0) res0 += K2; 540 541 ldd [%l6+8],%f28 ! (5_1) tbl_sqrt1 = ((double*)addr1)[1]; 542 sllx %g5,55,%g5 ! (2_0) lexp0 = iexp0 << 55; 543 add %i0,TBL,%i0 ! (2_0) addr0 = (char*)TBL + si0; 544 fitod %f12,%f56 ! (2_0) dtmp0 = (double)(((int*)dfx0)[0]); 545 546 sllx %i3,23,%i3 ! (3_0) lexp1 = iexp1 << 23; 547 fitod %f13,%f50 ! (3_0) dtmp1 = (double)(((int*)dfx0)[1]); 548 549 fmuld %f40,%f26,%f40 ! (0_0) res0 *= xx0; 550 or %g5,%i3,%g5 ! (2_0) lexp0 |= lexp1; 551 faddd %f48,K0,%f62 ! (4_1) res0 += K0; 552 553 fmuld %f60,%f44,%f48 ! (1_0) res1 *= xx1; 554 add %o1,TBL,%o1 ! (3_0) addr1 = (char*)TBL + si1; 555 stx %g5,[%fp+tmp2] ! (2_0) fdx0 = *((double*)lexp0); 556 faddd %f58,K0,%f60 ! (5_1) res1 += K0; 557 558 fmuld %f56,%f30,%f30 ! (2_0) xx0 = dtmp0 * tbl_div0; 559 bge,pn %icc,.update6 ! (2_0) if ( ax0 >= 0x7f800000 ) 560 lda [%l7]0x82,%f14 ! (4_0) ((float*)&ddx0)[0] = *px; 561.cont6: 562 cmp %g1,_0x00800000 ! (2_0) ax0 ? 0x00800000 563 bl,pn %icc,.update7 ! 
(2_0) if ( ax0 < 0x00800000 ) 564 nop 565.cont7: 566 fmuld %f50,%f46,%f24 ! (3_0) xx1 = dtmp1 * tbl_div1; 567 568 lda [stridex+%l7]0x82,%f15 ! (5_0) ((float*)&ddx0)[1] = *(px + stridex); 569 cmp %o5,_0x7f800000 ! (3_0) ax1 ? 0x7f800000 570 fmuld %f42,%f62,%f58 ! (4_1) res0 = tbl_sqrt0 * res0; 571 faddd %f40,K1,%f46 ! (0_0) res0 += K1; 572 573 lda [%l7]0x82,%g1 ! (4_0) ax0 = *(int*)px; 574 add %l7,stridex2,%i1 ! px += stridex2 575 fmuld %f28,%f60,%f56 ! (5_1) res1 = tbl_sqrt1 * res1; 576 faddd %f48,K1,%f62 ! (1_0) res1 += K1; 577 578 lda [stridex+%l7]0x82,%g5 ! (5_0) ax1 = *(int*)(px + stridex); 579 add %o0,TBL,%o0 ! (0_0) addr0 = (char*)TBL + si0; 580 bge,pn %icc,.update8 ! (3_0) if ( ax1 >= 0x7f800000 ) 581 fmuld K3,%f30,%f52 ! (2_0) res0 = K3 * xx0; 582.cont8: 583 fmuld K3,%f24,%f50 ! (3_0) res1 = K3 * xx1; 584 cmp %o5,_0x00800000 ! (3_0) ax1 ? 0x00800000 585 bl,pn %icc,.update9 ! (3_0) if ( ax1 < 0x00800000 ) 586 fand %f14,DC0,%f16 ! (4_0) dfx0 = vis_fand(ddx0,DC0); 587.cont9: 588 fmuld %f46,%f26,%f48 ! (0_0) res0 *= xx0; 589 sra %g1,13,%l5 ! (4_0) si0 = ax0 >> 13; 590 add %i1,stridex2,%o5 ! px += stridex2 591 fdtos %f58,%f6 ! (4_1) ((float*)&dres0)[0] = (float)res0; 592 593 fmuld %f62,%f44,%f40 ! (1_0) res1 *= xx1; 594 sra %g5,13,%l6 ! (5_0) si1 = ax1 >> 13; 595 and %l5,2032,%l5 ! (4_0) si0 &= 0x7f0; 596 fdtos %f56,%f7 ! (5_1) ((float*)&dres0)[1] = (float)res1; 597 598 ldd [%l5+TBL],%f54 ! (4_0) tbl_div0 = ((double*)((char*)TBL + si0))[0]; 599 sra %g5,24,%l7 ! (5_0) iexp1 = ax1 >> 24; 600 and %l6,2032,%l6 ! (5_0) si1 &= 0x7f0; 601 fpsub32 %f14,%f16,%f16 ! (4_0) dfx0 = vis_fpsub32(ddx0,dfx0); 602 603 ldd [%l6+TBL],%f46 ! (5_0) tbl_div1 = ((double*)((char*)TBL + si1))[0]; 604 sra %g1,24,%i3 ! (4_0) iexp0 = ax0 >> 24; 605 sub %l0,%l7,%l7 ! (5_0) iexp1 = 0x3f - iexp1; 606 faddd %f52,K2,%f58 ! (2_0) res0 += K2; 607 608 ldd [%o0+8],%f42 ! (0_0) tbl_sqrt0 = ((double*)addr0)[1]; 609 and %l7,511,%l1 ! (5_0) iexp1 = 0x1ff; 610 add %l6,TBL,%l6 ! 
(5_0) addr1 = (char*)TBL + si1; 611 faddd %f50,K2,%f60 ! (3_0) res1 += K2; 612 613 ldd [%o7+8],%f28 ! (1_0) tbl_sqrt1 = ((double*)addr1)[1]; 614 sllx %l1,23,%l1 ! (5_0) lexp1 = iexp1 << 23; 615 sub %l0,%i3,%o0 ! (4_0) iexp0 = 0x3f - iexp0; 616 fitod %f16,%f56 ! (4_0) dtmp0 = (double)(((int*)dfx0)[0]); 617 618 ldd [%fp+tmp0],%f52 ! (4_1) fdx0 = *((double*)lexp0); 619 sllx %o0,55,%o0 ! (4_0) lexp0 = iexp0 << 55; 620 fitod %f17,%f44 ! (5_0) dtmp1 = (double)(((int*)dfx0)[1]); 621 622 fmuld %f58,%f30,%f62 ! (2_0) res0 *= xx0; 623 or %o0,%l1,%o0 ! (4_0) lexp0 |= lexp1; 624 faddd %f48,K0,%f22 ! (0_0) res0 += K0; 625 626 fmuld %f60,%f24,%f58 ! (3_0) res1 *= xx1; 627 stx %o0,[%fp+tmp0] ! (4_0) fdx0 = *((double*)lexp0); 628 faddd %f40,K0,%f26 ! (1_0) res1 += K0; 629 630 fmuld %f56,%f54,%f40 ! (4_0) xx0 = dtmp0 * tbl_div0; 631 fpadd32 %f6,%f52,%f10 ! (4_1) dres0 = vis_fpadd32(dres0,fdx0); 632 633 or %g0,%i2,%l7 634 add stridey,stridey,stridey2 635 636 cmp counter,6 637 bl,pn %icc,.tail 638 nop 639 640 ba .main_loop 641 sub counter,6,counter ! counter 642 643 .align 16 644.main_loop: 645 lda [%i1]0x82,%f18 ! (0_0) ((float*)&ddx0)[0] = *px; 646 cmp %g1,_0x7f800000 ! (4_1) ax0 ? 0x7f800000 647 bge,pn %icc,.update10 ! (4_1) if ( ax0 >= 0x7f800000 ) 648 fmuld %f44,%f46,%f46 ! (5_1) xx1 = dtmp1 * tbl_div1; 649.cont10: 650 lda [stridex+%i1]0x82,%f19 ! (1_0) ((float*)&ddx0)[1] = *(px + stridex); 651 cmp %g1,_0x00800000 ! (4_1) ax0 ? 0x00800000 652 fmuld %f42,%f22,%f44 ! (0_1) res0 = tbl_sqrt0 * res0; 653 faddd %f62,K1,%f42 ! (2_1) res0 += K1; 654 655 lda [%i1]0x82,%g1 ! (0_0) ax0 = *(int*)px; 656 fmuld %f28,%f26,%f60 ! (1_1) res1 = tbl_sqrt1 * res1; 657 bl,pn %icc,.update11 ! (4_1) if ( ax0 < 0x00800000 ) 658 faddd %f58,K1,%f62 ! (3_1) res1 += K1; 659.cont11: 660 lda [stridex+%i1]0x82,%i4 ! (1_0) ax1 = *(int*)(px + stridex); 661 cmp %g5,_0x7f800000 ! (5_1) ax1 ? 0x7f800000 662 bge,pn %icc,.update12 ! (5_1) if ( ax1 >= 0x7f800000 ) 663 fmuld K3,%f40,%f52 ! 
(4_1) res0 = K3 * xx0; 664.cont12: 665 fmuld K3,%f46,%f50 ! (5_1) res1 = K3 * xx1; 666 cmp %g5,_0x00800000 ! (5_1) ax1 ? 0x00800000 667 bl,pn %icc,.update13 ! (5_1) if ( ax1 < 0x00800000 ) 668 fand %f18,DC0,%f56 ! (0_0) dfx0 = vis_fand(ddx0,DC0); 669.cont13: 670 fmuld %f42,%f30,%f48 ! (2_1) res0 *= xx0; 671 sra %g1,13,%o0 ! (0_0) si0 = ax0 >> 13; 672 cmp %g1,_0x7f800000 ! (0_0) ax0 ? 0x7f800000 673 fdtos %f44,%f8 ! (0_1) ((float*)&dres0)[0] = (float)res0; 674 675 fmuld %f62,%f24,%f58 ! (3_1) res1 *= xx1; 676 sra %i4,13,%g5 ! (1_0) si1 = ax1 >> 13; 677 and %o0,2032,%o0 ! (0_0) si0 &= 0x7f0; 678 fdtos %f60,%f9 ! (1_1) ((float*)&dres0)[1] = (float)res1; 679 680 ldd [%o0+TBL],%f54 ! (0_0) tbl_div0 = ((double*)((char*)TBL + si0))[0]; 681 sra %i4,24,%i1 ! (1_0) iexp1 = ax1 >> 24; 682 and %g5,2032,%o7 ! (1_0) si1 &= 0x7f0; 683 fpsub32 %f18,%f56,%f30 ! (0_0) dfx0 = vis_fpsub32(ddx0,dfx0); 684 685 ldd [%o7+TBL],%f44 ! (1_0) tbl_div1 = ((double*)((char*)TBL + si1))[0]; 686 sra %g1,24,%i3 ! (0_0) iexp0 = ax0 >> 24; 687 sub %l0,%i1,%i1 ! (1_0) iexp1 = 0x3f - iexp1; 688 faddd %f52,K2,%f62 ! (4_1) res0 += K2; 689 690 ldd [%i0+8],%f42 ! (2_1) tbl_sqrt0 = ((double*)addr0)[1]; 691 sub %l0,%i3,%g5 ! (0_0) iexp0 = 0x3f - iexp0; 692 bge,pn %icc,.update14 ! (0_0) if ( ax0 >= 0x7f800000 ) 693 faddd %f50,K2,%f60 ! (5_1) res1 += K2; 694.cont14: 695 ldd [%o1+8],%f28 ! (3_1) tbl_sqrt1 = ((double*)addr0)[1]; 696 cmp %g1,_0x00800000 ! (0_0) ax0 ? 0x00800000 697 and %i1,511,%i0 ! (1_0) iexp1 = 0x1ff; 698 fitod %f30,%f56 ! (0_0) dtmp0 = (double)(((int*)dfx0)[0]); 699 700 ldd [%fp+tmp1],%f52 ! (0_1) fdx0 = *((double*)lexp0); 701 sllx %i0,23,%i0 ! (1_0) lexp1 = iexp1 << 23; 702 bl,pn %icc,.update15 ! (0_0) if ( ax0 < 0x00800000 ) 703 fitod %f31,%f50 ! (1_0) dtmp0 = (double)(((int*)dfx0)[0]); 704.cont15: 705 fmuld %f62,%f40,%f30 ! (4_1) res0 *= xx0; 706 sllx %g5,55,%g5 ! (0_0) lexp0 = iexp0 << 55; 707 st %f10,[%l7] ! (4_2) *py = ((float*)&dres0)[0]; 708 faddd %f48,K0,%f62 ! 
(2_1) res0 += K0; 709 710 fmuld %f60,%f46,%f48 ! (5_1) res1 *= xx1; 711 or %g5,%i0,%g5 ! (0_0) lexp0 |= lexp1; 712 stx %g5,[%fp+tmp1] ! (0_0) fdx0 = *((double*)lexp0); 713 faddd %f58,K0,%f60 ! (3_1) res1 += K0; 714 715 fmuld %f56,%f54,%f26 ! (0_0) xx0 = dtmp0 * tbl_div0; 716 sll stridex,1,stridex2 ! stridex2 = stridex * 2; 717 st %f11,[stridey+%l7] ! (5_2) *(py + stridey) = ((float*)&dres0)[1]; 718 fpadd32 %f8,%f52,%f10 ! (0_1) dres0 = vis_fpadd32(dres0,fdx0); 719 720 lda [%o5]0x82,%f24 ! (2_0) ((float*)&ddx0)[0] = *px; 721 add %l7,stridey2,%i1 ! py += stridey2 722 add %o7,TBL,%o7 ! (1_0) addr0 = (char*)TBL + si0; 723 fmuld %f50,%f44,%f44 ! (1_0) xx0 = dtmp0 * tbl_div0; 724 725 lda [stridex+%o5]0x82,%f25 ! (3_0) ((float*)&ddx0)[1] = *(px + stridex); 726 add %l5,TBL,%l5 ! (4_1) addr0 = (char*)TBL + si0; 727 fmuld %f42,%f62,%f58 ! (2_1) res0 = tbl_sqrt0 * res0; 728 faddd %f30,K1,%f62 ! (4_1) res0 += K1; 729 730 lda [%o5]0x82,%g1 ! (2_0) ax0 = *(int*)px; 731 add %o5,stridex2,%l7 ! px += stridex2 732 fmuld %f28,%f60,%f56 ! (3_1) res1 = tbl_sqrt1 * res1; 733 faddd %f48,K1,%f42 ! (5_1) res1 += K1; 734 735 lda [stridex+%o5]0x82,%o5 ! (3_0) ax1 = *(int*)(px + stridex); 736 cmp %i4,_0x7f800000 ! (1_0) ax1 ? 0x7f800000 737 bge,pn %icc,.update16 ! (1_0) if ( ax1 >= 0x7f800000 ) 738 fmuld K3,%f26,%f52 ! (0_0) res0 = K3 * xx0; 739.cont16: 740 fmuld K3,%f44,%f50 ! (1_0) res1 = K3 * xx1; 741 cmp %i4,_0x00800000 ! (1_0) ax1 ? 0x00800000 742 bl,pn %icc,.update17 ! (1_0) if ( ax1 < 0x00800000 ) 743 fand %f24,DC0,%f54 ! (2_0) dfx0 = vis_fand(ddx0,DC0); 744.cont17: 745 fmuld %f62,%f40,%f48 ! (4_1) res0 *= xx0; 746 sra %g1,13,%i0 ! (2_0) si0 = ax0 >> 13; 747 cmp %g1,_0x7f800000 ! (2_0) ax0 ? 0x7f800000 748 fdtos %f58,%f20 ! (2_1) ((float*)&dres0)[0] = (float)res0; 749 750 fmuld %f42,%f46,%f58 ! (5_1) res1 *= xx1; 751 sra %o5,13,%o1 ! (3_0) si1 = ax1 >> 13; 752 and %i0,2032,%i0 ! (2_0) si0 &= 0x7f0; 753 fdtos %f56,%f21 ! 
(3_1) ((float*)&dres0)[0] = (float)res0; 754 755 ldd [%i0+TBL],%f30 ! (2_0) tbl_div0 = ((double*)((char*)TBL + si0))[0]; 756 sra %o5,24,%o3 ! (3_0) iexp1 = ax1 >> 24; 757 and %o1,2032,%o1 ! (3_0) si1 &= 0x7f0; 758 fpsub32 %f24,%f54,%f12 ! (2_0) dfx0 = vis_fpsub32(ddx0,dfx0); 759 760 ldd [%o1+TBL],%f46 ! (3_0) tbl_div1 = ((double*)((char*)TBL + si1))[0]; 761 sra %g1,24,%i3 ! (2_0) iexp0 = ax0 >> 24; 762 sub %l0,%o3,%o3 ! (3_0) iexp1 = 0x3f - iexp1; 763 faddd %f52,K2,%f40 ! (0_0) res0 += K2; 764 765 ldd [%l5+8],%f42 ! (4_1) tbl_sqrt0 = ((double*)addr0)[1]; 766 sub %l0,%i3,%g5 ! (2_0) iexp0 = 0x3f - iexp0; 767 and %o3,511,%i3 ! (3_0) iexp1 &= 0x1ff; 768 faddd %f50,K2,%f60 ! (1_0) res0 += K2; 769 770 ldd [%l6+8],%f28 ! (5_1) tbl_sqrt1 = ((double*)addr1)[1]; 771 sllx %g5,55,%g5 ! (2_0) lexp0 = iexp0 << 55; 772 add %i0,TBL,%i0 ! (2_0) addr0 = (char*)TBL + si0; 773 fitod %f12,%f56 ! (2_0) dtmp0 = (double)(((int*)dfx0)[0]); 774 775 ldd [%fp+tmp2],%f52 ! (2_1) fdx0 = *((double*)lexp0); 776 sllx %i3,23,%i3 ! (3_0) lexp1 = iexp1 << 23; 777 add %i1,stridey2,%o3 ! py += stridey2 778 fitod %f13,%f50 ! (3_0) dtmp1 = (double)(((int*)dfx0)[1]); 779 780 fmuld %f40,%f26,%f40 ! (0_0) res0 *= xx0; 781 or %g5,%i3,%g5 ! (2_0) lexp0 |= lexp1; 782 st %f10,[%i1] ! (0_1) *py = ((float*)&dres0)[0]; 783 faddd %f48,K0,%f62 ! (4_1) res0 += K0; 784 785 fmuld %f60,%f44,%f48 ! (1_0) res1 *= xx1; 786 add %o1,TBL,%o1 ! (3_0) addr1 = (char*)TBL + si1; 787 stx %g5,[%fp+tmp2] ! (2_0) fdx0 = *((double*)lexp0); 788 faddd %f58,K0,%f60 ! (5_1) res1 += K0; 789 790 fmuld %f56,%f30,%f30 ! (2_0) xx0 = dtmp0 * tbl_div0; 791 bge,pn %icc,.update18 ! (2_0) if ( ax0 >= 0x7f800000 ) 792 st %f11,[stridey+%i1] ! (1_1) *(py + stridey) = ((float*)&dres0)[1]; 793 fpadd32 %f20,%f52,%f0 ! (2_1) dres0 = vis_fpadd32(dres0,fdx0); 794.cont18: 795 cmp %g1,_0x00800000 ! (2_0) ax0 ? 0x00800000 796 bl,pn %icc,.update19 ! (2_0) if ( ax0 < 0x00800000 ) 797 lda [%l7]0x82,%f14 ! 
(4_0) ((float*)&ddx0)[0] = *px; 798 fmuld %f50,%f46,%f24 ! (3_0) xx1 = dtmp1 * tbl_div1; 799.cont19: 800 lda [stridex+%l7]0x82,%f15 ! (5_0) ((float*)&ddx0)[1] = *(px + stridex); 801 cmp %o5,_0x7f800000 ! (3_0) ax1 ? 0x7f800000 802 fmuld %f42,%f62,%f58 ! (4_1) res0 = tbl_sqrt0 * res0; 803 faddd %f40,K1,%f46 ! (0_0) res0 += K1; 804 805 lda [%l7]0x82,%g1 ! (4_0) ax0 = *(int*)px; 806 add %l7,stridex2,%i1 ! px += stridex2 807 fmuld %f28,%f60,%f56 ! (5_1) res1 = tbl_sqrt1 * res1; 808 faddd %f48,K1,%f62 ! (1_0) res1 += K1; 809 810 lda [stridex+%l7]0x82,%g5 ! (5_0) ax1 = *(int*)(px + stridex); 811 add %o0,TBL,%o0 ! (0_0) addr0 = (char*)TBL + si0; 812 bge,pn %icc,.update20 ! (3_0) if ( ax1 >= 0x7f800000 ) 813 fmuld K3,%f30,%f52 ! (2_0) res0 = K3 * xx0; 814.cont20: 815 fmuld K3,%f24,%f50 ! (3_0) res1 = K3 * xx1; 816 cmp %o5,_0x00800000 ! (3_0) ax1 ? 0x00800000 817 bl,pn %icc,.update21 ! (3_0) if ( ax1 < 0x00800000 ) 818 fand %f14,DC0,%f16 ! (4_0) dfx0 = vis_fand(ddx0,DC0); 819.cont21: 820 fmuld %f46,%f26,%f48 ! (0_0) res0 *= xx0; 821 sra %g1,13,%l5 ! (4_0) si0 = ax0 >> 13; 822 add %i1,stridex2,%o5 ! px += stridex2 823 fdtos %f58,%f6 ! (4_1) ((float*)&dres0)[0] = (float)res0; 824 825 fmuld %f62,%f44,%f40 ! (1_0) res1 *= xx1; 826 sra %g5,13,%l6 ! (5_0) si1 = ax1 >> 13; 827 and %l5,2032,%l5 ! (4_0) si0 &= 0x7f0; 828 fdtos %f56,%f7 ! (5_1) ((float*)&dres0)[1] = (float)res1; 829 830 ldd [%l5+TBL],%f54 ! (4_0) tbl_div0 = ((double*)((char*)TBL + si0))[0]; 831 sra %g5,24,%l7 ! (5_0) iexp1 = ax1 >> 24; 832 and %l6,2032,%l6 ! (5_0) si1 &= 0x7f0; 833 fpsub32 %f14,%f16,%f16 ! (4_0) dfx0 = vis_fpsub32(ddx0,dfx0); 834 835 ldd [%l6+TBL],%f46 ! (5_0) tbl_div1 = ((double*)((char*)TBL + si1))[0]; 836 sra %g1,24,%i3 ! (4_0) iexp0 = ax0 >> 24; 837 sub %l0,%l7,%l7 ! (5_0) iexp1 = 0x3f - iexp1; 838 faddd %f52,K2,%f58 ! (2_0) res0 += K2; 839 840 ldd [%o0+8],%f42 ! (0_0) tbl_sqrt0 = ((double*)addr0)[1]; 841 and %l7,511,%l1 ! (5_0) iexp1 = 0x1ff; 842 add %l6,TBL,%l6 ! 
(5_0) addr1 = (char*)TBL + si1; 843 faddd %f50,K2,%f60 ! (3_0) res1 += K2; 844 845 ldd [%o7+8],%f28 ! (1_0) tbl_sqrt1 = ((double*)addr1)[1]; 846 sllx %l1,23,%l1 ! (5_0) lexp1 = iexp1 << 23; 847 sub %l0,%i3,%o0 ! (4_0) iexp0 = 0x3f - iexp0; 848 fitod %f16,%f56 ! (4_0) dtmp0 = (double)(((int*)dfx0)[0]); 849 850 ldd [%fp+tmp0],%f52 ! (4_1) fdx0 = *((double*)lexp0); 851 sllx %o0,55,%o0 ! (4_0) lexp0 = iexp0 << 55; 852 add %o3,stridey2,%l7 ! py += stridey2 853 fitod %f17,%f44 ! (5_0) dtmp1 = (double)(((int*)dfx0)[1]); 854 855 fmuld %f58,%f30,%f62 ! (2_0) res0 *= xx0; 856 or %o0,%l1,%o0 ! (4_0) lexp0 |= lexp1; 857 st %f0,[%o3] ! (2_1) *py = ((float*)&dres0)[0]; 858 faddd %f48,K0,%f22 ! (0_0) res0 += K0; 859 860 fmuld %f60,%f24,%f58 ! (3_0) res1 *= xx1; 861 subcc counter,6,counter ! counter -= 6; 862 stx %o0,[%fp+tmp0] ! (4_0) fdx0 = *((double*)lexp0); 863 faddd %f40,K0,%f26 ! (1_0) res1 += K0; 864 865 fmuld %f56,%f54,%f40 ! (4_0) xx0 = dtmp0 * tbl_div0; 866 st %f1,[stridey+%o3] ! (3_1) *(py + stridey) = ((float*)&dres0)[1]; 867 bpos,pt %icc,.main_loop 868 fpadd32 %f6,%f52,%f10 ! (4_1) dres0 = vis_fpadd32(dres0,fdx0); 869 870 add counter,6,counter 871.tail: 872 sll stridex,1,stridex2 873 subcc counter,1,counter 874 bneg,a .begin 875 mov %l7,%i2 876 877 fmuld %f42,%f22,%f44 ! (0_1) res0 = tbl_sqrt0 * res0; 878 faddd %f62,K1,%f42 ! (2_1) res0 += K1; 879 880 fmuld %f28,%f26,%f60 ! (1_1) res1 = tbl_sqrt1 * res1; 881 882 fmuld %f42,%f30,%f48 ! (2_1) res0 *= xx0; 883 fdtos %f44,%f8 ! (0_1) ((float*)&dres0)[0] = (float)res0; 884 885 fdtos %f60,%f9 ! (1_1) ((float*)&dres0)[1] = (float)res1; 886 887 ldd [%i0+8],%f42 ! (2_1) tbl_sqrt0 = ((double*)addr0)[1]; 888 889 ldd [%fp+tmp1],%f52 ! (0_1) fdx0 = *((double*)lexp0); 890 891 st %f10,[%l7] ! (4_2) *py = ((float*)&dres0)[0]; 892 subcc counter,1,counter 893 bneg,a .begin 894 add %l7,stridey,%i2 895 896 faddd %f48,K0,%f62 ! (2_1) res0 += K0; 897 st %f11,[stridey+%l7] ! 
(5_2) *(py + stridey) = ((float*)&dres0)[1]; 898 subcc counter,1,counter 899 bneg,a .begin 900 add %l7,stridey2,%i2 901 fpadd32 %f8,%f52,%f10 ! (0_1) dres0 = vis_fpadd32(dres0,fdx0); 902 903 add %l7,stridey2,%i1 ! py += stridey2 904 905 fmuld %f42,%f62,%f58 ! (2_1) res0 = tbl_sqrt0 * res0; 906 907 fdtos %f58,%f20 ! (2_1) ((float*)&dres0)[0] = (float)res0; 908 909 ldd [%fp+tmp2],%f52 ! (2_1) fdx0 = *((double*)lexp0); 910 add %i1,stridey2,%o3 ! py += stridey2 911 912 st %f10,[%i1] ! (0_1) *py = ((float*)&dres0)[0]; 913 subcc counter,1,counter 914 bneg,a .begin 915 add %i1,stridey,%i2 916 917 st %f11,[stridey+%i1] ! (1_1) *(py + stridey) = ((float*)&dres0)[1]; 918 subcc counter,1,counter 919 bneg,a .begin 920 mov %o3,%i2 921 fpadd32 %f20,%f52,%f0 ! (2_1) dres0 = vis_fpadd32(dres0,fdx0); 922 923 st %f0,[%o3] ! (2_1) *py = ((float*)&dres0)[0]; 924 ba .begin 925 add %o3,stridey,%i2 926 927 .align 16 928.spec0: 929 fdivs FONE,%f14,%f14 ! x0 = FONE / x0; 930 add %l7,stridex,%l7 ! px += stridex 931 st %f14,[%i2] ! *py = x0; 932 sub counter,1,counter 933 ba .begin1 934 add %i2,stridey,%i2 ! py += stridey 935 936 .align 16 937.spec1: 938 andcc %g1,%o0,%g0 939 bz,a 1f 940 fdivs FONE,%f14,%f14 ! x0 = DONE / x0; 941 942 cmp %g1,0 943 bl,a 1f 944 fsqrts %f14,%f14 ! x0 = sqrtf(x0); 945 946 fitod %f14,%f0 947 fdtos %f0,%f14 948 fmuls %f14,FTWO,%f14 949 st %f14,[%fp+tmp3] 950 ld [%fp+tmp3],%g1 951 sethi %hi(0x4b000000),%o0 952 sra %g1,13,%l5 ! (4_0) si0 = ax0 >> 13; 953 fands %f14,DC0,%f16 ! (4_0) dfx0 = vis_fand(ddx0,DC0); 954 ba .cont_spec 955 sub %g1,%o0,%g1 9561: 957 add %l7,stridex,%l7 ! px += stridex 958 sub counter,1,counter 959 st %f14,[%i2] ! *py = x0; 960 ba .begin1 961 add %i2,stridey,%i2 ! 
! py += stridey
!
! NOTE: the line above is the trailing comment text of the instruction that
! immediately precedes this block; it is kept as a comment so that it can
! never be parsed as code.
!
! ---------------------------------------------------------------------------
! Out-of-line handlers .update0 - .update19
!
! Every .updateN is branched to from the software pipeline and returns to the
! matching .contN.  In the part of .main_loop that is visible next to these
! handlers, the even ones (.update18) are reached with "bge" after
! "cmp ax,_0x7f800000" and the odd ones (.update19) with "bl" after
! "cmp ax,_0x00800000"; the branch sites of the remaining handlers are
! outside this excerpt and are presumably analogous (confirm when editing).
!
! "k" below is the constant each handler compares with `counter`.
!
! Even N:
!	if (counter <= k) return to .contN with nothing recorded;
!	tmp_px      = address of the element that triggered the branch;
!	tmp_counter = counter - k;
!	counter     = k;
!
! Odd N:
!	if (counter <= k) return to .contN;
!	if ((ax & 0x7fffffff) == 0 || ax < 0)
!		do the same bookkeeping as the even handler (local label 1:);
!	else
!		rescale the element in place: fitod converts the register's
!		32-bit contents as an integer, fdtos narrows that to single,
!		fmuls multiplies by FTWO; the product is bounced through tmp3
!		into an integer register and 0x4b000000 is subtracted from its
!		bit pattern.  The per-element values the pipeline had already
!		derived from the old input (table index, exponent word, and in
!		some handlers xx/res) are then recomputed before returning.
!
! The mask 0x7fffffff is built as sethi %hi(0x7ffffc00) + add 0x3ff.
! tmp_px / tmp_counter are only written here; the code that reads them back
! is outside this excerpt (presumably .begin - confirm).
!
! Delay-slot reminder: the instruction after a plain branch always executes;
! after an annulled conditional branch (",a") it executes only if the branch
! is taken.
! ---------------------------------------------------------------------------

	.align	16
! k = 1; element address = %i1 - stridex.
.update0:
	cmp	counter,1
	ble	.cont0
	nop

	sub	%i1,stridex,%o1
	stx	%o1,[%fp+tmp_px]

	sub	counter,1,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont0
	mov	1,counter		! delay slot: counter = k

	.align	16
! k = 1; ax in %g5, input in %f15; element address = %i1 - stridex.
.update1:
	sethi	%hi(0x7ffffc00),%o0
	cmp	counter,1
	ble	.cont1

	add	%o0,0x3ff,%o0		! delay slot: %o0 = 0x7fffffff

	andcc	%g5,%o0,%g0
	bz,a	1f
	nop

	cmp	%g5,0
	bl,a	1f
	nop

	fitod	%f15,%f0
	fdtos	%f0,%f15
	fmuls	%f15,FTWO,%f15
	st	%f15,[%fp+tmp3]
	ld	[%fp+tmp3],%g5
	sethi	%hi(0x4b000000),%o0
	sub	%g5,%o0,%g5

	fands	%f15,DC0,%f17		! (4_0) dfx0 = vis_fand(ddx0,DC0);

	sra	%g5,13,%l6		! (5_0) si1 = ax1 >> 13;

	sra	%g5,24,%l7		! (5_0) iexp1 = ax1 >> 24;
	and	%l6,2032,%l6		! (5_0) si1 &= 0x7f0;

	fpsub32s	%f15,%f17,%f17	! (4_0) dfx0 = vis_fpsub32(ddx0,dfx0);

	ldd	[%l6+TBL],%f46		! (5_0) tbl_div1 = ((double*)((char*)TBL + si1))[0];
	sub	%l0,%l7,%l1		! (5_0) iexp1 = 0x3f - iexp1;

	sll	%l1,23,%l1		! (5_0) lexp1 = iexp1 << 23;
	add	%l6,TBL,%l6		! (5_0) addr1 = (char*)TBL + si1;
	st	%l1,[%fp+tmp0+4]	! (4_0) fdx0 = *((double*)lexp0);  (second word of tmp0 only)
	fitod	%f17,%f44		! (5_0) dtmp1 = (double)(((int*)dfx0)[1]);

	fmuld	%f44,%f46,%f46		! (5_1) xx1 = dtmp1 * tbl_div1;

	ba	.cont1
	fmuld	K3,%f46,%f50		! (5_1) res1 = K3 * xx1;
1:
	sub	%i1,stridex,%o1
	stx	%o1,[%fp+tmp_px]

	sub	counter,1,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont1
	mov	1,counter		! delay slot: counter = k

	.align	16
! k = 2; element address = %o5 - stridex - stridex.
.update2:
	cmp	counter,2
	ble	.cont2
	sub	%o5,stridex,%o1		! delay slot (always executed)

	sub	%o1,stridex,%o1
	stx	%o1,[%fp+tmp_px]

	sub	counter,2,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont2
	mov	2,counter		! delay slot: counter = k

	.align	16
! k = 2; ax in %g1, input in %f18; element address = %o5 - stridex - stridex.
! Unlike the other odd handlers this one uses the 64-bit fand/fpsub32 forms.
.update3:
	sethi	%hi(0x7ffffc00),%o1
	cmp	counter,2
	ble	.cont3

	add	%o1,0x3ff,%o1		! delay slot: %o1 = 0x7fffffff

	andcc	%g1,%o1,%g0
	bz,a	1f
	sub	%o5,stridex,%o1		! annulled delay slot: only when branching to 1:

	cmp	%g1,0
	bl,a	1f
	sub	%o5,stridex,%o1		! annulled delay slot: only when branching to 1:

	fitod	%f18,%f0
	fdtos	%f0,%f18
	fmuls	%f18,FTWO,%f18
	st	%f18,[%fp+tmp3]
	ld	[%fp+tmp3],%g1
	sethi	%hi(0x4b000000),%o1
	sub	%g1,%o1,%g1

	fand	%f18,DC0,%f56		! (0_0) dfx0 = vis_fand(ddx0,DC0);
	sra	%g1,13,%o0		! (0_0) si0 = ax0 >> 13;

	and	%o0,2032,%o0		! (0_0) si0 &= 0x7f0;

	ldd	[%o0+TBL],%f54		! (0_0) tbl_div0 = ((double*)((char*)TBL + si0))[0];
	fpsub32	%f18,%f56,%f30		! (0_0) dfx0 = vis_fpsub32(ddx0,dfx0);

	sra	%g1,24,%i3		! (0_0) iexp0 = ax0 >> 24;
	sub	%l0,%i3,%g5		! (0_0) iexp0 = 0x3f - iexp0;
	ba	.cont3
	fitod	%f30,%f56		! (0_0) dtmp0 = (double)(((int*)dfx0)[0]);
1:
	sub	%o1,stridex,%o1
	stx	%o1,[%fp+tmp_px]

	sub	counter,2,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont3
	mov	2,counter		! delay slot: counter = k

	.align	16
! k = 3; element address = %l7 - stridex2 - stridex.
.update4:
	cmp	counter,3
	ble	.cont4
	sub	%l7,stridex2,%o1	! delay slot (always executed)

	sub	%o1,stridex,%o1
	stx	%o1,[%fp+tmp_px]

	sub	counter,3,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont4
	mov	3,counter		! delay slot: counter = k

	.align	16
! k = 3; ax in %i4, input in %f19; element address = %l7 - stridex2 - stridex.
.update5:
	sethi	%hi(0x7ffffc00),%o1
	cmp	counter,3
	ble	.cont5

	add	%o1,0x3ff,%o1		! delay slot: %o1 = 0x7fffffff

	andcc	%i4,%o1,%g0
	bz,a	1f
	sub	%l7,stridex2,%o1	! annulled delay slot: only when branching to 1:

	cmp	%i4,0
	bl,a	1f
	sub	%l7,stridex2,%o1	! annulled delay slot: only when branching to 1:

	fitod	%f19,%f0
	fdtos	%f0,%f19
	fmuls	%f19,FTWO,%f19
	st	%f19,[%fp+tmp3]
	ld	[%fp+tmp3],%i4
	sethi	%hi(0x4b000000),%o1
	sub	%i4,%o1,%i4

	fands	%f19,DC0,%f0		! (0_0) dfx0 = vis_fand(ddx0,DC0);

	sra	%i4,13,%g5		! (1_0) si1 = ax1 >> 13;

	sra	%i4,24,%i1		! (1_0) iexp1 = ax1 >> 24;
	and	%g5,2032,%o7		! (1_0) si1 &= 0x7f0;
	fpsub32s	%f19,%f0,%f31	! (0_0) dfx0 = vis_fpsub32(ddx0,dfx0);

	ldd	[%o7+TBL],%f44		! (1_0) tbl_div1 = ((double*)((char*)TBL + si1))[0];
	sub	%l0,%i1,%i0		! (1_0) iexp1 = 0x3f - iexp1;

	sll	%i0,23,%i0		! (1_0) lexp1 = iexp1 << 23;
	fitod	%f31,%f50		! (1_0) dtmp0 = (double)(((int*)dfx0)[0]);

	st	%i0,[%fp+tmp1+4]	! (0_0) fdx0 = *((double*)lexp0);  (second word of tmp1 only)

	add	%o7,TBL,%o7		! (1_0) addr0 = (char*)TBL + si0;
	fmuld	%f50,%f44,%f44		! (1_0) xx0 = dtmp0 * tbl_div0;

	ba	.cont5
	fmuld	K3,%f44,%f50		! (1_0) res1 = K3 * xx1;
1:
	sub	%o1,stridex,%o1
	stx	%o1,[%fp+tmp_px]

	sub	counter,3,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont5
	mov	3,counter		! delay slot: counter = k

	.align	16
! k = 4; element address = %l7 - stridex - stridex.
.update6:
	cmp	counter,4
	ble	.cont6
	sub	%l7,stridex,%o3		! delay slot (always executed)

	sub	%o3,stridex,%o3
	stx	%o3,[%fp+tmp_px]

	sub	counter,4,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont6
	mov	4,counter		! delay slot: counter = k

	.align	16
! k = 4; ax in %g1, input in %f24; element address = %l7 - stridex - stridex.
.update7:
	sethi	%hi(0x7ffffc00),%o3
	cmp	counter,4
	ble	.cont7

	add	%o3,0x3ff,%o3		! delay slot: %o3 = 0x7fffffff

	andcc	%g1,%o3,%g0
	bz,a	1f
	sub	%l7,stridex,%o3		! annulled delay slot: only when branching to 1:

	cmp	%g1,0
	bl,a	1f
	sub	%l7,stridex,%o3		! annulled delay slot: only when branching to 1:

	fitod	%f24,%f0
	fdtos	%f0,%f24
	fmuls	%f24,FTWO,%f24
	st	%f24,[%fp+tmp3]
	ld	[%fp+tmp3],%g1
	sethi	%hi(0x4b000000),%o3
	sub	%g1,%o3,%g1

	fands	%f24,DC0,%f0		! (2_0) dfx0 = vis_fand(ddx0,DC0);
	sra	%g1,13,%i0		! (2_0) si0 = ax0 >> 13;

	and	%i0,2032,%i0		! (2_0) si0 &= 0x7f0;

	ldd	[%i0+TBL],%f30		! (2_0) tbl_div0 = ((double*)((char*)TBL + si0))[0];
	fpsub32s	%f24,%f0,%f12	! (2_0) dfx0 = vis_fpsub32(ddx0,dfx0);

	sra	%g1,24,%i3		! (2_0) iexp0 = ax0 >> 24;

	sub	%l0,%i3,%g5		! (2_0) iexp0 = 0x3f - iexp0;

	sll	%g5,23,%g5		! (2_0) lexp0: iexp0 << 23 within the first word (bit 55 of the 8-byte slot)
	add	%i0,TBL,%i0		! (2_0) addr0 = (char*)TBL + si0;
	fitod	%f12,%f56		! (2_0) dtmp0 = (double)(((int*)dfx0)[0]);

	st	%g5,[%fp+tmp2]		! (2_0) fdx0 = *((double*)lexp0);  (first word of tmp2 only)
	ba	.cont7
	fmuld	%f56,%f30,%f30		! (2_0) xx0 = dtmp0 * tbl_div0;
1:
	sub	%o3,stridex,%o3
	stx	%o3,[%fp+tmp_px]

	sub	counter,4,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont7
	mov	4,counter		! delay slot: counter = k

	.align	16
! k = 5; element address = %l7 - stridex.
.update8:
	cmp	counter,5
	ble	.cont8
	nop

	sub	%l7,stridex,%o3
	stx	%o3,[%fp+tmp_px]

	sub	counter,5,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont8
	mov	5,counter		! delay slot: counter = k

	.align	16
! k = 5; ax in %o5; the input is reloaded from memory at %l7 - stridex into %f0.
.update9:
	sethi	%hi(0x7ffffc00),%o3
	cmp	counter,5
	ble	.cont9
	sub	%l7,stridex,%i3		! delay slot (always executed): element address

	add	%o3,0x3ff,%o3		! %o3 = 0x7fffffff

	andcc	%o5,%o3,%g0
	bz	1f
	ld	[%i3],%f0		! delay slot (always executed): reload the element

	cmp	%o5,0
	bl,a	1f
	nop

	fitod	%f0,%f0
	fdtos	%f0,%f0
	fmuls	%f0,FTWO,%f0
	st	%f0,[%fp+tmp3]
	ld	[%fp+tmp3],%o5
	sethi	%hi(0x4b000000),%o3
	sub	%o5,%o3,%o5

	fands	%f0,DC0,%f8		! (2_0) dfx0 = vis_fand(ddx0,DC0);

	sra	%o5,13,%o1		! (3_0) si1 = ax1 >> 13;

	sra	%o5,24,%o3		! (3_0) iexp1 = ax1 >> 24;
	and	%o1,2032,%o1		! (3_0) si1 &= 0x7f0;
	fpsub32s	%f0,%f8,%f0	! (2_0) dfx0 = vis_fpsub32(ddx0,dfx0);

	ldd	[%o1+TBL],%f8		! (3_0) tbl_div1 = ((double*)((char*)TBL + si1))[0];
	sub	%l0,%o3,%i3		! (3_0) iexp1 = 0x3f - iexp1;

	sllx	%i3,23,%i3		! (3_0) lexp1 = iexp1 << 23;
	fitod	%f0,%f50		! (3_0) dtmp1 = (double)(((int*)dfx0)[1]);

	add	%o1,TBL,%o1		! (3_0) addr1 = (char*)TBL + si1;
	st	%i3,[%fp+tmp2+4]	! (2_0) fdx0 = *((double*)lexp0);  (second word of tmp2 only)

	fmuld	%f50,%f8,%f24		! (3_0) xx1 = dtmp1 * tbl_div1;

	ba	.cont9
	fmuld	K3,%f24,%f50		! (3_0) res1 = K3 * xx1;
1:
	stx	%i3,[%fp+tmp_px]	! %i3 still holds %l7 - stridex on this path

	sub	counter,5,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont9
	mov	5,counter		! delay slot: counter = k

	.align	16
! k = 0; element address = %i1 - stridex - stridex.  With k = 0 the whole
! remaining count is stored in tmp_counter and counter becomes 0.
.update10:
	cmp	counter,0
	ble	.cont10
	sub	%i1,stridex,%o3		! delay slot (always executed)

	sub	%o3,stridex,%o3
	stx	%o3,[%fp+tmp_px]

	st	counter,[%fp+tmp_counter]

	ba	.cont10
	mov	0,counter		! delay slot: counter = k

	.align	16
! k = 0; ax is reloaded from memory at %i1 - stridex - stridex into %i3,
! input in %f14.
.update11:
	sethi	%hi(0x7ffffc00),%i4
	cmp	counter,0
	ble	.cont11
	sub	%i1,stridex,%o3		! delay slot (always executed)

	sub	%o3,stridex,%o3
	add	%i4,0x3ff,%i4		! %i4 = 0x7fffffff
	ld	[%o3],%i3

	andcc	%i3,%i4,%g0
	bz	1f

	cmp	%i3,0			! delay slot of the bz above (always executed)
	bl,a	1f
	nop

	fitod	%f14,%f0
	fdtos	%f0,%f14
	fmuls	%f14,FTWO,%f14
	st	%f14,[%fp+tmp3]
	ld	[%fp+tmp3],%i3
	sethi	%hi(0x4b000000),%o3
	sub	%i3,%o3,%i3

	fands	%f14,DC0,%f16		! (4_0) dfx0 = vis_fand(ddx0,DC0);
	sra	%i3,13,%l5		! (4_0) si0 = ax0 >> 13;

	and	%l5,2032,%l5		! (4_0) si0 &= 0x7f0;

	ldd	[%l5+TBL],%f54		! (4_0) tbl_div0 = ((double*)((char*)TBL + si0))[0];
	fpsub32s	%f14,%f16,%f16	! (4_0) dfx0 = vis_fpsub32(ddx0,dfx0);

	sra	%i3,24,%i3		! (4_0) iexp0 = ax0 >> 24;

	sub	%l0,%i3,%o0		! (4_0) iexp0 = 0x3f - iexp0;
	fitod	%f16,%f56		! (4_0) dtmp0 = (double)(((int*)dfx0)[0]);

	sllx	%o0,23,%o0		! (4_0) lexp0: iexp0 << 23 within the first word (bit 55 of the 8-byte slot)

	st	%o0,[%fp+tmp0]		! (4_0) fdx0 = *((double*)lexp0);  (first word of tmp0 only)

	ba	.cont11
	fmuld	%f56,%f54,%f40		! (4_0) xx0 = dtmp0 * tbl_div0;
1:
	stx	%o3,[%fp+tmp_px]	! %o3 still holds the element address on this path

	st	counter,[%fp+tmp_counter]

	ba	.cont11
	mov	0,counter		! delay slot: counter = k

	.align	16
! k = 1; element address = %i1 - stridex.  Note that %i1 itself is updated.
.update12:
	cmp	counter,1
	ble	.cont12
	nop

	sub	%i1,stridex,%i1
	stx	%i1,[%fp+tmp_px]

	sub	counter,1,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont12
	mov	1,counter		! delay slot: counter = k

	.align	16
! k = 1; ax in %g5, input in %f15; element address = %i1 - stridex.
.update13:
	sethi	%hi(0x7ffffc00),%o3
	cmp	counter,1
	ble	.cont13

	add	%o3,0x3ff,%o3		! delay slot: %o3 = 0x7fffffff

	andcc	%g5,%o3,%g0
	bz	1f

	cmp	%g5,0			! delay slot of the bz above (always executed)
	bl,a	1f
	nop

	fitod	%f15,%f0
	fdtos	%f0,%f15
	fmuls	%f15,FTWO,%f15
	st	%f15,[%fp+tmp3]
	ld	[%fp+tmp3],%g5
	sethi	%hi(0x4b000000),%o3
	sub	%g5,%o3,%g5

	fands	%f15,DC0,%f17		! (4_0) dfx0 = vis_fand(ddx0,DC0);

	sra	%g5,13,%l6		! (5_0) si1 = ax1 >> 13;
	sra	%g5,24,%o3		! (5_0) iexp1 = ax1 >> 24;
	and	%l6,2032,%l6		! (5_0) si1 &= 0x7f0;
	fpsub32s	%f15,%f17,%f17	! (4_0) dfx0 = vis_fpsub32(ddx0,dfx0);

	ldd	[%l6+TBL],%f46		! (5_0) tbl_div1 = ((double*)((char*)TBL + si1))[0];
	sub	%l0,%o3,%l1		! (5_0) iexp1 = 0x3f - iexp1;

	add	%l6,TBL,%l6		! (5_0) addr1 = (char*)TBL + si1;

	sllx	%l1,23,%l1		! (5_0) lexp1 = iexp1 << 23;
	st	%l1,[%fp+tmp0+4]	! (4_0) fdx0 = *((double*)lexp0);  (second word of tmp0 only)

	fitod	%f17,%f0		! (5_0) dtmp1 = (double)(((int*)dfx0)[1]);

	fmuld	%f0,%f46,%f46		! (5_1) xx1 = dtmp1 * tbl_div1;
	ba	.cont13
	fmuld	K3,%f46,%f50		! (5_1) res1 = K3 * xx1;
1:
	sub	%i1,stridex,%i1
	stx	%i1,[%fp+tmp_px]

	sub	counter,1,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont13
	mov	1,counter		! delay slot: counter = k

	.align	16
! k = 2; element address = %o5 - stridex - stridex.
.update14:
	cmp	counter,2
	ble	.cont14
	sub	%o5,stridex,%o3		! delay slot (always executed)

	sub	%o3,stridex,%o3
	stx	%o3,[%fp+tmp_px]

	sub	counter,2,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont14
	mov	2,counter		! delay slot: counter = k

	.align	16
! k = 2; ax in %g1, input in %f18; element address = %o5 - stridex - stridex.
.update15:
	sethi	%hi(0x7ffffc00),%i3
	cmp	counter,2
	ble	.cont15
	sub	%o5,stridex,%o3		! delay slot (always executed)

	add	%i3,0x3ff,%i3		! %i3 = 0x7fffffff

	andcc	%g1,%i3,%g0
	bz	1f
	sub	%o3,stridex,%o3		! delay slot (always executed): element address

	cmp	%g1,0
	bl,a	1f
	nop

	fitod	%f18,%f0
	fdtos	%f0,%f18
	fmuls	%f18,FTWO,%f18
	st	%f18,[%fp+tmp3]
	ld	[%fp+tmp3],%g1
	sethi	%hi(0x4b000000),%o3
	sub	%g1,%o3,%g1

	fands	%f18,DC0,%f0		! (0_0) dfx0 = vis_fand(ddx0,DC0);
	sra	%g1,13,%o0		! (0_0) si0 = ax0 >> 13;
	and	%o0,2032,%o0		! (0_0) si0 &= 0x7f0;

	ldd	[%o0+TBL],%f54		! (0_0) tbl_div0 = ((double*)((char*)TBL + si0))[0];
	fpsub32s	%f18,%f0,%f30	! (0_0) dfx0 = vis_fpsub32(ddx0,dfx0);

	sra	%g1,24,%i3		! (0_0) iexp0 = ax0 >> 24;

	sub	%l0,%i3,%g5		! (0_0) iexp0 = 0x3f - iexp0;

	ba	.cont15
	fitod	%f30,%f56		! (0_0) dtmp0 = (double)(((int*)dfx0)[0]);
1:
	stx	%o3,[%fp+tmp_px]	! %o3 still holds the element address on this path

	sub	counter,2,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont15
	mov	2,counter		! delay slot: counter = k

	.align	16
! k = 3; element address = %l7 - stridex2 - stridex.
.update16:
	cmp	counter,3
	ble	.cont16
	sub	%l7,stridex2,%o3	! delay slot (always executed)

	sub	%o3,stridex,%o3
	stx	%o3,[%fp+tmp_px]

	sub	counter,3,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont16
	mov	3,counter		! delay slot: counter = k

	.align	16
! k = 3; ax in %i4, input in %f19; element address = %l7 - stridex2 - stridex.
.update17:
	sethi	%hi(0x7ffffc00),%i3
	cmp	counter,3
	ble	.cont17
	sub	%l7,stridex2,%o3	! delay slot (always executed)

	add	%i3,0x3ff,%i3		! %i3 = 0x7fffffff

	andcc	%i4,%i3,%g0
	bz	1f
	sub	%o3,stridex,%o3		! delay slot (always executed): element address

	cmp	%i4,0
	bl,a	1f
	nop

	fitod	%f19,%f0
	fdtos	%f0,%f19
	fmuls	%f19,FTWO,%f19
	st	%f19,[%fp+tmp3]
	ld	[%fp+tmp3],%i4
	sethi	%hi(0x4b000000),%o3
	sub	%i4,%o3,%i4

	fands	%f19,DC0,%f0		! (0_0) dfx0 = vis_fand(ddx0,DC0);

	sra	%i4,13,%g5		! (1_0) si1 = ax1 >> 13;

	sra	%i4,24,%i0		! (1_0) iexp1 = ax1 >> 24;
	and	%g5,2032,%o7		! (1_0) si1 &= 0x7f0;
	fpsub32s	%f19,%f0,%f31	! (0_0) dfx0 = vis_fpsub32(ddx0,dfx0);

	ldd	[%o7+TBL],%f44		! (1_0) tbl_div1 = ((double*)((char*)TBL + si1))[0];
	sub	%l0,%i0,%i0		! (1_0) iexp1 = 0x3f - iexp1;

	sllx	%i0,23,%i0		! (1_0) lexp1 = iexp1 << 23;
	fitod	%f31,%f50		! (1_0) dtmp0 = (double)(((int*)dfx0)[0]);

	st	%i0,[%fp+tmp1+4]	! (0_0) fdx0 = *((double*)lexp0);  (second word of tmp1 only)

	add	%o7,TBL,%o7		! (1_0) addr0 = (char*)TBL + si0;
	fmuld	%f50,%f44,%f44		! (1_0) xx0 = dtmp0 * tbl_div0;

	ba	.cont17
	fmuld	K3,%f44,%f50		! (1_0) res1 = K3 * xx1;
1:
	stx	%o3,[%fp+tmp_px]	! %o3 still holds the element address on this path

	sub	counter,3,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont17
	mov	3,counter		! delay slot: counter = k

	.align	16
! k = 4; element address = %l7 - stridex2.  The fpadd32 in the delay slot is
! the instruction that sits between the branch site and .cont18 in .main_loop,
! so it is performed here on both paths.
.update18:
	cmp	counter,4
	ble	.cont18
	fpadd32	%f20,%f52,%f0		! (2_1) dres0 = vis_fpadd32(dres0,fdx0);

	sub	%l7,stridex2,%i3
	stx	%i3,[%fp+tmp_px]

	sub	counter,4,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont18
	mov	4,counter		! delay slot: counter = k

	.align	16
! k = 4; ax in %g1, input in %f24; element address = %l7 - stridex2.
! Every exit path executes "fmuld %f50,%f46,%f24", the instruction that sits
! between the branch site and .cont19 in .main_loop.
.update19:
	sethi	%hi(0x7ffffc00),%i3
	cmp	counter,4
	ble,a	.cont19
	fmuld	%f50,%f46,%f24		! (3_0) xx1 = dtmp1 * tbl_div1;  (annulled slot: only when returning)

	add	%i3,0x3ff,%i3		! %i3 = 0x7fffffff

	andcc	%g1,%i3,%g0
	bz	1f
	nop

	cmp	%g1,0
	bl,a	1f
	nop

	fitod	%f24,%f24
	fdtos	%f24,%f24
	fmuls	%f24,FTWO,%f24
	st	%f24,[%fp+tmp3]
	ld	[%fp+tmp3],%g1
	sethi	%hi(0x4b000000),%i3
	sub	%g1,%i3,%g1

	fands	%f24,DC0,%f8		! (2_0) dfx0 = vis_fand(ddx0,DC0);
	sra	%g1,13,%i0		! (2_0) si0 = ax0 >> 13;

	and	%i0,2032,%i0		! (2_0) si0 &= 0x7f0;

	ldd	[%i0+TBL],%f30		! (2_0) tbl_div0 = ((double*)((char*)TBL + si0))[0];
	fpsub32s	%f24,%f8,%f12	! (2_0) dfx0 = vis_fpsub32(ddx0,dfx0);

	sra	%g1,24,%i3		! (2_0) iexp0 = ax0 >> 24;

	sub	%l0,%i3,%g5		! (2_0) iexp0 = 0x3f - iexp0;

	sllx	%g5,23,%g5		! (2_0) lexp0: iexp0 << 23 within the first word (bit 55 of the 8-byte slot)
	add	%i0,TBL,%i0		! (2_0) addr0 = (char*)TBL + si0;
	fitod	%f12,%f56		! (2_0) dtmp0 = (double)(((int*)dfx0)[0]);

	st	%g5,[%fp+tmp2]		! (2_0) fdx0 = *((double*)lexp0);  (first word of tmp2 only)
	fmuld	%f56,%f30,%f30		! (2_0) xx0 = dtmp0 * tbl_div0;

	ba	.cont19
	fmuld	%f50,%f46,%f24		! (3_0) xx1 = dtmp1 * tbl_div1;
1:
	sub	%l7,stridex2,%i3
	stx	%i3,[%fp+tmp_px]

	sub	counter,4,counter
	st	counter,[%fp+tmp_counter]

	mov	4,counter		! counter = k
	ba	.cont19
	fmuld	%f50,%f46,%f24		! (3_0) xx1 = dtmp1 * tbl_div1;
! (3_0) xx1 = dtmp1 * tbl_div1;
!
! NOTE: the line above is the trailing comment text of the instruction that
! immediately precedes this block; it is kept as a comment so that it can
! never be parsed as code.

	.align	16
! .update20: out-of-line handler for the "bge" that follows
! "cmp %o5,_0x7f800000" in .main_loop; returns to .cont20.
!
!	if (counter <= 5) return with nothing recorded;
!	tmp_px      = %l7 - stridex;	! address of the offending element
!	tmp_counter = counter - 5;
!	counter     = 5;
!
! tmp_px / tmp_counter are only written here; the code that reads them back
! is outside this excerpt (presumably .begin - confirm).
.update20:
	cmp	counter,5
	ble	.cont20
	nop

	sub	%l7,stridex,%i3
	stx	%i3,[%fp+tmp_px]

	sub	counter,5,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont20
	mov	5,counter		! delay slot: counter = 5

	.align	16
! .update21: out-of-line handler for the "bl" that follows
! "cmp %o5,_0x00800000" in .main_loop; returns to .cont21.
! ax is in %o5; the input is reloaded from memory at %l7 - stridex into %f8.
!
!	if (counter <= 5) return;
!	if ((ax & 0x7fffffff) == 0 || ax < 0)
!		same bookkeeping as .update20 (local label 1:);
!	else
!		rescale the element: fitod converts the register's 32-bit
!		contents as an integer, fdtos narrows that to single, fmuls
!		multiplies by FTWO; the product is bounced through tmp3 into
!		%o5 and 0x4b000000 is subtracted from its bit pattern.  The
!		table index, exponent word, xx1 and res1 of that element are
!		then recomputed before returning.
!
! The mask 0x7fffffff is built as sethi %hi(0x7ffffc00) + add 0x3ff.
.update21:
	sethi	%hi(0x7ffffc00),%i3
	cmp	counter,5
	ble,a	.cont21
	nop

	sub	%l7,stridex,%i4		! element address
	add	%i3,0x3ff,%i3		! %i3 = 0x7fffffff

	andcc	%o5,%i3,%g0
	bz	1f
	ld	[%i4],%f8		! delay slot (always executed): reload the element

	cmp	%o5,0
	bl,a	1f
	nop

	fitod	%f8,%f8
	fdtos	%f8,%f8
	fmuls	%f8,FTWO,%f8
	st	%f8,[%fp+tmp3]
	ld	[%fp+tmp3],%o5
	sethi	%hi(0x4b000000),%i3
	sub	%o5,%i3,%o5

	fands	%f8,DC0,%f24		! (2_0) dfx0 = vis_fand(ddx0,DC0);

	sra	%o5,13,%o1		! (3_0) si1 = ax1 >> 13;

	sra	%o5,24,%i3		! (3_0) iexp1 = ax1 >> 24;
	and	%o1,2032,%o1		! (3_0) si1 &= 0x7f0;
	fpsub32s	%f8,%f24,%f24	! (2_0) dfx0 = vis_fpsub32(ddx0,dfx0);

	ldd	[%o1+TBL],%f8		! (3_0) tbl_div1 = ((double*)((char*)TBL + si1))[0];
	sub	%l0,%i3,%i3		! (3_0) iexp1 = 0x3f - iexp1;

	sllx	%i3,23,%i3		! (3_0) lexp1 = iexp1 << 23;
	fitod	%f24,%f50		! (3_0) dtmp1 = (double)(((int*)dfx0)[1]);

	add	%o1,TBL,%o1		! (3_0) addr1 = (char*)TBL + si1;
	st	%i3,[%fp+tmp2+4]	! (2_0) fdx0 = *((double*)lexp0);  (second word of tmp2 only)

	fmuld	%f50,%f8,%f24		! (3_0) xx1 = dtmp1 * tbl_div1;

	ba	.cont21
	fmuld	K3,%f24,%f50		! (3_0) res1 = K3 * xx1;
1:
	sub	%l7,stridex,%i3
	stx	%i3,[%fp+tmp_px]

	sub	counter,5,counter
	st	counter,[%fp+tmp_counter]

	ba	.cont21
	mov	5,counter		! delay slot: counter = 5

	.align	16
! .exit: function epilogue.  "restore" sits in the delay slot of "ret".
.exit:
	ret
	restore

	SET_SIZE(__vrsqrtf)