1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 24 */ 25 /* 26 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 27 * Use is subject to license terms. 28 */ 29 30 #include <sys/isa_defs.h> 31 #include "libm_inlines.h" 32 33 #ifdef _LITTLE_ENDIAN 34 #define HI(x) *(1+(int*)x) 35 #define LO(x) *(unsigned*)x 36 #else 37 #define HI(x) *(int*)x 38 #define LO(x) *(1+(unsigned*)x) 39 #endif 40 41 #ifdef __RESTRICT 42 #define restrict _Restrict 43 #else 44 #define restrict 45 #endif 46 47 /* double rsqrt(double x) 48 * 49 * Method : 50 * 1. Special cases: 51 * for x = NaN => QNaN; 52 * for x = +Inf => 0; 53 * for x is negative, -Inf => QNaN + invalid; 54 * for x = +0 => +Inf + divide-by-zero; 55 * for x = -0 => -Inf + divide-by-zero. 56 * 2. Computes reciprocal square root from: 57 * x = m * 2**n 58 * Where: 59 * m = [0.5, 2), 60 * n = ((exponent + 1) & ~1). 61 * Then: 62 * rsqrt(x) = 1/sqrt( m * 2**n ) = (2 ** (-n/2)) * (1/sqrt(m)) 63 * 2. Computes 1/sqrt(m) from: 64 * 1/sqrt(m) = (1/sqrt(m0)) * (1/sqrt(1 + (1/m0)*dm)) 65 * Where: 66 * m = m0 + dm, 67 * m0 = 0.5 * (1 + k/64) for m = [0.5, 0.5+127/256), k = [0, 63]; 68 * m0 = 1.0 * (0 + k/64) for m = [0.5+127/256, 1.0+127/128), k = [64, 127]; 69 * m0 = 2.0 for m = [1.0+127/128, 2.0), k = 128. 70 * Then: 71 * 1/sqrt(m0) is looked up in a table, 72 * 1/m0 is computed as (1/sqrt(m0)) * (1/sqrt(m0)). 73 * 1/sqrt(1 + (1/m0)*dm) is computed using approximation: 74 * 1/sqrt(1 + z) = (((((a6 * z + a5) * z + a4) * z + a3) 75 * * z + a2) * z + a1) * z + a0 76 * where z = [-1/128, 1/128]. 77 * 78 * Accuracy: 79 * The maximum relative error for the approximating 80 * polynomial is 2**(-56.26). 81 * Maximum error observed: less than 0.563 ulp after 1.500.000.000 82 * results. 83 */ 84 85 extern double sqrt (double); 86 extern const double __vlibm_TBL_rsqrt[]; 87 88 static void 89 __vrsqrt_n(int n, double * restrict px, int stridex, double * restrict py, int stridey); 90 91 #pragma no_inline(__vrsqrt_n) 92 93 #define RETURN(ret) \ 94 { \ 95 *py = (ret); \ 96 py += stridey; \ 97 if (n_n == 0) \ 98 { \ 99 spx = px; spy = py; \ 100 hx = HI(px); \ 101 continue; \ 102 } \ 103 n--; \ 104 break; \ 105 } 106 107 static const double 108 DONE = 1.0, 109 K1 = -5.00000000000005209867e-01, 110 K2 = 3.75000000000004884257e-01, 111 K3 = -3.12499999317136886551e-01, 112 K4 = 2.73437499359815081532e-01, 113 K5 = -2.46116125605037803130e-01, 114 K6 = 2.25606914648617522896e-01; 115 116 void 117 __vrsqrt(int n, double * restrict px, int stridex, double * restrict py, int stridey) 118 { 119 double *spx, *spy; 120 int ax, lx, hx, n_n; 121 double res; 122 123 while (n > 1) 124 { 125 n_n = 0; 126 spx = px; 127 spy = py; 128 hx = HI(px); 129 for (; n > 1 ; n--) 130 { 131 px += stridex; 132 if (hx >= 0x7ff00000) /* X = NaN or Inf */ 133 { 134 res = *(px - stridex); 135 RETURN (DONE / res) 136 } 137 138 py += stridey; 139 140 if (hx < 0x00100000) /* X = denormal, zero or negative */ 141 { 142 py -= stridey; 143 ax = hx & 0x7fffffff; 144 lx = LO((px - stridex)); 145 res = *(px - stridex); 146 147 if ((ax | lx) == 0) /* |X| = zero */ 148 { 149 RETURN (DONE / res) 150 } 151 else if (hx >= 0) /* X = denormal */ 152 { 153 double res_c0, dsqrt_exp0; 154 int ind0, sqrt_exp0; 155 double xx0, dexp_hi0, dexp_lo0; 156 int hx0, resh0, res_ch0; 157 158 res = *(long long*)&res; 159 160 hx0 = HI(&res); 161 sqrt_exp0 = (0x817 - (hx0 >> 21)) << 20; 162 ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16; 163 164 resh0 = (hx0 & 0x001fffff) | 0x3fe00000; 165 res_ch0 = (resh0 + 0x00002000) & 0x7fffc000; 166 HI(&res) = resh0; 167 HI(&res_c0) = res_ch0; 168 LO(&res_c0) = 0; 169 170 dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0]; 171 dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1]; 172 xx0 = dexp_hi0 * dexp_hi0; 173 xx0 = (res - res_c0) * xx0; 174 res = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0; 175 176 res = dexp_hi0 * res + dexp_lo0 + dexp_hi0; 177 178 HI(&dsqrt_exp0) = sqrt_exp0; 179 LO(&dsqrt_exp0) = 0; 180 res *= dsqrt_exp0; 181 182 RETURN (res) 183 } 184 else /* X = negative */ 185 { 186 RETURN (sqrt(res)) 187 } 188 } 189 n_n++; 190 hx = HI(px); 191 } 192 if (n_n > 0) 193 __vrsqrt_n(n_n, spx, stridex, spy, stridey); 194 } 195 if (n > 0) 196 { 197 hx = HI(px); 198 199 if (hx >= 0x7ff00000) /* X = NaN or Inf */ 200 { 201 res = *px; 202 *py = DONE / res; 203 } 204 else if (hx < 0x00100000) /* X = denormal, zero or negative */ 205 { 206 ax = hx & 0x7fffffff; 207 lx = LO(px); 208 res = *px; 209 210 if ((ax | lx) == 0) /* |X| = zero */ 211 { 212 *py = DONE / res; 213 } 214 else if (hx >= 0) /* X = denormal */ 215 { 216 double res_c0, dsqrt_exp0; 217 int ind0, sqrt_exp0; 218 double xx0, dexp_hi0, dexp_lo0; 219 int hx0, resh0, res_ch0; 220 221 res = *(long long*)&res; 222 223 hx0 = HI(&res); 224 sqrt_exp0 = (0x817 - (hx0 >> 21)) << 20; 225 ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16; 226 227 resh0 = (hx0 & 0x001fffff) | 0x3fe00000; 228 res_ch0 = (resh0 + 0x00002000) & 0x7fffc000; 229 HI(&res) = resh0; 230 HI(&res_c0) = res_ch0; 231 LO(&res_c0) = 0; 232 233 dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0]; 234 dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1]; 235 xx0 = dexp_hi0 * dexp_hi0; 236 xx0 = (res - res_c0) * xx0; 237 res = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0; 238 239 res = dexp_hi0 * res + dexp_lo0 + dexp_hi0; 240 241 HI(&dsqrt_exp0) = sqrt_exp0; 242 LO(&dsqrt_exp0) = 0; 243 res *= dsqrt_exp0; 244 245 *py = res; 246 } 247 else /* X = negative */ 248 { 249 *py = sqrt(res); 250 } 251 } 252 else 253 { 254 double res_c0, dsqrt_exp0; 255 int ind0, sqrt_exp0; 256 double xx0, dexp_hi0, dexp_lo0; 257 int resh0, res_ch0; 258 259 sqrt_exp0 = (0x5fe - (hx >> 21)) << 20; 260 ind0 = (((hx >> 10) & 0x7f8) + 8) & -16; 261 262 resh0 = (hx & 0x001fffff) | 0x3fe00000; 263 res_ch0 = (resh0 + 0x00002000) & 0x7fffc000; 264 HI(&res) = resh0; 265 LO(&res) = LO(px); 266 HI(&res_c0) = res_ch0; 267 LO(&res_c0) = 0; 268 269 dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0]; 270 dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1]; 271 xx0 = dexp_hi0 * dexp_hi0; 272 xx0 = (res - res_c0) * xx0; 273 res = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0; 274 275 res = dexp_hi0 * res + dexp_lo0 + dexp_hi0; 276 277 HI(&dsqrt_exp0) = sqrt_exp0; 278 LO(&dsqrt_exp0) = 0; 279 res *= dsqrt_exp0; 280 281 *py = res; 282 } 283 } 284 } 285 286 static void 287 __vrsqrt_n(int n, double * restrict px, int stridex, double * restrict py, int stridey) 288 { 289 double res0, res_c0, dsqrt_exp0; 290 double res1, res_c1, dsqrt_exp1; 291 double res2, res_c2, dsqrt_exp2; 292 int ind0, sqrt_exp0; 293 int ind1, sqrt_exp1; 294 int ind2, sqrt_exp2; 295 double xx0, dexp_hi0, dexp_lo0; 296 double xx1, dexp_hi1, dexp_lo1; 297 double xx2, dexp_hi2, dexp_lo2; 298 int hx0, resh0, res_ch0; 299 int hx1, resh1, res_ch1; 300 int hx2, resh2, res_ch2; 301 302 LO(&dsqrt_exp0) = 0; 303 LO(&dsqrt_exp1) = 0; 304 LO(&dsqrt_exp2) = 0; 305 LO(&res_c0) = 0; 306 LO(&res_c1) = 0; 307 LO(&res_c2) = 0; 308 309 for(; n > 2 ; n -= 3) 310 { 311 hx0 = HI(px); 312 LO(&res0) = LO(px); 313 px += stridex; 314 315 hx1 = HI(px); 316 LO(&res1) = LO(px); 317 px += stridex; 318 319 hx2 = HI(px); 320 LO(&res2) = LO(px); 321 px += stridex; 322 323 sqrt_exp0 = (0x5fe - (hx0 >> 21)) << 20; 324 sqrt_exp1 = (0x5fe - (hx1 >> 21)) << 20; 325 sqrt_exp2 = (0x5fe - (hx2 >> 21)) << 20; 326 ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16; 327 ind1 = (((hx1 >> 10) & 0x7f8) + 8) & -16; 328 ind2 = (((hx2 >> 10) & 0x7f8) + 8) & -16; 329 330 resh0 = (hx0 & 0x001fffff) | 0x3fe00000; 331 resh1 = (hx1 & 0x001fffff) | 0x3fe00000; 332 resh2 = (hx2 & 0x001fffff) | 0x3fe00000; 333 res_ch0 = (resh0 + 0x00002000) & 0x7fffc000; 334 res_ch1 = (resh1 + 0x00002000) & 0x7fffc000; 335 res_ch2 = (resh2 + 0x00002000) & 0x7fffc000; 336 HI(&res0) = resh0; 337 HI(&res1) = resh1; 338 HI(&res2) = resh2; 339 HI(&res_c0) = res_ch0; 340 HI(&res_c1) = res_ch1; 341 HI(&res_c2) = res_ch2; 342 343 dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0]; 344 dexp_hi1 = ((double*)((char*)__vlibm_TBL_rsqrt + ind1))[0]; 345 dexp_hi2 = ((double*)((char*)__vlibm_TBL_rsqrt + ind2))[0]; 346 dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1]; 347 dexp_lo1 = ((double*)((char*)__vlibm_TBL_rsqrt + ind1))[1]; 348 dexp_lo2 = ((double*)((char*)__vlibm_TBL_rsqrt + ind2))[1]; 349 xx0 = dexp_hi0 * dexp_hi0; 350 xx1 = dexp_hi1 * dexp_hi1; 351 xx2 = dexp_hi2 * dexp_hi2; 352 xx0 = (res0 - res_c0) * xx0; 353 xx1 = (res1 - res_c1) * xx1; 354 xx2 = (res2 - res_c2) * xx2; 355 res0 = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0; 356 res1 = (((((K6 * xx1 + K5) * xx1 + K4) * xx1 + K3) * xx1 + K2) * xx1 + K1) * xx1; 357 res2 = (((((K6 * xx2 + K5) * xx2 + K4) * xx2 + K3) * xx2 + K2) * xx2 + K1) * xx2; 358 359 res0 = dexp_hi0 * res0 + dexp_lo0 + dexp_hi0; 360 res1 = dexp_hi1 * res1 + dexp_lo1 + dexp_hi1; 361 res2 = dexp_hi2 * res2 + dexp_lo2 + dexp_hi2; 362 363 HI(&dsqrt_exp0) = sqrt_exp0; 364 HI(&dsqrt_exp1) = sqrt_exp1; 365 HI(&dsqrt_exp2) = sqrt_exp2; 366 res0 *= dsqrt_exp0; 367 res1 *= dsqrt_exp1; 368 res2 *= dsqrt_exp2; 369 370 *py = res0; 371 py += stridey; 372 373 *py = res1; 374 py += stridey; 375 376 *py = res2; 377 py += stridey; 378 } 379 380 for(; n > 0 ; n--) 381 { 382 hx0 = HI(px); 383 384 sqrt_exp0 = (0x5fe - (hx0 >> 21)) << 20; 385 ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16; 386 387 resh0 = (hx0 & 0x001fffff) | 0x3fe00000; 388 res_ch0 = (resh0 + 0x00002000) & 0x7fffc000; 389 HI(&res0) = resh0; 390 LO(&res0) = LO(px); 391 HI(&res_c0) = res_ch0; 392 LO(&res_c0) = 0; 393 394 px += stridex; 395 396 dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0]; 397 dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1]; 398 xx0 = dexp_hi0 * dexp_hi0; 399 xx0 = (res0 - res_c0) * xx0; 400 res0 = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0; 401 402 res0 = dexp_hi0 * res0 + dexp_lo0 + dexp_hi0; 403 404 HI(&dsqrt_exp0) = sqrt_exp0; 405 LO(&dsqrt_exp0) = 0; 406 res0 *= dsqrt_exp0; 407 408 *py = res0; 409 py += stridey; 410 } 411 } 412