1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 24 */ 25 /* 26 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 27 * Use is subject to license terms. 28 */ 29 30 #include <sys/isa_defs.h> 31 #include "libm_inlines.h" 32 33 #ifdef _LITTLE_ENDIAN 34 #define HI(x) *(1+(int *)x) 35 #define LO(x) *(unsigned *)x 36 #else 37 #define HI(x) *(int *)x 38 #define LO(x) *(1+(unsigned *)x) 39 #endif 40 41 #ifdef __RESTRICT 42 #define restrict _Restrict 43 #else 44 #define restrict 45 #endif 46 47 /* 48 * double rsqrt(double x) 49 * 50 * Method : 51 * 1. Special cases: 52 * for x = NaN => QNaN; 53 * for x = +Inf => 0; 54 * for x is negative, -Inf => QNaN + invalid; 55 * for x = +0 => +Inf + divide-by-zero; 56 * for x = -0 => -Inf + divide-by-zero. 57 * 2. Computes reciprocal square root from: 58 * x = m * 2**n 59 * Where: 60 * m = [0.5, 2), 61 * n = ((exponent + 1) & ~1). 62 * Then: 63 * rsqrt(x) = 1/sqrt( m * 2**n ) = (2 ** (-n/2)) * (1/sqrt(m)) 64 * 2. Computes 1/sqrt(m) from: 65 * 1/sqrt(m) = (1/sqrt(m0)) * (1/sqrt(1 + (1/m0)*dm)) 66 * Where: 67 * m = m0 + dm, 68 * m0 = 0.5 * (1 + k/64) for m = [0.5, 0.5+127/256), k = [0, 63]; 69 * m0 = 1.0 * (0 + k/64) for m = [0.5+127/256, 1.0+127/128), 70 * k = [64, 127]; 71 * m0 = 2.0 for m = [1.0+127/128, 2.0), k = 128. 72 * Then: 73 * 1/sqrt(m0) is looked up in a table, 74 * 1/m0 is computed as (1/sqrt(m0)) * (1/sqrt(m0)). 75 * 1/sqrt(1 + (1/m0)*dm) is computed using approximation: 76 * 1/sqrt(1 + z) = (((((a6 * z + a5) * z + a4) * z + a3) 77 * * z + a2) * z + a1) * z + a0 78 * where z = [-1/128, 1/128]. 79 * 80 * Accuracy: 81 * The maximum relative error for the approximating 82 * polynomial is 2**(-56.26). 83 * Maximum error observed: less than 0.563 ulp after 1.500.000.000 84 * results. 85 */ 86 87 extern double sqrt(double); 88 extern const double __vlibm_TBL_rsqrt[]; 89 90 static void 91 __vrsqrt_n(int n, double *restrict px, int stridex, double *restrict py, 92 int stridey); 93 94 #define RETURN(ret) \ 95 { \ 96 *py = (ret); \ 97 py += stridey; \ 98 if (n_n == 0) \ 99 { \ 100 spx = px; \ 101 spy = py; \ 102 hx = HI(px); \ 103 continue; \ 104 } \ 105 n--; \ 106 break; \ 107 } 108 109 static const double 110 DONE = 1.0, 111 K1 = -5.00000000000005209867e-01, 112 K2 = 3.75000000000004884257e-01, 113 K3 = -3.12499999317136886551e-01, 114 K4 = 2.73437499359815081532e-01, 115 K5 = -2.46116125605037803130e-01, 116 K6 = 2.25606914648617522896e-01; 117 118 void 119 __vrsqrt(int n, double *restrict px, int stridex, double *restrict py, 120 int stridey) 121 { 122 double *spx, *spy; 123 int ax, lx, hx, n_n; 124 double res; 125 126 while (n > 1) { 127 n_n = 0; 128 spx = px; 129 spy = py; 130 hx = HI(px); 131 for (; n > 1; n--) { 132 px += stridex; 133 if (hx >= 0x7ff00000) { /* X = NaN or Inf */ 134 res = *(px - stridex); 135 RETURN(DONE / res) 136 } 137 138 py += stridey; 139 140 if (hx < 0x00100000) { 141 /* X = denormal, zero or negative */ 142 py -= stridey; 143 ax = hx & 0x7fffffff; 144 lx = LO((px - stridex)); 145 res = *(px - stridex); 146 147 if ((ax | lx) == 0) { /* |X| = zero */ 148 RETURN(DONE / res) 149 } else if (hx >= 0) { /* X = denormal */ 150 double res_c0, dsqrt_exp0; 151 int ind0, sqrt_exp0; 152 double xx0, dexp_hi0, dexp_lo0; 153 int hx0, resh0, res_ch0; 154 155 res = *(long long *)&res; 156 157 hx0 = HI(&res); 158 sqrt_exp0 = (0x817 - (hx0 >> 21)) << 20; 159 ind0 = (((hx0 >> 10) & 0x7f8) + 8) & 160 -16; 161 162 resh0 = (hx0 & 0x001fffff) | 0x3fe00000; 163 res_ch0 = (resh0 + 0x00002000) & 164 0x7fffc000; 165 HI(&res) = resh0; 166 HI(&res_c0) = res_ch0; 167 LO(&res_c0) = 0; 168 169 dexp_hi0 = ((double *)((char *) 170 __vlibm_TBL_rsqrt + ind0))[0]; 171 dexp_lo0 = ((double *)((char *) 172 __vlibm_TBL_rsqrt + ind0))[1]; 173 xx0 = dexp_hi0 * dexp_hi0; 174 xx0 = (res - res_c0) * xx0; 175 res = (((((K6 * xx0 + K5) * xx0 + K4) * 176 xx0 + K3) * xx0 + K2) * xx0 + K1) * 177 xx0; 178 179 res = dexp_hi0 * res + dexp_lo0 + 180 dexp_hi0; 181 182 HI(&dsqrt_exp0) = sqrt_exp0; 183 LO(&dsqrt_exp0) = 0; 184 res *= dsqrt_exp0; 185 186 RETURN(res) 187 } else { /* X = negative */ 188 RETURN(sqrt(res)) 189 } 190 } 191 n_n++; 192 hx = HI(px); 193 } 194 if (n_n > 0) 195 __vrsqrt_n(n_n, spx, stridex, spy, stridey); 196 } 197 if (n > 0) { 198 hx = HI(px); 199 200 if (hx >= 0x7ff00000) { /* X = NaN or Inf */ 201 res = *px; 202 *py = DONE / res; 203 } else if (hx < 0x00100000) { 204 /* X = denormal, zero or negative */ 205 ax = hx & 0x7fffffff; 206 lx = LO(px); 207 res = *px; 208 209 if ((ax | lx) == 0) { /* |X| = zero */ 210 *py = DONE / res; 211 } else if (hx >= 0) { /* X = denormal */ 212 double res_c0, dsqrt_exp0; 213 int ind0, sqrt_exp0; 214 double xx0, dexp_hi0, dexp_lo0; 215 int hx0, resh0, res_ch0; 216 217 res = *(long long *)&res; 218 219 hx0 = HI(&res); 220 sqrt_exp0 = (0x817 - (hx0 >> 21)) << 20; 221 ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16; 222 223 resh0 = (hx0 & 0x001fffff) | 0x3fe00000; 224 res_ch0 = (resh0 + 0x00002000) & 0x7fffc000; 225 HI(&res) = resh0; 226 HI(&res_c0) = res_ch0; 227 LO(&res_c0) = 0; 228 229 dexp_hi0 = ((double *)((char *) 230 __vlibm_TBL_rsqrt + ind0))[0]; 231 dexp_lo0 = ((double *)((char *) 232 __vlibm_TBL_rsqrt + ind0))[1]; 233 xx0 = dexp_hi0 * dexp_hi0; 234 xx0 = (res - res_c0) * xx0; 235 res = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + 236 K3) * xx0 + K2) * xx0 + K1) * xx0; 237 238 res = dexp_hi0 * res + dexp_lo0 + dexp_hi0; 239 240 HI(&dsqrt_exp0) = sqrt_exp0; 241 LO(&dsqrt_exp0) = 0; 242 res *= dsqrt_exp0; 243 244 *py = res; 245 } else { /* X = negative */ 246 *py = sqrt(res); 247 } 248 } else { 249 double res_c0, dsqrt_exp0; 250 int ind0, sqrt_exp0; 251 double xx0, dexp_hi0, dexp_lo0; 252 int resh0, res_ch0; 253 254 sqrt_exp0 = (0x5fe - (hx >> 21)) << 20; 255 ind0 = (((hx >> 10) & 0x7f8) + 8) & -16; 256 257 resh0 = (hx & 0x001fffff) | 0x3fe00000; 258 res_ch0 = (resh0 + 0x00002000) & 0x7fffc000; 259 HI(&res) = resh0; 260 LO(&res) = LO(px); 261 HI(&res_c0) = res_ch0; 262 LO(&res_c0) = 0; 263 264 dexp_hi0 = ((double *)((char *) 265 __vlibm_TBL_rsqrt + ind0))[0]; 266 dexp_lo0 = ((double *)((char *) 267 __vlibm_TBL_rsqrt + ind0))[1]; 268 xx0 = dexp_hi0 * dexp_hi0; 269 xx0 = (res - res_c0) * xx0; 270 res = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * 271 xx0 + K2) * xx0 + K1) * xx0; 272 273 res = dexp_hi0 * res + dexp_lo0 + dexp_hi0; 274 275 HI(&dsqrt_exp0) = sqrt_exp0; 276 LO(&dsqrt_exp0) = 0; 277 res *= dsqrt_exp0; 278 279 *py = res; 280 } 281 } 282 } 283 284 static void 285 __vrsqrt_n(int n, double *restrict px, int stridex, double *restrict py, 286 int stridey) 287 { 288 double res0, res_c0, dsqrt_exp0; 289 double res1, res_c1, dsqrt_exp1; 290 double res2, res_c2, dsqrt_exp2; 291 int ind0, sqrt_exp0; 292 int ind1, sqrt_exp1; 293 int ind2, sqrt_exp2; 294 double xx0, dexp_hi0, dexp_lo0; 295 double xx1, dexp_hi1, dexp_lo1; 296 double xx2, dexp_hi2, dexp_lo2; 297 int hx0, resh0, res_ch0; 298 int hx1, resh1, res_ch1; 299 int hx2, resh2, res_ch2; 300 301 LO(&dsqrt_exp0) = 0; 302 LO(&dsqrt_exp1) = 0; 303 LO(&dsqrt_exp2) = 0; 304 LO(&res_c0) = 0; 305 LO(&res_c1) = 0; 306 LO(&res_c2) = 0; 307 308 for (; n > 2; n -= 3) { 309 hx0 = HI(px); 310 LO(&res0) = LO(px); 311 px += stridex; 312 313 hx1 = HI(px); 314 LO(&res1) = LO(px); 315 px += stridex; 316 317 hx2 = HI(px); 318 LO(&res2) = LO(px); 319 px += stridex; 320 321 sqrt_exp0 = (0x5fe - (hx0 >> 21)) << 20; 322 sqrt_exp1 = (0x5fe - (hx1 >> 21)) << 20; 323 sqrt_exp2 = (0x5fe - (hx2 >> 21)) << 20; 324 ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16; 325 ind1 = (((hx1 >> 10) & 0x7f8) + 8) & -16; 326 ind2 = (((hx2 >> 10) & 0x7f8) + 8) & -16; 327 328 resh0 = (hx0 & 0x001fffff) | 0x3fe00000; 329 resh1 = (hx1 & 0x001fffff) | 0x3fe00000; 330 resh2 = (hx2 & 0x001fffff) | 0x3fe00000; 331 res_ch0 = (resh0 + 0x00002000) & 0x7fffc000; 332 res_ch1 = (resh1 + 0x00002000) & 0x7fffc000; 333 res_ch2 = (resh2 + 0x00002000) & 0x7fffc000; 334 HI(&res0) = resh0; 335 HI(&res1) = resh1; 336 HI(&res2) = resh2; 337 HI(&res_c0) = res_ch0; 338 HI(&res_c1) = res_ch1; 339 HI(&res_c2) = res_ch2; 340 341 dexp_hi0 = ((double *)((char *)__vlibm_TBL_rsqrt + ind0))[0]; 342 dexp_hi1 = ((double *)((char *)__vlibm_TBL_rsqrt + ind1))[0]; 343 dexp_hi2 = ((double *)((char *)__vlibm_TBL_rsqrt + ind2))[0]; 344 dexp_lo0 = ((double *)((char *)__vlibm_TBL_rsqrt + ind0))[1]; 345 dexp_lo1 = ((double *)((char *)__vlibm_TBL_rsqrt + ind1))[1]; 346 dexp_lo2 = ((double *)((char *)__vlibm_TBL_rsqrt + ind2))[1]; 347 xx0 = dexp_hi0 * dexp_hi0; 348 xx1 = dexp_hi1 * dexp_hi1; 349 xx2 = dexp_hi2 * dexp_hi2; 350 xx0 = (res0 - res_c0) * xx0; 351 xx1 = (res1 - res_c1) * xx1; 352 xx2 = (res2 - res_c2) * xx2; 353 res0 = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * 354 xx0 + K2) * xx0 + K1) * xx0; 355 res1 = (((((K6 * xx1 + K5) * xx1 + K4) * xx1 + K3) * 356 xx1 + K2) * xx1 + K1) * xx1; 357 res2 = (((((K6 * xx2 + K5) * xx2 + K4) * xx2 + K3) * 358 xx2 + K2) * xx2 + K1) * xx2; 359 360 res0 = dexp_hi0 * res0 + dexp_lo0 + dexp_hi0; 361 res1 = dexp_hi1 * res1 + dexp_lo1 + dexp_hi1; 362 res2 = dexp_hi2 * res2 + dexp_lo2 + dexp_hi2; 363 364 HI(&dsqrt_exp0) = sqrt_exp0; 365 HI(&dsqrt_exp1) = sqrt_exp1; 366 HI(&dsqrt_exp2) = sqrt_exp2; 367 res0 *= dsqrt_exp0; 368 res1 *= dsqrt_exp1; 369 res2 *= dsqrt_exp2; 370 371 *py = res0; 372 py += stridey; 373 374 *py = res1; 375 py += stridey; 376 377 *py = res2; 378 py += stridey; 379 } 380 381 for (; n > 0; n--) { 382 hx0 = HI(px); 383 384 sqrt_exp0 = (0x5fe - (hx0 >> 21)) << 20; 385 ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16; 386 387 resh0 = (hx0 & 0x001fffff) | 0x3fe00000; 388 res_ch0 = (resh0 + 0x00002000) & 0x7fffc000; 389 HI(&res0) = resh0; 390 LO(&res0) = LO(px); 391 HI(&res_c0) = res_ch0; 392 LO(&res_c0) = 0; 393 394 px += stridex; 395 396 dexp_hi0 = ((double *)((char *)__vlibm_TBL_rsqrt + ind0))[0]; 397 dexp_lo0 = ((double *)((char *)__vlibm_TBL_rsqrt + ind0))[1]; 398 xx0 = dexp_hi0 * dexp_hi0; 399 xx0 = (res0 - res_c0) * xx0; 400 res0 = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * 401 xx0 + K2) * xx0 + K1) * xx0; 402 403 res0 = dexp_hi0 * res0 + dexp_lo0 + dexp_hi0; 404 405 HI(&dsqrt_exp0) = sqrt_exp0; 406 LO(&dsqrt_exp0) = 0; 407 res0 *= dsqrt_exp0; 408 409 *py = res0; 410 py += stridey; 411 } 412 } 413