1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 24 */ 25 /* 26 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 27 * Use is subject to license terms. 28 */ 29 30 #include <sys/isa_defs.h> 31 #include "libm_synonyms.h" 32 #include "libm_inlines.h" 33 34 #ifdef _LITTLE_ENDIAN 35 #define HI(x) *(1+(int*)x) 36 #define LO(x) *(unsigned*)x 37 #else 38 #define HI(x) *(int*)x 39 #define LO(x) *(1+(unsigned*)x) 40 #endif 41 42 #ifdef __RESTRICT 43 #define restrict _Restrict 44 #else 45 #define restrict 46 #endif 47 48 /* double rsqrt(double x) 49 * 50 * Method : 51 * 1. Special cases: 52 * for x = NaN => QNaN; 53 * for x = +Inf => 0; 54 * for x is negative, -Inf => QNaN + invalid; 55 * for x = +0 => +Inf + divide-by-zero; 56 * for x = -0 => -Inf + divide-by-zero. 57 * 2. Computes reciprocal square root from: 58 * x = m * 2**n 59 * Where: 60 * m = [0.5, 2), 61 * n = ((exponent + 1) & ~1). 62 * Then: 63 * rsqrt(x) = 1/sqrt( m * 2**n ) = (2 ** (-n/2)) * (1/sqrt(m)) 64 * 2. Computes 1/sqrt(m) from: 65 * 1/sqrt(m) = (1/sqrt(m0)) * (1/sqrt(1 + (1/m0)*dm)) 66 * Where: 67 * m = m0 + dm, 68 * m0 = 0.5 * (1 + k/64) for m = [0.5, 0.5+127/256), k = [0, 63]; 69 * m0 = 1.0 * (0 + k/64) for m = [0.5+127/256, 1.0+127/128), k = [64, 127]; 70 * m0 = 2.0 for m = [1.0+127/128, 2.0), k = 128. 71 * Then: 72 * 1/sqrt(m0) is looked up in a table, 73 * 1/m0 is computed as (1/sqrt(m0)) * (1/sqrt(m0)). 74 * 1/sqrt(1 + (1/m0)*dm) is computed using approximation: 75 * 1/sqrt(1 + z) = (((((a6 * z + a5) * z + a4) * z + a3) 76 * * z + a2) * z + a1) * z + a0 77 * where z = [-1/128, 1/128]. 78 * 79 * Accuracy: 80 * The maximum relative error for the approximating 81 * polynomial is 2**(-56.26). 82 * Maximum error observed: less than 0.563 ulp after 1.500.000.000 83 * results. 84 */ 85 86 #define sqrt __sqrt 87 88 extern double sqrt (double); 89 extern const double __vlibm_TBL_rsqrt[]; 90 91 static void 92 __vrsqrt_n(int n, double * restrict px, int stridex, double * restrict py, int stridey); 93 94 #pragma no_inline(__vrsqrt_n) 95 96 #define RETURN(ret) \ 97 { \ 98 *py = (ret); \ 99 py += stridey; \ 100 if (n_n == 0) \ 101 { \ 102 spx = px; spy = py; \ 103 hx = HI(px); \ 104 continue; \ 105 } \ 106 n--; \ 107 break; \ 108 } 109 110 static const double 111 DONE = 1.0, 112 K1 = -5.00000000000005209867e-01, 113 K2 = 3.75000000000004884257e-01, 114 K3 = -3.12499999317136886551e-01, 115 K4 = 2.73437499359815081532e-01, 116 K5 = -2.46116125605037803130e-01, 117 K6 = 2.25606914648617522896e-01; 118 119 void 120 __vrsqrt(int n, double * restrict px, int stridex, double * restrict py, int stridey) 121 { 122 double *spx, *spy; 123 int ax, lx, hx, n_n; 124 double res; 125 126 while (n > 1) 127 { 128 n_n = 0; 129 spx = px; 130 spy = py; 131 hx = HI(px); 132 for (; n > 1 ; n--) 133 { 134 px += stridex; 135 if (hx >= 0x7ff00000) /* X = NaN or Inf */ 136 { 137 res = *(px - stridex); 138 RETURN (DONE / res) 139 } 140 141 py += stridey; 142 143 if (hx < 0x00100000) /* X = denormal, zero or negative */ 144 { 145 py -= stridey; 146 ax = hx & 0x7fffffff; 147 lx = LO((px - stridex)); 148 res = *(px - stridex); 149 150 if ((ax | lx) == 0) /* |X| = zero */ 151 { 152 RETURN (DONE / res) 153 } 154 else if (hx >= 0) /* X = denormal */ 155 { 156 double res_c0, dsqrt_exp0; 157 int ind0, sqrt_exp0; 158 double xx0, dexp_hi0, dexp_lo0; 159 int hx0, resh0, res_ch0; 160 161 res = *(long long*)&res; 162 163 hx0 = HI(&res); 164 sqrt_exp0 = (0x817 - (hx0 >> 21)) << 20; 165 ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16; 166 167 resh0 = (hx0 & 0x001fffff) | 0x3fe00000; 168 res_ch0 = (resh0 + 0x00002000) & 0x7fffc000; 169 HI(&res) = resh0; 170 HI(&res_c0) = res_ch0; 171 LO(&res_c0) = 0; 172 173 dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0]; 174 dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1]; 175 xx0 = dexp_hi0 * dexp_hi0; 176 xx0 = (res - res_c0) * xx0; 177 res = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0; 178 179 res = dexp_hi0 * res + dexp_lo0 + dexp_hi0; 180 181 HI(&dsqrt_exp0) = sqrt_exp0; 182 LO(&dsqrt_exp0) = 0; 183 res *= dsqrt_exp0; 184 185 RETURN (res) 186 } 187 else /* X = negative */ 188 { 189 RETURN (sqrt(res)) 190 } 191 } 192 n_n++; 193 hx = HI(px); 194 } 195 if (n_n > 0) 196 __vrsqrt_n(n_n, spx, stridex, spy, stridey); 197 } 198 if (n > 0) 199 { 200 hx = HI(px); 201 202 if (hx >= 0x7ff00000) /* X = NaN or Inf */ 203 { 204 res = *px; 205 *py = DONE / res; 206 } 207 else if (hx < 0x00100000) /* X = denormal, zero or negative */ 208 { 209 ax = hx & 0x7fffffff; 210 lx = LO(px); 211 res = *px; 212 213 if ((ax | lx) == 0) /* |X| = zero */ 214 { 215 *py = DONE / res; 216 } 217 else if (hx >= 0) /* X = denormal */ 218 { 219 double res_c0, dsqrt_exp0; 220 int ind0, sqrt_exp0; 221 double xx0, dexp_hi0, dexp_lo0; 222 int hx0, resh0, res_ch0; 223 224 res = *(long long*)&res; 225 226 hx0 = HI(&res); 227 sqrt_exp0 = (0x817 - (hx0 >> 21)) << 20; 228 ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16; 229 230 resh0 = (hx0 & 0x001fffff) | 0x3fe00000; 231 res_ch0 = (resh0 + 0x00002000) & 0x7fffc000; 232 HI(&res) = resh0; 233 HI(&res_c0) = res_ch0; 234 LO(&res_c0) = 0; 235 236 dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0]; 237 dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1]; 238 xx0 = dexp_hi0 * dexp_hi0; 239 xx0 = (res - res_c0) * xx0; 240 res = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0; 241 242 res = dexp_hi0 * res + dexp_lo0 + dexp_hi0; 243 244 HI(&dsqrt_exp0) = sqrt_exp0; 245 LO(&dsqrt_exp0) = 0; 246 res *= dsqrt_exp0; 247 248 *py = res; 249 } 250 else /* X = negative */ 251 { 252 *py = sqrt(res); 253 } 254 } 255 else 256 { 257 double res_c0, dsqrt_exp0; 258 int ind0, sqrt_exp0; 259 double xx0, dexp_hi0, dexp_lo0; 260 int resh0, res_ch0; 261 262 sqrt_exp0 = (0x5fe - (hx >> 21)) << 20; 263 ind0 = (((hx >> 10) & 0x7f8) + 8) & -16; 264 265 resh0 = (hx & 0x001fffff) | 0x3fe00000; 266 res_ch0 = (resh0 + 0x00002000) & 0x7fffc000; 267 HI(&res) = resh0; 268 LO(&res) = LO(px); 269 HI(&res_c0) = res_ch0; 270 LO(&res_c0) = 0; 271 272 dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0]; 273 dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1]; 274 xx0 = dexp_hi0 * dexp_hi0; 275 xx0 = (res - res_c0) * xx0; 276 res = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0; 277 278 res = dexp_hi0 * res + dexp_lo0 + dexp_hi0; 279 280 HI(&dsqrt_exp0) = sqrt_exp0; 281 LO(&dsqrt_exp0) = 0; 282 res *= dsqrt_exp0; 283 284 *py = res; 285 } 286 } 287 } 288 289 static void 290 __vrsqrt_n(int n, double * restrict px, int stridex, double * restrict py, int stridey) 291 { 292 double res0, res_c0, dsqrt_exp0; 293 double res1, res_c1, dsqrt_exp1; 294 double res2, res_c2, dsqrt_exp2; 295 int ind0, sqrt_exp0; 296 int ind1, sqrt_exp1; 297 int ind2, sqrt_exp2; 298 double xx0, dexp_hi0, dexp_lo0; 299 double xx1, dexp_hi1, dexp_lo1; 300 double xx2, dexp_hi2, dexp_lo2; 301 int hx0, resh0, res_ch0; 302 int hx1, resh1, res_ch1; 303 int hx2, resh2, res_ch2; 304 305 LO(&dsqrt_exp0) = 0; 306 LO(&dsqrt_exp1) = 0; 307 LO(&dsqrt_exp2) = 0; 308 LO(&res_c0) = 0; 309 LO(&res_c1) = 0; 310 LO(&res_c2) = 0; 311 312 for(; n > 2 ; n -= 3) 313 { 314 hx0 = HI(px); 315 LO(&res0) = LO(px); 316 px += stridex; 317 318 hx1 = HI(px); 319 LO(&res1) = LO(px); 320 px += stridex; 321 322 hx2 = HI(px); 323 LO(&res2) = LO(px); 324 px += stridex; 325 326 sqrt_exp0 = (0x5fe - (hx0 >> 21)) << 20; 327 sqrt_exp1 = (0x5fe - (hx1 >> 21)) << 20; 328 sqrt_exp2 = (0x5fe - (hx2 >> 21)) << 20; 329 ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16; 330 ind1 = (((hx1 >> 10) & 0x7f8) + 8) & -16; 331 ind2 = (((hx2 >> 10) & 0x7f8) + 8) & -16; 332 333 resh0 = (hx0 & 0x001fffff) | 0x3fe00000; 334 resh1 = (hx1 & 0x001fffff) | 0x3fe00000; 335 resh2 = (hx2 & 0x001fffff) | 0x3fe00000; 336 res_ch0 = (resh0 + 0x00002000) & 0x7fffc000; 337 res_ch1 = (resh1 + 0x00002000) & 0x7fffc000; 338 res_ch2 = (resh2 + 0x00002000) & 0x7fffc000; 339 HI(&res0) = resh0; 340 HI(&res1) = resh1; 341 HI(&res2) = resh2; 342 HI(&res_c0) = res_ch0; 343 HI(&res_c1) = res_ch1; 344 HI(&res_c2) = res_ch2; 345 346 dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0]; 347 dexp_hi1 = ((double*)((char*)__vlibm_TBL_rsqrt + ind1))[0]; 348 dexp_hi2 = ((double*)((char*)__vlibm_TBL_rsqrt + ind2))[0]; 349 dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1]; 350 dexp_lo1 = ((double*)((char*)__vlibm_TBL_rsqrt + ind1))[1]; 351 dexp_lo2 = ((double*)((char*)__vlibm_TBL_rsqrt + ind2))[1]; 352 xx0 = dexp_hi0 * dexp_hi0; 353 xx1 = dexp_hi1 * dexp_hi1; 354 xx2 = dexp_hi2 * dexp_hi2; 355 xx0 = (res0 - res_c0) * xx0; 356 xx1 = (res1 - res_c1) * xx1; 357 xx2 = (res2 - res_c2) * xx2; 358 res0 = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0; 359 res1 = (((((K6 * xx1 + K5) * xx1 + K4) * xx1 + K3) * xx1 + K2) * xx1 + K1) * xx1; 360 res2 = (((((K6 * xx2 + K5) * xx2 + K4) * xx2 + K3) * xx2 + K2) * xx2 + K1) * xx2; 361 362 res0 = dexp_hi0 * res0 + dexp_lo0 + dexp_hi0; 363 res1 = dexp_hi1 * res1 + dexp_lo1 + dexp_hi1; 364 res2 = dexp_hi2 * res2 + dexp_lo2 + dexp_hi2; 365 366 HI(&dsqrt_exp0) = sqrt_exp0; 367 HI(&dsqrt_exp1) = sqrt_exp1; 368 HI(&dsqrt_exp2) = sqrt_exp2; 369 res0 *= dsqrt_exp0; 370 res1 *= dsqrt_exp1; 371 res2 *= dsqrt_exp2; 372 373 *py = res0; 374 py += stridey; 375 376 *py = res1; 377 py += stridey; 378 379 *py = res2; 380 py += stridey; 381 } 382 383 for(; n > 0 ; n--) 384 { 385 hx0 = HI(px); 386 387 sqrt_exp0 = (0x5fe - (hx0 >> 21)) << 20; 388 ind0 = (((hx0 >> 10) & 0x7f8) + 8) & -16; 389 390 resh0 = (hx0 & 0x001fffff) | 0x3fe00000; 391 res_ch0 = (resh0 + 0x00002000) & 0x7fffc000; 392 HI(&res0) = resh0; 393 LO(&res0) = LO(px); 394 HI(&res_c0) = res_ch0; 395 LO(&res_c0) = 0; 396 397 px += stridex; 398 399 dexp_hi0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[0]; 400 dexp_lo0 = ((double*)((char*)__vlibm_TBL_rsqrt + ind0))[1]; 401 xx0 = dexp_hi0 * dexp_hi0; 402 xx0 = (res0 - res_c0) * xx0; 403 res0 = (((((K6 * xx0 + K5) * xx0 + K4) * xx0 + K3) * xx0 + K2) * xx0 + K1) * xx0; 404 405 res0 = dexp_hi0 * res0 + dexp_lo0 + dexp_hi0; 406 407 HI(&dsqrt_exp0) = sqrt_exp0; 408 LO(&dsqrt_exp0) = 0; 409 res0 *= dsqrt_exp0; 410 411 *py = res0; 412 py += stridey; 413 } 414 } 415 416