1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 24 */ 25 /* 26 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 27 * Use is subject to license terms. 28 */ 29 30 #include <sys/isa_defs.h> 31 #include "libm_inlines.h" 32 33 #ifdef _LITTLE_ENDIAN 34 #define HI(x) *(1+(int*)x) 35 #define LO(x) *(unsigned*)x 36 #else 37 #define HI(x) *(int*)x 38 #define LO(x) *(1+(unsigned*)x) 39 #endif 40 41 #ifdef __RESTRICT 42 #define restrict _Restrict 43 #else 44 #define restrict 45 #endif 46 47 /* double rhypot(double x, double y) 48 * 49 * Method : 50 * 1. Special cases: 51 * x or y = Inf => 0 52 * x or y = NaN => QNaN 53 * x and y = 0 => Inf + divide-by-zero 54 * 2. Computes rhypot(x,y): 55 * rhypot(x,y) = m * sqrt(1/(xnm * xnm + ynm * ynm)) 56 * Where: 57 * m = 1/max(|x|,|y|) 58 * xnm = x * m 59 * ynm = y * m 60 * 61 * Compute 1/(xnm * xnm + ynm * ynm) by simulating 62 * muti-precision arithmetic. 63 * 64 * Accuracy: 65 * Maximum error observed: less than 0.869 ulp after 1.000.000.000 66 * results. 67 */ 68 69 extern double sqrt(double); 70 extern double fabs(double); 71 72 static const int __vlibm_TBL_rhypot[] = { 73 /* i = [0,127] 74 * TBL[i] = 0x3ff00000 + *(int*)&(1.0 / *(double*)&(0x3ff0000000000000ULL + (i << 45))); */ 75 0x7fe00000, 0x7fdfc07f, 0x7fdf81f8, 0x7fdf4465, 76 0x7fdf07c1, 0x7fdecc07, 0x7fde9131, 0x7fde573a, 77 0x7fde1e1e, 0x7fdde5d6, 0x7fddae60, 0x7fdd77b6, 78 0x7fdd41d4, 0x7fdd0cb5, 0x7fdcd856, 0x7fdca4b3, 79 0x7fdc71c7, 0x7fdc3f8f, 0x7fdc0e07, 0x7fdbdd2b, 80 0x7fdbacf9, 0x7fdb7d6c, 0x7fdb4e81, 0x7fdb2036, 81 0x7fdaf286, 0x7fdac570, 0x7fda98ef, 0x7fda6d01, 82 0x7fda41a4, 0x7fda16d3, 0x7fd9ec8e, 0x7fd9c2d1, 83 0x7fd99999, 0x7fd970e4, 0x7fd948b0, 0x7fd920fb, 84 0x7fd8f9c1, 0x7fd8d301, 0x7fd8acb9, 0x7fd886e5, 85 0x7fd86186, 0x7fd83c97, 0x7fd81818, 0x7fd7f405, 86 0x7fd7d05f, 0x7fd7ad22, 0x7fd78a4c, 0x7fd767dc, 87 0x7fd745d1, 0x7fd72428, 0x7fd702e0, 0x7fd6e1f7, 88 0x7fd6c16c, 0x7fd6a13c, 0x7fd68168, 0x7fd661ec, 89 0x7fd642c8, 0x7fd623fa, 0x7fd60581, 0x7fd5e75b, 90 0x7fd5c988, 0x7fd5ac05, 0x7fd58ed2, 0x7fd571ed, 91 0x7fd55555, 0x7fd53909, 0x7fd51d07, 0x7fd50150, 92 0x7fd4e5e0, 0x7fd4cab8, 0x7fd4afd6, 0x7fd49539, 93 0x7fd47ae1, 0x7fd460cb, 0x7fd446f8, 0x7fd42d66, 94 0x7fd41414, 0x7fd3fb01, 0x7fd3e22c, 0x7fd3c995, 95 0x7fd3b13b, 0x7fd3991c, 0x7fd38138, 0x7fd3698d, 96 0x7fd3521c, 0x7fd33ae4, 0x7fd323e3, 0x7fd30d19, 97 0x7fd2f684, 0x7fd2e025, 0x7fd2c9fb, 0x7fd2b404, 98 0x7fd29e41, 0x7fd288b0, 0x7fd27350, 0x7fd25e22, 99 0x7fd24924, 0x7fd23456, 0x7fd21fb7, 0x7fd20b47, 100 0x7fd1f704, 0x7fd1e2ef, 0x7fd1cf06, 0x7fd1bb4a, 101 0x7fd1a7b9, 0x7fd19453, 0x7fd18118, 0x7fd16e06, 102 0x7fd15b1e, 0x7fd1485f, 0x7fd135c8, 0x7fd12358, 103 0x7fd11111, 0x7fd0fef0, 0x7fd0ecf5, 0x7fd0db20, 104 0x7fd0c971, 0x7fd0b7e6, 0x7fd0a681, 0x7fd0953f, 105 0x7fd08421, 0x7fd07326, 0x7fd0624d, 0x7fd05197, 106 0x7fd04104, 0x7fd03091, 0x7fd02040, 0x7fd01010, 107 }; 108 109 static const unsigned long long LCONST[] = { 110 0x3ff0000000000000ULL, /* DONE = 1.0 */ 111 0x4000000000000000ULL, /* DTWO = 2.0 */ 112 0x4230000000000000ULL, /* D2ON36 = 2**36 */ 113 0x7fd0000000000000ULL, /* D2ON1022 = 2**1022 */ 114 0x3cb0000000000000ULL, /* D2ONM52 = 2**-52 */ 115 }; 116 117 #define RET_SC(I) \ 118 px += stridex; \ 119 py += stridey; \ 120 pz += stridez; \ 121 if (--n <= 0) \ 122 break; \ 123 goto start##I; 124 125 #define RETURN(I, ret) \ 126 { \ 127 pz[0] = (ret); \ 128 RET_SC(I) \ 129 } 130 131 #define PREP(I) \ 132 hx##I = HI(px); \ 133 hy##I = HI(py); \ 134 hx##I &= 0x7fffffff; \ 135 hy##I &= 0x7fffffff; \ 136 pz##I = pz; \ 137 if (hx##I >= 0x7ff00000 || hy##I >= 0x7ff00000) /* |X| or |Y| = Inf,NaN */ \ 138 { \ 139 lx = LO(px); \ 140 ly = LO(py); \ 141 x = *px; \ 142 y = *py; \ 143 if (hx##I == 0x7ff00000 && lx == 0) res0 = 0.0; /* |X| = Inf */ \ 144 else if (hy##I == 0x7ff00000 && ly == 0) res0 = 0.0; /* |Y| = Inf */ \ 145 else res0 = fabs(x) + fabs(y); \ 146 \ 147 RETURN (I, res0) \ 148 } \ 149 x##I = *px; \ 150 y##I = *py; \ 151 diff0 = hy##I - hx##I; \ 152 j0 = diff0 >> 31; \ 153 if (hx##I < 0x00100000 && hy##I < 0x00100000) /* |X| and |Y| = subnormal or zero */ \ 154 { \ 155 lx = LO(px); \ 156 ly = LO(py); \ 157 x = x##I; \ 158 y = y##I; \ 159 \ 160 if ((hx##I | hy##I | lx | ly) == 0) /* |X| and |Y| = 0 */ \ 161 RETURN (I, DONE / 0.0) \ 162 \ 163 x = fabs(x); \ 164 y = fabs(y); \ 165 \ 166 x = *(long long*)&x; \ 167 y = *(long long*)&y; \ 168 \ 169 x *= D2ONM52; \ 170 y *= D2ONM52; \ 171 \ 172 x_hi0 = (x + D2ON36) - D2ON36; \ 173 y_hi0 = (y + D2ON36) - D2ON36; \ 174 x_lo0 = x - x_hi0; \ 175 y_lo0 = y - y_hi0; \ 176 res0_hi = (x_hi0 * x_hi0 + y_hi0 * y_hi0); \ 177 res0_lo = ((x + x_hi0) * x_lo0 + (y + y_hi0) * y_lo0); \ 178 \ 179 dres0 = res0_hi + res0_lo; \ 180 \ 181 iarr0 = HI(&dres0); \ 182 iexp0 = iarr0 & 0xfff00000; \ 183 \ 184 iarr0 = (iarr0 >> 11) & 0x1fc; \ 185 itbl0 = ((int*)((char*)__vlibm_TBL_rhypot + iarr0))[0]; \ 186 itbl0 -= iexp0; \ 187 HI(&dd0) = itbl0; \ 188 LO(&dd0) = 0; \ 189 \ 190 dd0 = dd0 * (DTWO - dd0 * dres0); \ 191 dd0 = dd0 * (DTWO - dd0 * dres0); \ 192 dres0 = dd0 * (DTWO - dd0 * dres0); \ 193 \ 194 HI(&res0) = HI(&dres0) & 0xffffff00; \ 195 LO(&res0) = 0; \ 196 res0 += (DONE - res0_hi * res0 - res0_lo * res0) * dres0; \ 197 res0 = sqrt (res0); \ 198 \ 199 res0 = D2ON1022 * res0; \ 200 RETURN (I, res0) \ 201 } \ 202 j0 = hy##I - (diff0 & j0); \ 203 j0 &= 0x7ff00000; \ 204 HI(&scl##I) = 0x7ff00000 - j0; 205 206 void 207 __vrhypot(int n, double * restrict px, int stridex, double * restrict py, 208 int stridey, double * restrict pz, int stridez) 209 { 210 int i = 0; 211 double x, y; 212 double x_hi0, x_lo0, y_hi0, y_lo0, scl0 = 0; 213 double x0, y0, res0, dd0; 214 double res0_hi,res0_lo, dres0; 215 double x_hi1, x_lo1, y_hi1, y_lo1, scl1 = 0; 216 double x1 = 0.0L, y1 = 0.0L, res1, dd1; 217 double res1_hi,res1_lo, dres1; 218 double x_hi2, x_lo2, y_hi2, y_lo2, scl2 = 0; 219 double x2, y2, res2, dd2; 220 double res2_hi,res2_lo, dres2; 221 222 int hx0, hy0, j0, diff0; 223 int iarr0, iexp0, itbl0; 224 int hx1, hy1; 225 int iarr1, iexp1, itbl1; 226 int hx2, hy2; 227 int iarr2, iexp2, itbl2; 228 229 int lx, ly; 230 231 double DONE = ((double*)LCONST)[0]; 232 double DTWO = ((double*)LCONST)[1]; 233 double D2ON36 = ((double*)LCONST)[2]; 234 double D2ON1022 = ((double*)LCONST)[3]; 235 double D2ONM52 = ((double*)LCONST)[4]; 236 237 double *pz0, *pz1 = 0, *pz2; 238 239 do 240 { 241 start0: 242 PREP(0) 243 px += stridex; 244 py += stridey; 245 pz += stridez; 246 i = 1; 247 if (--n <= 0) 248 break; 249 250 start1: 251 PREP(1) 252 px += stridex; 253 py += stridey; 254 pz += stridez; 255 i = 2; 256 if (--n <= 0) 257 break; 258 259 start2: 260 PREP(2) 261 262 x0 *= scl0; 263 y0 *= scl0; 264 x1 *= scl1; 265 y1 *= scl1; 266 x2 *= scl2; 267 y2 *= scl2; 268 269 x_hi0 = (x0 + D2ON36) - D2ON36; 270 y_hi0 = (y0 + D2ON36) - D2ON36; 271 x_hi1 = (x1 + D2ON36) - D2ON36; 272 y_hi1 = (y1 + D2ON36) - D2ON36; 273 x_hi2 = (x2 + D2ON36) - D2ON36; 274 y_hi2 = (y2 + D2ON36) - D2ON36; 275 x_lo0 = x0 - x_hi0; 276 y_lo0 = y0 - y_hi0; 277 x_lo1 = x1 - x_hi1; 278 y_lo1 = y1 - y_hi1; 279 x_lo2 = x2 - x_hi2; 280 y_lo2 = y2 - y_hi2; 281 res0_hi = (x_hi0 * x_hi0 + y_hi0 * y_hi0); 282 res1_hi = (x_hi1 * x_hi1 + y_hi1 * y_hi1); 283 res2_hi = (x_hi2 * x_hi2 + y_hi2 * y_hi2); 284 res0_lo = ((x0 + x_hi0) * x_lo0 + (y0 + y_hi0) * y_lo0); 285 res1_lo = ((x1 + x_hi1) * x_lo1 + (y1 + y_hi1) * y_lo1); 286 res2_lo = ((x2 + x_hi2) * x_lo2 + (y2 + y_hi2) * y_lo2); 287 288 dres0 = res0_hi + res0_lo; 289 dres1 = res1_hi + res1_lo; 290 dres2 = res2_hi + res2_lo; 291 292 iarr0 = HI(&dres0); 293 iarr1 = HI(&dres1); 294 iarr2 = HI(&dres2); 295 iexp0 = iarr0 & 0xfff00000; 296 iexp1 = iarr1 & 0xfff00000; 297 iexp2 = iarr2 & 0xfff00000; 298 299 iarr0 = (iarr0 >> 11) & 0x1fc; 300 iarr1 = (iarr1 >> 11) & 0x1fc; 301 iarr2 = (iarr2 >> 11) & 0x1fc; 302 itbl0 = ((int*)((char*)__vlibm_TBL_rhypot + iarr0))[0]; 303 itbl1 = ((int*)((char*)__vlibm_TBL_rhypot + iarr1))[0]; 304 itbl2 = ((int*)((char*)__vlibm_TBL_rhypot + iarr2))[0]; 305 itbl0 -= iexp0; 306 itbl1 -= iexp1; 307 itbl2 -= iexp2; 308 HI(&dd0) = itbl0; 309 HI(&dd1) = itbl1; 310 HI(&dd2) = itbl2; 311 LO(&dd0) = 0; 312 LO(&dd1) = 0; 313 LO(&dd2) = 0; 314 315 dd0 = dd0 * (DTWO - dd0 * dres0); 316 dd1 = dd1 * (DTWO - dd1 * dres1); 317 dd2 = dd2 * (DTWO - dd2 * dres2); 318 dd0 = dd0 * (DTWO - dd0 * dres0); 319 dd1 = dd1 * (DTWO - dd1 * dres1); 320 dd2 = dd2 * (DTWO - dd2 * dres2); 321 dres0 = dd0 * (DTWO - dd0 * dres0); 322 dres1 = dd1 * (DTWO - dd1 * dres1); 323 dres2 = dd2 * (DTWO - dd2 * dres2); 324 325 HI(&res0) = HI(&dres0) & 0xffffff00; 326 HI(&res1) = HI(&dres1) & 0xffffff00; 327 HI(&res2) = HI(&dres2) & 0xffffff00; 328 LO(&res0) = 0; 329 LO(&res1) = 0; 330 LO(&res2) = 0; 331 res0 += (DONE - res0_hi * res0 - res0_lo * res0) * dres0; 332 res1 += (DONE - res1_hi * res1 - res1_lo * res1) * dres1; 333 res2 += (DONE - res2_hi * res2 - res2_lo * res2) * dres2; 334 res0 = sqrt (res0); 335 res1 = sqrt (res1); 336 res2 = sqrt (res2); 337 338 res0 = scl0 * res0; 339 res1 = scl1 * res1; 340 res2 = scl2 * res2; 341 342 *pz0 = res0; 343 *pz1 = res1; 344 *pz2 = res2; 345 346 px += stridex; 347 py += stridey; 348 pz += stridez; 349 i = 0; 350 351 } while (--n > 0); 352 353 if (i > 0) 354 { 355 x0 *= scl0; 356 y0 *= scl0; 357 358 x_hi0 = (x0 + D2ON36) - D2ON36; 359 y_hi0 = (y0 + D2ON36) - D2ON36; 360 x_lo0 = x0 - x_hi0; 361 y_lo0 = y0 - y_hi0; 362 res0_hi = (x_hi0 * x_hi0 + y_hi0 * y_hi0); 363 res0_lo = ((x0 + x_hi0) * x_lo0 + (y0 + y_hi0) * y_lo0); 364 365 dres0 = res0_hi + res0_lo; 366 367 iarr0 = HI(&dres0); 368 iexp0 = iarr0 & 0xfff00000; 369 370 iarr0 = (iarr0 >> 11) & 0x1fc; 371 itbl0 = ((int*)((char*)__vlibm_TBL_rhypot + iarr0))[0]; 372 itbl0 -= iexp0; 373 HI(&dd0) = itbl0; 374 LO(&dd0) = 0; 375 376 dd0 = dd0 * (DTWO - dd0 * dres0); 377 dd0 = dd0 * (DTWO - dd0 * dres0); 378 dres0 = dd0 * (DTWO - dd0 * dres0); 379 380 HI(&res0) = HI(&dres0) & 0xffffff00; 381 LO(&res0) = 0; 382 res0 += (DONE - res0_hi * res0 - res0_lo * res0) * dres0; 383 res0 = sqrt (res0); 384 385 res0 = scl0 * res0; 386 387 *pz0 = res0; 388 389 if (i > 1) 390 { 391 x1 *= scl1; 392 y1 *= scl1; 393 394 x_hi1 = (x1 + D2ON36) - D2ON36; 395 y_hi1 = (y1 + D2ON36) - D2ON36; 396 x_lo1 = x1 - x_hi1; 397 y_lo1 = y1 - y_hi1; 398 res1_hi = (x_hi1 * x_hi1 + y_hi1 * y_hi1); 399 res1_lo = ((x1 + x_hi1) * x_lo1 + (y1 + y_hi1) * y_lo1); 400 401 dres1 = res1_hi + res1_lo; 402 403 iarr1 = HI(&dres1); 404 iexp1 = iarr1 & 0xfff00000; 405 406 iarr1 = (iarr1 >> 11) & 0x1fc; 407 itbl1 = ((int*)((char*)__vlibm_TBL_rhypot + iarr1))[0]; 408 itbl1 -= iexp1; 409 HI(&dd1) = itbl1; 410 LO(&dd1) = 0; 411 412 dd1 = dd1 * (DTWO - dd1 * dres1); 413 dd1 = dd1 * (DTWO - dd1 * dres1); 414 dres1 = dd1 * (DTWO - dd1 * dres1); 415 416 HI(&res1) = HI(&dres1) & 0xffffff00; 417 LO(&res1) = 0; 418 res1 += (DONE - res1_hi * res1 - res1_lo * res1) * dres1; 419 res1 = sqrt (res1); 420 421 res1 = scl1 * res1; 422 423 *pz1 = res1; 424 } 425 } 426 } 427