1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 24 */ 25 /* 26 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 27 * Use is subject to license terms. 28 */ 29 30 #include <sys/isa_defs.h> 31 #include "libm_synonyms.h" 32 #include "libm_inlines.h" 33 34 #ifdef _LITTLE_ENDIAN 35 #define HI(x) *(1+(int*)x) 36 #define LO(x) *(unsigned*)x 37 #else 38 #define HI(x) *(int*)x 39 #define LO(x) *(1+(unsigned*)x) 40 #endif 41 42 #ifdef __RESTRICT 43 #define restrict _Restrict 44 #else 45 #define restrict 46 #endif 47 48 /* double rhypot(double x, double y) 49 * 50 * Method : 51 * 1. Special cases: 52 * x or y = Inf => 0 53 * x or y = NaN => QNaN 54 * x and y = 0 => Inf + divide-by-zero 55 * 2. Computes rhypot(x,y): 56 * rhypot(x,y) = m * sqrt(1/(xnm * xnm + ynm * ynm)) 57 * Where: 58 * m = 1/max(|x|,|y|) 59 * xnm = x * m 60 * ynm = y * m 61 * 62 * Compute 1/(xnm * xnm + ynm * ynm) by simulating 63 * muti-precision arithmetic. 64 * 65 * Accuracy: 66 * Maximum error observed: less than 0.869 ulp after 1.000.000.000 67 * results. 68 */ 69 70 #define sqrt __sqrt 71 72 extern double sqrt(double); 73 74 extern double fabs(double); 75 76 static const int __vlibm_TBL_rhypot[] = { 77 /* i = [0,127] 78 * TBL[i] = 0x3ff00000 + *(int*)&(1.0 / *(double*)&(0x3ff0000000000000ULL + (i << 45))); */ 79 0x7fe00000, 0x7fdfc07f, 0x7fdf81f8, 0x7fdf4465, 80 0x7fdf07c1, 0x7fdecc07, 0x7fde9131, 0x7fde573a, 81 0x7fde1e1e, 0x7fdde5d6, 0x7fddae60, 0x7fdd77b6, 82 0x7fdd41d4, 0x7fdd0cb5, 0x7fdcd856, 0x7fdca4b3, 83 0x7fdc71c7, 0x7fdc3f8f, 0x7fdc0e07, 0x7fdbdd2b, 84 0x7fdbacf9, 0x7fdb7d6c, 0x7fdb4e81, 0x7fdb2036, 85 0x7fdaf286, 0x7fdac570, 0x7fda98ef, 0x7fda6d01, 86 0x7fda41a4, 0x7fda16d3, 0x7fd9ec8e, 0x7fd9c2d1, 87 0x7fd99999, 0x7fd970e4, 0x7fd948b0, 0x7fd920fb, 88 0x7fd8f9c1, 0x7fd8d301, 0x7fd8acb9, 0x7fd886e5, 89 0x7fd86186, 0x7fd83c97, 0x7fd81818, 0x7fd7f405, 90 0x7fd7d05f, 0x7fd7ad22, 0x7fd78a4c, 0x7fd767dc, 91 0x7fd745d1, 0x7fd72428, 0x7fd702e0, 0x7fd6e1f7, 92 0x7fd6c16c, 0x7fd6a13c, 0x7fd68168, 0x7fd661ec, 93 0x7fd642c8, 0x7fd623fa, 0x7fd60581, 0x7fd5e75b, 94 0x7fd5c988, 0x7fd5ac05, 0x7fd58ed2, 0x7fd571ed, 95 0x7fd55555, 0x7fd53909, 0x7fd51d07, 0x7fd50150, 96 0x7fd4e5e0, 0x7fd4cab8, 0x7fd4afd6, 0x7fd49539, 97 0x7fd47ae1, 0x7fd460cb, 0x7fd446f8, 0x7fd42d66, 98 0x7fd41414, 0x7fd3fb01, 0x7fd3e22c, 0x7fd3c995, 99 0x7fd3b13b, 0x7fd3991c, 0x7fd38138, 0x7fd3698d, 100 0x7fd3521c, 0x7fd33ae4, 0x7fd323e3, 0x7fd30d19, 101 0x7fd2f684, 0x7fd2e025, 0x7fd2c9fb, 0x7fd2b404, 102 0x7fd29e41, 0x7fd288b0, 0x7fd27350, 0x7fd25e22, 103 0x7fd24924, 0x7fd23456, 0x7fd21fb7, 0x7fd20b47, 104 0x7fd1f704, 0x7fd1e2ef, 0x7fd1cf06, 0x7fd1bb4a, 105 0x7fd1a7b9, 0x7fd19453, 0x7fd18118, 0x7fd16e06, 106 0x7fd15b1e, 0x7fd1485f, 0x7fd135c8, 0x7fd12358, 107 0x7fd11111, 0x7fd0fef0, 0x7fd0ecf5, 0x7fd0db20, 108 0x7fd0c971, 0x7fd0b7e6, 0x7fd0a681, 0x7fd0953f, 109 0x7fd08421, 0x7fd07326, 0x7fd0624d, 0x7fd05197, 110 0x7fd04104, 0x7fd03091, 0x7fd02040, 0x7fd01010, 111 }; 112 113 static const unsigned long long LCONST[] = { 114 0x3ff0000000000000ULL, /* DONE = 1.0 */ 115 0x4000000000000000ULL, /* DTWO = 2.0 */ 116 0x4230000000000000ULL, /* D2ON36 = 2**36 */ 117 0x7fd0000000000000ULL, /* D2ON1022 = 2**1022 */ 118 0x3cb0000000000000ULL, /* D2ONM52 = 2**-52 */ 119 }; 120 121 #define RET_SC(I) \ 122 px += stridex; \ 123 py += stridey; \ 124 pz += stridez; \ 125 if (--n <= 0) \ 126 break; \ 127 goto start##I; 128 129 #define RETURN(I, ret) \ 130 { \ 131 pz[0] = (ret); \ 132 RET_SC(I) \ 133 } 134 135 #define PREP(I) \ 136 hx##I = HI(px); \ 137 hy##I = HI(py); \ 138 hx##I &= 0x7fffffff; \ 139 hy##I &= 0x7fffffff; \ 140 pz##I = pz; \ 141 if (hx##I >= 0x7ff00000 || hy##I >= 0x7ff00000) /* |X| or |Y| = Inf,NaN */ \ 142 { \ 143 lx = LO(px); \ 144 ly = LO(py); \ 145 x = *px; \ 146 y = *py; \ 147 if (hx##I == 0x7ff00000 && lx == 0) res0 = 0.0; /* |X| = Inf */ \ 148 else if (hy##I == 0x7ff00000 && ly == 0) res0 = 0.0; /* |Y| = Inf */ \ 149 else res0 = fabs(x) + fabs(y); \ 150 \ 151 RETURN (I, res0) \ 152 } \ 153 x##I = *px; \ 154 y##I = *py; \ 155 diff0 = hy##I - hx##I; \ 156 j0 = diff0 >> 31; \ 157 if (hx##I < 0x00100000 && hy##I < 0x00100000) /* |X| and |Y| = subnormal or zero */ \ 158 { \ 159 lx = LO(px); \ 160 ly = LO(py); \ 161 x = x##I; \ 162 y = y##I; \ 163 \ 164 if ((hx##I | hy##I | lx | ly) == 0) /* |X| and |Y| = 0 */ \ 165 RETURN (I, DONE / 0.0) \ 166 \ 167 x = fabs(x); \ 168 y = fabs(y); \ 169 \ 170 x = *(long long*)&x; \ 171 y = *(long long*)&y; \ 172 \ 173 x *= D2ONM52; \ 174 y *= D2ONM52; \ 175 \ 176 x_hi0 = (x + D2ON36) - D2ON36; \ 177 y_hi0 = (y + D2ON36) - D2ON36; \ 178 x_lo0 = x - x_hi0; \ 179 y_lo0 = y - y_hi0; \ 180 res0_hi = (x_hi0 * x_hi0 + y_hi0 * y_hi0); \ 181 res0_lo = ((x + x_hi0) * x_lo0 + (y + y_hi0) * y_lo0); \ 182 \ 183 dres0 = res0_hi + res0_lo; \ 184 \ 185 iarr0 = HI(&dres0); \ 186 iexp0 = iarr0 & 0xfff00000; \ 187 \ 188 iarr0 = (iarr0 >> 11) & 0x1fc; \ 189 itbl0 = ((int*)((char*)__vlibm_TBL_rhypot + iarr0))[0]; \ 190 itbl0 -= iexp0; \ 191 HI(&dd0) = itbl0; \ 192 LO(&dd0) = 0; \ 193 \ 194 dd0 = dd0 * (DTWO - dd0 * dres0); \ 195 dd0 = dd0 * (DTWO - dd0 * dres0); \ 196 dres0 = dd0 * (DTWO - dd0 * dres0); \ 197 \ 198 HI(&res0) = HI(&dres0) & 0xffffff00; \ 199 LO(&res0) = 0; \ 200 res0 += (DONE - res0_hi * res0 - res0_lo * res0) * dres0; \ 201 res0 = sqrt (res0); \ 202 \ 203 res0 = D2ON1022 * res0; \ 204 RETURN (I, res0) \ 205 } \ 206 j0 = hy##I - (diff0 & j0); \ 207 j0 &= 0x7ff00000; \ 208 HI(&scl##I) = 0x7ff00000 - j0; 209 210 void 211 __vrhypot(int n, double * restrict px, int stridex, double * restrict py, 212 int stridey, double * restrict pz, int stridez) 213 { 214 int i = 0; 215 double x, y; 216 double x_hi0, x_lo0, y_hi0, y_lo0, scl0 = 0; 217 double x0, y0, res0, dd0; 218 double res0_hi,res0_lo, dres0; 219 double x_hi1, x_lo1, y_hi1, y_lo1, scl1 = 0; 220 double x1 = 0.0L, y1 = 0.0L, res1, dd1; 221 double res1_hi,res1_lo, dres1; 222 double x_hi2, x_lo2, y_hi2, y_lo2, scl2 = 0; 223 double x2, y2, res2, dd2; 224 double res2_hi,res2_lo, dres2; 225 226 int hx0, hy0, j0, diff0; 227 int iarr0, iexp0, itbl0; 228 int hx1, hy1; 229 int iarr1, iexp1, itbl1; 230 int hx2, hy2; 231 int iarr2, iexp2, itbl2; 232 233 int lx, ly; 234 235 double DONE = ((double*)LCONST)[0]; 236 double DTWO = ((double*)LCONST)[1]; 237 double D2ON36 = ((double*)LCONST)[2]; 238 double D2ON1022 = ((double*)LCONST)[3]; 239 double D2ONM52 = ((double*)LCONST)[4]; 240 241 double *pz0, *pz1 = 0, *pz2; 242 243 do 244 { 245 start0: 246 PREP(0) 247 px += stridex; 248 py += stridey; 249 pz += stridez; 250 i = 1; 251 if (--n <= 0) 252 break; 253 254 start1: 255 PREP(1) 256 px += stridex; 257 py += stridey; 258 pz += stridez; 259 i = 2; 260 if (--n <= 0) 261 break; 262 263 start2: 264 PREP(2) 265 266 x0 *= scl0; 267 y0 *= scl0; 268 x1 *= scl1; 269 y1 *= scl1; 270 x2 *= scl2; 271 y2 *= scl2; 272 273 x_hi0 = (x0 + D2ON36) - D2ON36; 274 y_hi0 = (y0 + D2ON36) - D2ON36; 275 x_hi1 = (x1 + D2ON36) - D2ON36; 276 y_hi1 = (y1 + D2ON36) - D2ON36; 277 x_hi2 = (x2 + D2ON36) - D2ON36; 278 y_hi2 = (y2 + D2ON36) - D2ON36; 279 x_lo0 = x0 - x_hi0; 280 y_lo0 = y0 - y_hi0; 281 x_lo1 = x1 - x_hi1; 282 y_lo1 = y1 - y_hi1; 283 x_lo2 = x2 - x_hi2; 284 y_lo2 = y2 - y_hi2; 285 res0_hi = (x_hi0 * x_hi0 + y_hi0 * y_hi0); 286 res1_hi = (x_hi1 * x_hi1 + y_hi1 * y_hi1); 287 res2_hi = (x_hi2 * x_hi2 + y_hi2 * y_hi2); 288 res0_lo = ((x0 + x_hi0) * x_lo0 + (y0 + y_hi0) * y_lo0); 289 res1_lo = ((x1 + x_hi1) * x_lo1 + (y1 + y_hi1) * y_lo1); 290 res2_lo = ((x2 + x_hi2) * x_lo2 + (y2 + y_hi2) * y_lo2); 291 292 dres0 = res0_hi + res0_lo; 293 dres1 = res1_hi + res1_lo; 294 dres2 = res2_hi + res2_lo; 295 296 iarr0 = HI(&dres0); 297 iarr1 = HI(&dres1); 298 iarr2 = HI(&dres2); 299 iexp0 = iarr0 & 0xfff00000; 300 iexp1 = iarr1 & 0xfff00000; 301 iexp2 = iarr2 & 0xfff00000; 302 303 iarr0 = (iarr0 >> 11) & 0x1fc; 304 iarr1 = (iarr1 >> 11) & 0x1fc; 305 iarr2 = (iarr2 >> 11) & 0x1fc; 306 itbl0 = ((int*)((char*)__vlibm_TBL_rhypot + iarr0))[0]; 307 itbl1 = ((int*)((char*)__vlibm_TBL_rhypot + iarr1))[0]; 308 itbl2 = ((int*)((char*)__vlibm_TBL_rhypot + iarr2))[0]; 309 itbl0 -= iexp0; 310 itbl1 -= iexp1; 311 itbl2 -= iexp2; 312 HI(&dd0) = itbl0; 313 HI(&dd1) = itbl1; 314 HI(&dd2) = itbl2; 315 LO(&dd0) = 0; 316 LO(&dd1) = 0; 317 LO(&dd2) = 0; 318 319 dd0 = dd0 * (DTWO - dd0 * dres0); 320 dd1 = dd1 * (DTWO - dd1 * dres1); 321 dd2 = dd2 * (DTWO - dd2 * dres2); 322 dd0 = dd0 * (DTWO - dd0 * dres0); 323 dd1 = dd1 * (DTWO - dd1 * dres1); 324 dd2 = dd2 * (DTWO - dd2 * dres2); 325 dres0 = dd0 * (DTWO - dd0 * dres0); 326 dres1 = dd1 * (DTWO - dd1 * dres1); 327 dres2 = dd2 * (DTWO - dd2 * dres2); 328 329 HI(&res0) = HI(&dres0) & 0xffffff00; 330 HI(&res1) = HI(&dres1) & 0xffffff00; 331 HI(&res2) = HI(&dres2) & 0xffffff00; 332 LO(&res0) = 0; 333 LO(&res1) = 0; 334 LO(&res2) = 0; 335 res0 += (DONE - res0_hi * res0 - res0_lo * res0) * dres0; 336 res1 += (DONE - res1_hi * res1 - res1_lo * res1) * dres1; 337 res2 += (DONE - res2_hi * res2 - res2_lo * res2) * dres2; 338 res0 = sqrt (res0); 339 res1 = sqrt (res1); 340 res2 = sqrt (res2); 341 342 res0 = scl0 * res0; 343 res1 = scl1 * res1; 344 res2 = scl2 * res2; 345 346 *pz0 = res0; 347 *pz1 = res1; 348 *pz2 = res2; 349 350 px += stridex; 351 py += stridey; 352 pz += stridez; 353 i = 0; 354 355 } while (--n > 0); 356 357 if (i > 0) 358 { 359 x0 *= scl0; 360 y0 *= scl0; 361 362 x_hi0 = (x0 + D2ON36) - D2ON36; 363 y_hi0 = (y0 + D2ON36) - D2ON36; 364 x_lo0 = x0 - x_hi0; 365 y_lo0 = y0 - y_hi0; 366 res0_hi = (x_hi0 * x_hi0 + y_hi0 * y_hi0); 367 res0_lo = ((x0 + x_hi0) * x_lo0 + (y0 + y_hi0) * y_lo0); 368 369 dres0 = res0_hi + res0_lo; 370 371 iarr0 = HI(&dres0); 372 iexp0 = iarr0 & 0xfff00000; 373 374 iarr0 = (iarr0 >> 11) & 0x1fc; 375 itbl0 = ((int*)((char*)__vlibm_TBL_rhypot + iarr0))[0]; 376 itbl0 -= iexp0; 377 HI(&dd0) = itbl0; 378 LO(&dd0) = 0; 379 380 dd0 = dd0 * (DTWO - dd0 * dres0); 381 dd0 = dd0 * (DTWO - dd0 * dres0); 382 dres0 = dd0 * (DTWO - dd0 * dres0); 383 384 HI(&res0) = HI(&dres0) & 0xffffff00; 385 LO(&res0) = 0; 386 res0 += (DONE - res0_hi * res0 - res0_lo * res0) * dres0; 387 res0 = sqrt (res0); 388 389 res0 = scl0 * res0; 390 391 *pz0 = res0; 392 393 if (i > 1) 394 { 395 x1 *= scl1; 396 y1 *= scl1; 397 398 x_hi1 = (x1 + D2ON36) - D2ON36; 399 y_hi1 = (y1 + D2ON36) - D2ON36; 400 x_lo1 = x1 - x_hi1; 401 y_lo1 = y1 - y_hi1; 402 res1_hi = (x_hi1 * x_hi1 + y_hi1 * y_hi1); 403 res1_lo = ((x1 + x_hi1) * x_lo1 + (y1 + y_hi1) * y_lo1); 404 405 dres1 = res1_hi + res1_lo; 406 407 iarr1 = HI(&dres1); 408 iexp1 = iarr1 & 0xfff00000; 409 410 iarr1 = (iarr1 >> 11) & 0x1fc; 411 itbl1 = ((int*)((char*)__vlibm_TBL_rhypot + iarr1))[0]; 412 itbl1 -= iexp1; 413 HI(&dd1) = itbl1; 414 LO(&dd1) = 0; 415 416 dd1 = dd1 * (DTWO - dd1 * dres1); 417 dd1 = dd1 * (DTWO - dd1 * dres1); 418 dres1 = dd1 * (DTWO - dd1 * dres1); 419 420 HI(&res1) = HI(&dres1) & 0xffffff00; 421 LO(&res1) = 0; 422 res1 += (DONE - res1_hi * res1 - res1_lo * res1) * dres1; 423 res1 = sqrt (res1); 424 425 res1 = scl1 * res1; 426 427 *pz1 = res1; 428 } 429 } 430 } 431 432