1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 24 */ 25 /* 26 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 27 * Use is subject to license terms. 28 */ 29 30 #pragma weak fma = __fma 31 32 #include "libm.h" 33 #include "fma.h" 34 #include "fenv_inlines.h" 35 36 #if defined(__sparc) 37 38 static const union { 39 unsigned i[2]; 40 double d; 41 } C[] = { 42 { 0x3fe00000u, 0 }, 43 { 0x40000000u, 0 }, 44 { 0x43300000u, 0 }, 45 { 0x41a00000u, 0 }, 46 { 0x3e500000u, 0 }, 47 { 0x3df00000u, 0 }, 48 { 0x3bf00000u, 0 }, 49 { 0x7fe00000u, 0 }, 50 { 0x00100000u, 0 }, 51 { 0x00100001u, 0 } 52 }; 53 54 #define half C[0].d 55 #define two C[1].d 56 #define two52 C[2].d 57 #define two27 C[3].d 58 #define twom26 C[4].d 59 #define twom32 C[5].d 60 #define twom64 C[6].d 61 #define huge C[7].d 62 #define tiny C[8].d 63 #define tiny2 C[9].d 64 65 static const unsigned int fsr_rm = 0xc0000000u; 66 67 /* 68 * fma for SPARC: 64-bit double precision, big-endian 69 */ 70 double 71 __fma(double x, double y, double z) { 72 union { 73 unsigned i[2]; 74 double d; 75 } xx, yy, zz; 76 double xhi, yhi, xlo, ylo, t; 77 unsigned int xy0, xy1, xy2, xy3, z0, z1, z2, z3, fsr, rm, sticky; 78 int hx, hy, hz, ex, ey, ez, exy, sxy, sz, e, ibit; 79 volatile double dummy; 80 81 /* extract the high order words of the arguments */ 82 xx.d = x; 83 yy.d = y; 84 zz.d = z; 85 hx = xx.i[0] & ~0x80000000; 86 hy = yy.i[0] & ~0x80000000; 87 hz = zz.i[0] & ~0x80000000; 88 89 /* dispense with inf, nan, and zero cases */ 90 if (hx >= 0x7ff00000 || hy >= 0x7ff00000 || (hx | xx.i[1]) == 0 || 91 (hy | yy.i[1]) == 0) /* x or y is inf, nan, or zero */ 92 return (x * y + z); 93 94 if (hz >= 0x7ff00000) /* z is inf or nan */ 95 return (x + z); /* avoid spurious under/overflow in x * y */ 96 97 if ((hz | zz.i[1]) == 0) /* z is zero */ 98 /* 99 * x * y isn't zero but could underflow to zero, 100 * so don't add z, lest we perturb the sign 101 */ 102 return (x * y); 103 104 /* 105 * now x, y, and z are all finite and nonzero; save the fsr and 106 * set round-to-negative-infinity mode (and clear nonstandard 107 * mode before we try to scale subnormal operands) 108 */ 109 __fenv_getfsr32(&fsr); 110 __fenv_setfsr32(&fsr_rm); 111 112 /* extract signs and exponents, and normalize subnormals */ 113 sxy = (xx.i[0] ^ yy.i[0]) & 0x80000000; 114 sz = zz.i[0] & 0x80000000; 115 ex = hx >> 20; 116 if (!ex) { 117 xx.d = x * two52; 118 ex = ((xx.i[0] & ~0x80000000) >> 20) - 52; 119 } 120 ey = hy >> 20; 121 if (!ey) { 122 yy.d = y * two52; 123 ey = ((yy.i[0] & ~0x80000000) >> 20) - 52; 124 } 125 ez = hz >> 20; 126 if (!ez) { 127 zz.d = z * two52; 128 ez = ((zz.i[0] & ~0x80000000) >> 20) - 52; 129 } 130 131 /* multiply x*y to 106 bits */ 132 exy = ex + ey - 0x3ff; 133 xx.i[0] = (xx.i[0] & 0xfffff) | 0x3ff00000; 134 yy.i[0] = (yy.i[0] & 0xfffff) | 0x3ff00000; 135 x = xx.d; 136 y = yy.d; 137 xhi = ((x + twom26) + two27) - two27; 138 yhi = ((y + twom26) + two27) - two27; 139 xlo = x - xhi; 140 ylo = y - yhi; 141 x *= y; 142 y = ((xhi * yhi - x) + xhi * ylo + xlo * yhi) + xlo * ylo; 143 if (x >= two) { 144 x *= half; 145 y *= half; 146 exy++; 147 } 148 149 /* extract the significands */ 150 xx.d = x; 151 xy0 = (xx.i[0] & 0xfffff) | 0x100000; 152 xy1 = xx.i[1]; 153 yy.d = t = y + twom32; 154 xy2 = yy.i[1]; 155 yy.d = (y - (t - twom32)) + twom64; 156 xy3 = yy.i[1]; 157 z0 = (zz.i[0] & 0xfffff) | 0x100000; 158 z1 = zz.i[1]; 159 z2 = z3 = 0; 160 161 /* 162 * now x*y is represented by sxy, exy, and xy[0-3], and z is 163 * represented likewise; swap if need be so |xy| <= |z| 164 */ 165 if (exy > ez || (exy == ez && (xy0 > z0 || (xy0 == z0 && 166 (xy1 > z1 || (xy1 == z1 && (xy2 | xy3) != 0)))))) { 167 e = sxy; sxy = sz; sz = e; 168 e = exy; exy = ez; ez = e; 169 e = xy0; xy0 = z0; z0 = e; 170 e = xy1; xy1 = z1; z1 = e; 171 z2 = xy2; xy2 = 0; 172 z3 = xy3; xy3 = 0; 173 } 174 175 /* shift the significand of xy keeping a sticky bit */ 176 e = ez - exy; 177 if (e > 116) { 178 xy0 = xy1 = xy2 = 0; 179 xy3 = 1; 180 } else if (e >= 96) { 181 sticky = xy3 | xy2 | xy1 | ((xy0 << 1) << (127 - e)); 182 xy3 = xy0 >> (e - 96); 183 if (sticky) 184 xy3 |= 1; 185 xy0 = xy1 = xy2 = 0; 186 } else if (e >= 64) { 187 sticky = xy3 | xy2 | ((xy1 << 1) << (95 - e)); 188 xy3 = (xy1 >> (e - 64)) | ((xy0 << 1) << (95 - e)); 189 if (sticky) 190 xy3 |= 1; 191 xy2 = xy0 >> (e - 64); 192 xy0 = xy1 = 0; 193 } else if (e >= 32) { 194 sticky = xy3 | ((xy2 << 1) << (63 - e)); 195 xy3 = (xy2 >> (e - 32)) | ((xy1 << 1) << (63 - e)); 196 if (sticky) 197 xy3 |= 1; 198 xy2 = (xy1 >> (e - 32)) | ((xy0 << 1) << (63 - e)); 199 xy1 = xy0 >> (e - 32); 200 xy0 = 0; 201 } else if (e) { 202 sticky = (xy3 << 1) << (31 - e); 203 xy3 = (xy3 >> e) | ((xy2 << 1) << (31 - e)); 204 if (sticky) 205 xy3 |= 1; 206 xy2 = (xy2 >> e) | ((xy1 << 1) << (31 - e)); 207 xy1 = (xy1 >> e) | ((xy0 << 1) << (31 - e)); 208 xy0 >>= e; 209 } 210 211 /* if this is a magnitude subtract, negate the significand of xy */ 212 if (sxy ^ sz) { 213 xy0 = ~xy0; 214 xy1 = ~xy1; 215 xy2 = ~xy2; 216 xy3 = -xy3; 217 if (xy3 == 0) 218 if (++xy2 == 0) 219 if (++xy1 == 0) 220 xy0++; 221 } 222 223 /* add, propagating carries */ 224 z3 += xy3; 225 e = (z3 < xy3); 226 z2 += xy2; 227 if (e) { 228 z2++; 229 e = (z2 <= xy2); 230 } else 231 e = (z2 < xy2); 232 z1 += xy1; 233 if (e) { 234 z1++; 235 e = (z1 <= xy1); 236 } else 237 e = (z1 < xy1); 238 z0 += xy0; 239 if (e) 240 z0++; 241 242 /* postnormalize and collect rounding information into z2 */ 243 if (ez < 1) { 244 /* result is tiny; shift right until exponent is within range */ 245 e = 1 - ez; 246 if (e > 56) { 247 z2 = 1; /* result can't be exactly zero */ 248 z0 = z1 = 0; 249 } else if (e >= 32) { 250 sticky = z3 | z2 | ((z1 << 1) << (63 - e)); 251 z2 = (z1 >> (e - 32)) | ((z0 << 1) << (63 - e)); 252 if (sticky) 253 z2 |= 1; 254 z1 = z0 >> (e - 32); 255 z0 = 0; 256 } else { 257 sticky = z3 | (z2 << 1) << (31 - e); 258 z2 = (z2 >> e) | ((z1 << 1) << (31 - e)); 259 if (sticky) 260 z2 |= 1; 261 z1 = (z1 >> e) | ((z0 << 1) << (31 - e)); 262 z0 >>= e; 263 } 264 ez = 1; 265 } else if (z0 >= 0x200000) { 266 /* carry out; shift right by one */ 267 sticky = (z2 & 1) | z3; 268 z2 = (z2 >> 1) | (z1 << 31); 269 if (sticky) 270 z2 |= 1; 271 z1 = (z1 >> 1) | (z0 << 31); 272 z0 >>= 1; 273 ez++; 274 } else { 275 if (z0 < 0x100000 && (z0 | z1 | z2 | z3) != 0) { 276 /* 277 * borrow/cancellation; shift left as much as 278 * exponent allows 279 */ 280 while (!(z0 | (z1 & 0xffe00000)) && ez >= 33) { 281 z0 = z1; 282 z1 = z2; 283 z2 = z3; 284 z3 = 0; 285 ez -= 32; 286 } 287 while (z0 < 0x100000 && ez > 1) { 288 z0 = (z0 << 1) | (z1 >> 31); 289 z1 = (z1 << 1) | (z2 >> 31); 290 z2 = (z2 << 1) | (z3 >> 31); 291 z3 <<= 1; 292 ez--; 293 } 294 } 295 if (z3) 296 z2 |= 1; 297 } 298 299 /* get the rounding mode and clear current exceptions */ 300 rm = fsr >> 30; 301 fsr &= ~FSR_CEXC; 302 303 /* strip off the integer bit, if there is one */ 304 ibit = z0 & 0x100000; 305 if (ibit) 306 z0 -= 0x100000; 307 else { 308 ez = 0; 309 if (!(z0 | z1 | z2)) { /* exact zero */ 310 zz.i[0] = rm == FSR_RM ? 0x80000000 : 0; 311 zz.i[1] = 0; 312 __fenv_setfsr32(&fsr); 313 return (zz.d); 314 } 315 } 316 317 /* 318 * flip the sense of directed roundings if the result is negative; 319 * the logic below applies to a positive result 320 */ 321 if (sz) 322 rm ^= rm >> 1; 323 324 /* round and raise exceptions */ 325 if (z2) { 326 fsr |= FSR_NXC; 327 328 /* decide whether to round the fraction up */ 329 if (rm == FSR_RP || (rm == FSR_RN && (z2 > 0x80000000u || 330 (z2 == 0x80000000u && (z1 & 1))))) { 331 /* round up and renormalize if necessary */ 332 if (++z1 == 0) { 333 if (++z0 == 0x100000) { 334 z0 = 0; 335 ez++; 336 } 337 } 338 } 339 } 340 341 /* check for under/overflow */ 342 if (ez >= 0x7ff) { 343 if (rm == FSR_RN || rm == FSR_RP) { 344 zz.i[0] = sz | 0x7ff00000; 345 zz.i[1] = 0; 346 } else { 347 zz.i[0] = sz | 0x7fefffff; 348 zz.i[1] = 0xffffffff; 349 } 350 fsr |= FSR_OFC | FSR_NXC; 351 } else { 352 zz.i[0] = sz | (ez << 20) | z0; 353 zz.i[1] = z1; 354 355 /* 356 * !ibit => exact result was tiny before rounding, 357 * z2 nonzero => result delivered is inexact 358 */ 359 if (!ibit) { 360 if (z2) 361 fsr |= FSR_UFC | FSR_NXC; 362 else if (fsr & FSR_UFM) 363 fsr |= FSR_UFC; 364 } 365 } 366 367 /* restore the fsr and emulate exceptions as needed */ 368 if ((fsr & FSR_CEXC) & (fsr >> 23)) { 369 __fenv_setfsr32(&fsr); 370 if (fsr & FSR_OFC) { 371 dummy = huge; 372 dummy *= huge; 373 } else if (fsr & FSR_UFC) { 374 dummy = tiny; 375 if (fsr & FSR_NXC) 376 dummy *= tiny; 377 else 378 dummy -= tiny2; 379 } else { 380 dummy = huge; 381 dummy += tiny; 382 } 383 } else { 384 fsr |= (fsr & 0x1f) << 5; 385 __fenv_setfsr32(&fsr); 386 } 387 return (zz.d); 388 } 389 390 #elif defined(__x86) 391 392 #if defined(__amd64) 393 #define NI 4 394 #else 395 #define NI 3 396 #endif 397 398 /* 399 * fma for x86: 64-bit double precision, little-endian 400 */ 401 double 402 __fma(double x, double y, double z) { 403 union { 404 unsigned i[NI]; 405 long double e; 406 } xx, yy, zz; 407 long double xe, ye, xhi, xlo, yhi, ylo; 408 int ex, ey, ez; 409 unsigned cwsw, oldcwsw, rm; 410 411 /* convert the operands to double extended */ 412 xx.e = (long double) x; 413 yy.e = (long double) y; 414 zz.e = (long double) z; 415 416 /* extract the exponents of the arguments */ 417 ex = xx.i[2] & 0x7fff; 418 ey = yy.i[2] & 0x7fff; 419 ez = zz.i[2] & 0x7fff; 420 421 /* dispense with inf, nan, and zero cases */ 422 if (ex == 0x7fff || ey == 0x7fff || ex == 0 || ey == 0) 423 /* x or y is inf, nan, or zero */ 424 return ((double) (xx.e * yy.e + zz.e)); 425 426 if (ez >= 0x7fff) /* z is inf or nan */ 427 return ((double) (xx.e + zz.e)); 428 /* avoid spurious inexact in x * y */ 429 430 /* 431 * save the control and status words, mask all exceptions, and 432 * set rounding to 64-bit precision and to-nearest 433 */ 434 __fenv_getcwsw(&oldcwsw); 435 cwsw = (oldcwsw & 0xf0c0ffff) | 0x033f0000; 436 __fenv_setcwsw(&cwsw); 437 438 /* multiply x*y to 106 bits */ 439 xe = xx.e; 440 xx.i[0] = 0; 441 xhi = xx.e; /* hi 32 bits */ 442 xlo = xe - xhi; /* lo 21 bits */ 443 ye = yy.e; 444 yy.i[0] = 0; 445 yhi = yy.e; 446 ylo = ye - yhi; 447 xe = xe * ye; 448 ye = ((xhi * yhi - xe) + xhi * ylo + xlo * yhi) + xlo * ylo; 449 450 /* distill the sum of xe, ye, and z */ 451 xhi = ye + zz.e; 452 yhi = xhi - ye; 453 xlo = (zz.e - yhi) + (ye - (xhi - yhi)); 454 /* now (xhi,xlo) = ye + z */ 455 456 yhi = xe + xhi; 457 ye = yhi - xe; 458 ylo = (xhi - ye) + (xe - (yhi - ye)); /* now (yhi,ylo) = xe + xhi */ 459 460 xhi = xlo + ylo; 461 xe = xhi - xlo; 462 xlo = (ylo - xe) + (xlo - (xhi - xe)); /* now (xhi,xlo) = xlo + ylo */ 463 464 yy.e = yhi + xhi; 465 ylo = (yhi - yy.e) + xhi; /* now (yy.e,ylo) = xhi + yhi */ 466 467 if (yy.i[1] != 0) { /* yy.e is nonzero */ 468 /* perturb yy.e if its least significant 10 bits are zero */ 469 if (!(yy.i[0] & 0x3ff)) { 470 xx.e = ylo + xlo; 471 if (xx.i[1] != 0) { 472 xx.i[2] = (xx.i[2] & 0x8000) | 473 ((yy.i[2] & 0x7fff) - 63); 474 xx.i[1] = 0x80000000; 475 xx.i[0] = 0; 476 yy.e += xx.e; 477 } 478 } 479 } else { 480 /* set sign of zero result according to rounding direction */ 481 rm = oldcwsw & 0x0c000000; 482 yy.i[2] = ((rm == FCW_RM)? 0x8000 : 0); 483 } 484 485 /* 486 * restore the control and status words and convert the result 487 * to double 488 */ 489 __fenv_setcwsw(&oldcwsw); 490 return ((double) yy.e); 491 } 492 493 #else 494 #error Unknown architecture 495 #endif 496