1*25c28e83SPiotr Jasiukajtis /* 2*25c28e83SPiotr Jasiukajtis * CDDL HEADER START 3*25c28e83SPiotr Jasiukajtis * 4*25c28e83SPiotr Jasiukajtis * The contents of this file are subject to the terms of the 5*25c28e83SPiotr Jasiukajtis * Common Development and Distribution License (the "License"). 6*25c28e83SPiotr Jasiukajtis * You may not use this file except in compliance with the License. 7*25c28e83SPiotr Jasiukajtis * 8*25c28e83SPiotr Jasiukajtis * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9*25c28e83SPiotr Jasiukajtis * or http://www.opensolaris.org/os/licensing. 10*25c28e83SPiotr Jasiukajtis * See the License for the specific language governing permissions 11*25c28e83SPiotr Jasiukajtis * and limitations under the License. 12*25c28e83SPiotr Jasiukajtis * 13*25c28e83SPiotr Jasiukajtis * When distributing Covered Code, include this CDDL HEADER in each 14*25c28e83SPiotr Jasiukajtis * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15*25c28e83SPiotr Jasiukajtis * If applicable, add the following below this CDDL HEADER, with the 16*25c28e83SPiotr Jasiukajtis * fields enclosed by brackets "[]" replaced with your own identifying 17*25c28e83SPiotr Jasiukajtis * information: Portions Copyright [yyyy] [name of copyright owner] 18*25c28e83SPiotr Jasiukajtis * 19*25c28e83SPiotr Jasiukajtis * CDDL HEADER END 20*25c28e83SPiotr Jasiukajtis */ 21*25c28e83SPiotr Jasiukajtis 22*25c28e83SPiotr Jasiukajtis /* 23*25c28e83SPiotr Jasiukajtis * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 24*25c28e83SPiotr Jasiukajtis */ 25*25c28e83SPiotr Jasiukajtis /* 26*25c28e83SPiotr Jasiukajtis * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 27*25c28e83SPiotr Jasiukajtis * Use is subject to license terms. 28*25c28e83SPiotr Jasiukajtis */ 29*25c28e83SPiotr Jasiukajtis 30*25c28e83SPiotr Jasiukajtis #pragma weak fmaf = __fmaf 31*25c28e83SPiotr Jasiukajtis 32*25c28e83SPiotr Jasiukajtis #include "libm.h" 33*25c28e83SPiotr Jasiukajtis #include "fma.h" 34*25c28e83SPiotr Jasiukajtis #include "fenv_inlines.h" 35*25c28e83SPiotr Jasiukajtis 36*25c28e83SPiotr Jasiukajtis #if defined(__sparc) 37*25c28e83SPiotr Jasiukajtis 38*25c28e83SPiotr Jasiukajtis /* 39*25c28e83SPiotr Jasiukajtis * fmaf for SPARC: 32-bit single precision, big-endian 40*25c28e83SPiotr Jasiukajtis */ 41*25c28e83SPiotr Jasiukajtis float 42*25c28e83SPiotr Jasiukajtis __fmaf(float x, float y, float z) { 43*25c28e83SPiotr Jasiukajtis union { 44*25c28e83SPiotr Jasiukajtis unsigned i[2]; 45*25c28e83SPiotr Jasiukajtis double d; 46*25c28e83SPiotr Jasiukajtis } xy, zz; 47*25c28e83SPiotr Jasiukajtis unsigned u, s; 48*25c28e83SPiotr Jasiukajtis int exy, ez; 49*25c28e83SPiotr Jasiukajtis 50*25c28e83SPiotr Jasiukajtis /* 51*25c28e83SPiotr Jasiukajtis * the following operations can only raise the invalid exception, 52*25c28e83SPiotr Jasiukajtis * and then only if either x*y is of the form Inf*0 or one of x, 53*25c28e83SPiotr Jasiukajtis * y, or z is a signaling NaN 54*25c28e83SPiotr Jasiukajtis */ 55*25c28e83SPiotr Jasiukajtis xy.d = (double) x * y; 56*25c28e83SPiotr Jasiukajtis zz.d = (double) z; 57*25c28e83SPiotr Jasiukajtis 58*25c28e83SPiotr Jasiukajtis /* 59*25c28e83SPiotr Jasiukajtis * if the sum xy + z will be exact, just compute it and cast the 60*25c28e83SPiotr Jasiukajtis * result to float 61*25c28e83SPiotr Jasiukajtis */ 62*25c28e83SPiotr Jasiukajtis exy = (xy.i[0] >> 20) & 0x7ff; 63*25c28e83SPiotr Jasiukajtis ez = (zz.i[0] >> 20) & 0x7ff; 64*25c28e83SPiotr Jasiukajtis if ((ez - exy <= 4 && exy - ez <= 28) || exy == 0x7ff || exy == 0 || 65*25c28e83SPiotr Jasiukajtis ez == 0x7ff || ez == 0) { 66*25c28e83SPiotr Jasiukajtis return ((float) (xy.d + zz.d)); 67*25c28e83SPiotr Jasiukajtis } 68*25c28e83SPiotr Jasiukajtis 69*25c28e83SPiotr Jasiukajtis /* 70*25c28e83SPiotr Jasiukajtis * collapse the tail of the smaller summand into a "sticky bit" 71*25c28e83SPiotr Jasiukajtis * so that the sum can be computed without error 72*25c28e83SPiotr Jasiukajtis */ 73*25c28e83SPiotr Jasiukajtis if (ez > exy) { 74*25c28e83SPiotr Jasiukajtis if (ez - exy < 31) { 75*25c28e83SPiotr Jasiukajtis u = xy.i[1]; 76*25c28e83SPiotr Jasiukajtis s = 2 << (ez - exy); 77*25c28e83SPiotr Jasiukajtis if (u & (s - 1)) 78*25c28e83SPiotr Jasiukajtis u |= s; 79*25c28e83SPiotr Jasiukajtis xy.i[1] = u & ~(s - 1); 80*25c28e83SPiotr Jasiukajtis } else if (ez - exy < 51) { 81*25c28e83SPiotr Jasiukajtis u = xy.i[0]; 82*25c28e83SPiotr Jasiukajtis s = 1 << (ez - exy - 31); 83*25c28e83SPiotr Jasiukajtis if ((u & (s - 1)) | xy.i[1]) 84*25c28e83SPiotr Jasiukajtis u |= s; 85*25c28e83SPiotr Jasiukajtis xy.i[0] = u & ~(s - 1); 86*25c28e83SPiotr Jasiukajtis xy.i[1] = 0; 87*25c28e83SPiotr Jasiukajtis } else { 88*25c28e83SPiotr Jasiukajtis /* collapse all of xy into a single bit */ 89*25c28e83SPiotr Jasiukajtis xy.i[0] = (xy.i[0] & 0x80000000) | ((ez - 51) << 20); 90*25c28e83SPiotr Jasiukajtis xy.i[1] = 0; 91*25c28e83SPiotr Jasiukajtis } 92*25c28e83SPiotr Jasiukajtis } else { 93*25c28e83SPiotr Jasiukajtis if (exy - ez < 31) { 94*25c28e83SPiotr Jasiukajtis u = zz.i[1]; 95*25c28e83SPiotr Jasiukajtis s = 2 << (exy - ez); 96*25c28e83SPiotr Jasiukajtis if (u & (s - 1)) 97*25c28e83SPiotr Jasiukajtis u |= s; 98*25c28e83SPiotr Jasiukajtis zz.i[1] = u & ~(s - 1); 99*25c28e83SPiotr Jasiukajtis } else if (exy - ez < 51) { 100*25c28e83SPiotr Jasiukajtis u = zz.i[0]; 101*25c28e83SPiotr Jasiukajtis s = 1 << (exy - ez - 31); 102*25c28e83SPiotr Jasiukajtis if ((u & (s - 1)) | zz.i[1]) 103*25c28e83SPiotr Jasiukajtis u |= s; 104*25c28e83SPiotr Jasiukajtis zz.i[0] = u & ~(s - 1); 105*25c28e83SPiotr Jasiukajtis zz.i[1] = 0; 106*25c28e83SPiotr Jasiukajtis } else { 107*25c28e83SPiotr Jasiukajtis /* collapse all of zz into a single bit */ 108*25c28e83SPiotr Jasiukajtis zz.i[0] = (zz.i[0] & 0x80000000) | ((exy - 51) << 20); 109*25c28e83SPiotr Jasiukajtis zz.i[1] = 0; 110*25c28e83SPiotr Jasiukajtis } 111*25c28e83SPiotr Jasiukajtis } 112*25c28e83SPiotr Jasiukajtis 113*25c28e83SPiotr Jasiukajtis return ((float) (xy.d + zz.d)); 114*25c28e83SPiotr Jasiukajtis } 115*25c28e83SPiotr Jasiukajtis 116*25c28e83SPiotr Jasiukajtis #elif defined(__x86) 117*25c28e83SPiotr Jasiukajtis 118*25c28e83SPiotr Jasiukajtis #if defined(__amd64) 119*25c28e83SPiotr Jasiukajtis #define NI 4 120*25c28e83SPiotr Jasiukajtis #else 121*25c28e83SPiotr Jasiukajtis #define NI 3 122*25c28e83SPiotr Jasiukajtis #endif 123*25c28e83SPiotr Jasiukajtis 124*25c28e83SPiotr Jasiukajtis /* 125*25c28e83SPiotr Jasiukajtis * fmaf for x86: 32-bit single precision, little-endian 126*25c28e83SPiotr Jasiukajtis */ 127*25c28e83SPiotr Jasiukajtis float 128*25c28e83SPiotr Jasiukajtis __fmaf(float x, float y, float z) { 129*25c28e83SPiotr Jasiukajtis union { 130*25c28e83SPiotr Jasiukajtis unsigned i[NI]; 131*25c28e83SPiotr Jasiukajtis long double e; 132*25c28e83SPiotr Jasiukajtis } xy, zz; 133*25c28e83SPiotr Jasiukajtis unsigned u, s, cwsw, oldcwsw; 134*25c28e83SPiotr Jasiukajtis int exy, ez; 135*25c28e83SPiotr Jasiukajtis 136*25c28e83SPiotr Jasiukajtis /* set rounding precision to 64 bits */ 137*25c28e83SPiotr Jasiukajtis __fenv_getcwsw(&oldcwsw); 138*25c28e83SPiotr Jasiukajtis cwsw = (oldcwsw & 0xfcffffff) | 0x03000000; 139*25c28e83SPiotr Jasiukajtis __fenv_setcwsw(&cwsw); 140*25c28e83SPiotr Jasiukajtis 141*25c28e83SPiotr Jasiukajtis /* 142*25c28e83SPiotr Jasiukajtis * the following operations can only raise the invalid exception, 143*25c28e83SPiotr Jasiukajtis * and then only if either x*y is of the form Inf*0 or one of x, 144*25c28e83SPiotr Jasiukajtis * y, or z is a signaling NaN 145*25c28e83SPiotr Jasiukajtis */ 146*25c28e83SPiotr Jasiukajtis xy.e = (long double) x * y; 147*25c28e83SPiotr Jasiukajtis zz.e = (long double) z; 148*25c28e83SPiotr Jasiukajtis 149*25c28e83SPiotr Jasiukajtis /* 150*25c28e83SPiotr Jasiukajtis * if the sum xy + z will be exact, just compute it and cast the 151*25c28e83SPiotr Jasiukajtis * result to float 152*25c28e83SPiotr Jasiukajtis */ 153*25c28e83SPiotr Jasiukajtis exy = xy.i[2] & 0x7fff; 154*25c28e83SPiotr Jasiukajtis ez = zz.i[2] & 0x7fff; 155*25c28e83SPiotr Jasiukajtis if ((ez - exy <= 15 && exy - ez <= 39) || exy == 0x7fff || exy == 0 || 156*25c28e83SPiotr Jasiukajtis ez == 0x7fff || ez == 0) { 157*25c28e83SPiotr Jasiukajtis goto cont; 158*25c28e83SPiotr Jasiukajtis } 159*25c28e83SPiotr Jasiukajtis 160*25c28e83SPiotr Jasiukajtis /* 161*25c28e83SPiotr Jasiukajtis * collapse the tail of the smaller summand into a "sticky bit" 162*25c28e83SPiotr Jasiukajtis * so that the sum can be computed without error 163*25c28e83SPiotr Jasiukajtis */ 164*25c28e83SPiotr Jasiukajtis if (ez > exy) { 165*25c28e83SPiotr Jasiukajtis if (ez - exy < 31) { 166*25c28e83SPiotr Jasiukajtis u = xy.i[0]; 167*25c28e83SPiotr Jasiukajtis s = 2 << (ez - exy); 168*25c28e83SPiotr Jasiukajtis if (u & (s - 1)) 169*25c28e83SPiotr Jasiukajtis u |= s; 170*25c28e83SPiotr Jasiukajtis xy.i[0] = u & ~(s - 1); 171*25c28e83SPiotr Jasiukajtis } else if (ez - exy < 62) { 172*25c28e83SPiotr Jasiukajtis u = xy.i[1]; 173*25c28e83SPiotr Jasiukajtis s = 1 << (ez - exy - 31); 174*25c28e83SPiotr Jasiukajtis if ((u & (s - 1)) | xy.i[0]) 175*25c28e83SPiotr Jasiukajtis u |= s; 176*25c28e83SPiotr Jasiukajtis xy.i[1] = u & ~(s - 1); 177*25c28e83SPiotr Jasiukajtis xy.i[0] = 0; 178*25c28e83SPiotr Jasiukajtis } else { 179*25c28e83SPiotr Jasiukajtis /* collapse all of xy into a single bit */ 180*25c28e83SPiotr Jasiukajtis xy.i[0] = 0; 181*25c28e83SPiotr Jasiukajtis xy.i[1] = 0x80000000; 182*25c28e83SPiotr Jasiukajtis xy.i[2] = (xy.i[2] & 0x8000) | (ez - 62); 183*25c28e83SPiotr Jasiukajtis } 184*25c28e83SPiotr Jasiukajtis } else { 185*25c28e83SPiotr Jasiukajtis if (exy - ez < 62) { 186*25c28e83SPiotr Jasiukajtis u = zz.i[1]; 187*25c28e83SPiotr Jasiukajtis s = 1 << (exy - ez - 31); 188*25c28e83SPiotr Jasiukajtis if ((u & (s - 1)) | zz.i[0]) 189*25c28e83SPiotr Jasiukajtis u |= s; 190*25c28e83SPiotr Jasiukajtis zz.i[1] = u & ~(s - 1); 191*25c28e83SPiotr Jasiukajtis zz.i[0] = 0; 192*25c28e83SPiotr Jasiukajtis } else { 193*25c28e83SPiotr Jasiukajtis /* collapse all of zz into a single bit */ 194*25c28e83SPiotr Jasiukajtis zz.i[0] = 0; 195*25c28e83SPiotr Jasiukajtis zz.i[1] = 0x80000000; 196*25c28e83SPiotr Jasiukajtis zz.i[2] = (zz.i[2] & 0x8000) | (exy - 62); 197*25c28e83SPiotr Jasiukajtis } 198*25c28e83SPiotr Jasiukajtis } 199*25c28e83SPiotr Jasiukajtis 200*25c28e83SPiotr Jasiukajtis cont: 201*25c28e83SPiotr Jasiukajtis xy.e += zz.e; 202*25c28e83SPiotr Jasiukajtis 203*25c28e83SPiotr Jasiukajtis /* restore the rounding precision */ 204*25c28e83SPiotr Jasiukajtis __fenv_getcwsw(&cwsw); 205*25c28e83SPiotr Jasiukajtis cwsw = (cwsw & 0xfcffffff) | (oldcwsw & 0x03000000); 206*25c28e83SPiotr Jasiukajtis __fenv_setcwsw(&cwsw); 207*25c28e83SPiotr Jasiukajtis 208*25c28e83SPiotr Jasiukajtis return ((float) xy.e); 209*25c28e83SPiotr Jasiukajtis } 210*25c28e83SPiotr Jasiukajtis 211*25c28e83SPiotr Jasiukajtis #if 0 212*25c28e83SPiotr Jasiukajtis /* 213*25c28e83SPiotr Jasiukajtis * another fmaf for x86: assumes return value will be left in 214*25c28e83SPiotr Jasiukajtis * long double (80-bit double extended) precision 215*25c28e83SPiotr Jasiukajtis */ 216*25c28e83SPiotr Jasiukajtis long double 217*25c28e83SPiotr Jasiukajtis __fmaf(float x, float y, float z) { 218*25c28e83SPiotr Jasiukajtis /* 219*25c28e83SPiotr Jasiukajtis * Note: This implementation assumes the rounding precision mode 220*25c28e83SPiotr Jasiukajtis * is set to the default, rounding to 64 bit precision. If this 221*25c28e83SPiotr Jasiukajtis * routine must work in non-default rounding precision modes, do 222*25c28e83SPiotr Jasiukajtis * the following instead: 223*25c28e83SPiotr Jasiukajtis * 224*25c28e83SPiotr Jasiukajtis * long double t; 225*25c28e83SPiotr Jasiukajtis * 226*25c28e83SPiotr Jasiukajtis * <set rp mode to round to 64 bit precision> 227*25c28e83SPiotr Jasiukajtis * t = x * y; 228*25c28e83SPiotr Jasiukajtis * <restore rp mode> 229*25c28e83SPiotr Jasiukajtis * return t + z; 230*25c28e83SPiotr Jasiukajtis * 231*25c28e83SPiotr Jasiukajtis * Note that the code to change rounding precision must not alter 232*25c28e83SPiotr Jasiukajtis * the exception masks or flags, since the product x * y may raise 233*25c28e83SPiotr Jasiukajtis * an invalid operation exception. 234*25c28e83SPiotr Jasiukajtis */ 235*25c28e83SPiotr Jasiukajtis return ((long double) x * y + z); 236*25c28e83SPiotr Jasiukajtis } 237*25c28e83SPiotr Jasiukajtis #endif 238*25c28e83SPiotr Jasiukajtis 239*25c28e83SPiotr Jasiukajtis #else 240*25c28e83SPiotr Jasiukajtis #error Unknown architecture 241*25c28e83SPiotr Jasiukajtis #endif 242