1 /* 2 * Double-precision erf(x) function. 3 * 4 * Copyright (c) 2020, Arm Limited. 5 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception 6 */ 7 8 #include "math_config.h" 9 #include <math.h> 10 #include <stdint.h> 11 12 #define TwoOverSqrtPiMinusOne 0x1.06eba8214db69p-3 13 #define C 0x1.b0ac16p-1 14 #define PA __erf_data.erf_poly_A 15 #define NA __erf_data.erf_ratio_N_A 16 #define DA __erf_data.erf_ratio_D_A 17 #define NB __erf_data.erf_ratio_N_B 18 #define DB __erf_data.erf_ratio_D_B 19 #define PC __erf_data.erfc_poly_C 20 #define PD __erf_data.erfc_poly_D 21 #define PE __erf_data.erfc_poly_E 22 #define PF __erf_data.erfc_poly_F 23 24 /* Top 32 bits of a double. */ 25 static inline uint32_t 26 top32 (double x) 27 { 28 return asuint64 (x) >> 32; 29 } 30 31 /* Fast erf implementation using a mix of 32 rational and polynomial approximations. 33 Highest measured error is 1.01 ULPs at 0x1.39956ac43382fp+0. */ 34 double 35 erf (double x) 36 { 37 /* Get top word and sign. */ 38 uint32_t ix = top32 (x); 39 uint32_t ia = ix & 0x7fffffff; 40 uint32_t sign = ix >> 31; 41 42 /* Normalized and subnormal cases */ 43 if (ia < 0x3feb0000) 44 { /* a = |x| < 0.84375. */ 45 46 if (ia < 0x3e300000) 47 { /* a < 2^(-28). */ 48 if (ia < 0x00800000) 49 { /* a < 2^(-1015). */ 50 double y = fma (TwoOverSqrtPiMinusOne, x, x); 51 return check_uflow (y); 52 } 53 return x + TwoOverSqrtPiMinusOne * x; 54 } 55 56 double x2 = x * x; 57 58 if (ia < 0x3fe00000) 59 { /* a < 0.5 - Use polynomial approximation. */ 60 double r1 = fma (x2, PA[1], PA[0]); 61 double r2 = fma (x2, PA[3], PA[2]); 62 double r3 = fma (x2, PA[5], PA[4]); 63 double r4 = fma (x2, PA[7], PA[6]); 64 double r5 = fma (x2, PA[9], PA[8]); 65 double x4 = x2 * x2; 66 double r = r5; 67 r = fma (x4, r, r4); 68 r = fma (x4, r, r3); 69 r = fma (x4, r, r2); 70 r = fma (x4, r, r1); 71 return fma (r, x, x); /* This fma is crucial for accuracy. */ 72 } 73 else 74 { /* 0.5 <= a < 0.84375 - Use rational approximation. */ 75 double x4, x8, r1n, r2n, r1d, r2d, r3d; 76 77 r1n = fma (x2, NA[1], NA[0]); 78 x4 = x2 * x2; 79 r2n = fma (x2, NA[3], NA[2]); 80 x8 = x4 * x4; 81 r1d = fma (x2, DA[0], 1.0); 82 r2d = fma (x2, DA[2], DA[1]); 83 r3d = fma (x2, DA[4], DA[3]); 84 double P = r1n + x4 * r2n + x8 * NA[4]; 85 double Q = r1d + x4 * r2d + x8 * r3d; 86 return fma (P / Q, x, x); 87 } 88 } 89 else if (ia < 0x3ff40000) 90 { /* 0.84375 <= |x| < 1.25. */ 91 double a2, a4, a6, r1n, r2n, r3n, r4n, r1d, r2d, r3d, r4d; 92 double a = fabs (x) - 1.0; 93 r1n = fma (a, NB[1], NB[0]); 94 a2 = a * a; 95 r1d = fma (a, DB[0], 1.0); 96 a4 = a2 * a2; 97 r2n = fma (a, NB[3], NB[2]); 98 a6 = a4 * a2; 99 r2d = fma (a, DB[2], DB[1]); 100 r3n = fma (a, NB[5], NB[4]); 101 r3d = fma (a, DB[4], DB[3]); 102 r4n = NB[6]; 103 r4d = DB[5]; 104 double P = r1n + a2 * r2n + a4 * r3n + a6 * r4n; 105 double Q = r1d + a2 * r2d + a4 * r3d + a6 * r4d; 106 if (sign) 107 return -C - P / Q; 108 else 109 return C + P / Q; 110 } 111 else if (ia < 0x40000000) 112 { /* 1.25 <= |x| < 2.0. */ 113 double a = fabs (x); 114 a = a - 1.25; 115 116 double r1 = fma (a, PC[1], PC[0]); 117 double r2 = fma (a, PC[3], PC[2]); 118 double r3 = fma (a, PC[5], PC[4]); 119 double r4 = fma (a, PC[7], PC[6]); 120 double r5 = fma (a, PC[9], PC[8]); 121 double r6 = fma (a, PC[11], PC[10]); 122 double r7 = fma (a, PC[13], PC[12]); 123 double r8 = fma (a, PC[15], PC[14]); 124 125 double a2 = a * a; 126 127 double r = r8; 128 r = fma (a2, r, r7); 129 r = fma (a2, r, r6); 130 r = fma (a2, r, r5); 131 r = fma (a2, r, r4); 132 r = fma (a2, r, r3); 133 r = fma (a2, r, r2); 134 r = fma (a2, r, r1); 135 136 if (sign) 137 return -1.0 + r; 138 else 139 return 1.0 - r; 140 } 141 else if (ia < 0x400a0000) 142 { /* 2 <= |x| < 3.25. */ 143 double a = fabs (x); 144 a = fma (0.5, a, -1.0); 145 146 double r1 = fma (a, PD[1], PD[0]); 147 double r2 = fma (a, PD[3], PD[2]); 148 double r3 = fma (a, PD[5], PD[4]); 149 double r4 = fma (a, PD[7], PD[6]); 150 double r5 = fma (a, PD[9], PD[8]); 151 double r6 = fma (a, PD[11], PD[10]); 152 double r7 = fma (a, PD[13], PD[12]); 153 double r8 = fma (a, PD[15], PD[14]); 154 double r9 = fma (a, PD[17], PD[16]); 155 156 double a2 = a * a; 157 158 double r = r9; 159 r = fma (a2, r, r8); 160 r = fma (a2, r, r7); 161 r = fma (a2, r, r6); 162 r = fma (a2, r, r5); 163 r = fma (a2, r, r4); 164 r = fma (a2, r, r3); 165 r = fma (a2, r, r2); 166 r = fma (a2, r, r1); 167 168 if (sign) 169 return -1.0 + r; 170 else 171 return 1.0 - r; 172 } 173 else if (ia < 0x40100000) 174 { /* 3.25 <= |x| < 4.0. */ 175 double a = fabs (x); 176 a = a - 3.25; 177 178 double r1 = fma (a, PE[1], PE[0]); 179 double r2 = fma (a, PE[3], PE[2]); 180 double r3 = fma (a, PE[5], PE[4]); 181 double r4 = fma (a, PE[7], PE[6]); 182 double r5 = fma (a, PE[9], PE[8]); 183 double r6 = fma (a, PE[11], PE[10]); 184 double r7 = fma (a, PE[13], PE[12]); 185 186 double a2 = a * a; 187 188 double r = r7; 189 r = fma (a2, r, r6); 190 r = fma (a2, r, r5); 191 r = fma (a2, r, r4); 192 r = fma (a2, r, r3); 193 r = fma (a2, r, r2); 194 r = fma (a2, r, r1); 195 196 if (sign) 197 return -1.0 + r; 198 else 199 return 1.0 - r; 200 } 201 else if (ia < 0x4017a000) 202 { /* 4 <= |x| < 5.90625. */ 203 double a = fabs (x); 204 a = fma (0.5, a, -2.0); 205 206 double r1 = fma (a, PF[1], PF[0]); 207 double r2 = fma (a, PF[3], PF[2]); 208 double r3 = fma (a, PF[5], PF[4]); 209 double r4 = fma (a, PF[7], PF[6]); 210 double r5 = fma (a, PF[9], PF[8]); 211 double r6 = fma (a, PF[11], PF[10]); 212 double r7 = fma (a, PF[13], PF[12]); 213 double r8 = fma (a, PF[15], PF[14]); 214 double r9 = PF[16]; 215 216 double a2 = a * a; 217 218 double r = r9; 219 r = fma (a2, r, r8); 220 r = fma (a2, r, r7); 221 r = fma (a2, r, r6); 222 r = fma (a2, r, r5); 223 r = fma (a2, r, r4); 224 r = fma (a2, r, r3); 225 r = fma (a2, r, r2); 226 r = fma (a2, r, r1); 227 228 if (sign) 229 return -1.0 + r; 230 else 231 return 1.0 - r; 232 } 233 else 234 { 235 /* Special cases : erf(nan)=nan, erf(+inf)=+1 and erf(-inf)=-1. */ 236 if (unlikely (ia >= 0x7ff00000)) 237 return (double) (1.0 - (sign << 1)) + 1.0 / x; 238 239 if (sign) 240 return -1.0; 241 else 242 return 1.0; 243 } 244 } 245