xref: /freebsd/contrib/arm-optimized-routines/math/erf.c (revision dd21556857e8d40f66bf5ad54754d9d52669ebf7)
1 /*
2  * Double-precision erf(x) function.
3  *
4  * Copyright (c) 2020-2024, Arm Limited.
5  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
6  */
7 
8 #include "math_config.h"
9 #include <math.h>
10 #include <stdint.h>
11 #include "test_defs.h"
12 #include "test_sig.h"
13 
14 #define TwoOverSqrtPiMinusOne 0x1.06eba8214db69p-3
15 #define C 0x1.b0ac16p-1
16 #define PA __erf_data.erf_poly_A
17 #define NA __erf_data.erf_ratio_N_A
18 #define DA __erf_data.erf_ratio_D_A
19 #define NB __erf_data.erf_ratio_N_B
20 #define DB __erf_data.erf_ratio_D_B
21 #define PC __erf_data.erfc_poly_C
22 #define PD __erf_data.erfc_poly_D
23 #define PE __erf_data.erfc_poly_E
24 #define PF __erf_data.erfc_poly_F
25 
26 /* Top 32 bits of a double.  */
27 static inline uint32_t
28 top32 (double x)
29 {
30   return asuint64 (x) >> 32;
31 }
32 
33 /* Fast erf implementation using a mix of
34    rational and polynomial approximations.
35    Highest measured error is 1.01 ULPs at 0x1.39956ac43382fp+0.  */
36 double
37 erf (double x)
38 {
39   /* Get top word and sign.  */
40   uint32_t ix = top32 (x);
41   uint32_t ia = ix & 0x7fffffff;
42   uint32_t sign = ix >> 31;
43 
44   /* Normalized and subnormal cases */
45   if (ia < 0x3feb0000)
46     { /* a = |x| < 0.84375.  */
47 
48       if (ia < 0x3e300000)
49 	{ /* a < 2^(-28).  */
50 	  if (ia < 0x00800000)
51 	    { /* a < 2^(-1015).  */
52 	      double y =  fma (TwoOverSqrtPiMinusOne, x, x);
53 	      return check_uflow (y);
54 	    }
55 	  return x + TwoOverSqrtPiMinusOne * x;
56 	}
57 
58       double x2 = x * x;
59 
60       if (ia < 0x3fe00000)
61 	{ /* a < 0.5  - Use polynomial approximation.  */
62 	  double r1 = fma (x2, PA[1], PA[0]);
63 	  double r2 = fma (x2, PA[3], PA[2]);
64 	  double r3 = fma (x2, PA[5], PA[4]);
65 	  double r4 = fma (x2, PA[7], PA[6]);
66 	  double r5 = fma (x2, PA[9], PA[8]);
67 	  double x4 = x2 * x2;
68 	  double r = r5;
69 	  r = fma (x4, r, r4);
70 	  r = fma (x4, r, r3);
71 	  r = fma (x4, r, r2);
72 	  r = fma (x4, r, r1);
73 	  return fma (r, x, x); /* This fma is crucial for accuracy.  */
74 	}
75       else
76 	{ /* 0.5 <= a < 0.84375 - Use rational approximation.  */
77 	  double x4, x8, r1n, r2n, r1d, r2d, r3d;
78 
79 	  r1n = fma (x2, NA[1], NA[0]);
80 	  x4 = x2 * x2;
81 	  r2n = fma (x2, NA[3], NA[2]);
82 	  x8 = x4 * x4;
83 	  r1d = fma (x2, DA[0], 1.0);
84 	  r2d = fma (x2, DA[2], DA[1]);
85 	  r3d = fma (x2, DA[4], DA[3]);
86 	  double P = r1n + x4 * r2n + x8 * NA[4];
87 	  double Q = r1d + x4 * r2d + x8 * r3d;
88 	  return fma (P / Q, x, x);
89 	}
90     }
91   else if (ia < 0x3ff40000)
92     { /* 0.84375 <= |x| < 1.25.  */
93       double a2, a4, a6, r1n, r2n, r3n, r4n, r1d, r2d, r3d, r4d;
94       double a = fabs (x) - 1.0;
95       r1n = fma (a, NB[1], NB[0]);
96       a2 = a * a;
97       r1d = fma (a, DB[0], 1.0);
98       a4 = a2 * a2;
99       r2n = fma (a, NB[3], NB[2]);
100       a6 = a4 * a2;
101       r2d = fma (a, DB[2], DB[1]);
102       r3n = fma (a, NB[5], NB[4]);
103       r3d = fma (a, DB[4], DB[3]);
104       r4n = NB[6];
105       r4d = DB[5];
106       double P = r1n + a2 * r2n + a4 * r3n + a6 * r4n;
107       double Q = r1d + a2 * r2d + a4 * r3d + a6 * r4d;
108       if (sign)
109 	return -C - P / Q;
110       else
111 	return C + P / Q;
112     }
113   else if (ia < 0x40000000)
114     { /* 1.25 <= |x| < 2.0.  */
115       double a = fabs (x);
116       a = a - 1.25;
117 
118       double r1 = fma (a, PC[1], PC[0]);
119       double r2 = fma (a, PC[3], PC[2]);
120       double r3 = fma (a, PC[5], PC[4]);
121       double r4 = fma (a, PC[7], PC[6]);
122       double r5 = fma (a, PC[9], PC[8]);
123       double r6 = fma (a, PC[11], PC[10]);
124       double r7 = fma (a, PC[13], PC[12]);
125       double r8 = fma (a, PC[15], PC[14]);
126 
127       double a2 = a * a;
128 
129       double r = r8;
130       r = fma (a2, r, r7);
131       r = fma (a2, r, r6);
132       r = fma (a2, r, r5);
133       r = fma (a2, r, r4);
134       r = fma (a2, r, r3);
135       r = fma (a2, r, r2);
136       r = fma (a2, r, r1);
137 
138       if (sign)
139 	return -1.0 + r;
140       else
141 	return 1.0 - r;
142     }
143   else if (ia < 0x400a0000)
144     { /* 2 <= |x| < 3.25.  */
145       double a = fabs (x);
146       a = fma (0.5, a, -1.0);
147 
148       double r1 = fma (a, PD[1], PD[0]);
149       double r2 = fma (a, PD[3], PD[2]);
150       double r3 = fma (a, PD[5], PD[4]);
151       double r4 = fma (a, PD[7], PD[6]);
152       double r5 = fma (a, PD[9], PD[8]);
153       double r6 = fma (a, PD[11], PD[10]);
154       double r7 = fma (a, PD[13], PD[12]);
155       double r8 = fma (a, PD[15], PD[14]);
156       double r9 = fma (a, PD[17], PD[16]);
157 
158       double a2 = a * a;
159 
160       double r = r9;
161       r = fma (a2, r, r8);
162       r = fma (a2, r, r7);
163       r = fma (a2, r, r6);
164       r = fma (a2, r, r5);
165       r = fma (a2, r, r4);
166       r = fma (a2, r, r3);
167       r = fma (a2, r, r2);
168       r = fma (a2, r, r1);
169 
170       if (sign)
171 	return -1.0 + r;
172       else
173 	return 1.0 - r;
174     }
175   else if (ia < 0x40100000)
176     { /* 3.25 <= |x| < 4.0.  */
177       double a = fabs (x);
178       a = a - 3.25;
179 
180       double r1 = fma (a, PE[1], PE[0]);
181       double r2 = fma (a, PE[3], PE[2]);
182       double r3 = fma (a, PE[5], PE[4]);
183       double r4 = fma (a, PE[7], PE[6]);
184       double r5 = fma (a, PE[9], PE[8]);
185       double r6 = fma (a, PE[11], PE[10]);
186       double r7 = fma (a, PE[13], PE[12]);
187 
188       double a2 = a * a;
189 
190       double r = r7;
191       r = fma (a2, r, r6);
192       r = fma (a2, r, r5);
193       r = fma (a2, r, r4);
194       r = fma (a2, r, r3);
195       r = fma (a2, r, r2);
196       r = fma (a2, r, r1);
197 
198       if (sign)
199 	return -1.0 + r;
200       else
201 	return 1.0 - r;
202     }
203   else if (ia < 0x4017a000)
204     { /* 4 <= |x| < 5.90625.  */
205       double a = fabs (x);
206       a = fma (0.5, a, -2.0);
207 
208       double r1 = fma (a, PF[1], PF[0]);
209       double r2 = fma (a, PF[3], PF[2]);
210       double r3 = fma (a, PF[5], PF[4]);
211       double r4 = fma (a, PF[7], PF[6]);
212       double r5 = fma (a, PF[9], PF[8]);
213       double r6 = fma (a, PF[11], PF[10]);
214       double r7 = fma (a, PF[13], PF[12]);
215       double r8 = fma (a, PF[15], PF[14]);
216       double r9 = PF[16];
217 
218       double a2 = a * a;
219 
220       double r = r9;
221       r = fma (a2, r, r8);
222       r = fma (a2, r, r7);
223       r = fma (a2, r, r6);
224       r = fma (a2, r, r5);
225       r = fma (a2, r, r4);
226       r = fma (a2, r, r3);
227       r = fma (a2, r, r2);
228       r = fma (a2, r, r1);
229 
230       if (sign)
231 	return -1.0 + r;
232       else
233 	return 1.0 - r;
234     }
235   else
236     {
237       /* Special cases : erf(nan)=nan, erf(+inf)=+1 and erf(-inf)=-1.  */
238       if (unlikely (ia >= 0x7ff00000))
239 	return (double) (1.0 - (sign << 1)) + 1.0 / x;
240 
241       if (sign)
242 	return -1.0;
243       else
244 	return 1.0;
245     }
246 }
247 
248 TEST_SIG (S, D, 1, erf, -6.0, 6.0)
249 TEST_ULP (erf, 0.51)
250 TEST_ULP_NONNEAREST (erf, 0.9)
251 TEST_INTERVAL (erf, 0, 0xffff000000000000, 10000)
252 TEST_SYM_INTERVAL (erf, 0x1p-1022, 0x1p-26, 40000)
253 TEST_SYM_INTERVAL (erf, 0x1p-26, 0x1p3, 40000)
254 TEST_INTERVAL (erf, 0, inf, 40000)
255