xref: /freebsd/contrib/arm-optimized-routines/math/erff.c (revision 6c05f3a74f30934ee60919cc97e16ec69b542b06)
1 /*
2  * Single-precision erf(x) function.
3  *
4  * Copyright (c) 2020-2024, Arm Limited.
5  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
6  */
7 
8 #include <stdint.h>
9 #include <math.h>
10 #include "math_config.h"
11 #include "test_defs.h"
12 #include "test_sig.h"
13 
14 #define TwoOverSqrtPiMinusOne 0x1.06eba8p-3f
15 #define A __erff_data.erff_poly_A
16 #define B __erff_data.erff_poly_B
17 
18 /* Top 12 bits of a float.  */
19 static inline uint32_t
20 top12 (float x)
21 {
22   return asuint (x) >> 20;
23 }
24 
25 /* Efficient implementation of erff
26    using either a pure polynomial approximation or
27    the exponential of a polynomial.
28    Worst-case error is 1.09ulps at 0x1.c111acp-1.  */
29 float
30 erff (float x)
31 {
32   float r, x2, u;
33 
34   /* Get top word.  */
35   uint32_t ix = asuint (x);
36   uint32_t sign = ix >> 31;
37   uint32_t ia12 = top12 (x) & 0x7ff;
38 
39   /* Limit of both intervals is 0.875 for performance reasons but coefficients
40      computed on [0.0, 0.921875] and [0.921875, 4.0], which brought accuracy
41      from 0.94 to 1.1ulps.  */
42   if (ia12 < 0x3f6)
43     { /* a = |x| < 0.875.  */
44 
45       /* Tiny and subnormal cases.  */
46       if (unlikely (ia12 < 0x318))
47 	{ /* |x| < 2^(-28).  */
48 	  if (unlikely (ia12 < 0x040))
49 	    { /* |x| < 2^(-119).  */
50 	      float y = fmaf (TwoOverSqrtPiMinusOne, x, x);
51 	      return check_uflowf (y);
52 	    }
53 	  return x + TwoOverSqrtPiMinusOne * x;
54 	}
55 
56       x2 = x * x;
57 
58       /* Normalized cases (|x| < 0.921875). Use Horner scheme for x+x*P(x^2).  */
59       r = A[5];
60       r = fmaf (r, x2, A[4]);
61       r = fmaf (r, x2, A[3]);
62       r = fmaf (r, x2, A[2]);
63       r = fmaf (r, x2, A[1]);
64       r = fmaf (r, x2, A[0]);
65       r = fmaf (r, x, x);
66     }
67   else if (ia12 < 0x408)
68     { /* |x| < 4.0 - Use a custom Estrin scheme.  */
69 
70       float a = fabsf (x);
71       /* Start with Estrin scheme on high order (small magnitude) coefficients.  */
72       r = fmaf (B[6], a, B[5]);
73       u = fmaf (B[4], a, B[3]);
74       x2 = x * x;
75       r = fmaf (r, x2, u);
76       /* Then switch to pure Horner scheme.  */
77       r = fmaf (r, a, B[2]);
78       r = fmaf (r, a, B[1]);
79       r = fmaf (r, a, B[0]);
80       r = fmaf (r, a, a);
81       /* Single precision exponential with ~0.5ulps,
82 	 ensures erff has max. rel. error
83 	 < 1ulp on [0.921875, 4.0],
84 	 < 1.1ulps on [0.875, 4.0].  */
85       r = expf (-r);
86       /* Explicit copysign (calling copysignf increases latency).  */
87       if (sign)
88 	r = -1.0f + r;
89       else
90 	r = 1.0f - r;
91     }
92   else
93     { /* |x| >= 4.0.  */
94 
95       /* Special cases : erff(nan)=nan, erff(+inf)=+1 and erff(-inf)=-1.  */
96       if (unlikely (ia12 >= 0x7f8))
97 	return (1.f - (float) ((ix >> 31) << 1)) + 1.f / x;
98 
99       /* Explicit copysign (calling copysignf increases latency).  */
100       if (sign)
101 	r = -1.0f;
102       else
103 	r = 1.0f;
104     }
105   return r;
106 }
107 
108 TEST_SIG (S, F, 1, erf, -6.0, 6.0)
109 TEST_ULP (erff, 0.6)
110 TEST_ULP_NONNEAREST (erff, 0.9)
111 TEST_INTERVAL (erff, 0, 0xffff0000, 10000)
112 TEST_SYM_INTERVAL (erff, 0x1p-127, 0x1p-26, 40000)
113 TEST_SYM_INTERVAL (erff, 0x1p-26, 0x1p3, 40000)
114 TEST_INTERVAL (erff, 0, inf, 40000)
115