xref: /freebsd/contrib/arm-optimized-routines/math/erff.c (revision 13ec1e3155c7e9bf037b12af186351b7fa9b9450)
1 /*
2  * Single-precision erf(x) function.
3  *
4  * Copyright (c) 2020, Arm Limited.
5  * SPDX-License-Identifier: MIT
6  */
7 
8 #include <stdint.h>
9 #include <math.h>
10 #include "math_config.h"
11 
12 #define TwoOverSqrtPiMinusOne 0x1.06eba8p-3f
13 #define A __erff_data.erff_poly_A
14 #define B __erff_data.erff_poly_B
15 
16 /* Top 12 bits of a float.  */
17 static inline uint32_t
18 top12 (float x)
19 {
20   return asuint (x) >> 20;
21 }
22 
23 /* Efficient implementation of erff
24    using either a pure polynomial approximation or
25    the exponential of a polynomial.
26    Worst-case error is 1.09ulps at 0x1.c111acp-1.  */
27 float
28 erff (float x)
29 {
30   float r, x2, u;
31 
32   /* Get top word.  */
33   uint32_t ix = asuint (x);
34   uint32_t sign = ix >> 31;
35   uint32_t ia12 = top12 (x) & 0x7ff;
36 
37   /* Limit of both intervals is 0.875 for performance reasons but coefficients
38      computed on [0.0, 0.921875] and [0.921875, 4.0], which brought accuracy
39      from 0.94 to 1.1ulps.  */
40   if (ia12 < 0x3f6)
41     { /* a = |x| < 0.875.  */
42 
43       /* Tiny and subnormal cases.  */
44       if (unlikely (ia12 < 0x318))
45 	{ /* |x| < 2^(-28).  */
46 	  if (unlikely (ia12 < 0x040))
47 	    { /* |x| < 2^(-119).  */
48 	      float y = fmaf (TwoOverSqrtPiMinusOne, x, x);
49 	      return check_uflowf (y);
50 	    }
51 	  return x + TwoOverSqrtPiMinusOne * x;
52 	}
53 
54       x2 = x * x;
55 
56       /* Normalized cases (|x| < 0.921875). Use Horner scheme for x+x*P(x^2).  */
57       r = A[5];
58       r = fmaf (r, x2, A[4]);
59       r = fmaf (r, x2, A[3]);
60       r = fmaf (r, x2, A[2]);
61       r = fmaf (r, x2, A[1]);
62       r = fmaf (r, x2, A[0]);
63       r = fmaf (r, x, x);
64     }
65   else if (ia12 < 0x408)
66     { /* |x| < 4.0 - Use a custom Estrin scheme.  */
67 
68       float a = fabsf (x);
69       /* Start with Estrin scheme on high order (small magnitude) coefficients.  */
70       r = fmaf (B[6], a, B[5]);
71       u = fmaf (B[4], a, B[3]);
72       x2 = x * x;
73       r = fmaf (r, x2, u);
74       /* Then switch to pure Horner scheme.  */
75       r = fmaf (r, a, B[2]);
76       r = fmaf (r, a, B[1]);
77       r = fmaf (r, a, B[0]);
78       r = fmaf (r, a, a);
79       /* Single precision exponential with ~0.5ulps,
80 	 ensures erff has max. rel. error
81 	 < 1ulp on [0.921875, 4.0],
82 	 < 1.1ulps on [0.875, 4.0].  */
83       r = expf (-r);
84       /* Explicit copysign (calling copysignf increases latency).  */
85       if (sign)
86 	r = -1.0f + r;
87       else
88 	r = 1.0f - r;
89     }
90   else
91     { /* |x| >= 4.0.  */
92 
93       /* Special cases : erff(nan)=nan, erff(+inf)=+1 and erff(-inf)=-1.  */
94       if (unlikely (ia12 >= 0x7f8))
95 	return (1.f - (float) ((ix >> 31) << 1)) + 1.f / x;
96 
97       /* Explicit copysign (calling copysignf increases latency).  */
98       if (sign)
99 	r = -1.0f;
100       else
101 	r = 1.0f;
102     }
103   return r;
104 }
105