xref: /freebsd/contrib/arm-optimized-routines/math/aarch64/experimental/sve/erfinvf_5u.c (revision f3087bef11543b42e0d69b708f367097a4118d24)
1 /*
2  * Single-precision inverse error function (SVE variant).
3  *
4  * Copyright (c) 2024, Arm Limited.
5  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
6  */
7 #include "sv_math.h"
8 #include "test_sig.h"
9 #include "test_defs.h"
10 #include "sv_poly_f32.h"
11 #include "sv_logf_inline.h"
12 
13 const static struct data
14 {
15   /*  We use P_N and Q_N to refer to arrays of coefficients, where P_N
16       is the coeffs of the numerator in table N of Blair et al, and
17       Q_N is the coeffs of the denominator. Coefficients stored in
18       interleaved format to support lookup scheme.  */
19   float P10_2, P29_3, Q10_2, Q29_2;
20   float P10_0, P29_1, P10_1, P29_2;
21   float Q10_0, Q29_0, Q10_1, Q29_1;
22   float P29_0, P_50[6], Q_50[2], tailshift;
23   struct sv_logf_data logf_tbl;
24 } data = { .P10_0 = -0x1.a31268p+3,
25 	   .P10_1 = 0x1.ac9048p+4,
26 	   .P10_2 = -0x1.293ff6p+3,
27 	   .P29_0 = -0x1.fc0252p-4,
28 	   .P29_1 = 0x1.119d44p+0,
29 	   .P29_2 = -0x1.f59ee2p+0,
30 	   .P29_3 = 0x1.b13626p-2,
31 	   .Q10_0 = -0x1.8265eep+3,
32 	   .Q10_1 = 0x1.ef5eaep+4,
33 	   .Q10_2 = -0x1.12665p+4,
34 	   .Q29_0 = -0x1.69952p-4,
35 	   .Q29_1 = 0x1.c7b7d2p-1,
36 	   .Q29_2 = -0x1.167d7p+1,
37 	   .P_50 = { 0x1.3d8948p-3, 0x1.61f9eap+0, 0x1.61c6bcp-1,
38 		     -0x1.20c9f2p+0, 0x1.5c704cp-1, -0x1.50c6bep-3 },
39 	   .Q_50 = { 0x1.3d7dacp-3, 0x1.629e5p+0 },
40 	   .tailshift = -0.87890625,
41 	   .logf_tbl = SV_LOGF_CONSTANTS };
42 
43 static inline svfloat32_t
special(svbool_t pg,svfloat32_t x,const struct data * d)44 special (svbool_t pg, svfloat32_t x, const struct data *d)
45 {
46   svfloat32_t ax = svabs_x (pg, x);
47   svfloat32_t t = svdivr_x (
48       pg,
49       svsqrt_x (pg, svneg_x (pg, sv_logf_inline (pg, svsubr_x (pg, ax, 1),
50 						 &d->logf_tbl))),
51       1);
52   svuint32_t sign
53       = sveor_x (pg, svreinterpret_u32 (ax), svreinterpret_u32 (x));
54   svfloat32_t ts
55       = svreinterpret_f32 (svorr_x (pg, sign, svreinterpret_u32 (t)));
56   svfloat32_t q
57       = svmla_x (pg, sv_f32 (d->Q_50[0]), svadd_x (pg, t, d->Q_50[1]), t);
58   return svdiv_x (pg, sv_horner_5_f32_x (pg, t, d->P_50), svmul_x (pg, ts, q));
59 }
60 
61 static inline svfloat32_t
notails(svbool_t pg,svfloat32_t x,const struct data * d)62 notails (svbool_t pg, svfloat32_t x, const struct data *d)
63 {
64   /* Shortcut when no input is in a tail region - no need to gather shift or
65      coefficients.  */
66   svfloat32_t t = svmad_x (pg, x, x, -0.5625);
67   svfloat32_t q = svadd_x (pg, t, d->Q10_2);
68   q = svmad_x (pg, t, q, d->Q10_1);
69   q = svmad_x (pg, t, q, d->Q10_0);
70 
71   svfloat32_t p = svmla_x (pg, sv_f32 (d->P10_1), t, d->P10_2);
72   p = svmad_x (pg, p, t, d->P10_0);
73 
74   return svdiv_x (pg, svmul_x (pg, x, p), q);
75 }
76 
77 /* Vector implementation of Blair et al's rational approximation to inverse
78    error function in single-precision. Worst-case error is 4.71 ULP, in the
79    tail region:
80    _ZGVsMxv_erfinvf(0x1.f84e9ap-1) got 0x1.b8326ap+0
81 				  want 0x1.b83274p+0.  */
SV_NAME_F1(erfinv)82 svfloat32_t SV_NAME_F1 (erfinv) (svfloat32_t x, svbool_t pg)
83 {
84   const struct data *d = ptr_barrier (&data);
85 
86   /* Calculate inverse error using algorithm described in
87      J. M. Blair, C. A. Edwards, and J. H. Johnson,
88      "Rational Chebyshev approximations for the inverse of the error function",
89      Math. Comp. 30, pp. 827--830 (1976).
90      https://doi.org/10.1090/S0025-5718-1976-0421040-7.  */
91 
92   /* Algorithm has 3 intervals:
93      - 'Normal' region [-0.75, 0.75]
94      - Tail region [0.75, 0.9375] U [-0.9375, -0.75]
95      - Extreme tail [-1, -0.9375] U [0.9375, 1]
96      Normal and tail are both rational approximation of similar order on
97      shifted input - these are typically performed in parallel using gather
98      loads to obtain correct coefficients depending on interval.  */
99   svbool_t is_tail = svacge (pg, x, 0.75);
100   svbool_t extreme_tail = svacge (pg, x, 0.9375);
101 
102   if (likely (!svptest_any (pg, is_tail)))
103     return notails (pg, x, d);
104 
105   /* Select requisite shift depending on interval: polynomial is evaluated on
106      x * x - shift.
107      Normal shift = 0.5625
108      Tail shift   = 0.87890625.  */
109   svfloat32_t t = svmla_x (
110       pg, svsel (is_tail, sv_f32 (d->tailshift), sv_f32 (-0.5625)), x, x);
111 
112   svuint32_t idx = svdup_u32_z (is_tail, 1);
113   svuint32_t idxhi = svadd_x (pg, idx, 2);
114 
115   /* Load coeffs in quadwords and select them according to interval.  */
116   svfloat32_t pqhi = svld1rq (svptrue_b32 (), &d->P10_2);
117   svfloat32_t plo = svld1rq (svptrue_b32 (), &d->P10_0);
118   svfloat32_t qlo = svld1rq (svptrue_b32 (), &d->Q10_0);
119 
120   svfloat32_t p2 = svtbl (pqhi, idx);
121   svfloat32_t p1 = svtbl (plo, idxhi);
122   svfloat32_t p0 = svtbl (plo, idx);
123   svfloat32_t q0 = svtbl (qlo, idx);
124   svfloat32_t q1 = svtbl (qlo, idxhi);
125   svfloat32_t q2 = svtbl (pqhi, idxhi);
126 
127   svfloat32_t p = svmla_x (pg, p1, p2, t);
128   p = svmla_x (pg, p0, p, t);
129   /* Tail polynomial has higher order - merge with normal lanes.  */
130   p = svmad_m (is_tail, p, t, d->P29_0);
131   svfloat32_t y = svmul_x (pg, x, p);
132 
133   /* Least significant term of both Q polynomials is 1, so no need to generate
134      it.  */
135   svfloat32_t q = svadd_x (pg, t, q2);
136   q = svmla_x (pg, q1, q, t);
137   q = svmla_x (pg, q0, q, t);
138 
139   if (unlikely (svptest_any (pg, extreme_tail)))
140     return svsel (extreme_tail, special (extreme_tail, x, d),
141 		  svdiv_x (pg, y, q));
142   return svdiv_x (pg, y, q);
143 }
144 
145 #if USE_MPFR
146 # warning Not generating tests for _ZGVsMxv_erfinvf, as MPFR has no suitable reference
147 #else
148 TEST_SIG (SV, F, 1, erfinv, -0.99, 0.99)
149 TEST_ULP (SV_NAME_F1 (erfinv), 4.09)
150 TEST_DISABLE_FENV (SV_NAME_F1 (erfinv))
151 TEST_SYM_INTERVAL (SV_NAME_F1 (erfinv), 0, 1, 40000)
152 TEST_CONTROL_VALUE (SV_NAME_F1 (erfinv), 0.5)
153 TEST_CONTROL_VALUE (SV_NAME_F1 (erfinv), 0.8)
154 TEST_CONTROL_VALUE (SV_NAME_F1 (erfinv), 0.95)
155 #endif
156 CLOSE_SVE_ATTR
157