xref: /freebsd/contrib/arm-optimized-routines/math/aarch64/sve/pow.c (revision f3087bef11543b42e0d69b708f367097a4118d24)
1*f3087befSAndrew Turner /*
2*f3087befSAndrew Turner  * Double-precision SVE pow(x, y) function.
3*f3087befSAndrew Turner  *
4*f3087befSAndrew Turner  * Copyright (c) 2022-2025, Arm Limited.
5*f3087befSAndrew Turner  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
6*f3087befSAndrew Turner  */
7*f3087befSAndrew Turner 
8*f3087befSAndrew Turner #include "sv_math.h"
9*f3087befSAndrew Turner #include "test_sig.h"
10*f3087befSAndrew Turner #include "test_defs.h"
11*f3087befSAndrew Turner 
/* This version shares a similar algorithm with the AOR scalar pow.
13*f3087befSAndrew Turner 
14*f3087befSAndrew Turner    The core computation consists in computing pow(x, y) as
15*f3087befSAndrew Turner 
16*f3087befSAndrew Turner      exp (y * log (x)).
17*f3087befSAndrew Turner 
18*f3087befSAndrew Turner    The algorithms for exp and log are very similar to scalar exp and log.
19*f3087befSAndrew Turner    The log relies on table lookup for 3 variables and an order 8 polynomial.
20*f3087befSAndrew Turner    It returns a high and a low contribution that are then passed to the exp,
21*f3087befSAndrew Turner    to minimise the loss of accuracy in both routines.
22*f3087befSAndrew Turner    The exp is based on 8-bit table lookup for scale and order-4 polynomial.
23*f3087befSAndrew Turner    The SVE algorithm drops the tail in the exp computation at the price of
24*f3087befSAndrew Turner    a lower accuracy, slightly above 1ULP.
   The SVE algorithm also drops the special treatment of small (< 2^-65) and
26*f3087befSAndrew Turner    large (> 2^63) finite values of |y|, as they only affect non-round to
27*f3087befSAndrew Turner    nearest modes.
28*f3087befSAndrew Turner 
29*f3087befSAndrew Turner    Maximum measured error is 1.04 ULPs:
30*f3087befSAndrew Turner    SV_NAME_D2 (pow) (0x1.3d2d45bc848acp+63, -0x1.a48a38b40cd43p-12)
31*f3087befSAndrew Turner      got 0x1.f7116284221fcp-1
32*f3087befSAndrew Turner     want 0x1.f7116284221fdp-1.  */
33*f3087befSAndrew Turner 
34*f3087befSAndrew Turner /* Data is defined in v_pow_log_data.c.  */
35*f3087befSAndrew Turner #define N_LOG (1 << V_POW_LOG_TABLE_BITS)
36*f3087befSAndrew Turner #define Off 0x3fe6955500000000
37*f3087befSAndrew Turner 
38*f3087befSAndrew Turner /* Data is defined in v_pow_exp_data.c.  */
39*f3087befSAndrew Turner #define N_EXP (1 << V_POW_EXP_TABLE_BITS)
40*f3087befSAndrew Turner #define SignBias (0x800 << V_POW_EXP_TABLE_BITS)
41*f3087befSAndrew Turner #define SmallExp 0x3c9 /* top12(0x1p-54).  */
42*f3087befSAndrew Turner #define BigExp 0x408   /* top12(512.).  */
43*f3087befSAndrew Turner #define ThresExp 0x03f /* BigExp - SmallExp.  */
44*f3087befSAndrew Turner #define HugeExp 0x409  /* top12(1024.).  */
45*f3087befSAndrew Turner 
46*f3087befSAndrew Turner /* Constants associated with pow.  */
47*f3087befSAndrew Turner #define SmallBoundX 0x1p-126
48*f3087befSAndrew Turner #define SmallPowX 0x001 /* top12(0x1p-126).  */
49*f3087befSAndrew Turner #define BigPowX 0x7ff	/* top12(INFINITY).  */
50*f3087befSAndrew Turner #define ThresPowX 0x7fe /* BigPowX - SmallPowX.  */
51*f3087befSAndrew Turner #define SmallPowY 0x3be /* top12(0x1.e7b6p-65).  */
52*f3087befSAndrew Turner #define BigPowY 0x43e	/* top12(0x1.749p62).  */
53*f3087befSAndrew Turner #define ThresPowY 0x080 /* BigPowY - SmallPowY.  */
54*f3087befSAndrew Turner 
/* Coefficients and constants for the log and exp stages.  Fields are laid
   out so that pairs consumed via svld1rq + svmla_lane/svmul_lane lookups
   are adjacent in memory (e.g. ln2_hi/ln2_lo, n_over_ln2/exp_c2).  */
static const struct data
{
  double log_c0, log_c2, log_c4, log_c6, ln2_hi, ln2_lo;
  double log_c1, log_c3, log_c5, off;
  double n_over_ln2, exp_c2, ln2_over_n_hi, ln2_over_n_lo;
  double exp_c0, exp_c1;
} data = {
  /* Polynomial coefficients A[0..6] for the log1p correction term.  */
  .log_c0 = -0x1p-1,
  .log_c1 = -0x1.555555555556p-1,
  .log_c2 = 0x1.0000000000006p-1,
  .log_c3 = 0x1.999999959554ep-1,
  .log_c4 = -0x1.555555529a47ap-1,
  .log_c5 = -0x1.2495b9b4845e9p0,
  .log_c6 = 0x1.0002b8b263fc3p0,
  .off = Off,
  /* Polynomial coefficients for exp(r) - 1.  */
  .exp_c0 = 0x1.fffffffffffd4p-2,
  .exp_c1 = 0x1.5555571d6ef9p-3,
  .exp_c2 = 0x1.5555576a5adcep-5,
  /* ln2 split into high and low parts for compensated arithmetic.  */
  .ln2_hi = 0x1.62e42fefa3800p-1,
  .ln2_lo = 0x1.ef35793c76730p-45,
  .n_over_ln2 = 0x1.71547652b82fep0 * N_EXP,
  /* ln2/N split into high and low parts.  */
  .ln2_over_n_hi = 0x1.62e42fefc0000p-9,
  .ln2_over_n_lo = -0x1.c610ca86c3899p-45,
};
79*f3087befSAndrew Turner 
80*f3087befSAndrew Turner /* Check if x is an integer.  */
81*f3087befSAndrew Turner static inline svbool_t
sv_isint(svbool_t pg,svfloat64_t x)82*f3087befSAndrew Turner sv_isint (svbool_t pg, svfloat64_t x)
83*f3087befSAndrew Turner {
84*f3087befSAndrew Turner   return svcmpeq (pg, svrintz_z (pg, x), x);
85*f3087befSAndrew Turner }
86*f3087befSAndrew Turner 
87*f3087befSAndrew Turner /* Check if x is real not integer valued.  */
88*f3087befSAndrew Turner static inline svbool_t
sv_isnotint(svbool_t pg,svfloat64_t x)89*f3087befSAndrew Turner sv_isnotint (svbool_t pg, svfloat64_t x)
90*f3087befSAndrew Turner {
91*f3087befSAndrew Turner   return svcmpne (pg, svrintz_z (pg, x), x);
92*f3087befSAndrew Turner }
93*f3087befSAndrew Turner 
94*f3087befSAndrew Turner /* Check if x is an odd integer.  */
95*f3087befSAndrew Turner static inline svbool_t
sv_isodd(svbool_t pg,svfloat64_t x)96*f3087befSAndrew Turner sv_isodd (svbool_t pg, svfloat64_t x)
97*f3087befSAndrew Turner {
98*f3087befSAndrew Turner   svfloat64_t y = svmul_x (svptrue_b64 (), x, 0.5);
99*f3087befSAndrew Turner   return sv_isnotint (pg, y);
100*f3087befSAndrew Turner }
101*f3087befSAndrew Turner 
/* Returns 0 if not int, 1 if odd int, 2 if even int.  The argument is
   the bit representation of a non-zero finite floating-point value.  */
static inline int
checkint (uint64_t iy)
{
  int exp = (iy >> 52) & 0x7ff;
  if (exp < 0x3ff)
    /* |y| < 1: a non-zero value this small cannot be an integer.  */
    return 0;
  if (exp > 0x3ff + 52)
    /* Magnitude so large that every representable value is an even int.  */
    return 2;
  /* Mask of the mantissa bits below the binary point.  */
  uint64_t frac_mask = (1ULL << (0x3ff + 52 - exp)) - 1;
  if (iy & frac_mask)
    return 0; /* Fractional bits present: not an integer.  */
  /* The units bit (frac_mask + 1) decides odd vs even.  */
  return (iy & (frac_mask + 1)) ? 1 : 2;
}
118*f3087befSAndrew Turner 
119*f3087befSAndrew Turner /* Top 12 bits (sign and exponent of each double float lane).  */
120*f3087befSAndrew Turner static inline svuint64_t
sv_top12(svfloat64_t x)121*f3087befSAndrew Turner sv_top12 (svfloat64_t x)
122*f3087befSAndrew Turner {
123*f3087befSAndrew Turner   return svlsr_x (svptrue_b64 (), svreinterpret_u64 (x), 52);
124*f3087befSAndrew Turner }
125*f3087befSAndrew Turner 
/* Returns 1 if input is the bit representation of 0, infinity or nan.
   Doubling shifts out the sign bit; the -1 maps zero onto the top of the
   unsigned range so a single compare covers all three cases.  */
static inline int
zeroinfnan (uint64_t i)
{
  uint64_t ia = 2 * i - 1;
  return ia >= 2 * asuint64 (INFINITY) - 1;
}
132*f3087befSAndrew Turner 
133*f3087befSAndrew Turner /* Returns 1 if input is the bit representation of 0, infinity or nan.  */
134*f3087befSAndrew Turner static inline svbool_t
sv_zeroinfnan(svbool_t pg,svuint64_t i)135*f3087befSAndrew Turner sv_zeroinfnan (svbool_t pg, svuint64_t i)
136*f3087befSAndrew Turner {
137*f3087befSAndrew Turner   return svcmpge (pg, svsub_x (pg, svadd_x (pg, i, i), 1),
138*f3087befSAndrew Turner 		  2 * asuint64 (INFINITY) - 1);
139*f3087befSAndrew Turner }
140*f3087befSAndrew Turner 
/* Handle cases that may overflow or underflow when computing the result that
   is scale*(1+TMP) without intermediate rounding.  The bit representation of
   scale is in SBITS, however it has a computed exponent that may have
   overflown into the sign bit so that needs to be adjusted before using it as
   a double.  (int32_t)KI is the k used in the argument reduction and exponent
   adjustment of scale, positive k here means the result may overflow and
   negative k means the result may underflow.  */
static inline double
specialcase (double tmp, uint64_t sbits, uint64_t ki)
{
  if (ki & 0x80000000)
    {
      /* k < 0, need special care in the subnormal range.  */
      sbits += 1022ull << 52;
      /* Note: sbits is signed scale.  */
      double scale = asdouble (sbits);
      double y = scale + scale * tmp;
      /* Final scaling down performs the (possibly inexact) subnormal
	 rounding in one step.  */
      return 0x1p-1022 * y;
    }
  /* k > 0, the exponent of scale might have overflowed by <= 460.  */
  sbits -= 1009ull << 52;
  double scale = asdouble (sbits);
  return 0x1p1009 * (scale + scale * tmp);
}
166*f3087befSAndrew Turner 
/* Scalar fallback for special cases of SVE pow's exp.  Iterates over the
   active lanes of CMP one at a time, runs the scalar specialcase on each
   lane's operands, and merges the scalar result back into Y.  */
static inline svfloat64_t
sv_call_specialcase (svfloat64_t x1, svuint64_t u1, svuint64_t u2,
		     svfloat64_t y, svbool_t cmp)
{
  /* p selects exactly one active lane of cmp per iteration.  */
  svbool_t p = svpfirst (cmp, svpfalse ());
  while (svptest_any (cmp, p))
    {
      /* svclastb with single-lane predicate p extracts that lane's value.  */
      double sx1 = svclastb (p, 0, x1);
      uint64_t su1 = svclastb (p, 0, u1);
      uint64_t su2 = svclastb (p, 0, u2);
      double elem = specialcase (sx1, su1, su2);
      svfloat64_t y2 = sv_f64 (elem);
      /* Insert the scalar result into the selected lane only.  */
      y = svsel (p, y2, y);
      /* Advance to the next active lane of cmp.  */
      p = svpnext_b64 (cmp, p);
    }
  return y;
}
185*f3087befSAndrew Turner 
/* Compute y+TAIL = log(x) where the rounded result is y and TAIL has about
   additional 15 bits precision.  IX is the bit representation of x, but
   normalized in the subnormal range using the sign bit for the exponent.  */
static inline svfloat64_t
sv_log_inline (svbool_t pg, svuint64_t ix, svfloat64_t *tail,
	       const struct data *d)
{
  /* x = 2^k z; where z is in range [Off,2*Off) and exact.
     The range is split into N subintervals.
     The ith subinterval contains z and c is near its center.  */
  svuint64_t tmp = svsub_x (pg, ix, d->off);
  /* Table index: top V_POW_LOG_TABLE_BITS of the reduced mantissa.  */
  svuint64_t i = svand_x (pg, svlsr_x (pg, tmp, 52 - V_POW_LOG_TABLE_BITS),
			  sv_u64 (N_LOG - 1));
  /* Arithmetic shift recovers k (sign-extended exponent adjustment).  */
  svint64_t k = svasr_x (pg, svreinterpret_s64 (tmp), 52);
  /* iz = bits of z, i.e. x with the 2^k factor removed.  */
  svuint64_t iz = svsub_x (pg, ix, svlsl_x (pg, svreinterpret_u64 (k), 52));
  svfloat64_t z = svreinterpret_f64 (iz);
  svfloat64_t kd = svcvt_f64_x (pg, k);

  /* log(x) = k*Ln2 + log(c) + log1p(z/c-1).  */
  /* SVE lookup requires 3 separate lookup tables, as opposed to scalar version
     that uses array of structures. We also do the lookup earlier in the code
     to make sure it finishes as early as possible.  */
  svfloat64_t invc = svld1_gather_index (pg, __v_pow_log_data.invc, i);
  svfloat64_t logc = svld1_gather_index (pg, __v_pow_log_data.logc, i);
  svfloat64_t logctail = svld1_gather_index (pg, __v_pow_log_data.logctail, i);

  /* Note: 1/c is j/N or j/N/2 where j is an integer in [N,2N) and
     |z/c - 1| < 1/N, so r = z/c - 1 is exactly representable.  */
  svfloat64_t r = svmad_x (pg, z, invc, -1.0);
  /* k*Ln2 + log(c) + r.  */

  /* ln2_hilo holds the adjacent pair { ln2_hi, ln2_lo }.  */
  svfloat64_t ln2_hilo = svld1rq_f64 (svptrue_b64 (), &d->ln2_hi);
  svfloat64_t t1 = svmla_lane_f64 (logc, kd, ln2_hilo, 0);
  svfloat64_t t2 = svadd_x (pg, t1, r);
  svfloat64_t lo1 = svmla_lane_f64 (logctail, kd, ln2_hilo, 1);
  /* lo2 = (t1 - t2) + r compensates the rounding error of t2.  */
  svfloat64_t lo2 = svadd_x (pg, svsub_x (pg, t1, t2), r);

  /* Evaluation is optimized assuming superscalar pipelined execution.  */

  /* log_c02 holds the adjacent pair { log_c0, log_c2 }.  */
  svfloat64_t log_c02 = svld1rq_f64 (svptrue_b64 (), &d->log_c0);
  svfloat64_t ar = svmul_lane_f64 (r, log_c02, 0);
  svfloat64_t ar2 = svmul_x (svptrue_b64 (), r, ar);
  svfloat64_t ar3 = svmul_x (svptrue_b64 (), r, ar2);
  /* k*Ln2 + log(c) + r + A[0]*r*r.  */
  svfloat64_t hi = svadd_x (pg, t2, ar2);
  /* lo3/lo4 compensate the rounding errors of ar2 and the hi sum.  */
  svfloat64_t lo3 = svmls_x (pg, ar2, ar, r);
  svfloat64_t lo4 = svadd_x (pg, svsub_x (pg, t2, hi), ar2);
  /* p = log1p(r) - r - A[0]*r*r.  */
  /* p = (ar3 * (A[1] + r * A[2] + ar2 * (A[3] + r * A[4] + ar2 * (A[5] + r *
     A[6])))).  */

  /* log_c46 holds the adjacent pair { log_c4, log_c6 }.  */
  svfloat64_t log_c46 = svld1rq_f64 (svptrue_b64 (), &d->log_c4);
  svfloat64_t a56 = svmla_lane_f64 (sv_f64 (d->log_c5), r, log_c46, 1);
  svfloat64_t a34 = svmla_lane_f64 (sv_f64 (d->log_c3), r, log_c46, 0);
  svfloat64_t a12 = svmla_lane_f64 (sv_f64 (d->log_c1), r, log_c02, 1);
  svfloat64_t p = svmla_x (pg, a34, ar2, a56);
  p = svmla_x (pg, a12, ar2, p);
  p = svmul_x (svptrue_b64 (), ar3, p);
  /* Accumulate all low-order contributions, then split the sum into the
     rounded result y and the residual *tail consumed by exp.  */
  svfloat64_t lo = svadd_x (
      pg, svadd_x (pg, svsub_x (pg, svadd_x (pg, lo1, lo2), lo3), lo4), p);
  svfloat64_t y = svadd_x (pg, hi, lo);
  *tail = svadd_x (pg, svsub_x (pg, hi, y), lo);
  return y;
}
250*f3087befSAndrew Turner 
/* Core exp computation: returns scale * (1 + TMP).  Also writes out TMP,
   SBITS and KI so that sv_exp_inline can redo the final step for lanes
   that may overflow or underflow.  */
static inline svfloat64_t
sv_exp_core (svbool_t pg, svfloat64_t x, svfloat64_t xtail,
	     svuint64_t sign_bias, svfloat64_t *tmp, svuint64_t *sbits,
	     svuint64_t *ki, const struct data *d)
{
  /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)].  */
  /* x = ln2/N*k + r, with int k and r in [-ln2/2N, ln2/2N].  */
  /* n_over_ln2_and_c2 holds the adjacent pair { n_over_ln2, exp_c2 }.  */
  svfloat64_t n_over_ln2_and_c2 = svld1rq_f64 (svptrue_b64 (), &d->n_over_ln2);
  svfloat64_t z = svmul_lane_f64 (x, n_over_ln2_and_c2, 0);
  /* z - kd is in [-1, 1] in non-nearest rounding modes.  */
  svfloat64_t kd = svrinta_x (pg, z);
  *ki = svreinterpret_u64 (svcvt_s64_x (pg, kd));

  /* r = x - k*ln2/N, using the hi/lo split of ln2/N for accuracy.  */
  svfloat64_t ln2_over_n_hilo
      = svld1rq_f64 (svptrue_b64 (), &d->ln2_over_n_hi);
  svfloat64_t r = x;
  r = svmls_lane_f64 (r, kd, ln2_over_n_hilo, 0);
  r = svmls_lane_f64 (r, kd, ln2_over_n_hilo, 1);
  /* The code assumes 2^-200 < |xtail| < 2^-8/N.  */
  r = svadd_x (pg, r, xtail);
  /* 2^(k/N) ~= scale.  */
  svuint64_t idx = svand_x (pg, *ki, N_EXP - 1);
  /* Fold the remaining exponent bits of k (and the sign bias) into the
     exponent field of the table value.  */
  svuint64_t top
      = svlsl_x (pg, svadd_x (pg, *ki, sign_bias), 52 - V_POW_EXP_TABLE_BITS);
  /* This is only a valid scale when -1023*N < k < 1024*N.  */
  *sbits = svld1_gather_index (pg, __v_pow_exp_data.sbits, idx);
  *sbits = svadd_x (pg, *sbits, top);
  /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (exp(r) - 1).  */
  svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
  /* tmp ~= exp(r) - 1, evaluated as r + r^2*(c0 + r*(c1 + r*c2)).  */
  *tmp = svmla_lane_f64 (sv_f64 (d->exp_c1), r, n_over_ln2_and_c2, 1);
  *tmp = svmla_x (pg, sv_f64 (d->exp_c0), r, *tmp);
  *tmp = svmla_x (pg, r, r2, *tmp);
  svfloat64_t scale = svreinterpret_f64 (*sbits);
  /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there
     is no spurious underflow here even without fma.  */
  z = svmla_x (pg, scale, scale, *tmp);
  return z;
}
289*f3087befSAndrew Turner 
/* Computes sign*exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|.
   The sign_bias argument is SignBias or 0 and sets the sign to -1 or 1.  */
static inline svfloat64_t
sv_exp_inline (svbool_t pg, svfloat64_t x, svfloat64_t xtail,
	       svuint64_t sign_bias, const struct data *d)
{
  /* 3 types of special cases: tiny (uflow and spurious uflow), huge (oflow)
     and other cases of large values of x (scale * (1 + TMP) oflow).  */
  svuint64_t abstop = svand_x (pg, sv_top12 (x), 0x7ff);
  /* |x| is large (|x| >= 512) or tiny (|x| <= 0x1p-54).  The subtraction
     wraps tiny inputs above ThresExp as well.  */
  svbool_t uoflow = svcmpge (pg, svsub_x (pg, abstop, SmallExp), ThresExp);

  svfloat64_t tmp;
  svuint64_t sbits, ki;
  if (unlikely (svptest_any (pg, uoflow)))
    {
      svfloat64_t z
	  = sv_exp_core (pg, x, xtail, sign_bias, &tmp, &sbits, &ki, d);

      /* |x| is tiny (|x| <= 0x1p-54): abstop - SmallExp wrapped negative.  */
      svbool_t uflow
	  = svcmpge (pg, svsub_x (pg, abstop, SmallExp), 0x80000000);
      uflow = svand_z (pg, uoflow, uflow);
      /* |x| is huge (|x| >= 1024).  */
      svbool_t oflow = svcmpge (pg, abstop, HugeExp);
      oflow = svand_z (pg, uoflow, svbic_z (pg, oflow, uflow));

      /* For large |x| values (512 < |x| < 1024) scale * (1 + TMP) can overflow
    or underflow.  */
      svbool_t special = svbic_z (pg, uoflow, svorr_z (pg, uflow, oflow));

      /* Update result with special and large cases.  */
      z = sv_call_specialcase (tmp, sbits, ki, z, special);

      /* Handle underflow and overflow.  */
      svbool_t x_is_neg = svcmplt (pg, x, 0);
      svuint64_t sign_mask
	  = svlsl_x (pg, sign_bias, 52 - V_POW_EXP_TABLE_BITS);
      /* Overflowed lanes: +/-inf for positive x, +/-0 for negative x,
	 with the sign taken from sign_bias.  */
      svfloat64_t res_uoflow
	  = svsel (x_is_neg, sv_f64 (0.0), sv_f64 (INFINITY));
      res_uoflow = svreinterpret_f64 (
	  svorr_x (pg, svreinterpret_u64 (res_uoflow), sign_mask));
      /* Avoid spurious underflow for tiny x: the result is +/-1.  */
      svfloat64_t res_spurious_uflow
	  = svreinterpret_f64 (svorr_x (pg, sign_mask, 0x3ff0000000000000));

      z = svsel (oflow, res_uoflow, z);
      z = svsel (uflow, res_spurious_uflow, z);
      return z;
    }

  return sv_exp_core (pg, x, xtail, sign_bias, &tmp, &sbits, &ki, d);
}
343*f3087befSAndrew Turner 
/* Scalar fallback handling the special cases where |x| or |y| is 0, inf
   or nan, mirroring the scalar pow special-case logic.  For lanes that
   are not special it returns x unchanged (the vector result is kept).  */
static inline double
pow_sc (double x, double y)
{
  uint64_t ux = asuint64 (x);
  uint64_t uy = asuint64 (y);
  /* y is 0, inf or nan.  */
  if (unlikely (zeroinfnan (uy)))
    {
      if (2 * uy == 0)
	/* x^+-0 = 1 unless x is a signaling nan.  */
	return issignaling_inline (x) ? x + y : 1.0;
      if (ux == asuint64 (1.0))
	/* 1^y = 1 unless y is a signaling nan.  */
	return issignaling_inline (y) ? x + y : 1.0;
      if (2 * ux > 2 * asuint64 (INFINITY) || 2 * uy > 2 * asuint64 (INFINITY))
	/* Either argument is nan: propagate it.  */
	return x + y;
      if (2 * ux == 2 * asuint64 (1.0))
	/* (+-1)^+-inf = 1.  */
	return 1.0;
      if ((2 * ux < 2 * asuint64 (1.0)) == !(uy >> 63))
	return 0.0; /* |x|<1 && y==inf or |x|>1 && y==-inf.  */
      return y * y; /* Remaining cases give +inf.  */
    }
  /* x is 0, inf or nan (y is finite and non-zero here).  */
  if (unlikely (zeroinfnan (ux)))
    {
      double_t x2 = x * x;
      /* Negative zero/inf raised to an odd integer keeps the sign.  */
      if (ux >> 63 && checkint (uy) == 1)
	x2 = -x2;
      return (uy >> 63) ? 1 / x2 : x2;
    }
  /* Not a special lane: keep the vector result.  */
  return x;
}
373*f3087befSAndrew Turner 
/* SVE double-precision pow: computes pow(x, y) as exp (y * log (x)) per
   lane, with a scalar fallback for zero/inf/nan inputs.  */
svfloat64_t SV_NAME_D2 (pow) (svfloat64_t x, svfloat64_t y, const svbool_t pg)
{
  const struct data *d = ptr_barrier (&data);

  /* This preamble handles special case conditions used in the final scalar
     fallbacks. It also updates ix and sign_bias, that are used in the core
     computation too, i.e., exp( y * log (x) ).  */
  svuint64_t vix0 = svreinterpret_u64 (x);
  svuint64_t viy0 = svreinterpret_u64 (y);

  /* Negative x cases.  */
  svbool_t xisneg = svcmplt (pg, x, 0);

  /* Set sign_bias and ix depending on sign of x and nature of y.  */
  svbool_t yint_or_xpos = pg;
  svuint64_t sign_bias = sv_u64 (0);
  svuint64_t vix = vix0;
  if (unlikely (svptest_any (pg, xisneg)))
    {
      /* Determine nature of y: negative x is only valid for integer y.  */
      yint_or_xpos = sv_isint (xisneg, y);
      svbool_t yisodd_xisneg = sv_isodd (xisneg, y);
      /* ix set to abs(ix) if y is integer.  */
      vix = svand_m (yint_or_xpos, vix0, 0x7fffffffffffffff);
      /* Set to SignBias if x is negative and y is odd.  */
      sign_bias = svsel (yisodd_xisneg, sv_u64 (SignBias), sv_u64 (0));
    }

  /* Small cases of x: |x| < 0x1p-126.  */
  svbool_t xsmall = svaclt (yint_or_xpos, x, SmallBoundX);
  if (unlikely (svptest_any (yint_or_xpos, xsmall)))
    {
      /* Normalize subnormal x so exponent becomes negative.  */
      svuint64_t vtopx = svlsr_x (svptrue_b64 (), vix, 52);
      /* Only lanes with a zero biased exponent are truly subnormal.  */
      svbool_t topx_is_null = svcmpeq (xsmall, vtopx, 0);

      /* Multiply by 0x1p52 to normalize, then rebias the exponent.  */
      svuint64_t vix_norm = svreinterpret_u64 (svmul_m (xsmall, x, 0x1p52));
      vix_norm = svand_m (xsmall, vix_norm, 0x7fffffffffffffff);
      vix_norm = svsub_m (xsmall, vix_norm, 52ULL << 52);
      vix = svsel (topx_is_null, vix_norm, vix);
    }

  /* y_hi = log(ix, &y_lo).  */
  svfloat64_t vlo;
  svfloat64_t vhi = sv_log_inline (yint_or_xpos, vix, &vlo, d);

  /* z = exp(y_hi, y_lo, sign_bias).  Split y*log(x) into hi and lo parts
     to preserve the extra precision from the log.  */
  svfloat64_t vehi = svmul_x (svptrue_b64 (), y, vhi);
  svfloat64_t vemi = svmls_x (yint_or_xpos, vehi, y, vhi);
  svfloat64_t velo = svnmls_x (yint_or_xpos, vemi, y, vlo);
  svfloat64_t vz = sv_exp_inline (yint_or_xpos, vehi, velo, sign_bias, d);

  /* Cases of finite y and finite negative x: non-integer y gives nan.  */
  vz = svsel (yint_or_xpos, vz, sv_f64 (__builtin_nan ("")));

  /* Special cases of x or y: zero, inf and nan.  */
  svbool_t xspecial = sv_zeroinfnan (svptrue_b64 (), vix0);
  svbool_t yspecial = sv_zeroinfnan (svptrue_b64 (), viy0);
  svbool_t special = svorr_z (svptrue_b64 (), xspecial, yspecial);

  /* Cases of zero/inf/nan x or y: defer to the scalar fallback.  */
  if (unlikely (svptest_any (svptrue_b64 (), special)))
    vz = sv_call2_f64 (pow_sc, x, y, vz, special);

  return vz;
}
440*f3087befSAndrew Turner 
/* Test-harness registrations: signature, ULP bound and input intervals.  */
TEST_SIG (SV, D, 2, pow)
TEST_ULP (SV_NAME_D2 (pow), 0.55)
TEST_DISABLE_FENV (SV_NAME_D2 (pow))
/* Wide intervals spanning the whole domain but shared between x and y.  */
#define SV_POW_INTERVAL2(xlo, xhi, ylo, yhi, n)                               \
  TEST_INTERVAL2 (SV_NAME_D2 (pow), xlo, xhi, ylo, yhi, n)                    \
  TEST_INTERVAL2 (SV_NAME_D2 (pow), xlo, xhi, -ylo, -yhi, n)                  \
  TEST_INTERVAL2 (SV_NAME_D2 (pow), -xlo, -xhi, ylo, yhi, n)                  \
  TEST_INTERVAL2 (SV_NAME_D2 (pow), -xlo, -xhi, -ylo, -yhi, n)
/* SHL52 pastes 9 zero nibbles after a top-12-bits hex literal, turning a
   top12 value (e.g. SmallPowX) into a full 64-bit double bit pattern.  */
#define EXPAND(str) str##000000000
#define SHL52(str) EXPAND (str)
SV_POW_INTERVAL2 (0, SHL52 (SmallPowX), 0, inf, 40000)
SV_POW_INTERVAL2 (SHL52 (SmallPowX), SHL52 (BigPowX), 0, inf, 40000)
SV_POW_INTERVAL2 (SHL52 (BigPowX), inf, 0, inf, 40000)
SV_POW_INTERVAL2 (0, inf, 0, SHL52 (SmallPowY), 40000)
SV_POW_INTERVAL2 (0, inf, SHL52 (SmallPowY), SHL52 (BigPowY), 40000)
SV_POW_INTERVAL2 (0, inf, SHL52 (BigPowY), inf, 40000)
SV_POW_INTERVAL2 (0, inf, 0, inf, 1000)
/* x~1 or y~1.  */
SV_POW_INTERVAL2 (0x1p-1, 0x1p1, 0x1p-10, 0x1p10, 10000)
SV_POW_INTERVAL2 (0x1.ep-1, 0x1.1p0, 0x1p8, 0x1p16, 10000)
SV_POW_INTERVAL2 (0x1p-500, 0x1p500, 0x1p-1, 0x1p1, 10000)
/* around estimated argmaxs of ULP error.  */
SV_POW_INTERVAL2 (0x1p-300, 0x1p-200, 0x1p-20, 0x1p-10, 10000)
SV_POW_INTERVAL2 (0x1p50, 0x1p100, 0x1p-20, 0x1p-10, 10000)
/* x is negative, y is odd or even integer, or y is real not integer.  */
TEST_INTERVAL2 (SV_NAME_D2 (pow), -0.0, -10.0, 3.0, 3.0, 10000)
TEST_INTERVAL2 (SV_NAME_D2 (pow), -0.0, -10.0, 4.0, 4.0, 10000)
TEST_INTERVAL2 (SV_NAME_D2 (pow), -0.0, -10.0, 0.0, 10.0, 10000)
TEST_INTERVAL2 (SV_NAME_D2 (pow), 0.0, 10.0, -0.0, -10.0, 10000)
/* |x| is inf, y is odd or even integer, or y is real not integer.  */
SV_POW_INTERVAL2 (inf, inf, 0.5, 0.5, 1)
SV_POW_INTERVAL2 (inf, inf, 1.0, 1.0, 1)
SV_POW_INTERVAL2 (inf, inf, 2.0, 2.0, 1)
SV_POW_INTERVAL2 (inf, inf, 3.0, 3.0, 1)
/* 0.0^y.  */
SV_POW_INTERVAL2 (0.0, 0.0, 0.0, 0x1p120, 1000)
/* 1.0^y.  */
TEST_INTERVAL2 (SV_NAME_D2 (pow), 1.0, 1.0, 0.0, 0x1p-50, 1000)
TEST_INTERVAL2 (SV_NAME_D2 (pow), 1.0, 1.0, 0x1p-50, 1.0, 1000)
TEST_INTERVAL2 (SV_NAME_D2 (pow), 1.0, 1.0, 1.0, 0x1p100, 1000)
TEST_INTERVAL2 (SV_NAME_D2 (pow), 1.0, 1.0, -1.0, -0x1p120, 1000)
CLOSE_SVE_ATTR
484