/*
 * Single-precision SVE powf function.
 *
 * Copyright (c) 2023-2025, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

#include "sv_math.h"
#include "test_sig.h"
#include "test_defs.h"

/* The following data is used in the SVE pow core computation
   and special case detection.  */
#define Tinvc __v_powf_data.invc
#define Tlogc __v_powf_data.logc
#define Texp __v_powf_data.scale
#define SignBias (1 << (V_POWF_EXP2_TABLE_BITS + 11))
#define Norm 0x1p23f /* 0x4b000000.  */

/* Overall ULP error bound for pow is 2.6 ulp
   ~ 0.5 + 2^24 (128*Ln2*relerr_log2 + relerr_exp2).  */
static const struct data
{
  double log_poly[4];
  double exp_poly[3];
  float uflow_bound, oflow_bound, small_bound;
  uint32_t sign_bias, subnormal_bias, off;
} data = {
  /* rel err: 1.5 * 2^-30. Each coefficient is multiplied by the value of
     V_POWF_EXP2_N.  */
  .log_poly = { -0x1.6ff5daa3b3d7cp+3, 0x1.ec81d03c01aebp+3,
		-0x1.71547bb43f101p+4, 0x1.7154764a815cbp+5 },
  /* rel err: 1.69 * 2^-34.  */
  .exp_poly = {
    0x1.c6af84b912394p-20, /* A0 / V_POWF_EXP2_N^3.  */
    0x1.ebfce50fac4f3p-13, /* A1 / V_POWF_EXP2_N^2.  */
    0x1.62e42ff0c52d6p-6,  /* A2 / V_POWF_EXP2_N.  */
  },
  .uflow_bound = -0x1.2cp+12f, /* -150.0 * V_POWF_EXP2_N.  */
  .oflow_bound = 0x1p+12f, /* 128.0 * V_POWF_EXP2_N.  */
  .small_bound = 0x1p-126f,
  .off = 0x3f35d000,
  .sign_bias = SignBias,
  .subnormal_bias = 0x0b800000, /* 23 << 23.  */
};

#define A(i) sv_f64 (d->log_poly[i])
#define C(i) sv_f64 (d->exp_poly[i])

/* Check if x is an integer.  */
static inline svbool_t
svisint (svbool_t pg, svfloat32_t x)
{
  return svcmpeq (pg, svrintz_z (pg, x), x);
}

/* Check if x is real, not integer valued.  */
static inline svbool_t
svisnotint (svbool_t pg, svfloat32_t x)
{
  return svcmpne (pg, svrintz_z (pg, x), x);
}

/* Check if x is an odd integer.  */
static inline svbool_t
svisodd (svbool_t pg, svfloat32_t x)
{
  svfloat32_t y = svmul_x (pg, x, 0.5f);
  return svisnotint (pg, y);
}

/* Check if zero, inf or nan.  */
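/* Doubling the bit pattern shifts out the sign bit: 2 * i - 1 then wraps +-0
   up to UINT32_MAX, while any value with |x| >= infinity stays at or above
   2u * 0x7f800000 - 1, so a single unsigned compare flags zero, inf and nan
   lanes.  */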
static inline svbool_t
sv_zeroinfnan (svbool_t pg, svuint32_t i)
{
  return svcmpge (pg, svsub_x (pg, svadd_x (pg, i, i), 1),
		  2u * 0x7f800000 - 1);
}

/* Returns 0 if not int, 1 if odd int, 2 if even int.  The argument is
   the bit representation of a non-zero finite floating-point value.  */
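/* The test works on the biased exponent e: e < 0x7f means |y| < 1, so a
   non-zero y cannot be an integer; e > 0x7f + 23 means all mantissa bits sit
   above the binary point, so y is an even integer; otherwise the low
   (0x7f + 23 - e) bits are the fractional part (non-zero means not an
   integer) and the next bit up is the units bit (set means odd).  For
   example, checkint (asuint (3.0f)) == 1, checkint (asuint (4.0f)) == 2 and
   checkint (asuint (2.5f)) == 0.  */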
static inline int
checkint (uint32_t iy)
{
  int e = iy >> 23 & 0xff;
  if (e < 0x7f)
    return 0;
  if (e > 0x7f + 23)
    return 2;
  if (iy & ((1 << (0x7f + 23 - e)) - 1))
    return 0;
  if (iy & (1 << (0x7f + 23 - e)))
    return 1;
  return 2;
}

/* Check if zero, inf or nan.  */
static inline int
zeroinfnan (uint32_t ix)
{
  return 2 * ix - 1 >= 2u * 0x7f800000 - 1;
}

/* A scalar subroutine used to fix main power special cases. Similar to the
   preamble of scalar powf except that we do not update ix and sign_bias. This
   is done in the preamble of the SVE powf.  */
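/* The first block below implements the usual IEEE-754 (C annex F) rules for
   pow with a zero, infinite or NaN y: pow (x, +-0) is 1 for any x,
   pow (+1.0, y) is 1 for any y, any other NaN operand propagates, |x| == 1
   with infinite y gives 1, |x| < 1 with y == +inf or |x| > 1 with y == -inf
   gives +0, and the remaining infinite-y cases give +inf (computed as y * y);
   signalling NaNs raise invalid via x + y.  The second block handles x being
   0, +-inf or NaN with finite non-zero y: the result is built from x * x,
   negated when x is negative and y an odd integer, and reciprocated when y is
   negative.  */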
static inline float
powf_specialcase (float x, float y, float z)
{
  uint32_t ix = asuint (x);
  uint32_t iy = asuint (y);
  /* Either (x < 0x1p-126 or inf or nan) or (y is 0 or inf or nan).  */
  if (unlikely (zeroinfnan (iy)))
    {
      if (2 * iy == 0)
	return issignalingf_inline (x) ? x + y : 1.0f;
      if (ix == 0x3f800000)
	return issignalingf_inline (y) ? x + y : 1.0f;
      if (2 * ix > 2u * 0x7f800000 || 2 * iy > 2u * 0x7f800000)
	return x + y;
      if (2 * ix == 2 * 0x3f800000)
	return 1.0f;
      if ((2 * ix < 2 * 0x3f800000) == !(iy & 0x80000000))
	return 0.0f; /* |x|<1 && y==inf or |x|>1 && y==-inf.  */
      return y * y;
    }
  if (unlikely (zeroinfnan (ix)))
    {
      float_t x2 = x * x;
      if (ix & 0x80000000 && checkint (iy) == 1)
	x2 = -x2;
      return iy & 0x80000000 ? 1 / x2 : x2;
    }
  /* We need a return here in case x<0 and y is integer, but all other tests
     need to be run.  */
  return z;
}

/* Scalar fallback for special case routines with custom signature.  */
static svfloat32_t NOINLINE
sv_call_powf_sc (svfloat32_t x1, svfloat32_t x2, svfloat32_t y)
{
  /* Special cases of x or y: zero, inf and nan.  */
  svbool_t xspecial = sv_zeroinfnan (svptrue_b32 (), svreinterpret_u32 (x1));
  svbool_t yspecial = sv_zeroinfnan (svptrue_b32 (), svreinterpret_u32 (x2));
  svbool_t cmp = svorr_z (svptrue_b32 (), xspecial, yspecial);

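  /* Process the flagged lanes one at a time: svpfirst/svpnext_b32 walk the
     predicate cmp element by element, svclastb extracts the current lane of
     each input as a scalar, and the scalar result of powf_specialcase is
     broadcast and merged back into y under that single-lane predicate.  */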
  svbool_t p = svpfirst (cmp, svpfalse ());
  while (svptest_any (cmp, p))
    {
      float sx1 = svclastb (p, 0, x1);
      float sx2 = svclastb (p, 0, x2);
      float elem = svclastb (p, 0, y);
      elem = powf_specialcase (sx1, sx2, elem);
      svfloat32_t y2 = sv_f32 (elem);
      y = svsel (p, y2, y);
      p = svpnext_b32 (cmp, p);
    }
  return y;
}

/* Compute core for half of the lanes in double precision.  */
static inline svfloat64_t
sv_powf_core_ext (const svbool_t pg, svuint64_t i, svfloat64_t z, svint64_t k,
		  svfloat64_t y, svuint64_t sign_bias, svfloat64_t *pylogx,
		  const struct data *d)
{
  svfloat64_t invc = svld1_gather_index (pg, Tinvc, i);
  svfloat64_t logc = svld1_gather_index (pg, Tlogc, i);

  /* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k.  */
  svfloat64_t r = svmla_x (pg, sv_f64 (-1.0), z, invc);
  svfloat64_t y0 = svadd_x (pg, logc, svcvt_f64_x (pg, k));

  /* Polynomial to approximate log1p(r)/ln2.  */
  svfloat64_t logx = A (0);
  logx = svmad_x (pg, r, logx, A (1));
  logx = svmad_x (pg, r, logx, A (2));
  logx = svmad_x (pg, r, logx, A (3));
  logx = svmad_x (pg, r, logx, y0);
  *pylogx = svmul_x (pg, y, logx);

  /* *pylogx - kd is in [-1, 1] in non-nearest rounding modes.  */
  svfloat64_t kd = svrinta_x (svptrue_b64 (), *pylogx);
  svuint64_t ki = svreinterpret_u64 (svcvt_s64_x (svptrue_b64 (), kd));

  r = svsub_x (pg, *pylogx, kd);

  /* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1).  */
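  /* The gather picks the scale-table entry for the low V_POWF_EXP2_TABLE_BITS
     bits of ki (the fractional part of the exponent); adding
     (ki + sign_bias) << (52 - V_POWF_EXP2_TABLE_BITS) then shifts the
     remaining integer part of ki into the exponent field of the double.
     Since sign_bias is SignBias = 1 << (V_POWF_EXP2_TABLE_BITS + 11), after
     the shift it lands exactly on bit 63, so s is negated for lanes where
     x < 0 and y is an odd integer.  */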
  svuint64_t t = svld1_gather_index (
      svptrue_b64 (), Texp, svand_x (svptrue_b64 (), ki, V_POWF_EXP2_N - 1));
  svuint64_t ski = svadd_x (svptrue_b64 (), ki, sign_bias);
  t = svadd_x (svptrue_b64 (), t,
	       svlsl_x (svptrue_b64 (), ski, 52 - V_POWF_EXP2_TABLE_BITS));
  svfloat64_t s = svreinterpret_f64 (t);

  svfloat64_t p = C (0);
  p = svmla_x (pg, C (1), p, r);
  p = svmla_x (pg, C (2), p, r);
  p = svmla_x (pg, s, p, svmul_x (svptrue_b64 (), s, r));

  return p;
}

/* Widen vector to double precision and compute core on both halves of the
   vector. Lower cost of promotion by considering all lanes active.  */
static inline svfloat32_t
sv_powf_core (const svbool_t pg, svuint32_t i, svuint32_t iz, svint32_t k,
	      svfloat32_t y, svuint32_t sign_bias, svfloat32_t *pylogx,
	      const struct data *d)
{
  const svbool_t ptrue = svptrue_b64 ();

  /* Unpack and promote input vectors (pg, y, z, i, k and sign_bias) into two
     halves in order to perform the core computation in double precision.  */
  const svbool_t pg_lo = svunpklo (pg);
  const svbool_t pg_hi = svunpkhi (pg);
  svfloat64_t y_lo
      = svcvt_f64_x (pg, svreinterpret_f32 (svunpklo (svreinterpret_u32 (y))));
  svfloat64_t y_hi
      = svcvt_f64_x (pg, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (y))));
  svfloat64_t z_lo = svcvt_f64_x (pg, svreinterpret_f32 (svunpklo (iz)));
  svfloat64_t z_hi = svcvt_f64_x (pg, svreinterpret_f32 (svunpkhi (iz)));
  svuint64_t i_lo = svunpklo (i);
  svuint64_t i_hi = svunpkhi (i);
  svint64_t k_lo = svunpklo (k);
  svint64_t k_hi = svunpkhi (k);
  svuint64_t sign_bias_lo = svunpklo (sign_bias);
  svuint64_t sign_bias_hi = svunpkhi (sign_bias);

  /* Compute each part in double precision.  */
  svfloat64_t ylogx_lo, ylogx_hi;
  svfloat64_t lo = sv_powf_core_ext (pg_lo, i_lo, z_lo, k_lo, y_lo,
				     sign_bias_lo, &ylogx_lo, d);
  svfloat64_t hi = sv_powf_core_ext (pg_hi, i_hi, z_hi, k_hi, y_hi,
				     sign_bias_hi, &ylogx_hi, d);

  /* Convert back to single-precision and interleave.  */
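  /* The narrowing conversions place each result in the even 32-bit lane of
     its 64-bit container, so svuzp1 (the even elements of its two operands,
     concatenated) restores the original single-precision lane order from the
     unpacked low and high halves.  */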
  svfloat32_t ylogx_lo_32 = svcvt_f32_x (ptrue, ylogx_lo);
  svfloat32_t ylogx_hi_32 = svcvt_f32_x (ptrue, ylogx_hi);
  *pylogx = svuzp1 (ylogx_lo_32, ylogx_hi_32);
  svfloat32_t lo_32 = svcvt_f32_x (ptrue, lo);
  svfloat32_t hi_32 = svcvt_f32_x (ptrue, hi);
  return svuzp1 (lo_32, hi_32);
}

/* Implementation of SVE powf.
   Provides the same accuracy as AdvSIMD powf, since it relies on the same
   algorithm. The theoretical maximum error is under 2.60 ULPs.
   Maximum measured error is 2.57 ULPs:
   SV_NAME_F2 (pow) (0x1.031706p+0, 0x1.ce2ec2p+12) got 0x1.fff868p+127
						    want 0x1.fff862p+127.  */
svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg)
{
  const struct data *d = ptr_barrier (&data);

  svuint32_t vix0 = svreinterpret_u32 (x);
  svuint32_t viy0 = svreinterpret_u32 (y);

  /* Negative x cases.  */
  svbool_t xisneg = svcmplt (pg, x, sv_f32 (0));

  /* Set sign_bias and ix depending on sign of x and nature of y.  */
  svbool_t yint_or_xpos = pg;
  svuint32_t sign_bias = sv_u32 (0);
  svuint32_t vix = vix0;
  if (unlikely (svptest_any (pg, xisneg)))
    {
      /* Determine nature of y.  */
      yint_or_xpos = svisint (xisneg, y);
      svbool_t yisodd_xisneg = svisodd (xisneg, y);
      /* ix set to abs(ix) if y is integer.  */
      vix = svand_m (yint_or_xpos, vix0, 0x7fffffff);
      /* Set to SignBias if x is negative and y is odd.  */
      sign_bias = svsel (yisodd_xisneg, sv_u32 (d->sign_bias), sv_u32 (0));
    }

  /* Special cases of x or y: zero, inf and nan.  */
  svbool_t xspecial = sv_zeroinfnan (pg, vix0);
  svbool_t yspecial = sv_zeroinfnan (pg, viy0);
  svbool_t cmp = svorr_z (pg, xspecial, yspecial);

  /* Small cases of x: |x| < 0x1p-126.  */
  svbool_t xsmall = svaclt (yint_or_xpos, x, d->small_bound);
  if (unlikely (svptest_any (yint_or_xpos, xsmall)))
    {
      /* Normalize subnormal x so exponent becomes negative.  */
      svuint32_t vix_norm = svreinterpret_u32 (svmul_x (xsmall, x, Norm));
      vix_norm = svand_x (xsmall, vix_norm, 0x7fffffff);
      vix_norm = svsub_x (xsmall, vix_norm, d->subnormal_bias);
      vix = svsel (xsmall, vix_norm, vix);
    }
  /* Part of core computation carried in working precision.  */
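  /* Reduce x to 2^e * z with z = asfloat (iz) roughly in [0.71, 1.42):
     subtracting off biases the exponent/mantissa split so z straddles 1, i is
     the log table index taken from the leading mantissa bits of tmp, top
     holds the exponent bits removed from x, and k is that exponent already
     scaled by V_POWF_EXP2_N (hence the shift by 23 - V_POWF_EXP2_TABLE_BITS
     rather than 23).  */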
  svuint32_t tmp = svsub_x (yint_or_xpos, vix, d->off);
  svuint32_t i = svand_x (
      yint_or_xpos, svlsr_x (yint_or_xpos, tmp, (23 - V_POWF_LOG2_TABLE_BITS)),
      V_POWF_LOG2_N - 1);
  svuint32_t top = svand_x (yint_or_xpos, tmp, 0xff800000);
  svuint32_t iz = svsub_x (yint_or_xpos, vix, top);
  svint32_t k = svasr_x (yint_or_xpos, svreinterpret_s32 (top),
			 (23 - V_POWF_EXP2_TABLE_BITS));

  /* Compute core in extended precision and return intermediate ylogx results
     to handle cases of underflow and overflow in exp.  */
  svfloat32_t ylogx;
  svfloat32_t ret
      = sv_powf_core (yint_or_xpos, i, iz, k, y, sign_bias, &ylogx, d);

  /* Handle exp special cases of underflow and overflow.  */
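  /* sign_bias is either 0 or 1 << (V_POWF_EXP2_TABLE_BITS + 11), so shifting
     it by 20 - V_POWF_EXP2_TABLE_BITS leaves either 0 or the sign bit
     (1 << 31): ret_uflow is +/-0 and ret_oflow is +/-inf, with the sign
     expected for negative x and odd integer y.  */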
  svuint32_t sign
      = svlsl_x (yint_or_xpos, sign_bias, 20 - V_POWF_EXP2_TABLE_BITS);
  svfloat32_t ret_oflow
      = svreinterpret_f32 (svorr_x (yint_or_xpos, sign, asuint (INFINITY)));
  svfloat32_t ret_uflow = svreinterpret_f32 (sign);
  ret = svsel (svcmple (yint_or_xpos, ylogx, d->uflow_bound), ret_uflow, ret);
  ret = svsel (svcmpgt (yint_or_xpos, ylogx, d->oflow_bound), ret_oflow, ret);

  /* Cases of finite y and finite negative x.  */
  ret = svsel (yint_or_xpos, ret, sv_f32 (__builtin_nanf ("")));

  if (unlikely (svptest_any (cmp, cmp)))
    return sv_call_powf_sc (x, y, ret);

  return ret;
}

TEST_SIG (SV, F, 2, pow)
TEST_ULP (SV_NAME_F2 (pow), 2.08)
TEST_DISABLE_FENV (SV_NAME_F2 (pow))
/* Wide intervals spanning the whole domain but shared between x and y.  */
#define SV_POWF_INTERVAL2(xlo, xhi, ylo, yhi, n)                              \
  TEST_INTERVAL2 (SV_NAME_F2 (pow), xlo, xhi, ylo, yhi, n)                    \
  TEST_INTERVAL2 (SV_NAME_F2 (pow), xlo, xhi, -ylo, -yhi, n)                  \
  TEST_INTERVAL2 (SV_NAME_F2 (pow), -xlo, -xhi, ylo, yhi, n)                  \
  TEST_INTERVAL2 (SV_NAME_F2 (pow), -xlo, -xhi, -ylo, -yhi, n)
SV_POWF_INTERVAL2 (0, 0x1p-126, 0, inf, 40000)
SV_POWF_INTERVAL2 (0x1p-126, 1, 0, inf, 50000)
SV_POWF_INTERVAL2 (1, inf, 0, inf, 50000)
/* x~1 or y~1.  */
SV_POWF_INTERVAL2 (0x1p-1, 0x1p1, 0x1p-10, 0x1p10, 10000)
SV_POWF_INTERVAL2 (0x1.ep-1, 0x1.1p0, 0x1p8, 0x1p16, 10000)
SV_POWF_INTERVAL2 (0x1p-500, 0x1p500, 0x1p-1, 0x1p1, 10000)
/* around estimated argmaxs of ULP error.  */
SV_POWF_INTERVAL2 (0x1p-300, 0x1p-200, 0x1p-20, 0x1p-10, 10000)
SV_POWF_INTERVAL2 (0x1p50, 0x1p100, 0x1p-20, 0x1p-10, 10000)
/* x is negative, y is odd or even integer, or y is real not integer.  */
TEST_INTERVAL2 (SV_NAME_F2 (pow), -0.0, -10.0, 3.0, 3.0, 10000)
TEST_INTERVAL2 (SV_NAME_F2 (pow), -0.0, -10.0, 4.0, 4.0, 10000)
TEST_INTERVAL2 (SV_NAME_F2 (pow), -0.0, -10.0, 0.0, 10.0, 10000)
TEST_INTERVAL2 (SV_NAME_F2 (pow), 0.0, 10.0, -0.0, -10.0, 10000)
/* |x| is inf, y is odd or even integer, or y is real not integer.  */
SV_POWF_INTERVAL2 (inf, inf, 0.5, 0.5, 1)
SV_POWF_INTERVAL2 (inf, inf, 1.0, 1.0, 1)
SV_POWF_INTERVAL2 (inf, inf, 2.0, 2.0, 1)
SV_POWF_INTERVAL2 (inf, inf, 3.0, 3.0, 1)
/* 0.0^y.  */
SV_POWF_INTERVAL2 (0.0, 0.0, 0.0, 0x1p120, 1000)
/* 1.0^y.  */
TEST_INTERVAL2 (SV_NAME_F2 (pow), 1.0, 1.0, 0.0, 0x1p-50, 1000)
TEST_INTERVAL2 (SV_NAME_F2 (pow), 1.0, 1.0, 0x1p-50, 1.0, 1000)
TEST_INTERVAL2 (SV_NAME_F2 (pow), 1.0, 1.0, 1.0, 0x1p100, 1000)
TEST_INTERVAL2 (SV_NAME_F2 (pow), 1.0, 1.0, -1.0, -0x1p120, 1000)
CLOSE_SVE_ATTR
364