xref: /freebsd/contrib/arm-optimized-routines/pl/math/sv_powf_2u6.c (revision a8089ea5aee578e08acab2438e82fc9a9ae50ed8)
1 /*
2  * Single-precision SVE powf function.
3  *
4  * Copyright (c) 2023, Arm Limited.
5  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
6  */
7 
8 #include "sv_math.h"
9 #include "pl_sig.h"
10 #include "pl_test.h"
11 
12 /* The following data is used in the SVE pow core computation
13    and special case detection.  */
14 #define Tinvc __v_powf_data.invc
15 #define Tlogc __v_powf_data.logc
16 #define Texp __v_powf_data.scale
17 #define SignBias (1 << (V_POWF_EXP2_TABLE_BITS + 11))
18 #define Shift 0x1.8p52
19 #define Norm 0x1p23f /* 0x4b000000.  */
20 
21 /* Overall ULP error bound for pow is 2.6 ulp
22    ~ 0.5 + 2^24 (128*Ln2*relerr_log2 + relerr_exp2).  */
23 static const struct data
24 {
25   double log_poly[4];
26   double exp_poly[3];
27   float uflow_bound, oflow_bound, small_bound;
28   uint32_t sign_bias, sign_mask, subnormal_bias, off;
29 } data = {
30   /* rel err: 1.5 * 2^-30. Each coefficients is multiplied the value of
31      V_POWF_EXP2_N.  */
32   .log_poly = { -0x1.6ff5daa3b3d7cp+3, 0x1.ec81d03c01aebp+3,
33 		-0x1.71547bb43f101p+4, 0x1.7154764a815cbp+5 },
34   /* rel err: 1.69 * 2^-34.  */
35   .exp_poly = {
36     0x1.c6af84b912394p-20, /* A0 / V_POWF_EXP2_N^3.  */
37     0x1.ebfce50fac4f3p-13, /* A1 / V_POWF_EXP2_N^2.  */
38     0x1.62e42ff0c52d6p-6,   /* A3 / V_POWF_EXP2_N.  */
39   },
40   .uflow_bound = -0x1.2cp+12f, /* -150.0 * V_POWF_EXP2_N.  */
41   .oflow_bound = 0x1p+12f, /* 128.0 * V_POWF_EXP2_N.  */
42   .small_bound = 0x1p-126f,
43   .off = 0x3f35d000,
44   .sign_bias = SignBias,
45   .sign_mask = 0x80000000,
46   .subnormal_bias = 0x0b800000, /* 23 << 23.  */
47 };
48 
49 #define A(i) sv_f64 (d->log_poly[i])
50 #define C(i) sv_f64 (d->exp_poly[i])
51 
52 /* Check if x is an integer.  */
53 static inline svbool_t
54 svisint (svbool_t pg, svfloat32_t x)
55 {
56   return svcmpeq (pg, svrintz_z (pg, x), x);
57 }
58 
59 /* Check if x is real not integer valued.  */
60 static inline svbool_t
61 svisnotint (svbool_t pg, svfloat32_t x)
62 {
63   return svcmpne (pg, svrintz_z (pg, x), x);
64 }
65 
66 /* Check if x is an odd integer.  */
67 static inline svbool_t
68 svisodd (svbool_t pg, svfloat32_t x)
69 {
70   svfloat32_t y = svmul_x (pg, x, 0.5f);
71   return svisnotint (pg, y);
72 }
73 
74 /* Check if zero, inf or nan.  */
75 static inline svbool_t
76 sv_zeroinfnan (svbool_t pg, svuint32_t i)
77 {
78   return svcmpge (pg, svsub_x (pg, svmul_x (pg, i, 2u), 1),
79 		  2u * 0x7f800000 - 1);
80 }
81 
82 /* Returns 0 if not int, 1 if odd int, 2 if even int.  The argument is
83    the bit representation of a non-zero finite floating-point value.  */
84 static inline int
85 checkint (uint32_t iy)
86 {
87   int e = iy >> 23 & 0xff;
88   if (e < 0x7f)
89     return 0;
90   if (e > 0x7f + 23)
91     return 2;
92   if (iy & ((1 << (0x7f + 23 - e)) - 1))
93     return 0;
94   if (iy & (1 << (0x7f + 23 - e)))
95     return 1;
96   return 2;
97 }
98 
99 /* Check if zero, inf or nan.  */
100 static inline int
101 zeroinfnan (uint32_t ix)
102 {
103   return 2 * ix - 1 >= 2u * 0x7f800000 - 1;
104 }
105 
106 /* A scalar subroutine used to fix main power special cases. Similar to the
107    preamble of finite_powf except that we do not update ix and sign_bias. This
108    is done in the preamble of the SVE powf.  */
109 static inline float
110 powf_specialcase (float x, float y, float z)
111 {
112   uint32_t ix = asuint (x);
113   uint32_t iy = asuint (y);
114   /* Either (x < 0x1p-126 or inf or nan) or (y is 0 or inf or nan).  */
115   if (unlikely (zeroinfnan (iy)))
116     {
117       if (2 * iy == 0)
118 	return issignalingf_inline (x) ? x + y : 1.0f;
119       if (ix == 0x3f800000)
120 	return issignalingf_inline (y) ? x + y : 1.0f;
121       if (2 * ix > 2u * 0x7f800000 || 2 * iy > 2u * 0x7f800000)
122 	return x + y;
123       if (2 * ix == 2 * 0x3f800000)
124 	return 1.0f;
125       if ((2 * ix < 2 * 0x3f800000) == !(iy & 0x80000000))
126 	return 0.0f; /* |x|<1 && y==inf or |x|>1 && y==-inf.  */
127       return y * y;
128     }
129   if (unlikely (zeroinfnan (ix)))
130     {
131       float_t x2 = x * x;
132       if (ix & 0x80000000 && checkint (iy) == 1)
133 	x2 = -x2;
134       return iy & 0x80000000 ? 1 / x2 : x2;
135     }
136   /* We need a return here in case x<0 and y is integer, but all other tests
137    need to be run.  */
138   return z;
139 }
140 
141 /* Scalar fallback for special case routines with custom signature.  */
142 static inline svfloat32_t
143 sv_call_powf_sc (svfloat32_t x1, svfloat32_t x2, svfloat32_t y, svbool_t cmp)
144 {
145   svbool_t p = svpfirst (cmp, svpfalse ());
146   while (svptest_any (cmp, p))
147     {
148       float sx1 = svclastb (p, 0, x1);
149       float sx2 = svclastb (p, 0, x2);
150       float elem = svclastb (p, 0, y);
151       elem = powf_specialcase (sx1, sx2, elem);
152       svfloat32_t y2 = sv_f32 (elem);
153       y = svsel (p, y2, y);
154       p = svpnext_b32 (cmp, p);
155     }
156   return y;
157 }
158 
159 /* Compute core for half of the lanes in double precision.  */
160 static inline svfloat64_t
161 sv_powf_core_ext (const svbool_t pg, svuint64_t i, svfloat64_t z, svint64_t k,
162 		  svfloat64_t y, svuint64_t sign_bias, svfloat64_t *pylogx,
163 		  const struct data *d)
164 {
165   svfloat64_t invc = svld1_gather_index (pg, Tinvc, i);
166   svfloat64_t logc = svld1_gather_index (pg, Tlogc, i);
167 
168   /* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k.  */
169   svfloat64_t r = svmla_x (pg, sv_f64 (-1.0), z, invc);
170   svfloat64_t y0 = svadd_x (pg, logc, svcvt_f64_x (pg, k));
171 
172   /* Polynomial to approximate log1p(r)/ln2.  */
173   svfloat64_t logx = A (0);
174   logx = svmla_x (pg, A (1), r, logx);
175   logx = svmla_x (pg, A (2), r, logx);
176   logx = svmla_x (pg, A (3), r, logx);
177   logx = svmla_x (pg, y0, r, logx);
178   *pylogx = svmul_x (pg, y, logx);
179 
180   /* z - kd is in [-1, 1] in non-nearest rounding modes.  */
181   svfloat64_t kd = svadd_x (pg, *pylogx, Shift);
182   svuint64_t ki = svreinterpret_u64 (kd);
183   kd = svsub_x (pg, kd, Shift);
184 
185   r = svsub_x (pg, *pylogx, kd);
186 
187   /* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1).  */
188   svuint64_t t
189       = svld1_gather_index (pg, Texp, svand_x (pg, ki, V_POWF_EXP2_N - 1));
190   svuint64_t ski = svadd_x (pg, ki, sign_bias);
191   t = svadd_x (pg, t, svlsl_x (pg, ski, 52 - V_POWF_EXP2_TABLE_BITS));
192   svfloat64_t s = svreinterpret_f64 (t);
193 
194   svfloat64_t p = C (0);
195   p = svmla_x (pg, C (1), p, r);
196   p = svmla_x (pg, C (2), p, r);
197   p = svmla_x (pg, s, p, svmul_x (pg, s, r));
198 
199   return p;
200 }
201 
202 /* Widen vector to double precision and compute core on both halves of the
203    vector. Lower cost of promotion by considering all lanes active.  */
204 static inline svfloat32_t
205 sv_powf_core (const svbool_t pg, svuint32_t i, svuint32_t iz, svint32_t k,
206 	      svfloat32_t y, svuint32_t sign_bias, svfloat32_t *pylogx,
207 	      const struct data *d)
208 {
209   const svbool_t ptrue = svptrue_b64 ();
210 
211   /* Unpack and promote input vectors (pg, y, z, i, k and sign_bias) into two in
212      order to perform core computation in double precision.  */
213   const svbool_t pg_lo = svunpklo (pg);
214   const svbool_t pg_hi = svunpkhi (pg);
215   svfloat64_t y_lo = svcvt_f64_x (
216       ptrue, svreinterpret_f32 (svunpklo (svreinterpret_u32 (y))));
217   svfloat64_t y_hi = svcvt_f64_x (
218       ptrue, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (y))));
219   svfloat32_t z = svreinterpret_f32 (iz);
220   svfloat64_t z_lo = svcvt_f64_x (
221       ptrue, svreinterpret_f32 (svunpklo (svreinterpret_u32 (z))));
222   svfloat64_t z_hi = svcvt_f64_x (
223       ptrue, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (z))));
224   svuint64_t i_lo = svunpklo (i);
225   svuint64_t i_hi = svunpkhi (i);
226   svint64_t k_lo = svunpklo (k);
227   svint64_t k_hi = svunpkhi (k);
228   svuint64_t sign_bias_lo = svunpklo (sign_bias);
229   svuint64_t sign_bias_hi = svunpkhi (sign_bias);
230 
231   /* Compute each part in double precision.  */
232   svfloat64_t ylogx_lo, ylogx_hi;
233   svfloat64_t lo = sv_powf_core_ext (pg_lo, i_lo, z_lo, k_lo, y_lo,
234 				     sign_bias_lo, &ylogx_lo, d);
235   svfloat64_t hi = sv_powf_core_ext (pg_hi, i_hi, z_hi, k_hi, y_hi,
236 				     sign_bias_hi, &ylogx_hi, d);
237 
238   /* Convert back to single-precision and interleave.  */
239   svfloat32_t ylogx_lo_32 = svcvt_f32_x (ptrue, ylogx_lo);
240   svfloat32_t ylogx_hi_32 = svcvt_f32_x (ptrue, ylogx_hi);
241   *pylogx = svuzp1 (ylogx_lo_32, ylogx_hi_32);
242   svfloat32_t lo_32 = svcvt_f32_x (ptrue, lo);
243   svfloat32_t hi_32 = svcvt_f32_x (ptrue, hi);
244   return svuzp1 (lo_32, hi_32);
245 }
246 
247 /* Implementation of SVE powf.
248    Provides the same accuracy as AdvSIMD powf, since it relies on the same
249    algorithm. The theoretical maximum error is under 2.60 ULPs.
250    Maximum measured error is 2.56 ULPs:
251    SV_NAME_F2 (pow) (0x1.004118p+0, 0x1.5d14a4p+16) got 0x1.fd4bp+127
252 						   want 0x1.fd4b06p+127.  */
253 svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg)
254 {
255   const struct data *d = ptr_barrier (&data);
256 
257   svuint32_t vix0 = svreinterpret_u32 (x);
258   svuint32_t viy0 = svreinterpret_u32 (y);
259 
260   /* Negative x cases.  */
261   svuint32_t sign_bit = svand_m (pg, vix0, d->sign_mask);
262   svbool_t xisneg = svcmpeq (pg, sign_bit, d->sign_mask);
263 
264   /* Set sign_bias and ix depending on sign of x and nature of y.  */
265   svbool_t yisnotint_xisneg = svpfalse_b ();
266   svuint32_t sign_bias = sv_u32 (0);
267   svuint32_t vix = vix0;
268   if (unlikely (svptest_any (pg, xisneg)))
269     {
270       /* Determine nature of y.  */
271       yisnotint_xisneg = svisnotint (xisneg, y);
272       svbool_t yisint_xisneg = svisint (xisneg, y);
273       svbool_t yisodd_xisneg = svisodd (xisneg, y);
274       /* ix set to abs(ix) if y is integer.  */
275       vix = svand_m (yisint_xisneg, vix0, 0x7fffffff);
276       /* Set to SignBias if x is negative and y is odd.  */
277       sign_bias = svsel (yisodd_xisneg, sv_u32 (d->sign_bias), sv_u32 (0));
278     }
279 
280   /* Special cases of x or y: zero, inf and nan.  */
281   svbool_t xspecial = sv_zeroinfnan (pg, vix0);
282   svbool_t yspecial = sv_zeroinfnan (pg, viy0);
283   svbool_t cmp = svorr_z (pg, xspecial, yspecial);
284 
285   /* Small cases of x: |x| < 0x1p-126.  */
286   svbool_t xsmall = svaclt (pg, x, d->small_bound);
287   if (unlikely (svptest_any (pg, xsmall)))
288     {
289       /* Normalize subnormal x so exponent becomes negative.  */
290       svuint32_t vix_norm = svreinterpret_u32 (svmul_x (xsmall, x, Norm));
291       vix_norm = svand_x (xsmall, vix_norm, 0x7fffffff);
292       vix_norm = svsub_x (xsmall, vix_norm, d->subnormal_bias);
293       vix = svsel (xsmall, vix_norm, vix);
294     }
295   /* Part of core computation carried in working precision.  */
296   svuint32_t tmp = svsub_x (pg, vix, d->off);
297   svuint32_t i = svand_x (pg, svlsr_x (pg, tmp, (23 - V_POWF_LOG2_TABLE_BITS)),
298 			  V_POWF_LOG2_N - 1);
299   svuint32_t top = svand_x (pg, tmp, 0xff800000);
300   svuint32_t iz = svsub_x (pg, vix, top);
301   svint32_t k
302       = svasr_x (pg, svreinterpret_s32 (top), (23 - V_POWF_EXP2_TABLE_BITS));
303 
304   /* Compute core in extended precision and return intermediate ylogx results to
305       handle cases of underflow and underflow in exp.  */
306   svfloat32_t ylogx;
307   svfloat32_t ret = sv_powf_core (pg, i, iz, k, y, sign_bias, &ylogx, d);
308 
309   /* Handle exp special cases of underflow and overflow.  */
310   svuint32_t sign = svlsl_x (pg, sign_bias, 20 - V_POWF_EXP2_TABLE_BITS);
311   svfloat32_t ret_oflow
312       = svreinterpret_f32 (svorr_x (pg, sign, asuint (INFINITY)));
313   svfloat32_t ret_uflow = svreinterpret_f32 (sign);
314   ret = svsel (svcmple (pg, ylogx, d->uflow_bound), ret_uflow, ret);
315   ret = svsel (svcmpgt (pg, ylogx, d->oflow_bound), ret_oflow, ret);
316 
317   /* Cases of finite y and finite negative x.  */
318   ret = svsel (yisnotint_xisneg, sv_f32 (__builtin_nanf ("")), ret);
319 
320   if (unlikely (svptest_any (pg, cmp)))
321     return sv_call_powf_sc (x, y, ret, cmp);
322 
323   return ret;
324 }
325 
326 PL_SIG (SV, F, 2, pow)
327 PL_TEST_ULP (SV_NAME_F2 (pow), 2.06)
328 /* Wide intervals spanning the whole domain but shared between x and y.  */
329 #define SV_POWF_INTERVAL2(xlo, xhi, ylo, yhi, n)                               \
330   PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), xlo, xhi, ylo, yhi, n)                  \
331   PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), xlo, xhi, -ylo, -yhi, n)                \
332   PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), -xlo, -xhi, ylo, yhi, n)                \
333   PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), -xlo, -xhi, -ylo, -yhi, n)
334 SV_POWF_INTERVAL2 (0, 0x1p-126, 0, inf, 40000)
335 SV_POWF_INTERVAL2 (0x1p-126, 1, 0, inf, 50000)
336 SV_POWF_INTERVAL2 (1, inf, 0, inf, 50000)
337 /* x~1 or y~1.  */
338 SV_POWF_INTERVAL2 (0x1p-1, 0x1p1, 0x1p-10, 0x1p10, 10000)
339 SV_POWF_INTERVAL2 (0x1.ep-1, 0x1.1p0, 0x1p8, 0x1p16, 10000)
340 SV_POWF_INTERVAL2 (0x1p-500, 0x1p500, 0x1p-1, 0x1p1, 10000)
341 /* around estimated argmaxs of ULP error.  */
342 SV_POWF_INTERVAL2 (0x1p-300, 0x1p-200, 0x1p-20, 0x1p-10, 10000)
343 SV_POWF_INTERVAL2 (0x1p50, 0x1p100, 0x1p-20, 0x1p-10, 10000)
344 /* x is negative, y is odd or even integer, or y is real not integer.  */
345 PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), -0.0, -10.0, 3.0, 3.0, 10000)
346 PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), -0.0, -10.0, 4.0, 4.0, 10000)
347 PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), -0.0, -10.0, 0.0, 10.0, 10000)
348 PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), 0.0, 10.0, -0.0, -10.0, 10000)
349 /* |x| is inf, y is odd or even integer, or y is real not integer.  */
350 SV_POWF_INTERVAL2 (inf, inf, 0.5, 0.5, 1)
351 SV_POWF_INTERVAL2 (inf, inf, 1.0, 1.0, 1)
352 SV_POWF_INTERVAL2 (inf, inf, 2.0, 2.0, 1)
353 SV_POWF_INTERVAL2 (inf, inf, 3.0, 3.0, 1)
354 /* 0.0^y.  */
355 SV_POWF_INTERVAL2 (0.0, 0.0, 0.0, 0x1p120, 1000)
356 /* 1.0^y.  */
357 PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), 1.0, 1.0, 0.0, 0x1p-50, 1000)
358 PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), 1.0, 1.0, 0x1p-50, 1.0, 1000)
359 PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), 1.0, 1.0, 1.0, 0x1p100, 1000)
360 PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), 1.0, 1.0, -1.0, -0x1p120, 1000)
361