1 /*
2 * Single-precision SVE powf function.
3 *
4 * Copyright (c) 2023, Arm Limited.
5 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
6 */
7
8 #include "sv_math.h"
9 #include "pl_sig.h"
10 #include "pl_test.h"
11
12 /* The following data is used in the SVE pow core computation
13 and special case detection. */
14 #define Tinvc __v_powf_data.invc
15 #define Tlogc __v_powf_data.logc
16 #define Texp __v_powf_data.scale
17 #define SignBias (1 << (V_POWF_EXP2_TABLE_BITS + 11))
18 #define Shift 0x1.8p52
19 #define Norm 0x1p23f /* 0x4b000000. */
20
21 /* Overall ULP error bound for pow is 2.6 ulp
22 ~ 0.5 + 2^24 (128*Ln2*relerr_log2 + relerr_exp2). */
23 static const struct data
24 {
25 double log_poly[4];
26 double exp_poly[3];
27 float uflow_bound, oflow_bound, small_bound;
28 uint32_t sign_bias, sign_mask, subnormal_bias, off;
29 } data = {
30 /* rel err: 1.5 * 2^-30. Each coefficients is multiplied the value of
31 V_POWF_EXP2_N. */
32 .log_poly = { -0x1.6ff5daa3b3d7cp+3, 0x1.ec81d03c01aebp+3,
33 -0x1.71547bb43f101p+4, 0x1.7154764a815cbp+5 },
34 /* rel err: 1.69 * 2^-34. */
35 .exp_poly = {
36 0x1.c6af84b912394p-20, /* A0 / V_POWF_EXP2_N^3. */
37 0x1.ebfce50fac4f3p-13, /* A1 / V_POWF_EXP2_N^2. */
38 0x1.62e42ff0c52d6p-6, /* A3 / V_POWF_EXP2_N. */
39 },
40 .uflow_bound = -0x1.2cp+12f, /* -150.0 * V_POWF_EXP2_N. */
41 .oflow_bound = 0x1p+12f, /* 128.0 * V_POWF_EXP2_N. */
42 .small_bound = 0x1p-126f,
43 .off = 0x3f35d000,
44 .sign_bias = SignBias,
45 .sign_mask = 0x80000000,
46 .subnormal_bias = 0x0b800000, /* 23 << 23. */
47 };
48
49 #define A(i) sv_f64 (d->log_poly[i])
50 #define C(i) sv_f64 (d->exp_poly[i])
51
52 /* Check if x is an integer. */
53 static inline svbool_t
svisint(svbool_t pg,svfloat32_t x)54 svisint (svbool_t pg, svfloat32_t x)
55 {
56 return svcmpeq (pg, svrintz_z (pg, x), x);
57 }
58
59 /* Check if x is real not integer valued. */
60 static inline svbool_t
svisnotint(svbool_t pg,svfloat32_t x)61 svisnotint (svbool_t pg, svfloat32_t x)
62 {
63 return svcmpne (pg, svrintz_z (pg, x), x);
64 }
65
66 /* Check if x is an odd integer. */
67 static inline svbool_t
svisodd(svbool_t pg,svfloat32_t x)68 svisodd (svbool_t pg, svfloat32_t x)
69 {
70 svfloat32_t y = svmul_x (pg, x, 0.5f);
71 return svisnotint (pg, y);
72 }
73
74 /* Check if zero, inf or nan. */
75 static inline svbool_t
sv_zeroinfnan(svbool_t pg,svuint32_t i)76 sv_zeroinfnan (svbool_t pg, svuint32_t i)
77 {
78 return svcmpge (pg, svsub_x (pg, svmul_x (pg, i, 2u), 1),
79 2u * 0x7f800000 - 1);
80 }
81
82 /* Returns 0 if not int, 1 if odd int, 2 if even int. The argument is
83 the bit representation of a non-zero finite floating-point value. */
84 static inline int
checkint(uint32_t iy)85 checkint (uint32_t iy)
86 {
87 int e = iy >> 23 & 0xff;
88 if (e < 0x7f)
89 return 0;
90 if (e > 0x7f + 23)
91 return 2;
92 if (iy & ((1 << (0x7f + 23 - e)) - 1))
93 return 0;
94 if (iy & (1 << (0x7f + 23 - e)))
95 return 1;
96 return 2;
97 }
98
99 /* Check if zero, inf or nan. */
100 static inline int
zeroinfnan(uint32_t ix)101 zeroinfnan (uint32_t ix)
102 {
103 return 2 * ix - 1 >= 2u * 0x7f800000 - 1;
104 }
105
106 /* A scalar subroutine used to fix main power special cases. Similar to the
107 preamble of finite_powf except that we do not update ix and sign_bias. This
108 is done in the preamble of the SVE powf. */
109 static inline float
powf_specialcase(float x,float y,float z)110 powf_specialcase (float x, float y, float z)
111 {
112 uint32_t ix = asuint (x);
113 uint32_t iy = asuint (y);
114 /* Either (x < 0x1p-126 or inf or nan) or (y is 0 or inf or nan). */
115 if (unlikely (zeroinfnan (iy)))
116 {
117 if (2 * iy == 0)
118 return issignalingf_inline (x) ? x + y : 1.0f;
119 if (ix == 0x3f800000)
120 return issignalingf_inline (y) ? x + y : 1.0f;
121 if (2 * ix > 2u * 0x7f800000 || 2 * iy > 2u * 0x7f800000)
122 return x + y;
123 if (2 * ix == 2 * 0x3f800000)
124 return 1.0f;
125 if ((2 * ix < 2 * 0x3f800000) == !(iy & 0x80000000))
126 return 0.0f; /* |x|<1 && y==inf or |x|>1 && y==-inf. */
127 return y * y;
128 }
129 if (unlikely (zeroinfnan (ix)))
130 {
131 float_t x2 = x * x;
132 if (ix & 0x80000000 && checkint (iy) == 1)
133 x2 = -x2;
134 return iy & 0x80000000 ? 1 / x2 : x2;
135 }
136 /* We need a return here in case x<0 and y is integer, but all other tests
137 need to be run. */
138 return z;
139 }
140
141 /* Scalar fallback for special case routines with custom signature. */
142 static inline svfloat32_t
sv_call_powf_sc(svfloat32_t x1,svfloat32_t x2,svfloat32_t y,svbool_t cmp)143 sv_call_powf_sc (svfloat32_t x1, svfloat32_t x2, svfloat32_t y, svbool_t cmp)
144 {
145 svbool_t p = svpfirst (cmp, svpfalse ());
146 while (svptest_any (cmp, p))
147 {
148 float sx1 = svclastb (p, 0, x1);
149 float sx2 = svclastb (p, 0, x2);
150 float elem = svclastb (p, 0, y);
151 elem = powf_specialcase (sx1, sx2, elem);
152 svfloat32_t y2 = sv_f32 (elem);
153 y = svsel (p, y2, y);
154 p = svpnext_b32 (cmp, p);
155 }
156 return y;
157 }
158
159 /* Compute core for half of the lanes in double precision. */
160 static inline svfloat64_t
sv_powf_core_ext(const svbool_t pg,svuint64_t i,svfloat64_t z,svint64_t k,svfloat64_t y,svuint64_t sign_bias,svfloat64_t * pylogx,const struct data * d)161 sv_powf_core_ext (const svbool_t pg, svuint64_t i, svfloat64_t z, svint64_t k,
162 svfloat64_t y, svuint64_t sign_bias, svfloat64_t *pylogx,
163 const struct data *d)
164 {
165 svfloat64_t invc = svld1_gather_index (pg, Tinvc, i);
166 svfloat64_t logc = svld1_gather_index (pg, Tlogc, i);
167
168 /* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k. */
169 svfloat64_t r = svmla_x (pg, sv_f64 (-1.0), z, invc);
170 svfloat64_t y0 = svadd_x (pg, logc, svcvt_f64_x (pg, k));
171
172 /* Polynomial to approximate log1p(r)/ln2. */
173 svfloat64_t logx = A (0);
174 logx = svmla_x (pg, A (1), r, logx);
175 logx = svmla_x (pg, A (2), r, logx);
176 logx = svmla_x (pg, A (3), r, logx);
177 logx = svmla_x (pg, y0, r, logx);
178 *pylogx = svmul_x (pg, y, logx);
179
180 /* z - kd is in [-1, 1] in non-nearest rounding modes. */
181 svfloat64_t kd = svadd_x (pg, *pylogx, Shift);
182 svuint64_t ki = svreinterpret_u64 (kd);
183 kd = svsub_x (pg, kd, Shift);
184
185 r = svsub_x (pg, *pylogx, kd);
186
187 /* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1). */
188 svuint64_t t
189 = svld1_gather_index (pg, Texp, svand_x (pg, ki, V_POWF_EXP2_N - 1));
190 svuint64_t ski = svadd_x (pg, ki, sign_bias);
191 t = svadd_x (pg, t, svlsl_x (pg, ski, 52 - V_POWF_EXP2_TABLE_BITS));
192 svfloat64_t s = svreinterpret_f64 (t);
193
194 svfloat64_t p = C (0);
195 p = svmla_x (pg, C (1), p, r);
196 p = svmla_x (pg, C (2), p, r);
197 p = svmla_x (pg, s, p, svmul_x (pg, s, r));
198
199 return p;
200 }
201
202 /* Widen vector to double precision and compute core on both halves of the
203 vector. Lower cost of promotion by considering all lanes active. */
204 static inline svfloat32_t
sv_powf_core(const svbool_t pg,svuint32_t i,svuint32_t iz,svint32_t k,svfloat32_t y,svuint32_t sign_bias,svfloat32_t * pylogx,const struct data * d)205 sv_powf_core (const svbool_t pg, svuint32_t i, svuint32_t iz, svint32_t k,
206 svfloat32_t y, svuint32_t sign_bias, svfloat32_t *pylogx,
207 const struct data *d)
208 {
209 const svbool_t ptrue = svptrue_b64 ();
210
211 /* Unpack and promote input vectors (pg, y, z, i, k and sign_bias) into two in
212 order to perform core computation in double precision. */
213 const svbool_t pg_lo = svunpklo (pg);
214 const svbool_t pg_hi = svunpkhi (pg);
215 svfloat64_t y_lo = svcvt_f64_x (
216 ptrue, svreinterpret_f32 (svunpklo (svreinterpret_u32 (y))));
217 svfloat64_t y_hi = svcvt_f64_x (
218 ptrue, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (y))));
219 svfloat32_t z = svreinterpret_f32 (iz);
220 svfloat64_t z_lo = svcvt_f64_x (
221 ptrue, svreinterpret_f32 (svunpklo (svreinterpret_u32 (z))));
222 svfloat64_t z_hi = svcvt_f64_x (
223 ptrue, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (z))));
224 svuint64_t i_lo = svunpklo (i);
225 svuint64_t i_hi = svunpkhi (i);
226 svint64_t k_lo = svunpklo (k);
227 svint64_t k_hi = svunpkhi (k);
228 svuint64_t sign_bias_lo = svunpklo (sign_bias);
229 svuint64_t sign_bias_hi = svunpkhi (sign_bias);
230
231 /* Compute each part in double precision. */
232 svfloat64_t ylogx_lo, ylogx_hi;
233 svfloat64_t lo = sv_powf_core_ext (pg_lo, i_lo, z_lo, k_lo, y_lo,
234 sign_bias_lo, &ylogx_lo, d);
235 svfloat64_t hi = sv_powf_core_ext (pg_hi, i_hi, z_hi, k_hi, y_hi,
236 sign_bias_hi, &ylogx_hi, d);
237
238 /* Convert back to single-precision and interleave. */
239 svfloat32_t ylogx_lo_32 = svcvt_f32_x (ptrue, ylogx_lo);
240 svfloat32_t ylogx_hi_32 = svcvt_f32_x (ptrue, ylogx_hi);
241 *pylogx = svuzp1 (ylogx_lo_32, ylogx_hi_32);
242 svfloat32_t lo_32 = svcvt_f32_x (ptrue, lo);
243 svfloat32_t hi_32 = svcvt_f32_x (ptrue, hi);
244 return svuzp1 (lo_32, hi_32);
245 }
246
247 /* Implementation of SVE powf.
248 Provides the same accuracy as AdvSIMD powf, since it relies on the same
249 algorithm. The theoretical maximum error is under 2.60 ULPs.
250 Maximum measured error is 2.56 ULPs:
251 SV_NAME_F2 (pow) (0x1.004118p+0, 0x1.5d14a4p+16) got 0x1.fd4bp+127
252 want 0x1.fd4b06p+127. */
SV_NAME_F2(pow)253 svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg)
254 {
255 const struct data *d = ptr_barrier (&data);
256
257 svuint32_t vix0 = svreinterpret_u32 (x);
258 svuint32_t viy0 = svreinterpret_u32 (y);
259
260 /* Negative x cases. */
261 svuint32_t sign_bit = svand_m (pg, vix0, d->sign_mask);
262 svbool_t xisneg = svcmpeq (pg, sign_bit, d->sign_mask);
263
264 /* Set sign_bias and ix depending on sign of x and nature of y. */
265 svbool_t yisnotint_xisneg = svpfalse_b ();
266 svuint32_t sign_bias = sv_u32 (0);
267 svuint32_t vix = vix0;
268 if (unlikely (svptest_any (pg, xisneg)))
269 {
270 /* Determine nature of y. */
271 yisnotint_xisneg = svisnotint (xisneg, y);
272 svbool_t yisint_xisneg = svisint (xisneg, y);
273 svbool_t yisodd_xisneg = svisodd (xisneg, y);
274 /* ix set to abs(ix) if y is integer. */
275 vix = svand_m (yisint_xisneg, vix0, 0x7fffffff);
276 /* Set to SignBias if x is negative and y is odd. */
277 sign_bias = svsel (yisodd_xisneg, sv_u32 (d->sign_bias), sv_u32 (0));
278 }
279
280 /* Special cases of x or y: zero, inf and nan. */
281 svbool_t xspecial = sv_zeroinfnan (pg, vix0);
282 svbool_t yspecial = sv_zeroinfnan (pg, viy0);
283 svbool_t cmp = svorr_z (pg, xspecial, yspecial);
284
285 /* Small cases of x: |x| < 0x1p-126. */
286 svbool_t xsmall = svaclt (pg, x, d->small_bound);
287 if (unlikely (svptest_any (pg, xsmall)))
288 {
289 /* Normalize subnormal x so exponent becomes negative. */
290 svuint32_t vix_norm = svreinterpret_u32 (svmul_x (xsmall, x, Norm));
291 vix_norm = svand_x (xsmall, vix_norm, 0x7fffffff);
292 vix_norm = svsub_x (xsmall, vix_norm, d->subnormal_bias);
293 vix = svsel (xsmall, vix_norm, vix);
294 }
295 /* Part of core computation carried in working precision. */
296 svuint32_t tmp = svsub_x (pg, vix, d->off);
297 svuint32_t i = svand_x (pg, svlsr_x (pg, tmp, (23 - V_POWF_LOG2_TABLE_BITS)),
298 V_POWF_LOG2_N - 1);
299 svuint32_t top = svand_x (pg, tmp, 0xff800000);
300 svuint32_t iz = svsub_x (pg, vix, top);
301 svint32_t k
302 = svasr_x (pg, svreinterpret_s32 (top), (23 - V_POWF_EXP2_TABLE_BITS));
303
304 /* Compute core in extended precision and return intermediate ylogx results to
305 handle cases of underflow and underflow in exp. */
306 svfloat32_t ylogx;
307 svfloat32_t ret = sv_powf_core (pg, i, iz, k, y, sign_bias, &ylogx, d);
308
309 /* Handle exp special cases of underflow and overflow. */
310 svuint32_t sign = svlsl_x (pg, sign_bias, 20 - V_POWF_EXP2_TABLE_BITS);
311 svfloat32_t ret_oflow
312 = svreinterpret_f32 (svorr_x (pg, sign, asuint (INFINITY)));
313 svfloat32_t ret_uflow = svreinterpret_f32 (sign);
314 ret = svsel (svcmple (pg, ylogx, d->uflow_bound), ret_uflow, ret);
315 ret = svsel (svcmpgt (pg, ylogx, d->oflow_bound), ret_oflow, ret);
316
317 /* Cases of finite y and finite negative x. */
318 ret = svsel (yisnotint_xisneg, sv_f32 (__builtin_nanf ("")), ret);
319
320 if (unlikely (svptest_any (pg, cmp)))
321 return sv_call_powf_sc (x, y, ret, cmp);
322
323 return ret;
324 }
325
326 PL_SIG (SV, F, 2, pow)
327 PL_TEST_ULP (SV_NAME_F2 (pow), 2.06)
328 /* Wide intervals spanning the whole domain but shared between x and y. */
329 #define SV_POWF_INTERVAL2(xlo, xhi, ylo, yhi, n) \
330 PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), xlo, xhi, ylo, yhi, n) \
331 PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), xlo, xhi, -ylo, -yhi, n) \
332 PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), -xlo, -xhi, ylo, yhi, n) \
333 PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), -xlo, -xhi, -ylo, -yhi, n)
334 SV_POWF_INTERVAL2 (0, 0x1p-126, 0, inf, 40000)
335 SV_POWF_INTERVAL2 (0x1p-126, 1, 0, inf, 50000)
336 SV_POWF_INTERVAL2 (1, inf, 0, inf, 50000)
337 /* x~1 or y~1. */
338 SV_POWF_INTERVAL2 (0x1p-1, 0x1p1, 0x1p-10, 0x1p10, 10000)
339 SV_POWF_INTERVAL2 (0x1.ep-1, 0x1.1p0, 0x1p8, 0x1p16, 10000)
340 SV_POWF_INTERVAL2 (0x1p-500, 0x1p500, 0x1p-1, 0x1p1, 10000)
341 /* around estimated argmaxs of ULP error. */
342 SV_POWF_INTERVAL2 (0x1p-300, 0x1p-200, 0x1p-20, 0x1p-10, 10000)
343 SV_POWF_INTERVAL2 (0x1p50, 0x1p100, 0x1p-20, 0x1p-10, 10000)
344 /* x is negative, y is odd or even integer, or y is real not integer. */
345 PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), -0.0, -10.0, 3.0, 3.0, 10000)
346 PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), -0.0, -10.0, 4.0, 4.0, 10000)
347 PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), -0.0, -10.0, 0.0, 10.0, 10000)
348 PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), 0.0, 10.0, -0.0, -10.0, 10000)
349 /* |x| is inf, y is odd or even integer, or y is real not integer. */
350 SV_POWF_INTERVAL2 (inf, inf, 0.5, 0.5, 1)
351 SV_POWF_INTERVAL2 (inf, inf, 1.0, 1.0, 1)
352 SV_POWF_INTERVAL2 (inf, inf, 2.0, 2.0, 1)
353 SV_POWF_INTERVAL2 (inf, inf, 3.0, 3.0, 1)
354 /* 0.0^y. */
355 SV_POWF_INTERVAL2 (0.0, 0.0, 0.0, 0x1p120, 1000)
356 /* 1.0^y. */
357 PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), 1.0, 1.0, 0.0, 0x1p-50, 1000)
358 PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), 1.0, 1.0, 0x1p-50, 1.0, 1000)
359 PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), 1.0, 1.0, 1.0, 0x1p100, 1000)
360 PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), 1.0, 1.0, -1.0, -0x1p120, 1000)
361