/*
 * Single-precision SVE powf function.
 *
 * Copyright (c) 2023, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */

#include "sv_math.h"
#include "pl_sig.h"
#include "pl_test.h"

/* The following data is used in the SVE pow core computation
   and special case detection.  */
#define Tinvc __v_powf_data.invc
#define Tlogc __v_powf_data.logc
#define Texp __v_powf_data.scale
#define SignBias (1 << (V_POWF_EXP2_TABLE_BITS + 11))
#define Shift 0x1.8p52
#define Norm 0x1p23f /* 0x4b000000.  */

/* Overall ULP error bound for pow is 2.6 ulp
   ~ 0.5 + 2^24 (128*Ln2*relerr_log2 + relerr_exp2).  */
static const struct data
{
  double log_poly[4];
  double exp_poly[3];
  float uflow_bound, oflow_bound, small_bound;
  uint32_t sign_bias, sign_mask, subnormal_bias, off;
} data = {
  /* rel err: 1.5 * 2^-30. Each coefficient is multiplied by the value of
     V_POWF_EXP2_N.  */
  .log_poly = { -0x1.6ff5daa3b3d7cp+3, 0x1.ec81d03c01aebp+3,
		-0x1.71547bb43f101p+4, 0x1.7154764a815cbp+5 },
  /* rel err: 1.69 * 2^-34.  */
  .exp_poly = {
    0x1.c6af84b912394p-20, /* A0 / V_POWF_EXP2_N^3.  */
    0x1.ebfce50fac4f3p-13, /* A1 / V_POWF_EXP2_N^2.  */
    0x1.62e42ff0c52d6p-6,  /* A3 / V_POWF_EXP2_N.  */
  },
  .uflow_bound = -0x1.2cp+12f, /* -150.0 * V_POWF_EXP2_N.  */
  .oflow_bound = 0x1p+12f,     /* 128.0 * V_POWF_EXP2_N.  */
  .small_bound = 0x1p-126f,
  .off = 0x3f35d000,
  .sign_bias = SignBias,
  .sign_mask = 0x80000000,
  .subnormal_bias = 0x0b800000, /* 23 << 23.  */
};

#define A(i) sv_f64 (d->log_poly[i])
#define C(i) sv_f64 (d->exp_poly[i])

/* Check if x is an integer.  */
static inline svbool_t
svisint (svbool_t pg, svfloat32_t x)
{
  return svcmpeq (pg, svrintz_z (pg, x), x);
}

/* Check if x is real, not integer valued.  */
static inline svbool_t
svisnotint (svbool_t pg, svfloat32_t x)
{
  return svcmpne (pg, svrintz_z (pg, x), x);
}

/* Check if x is an odd integer.  */
static inline svbool_t
svisodd (svbool_t pg, svfloat32_t x)
{
  svfloat32_t y = svmul_x (pg, x, 0.5f);
  return svisnotint (pg, y);
}

/* Check if zero, inf or nan.  */
static inline svbool_t
sv_zeroinfnan (svbool_t pg, svuint32_t i)
{
  return svcmpge (pg, svsub_x (pg, svmul_x (pg, i, 2u), 1),
		  2u * 0x7f800000 - 1);
}

/* Returns 0 if not int, 1 if odd int, 2 if even int.  The argument is the
   bit representation of a non-zero finite floating-point value.  */
static inline int
checkint (uint32_t iy)
{
  int e = iy >> 23 & 0xff;
  if (e < 0x7f)
    return 0;
  if (e > 0x7f + 23)
    return 2;
  if (iy & ((1 << (0x7f + 23 - e)) - 1))
    return 0;
  if (iy & (1 << (0x7f + 23 - e)))
    return 1;
  return 2;
}

/* Check if zero, inf or nan.  */
static inline int
zeroinfnan (uint32_t ix)
{
  return 2 * ix - 1 >= 2u * 0x7f800000 - 1;
}

/* A scalar subroutine used to fix main power special cases.  Similar to the
   preamble of finite_powf except that we do not update ix and sign_bias.
   This is done in the preamble of the SVE powf.  */
static inline float
powf_specialcase (float x, float y, float z)
{
  uint32_t ix = asuint (x);
  uint32_t iy = asuint (y);
  /* Either (x < 0x1p-126 or inf or nan) or (y is 0 or inf or nan).  */
  if (unlikely (zeroinfnan (iy)))
    {
      if (2 * iy == 0)
	return issignalingf_inline (x) ? x + y : 1.0f;
      if (ix == 0x3f800000)
	return issignalingf_inline (y) ? x + y : 1.0f;
      if (2 * ix > 2u * 0x7f800000 || 2 * iy > 2u * 0x7f800000)
	return x + y;
      if (2 * ix == 2 * 0x3f800000)
	return 1.0f;
      if ((2 * ix < 2 * 0x3f800000) == !(iy & 0x80000000))
	return 0.0f; /* |x|<1 && y==inf or |x|>1 && y==-inf.  */
      return y * y;
    }
  if (unlikely (zeroinfnan (ix)))
    {
      float_t x2 = x * x;
      if (ix & 0x80000000 && checkint (iy) == 1)
	x2 = -x2;
      return iy & 0x80000000 ? 1 / x2 : x2;
    }
  /* We need a return here in case x<0 and y is integer, but all other tests
     need to be run.  */
  return z;
}

/* Scalar fallback for special case routines with custom signature.  */
static inline svfloat32_t
sv_call_powf_sc (svfloat32_t x1, svfloat32_t x2, svfloat32_t y, svbool_t cmp)
{
  svbool_t p = svpfirst (cmp, svpfalse ());
  while (svptest_any (cmp, p))
    {
      float sx1 = svclastb (p, 0, x1);
      float sx2 = svclastb (p, 0, x2);
      float elem = svclastb (p, 0, y);
      elem = powf_specialcase (sx1, sx2, elem);
      svfloat32_t y2 = sv_f32 (elem);
      y = svsel (p, y2, y);
      p = svpnext_b32 (cmp, p);
    }
  return y;
}

/* Compute core for half of the lanes in double precision.  */
static inline svfloat64_t
sv_powf_core_ext (const svbool_t pg, svuint64_t i, svfloat64_t z, svint64_t k,
		  svfloat64_t y, svuint64_t sign_bias, svfloat64_t *pylogx,
		  const struct data *d)
{
  svfloat64_t invc = svld1_gather_index (pg, Tinvc, i);
  svfloat64_t logc = svld1_gather_index (pg, Tlogc, i);

  /* log2(x) = log1p(z/c-1)/ln2 + log2(c) + k.  */
  svfloat64_t r = svmla_x (pg, sv_f64 (-1.0), z, invc);
  svfloat64_t y0 = svadd_x (pg, logc, svcvt_f64_x (pg, k));

  /* Polynomial to approximate log1p(r)/ln2.  */
  svfloat64_t logx = A (0);
  logx = svmla_x (pg, A (1), r, logx);
  logx = svmla_x (pg, A (2), r, logx);
  logx = svmla_x (pg, A (3), r, logx);
  logx = svmla_x (pg, y0, r, logx);
  *pylogx = svmul_x (pg, y, logx);

  /* z - kd is in [-1, 1] in non-nearest rounding modes.  */
  svfloat64_t kd = svadd_x (pg, *pylogx, Shift);
  svuint64_t ki = svreinterpret_u64 (kd);
  kd = svsub_x (pg, kd, Shift);

  r = svsub_x (pg, *pylogx, kd);

  /* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1).  */
  svuint64_t t
      = svld1_gather_index (pg, Texp, svand_x (pg, ki, V_POWF_EXP2_N - 1));
  svuint64_t ski = svadd_x (pg, ki, sign_bias);
  t = svadd_x (pg, t, svlsl_x (pg, ski, 52 - V_POWF_EXP2_TABLE_BITS));
  svfloat64_t s = svreinterpret_f64 (t);

  svfloat64_t p = C (0);
  p = svmla_x (pg, C (1), p, r);
  p = svmla_x (pg, C (2), p, r);
  p = svmla_x (pg, s, p, svmul_x (pg, s, r));

  return p;
}

/* Widen vector to double precision and compute core on both halves of the
   vector.  Lower cost of promotion by considering all lanes active.  */
static inline svfloat32_t
sv_powf_core (const svbool_t pg, svuint32_t i, svuint32_t iz, svint32_t k,
	      svfloat32_t y, svuint32_t sign_bias, svfloat32_t *pylogx,
	      const struct data *d)
{
  const svbool_t ptrue = svptrue_b64 ();

  /* Unpack and promote input vectors (pg, y, z, i, k and sign_bias) into two
     in order to perform core computation in double precision.  */
  const svbool_t pg_lo = svunpklo (pg);
  const svbool_t pg_hi = svunpkhi (pg);
  svfloat64_t y_lo = svcvt_f64_x (
      ptrue, svreinterpret_f32 (svunpklo (svreinterpret_u32 (y))));
  svfloat64_t y_hi = svcvt_f64_x (
      ptrue, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (y))));
  svfloat32_t z = svreinterpret_f32 (iz);
  svfloat64_t z_lo = svcvt_f64_x (
      ptrue, svreinterpret_f32 (svunpklo (svreinterpret_u32 (z))));
  svfloat64_t z_hi = svcvt_f64_x (
      ptrue, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (z))));
  svuint64_t i_lo = svunpklo (i);
  svuint64_t i_hi = svunpkhi (i);
  svint64_t k_lo = svunpklo (k);
  svint64_t k_hi = svunpkhi (k);
  svuint64_t sign_bias_lo = svunpklo (sign_bias);
  svuint64_t sign_bias_hi = svunpkhi (sign_bias);

  /* Compute each part in double precision.  */
  svfloat64_t ylogx_lo, ylogx_hi;
  svfloat64_t lo = sv_powf_core_ext (pg_lo, i_lo, z_lo, k_lo, y_lo,
				     sign_bias_lo, &ylogx_lo, d);
  svfloat64_t hi = sv_powf_core_ext (pg_hi, i_hi, z_hi, k_hi, y_hi,
				     sign_bias_hi, &ylogx_hi, d);

  /* Convert back to single-precision and interleave.  */
  svfloat32_t ylogx_lo_32 = svcvt_f32_x (ptrue, ylogx_lo);
  svfloat32_t ylogx_hi_32 = svcvt_f32_x (ptrue, ylogx_hi);
  *pylogx = svuzp1 (ylogx_lo_32, ylogx_hi_32);
  svfloat32_t lo_32 = svcvt_f32_x (ptrue, lo);
  svfloat32_t hi_32 = svcvt_f32_x (ptrue, hi);
  return svuzp1 (lo_32, hi_32);
}

/* Implementation of SVE powf.
   Provides the same accuracy as AdvSIMD powf, since it relies on the same
   algorithm.  The theoretical maximum error is under 2.60 ULPs.
   Maximum measured error is 2.56 ULPs:
   SV_NAME_F2 (pow) (0x1.004118p+0, 0x1.5d14a4p+16) got 0x1.fd4bp+127
						    want 0x1.fd4b06p+127.  */
svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg)
{
  const struct data *d = ptr_barrier (&data);

  svuint32_t vix0 = svreinterpret_u32 (x);
  svuint32_t viy0 = svreinterpret_u32 (y);

  /* Negative x cases.  */
  svuint32_t sign_bit = svand_m (pg, vix0, d->sign_mask);
  svbool_t xisneg = svcmpeq (pg, sign_bit, d->sign_mask);

  /* Set sign_bias and ix depending on sign of x and nature of y.  */
  svbool_t yisnotint_xisneg = svpfalse_b ();
  svuint32_t sign_bias = sv_u32 (0);
  svuint32_t vix = vix0;
  if (unlikely (svptest_any (pg, xisneg)))
    {
      /* Determine nature of y.  */
      yisnotint_xisneg = svisnotint (xisneg, y);
      svbool_t yisint_xisneg = svisint (xisneg, y);
      svbool_t yisodd_xisneg = svisodd (xisneg, y);

      /* ix set to abs(ix) if y is integer.  */
      vix = svand_m (yisint_xisneg, vix0, 0x7fffffff);
      /* Set to SignBias if x is negative and y is odd.  */
      sign_bias = svsel (yisodd_xisneg, sv_u32 (d->sign_bias), sv_u32 (0));
    }

  /* Special cases of x or y: zero, inf and nan.  */
  svbool_t xspecial = sv_zeroinfnan (pg, vix0);
  svbool_t yspecial = sv_zeroinfnan (pg, viy0);
  svbool_t cmp = svorr_z (pg, xspecial, yspecial);

  /* Small cases of x: |x| < 0x1p-126.  */
  svbool_t xsmall = svaclt (pg, x, d->small_bound);
  if (unlikely (svptest_any (pg, xsmall)))
    {
      /* Normalize subnormal x so exponent becomes negative.  */
      svuint32_t vix_norm = svreinterpret_u32 (svmul_x (xsmall, x, Norm));
      vix_norm = svand_x (xsmall, vix_norm, 0x7fffffff);
      vix_norm = svsub_x (xsmall, vix_norm, d->subnormal_bias);
      vix = svsel (xsmall, vix_norm, vix);
    }

  /* Part of core computation carried in working precision.  */
  svuint32_t tmp = svsub_x (pg, vix, d->off);
  svuint32_t i = svand_x (pg, svlsr_x (pg, tmp, (23 - V_POWF_LOG2_TABLE_BITS)),
			  V_POWF_LOG2_N - 1);
  svuint32_t top = svand_x (pg, tmp, 0xff800000);
  svuint32_t iz = svsub_x (pg, vix, top);
  svint32_t k
      = svasr_x (pg, svreinterpret_s32 (top), (23 - V_POWF_EXP2_TABLE_BITS));

  /* Compute core in extended precision and return intermediate ylogx results
     to handle cases of underflow and overflow in exp.  */
  svfloat32_t ylogx;
  svfloat32_t ret = sv_powf_core (pg, i, iz, k, y, sign_bias, &ylogx, d);

  /* Handle exp special cases of underflow and overflow.  */
  svuint32_t sign = svlsl_x (pg, sign_bias, 20 - V_POWF_EXP2_TABLE_BITS);
  svfloat32_t ret_oflow
      = svreinterpret_f32 (svorr_x (pg, sign, asuint (INFINITY)));
  svfloat32_t ret_uflow = svreinterpret_f32 (sign);
  ret = svsel (svcmple (pg, ylogx, d->uflow_bound), ret_uflow, ret);
  ret = svsel (svcmpgt (pg, ylogx, d->oflow_bound), ret_oflow, ret);

  /* Cases of finite y and finite negative x.  */
  ret = svsel (yisnotint_xisneg, sv_f32 (__builtin_nanf ("")), ret);

  if (unlikely (svptest_any (pg, cmp)))
    return sv_call_powf_sc (x, y, ret, cmp);

  return ret;
}

PL_SIG (SV, F, 2, pow)
PL_TEST_ULP (SV_NAME_F2 (pow), 2.06)
/* Wide intervals spanning the whole domain but shared between x and y.  */
#define SV_POWF_INTERVAL2(xlo, xhi, ylo, yhi, n)                              \
  PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), xlo, xhi, ylo, yhi, n)                 \
  PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), xlo, xhi, -ylo, -yhi, n)               \
  PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), -xlo, -xhi, ylo, yhi, n)               \
  PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), -xlo, -xhi, -ylo, -yhi, n)
SV_POWF_INTERVAL2 (0, 0x1p-126, 0, inf, 40000)
SV_POWF_INTERVAL2 (0x1p-126, 1, 0, inf, 50000)
SV_POWF_INTERVAL2 (1, inf, 0, inf, 50000)
/* x~1 or y~1.  */
SV_POWF_INTERVAL2 (0x1p-1, 0x1p1, 0x1p-10, 0x1p10, 10000)
SV_POWF_INTERVAL2 (0x1.ep-1, 0x1.1p0, 0x1p8, 0x1p16, 10000)
SV_POWF_INTERVAL2 (0x1p-500, 0x1p500, 0x1p-1, 0x1p1, 10000)
/* Around estimated argmaxs of ULP error.  */
SV_POWF_INTERVAL2 (0x1p-300, 0x1p-200, 0x1p-20, 0x1p-10, 10000)
SV_POWF_INTERVAL2 (0x1p50, 0x1p100, 0x1p-20, 0x1p-10, 10000)
/* x is negative, y is odd or even integer, or y is real not integer.  */
PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), -0.0, -10.0, 3.0, 3.0, 10000)
PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), -0.0, -10.0, 4.0, 4.0, 10000)
PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), -0.0, -10.0, 0.0, 10.0, 10000)
PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), 0.0, 10.0, -0.0, -10.0, 10000)
/* |x| is inf, y is odd or even integer, or y is real not integer.  */
SV_POWF_INTERVAL2 (inf, inf, 0.5, 0.5, 1)
SV_POWF_INTERVAL2 (inf, inf, 1.0, 1.0, 1)
SV_POWF_INTERVAL2 (inf, inf, 2.0, 2.0, 1)
SV_POWF_INTERVAL2 (inf, inf, 3.0, 3.0, 1)
/* 0.0^y.  */
SV_POWF_INTERVAL2 (0.0, 0.0, 0.0, 0x1p120, 1000)
/* 1.0^y.  */
PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), 1.0, 1.0, 0.0, 0x1p-50, 1000)
PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), 1.0, 1.0, 0x1p-50, 1.0, 1000)
PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), 1.0, 1.0, 1.0, 0x1p100, 1000)
PL_TEST_INTERVAL2 (SV_NAME_F2 (pow), 1.0, 1.0, -1.0, -0x1p120, 1000)
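
/* Illustrative usage sketch, not part of the routine and kept disabled: one
   way a caller could apply the vector entry point SV_NAME_F2 (pow) across
   arrays, using only standard SVE ACLE intrinsics (svcntw, svwhilelt_b32,
   svld1, svst1).  The helper name powf_array_example is hypothetical and is
   shown purely as an example of driving the predicated interface above.  */
#if 0
static void
powf_array_example (float *restrict out, const float *restrict x,
		    const float *restrict y, int n)
{
  /* Process svcntw () lanes per iteration; the while-lt predicate masks off
     the tail lanes on the final iteration.  */
  for (int i = 0; i < n; i += svcntw ())
    {
      svbool_t pg = svwhilelt_b32 (i, n);
      svfloat32_t vx = svld1 (pg, x + i);
      svfloat32_t vy = svld1 (pg, y + i);
      svst1 (pg, out + i, SV_NAME_F2 (pow) (vx, vy, pg));
    }
}
#endif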