1 /* 2 * Single-precision scalar tan(x) function. 3 * 4 * Copyright (c) 2021-2023, Arm Limited. 5 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception 6 */ 7 #include "math_config.h" 8 #include "pl_sig.h" 9 #include "pl_test.h" 10 #include "pairwise_hornerf.h" 11 12 /* Useful constants. */ 13 #define NegPio2_1 (-0x1.921fb6p+0f) 14 #define NegPio2_2 (0x1.777a5cp-25f) 15 #define NegPio2_3 (0x1.ee59dap-50f) 16 /* Reduced from 0x1p20 to 0x1p17 to ensure 3.5ulps. */ 17 #define RangeVal (0x1p17f) 18 #define InvPio2 ((0x1.45f306p-1f)) 19 #define Shift (0x1.8p+23f) 20 #define AbsMask (0x7fffffff) 21 #define Pio4 (0x1.921fb6p-1) 22 /* 2PI * 2^-64. */ 23 #define Pio2p63 (0x1.921FB54442D18p-62) 24 25 #define P(i) __tanf_poly_data.poly_tan[i] 26 #define Q(i) __tanf_poly_data.poly_cotan[i] 27 28 static inline float 29 eval_P (float z) 30 { 31 return PAIRWISE_HORNER_5 (z, z * z, P); 32 } 33 34 static inline float 35 eval_Q (float z) 36 { 37 return PAIRWISE_HORNER_3 (z, z * z, Q); 38 } 39 40 /* Reduction of the input argument x using Cody-Waite approach, such that x = r 41 + n * pi/2 with r lives in [-pi/4, pi/4] and n is a signed integer. */ 42 static inline float 43 reduce (float x, int32_t *in) 44 { 45 /* n = rint(x/(pi/2)). */ 46 float r = x; 47 float q = fmaf (InvPio2, r, Shift); 48 float n = q - Shift; 49 /* There is no rounding here, n is representable by a signed integer. */ 50 *in = (int32_t) n; 51 /* r = x - n * (pi/2) (range reduction into -pi/4 .. pi/4). */ 52 r = fmaf (NegPio2_1, n, r); 53 r = fmaf (NegPio2_2, n, r); 54 r = fmaf (NegPio2_3, n, r); 55 return r; 56 } 57 58 /* Table with 4/PI to 192 bit precision. To avoid unaligned accesses 59 only 8 new bits are added per entry, making the table 4 times larger. */ 60 static const uint32_t __inv_pio4[24] 61 = {0x000000a2, 0x0000a2f9, 0x00a2f983, 0xa2f9836e, 0xf9836e4e, 0x836e4e44, 62 0x6e4e4415, 0x4e441529, 0x441529fc, 0x1529fc27, 0x29fc2757, 0xfc2757d1, 63 0x2757d1f5, 0x57d1f534, 0xd1f534dd, 0xf534ddc0, 0x34ddc0db, 0xddc0db62, 64 0xc0db6295, 0xdb629599, 0x6295993c, 0x95993c43, 0x993c4390, 0x3c439041}; 65 66 /* Reduce the range of XI to a multiple of PI/2 using fast integer arithmetic. 67 XI is a reinterpreted float and must be >= 2.0f (the sign bit is ignored). 68 Return the modulo between -PI/4 and PI/4 and store the quadrant in NP. 69 Reduction uses a table of 4/PI with 192 bits of precision. A 32x96->128 bit 70 multiply computes the exact 2.62-bit fixed-point modulo. Since the result 71 can have at most 29 leading zeros after the binary point, the double 72 precision result is accurate to 33 bits. */ 73 static inline double 74 reduce_large (uint32_t xi, int *np) 75 { 76 const uint32_t *arr = &__inv_pio4[(xi >> 26) & 15]; 77 int shift = (xi >> 23) & 7; 78 uint64_t n, res0, res1, res2; 79 80 xi = (xi & 0xffffff) | 0x800000; 81 xi <<= shift; 82 83 res0 = xi * arr[0]; 84 res1 = (uint64_t) xi * arr[4]; 85 res2 = (uint64_t) xi * arr[8]; 86 res0 = (res2 >> 32) | (res0 << 32); 87 res0 += res1; 88 89 n = (res0 + (1ULL << 61)) >> 62; 90 res0 -= n << 62; 91 double x = (int64_t) res0; 92 *np = n; 93 return x * Pio2p63; 94 } 95 96 /* Top 12 bits of the float representation with the sign bit cleared. */ 97 static inline uint32_t 98 top12 (float x) 99 { 100 return (asuint (x) >> 20); 101 } 102 103 /* Fast single-precision tan implementation. 104 Maximum ULP error: 3.293ulps. 105 tanf(0x1.c849eap+16) got -0x1.fe8d98p-1 want -0x1.fe8d9ep-1. */ 106 float 107 tanf (float x) 108 { 109 /* Get top words. */ 110 uint32_t ix = asuint (x); 111 uint32_t ia = ix & AbsMask; 112 uint32_t ia12 = ia >> 20; 113 114 /* Dispatch between no reduction (small numbers), fast reduction and 115 slow large numbers reduction. The reduction step determines r float 116 (|r| < pi/4) and n signed integer such that x = r + n * pi/2. */ 117 int32_t n; 118 float r; 119 if (ia12 < top12 (Pio4)) 120 { 121 /* Optimize small values. */ 122 if (unlikely (ia12 < top12 (0x1p-12f))) 123 { 124 if (unlikely (ia12 < top12 (0x1p-126f))) 125 /* Force underflow for tiny x. */ 126 force_eval_float (x * x); 127 return x; 128 } 129 130 /* tan (x) ~= x + x^3 * P(x^2). */ 131 float x2 = x * x; 132 float y = eval_P (x2); 133 return fmaf (x2, x * y, x); 134 } 135 /* Similar to other trigonometric routines, fast inaccurate reduction is 136 performed for values of x from pi/4 up to RangeVal. In order to keep errors 137 below 3.5ulps, we set the value of RangeVal to 2^17. This might differ for 138 other trigonometric routines. Above this value more advanced but slower 139 reduction techniques need to be implemented to reach a similar accuracy. 140 */ 141 else if (ia12 < top12 (RangeVal)) 142 { 143 /* Fast inaccurate reduction. */ 144 r = reduce (x, &n); 145 } 146 else if (ia12 < 0x7f8) 147 { 148 /* Slow accurate reduction. */ 149 uint32_t sign = ix & ~AbsMask; 150 double dar = reduce_large (ia, &n); 151 float ar = (float) dar; 152 r = asfloat (asuint (ar) ^ sign); 153 } 154 else 155 { 156 /* tan(Inf or NaN) is NaN. */ 157 return __math_invalidf (x); 158 } 159 160 /* If x lives in an interval where |tan(x)| 161 - is finite then use an approximation of tangent in the form 162 tan(r) ~ r + r^3 * P(r^2) = r + r * r^2 * P(r^2). 163 - grows to infinity then use an approximation of cotangent in the form 164 cotan(z) ~ 1/z + z * Q(z^2), where the reciprocal can be computed early. 165 Using symmetries of tangent and the identity tan(r) = cotan(pi/2 - r), 166 we only need to change the sign of r to obtain tan(x) from cotan(r). 167 This 2-interval approach requires 2 different sets of coefficients P and 168 Q, where Q is a lower order polynomial than P. */ 169 170 /* Determine if x lives in an interval where |tan(x)| grows to infinity. */ 171 uint32_t alt = (uint32_t) n & 1; 172 173 /* Perform additional reduction if required. */ 174 float z = alt ? -r : r; 175 176 /* Prepare backward transformation. */ 177 float z2 = r * r; 178 float offset = alt ? 1.0f / z : z; 179 float scale = alt ? z : z * z2; 180 181 /* Evaluate polynomial approximation of tan or cotan. */ 182 float p = alt ? eval_Q (z2) : eval_P (z2); 183 184 /* A unified way of assembling the result on both interval types. */ 185 return fmaf (scale, p, offset); 186 } 187 188 PL_SIG (S, F, 1, tan, -3.1, 3.1) 189 PL_TEST_ULP (tanf, 2.80) 190 PL_TEST_INTERVAL (tanf, 0, 0xffff0000, 10000) 191 PL_TEST_INTERVAL (tanf, 0x1p-127, 0x1p-14, 50000) 192 PL_TEST_INTERVAL (tanf, -0x1p-127, -0x1p-14, 50000) 193 PL_TEST_INTERVAL (tanf, 0x1p-14, 0.7, 50000) 194 PL_TEST_INTERVAL (tanf, -0x1p-14, -0.7, 50000) 195 PL_TEST_INTERVAL (tanf, 0.7, 1.5, 50000) 196 PL_TEST_INTERVAL (tanf, -0.7, -1.5, 50000) 197 PL_TEST_INTERVAL (tanf, 1.5, 0x1p17, 50000) 198 PL_TEST_INTERVAL (tanf, -1.5, -0x1p17, 50000) 199 PL_TEST_INTERVAL (tanf, 0x1p17, 0x1p54, 50000) 200 PL_TEST_INTERVAL (tanf, -0x1p17, -0x1p54, 50000) 201 PL_TEST_INTERVAL (tanf, 0x1p54, inf, 50000) 202 PL_TEST_INTERVAL (tanf, -0x1p54, -inf, 50000) 203