1 /*
2 * Single-precision scalar tan(x) function.
3 *
4 * Copyright (c) 2021-2024, Arm Limited.
5 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
6 */
7 #include "math_config.h"
8 #include "test_sig.h"
9 #include "test_defs.h"
10 #include "poly_scalar_f32.h"
11
12 /* Useful constants. */
13 #define NegPio2_1 (-0x1.921fb6p+0f)
14 #define NegPio2_2 (0x1.777a5cp-25f)
15 #define NegPio2_3 (0x1.ee59dap-50f)
16 /* Reduced from 0x1p20 to 0x1p17 to ensure 3.5ulps. */
17 #define RangeVal (0x1p17f)
18 #define InvPio2 ((0x1.45f306p-1f))
19 #define Shift (0x1.8p+23f)
20 #define AbsMask (0x7fffffff)
21 #define Pio4 (0x1.921fb6p-1)
22 /* 2PI * 2^-64. */
23 #define Pio2p63 (0x1.921FB54442D18p-62)
24
25 static inline float
eval_P(float z)26 eval_P (float z)
27 {
28 return pw_horner_5_f32 (z, z * z, __tanf_poly_data.poly_tan);
29 }
30
31 static inline float
eval_Q(float z)32 eval_Q (float z)
33 {
34 return pairwise_poly_3_f32 (z, z * z, __tanf_poly_data.poly_cotan);
35 }
36
37 /* Reduction of the input argument x using Cody-Waite approach, such that x = r
38 + n * pi/2 with r lives in [-pi/4, pi/4] and n is a signed integer. */
39 static inline float
reduce(float x,int32_t * in)40 reduce (float x, int32_t *in)
41 {
42 /* n = rint(x/(pi/2)). */
43 float r = x;
44 float q = fmaf (InvPio2, r, Shift);
45 float n = q - Shift;
46 /* There is no rounding here, n is representable by a signed integer. */
47 *in = (int32_t) n;
48 /* r = x - n * (pi/2) (range reduction into -pi/4 .. pi/4). */
49 r = fmaf (NegPio2_1, n, r);
50 r = fmaf (NegPio2_2, n, r);
51 r = fmaf (NegPio2_3, n, r);
52 return r;
53 }
54
55 /* Reduce the range of XI to a multiple of PI/2 using fast integer arithmetic.
56 XI is a reinterpreted float and must be >= 2.0f (the sign bit is ignored).
57 Return the modulo between -PI/4 and PI/4 and store the quadrant in NP.
58 Reduction uses a table of 4/PI with 192 bits of precision. A 32x96->128 bit
59 multiply computes the exact 2.62-bit fixed-point modulo. Since the result
60 can have at most 29 leading zeros after the binary point, the double
61 precision result is accurate to 33 bits. */
62 static inline double
reduce_large(uint32_t xi,int * np)63 reduce_large (uint32_t xi, int *np)
64 {
65 const uint32_t *arr = &__inv_pio4[(xi >> 26) & 15];
66 int shift = (xi >> 23) & 7;
67 uint64_t n, res0, res1, res2;
68
69 xi = (xi & 0xffffff) | 0x800000;
70 xi <<= shift;
71
72 res0 = xi * arr[0];
73 res1 = (uint64_t) xi * arr[4];
74 res2 = (uint64_t) xi * arr[8];
75 res0 = (res2 >> 32) | (res0 << 32);
76 res0 += res1;
77
78 n = (res0 + (1ULL << 61)) >> 62;
79 res0 -= n << 62;
80 double x = (int64_t) res0;
81 *np = n;
82 return x * Pio2p63;
83 }
84
85 /* Top 12 bits of the float representation with the sign bit cleared. */
86 static inline uint32_t
top12(float x)87 top12 (float x)
88 {
89 return (asuint (x) >> 20);
90 }
91
92 /* Fast single-precision tan implementation.
93 Maximum ULP error: 3.293ulps.
94 tanf(0x1.c849eap+16) got -0x1.fe8d98p-1 want -0x1.fe8d9ep-1. */
95 float
tanf(float x)96 tanf (float x)
97 {
98 /* Get top words. */
99 uint32_t ix = asuint (x);
100 uint32_t ia = ix & AbsMask;
101 uint32_t ia12 = ia >> 20;
102
103 /* Dispatch between no reduction (small numbers), fast reduction and
104 slow large numbers reduction. The reduction step determines r float
105 (|r| < pi/4) and n signed integer such that x = r + n * pi/2. */
106 int32_t n;
107 float r;
108 if (ia12 < top12 (Pio4))
109 {
110 /* Optimize small values. */
111 if (unlikely (ia12 < top12 (0x1p-12f)))
112 {
113 if (unlikely (ia12 < top12 (0x1p-126f)))
114 /* Force underflow for tiny x. */
115 force_eval_float (x * x);
116 return x;
117 }
118
119 /* tan (x) ~= x + x^3 * P(x^2). */
120 float x2 = x * x;
121 float y = eval_P (x2);
122 return fmaf (x2, x * y, x);
123 }
124 /* Similar to other trigonometric routines, fast inaccurate reduction is
125 performed for values of x from pi/4 up to RangeVal. In order to keep
126 errors below 3.5ulps, we set the value of RangeVal to 2^17. This might
127 differ for other trigonometric routines. Above this value more advanced
128 but slower reduction techniques need to be implemented to reach a similar
129 accuracy. */
130 else if (ia12 < top12 (RangeVal))
131 {
132 /* Fast inaccurate reduction. */
133 r = reduce (x, &n);
134 }
135 else if (ia12 < 0x7f8)
136 {
137 /* Slow accurate reduction. */
138 uint32_t sign = ix & ~AbsMask;
139 double dar = reduce_large (ia, &n);
140 float ar = (float) dar;
141 r = asfloat (asuint (ar) ^ sign);
142 }
143 else
144 {
145 /* tan(Inf or NaN) is NaN. */
146 return __math_invalidf (x);
147 }
148
149 /* If x lives in an interval where |tan(x)|
150 - is finite then use an approximation of tangent in the form
151 tan(r) ~ r + r^3 * P(r^2) = r + r * r^2 * P(r^2).
152 - grows to infinity then use an approximation of cotangent in the form
153 cotan(z) ~ 1/z + z * Q(z^2), where the reciprocal can be computed early.
154 Using symmetries of tangent and the identity tan(r) = cotan(pi/2 - r),
155 we only need to change the sign of r to obtain tan(x) from cotan(r).
156 This 2-interval approach requires 2 different sets of coefficients P and
157 Q, where Q is a lower order polynomial than P. */
158
159 /* Determine if x lives in an interval where |tan(x)| grows to infinity. */
160 uint32_t alt = (uint32_t) n & 1;
161
162 /* Perform additional reduction if required. */
163 float z = alt ? -r : r;
164
165 /* Prepare backward transformation. */
166 float z2 = r * r;
167 float offset = alt ? 1.0f / z : z;
168 float scale = alt ? z : z * z2;
169
170 /* Evaluate polynomial approximation of tan or cotan. */
171 float p = alt ? eval_Q (z2) : eval_P (z2);
172
173 /* A unified way of assembling the result on both interval types. */
174 return fmaf (scale, p, offset);
175 }
176
177 TEST_SIG (S, F, 1, tan, -3.1, 3.1)
178 TEST_ULP (tanf, 2.80)
179 TEST_INTERVAL (tanf, 0, 0xffff0000, 10000)
180 TEST_SYM_INTERVAL (tanf, 0x1p-127, 0x1p-14, 50000)
181 TEST_SYM_INTERVAL (tanf, 0x1p-14, 0.7, 50000)
182 TEST_SYM_INTERVAL (tanf, 0.7, 1.5, 50000)
183 TEST_SYM_INTERVAL (tanf, 1.5, 0x1p17, 50000)
184 TEST_SYM_INTERVAL (tanf, 0x1p17, 0x1p54, 50000)
185 TEST_SYM_INTERVAL (tanf, 0x1p54, inf, 50000)
186