xref: /freebsd/contrib/arm-optimized-routines/math/aarch64/experimental/tanf_3u3.c (revision f3087bef11543b42e0d69b708f367097a4118d24)
1*f3087befSAndrew Turner /*
2*f3087befSAndrew Turner  * Single-precision scalar tan(x) function.
3*f3087befSAndrew Turner  *
4*f3087befSAndrew Turner  * Copyright (c) 2021-2024, Arm Limited.
5*f3087befSAndrew Turner  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
6*f3087befSAndrew Turner  */
7*f3087befSAndrew Turner #include "math_config.h"
8*f3087befSAndrew Turner #include "test_sig.h"
9*f3087befSAndrew Turner #include "test_defs.h"
10*f3087befSAndrew Turner #include "poly_scalar_f32.h"
11*f3087befSAndrew Turner 
12*f3087befSAndrew Turner /* Useful constants.  */
13*f3087befSAndrew Turner #define NegPio2_1 (-0x1.921fb6p+0f)
14*f3087befSAndrew Turner #define NegPio2_2 (0x1.777a5cp-25f)
15*f3087befSAndrew Turner #define NegPio2_3 (0x1.ee59dap-50f)
16*f3087befSAndrew Turner /* Reduced from 0x1p20 to 0x1p17 to ensure 3.5ulps.  */
17*f3087befSAndrew Turner #define RangeVal (0x1p17f)
18*f3087befSAndrew Turner #define InvPio2 ((0x1.45f306p-1f))
19*f3087befSAndrew Turner #define Shift (0x1.8p+23f)
20*f3087befSAndrew Turner #define AbsMask (0x7fffffff)
21*f3087befSAndrew Turner #define Pio4 (0x1.921fb6p-1)
22*f3087befSAndrew Turner /* 2PI * 2^-64.  */
23*f3087befSAndrew Turner #define Pio2p63 (0x1.921FB54442D18p-62)
24*f3087befSAndrew Turner 
25*f3087befSAndrew Turner static inline float
eval_P(float z)26*f3087befSAndrew Turner eval_P (float z)
27*f3087befSAndrew Turner {
28*f3087befSAndrew Turner   return pw_horner_5_f32 (z, z * z, __tanf_poly_data.poly_tan);
29*f3087befSAndrew Turner }
30*f3087befSAndrew Turner 
31*f3087befSAndrew Turner static inline float
eval_Q(float z)32*f3087befSAndrew Turner eval_Q (float z)
33*f3087befSAndrew Turner {
34*f3087befSAndrew Turner   return pairwise_poly_3_f32 (z, z * z, __tanf_poly_data.poly_cotan);
35*f3087befSAndrew Turner }
36*f3087befSAndrew Turner 
37*f3087befSAndrew Turner /* Reduction of the input argument x using Cody-Waite approach, such that x = r
38*f3087befSAndrew Turner    + n * pi/2 with r lives in [-pi/4, pi/4] and n is a signed integer.  */
39*f3087befSAndrew Turner static inline float
reduce(float x,int32_t * in)40*f3087befSAndrew Turner reduce (float x, int32_t *in)
41*f3087befSAndrew Turner {
42*f3087befSAndrew Turner   /* n = rint(x/(pi/2)).  */
43*f3087befSAndrew Turner   float r = x;
44*f3087befSAndrew Turner   float q = fmaf (InvPio2, r, Shift);
45*f3087befSAndrew Turner   float n = q - Shift;
46*f3087befSAndrew Turner   /* There is no rounding here, n is representable by a signed integer.  */
47*f3087befSAndrew Turner   *in = (int32_t) n;
48*f3087befSAndrew Turner   /* r = x - n * (pi/2)  (range reduction into -pi/4 .. pi/4).  */
49*f3087befSAndrew Turner   r = fmaf (NegPio2_1, n, r);
50*f3087befSAndrew Turner   r = fmaf (NegPio2_2, n, r);
51*f3087befSAndrew Turner   r = fmaf (NegPio2_3, n, r);
52*f3087befSAndrew Turner   return r;
53*f3087befSAndrew Turner }
54*f3087befSAndrew Turner 
55*f3087befSAndrew Turner /* Reduce the range of XI to a multiple of PI/2 using fast integer arithmetic.
56*f3087befSAndrew Turner    XI is a reinterpreted float and must be >= 2.0f (the sign bit is ignored).
57*f3087befSAndrew Turner    Return the modulo between -PI/4 and PI/4 and store the quadrant in NP.
58*f3087befSAndrew Turner    Reduction uses a table of 4/PI with 192 bits of precision.  A 32x96->128 bit
59*f3087befSAndrew Turner    multiply computes the exact 2.62-bit fixed-point modulo.  Since the result
60*f3087befSAndrew Turner    can have at most 29 leading zeros after the binary point, the double
61*f3087befSAndrew Turner    precision result is accurate to 33 bits.  */
62*f3087befSAndrew Turner static inline double
reduce_large(uint32_t xi,int * np)63*f3087befSAndrew Turner reduce_large (uint32_t xi, int *np)
64*f3087befSAndrew Turner {
65*f3087befSAndrew Turner   const uint32_t *arr = &__inv_pio4[(xi >> 26) & 15];
66*f3087befSAndrew Turner   int shift = (xi >> 23) & 7;
67*f3087befSAndrew Turner   uint64_t n, res0, res1, res2;
68*f3087befSAndrew Turner 
69*f3087befSAndrew Turner   xi = (xi & 0xffffff) | 0x800000;
70*f3087befSAndrew Turner   xi <<= shift;
71*f3087befSAndrew Turner 
72*f3087befSAndrew Turner   res0 = xi * arr[0];
73*f3087befSAndrew Turner   res1 = (uint64_t) xi * arr[4];
74*f3087befSAndrew Turner   res2 = (uint64_t) xi * arr[8];
75*f3087befSAndrew Turner   res0 = (res2 >> 32) | (res0 << 32);
76*f3087befSAndrew Turner   res0 += res1;
77*f3087befSAndrew Turner 
78*f3087befSAndrew Turner   n = (res0 + (1ULL << 61)) >> 62;
79*f3087befSAndrew Turner   res0 -= n << 62;
80*f3087befSAndrew Turner   double x = (int64_t) res0;
81*f3087befSAndrew Turner   *np = n;
82*f3087befSAndrew Turner   return x * Pio2p63;
83*f3087befSAndrew Turner }
84*f3087befSAndrew Turner 
85*f3087befSAndrew Turner /* Top 12 bits of the float representation with the sign bit cleared.  */
86*f3087befSAndrew Turner static inline uint32_t
top12(float x)87*f3087befSAndrew Turner top12 (float x)
88*f3087befSAndrew Turner {
89*f3087befSAndrew Turner   return (asuint (x) >> 20);
90*f3087befSAndrew Turner }
91*f3087befSAndrew Turner 
92*f3087befSAndrew Turner /* Fast single-precision tan implementation.
93*f3087befSAndrew Turner    Maximum ULP error: 3.293ulps.
94*f3087befSAndrew Turner    tanf(0x1.c849eap+16) got -0x1.fe8d98p-1 want -0x1.fe8d9ep-1.  */
95*f3087befSAndrew Turner float
tanf(float x)96*f3087befSAndrew Turner tanf (float x)
97*f3087befSAndrew Turner {
98*f3087befSAndrew Turner   /* Get top words.  */
99*f3087befSAndrew Turner   uint32_t ix = asuint (x);
100*f3087befSAndrew Turner   uint32_t ia = ix & AbsMask;
101*f3087befSAndrew Turner   uint32_t ia12 = ia >> 20;
102*f3087befSAndrew Turner 
103*f3087befSAndrew Turner   /* Dispatch between no reduction (small numbers), fast reduction and
104*f3087befSAndrew Turner      slow large numbers reduction. The reduction step determines r float
105*f3087befSAndrew Turner      (|r| < pi/4) and n signed integer such that x = r + n * pi/2.  */
106*f3087befSAndrew Turner   int32_t n;
107*f3087befSAndrew Turner   float r;
108*f3087befSAndrew Turner   if (ia12 < top12 (Pio4))
109*f3087befSAndrew Turner     {
110*f3087befSAndrew Turner       /* Optimize small values.  */
111*f3087befSAndrew Turner       if (unlikely (ia12 < top12 (0x1p-12f)))
112*f3087befSAndrew Turner 	{
113*f3087befSAndrew Turner 	  if (unlikely (ia12 < top12 (0x1p-126f)))
114*f3087befSAndrew Turner 	    /* Force underflow for tiny x.  */
115*f3087befSAndrew Turner 	    force_eval_float (x * x);
116*f3087befSAndrew Turner 	  return x;
117*f3087befSAndrew Turner 	}
118*f3087befSAndrew Turner 
119*f3087befSAndrew Turner       /* tan (x) ~= x + x^3 * P(x^2).  */
120*f3087befSAndrew Turner       float x2 = x * x;
121*f3087befSAndrew Turner       float y = eval_P (x2);
122*f3087befSAndrew Turner       return fmaf (x2, x * y, x);
123*f3087befSAndrew Turner     }
124*f3087befSAndrew Turner   /* Similar to other trigonometric routines, fast inaccurate reduction is
125*f3087befSAndrew Turner      performed for values of x from pi/4 up to RangeVal. In order to keep
126*f3087befSAndrew Turner      errors below 3.5ulps, we set the value of RangeVal to 2^17. This might
127*f3087befSAndrew Turner      differ for other trigonometric routines. Above this value more advanced
128*f3087befSAndrew Turner      but slower reduction techniques need to be implemented to reach a similar
129*f3087befSAndrew Turner      accuracy.  */
130*f3087befSAndrew Turner   else if (ia12 < top12 (RangeVal))
131*f3087befSAndrew Turner     {
132*f3087befSAndrew Turner       /* Fast inaccurate reduction.  */
133*f3087befSAndrew Turner       r = reduce (x, &n);
134*f3087befSAndrew Turner     }
135*f3087befSAndrew Turner   else if (ia12 < 0x7f8)
136*f3087befSAndrew Turner     {
137*f3087befSAndrew Turner       /* Slow accurate reduction.  */
138*f3087befSAndrew Turner       uint32_t sign = ix & ~AbsMask;
139*f3087befSAndrew Turner       double dar = reduce_large (ia, &n);
140*f3087befSAndrew Turner       float ar = (float) dar;
141*f3087befSAndrew Turner       r = asfloat (asuint (ar) ^ sign);
142*f3087befSAndrew Turner     }
143*f3087befSAndrew Turner   else
144*f3087befSAndrew Turner     {
145*f3087befSAndrew Turner       /* tan(Inf or NaN) is NaN.  */
146*f3087befSAndrew Turner       return __math_invalidf (x);
147*f3087befSAndrew Turner     }
148*f3087befSAndrew Turner 
149*f3087befSAndrew Turner   /* If x lives in an interval where |tan(x)|
150*f3087befSAndrew Turner      - is finite then use an approximation of tangent in the form
151*f3087befSAndrew Turner        tan(r) ~ r + r^3 * P(r^2) = r + r * r^2 * P(r^2).
152*f3087befSAndrew Turner      - grows to infinity then use an approximation of cotangent in the form
153*f3087befSAndrew Turner        cotan(z) ~ 1/z + z * Q(z^2), where the reciprocal can be computed early.
154*f3087befSAndrew Turner        Using symmetries of tangent and the identity tan(r) = cotan(pi/2 - r),
155*f3087befSAndrew Turner        we only need to change the sign of r to obtain tan(x) from cotan(r).
156*f3087befSAndrew Turner      This 2-interval approach requires 2 different sets of coefficients P and
157*f3087befSAndrew Turner      Q, where Q is a lower order polynomial than P.  */
158*f3087befSAndrew Turner 
159*f3087befSAndrew Turner   /* Determine if x lives in an interval where |tan(x)| grows to infinity.  */
160*f3087befSAndrew Turner   uint32_t alt = (uint32_t) n & 1;
161*f3087befSAndrew Turner 
162*f3087befSAndrew Turner   /* Perform additional reduction if required.  */
163*f3087befSAndrew Turner   float z = alt ? -r : r;
164*f3087befSAndrew Turner 
165*f3087befSAndrew Turner   /* Prepare backward transformation.  */
166*f3087befSAndrew Turner   float z2 = r * r;
167*f3087befSAndrew Turner   float offset = alt ? 1.0f / z : z;
168*f3087befSAndrew Turner   float scale = alt ? z : z * z2;
169*f3087befSAndrew Turner 
170*f3087befSAndrew Turner   /* Evaluate polynomial approximation of tan or cotan.  */
171*f3087befSAndrew Turner   float p = alt ? eval_Q (z2) : eval_P (z2);
172*f3087befSAndrew Turner 
173*f3087befSAndrew Turner   /* A unified way of assembling the result on both interval types.  */
174*f3087befSAndrew Turner   return fmaf (scale, p, offset);
175*f3087befSAndrew Turner }
176*f3087befSAndrew Turner 
177*f3087befSAndrew Turner TEST_SIG (S, F, 1, tan, -3.1, 3.1)
178*f3087befSAndrew Turner TEST_ULP (tanf, 2.80)
179*f3087befSAndrew Turner TEST_INTERVAL (tanf, 0, 0xffff0000, 10000)
180*f3087befSAndrew Turner TEST_SYM_INTERVAL (tanf, 0x1p-127, 0x1p-14, 50000)
181*f3087befSAndrew Turner TEST_SYM_INTERVAL (tanf, 0x1p-14, 0.7, 50000)
182*f3087befSAndrew Turner TEST_SYM_INTERVAL (tanf, 0.7, 1.5, 50000)
183*f3087befSAndrew Turner TEST_SYM_INTERVAL (tanf, 1.5, 0x1p17, 50000)
184*f3087befSAndrew Turner TEST_SYM_INTERVAL (tanf, 0x1p17, 0x1p54, 50000)
185*f3087befSAndrew Turner TEST_SYM_INTERVAL (tanf, 0x1p54, inf, 50000)
186