xref: /freebsd/contrib/arm-optimized-routines/math/aarch64/advsimd/tanpif.c (revision dd21556857e8d40f66bf5ad54754d9d52669ebf7)
1 /*
2  * Single-precision vector tanpi(x) function.
3  *
4  * Copyright (c) 2024, Arm Limited.
5  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
6  */
7 
8 #include "v_math.h"
9 #include "test_sig.h"
10 #include "test_defs.h"
11 
12 const static struct v_tanpif_data
13 {
14   float32x4_t c0, c2, c4, c6;
15   float c1, c3, c5, c7;
16 } tanpif_data = {
17   /* Coefficents for tan(pi * x).  */
18   .c0 = V4 (0x1.921fb4p1f),  .c1 = 0x1.4abbcep3f,      .c2 = V4 (0x1.466b8p5f),
19   .c3 = 0x1.461c72p7f,	     .c4 = V4 (0x1.42e9d4p9f), .c5 = 0x1.69e2c4p11f,
20   .c6 = V4 (0x1.e85558p11f), .c7 = 0x1.a52e08p16f,
21 };
22 
23 /* Approximation for single-precision vector tanpi(x)
24    The maximum error is 3.34 ULP:
25    _ZGVnN4v_tanpif(0x1.d6c09ap-2) got 0x1.f70aacp+2
26 				 want 0x1.f70aa6p+2.  */
27 float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (tanpi) (float32x4_t x)
28 {
29   const struct v_tanpif_data *d = ptr_barrier (&tanpif_data);
30 
31   float32x4_t n = vrndnq_f32 (x);
32 
33   /* inf produces nan that propagates.  */
34   float32x4_t xr = vsubq_f32 (x, n);
35   float32x4_t ar = vabdq_f32 (x, n);
36   uint32x4_t flip = vcgtq_f32 (ar, v_f32 (0.25f));
37   float32x4_t r = vbslq_f32 (flip, vsubq_f32 (v_f32 (0.5f), ar), ar);
38 
39   /* Order-7 pairwise Horner polynomial evaluation scheme.  */
40   float32x4_t r2 = vmulq_f32 (r, r);
41   float32x4_t r4 = vmulq_f32 (r2, r2);
42 
43   float32x4_t odd_coeffs = vld1q_f32 (&d->c1);
44   float32x4_t p01 = vfmaq_laneq_f32 (d->c0, r2, odd_coeffs, 0);
45   float32x4_t p23 = vfmaq_laneq_f32 (d->c2, r2, odd_coeffs, 1);
46   float32x4_t p45 = vfmaq_laneq_f32 (d->c4, r2, odd_coeffs, 2);
47   float32x4_t p67 = vfmaq_laneq_f32 (d->c6, r2, odd_coeffs, 3);
48   float32x4_t p = vfmaq_f32 (p45, r4, p67);
49   p = vfmaq_f32 (p23, r4, p);
50   p = vfmaq_f32 (p01, r4, p);
51 
52   p = vmulq_f32 (r, p);
53   float32x4_t p_recip = vdivq_f32 (v_f32 (1.0f), p);
54   float32x4_t y = vbslq_f32 (flip, p_recip, p);
55 
56   uint32x4_t sign
57       = veorq_u32 (vreinterpretq_u32_f32 (xr), vreinterpretq_u32_f32 (ar));
58   return vreinterpretq_f32_u32 (vorrq_u32 (vreinterpretq_u32_f32 (y), sign));
59 }
60 
61 HALF_WIDTH_ALIAS_F1 (tanpi)
62 
63 #if WANT_TRIGPI_TESTS
64 TEST_DISABLE_FENV (V_NAME_F1 (tanpi))
65 TEST_ULP (V_NAME_F1 (tanpi), 2.84)
66 TEST_SYM_INTERVAL (V_NAME_F1 (tanpi), 0, 0x1p-31, 50000)
67 TEST_SYM_INTERVAL (V_NAME_F1 (tanpi), 0x1p-31, 0.5, 100000)
68 TEST_SYM_INTERVAL (V_NAME_F1 (tanpi), 0.5, 0x1p23f, 100000)
69 TEST_SYM_INTERVAL (V_NAME_F1 (tanpi), 0x1p23f, inf, 100000)
70 #endif
71