xref: /freebsd/contrib/arm-optimized-routines/math/aarch64/advsimd/v_sincos_common.h (revision dd21556857e8d40f66bf5ad54754d9d52669ebf7)
1 /*
2  * Core approximation for double-precision vector sincos
3  *
4  * Copyright (c) 2023-2024, Arm Limited.
5  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
6  */
7 
8 #include "v_math.h"
9 #include "v_poly_f64.h"
10 
11 static const struct v_sincos_data
12 {
13   float64x2_t sin_poly[7], cos_poly[6], pio2[3];
14   float64x2_t inv_pio2, shift, range_val;
15 } v_sincos_data = {
16   .inv_pio2 = V2 (0x1.45f306dc9c882p-1),
17   .pio2 = { V2 (0x1.921fb50000000p+0), V2 (0x1.110b460000000p-26),
18 	    V2 (0x1.1a62633145c07p-54) },
19   .shift = V2 (0x1.8p52),
20   .sin_poly = { /* Computed using Remez in [-pi/2, pi/2].  */
21 	        V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7),
22 		V2 (-0x1.a01a019936f27p-13), V2 (0x1.71de37a97d93ep-19),
23 		V2 (-0x1.ae633919987c6p-26), V2 (0x1.60e277ae07cecp-33),
24 		V2 (-0x1.9e9540300a1p-41) },
25   .cos_poly = { /* Computed using Remez in [-pi/4, pi/4].  */
26 	        V2 (0x1.555555555554cp-5), V2 (-0x1.6c16c16c1521fp-10),
27 		V2 (0x1.a01a019cbf62ap-16), V2 (-0x1.27e4f812b681ep-22),
28 		V2 (0x1.1ee9f152a57cdp-29), V2 (-0x1.8fb131098404bp-37) },
29   .range_val = V2 (0x1p23), };
30 
31 static inline uint64x2_t
32 check_ge_rangeval (float64x2_t x, const struct v_sincos_data *d)
33 {
34   return vcagtq_f64 (x, d->range_val);
35 }
36 
37 /* Double-precision vector function allowing calculation of both sin and cos in
38    one function call, using shared argument reduction and separate polynomials.
39    Largest observed error is for sin, 3.22 ULP:
40    v_sincos_sin (0x1.d70eef40f39b1p+12) got -0x1.ffe9537d5dbb7p-3
41 				       want -0x1.ffe9537d5dbb4p-3.  */
42 static inline float64x2x2_t
43 v_sincos_inline (float64x2_t x, const struct v_sincos_data *d)
44 {
45   /* q = nearest integer to 2 * x / pi.  */
46   float64x2_t q = vsubq_f64 (vfmaq_f64 (d->shift, x, d->inv_pio2), d->shift);
47   int64x2_t n = vcvtq_s64_f64 (q);
48 
49   /* Use q to reduce x to r in [-pi/4, pi/4], by:
50      r = x - q * pi/2, in extended precision.  */
51   float64x2_t r = x;
52   r = vfmsq_f64 (r, q, d->pio2[0]);
53   r = vfmsq_f64 (r, q, d->pio2[1]);
54   r = vfmsq_f64 (r, q, d->pio2[2]);
55 
56   float64x2_t r2 = r * r, r3 = r2 * r, r4 = r2 * r2;
57 
58   /* Approximate sin(r) ~= r + r^3 * poly_sin(r^2).  */
59   float64x2_t s = v_pw_horner_6_f64 (r2, r4, d->sin_poly);
60   s = vfmaq_f64 (r, r3, s);
61 
62   /* Approximate cos(r) ~= 1 - (r^2)/2 + r^4 * poly_cos(r^2).  */
63   float64x2_t c = v_pw_horner_5_f64 (r2, r4, d->cos_poly);
64   c = vfmaq_f64 (v_f64 (-0.5), r2, c);
65   c = vfmaq_f64 (v_f64 (1), r2, c);
66 
67   /* If odd quadrant, swap cos and sin.  */
68   uint64x2_t swap = vtstq_s64 (n, v_s64 (1));
69   float64x2_t ss = vbslq_f64 (swap, c, s);
70   float64x2_t cc = vbslq_f64 (swap, s, c);
71 
72   /* Fix signs according to quadrant.
73      ss = asdouble(asuint64(ss) ^ ((n       & 2) << 62))
74      cc = asdouble(asuint64(cc) & (((n + 1) & 2) << 62)).  */
75   uint64x2_t sin_sign
76       = vshlq_n_u64 (vandq_u64 (vreinterpretq_u64_s64 (n), v_u64 (2)), 62);
77   uint64x2_t cos_sign = vshlq_n_u64 (
78       vandq_u64 (vreinterpretq_u64_s64 (vaddq_s64 (n, v_s64 (1))), v_u64 (2)),
79       62);
80   ss = vreinterpretq_f64_u64 (
81       veorq_u64 (vreinterpretq_u64_f64 (ss), sin_sign));
82   cc = vreinterpretq_f64_u64 (
83       veorq_u64 (vreinterpretq_u64_f64 (cc), cos_sign));
84 
85   return (float64x2x2_t){ ss, cc };
86 }
87