xref: /freebsd/contrib/arm-optimized-routines/math/aarch64/sve/sv_sincos_common.h (revision f3087bef11543b42e0d69b708f367097a4118d24)
/*
 * Core approximation for double-precision vector sincos
 *
 * Copyright (c) 2023-2024, Arm Limited.
 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
 */
7*f3087befSAndrew Turner 
8*f3087befSAndrew Turner #include "sv_math.h"
9*f3087befSAndrew Turner #include "sv_poly_f64.h"
10*f3087befSAndrew Turner 
/* Constants for the double-precision vector sincos core approximation.  */
static const struct sv_sincos_data
{
  double sin_poly[7], cos_poly[6], pio2[3];
  double inv_pio2, shift, range_val;
} sv_sincos_data = {
  /* 2/pi: multiplied by x to obtain the quadrant number.  */
  .inv_pio2 = 0x1.45f306dc9c882p-1,
  /* pi/2 split into three pieces of decreasing magnitude; the argument
     reduction subtracts q * pio2[i] one piece at a time.  */
  .pio2 = { 0x1.921fb50000000p+0, 0x1.110b460000000p-26,
            0x1.1a62633145c07p-54 },
  /* Added to and then subtracted from 2 * x / pi to round it to the nearest
     integer.  */
  .shift = 0x1.8p52,
  .sin_poly = { /* Computed using Remez in [-pi/2, pi/2].  */
                -0x1.555555555547bp-3, 0x1.1111111108a4dp-7,
                -0x1.a01a019936f27p-13, 0x1.71de37a97d93ep-19,
                -0x1.ae633919987c6p-26, 0x1.60e277ae07cecp-33,
                -0x1.9e9540300a1p-41 },
  .cos_poly = { /* Computed using Remez in [-pi/4, pi/4].  */
                0x1.555555555554cp-5, -0x1.6c16c16c1521fp-10,
                0x1.a01a019cbf62ap-16, -0x1.27e4f812b681ep-22,
                0x1.1ee9f152a57cdp-29, -0x1.8fb131098404bp-37 },
  /* Threshold that check_ge_rangeval compares |x| against.  */
  .range_val = 0x1p23,
};
30*f3087befSAndrew Turner 
31*f3087befSAndrew Turner static inline svbool_t
check_ge_rangeval(svbool_t pg,svfloat64_t x,const struct sv_sincos_data * d)32*f3087befSAndrew Turner check_ge_rangeval (svbool_t pg, svfloat64_t x, const struct sv_sincos_data *d)
33*f3087befSAndrew Turner {
34*f3087befSAndrew Turner   svbool_t in_bounds = svaclt (pg, x, d->range_val);
35*f3087befSAndrew Turner   return svnot_z (pg, in_bounds);
36*f3087befSAndrew Turner }
37*f3087befSAndrew Turner 
38*f3087befSAndrew Turner /* Double-precision vector function allowing calculation of both sin and cos in
39*f3087befSAndrew Turner    one function call, using shared argument reduction and separate polynomials.
40*f3087befSAndrew Turner    Largest observed error is for sin, 3.22 ULP:
41*f3087befSAndrew Turner    v_sincos_sin (0x1.d70eef40f39b1p+12) got -0x1.ffe9537d5dbb7p-3
42*f3087befSAndrew Turner 				       want -0x1.ffe9537d5dbb4p-3.  */
43*f3087befSAndrew Turner static inline svfloat64x2_t
sv_sincos_inline(svbool_t pg,svfloat64_t x,const struct sv_sincos_data * d)44*f3087befSAndrew Turner sv_sincos_inline (svbool_t pg, svfloat64_t x, const struct sv_sincos_data *d)
45*f3087befSAndrew Turner {
46*f3087befSAndrew Turner   /* q = nearest integer to 2 * x / pi.  */
47*f3087befSAndrew Turner   svfloat64_t q = svsub_x (pg, svmla_x (pg, sv_f64 (d->shift), x, d->inv_pio2),
48*f3087befSAndrew Turner 			   d->shift);
49*f3087befSAndrew Turner   svint64_t n = svcvt_s64_x (pg, q);
50*f3087befSAndrew Turner 
51*f3087befSAndrew Turner   /* Reduce x such that r is in [ -pi/4, pi/4 ].  */
52*f3087befSAndrew Turner   svfloat64_t r = x;
53*f3087befSAndrew Turner   r = svmls_x (pg, r, q, d->pio2[0]);
54*f3087befSAndrew Turner   r = svmls_x (pg, r, q, d->pio2[1]);
55*f3087befSAndrew Turner   r = svmls_x (pg, r, q, d->pio2[2]);
56*f3087befSAndrew Turner 
57*f3087befSAndrew Turner   svfloat64_t r2 = svmul_x (pg, r, r), r3 = svmul_x (pg, r2, r),
58*f3087befSAndrew Turner 	      r4 = svmul_x (pg, r2, r2);
59*f3087befSAndrew Turner 
60*f3087befSAndrew Turner   /* Approximate sin(r) ~= r + r^3 * poly_sin(r^2).  */
61*f3087befSAndrew Turner   svfloat64_t s = sv_pw_horner_6_f64_x (pg, r2, r4, d->sin_poly);
62*f3087befSAndrew Turner   s = svmla_x (pg, r, r3, s);
63*f3087befSAndrew Turner 
64*f3087befSAndrew Turner   /* Approximate cos(r) ~= 1 - (r^2)/2 + r^4 * poly_cos(r^2).  */
65*f3087befSAndrew Turner   svfloat64_t c = sv_pw_horner_5_f64_x (pg, r2, r4, d->cos_poly);
66*f3087befSAndrew Turner   c = svmad_x (pg, c, r2, -0.5);
67*f3087befSAndrew Turner   c = svmad_x (pg, c, r2, 1);
68*f3087befSAndrew Turner 
69*f3087befSAndrew Turner   svuint64_t un = svreinterpret_u64 (n);
70*f3087befSAndrew Turner   /* If odd quadrant, swap cos and sin.  */
71*f3087befSAndrew Turner   svbool_t swap = svcmpeq (pg, svlsl_x (pg, un, 63), 0);
72*f3087befSAndrew Turner   svfloat64_t ss = svsel (swap, s, c);
73*f3087befSAndrew Turner   svfloat64_t cc = svsel (swap, c, s);
74*f3087befSAndrew Turner 
75*f3087befSAndrew Turner   /* Fix signs according to quadrant.
76*f3087befSAndrew Turner      ss = asdouble(asuint64(ss) ^ ((n       & 2) << 62))
77*f3087befSAndrew Turner      cc = asdouble(asuint64(cc) & (((n + 1) & 2) << 62)).  */
78*f3087befSAndrew Turner   svuint64_t sin_sign = svlsl_x (pg, svand_x (pg, un, 2), 62);
79*f3087befSAndrew Turner   svuint64_t cos_sign = svlsl_x (
80*f3087befSAndrew Turner       pg, svand_x (pg, svreinterpret_u64 (svadd_x (pg, n, 1)), 2), 62);
81*f3087befSAndrew Turner   ss = svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (ss), sin_sign));
82*f3087befSAndrew Turner   cc = svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (cc), cos_sign));
83*f3087befSAndrew Turner 
84*f3087befSAndrew Turner   return svcreate2 (ss, cc);
85*f3087befSAndrew Turner }
86