1 /* 2 * Core approximation for double-precision vector sincos 3 * 4 * Copyright (c) 2023, Arm Limited. 5 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception 6 */ 7 8 #include "v_math.h" 9 #include "poly_advsimd_f64.h" 10 11 static const struct v_sincos_data 12 { 13 float64x2_t sin_poly[7], cos_poly[6], pio2[3]; 14 float64x2_t inv_pio2, shift, range_val; 15 } v_sincos_data = { 16 .inv_pio2 = V2 (0x1.45f306dc9c882p-1), 17 .pio2 = { V2 (0x1.921fb50000000p+0), V2 (0x1.110b460000000p-26), 18 V2 (0x1.1a62633145c07p-54) }, 19 .shift = V2 (0x1.8p52), 20 .sin_poly = { /* Computed using Remez in [-pi/2, pi/2]. */ 21 V2 (-0x1.555555555547bp-3), V2 (0x1.1111111108a4dp-7), 22 V2 (-0x1.a01a019936f27p-13), V2 (0x1.71de37a97d93ep-19), 23 V2 (-0x1.ae633919987c6p-26), V2 (0x1.60e277ae07cecp-33), 24 V2 (-0x1.9e9540300a1p-41) }, 25 .cos_poly = { /* Computed using Remez in [-pi/4, pi/4]. */ 26 V2 (0x1.555555555554cp-5), V2 (-0x1.6c16c16c1521fp-10), 27 V2 (0x1.a01a019cbf62ap-16), V2 (-0x1.27e4f812b681ep-22), 28 V2 (0x1.1ee9f152a57cdp-29), V2 (-0x1.8fb131098404bp-37) }, 29 .range_val = V2 (0x1p23), }; 30 31 static inline uint64x2_t 32 check_ge_rangeval (float64x2_t x, const struct v_sincos_data *d) 33 { 34 return vcagtq_f64 (x, d->range_val); 35 } 36 37 /* Double-precision vector function allowing calculation of both sin and cos in 38 one function call, using shared argument reduction and separate polynomials. 39 Largest observed error is for sin, 3.22 ULP: 40 v_sincos_sin (0x1.d70eef40f39b1p+12) got -0x1.ffe9537d5dbb7p-3 41 want -0x1.ffe9537d5dbb4p-3. */ 42 static inline float64x2x2_t 43 v_sincos_inline (float64x2_t x, const struct v_sincos_data *d) 44 { 45 /* q = nearest integer to 2 * x / pi. */ 46 float64x2_t q = vsubq_f64 (vfmaq_f64 (d->shift, x, d->inv_pio2), d->shift); 47 int64x2_t n = vcvtq_s64_f64 (q); 48 49 /* Use q to reduce x to r in [-pi/4, pi/4], by: 50 r = x - q * pi/2, in extended precision. */ 51 float64x2_t r = x; 52 r = vfmsq_f64 (r, q, d->pio2[0]); 53 r = vfmsq_f64 (r, q, d->pio2[1]); 54 r = vfmsq_f64 (r, q, d->pio2[2]); 55 56 float64x2_t r2 = r * r, r3 = r2 * r, r4 = r2 * r2; 57 58 /* Approximate sin(r) ~= r + r^3 * poly_sin(r^2). */ 59 float64x2_t s = v_pw_horner_6_f64 (r2, r4, d->sin_poly); 60 s = vfmaq_f64 (r, r3, s); 61 62 /* Approximate cos(r) ~= 1 - (r^2)/2 + r^4 * poly_cos(r^2). */ 63 float64x2_t c = v_pw_horner_5_f64 (r2, r4, d->cos_poly); 64 c = vfmaq_f64 (v_f64 (-0.5), r2, c); 65 c = vfmaq_f64 (v_f64 (1), r2, c); 66 67 /* If odd quadrant, swap cos and sin. */ 68 uint64x2_t swap = vtstq_s64 (n, v_s64 (1)); 69 float64x2_t ss = vbslq_f64 (swap, c, s); 70 float64x2_t cc = vbslq_f64 (swap, s, c); 71 72 /* Fix signs according to quadrant. 73 ss = asdouble(asuint64(ss) ^ ((n & 2) << 62)) 74 cc = asdouble(asuint64(cc) & (((n + 1) & 2) << 62)). */ 75 uint64x2_t sin_sign 76 = vshlq_n_u64 (vandq_u64 (vreinterpretq_u64_s64 (n), v_u64 (2)), 62); 77 uint64x2_t cos_sign = vshlq_n_u64 ( 78 vandq_u64 (vreinterpretq_u64_s64 (vaddq_s64 (n, v_s64 (1))), v_u64 (2)), 79 62); 80 ss = vreinterpretq_f64_u64 ( 81 veorq_u64 (vreinterpretq_u64_f64 (ss), sin_sign)); 82 cc = vreinterpretq_f64_u64 ( 83 veorq_u64 (vreinterpretq_u64_f64 (cc), cos_sign)); 84 85 return (float64x2x2_t){ ss, cc }; 86 } 87