1 /* 2 * Core approximation for single-precision vector sincos 3 * 4 * Copyright (c) 2023-2024, Arm Limited. 5 * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception 6 */ 7 8 #include "v_math.h" 9 10 const static struct v_sincosf_data 11 { 12 float32x4_t poly_sin[3], poly_cos[3], pio2[3], inv_pio2, shift, range_val; 13 } v_sincosf_data = { 14 .poly_sin = { /* Generated using Remez, odd coeffs only, in [-pi/4, pi/4]. */ 15 V4 (-0x1.555546p-3), V4 (0x1.11076p-7), V4 (-0x1.994eb4p-13) }, 16 .poly_cos = { /* Generated using Remez, even coeffs only, in [-pi/4, pi/4]. */ 17 V4 (0x1.55554ap-5), V4 (-0x1.6c0c1ap-10), V4 (0x1.99e0eep-16) }, 18 .pio2 = { V4 (0x1.921fb6p+0f), V4 (-0x1.777a5cp-25f), V4 (-0x1.ee59dap-50f) }, 19 .inv_pio2 = V4 (0x1.45f306p-1f), 20 .shift = V4 (0x1.8p23), 21 .range_val = V4 (0x1p20), 22 }; 23 24 static inline uint32x4_t 25 check_ge_rangeval (float32x4_t x, const struct v_sincosf_data *d) 26 { 27 return vcagtq_f32 (x, d->range_val); 28 } 29 30 /* Single-precision vector function allowing calculation of both sin and cos in 31 one function call, using shared argument reduction and separate low-order 32 polynomials. 33 Worst-case error for sin is 1.67 ULP: 34 v_sincosf_sin(0x1.c704c4p+19) got 0x1.fff698p-5 want 0x1.fff69cp-5 35 Worst-case error for cos is 1.81 ULP: 36 v_sincosf_cos(0x1.e506fp+19) got -0x1.ffec6ep-6 want -0x1.ffec72p-6. */ 37 static inline float32x4x2_t 38 v_sincosf_inline (float32x4_t x, const struct v_sincosf_data *d) 39 { 40 /* n = rint ( x / (pi/2) ). */ 41 float32x4_t shift = d->shift; 42 float32x4_t q = vfmaq_f32 (shift, x, d->inv_pio2); 43 q = vsubq_f32 (q, shift); 44 int32x4_t n = vcvtq_s32_f32 (q); 45 46 /* Reduce x such that r is in [ -pi/4, pi/4 ]. */ 47 float32x4_t r = x; 48 r = vfmsq_f32 (r, q, d->pio2[0]); 49 r = vfmsq_f32 (r, q, d->pio2[1]); 50 r = vfmsq_f32 (r, q, d->pio2[2]); 51 52 /* Approximate sin(r) ~= r + r^3 * poly_sin(r^2). */ 53 float32x4_t r2 = vmulq_f32 (r, r), r3 = vmulq_f32 (r, r2); 54 float32x4_t s = vfmaq_f32 (d->poly_sin[1], r2, d->poly_sin[2]); 55 s = vfmaq_f32 (d->poly_sin[0], r2, s); 56 s = vfmaq_f32 (r, r3, s); 57 58 /* Approximate cos(r) ~= 1 - (r^2)/2 + r^4 * poly_cos(r^2). */ 59 float32x4_t r4 = vmulq_f32 (r2, r2); 60 float32x4_t p = vfmaq_f32 (d->poly_cos[1], r2, d->poly_cos[2]); 61 float32x4_t c = vfmaq_f32 (v_f32 (-0.5), r2, d->poly_cos[0]); 62 c = vfmaq_f32 (c, r4, p); 63 c = vfmaq_f32 (v_f32 (1), c, r2); 64 65 /* If odd quadrant, swap cos and sin. */ 66 uint32x4_t swap = vtstq_u32 (vreinterpretq_u32_s32 (n), v_u32 (1)); 67 float32x4_t ss = vbslq_f32 (swap, c, s); 68 float32x4_t cc = vbslq_f32 (swap, s, c); 69 70 /* Fix signs according to quadrant. 71 ss = asfloat(asuint(ss) ^ ((n & 2) << 30)) 72 cc = asfloat(asuint(cc) & (((n + 1) & 2) << 30)). */ 73 uint32x4_t sin_sign 74 = vshlq_n_u32 (vandq_u32 (vreinterpretq_u32_s32 (n), v_u32 (2)), 30); 75 uint32x4_t cos_sign = vshlq_n_u32 ( 76 vandq_u32 (vreinterpretq_u32_s32 (vaddq_s32 (n, v_s32 (1))), v_u32 (2)), 77 30); 78 ss = vreinterpretq_f32_u32 ( 79 veorq_u32 (vreinterpretq_u32_f32 (ss), sin_sign)); 80 cc = vreinterpretq_f32_u32 ( 81 veorq_u32 (vreinterpretq_u32_f32 (cc), cos_sign)); 82 83 return (float32x4x2_t){ ss, cc }; 84 } 85