xref: /freebsd/contrib/arm-optimized-routines/math/sincosf.h (revision dd21556857e8d40f66bf5ad54754d9d52669ebf7)
1 /*
2  * Header for sinf, cosf and sincosf.
3  *
4  * Copyright (c) 2018-2024, Arm Limited.
5  * SPDX-License-Identifier: MIT OR Apache-2.0 WITH LLVM-exception
6  */
7 
8 #include <stdint.h>
9 #include <math.h>
10 #include "math_config.h"
11 
12 /* 2PI * 2^-64.  */
13 static const double pi63 = 0x1.921FB54442D18p-62;
14 /* PI / 4.  */
15 static const float pio4f = 0x1.921FB6p-1f;
16 
17 /* The constants and polynomials for sine and cosine.  */
18 typedef struct
19 {
20   double sign[4];		/* Sign of sine in quadrants 0..3.  */
21   double hpi_inv;		/* 2 / PI ( * 2^24 if !TOINT_INTRINSICS).  */
22   double hpi;			/* PI / 2.  */
23   double c0, c1, c2, c3, c4;	/* Cosine polynomial.  */
24   double s1, s2, s3;		/* Sine polynomial.  */
25 } sincos_t;
26 
27 /* Polynomial data (the cosine polynomial is negated in the 2nd entry).  */
28 extern const sincos_t __sincosf_table[2] HIDDEN;
29 
30 /* Top 12 bits of the float representation with the sign bit cleared.  */
31 static inline uint32_t
32 abstop12 (float x)
33 {
34   return (asuint (x) >> 20) & 0x7ff;
35 }
36 
37 /* Compute the sine and cosine of inputs X and X2 (X squared), using the
38    polynomial P and store the results in SINP and COSP.  N is the quadrant,
39    if odd the cosine and sine polynomials are swapped.  */
40 static inline void
41 sincosf_poly (double x, double x2, const sincos_t *p, int n, float *sinp,
42 	      float *cosp)
43 {
44   double x3, x4, x5, x6, s, c, c1, c2, s1;
45 
46   x4 = x2 * x2;
47   x3 = x2 * x;
48   c2 = p->c3 + x2 * p->c4;
49   s1 = p->s2 + x2 * p->s3;
50 
51   /* Swap sin/cos result based on quadrant.  */
52   float *tmp = (n & 1 ? cosp : sinp);
53   cosp = (n & 1 ? sinp : cosp);
54   sinp = tmp;
55 
56   c1 = p->c0 + x2 * p->c1;
57   x5 = x3 * x2;
58   x6 = x4 * x2;
59 
60   s = x + x3 * p->s1;
61   c = c1 + x4 * p->c2;
62 
63   *sinp = s + x5 * s1;
64   *cosp = c + x6 * c2;
65 }
66 
67 /* Return the sine of inputs X and X2 (X squared) using the polynomial P.
68    N is the quadrant, and if odd the cosine polynomial is used.  */
69 static inline float
70 sinf_poly (double x, double x2, const sincos_t *p, int n)
71 {
72   double x3, x4, x6, x7, s, c, c1, c2, s1;
73 
74   if ((n & 1) == 0)
75     {
76       x3 = x * x2;
77       s1 = p->s2 + x2 * p->s3;
78 
79       x7 = x3 * x2;
80       s = x + x3 * p->s1;
81 
82       return s + x7 * s1;
83     }
84   else
85     {
86       x4 = x2 * x2;
87       c2 = p->c3 + x2 * p->c4;
88       c1 = p->c0 + x2 * p->c1;
89 
90       x6 = x4 * x2;
91       c = c1 + x4 * p->c2;
92 
93       return c + x6 * c2;
94     }
95 }
96 
97 /* Fast range reduction using single multiply-subtract.  Return the modulo of
98    X as a value between -PI/4 and PI/4 and store the quadrant in NP.
99    The values for PI/2 and 2/PI are accessed via P.  Since PI/2 as a double
100    is accurate to 55 bits and the worst-case cancellation happens at 6 * PI/4,
101    the result is accurate for |X| <= 120.0.  */
102 static inline double
103 reduce_fast (double x, const sincos_t *p, int *np)
104 {
105   double r;
106 #if TOINT_INTRINSICS
107   /* Use fast round and lround instructions when available.  */
108   r = x * p->hpi_inv;
109   *np = converttoint (r);
110   return x - roundtoint (r) * p->hpi;
111 #else
112   /* Use scaled float to int conversion with explicit rounding.
113      hpi_inv is prescaled by 2^24 so the quadrant ends up in bits 24..31.
114      This avoids inaccuracies introduced by truncating negative values.  */
115   r = x * p->hpi_inv;
116   int n = ((int32_t)r + 0x800000) >> 24;
117   *np = n;
118   return x - n * p->hpi;
119 #endif
120 }
121 
122 /* Reduce the range of XI to a multiple of PI/2 using fast integer arithmetic.
123    XI is a reinterpreted float and must be >= 2.0f (the sign bit is ignored).
124    Return the modulo between -PI/4 and PI/4 and store the quadrant in NP.
125    Reduction uses a table of 4/PI with 192 bits of precision.  A 32x96->128 bit
126    multiply computes the exact 2.62-bit fixed-point modulo.  Since the result
127    can have at most 29 leading zeros after the binary point, the double
128    precision result is accurate to 33 bits.  */
129 static inline double
130 reduce_large (uint32_t xi, int *np)
131 {
132   const uint32_t *arr = &__inv_pio4[(xi >> 26) & 15];
133   int shift = (xi >> 23) & 7;
134   uint64_t n, res0, res1, res2;
135 
136   xi = (xi & 0xffffff) | 0x800000;
137   xi <<= shift;
138 
139   res0 = xi * arr[0];
140   res1 = (uint64_t)xi * arr[4];
141   res2 = (uint64_t)xi * arr[8];
142   res0 = (res2 >> 32) | (res0 << 32);
143   res0 += res1;
144 
145   n = (res0 + (1ULL << 61)) >> 62;
146   res0 -= n << 62;
147   double x = (int64_t)res0;
148   *np = n;
149   return x * pi63;
150 }
151