1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 24 */ 25 /* 26 * Copyright 2006 Sun Microsystems, Inc. All rights reserved. 27 * Use is subject to license terms. 28 */ 29 30 /* 31 * Given X, __vlibm_rem_pio2m finds Y and an integer n such that 32 * Y = X - n*pi/2 and |Y| < pi/2. 33 * 34 * On entry, X is represented by x, an array of nx 24-bit integers 35 * stored in double precision format, and e: 36 * 37 * X = sum (x[i] * 2^(e - 24*i)) 38 * 39 * nx must be 1, 2, or 3, and e must be >= -24. For example, a 40 * suitable representation for the double precision number z can 41 * be computed as follows: 42 * 43 * e = ilogb(z)-23 44 * z = scalbn(z,-e) 45 * for i = 0,1,2 46 * x[i] = floor(z) 47 * z = (z-x[i])*2**24 48 * 49 * On exit, Y is approximated by y[0] if prec is 0 and by the un- 50 * evaluated sum y[0] + y[1] if prec != 0. The approximation is 51 * accurate to 53 bits in the former case and to at least 72 bits 52 * in the latter. 53 * 54 * __vlibm_rem_pio2m returns n mod 8. 55 * 56 * Notes: 57 * 58 * As n is the integer nearest X * 2/pi, we approximate the latter 59 * product to a precision that is determined dynamically so as to 60 * ensure that the final value Y is approximated accurately enough. 61 * We don't bother to compute terms in the product that are multiples 62 * of 8, so the cost of this multiplication is independent of the 63 * magnitude of X. The variable ip determines the offset into the 64 * array ipio2 of the first term we need to use. The variable eq0 65 * is the corresponding exponent of the first partial product. 66 * 67 * The partial products are scaled, summed, and split into an array 68 * of non-overlapping 24-bit terms (not necessarily having the same 69 * signs). Each partial product overlaps three elements of the 70 * resulting array: 71 * 72 * q[i] xxxxxxxxxxxxxx 73 * q[i+1] xxxxxxxxxxxxxx 74 * q[i+2] xxxxxxxxxxxxxx 75 * ... ... 76 * 77 * 78 * r[i] xxxxxx 79 * r[i+1] xxxxxx 80 * r[i+2] xxxxxx 81 * ... ... 82 * 83 * In order that the last element of the r array have some correct 84 * bits, we compute an extra term in the q array, but we don't bother 85 * to split this last term into 24-bit chunks; thus, the final term 86 * of the r array could have more than 24 bits, but this doesn't 87 * matter. 88 * 89 * After we subtract the nearest integer to the product, we multiply 90 * the remaining part of r by pi/2 to obtain Y. Before we compute 91 * this last product, however, we make sure that the remaining part 92 * of r has at least five nonzero terms, computing more if need be. 93 * This ensures that even if the first nonzero term is only a single 94 * bit and the last term is wrong in several trailing bits, we still 95 * have enough accuracy to obtain 72 bits of Y. 96 * 97 * IMPORTANT: This code assumes that the rounding mode is round-to- 98 * nearest in several key places. First, after we compute X * 2/pi, 99 * we round to the nearest integer by adding and subtracting a power 100 * of two. This step must be done in round-to-nearest mode to ensure 101 * that the remainder is less than 1/2 in absolute value. (Because 102 * we only take two adjacent terms of r into account when we perform 103 * this rounding, in very rare cases the remainder could be just 104 * barely greater than 1/2, but this shouldn't matter in practice.) 105 * 106 * Second, we also split the partial products of X * 2/pi into 24-bit 107 * pieces by adding and subtracting a power of two. In this step, 108 * round-to-nearest mode is important in order to guarantee that 109 * the index of the first nonzero term in the remainder gives an 110 * accurate indication of the number of significant terms. For 111 * example, suppose eq0 = -1, so that r[1] is a multiple of 1/2 and 112 * |r[2]| < 1/2. After we subtract the nearest integer, r[1] could 113 * be -1/2, and r[2] could be very nearly 1/2, so that r[1] != 0, 114 * yet the remainder is much smaller than the least significant bit 115 * corresponding to r[1]. As long as we use round-to-nearest mode, 116 * this can't happen; instead, the absolute value of each r[j] will 117 * be less than 1/2 the least significant bit corresponding to r[j-1], 118 * so that the entire remainder must be at least half as large as 119 * the first nonzero term (or perhaps just barely smaller than this). 120 */ 121 122 #include <sys/isa_defs.h> 123 124 #ifdef _LITTLE_ENDIAN 125 #define HIWORD 1 126 #define LOWORD 0 127 #else 128 #define HIWORD 0 129 #define LOWORD 1 130 #endif 131 132 /* 396 hex digits of 2/pi, with two leading zeroes to make life easier */ 133 static const double ipio2[] = { 134 0, 0, 135 0xA2F983, 0x6E4E44, 0x1529FC, 0x2757D1, 0xF534DD, 0xC0DB62, 136 0x95993C, 0x439041, 0xFE5163, 0xABDEBB, 0xC561B7, 0x246E3A, 137 0x424DD2, 0xE00649, 0x2EEA09, 0xD1921C, 0xFE1DEB, 0x1CB129, 138 0xA73EE8, 0x8235F5, 0x2EBB44, 0x84E99C, 0x7026B4, 0x5F7E41, 139 0x3991D6, 0x398353, 0x39F49C, 0x845F8B, 0xBDF928, 0x3B1FF8, 140 0x97FFDE, 0x05980F, 0xEF2F11, 0x8B5A0A, 0x6D1F6D, 0x367ECF, 141 0x27CB09, 0xB74F46, 0x3F669E, 0x5FEA2D, 0x7527BA, 0xC7EBE5, 142 0xF17B3D, 0x0739F7, 0x8A5292, 0xEA6BFB, 0x5FB11F, 0x8D5D08, 143 0x560330, 0x46FC7B, 0x6BABF0, 0xCFBC20, 0x9AF436, 0x1DA9E3, 144 0x91615E, 0xE61B08, 0x659985, 0x5F14A0, 0x68408D, 0xFFD880, 145 0x4D7327, 0x310606, 0x1556CA, 0x73A8C9, 0x60E27B, 0xC08C6B, 146 }; 147 148 /* pi/2 in 24-bit pieces */ 149 static const double pio2[] = { 150 1.57079625129699707031e+00, 151 7.54978941586159635335e-08, 152 5.39030252995776476554e-15, 153 3.28200341580791294123e-22, 154 1.27065575308067607349e-29, 155 }; 156 157 /* miscellaneous constants */ 158 static const double 159 zero = 0.0, 160 two24 = 16777216.0, 161 round1 = 6755399441055744.0, /* 3 * 2^51 */ 162 round24 = 113336795588871485128704.0, /* 3 * 2^75 */ 163 twon24 = 5.960464477539062500E-8; 164 165 int 166 __vlibm_rem_pio2m(double *x, double *y, int e, int nx, int prec) 167 { 168 union { 169 double d; 170 int i[2]; 171 } s; 172 double z, t, p, q[20], r[21], *pr; 173 int nq, ip, n, i, j, k, eq0, eqnqm1; 174 175 /* determine ip and eq0; note that -48 <= eq0 <= 2 */ 176 ip = (e - 3) / 24; 177 if (ip < 0) 178 ip = 0; 179 eq0 = e - 24 * (ip + 1); 180 181 /* compute q[0,...,5] = x * ipio2 and initialize nq and eqnqm1 */ 182 if (nx == 3) { 183 q[0] = x[0] * ipio2[ip+2] + x[1] * ipio2[ip+1] + x[2] * ipio2[ip]; 184 q[1] = x[0] * ipio2[ip+3] + x[1] * ipio2[ip+2] + x[2] * ipio2[ip+1]; 185 q[2] = x[0] * ipio2[ip+4] + x[1] * ipio2[ip+3] + x[2] * ipio2[ip+2]; 186 q[3] = x[0] * ipio2[ip+5] + x[1] * ipio2[ip+4] + x[2] * ipio2[ip+3]; 187 q[4] = x[0] * ipio2[ip+6] + x[1] * ipio2[ip+5] + x[2] * ipio2[ip+4]; 188 q[5] = x[0] * ipio2[ip+7] + x[1] * ipio2[ip+6] + x[2] * ipio2[ip+5]; 189 } else if (nx == 2) { 190 q[0] = x[0] * ipio2[ip+2] + x[1] * ipio2[ip+1]; 191 q[1] = x[0] * ipio2[ip+3] + x[1] * ipio2[ip+2]; 192 q[2] = x[0] * ipio2[ip+4] + x[1] * ipio2[ip+3]; 193 q[3] = x[0] * ipio2[ip+5] + x[1] * ipio2[ip+4]; 194 q[4] = x[0] * ipio2[ip+6] + x[1] * ipio2[ip+5]; 195 q[5] = x[0] * ipio2[ip+7] + x[1] * ipio2[ip+6]; 196 } else { 197 q[0] = x[0] * ipio2[ip+2]; 198 q[1] = x[0] * ipio2[ip+3]; 199 q[2] = x[0] * ipio2[ip+4]; 200 q[3] = x[0] * ipio2[ip+5]; 201 q[4] = x[0] * ipio2[ip+6]; 202 q[5] = x[0] * ipio2[ip+7]; 203 } 204 nq = 5; 205 eqnqm1 = eq0 - 96; 206 207 recompute: 208 /* propagate carries and incorporate powers of two */ 209 s.i[HIWORD] = (0x3ff + eqnqm1) << 20; 210 s.i[LOWORD] = 0; 211 p = s.d; 212 z = q[nq] * twon24; 213 for (j = nq-1; j >= 1; j--) { 214 z += q[j]; 215 t = (z + round24) - round24; /* must be rounded to nearest */ 216 r[j+1] = (z - t) * p; 217 z = t * twon24; 218 p *= two24; 219 } 220 z += q[0]; 221 t = (z + round24) - round24; /* must be rounded to nearest */ 222 r[1] = (z - t) * p; 223 r[0] = t * p; 224 225 /* form n = [r] mod 8 and leave the fractional part of r */ 226 if (eq0 > 0) { 227 /* binary point lies within r[2] */ 228 z = r[2] + r[3]; 229 t = (z + round1) - round1; /* must be rounded to nearest */ 230 r[2] -= t; 231 n = (int)(r[1] + t); 232 r[0] = r[1] = zero; 233 } else if (eq0 > -24) { 234 /* binary point lies within or just to the right of r[1] */ 235 z = r[1] + r[2]; 236 t = (z + round1) - round1; /* must be rounded to nearest */ 237 r[1] -= t; 238 z = r[0] + t; 239 /* cut off high part of z so conversion to int doesn't 240 overflow */ 241 t = (z + round24) - round24; 242 n = (int)(z - t); 243 r[0] = zero; 244 } else { 245 /* binary point lies within or just to the right of r[0] */ 246 z = r[0] + r[1]; 247 t = (z + round1) - round1; /* must be rounded to nearest */ 248 r[0] -= t; 249 n = (int)t; 250 } 251 252 /* count the number of leading zeroes in r */ 253 for (j = 0; j <= nq; j++) { 254 if (r[j] != zero) 255 break; 256 } 257 258 /* if fewer than 5 terms remain, add more */ 259 if (nq - j < 4) { 260 k = 4 - (nq - j); 261 /* 262 * compute q[nq+1] to q[nq+k] 263 * 264 * For some reason, writing out the nx loop explicitly 265 * for each of the three possible values (as above) seems 266 * to run a little slower, so we'll leave this code as is. 267 */ 268 for (i = nq + 1; i <= nq + k; i++) { 269 t = x[0] * ipio2[ip+2+i]; 270 for (j = 1; j < nx; j++) 271 t += x[j] * ipio2[ip+2+i-j]; 272 q[i] = t; 273 eqnqm1 -= 24; 274 } 275 nq += k; 276 goto recompute; 277 } 278 279 /* set pr and nq so that pr[0,...,nq] is the part of r remaining */ 280 pr = &r[j]; 281 nq = nq - j; 282 283 /* compute pio2 * pr[0,...,nq]; note that nq >= 4 here */ 284 q[0] = pio2[0] * pr[0]; 285 q[1] = pio2[0] * pr[1] + pio2[1] * pr[0]; 286 q[2] = pio2[0] * pr[2] + pio2[1] * pr[1] + pio2[2] * pr[0]; 287 q[3] = pio2[0] * pr[3] + pio2[1] * pr[2] + pio2[2] * pr[1] 288 + pio2[3] * pr[0]; 289 for (i = 4; i <= nq; i++) { 290 q[i] = pio2[0] * pr[i] + pio2[1] * pr[i-1] + pio2[2] * pr[i-2] 291 + pio2[3] * pr[i-3] + pio2[4] * pr[i-4]; 292 } 293 294 /* sum q in increasing order to obtain the first term of y */ 295 t = q[nq]; 296 for (i = nq - 1; i >= 0; i--) 297 t += q[i]; 298 y[0] = t; 299 if (prec) { 300 /* subtract and sum again in decreasing order 301 to obtain the second term */ 302 t = q[0] - t; 303 for (i = 1; i <= nq; i++) 304 t += q[i]; 305 y[1] = t; 306 } 307 308 return (n & 7); 309 } 310