1*a50b01d2SDag-Erling Smørgrav /*- 2*a50b01d2SDag-Erling Smørgrav * Copyright (c) 2015 Dag-Erling Smørgrav 3*a50b01d2SDag-Erling Smørgrav * All rights reserved. 4*a50b01d2SDag-Erling Smørgrav * 5*a50b01d2SDag-Erling Smørgrav * Redistribution and use in source and binary forms, with or without 6*a50b01d2SDag-Erling Smørgrav * modification, are permitted provided that the following conditions 7*a50b01d2SDag-Erling Smørgrav * are met: 8*a50b01d2SDag-Erling Smørgrav * 1. Redistributions of source code must retain the above copyright 9*a50b01d2SDag-Erling Smørgrav * notice, this list of conditions and the following disclaimer. 10*a50b01d2SDag-Erling Smørgrav * 2. Redistributions in binary form must reproduce the above copyright 11*a50b01d2SDag-Erling Smørgrav * notice, this list of conditions and the following disclaimer in the 12*a50b01d2SDag-Erling Smørgrav * documentation and/or other materials provided with the distribution. 13*a50b01d2SDag-Erling Smørgrav * 14*a50b01d2SDag-Erling Smørgrav * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15*a50b01d2SDag-Erling Smørgrav * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16*a50b01d2SDag-Erling Smørgrav * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17*a50b01d2SDag-Erling Smørgrav * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18*a50b01d2SDag-Erling Smørgrav * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19*a50b01d2SDag-Erling Smørgrav * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20*a50b01d2SDag-Erling Smørgrav * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21*a50b01d2SDag-Erling Smørgrav * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22*a50b01d2SDag-Erling Smørgrav * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23*a50b01d2SDag-Erling Smørgrav * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24*a50b01d2SDag-Erling Smørgrav * SUCH DAMAGE. 25*a50b01d2SDag-Erling Smørgrav * 26*a50b01d2SDag-Erling Smørgrav * $FreeBSD$ 27*a50b01d2SDag-Erling Smørgrav */ 28*a50b01d2SDag-Erling Smørgrav 29*a50b01d2SDag-Erling Smørgrav #ifdef _KERNEL 30*a50b01d2SDag-Erling Smørgrav #include <sys/libkern.h> 31*a50b01d2SDag-Erling Smørgrav #else 32*a50b01d2SDag-Erling Smørgrav #include <stdio.h> 33*a50b01d2SDag-Erling Smørgrav #include <strings.h> 34*a50b01d2SDag-Erling Smørgrav #endif 35*a50b01d2SDag-Erling Smørgrav 36*a50b01d2SDag-Erling Smørgrav #include "fp16.h" 37*a50b01d2SDag-Erling Smørgrav 38*a50b01d2SDag-Erling Smørgrav /* 39*a50b01d2SDag-Erling Smørgrav * Compute the quare root of x, using Newton's method with 2^(log2(x)/2) 40*a50b01d2SDag-Erling Smørgrav * as the initial estimate. 41*a50b01d2SDag-Erling Smørgrav */ 42*a50b01d2SDag-Erling Smørgrav fp16_t 43*a50b01d2SDag-Erling Smørgrav fp16_sqrt(fp16_t x) 44*a50b01d2SDag-Erling Smørgrav { 45*a50b01d2SDag-Erling Smørgrav fp16_t y, delta; 46*a50b01d2SDag-Erling Smørgrav signed int log2x; 47*a50b01d2SDag-Erling Smørgrav 48*a50b01d2SDag-Erling Smørgrav /* special case */ 49*a50b01d2SDag-Erling Smørgrav if (x == 0) 50*a50b01d2SDag-Erling Smørgrav return (0); 51*a50b01d2SDag-Erling Smørgrav 52*a50b01d2SDag-Erling Smørgrav /* shift toward 0 by half the logarithm */ 53*a50b01d2SDag-Erling Smørgrav log2x = flsl(x) - 1; 54*a50b01d2SDag-Erling Smørgrav if (log2x >= 16) { 55*a50b01d2SDag-Erling Smørgrav y = x >> (log2x - 16) / 2; 56*a50b01d2SDag-Erling Smørgrav } else { 57*a50b01d2SDag-Erling Smørgrav #if 0 58*a50b01d2SDag-Erling Smørgrav y = x << (16 - log2x) / 2; 59*a50b01d2SDag-Erling Smørgrav #else 60*a50b01d2SDag-Erling Smørgrav /* XXX for now, return 0 for anything < 1 */ 61*a50b01d2SDag-Erling Smørgrav return (0); 62*a50b01d2SDag-Erling Smørgrav #endif 63*a50b01d2SDag-Erling Smørgrav } 64*a50b01d2SDag-Erling Smørgrav while (y > 0) { 65*a50b01d2SDag-Erling Smørgrav /* delta = y^2 / 2y */ 66*a50b01d2SDag-Erling Smørgrav delta = fp16_div(fp16_sub(fp16_mul(y, y), x), y * 2); 67*a50b01d2SDag-Erling Smørgrav if (delta == 0) 68*a50b01d2SDag-Erling Smørgrav break; 69*a50b01d2SDag-Erling Smørgrav y = fp16_sub(y, delta); 70*a50b01d2SDag-Erling Smørgrav } 71*a50b01d2SDag-Erling Smørgrav return (y); 72*a50b01d2SDag-Erling Smørgrav } 73*a50b01d2SDag-Erling Smørgrav 74*a50b01d2SDag-Erling Smørgrav static fp16_t fp16_cos_table[256] = { 75*a50b01d2SDag-Erling Smørgrav 65536, 65534, 65531, 65524, 65516, 65505, 65491, 65475, 76*a50b01d2SDag-Erling Smørgrav 65457, 65436, 65412, 65386, 65358, 65327, 65294, 65258, 77*a50b01d2SDag-Erling Smørgrav 65220, 65179, 65136, 65091, 65043, 64992, 64939, 64884, 78*a50b01d2SDag-Erling Smørgrav 64826, 64766, 64703, 64638, 64571, 64501, 64428, 64353, 79*a50b01d2SDag-Erling Smørgrav 64276, 64197, 64115, 64030, 63943, 63854, 63762, 63668, 80*a50b01d2SDag-Erling Smørgrav 63571, 63473, 63371, 63268, 63162, 63053, 62942, 62829, 81*a50b01d2SDag-Erling Smørgrav 62714, 62596, 62475, 62353, 62228, 62100, 61971, 61839, 82*a50b01d2SDag-Erling Smørgrav 61705, 61568, 61429, 61288, 61144, 60998, 60850, 60700, 83*a50b01d2SDag-Erling Smørgrav 60547, 60392, 60235, 60075, 59913, 59749, 59583, 59414, 84*a50b01d2SDag-Erling Smørgrav 59243, 59070, 58895, 58718, 58538, 58356, 58172, 57986, 85*a50b01d2SDag-Erling Smørgrav 57797, 57606, 57414, 57219, 57022, 56822, 56621, 56417, 86*a50b01d2SDag-Erling Smørgrav 56212, 56004, 55794, 55582, 55368, 55152, 54933, 54713, 87*a50b01d2SDag-Erling Smørgrav 54491, 54266, 54040, 53811, 53581, 53348, 53114, 52877, 88*a50b01d2SDag-Erling Smørgrav 52639, 52398, 52155, 51911, 51665, 51416, 51166, 50914, 89*a50b01d2SDag-Erling Smørgrav 50660, 50403, 50146, 49886, 49624, 49360, 49095, 48828, 90*a50b01d2SDag-Erling Smørgrav 48558, 48288, 48015, 47740, 47464, 47186, 46906, 46624, 91*a50b01d2SDag-Erling Smørgrav 46340, 46055, 45768, 45480, 45189, 44897, 44603, 44308, 92*a50b01d2SDag-Erling Smørgrav 44011, 43712, 43412, 43110, 42806, 42501, 42194, 41885, 93*a50b01d2SDag-Erling Smørgrav 41575, 41263, 40950, 40636, 40319, 40002, 39682, 39362, 94*a50b01d2SDag-Erling Smørgrav 39039, 38716, 38390, 38064, 37736, 37406, 37075, 36743, 95*a50b01d2SDag-Erling Smørgrav 36409, 36074, 35738, 35400, 35061, 34721, 34379, 34036, 96*a50b01d2SDag-Erling Smørgrav 33692, 33346, 32999, 32651, 32302, 31952, 31600, 31247, 97*a50b01d2SDag-Erling Smørgrav 30893, 30538, 30181, 29824, 29465, 29105, 28745, 28383, 98*a50b01d2SDag-Erling Smørgrav 28020, 27656, 27291, 26925, 26557, 26189, 25820, 25450, 99*a50b01d2SDag-Erling Smørgrav 25079, 24707, 24334, 23960, 23586, 23210, 22833, 22456, 100*a50b01d2SDag-Erling Smørgrav 22078, 21699, 21319, 20938, 20557, 20175, 19792, 19408, 101*a50b01d2SDag-Erling Smørgrav 19024, 18638, 18253, 17866, 17479, 17091, 16702, 16313, 102*a50b01d2SDag-Erling Smørgrav 15923, 15533, 15142, 14751, 14359, 13966, 13573, 13179, 103*a50b01d2SDag-Erling Smørgrav 12785, 12390, 11995, 11600, 11204, 10807, 10410, 10013, 104*a50b01d2SDag-Erling Smørgrav 9616, 9218, 8819, 8421, 8022, 7623, 7223, 6823, 105*a50b01d2SDag-Erling Smørgrav 6423, 6023, 5622, 5222, 4821, 4420, 4018, 3617, 106*a50b01d2SDag-Erling Smørgrav 3215, 2814, 2412, 2010, 1608, 1206, 804, 402, 107*a50b01d2SDag-Erling Smørgrav }; 108*a50b01d2SDag-Erling Smørgrav 109*a50b01d2SDag-Erling Smørgrav /* 110*a50b01d2SDag-Erling Smørgrav * Compute the cosine of theta. 111*a50b01d2SDag-Erling Smørgrav */ 112*a50b01d2SDag-Erling Smørgrav fp16_t 113*a50b01d2SDag-Erling Smørgrav fp16_cos(fp16_t theta) 114*a50b01d2SDag-Erling Smørgrav { 115*a50b01d2SDag-Erling Smørgrav unsigned int i; 116*a50b01d2SDag-Erling Smørgrav 117*a50b01d2SDag-Erling Smørgrav i = 1024 * (theta % FP16_2PI) / FP16_2PI; 118*a50b01d2SDag-Erling Smørgrav switch (i / 256) { 119*a50b01d2SDag-Erling Smørgrav case 0: 120*a50b01d2SDag-Erling Smørgrav return (fp16_cos_table[i % 256]); 121*a50b01d2SDag-Erling Smørgrav case 1: 122*a50b01d2SDag-Erling Smørgrav return (-fp16_cos_table[255 - i % 256]); 123*a50b01d2SDag-Erling Smørgrav case 2: 124*a50b01d2SDag-Erling Smørgrav return (-fp16_cos_table[i % 256]); 125*a50b01d2SDag-Erling Smørgrav case 3: 126*a50b01d2SDag-Erling Smørgrav return (fp16_cos_table[255 - i % 256]); 127*a50b01d2SDag-Erling Smørgrav default: 128*a50b01d2SDag-Erling Smørgrav /* inconceivable! */ 129*a50b01d2SDag-Erling Smørgrav return (0); 130*a50b01d2SDag-Erling Smørgrav } 131*a50b01d2SDag-Erling Smørgrav } 132