1 /*-
2 * Copyright (c) 2017 Hans Petter Selasky
3 *
4 * Redistribution and use in source and binary forms, with or without
5 * modification, are permitted provided that the following conditions
6 * are met:
7 * 1. Redistributions of source code must retain the above copyright
8 * notice, this list of conditions and the following disclaimer.
9 * 2. Redistributions in binary form must reproduce the above copyright
10 * notice, this list of conditions and the following disclaimer in the
11 * documentation and/or other materials provided with the distribution.
12 *
13 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23 * SUCH DAMAGE.
24 */
25
26 #include <sys/queue.h>
27
28 #include <stdint.h>
29 #include <string.h>
30
31 #include "int.h"
32
33 #ifndef VOSS_X3_LOG2_COMBA
34 #define VOSS_X3_LOG2_COMBA 5
35 #endif
36
37 #if (VOSS_X3_LOG2_COMBA < 2)
38 #error "VOSS_X3_LOG2_COMBA must be greater than 1"
39 #endif
40
41 struct voss_x3_input_double {
42 double a;
43 double b;
44 } __aligned(16);
45
46 /*
47 * <input size> = "stride"
48 * <output size> = 2 * "stride"
49 */
50 static void
voss_x3_multiply_sub_double(struct voss_x3_input_double * input,double * ptr_low,double * ptr_high,const size_t stride,const uint8_t toggle)51 voss_x3_multiply_sub_double(struct voss_x3_input_double *input, double *ptr_low, double *ptr_high,
52 const size_t stride, const uint8_t toggle)
53 {
54 size_t x;
55 size_t y;
56
57 if (stride >= (1UL << VOSS_X3_LOG2_COMBA)) {
58 const size_t strideh = stride >> 1;
59
60 if (toggle) {
61
62 /* inverse step */
63 for (x = 0; x != strideh; x++) {
64 double a, b, c, d;
65
66 a = ptr_low[x];
67 b = ptr_low[x + strideh];
68 c = ptr_high[x];
69 d = ptr_high[x + strideh];
70
71 ptr_low[x + strideh] = a + b;
72 ptr_high[x] = a + b + c + d;
73 }
74
75 voss_x3_multiply_sub_double(input, ptr_low, ptr_low + strideh, strideh, 1);
76
77 for (x = 0; x != strideh; x++)
78 ptr_low[x + strideh] = -ptr_low[x + strideh];
79
80 voss_x3_multiply_sub_double(input + strideh, ptr_low + strideh, ptr_high + strideh, strideh, 1);
81
82 /* forward step */
83 for (x = 0; x != strideh; x++) {
84 double a, b, c, d;
85
86 a = ptr_low[x];
87 b = ptr_low[x + strideh];
88 c = ptr_high[x];
89 d = ptr_high[x + strideh];
90
91 ptr_low[x + strideh] = -a - b;
92 ptr_high[x] = c + b - d;
93
94 input[x + strideh].a += input[x].a;
95 input[x + strideh].b += input[x].b;
96 }
97
98 voss_x3_multiply_sub_double(input + strideh, ptr_low + strideh, ptr_high, strideh, 0);
99 } else {
100 voss_x3_multiply_sub_double(input + strideh, ptr_low + strideh, ptr_high, strideh, 1);
101
102 /* inverse step */
103 for (x = 0; x != strideh; x++) {
104 double a, b, c, d;
105
106 a = ptr_low[x];
107 b = ptr_low[x + strideh];
108 c = ptr_high[x];
109 d = ptr_high[x + strideh];
110
111 ptr_low[x + strideh] = -a - b;
112 ptr_high[x] = a + b + c + d;
113
114 input[x + strideh].a -= input[x].a;
115 input[x + strideh].b -= input[x].b;
116 }
117
118 voss_x3_multiply_sub_double(input + strideh, ptr_low + strideh, ptr_high + strideh, strideh, 0);
119
120 for (x = 0; x != strideh; x++)
121 ptr_low[x + strideh] = -ptr_low[x + strideh];
122
123 voss_x3_multiply_sub_double(input, ptr_low, ptr_low + strideh, strideh, 0);
124
125 /* forward step */
126 for (x = 0; x != strideh; x++) {
127 double a, b, c, d;
128
129 a = ptr_low[x];
130 b = ptr_low[x + strideh];
131 c = ptr_high[x];
132 d = ptr_high[x + strideh];
133
134 ptr_low[x + strideh] = b - a;
135 ptr_high[x] = c - b - d;
136 }
137 }
138 } else {
139 for (x = 0; x != stride; x++) {
140 double value = input[x].a;
141
142 for (y = 0; y != (stride - x); y++) {
143 ptr_low[x + y] += input[y].b * value;
144 }
145
146 for (; y != stride; y++) {
147 ptr_high[x + y - stride] += input[y].b * value;
148 }
149 }
150 }
151 }
152
153 /*
154 * <input size> = "max"
155 * <output size> = 2 * "max"
156 */
157 void
voss_x3_multiply_double(const int64_t * va,const double * vb,double * pc,const size_t max)158 voss_x3_multiply_double(const int64_t *va, const double *vb, double *pc, const size_t max)
159 {
160 struct voss_x3_input_double input[max];
161 size_t x;
162
163 /* check for non-power of two */
164 if (max & (max - 1))
165 return;
166
167 /* setup input vector */
168 for (x = 0; x != max; x++) {
169 input[x].a = va[x];
170 input[x].b = vb[x];
171 }
172
173 /* do multiplication */
174 voss_x3_multiply_sub_double(input, pc, pc + max, max, 1);
175 }
176