xref: /freebsd/sys/contrib/openzfs/module/zfs/vdev_raidz_math_powerpc_altivec_common.h (revision 61145dc2b94f12f6a47344fb9aac702321880e43)
1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3  * CDDL HEADER START
4  *
5  * The contents of this file are subject to the terms of the
6  * Common Development and Distribution License (the "License").
7  * You may not use this file except in compliance with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or https://opensource.org/licenses/CDDL-1.0.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright (C) 2019 Romain Dolbeau. All rights reserved.
24  *           <romain.dolbeau@european-processor-initiative.eu>
25  */
26 
27 #include <sys/types.h>
28 #include <sys/simd.h>
29 
/*
 * REG_CNT(...) evaluates to the number of arguments it was given (1 to 8).
 * The caller's list is followed by the descending sequence 8..1 and the
 * helper simply returns whatever ends up in the ninth position.
 */
#define	_REG_CNT(a0, a1, a2, a3, a4, a5, a6, a7, COUNT, ...) COUNT
#define	REG_CNT(...) _REG_CNT(__VA_ARGS__, 8, 7, 6, 5, 4, 3, 2, 1)
32 
/*
 * VRn_() selects the argument at zero-based position n and spells it as the
 * named asm operand reference "%[w<arg>]".
 */
#define	VR0_(REG, ...) "%[w" #REG "]"
#define	VR1_(a, REG, ...) "%[w" #REG "]"
#define	VR2_(a, b, REG, ...) "%[w" #REG "]"
#define	VR3_(a, b, c, REG, ...) "%[w" #REG "]"
#define	VR4_(a, b, c, d, REG, ...) "%[w" #REG "]"
#define	VR5_(a, b, c, d, e, REG, ...) "%[w" #REG "]"
#define	VR6_(a, b, c, d, e, f, REG, ...) "%[w" #REG "]"
#define	VR7_(a, b, c, d, e, f, g, REG, ...) "%[w" #REG "]"

/*
 * VRn(regs) names the n-th register of a register list.
 *
 * Every asm macro below switches on the list length but still contains the
 * asm for the longer lists, so positions past the end of a short list must
 * name something.  Otherwise-unused numbers (36, 35, ... 30) are appended
 * to the list for that purpose.  That asm is never executed, but GCC still
 * has to check its constraints, duplicate constraints are illegal, and the
 * "register" number is what the operand is named after, so each padding
 * position gets its own number.
 */
#define	VR0(...) VR0_(__VA_ARGS__)
#define	VR1(...) VR1_(__VA_ARGS__)
#define	VR2(...) VR2_(__VA_ARGS__, 36)
#define	VR3(...) VR3_(__VA_ARGS__, 36, 35)
#define	VR4(...) VR4_(__VA_ARGS__, 36, 35, 34, 33)
#define	VR5(...) VR5_(__VA_ARGS__, 36, 35, 34, 33, 32)
#define	VR6(...) VR6_(__VA_ARGS__, 36, 35, 34, 33, 32, 31)
#define	VR7(...) VR7_(__VA_ARGS__, 36, 35, 34, 33, 32, 31, 30)

/* Operand reference for one explicitly numbered variable. */
#define	VR(X) "%[w" #X "]"
61 
/*
 * RVRn(regs) builds the read-only ("v") asm operand for the n-th register
 * of a register list: the operand is named w<reg> and bound to the C
 * variable w<reg>.  Positions past the end of a short list are padded with
 * the otherwise-unused numbers 36, 35, ... 30.
 */
#define	RVR0_(REG, ...) [w##REG] "v" (w##REG)
#define	RVR1_(a, REG, ...) [w##REG] "v" (w##REG)
#define	RVR2_(a, b, REG, ...) [w##REG] "v" (w##REG)
#define	RVR3_(a, b, c, REG, ...) [w##REG] "v" (w##REG)
#define	RVR4_(a, b, c, d, REG, ...) [w##REG] "v" (w##REG)
#define	RVR5_(a, b, c, d, e, REG, ...) [w##REG] "v" (w##REG)
#define	RVR6_(a, b, c, d, e, f, REG, ...) [w##REG] "v" (w##REG)
#define	RVR7_(a, b, c, d, e, f, g, REG, ...) [w##REG] "v" (w##REG)

#define	RVR0(...) RVR0_(__VA_ARGS__)
#define	RVR1(...) RVR1_(__VA_ARGS__)
#define	RVR2(...) RVR2_(__VA_ARGS__, 36)
#define	RVR3(...) RVR3_(__VA_ARGS__, 36, 35)
#define	RVR4(...) RVR4_(__VA_ARGS__, 36, 35, 34, 33)
#define	RVR5(...) RVR5_(__VA_ARGS__, 36, 35, 34, 33, 32)
#define	RVR6(...) RVR6_(__VA_ARGS__, 36, 35, 34, 33, 32, 31)
#define	RVR7(...) RVR7_(__VA_ARGS__, 36, 35, 34, 33, 32, 31, 30)

/* Read-only operand for one explicitly numbered variable. */
#define	RVR(X) [w##X] "v" (w##X)
81 
/*
 * WVRn(regs) builds the write-only asm operand for the n-th register of a
 * register list: the operand is named w<reg> and bound to the C variable
 * w<reg>.  Positions past the end of a short list are padded with the
 * otherwise-unused numbers 36, 35, ... 30.
 *
 * The constraint is early-clobber ("=&v").  The asm bodies that use these
 * operands consist of several instructions, and some of them (COPY) write
 * a destination before all "v" inputs have been read.  Without '&' GCC is
 * allowed to give an output the same register as an unrelated input, which
 * would overwrite that input before it is consumed.
 */
#define	WVR0_(REG, ...) [w##REG] "=&v" (w##REG)
#define	WVR1_(_1, REG, ...) [w##REG] "=&v" (w##REG)
#define	WVR2_(_1, _2, REG, ...) [w##REG] "=&v" (w##REG)
#define	WVR3_(_1, _2, _3, REG, ...) [w##REG] "=&v" (w##REG)
#define	WVR4_(_1, _2, _3, _4, REG, ...) [w##REG] "=&v" (w##REG)
#define	WVR5_(_1, _2, _3, _4, _5, REG, ...) [w##REG] "=&v" (w##REG)
#define	WVR6_(_1, _2, _3, _4, _5, _6, REG, ...) [w##REG] "=&v" (w##REG)
#define	WVR7_(_1, _2, _3, _4, _5, _6, _7, REG, ...) [w##REG] "=&v" (w##REG)

#define	WVR0(r...) WVR0_(r)
#define	WVR1(r...) WVR1_(r)
#define	WVR2(r...) WVR2_(r, 36)
#define	WVR3(r...) WVR3_(r, 36, 35)
#define	WVR4(r...) WVR4_(r, 36, 35, 34, 33)
#define	WVR5(r...) WVR5_(r, 36, 35, 34, 33, 32)
#define	WVR6(r...) WVR6_(r, 36, 35, 34, 33, 32, 31)
#define	WVR7(r...) WVR7_(r, 36, 35, 34, 33, 32, 31, 30)

/* Write-only (early-clobber) operand for one explicitly numbered variable. */
#define	WVR(X) [w##X] "=&v" (w##X)
101 
/*
 * UVRn(regs) builds the read-write, early-clobber ("+&v") asm operand for
 * the n-th register of a register list: the operand is named w<reg> and
 * bound to the C variable w<reg>.  Positions past the end of a short list
 * are padded with the otherwise-unused numbers 36, 35, ... 30.
 */
#define	UVR0_(REG, ...) [w##REG] "+&v" (w##REG)
#define	UVR1_(a, REG, ...) [w##REG] "+&v" (w##REG)
#define	UVR2_(a, b, REG, ...) [w##REG] "+&v" (w##REG)
#define	UVR3_(a, b, c, REG, ...) [w##REG] "+&v" (w##REG)
#define	UVR4_(a, b, c, d, REG, ...) [w##REG] "+&v" (w##REG)
#define	UVR5_(a, b, c, d, e, REG, ...) [w##REG] "+&v" (w##REG)
#define	UVR6_(a, b, c, d, e, f, REG, ...) [w##REG] "+&v" (w##REG)
#define	UVR7_(a, b, c, d, e, f, g, REG, ...) [w##REG] "+&v" (w##REG)

#define	UVR0(...) UVR0_(__VA_ARGS__)
#define	UVR1(...) UVR1_(__VA_ARGS__)
#define	UVR2(...) UVR2_(__VA_ARGS__, 36)
#define	UVR3(...) UVR3_(__VA_ARGS__, 36, 35)
#define	UVR4(...) UVR4_(__VA_ARGS__, 36, 35, 34, 33)
#define	UVR5(...) UVR5_(__VA_ARGS__, 36, 35, 34, 33, 32)
#define	UVR6(...) UVR6_(__VA_ARGS__, 36, 35, 34, 33, 32, 31)
#define	UVR7(...) UVR7_(__VA_ARGS__, 36, 35, 34, 33, 32, 31, 30)

/* Read-write operand for one explicitly numbered variable. */
#define	UVR(X) [w##X] "+&v" (w##X)
121 
/*
 * Sub-list selectors: R_01() yields the first two entries of a register
 * list, R_23() the third and fourth.  R_23() pads its input with three
 * filler values so the helper always receives at least four arguments.
 */
#define	R_01(FIRST, SECOND, ...) FIRST, SECOND
#define	_R_23(a, b, THIRD, FOURTH, ...) THIRD, FOURTH
#define	R_23(...) _R_23(__VA_ARGS__, 1, 2, 3)
125 
/*
 * Used as the default: branch of every register-count switch in this file,
 * i.e. for list lengths the macro has no asm for.  Expands to ASSERT(0).
 */
126 #define	ZFS_ASM_BUG()	ASSERT(0)
127 
/*
 * Address `val` bytes past `ptr`, typed as unsigned char *.  Both arguments
 * are parenthesized so that an expression passed as the offset (for example
 * a conditional expression) is added as a whole.
 */
#define	OFFSET(ptr, val)	(((unsigned char *)(ptr)) + (val))
129 
/*
 * Lookup tables read by _MULx2(): for a multiplier c, the four 16-byte rows
 * 4*c+0 .. 4*c+3 are loaded.  Defined elsewhere.
 */
130 extern const uint8_t gf_clmul_mod_lt[4*256][16];
131 
/*
 * ELEM_SIZE is the size in bytes of one vector element; v_t is a byte array
 * of that size, aligned to that size.
 */
132 #define	ELEM_SIZE 16
133 
134 typedef struct v {
135 	uint8_t b[ELEM_SIZE] __attribute__((aligned(ELEM_SIZE)));
136 } v_t;
137 
/*
 * XOR_ACC(src, regs...): XOR consecutive 16-byte chunks of memory at `src`
 * into the listed registers; register i is combined with the chunk at
 * src + 16 * i.  The chunks are first loaded, in groups of up to four, into
 * the fixed scratch registers 18-21, which are declared as clobbers.
 * Lists of 2, 4 or 8 registers are handled; any other length reaches
 * ZFS_ASM_BUG().
 */
138 #define	XOR_ACC(src, r...)					\
139 {								\
140 	switch (REG_CNT(r)) {					\
141 	case 8:							\
142 		__asm__ __volatile__(				\
143 		"lvx 21,0,%[SRC0]\n"				\
144 		"lvx 20,0,%[SRC1]\n"				\
145 		"lvx 19,0,%[SRC2]\n"				\
146 		"lvx 18,0,%[SRC3]\n"				\
147 		"vxor " VR0(r) "," VR0(r) ",21\n"		\
148 		"vxor " VR1(r) "," VR1(r) ",20\n"		\
149 		"vxor " VR2(r) "," VR2(r) ",19\n"		\
150 		"vxor " VR3(r) "," VR3(r) ",18\n"		\
151 		"lvx 21,0,%[SRC4]\n"				\
152 		"lvx 20,0,%[SRC5]\n"				\
153 		"lvx 19,0,%[SRC6]\n"				\
154 		"lvx 18,0,%[SRC7]\n"				\
155 		"vxor " VR4(r) "," VR4(r) ",21\n"		\
156 		"vxor " VR5(r) "," VR5(r) ",20\n"		\
157 		"vxor " VR6(r) "," VR6(r) ",19\n"		\
158 		"vxor " VR7(r) "," VR7(r) ",18\n"		\
159 		:	UVR0(r), UVR1(r), UVR2(r), UVR3(r),	\
160 			UVR4(r), UVR5(r), UVR6(r), UVR7(r)	\
161 		:	[SRC0] "r" ((OFFSET(src, 0))),		\
162 		[SRC1] "r" ((OFFSET(src, 16))),			\
163 		[SRC2] "r" ((OFFSET(src, 32))),			\
164 		[SRC3] "r" ((OFFSET(src, 48))),			\
165 		[SRC4] "r" ((OFFSET(src, 64))),			\
166 		[SRC5] "r" ((OFFSET(src, 80))),			\
167 		[SRC6] "r" ((OFFSET(src, 96))),			\
168 		[SRC7] "r" ((OFFSET(src, 112)))			\
169 		:	"v18", "v19", "v20", "v21");		\
170 		break;						\
171 	case 4:							\
172 		__asm__ __volatile__(				\
173 		"lvx 21,0,%[SRC0]\n"				\
174 		"lvx 20,0,%[SRC1]\n"				\
175 		"lvx 19,0,%[SRC2]\n"				\
176 		"lvx 18,0,%[SRC3]\n"				\
177 		"vxor " VR0(r) "," VR0(r) ",21\n"		\
178 		"vxor " VR1(r) "," VR1(r) ",20\n"		\
179 		"vxor " VR2(r) "," VR2(r) ",19\n"		\
180 		"vxor " VR3(r) "," VR3(r) ",18\n"		\
181 		:	UVR0(r), UVR1(r), UVR2(r), UVR3(r)	\
182 		:	[SRC0] "r" ((OFFSET(src, 0))),		\
183 		[SRC1] "r" ((OFFSET(src, 16))),			\
184 		[SRC2] "r" ((OFFSET(src, 32))),			\
185 		[SRC3] "r" ((OFFSET(src, 48)))			\
186 		:	"v18", "v19", "v20", "v21");		\
187 		break;						\
188 	case 2:							\
189 		__asm__ __volatile__(				\
190 		"lvx 21,0,%[SRC0]\n"				\
191 		"lvx 20,0,%[SRC1]\n"				\
192 		"vxor " VR0(r) "," VR0(r) ",21\n"		\
193 		"vxor " VR1(r) "," VR1(r) ",20\n"		\
194 		:	UVR0(r), UVR1(r)			\
195 		:	[SRC0] "r" ((OFFSET(src, 0))),		\
196 		[SRC1] "r" ((OFFSET(src, 16)))			\
197 		:	"v20", "v21");				\
198 		break;						\
199 	default:						\
200 		ZFS_ASM_BUG();					\
201 	}							\
202 }
203 
/*
 * XOR(regs...): XOR the first half of the register list into the second
 * half, pairwise (entry i of the second half ^= entry i of the first half).
 * The first half is read-only, the second half read-write.  Lists of 4 or 8
 * registers are handled; any other length reaches ZFS_ASM_BUG().
 */
204 #define	XOR(r...)						\
205 {								\
206 	switch (REG_CNT(r)) {					\
207 	case 8:							\
208 		__asm__ __volatile__(				\
209 		"vxor " VR4(r) "," VR4(r) "," VR0(r) "\n"	\
210 		"vxor " VR5(r) "," VR5(r) "," VR1(r) "\n"	\
211 		"vxor " VR6(r) "," VR6(r) "," VR2(r) "\n"	\
212 		"vxor " VR7(r) "," VR7(r) "," VR3(r) "\n"	\
213 		:	UVR4(r), UVR5(r), UVR6(r), UVR7(r)	\
214 		:	RVR0(r), RVR1(r), RVR2(r), RVR3(r));	\
215 		break;						\
216 	case 4:							\
217 		__asm__ __volatile__(				\
218 		"vxor " VR2(r) "," VR2(r) "," VR0(r) "\n"	\
219 		"vxor " VR3(r) "," VR3(r) "," VR1(r) "\n"	\
220 		:	UVR2(r), UVR3(r)			\
221 		:	RVR0(r), RVR1(r));			\
222 		break;						\
223 	default:						\
224 		ZFS_ASM_BUG();					\
225 	}							\
226 }
227 
/*
 * ZERO(regs...): clear every listed register by XORing it with itself.
 * The registers are declared as outputs only.  Lists of 2, 4 or 8 registers
 * are handled; any other length reaches ZFS_ASM_BUG().
 */
228 #define	ZERO(r...)						\
229 {								\
230 	switch (REG_CNT(r)) {					\
231 	case 8:							\
232 		__asm__ __volatile__(				\
233 		"vxor " VR0(r) "," VR0(r) "," VR0(r) "\n"	\
234 		"vxor " VR1(r) "," VR1(r) "," VR1(r) "\n"	\
235 		"vxor " VR2(r) "," VR2(r) "," VR2(r) "\n"	\
236 		"vxor " VR3(r) "," VR3(r) "," VR3(r) "\n"	\
237 		"vxor " VR4(r) "," VR4(r) "," VR4(r) "\n"	\
238 		"vxor " VR5(r) "," VR5(r) "," VR5(r) "\n"	\
239 		"vxor " VR6(r) "," VR6(r) "," VR6(r) "\n"	\
240 		"vxor " VR7(r) "," VR7(r) "," VR7(r) "\n"	\
241 		:	WVR0(r), WVR1(r), WVR2(r), WVR3(r),	\
242 			WVR4(r), WVR5(r), WVR6(r), WVR7(r));	\
243 		break;						\
244 	case 4:							\
245 		__asm__ __volatile__(				\
246 		"vxor " VR0(r) "," VR0(r) "," VR0(r) "\n"	\
247 		"vxor " VR1(r) "," VR1(r) "," VR1(r) "\n"	\
248 		"vxor " VR2(r) "," VR2(r) "," VR2(r) "\n"	\
249 		"vxor " VR3(r) "," VR3(r) "," VR3(r) "\n"	\
250 		:	WVR0(r), WVR1(r), WVR2(r), WVR3(r));	\
251 		break;						\
252 	case 2:							\
253 		__asm__ __volatile__(				\
254 		"vxor " VR0(r) "," VR0(r) "," VR0(r) "\n"	\
255 		"vxor " VR1(r) "," VR1(r) "," VR1(r) "\n"	\
256 		:	WVR0(r), WVR1(r));			\
257 		break;						\
258 	default:						\
259 		ZFS_ASM_BUG();					\
260 	}							\
261 }
262 
/*
 * COPY(regs...): copy the first half of the register list into the second
 * half, pairwise, by ORing each source with itself into its destination.
 * Lists of 4 or 8 registers are handled; any other length reaches
 * ZFS_ASM_BUG().
 *
 * Each asm writes a destination before the later sources have been read,
 * so it is only correct if no destination shares a register with a source;
 * whether the compiler guarantees that is decided by the constraint string
 * in WVRn().
 */
263 #define	COPY(r...)						\
264 {								\
265 	switch (REG_CNT(r)) {					\
266 	case 8:							\
267 		__asm__ __volatile__(				\
268 		"vor " VR4(r) "," VR0(r) "," VR0(r) "\n"	\
269 		"vor " VR5(r) "," VR1(r) "," VR1(r) "\n"	\
270 		"vor " VR6(r) "," VR2(r) "," VR2(r) "\n"	\
271 		"vor " VR7(r) "," VR3(r) "," VR3(r) "\n"	\
272 		:	WVR4(r), WVR5(r), WVR6(r), WVR7(r)	\
273 		:	RVR0(r), RVR1(r), RVR2(r), RVR3(r));	\
274 		break;						\
275 	case 4:							\
276 		__asm__ __volatile__(				\
277 		"vor " VR2(r) "," VR0(r) "," VR0(r) "\n"	\
278 		"vor " VR3(r) "," VR1(r) "," VR1(r) "\n"	\
279 		:	WVR2(r), WVR3(r)			\
280 		:	RVR0(r), RVR1(r));			\
281 		break;						\
282 	default:						\
283 		ZFS_ASM_BUG();					\
284 	}							\
285 }
286 
/*
 * LOAD(src, regs...): load consecutive 16-byte chunks of memory at `src`
 * into the listed registers; register i receives the chunk at src + 16 * i.
 * Lists of 2, 4 or 8 registers are handled; any other length reaches
 * ZFS_ASM_BUG().
 *
 * NOTE(review): the memory being read is passed only as addresses in "r"
 * operands and there is no "memory" clobber - confirm callers do not depend
 * on the compiler ordering earlier stores to `src` before this asm.
 */
287 #define	LOAD(src, r...)						\
288 {								\
289 	switch (REG_CNT(r)) {					\
290 	case 8:							\
291 		__asm__ __volatile__(				\
292 		"lvx " VR0(r) " ,0,%[SRC0]\n"			\
293 		"lvx " VR1(r) " ,0,%[SRC1]\n"			\
294 		"lvx " VR2(r) " ,0,%[SRC2]\n"			\
295 		"lvx " VR3(r) " ,0,%[SRC3]\n"			\
296 		"lvx " VR4(r) " ,0,%[SRC4]\n"			\
297 		"lvx " VR5(r) " ,0,%[SRC5]\n"			\
298 		"lvx " VR6(r) " ,0,%[SRC6]\n"			\
299 		"lvx " VR7(r) " ,0,%[SRC7]\n"			\
300 		:	WVR0(r), WVR1(r), WVR2(r), WVR3(r),	\
301 			WVR4(r), WVR5(r), WVR6(r), WVR7(r)	\
302 		:	[SRC0] "r" ((OFFSET(src, 0))),		\
303 		[SRC1] "r" ((OFFSET(src, 16))),			\
304 		[SRC2] "r" ((OFFSET(src, 32))),			\
305 		[SRC3] "r" ((OFFSET(src, 48))),			\
306 		[SRC4] "r" ((OFFSET(src, 64))),			\
307 		[SRC5] "r" ((OFFSET(src, 80))),			\
308 		[SRC6] "r" ((OFFSET(src, 96))),			\
309 		[SRC7] "r" ((OFFSET(src, 112))));		\
310 		break;						\
311 	case 4:							\
312 		__asm__ __volatile__(				\
313 		"lvx " VR0(r) " ,0,%[SRC0]\n"			\
314 		"lvx " VR1(r) " ,0,%[SRC1]\n"			\
315 		"lvx " VR2(r) " ,0,%[SRC2]\n"			\
316 		"lvx " VR3(r) " ,0,%[SRC3]\n"			\
317 		:	WVR0(r), WVR1(r), WVR2(r), WVR3(r)	\
318 		:	[SRC0] "r" ((OFFSET(src, 0))),		\
319 		[SRC1] "r" ((OFFSET(src, 16))),			\
320 		[SRC2] "r" ((OFFSET(src, 32))),			\
321 		[SRC3] "r" ((OFFSET(src, 48))));		\
322 		break;						\
323 	case 2:							\
324 		__asm__ __volatile__(				\
325 		"lvx " VR0(r) " ,0,%[SRC0]\n"			\
326 		"lvx " VR1(r) " ,0,%[SRC1]\n"			\
327 		:	WVR0(r), WVR1(r)			\
328 		:	[SRC0] "r" ((OFFSET(src, 0))),		\
329 		[SRC1] "r" ((OFFSET(src, 16))));		\
330 		break;						\
331 	default:						\
332 		ZFS_ASM_BUG();					\
333 	}							\
334 }
335 
/*
 * STORE(dst, regs...): store the listed registers to consecutive 16-byte
 * chunks of memory at `dst`; register i goes to dst + 16 * i.  The asm has
 * no outputs and declares a "memory" clobber.  Lists of 2, 4 or 8 registers
 * are handled; any other length reaches ZFS_ASM_BUG().
 */
336 #define	STORE(dst, r...)					\
337 {								\
338 	switch (REG_CNT(r)) {					\
339 	case 8:							\
340 		__asm__ __volatile__(				\
341 		"stvx " VR0(r) " ,0,%[DST0]\n"			\
342 		"stvx " VR1(r) " ,0,%[DST1]\n"			\
343 		"stvx " VR2(r) " ,0,%[DST2]\n"			\
344 		"stvx " VR3(r) " ,0,%[DST3]\n"			\
345 		"stvx " VR4(r) " ,0,%[DST4]\n"			\
346 		"stvx " VR5(r) " ,0,%[DST5]\n"			\
347 		"stvx " VR6(r) " ,0,%[DST6]\n"			\
348 		"stvx " VR7(r) " ,0,%[DST7]\n"			\
349 		: :	[DST0] "r" ((OFFSET(dst, 0))),		\
350 		[DST1] "r" ((OFFSET(dst, 16))),			\
351 		[DST2] "r" ((OFFSET(dst, 32))),			\
352 		[DST3] "r" ((OFFSET(dst, 48))),			\
353 		[DST4] "r" ((OFFSET(dst, 64))),			\
354 		[DST5] "r" ((OFFSET(dst, 80))),			\
355 		[DST6] "r" ((OFFSET(dst, 96))),			\
356 		[DST7] "r" ((OFFSET(dst, 112))),		\
357 		RVR0(r), RVR1(r), RVR2(r), RVR3(r),		\
358 		RVR4(r), RVR5(r), RVR6(r), RVR7(r)		\
359 		:	"memory");				\
360 		break;						\
361 	case 4:							\
362 		__asm__ __volatile__(				\
363 		"stvx " VR0(r) " ,0,%[DST0]\n"			\
364 		"stvx " VR1(r) " ,0,%[DST1]\n"			\
365 		"stvx " VR2(r) " ,0,%[DST2]\n"			\
366 		"stvx " VR3(r) " ,0,%[DST3]\n"			\
367 		: :	[DST0] "r" ((OFFSET(dst, 0))),		\
368 		[DST1] "r" ((OFFSET(dst, 16))),			\
369 		[DST2] "r" ((OFFSET(dst, 32))),			\
370 		[DST3] "r" ((OFFSET(dst, 48))),			\
371 		RVR0(r), RVR1(r), RVR2(r), RVR3(r)		\
372 		: "memory");					\
373 		break;						\
374 	case 2:							\
375 		__asm__ __volatile__(				\
376 		"stvx " VR0(r) " ,0,%[DST0]\n"			\
377 		"stvx " VR1(r) " ,0,%[DST1]\n"			\
378 		: :	[DST0] "r" ((OFFSET(dst, 0))),		\
379 		[DST1] "r" ((OFFSET(dst, 16))),			\
380 		RVR0(r), RVR1(r) : "memory");			\
381 		break;						\
382 	default:						\
383 		ZFS_ASM_BUG();					\
384 	}							\
385 }
386 
/*
 * Reference-only names for the numbers that MUL2_SETUP() and MUL2() spell
 * out literally: w17 is cleared to zero and w16 is set up as the constant
 * by MUL2_SETUP(); 19 and 18 are scratch registers clobbered by MUL2().
 */
387 /*
388  * Unfortunately cannot use the macro, because GCC
389  * will try to use the macro name and not value
390  * later on...
391  * Kept as a reference to what a numbered variable is
392  */
393 #define	_00	"17"
394 #define	_1d	"16"
395 #define	_temp0	"19"
396 #define	_temp1	"18"
397 
/*
 * MUL2_SETUP(): initialize the two constants that MUL2() reads.  w16 is
 * built by splatting 14 and 15 and adding them byte-wise (29, i.e. 0x1d,
 * in every byte); w17 is then cleared by XORing it with itself.  Both
 * variables are declared as outputs only.
 */
398 #define	MUL2_SETUP()						\
399 {								\
400 	__asm__ __volatile__(					\
401 		"vspltisb " VR(16) ",14\n"			\
402 		"vspltisb " VR(17) ",15\n"			\
403 		"vaddubm " VR(16) "," VR(17) "," VR(16) "\n"	\
404 		"vxor " VR(17) "," VR(17) "," VR(17) "\n"	\
405 		:	WVR(16), WVR(17));			\
406 }
407 
/*
 * MUL2(regs...): for every listed register x, byte-wise:
 *   mask = (w17 > x), signed compare against the zero set up by
 *          MUL2_SETUP(), so the mask is set for bytes with the top bit set
 *   mask &= w16      (the 0x1d constant from MUL2_SETUP())
 *   x = x + x        (modulo 256)
 *   x ^= mask
 * Presumably this is multiplication by 2 in GF(2^8) with 0x1d as the
 * reduction constant - confirm against the RAID-Z math definitions.
 * Requires MUL2_SETUP() to have initialized w16 and w17.  Scratch registers
 * 18-21 (18-19 for two registers) are declared as clobbers.  Lists of 2 or
 * 4 registers are handled; any other length reaches ZFS_ASM_BUG().
 */
408 #define	MUL2(r...)						\
409 {								\
410 	switch (REG_CNT(r)) {					\
411 	case 4:							\
412 		__asm__ __volatile__(				\
413 		"vcmpgtsb 19," VR(17) "," VR0(r) "\n"		\
414 		"vcmpgtsb 18," VR(17) "," VR1(r) "\n"		\
415 		"vcmpgtsb 21," VR(17) "," VR2(r) "\n"		\
416 		"vcmpgtsb 20," VR(17) "," VR3(r) "\n"		\
417 		"vand 19,19," VR(16) "\n"			\
418 		"vand 18,18," VR(16) "\n"			\
419 		"vand 21,21," VR(16) "\n"			\
420 		"vand 20,20," VR(16) "\n"			\
421 		"vaddubm " VR0(r) "," VR0(r) "," VR0(r) "\n"	\
422 		"vaddubm " VR1(r) "," VR1(r) "," VR1(r) "\n"	\
423 		"vaddubm " VR2(r) "," VR2(r) "," VR2(r) "\n"	\
424 		"vaddubm " VR3(r) "," VR3(r) "," VR3(r) "\n"	\
425 		"vxor " VR0(r) ",19," VR0(r) "\n"		\
426 		"vxor " VR1(r) ",18," VR1(r) "\n"		\
427 		"vxor " VR2(r) ",21," VR2(r) "\n"		\
428 		"vxor " VR3(r) ",20," VR3(r) "\n"		\
429 		:	UVR0(r), UVR1(r), UVR2(r), UVR3(r)	\
430 		:	RVR(17), RVR(16)			\
431 		:	"v18", "v19", "v20", "v21");		\
432 		break;						\
433 	case 2:							\
434 		__asm__ __volatile__(				\
435 		"vcmpgtsb 19," VR(17) "," VR0(r) "\n"		\
436 		"vcmpgtsb 18," VR(17) "," VR1(r) "\n"		\
437 		"vand 19,19," VR(16) "\n"			\
438 		"vand 18,18," VR(16) "\n"			\
439 		"vaddubm " VR0(r) "," VR0(r) "," VR0(r) "\n"	\
440 		"vaddubm " VR1(r) "," VR1(r) "," VR1(r) "\n"	\
441 		"vxor " VR0(r) ",19," VR0(r) "\n"		\
442 		"vxor " VR1(r) ",18," VR1(r) "\n"		\
443 		:	UVR0(r), UVR1(r)			\
444 		:	RVR(17), RVR(16)			\
445 		:	"v18", "v19");				\
446 		break;						\
447 	default:						\
448 		ZFS_ASM_BUG();					\
449 	}							\
450 }
451 
/* MUL4(regs...): apply MUL2() to the register list twice in a row. */
452 #define	MUL4(r...)						\
453 {								\
454 	MUL2(r);						\
455 	MUL2(r);						\
456 }
457 
/*
 * Reference-only names for the fixed registers 10-15 that _MULx2() uses and
 * declares as clobbers.  Register 15 is listed twice because _MULx2()
 * reuses it: it first holds a splatted constant and later a table-lookup
 * result.
 */
458 /*
459  * Unfortunately cannot use the macro, because GCC
460  * will try to use the macro name and not value
461  * later on...
462  * Kept as a reference to what a register is
463  * (here we're using actual registers for the
464  * clobbered ones)
465  */
466 #define	_0f		"15"
467 #define	_a_save		"14"
468 #define	_b_save		"13"
469 #define	_lt_mod_a	"12"
470 #define	_lt_clmul_a	"11"
471 #define	_lt_mod_b	"10"
472 #define	_lt_clmul_b	"15"
473 
/*
 * _MULx2(c, r0, r1): table-driven transform of two registers using rows
 * 4*c+0 .. 4*c+3 of gf_clmul_mod_lt.
 *
 * Each register is first ANDed with a splatted 15 and the result saved (in
 * 14 and 13), then shifted right by 4.  The shifted value is used as the
 * permute selector into rows 4*c+0 and 4*c+1 and the two results are XORed
 * together to form the new register value ("upper part").  The saved value
 * is then used as the selector into rows 4*c+2 and 4*c+3, and both results
 * are XORed into the register ("lower part").
 *
 * Presumably this multiplies every byte by the constant c in GF(2^8) -
 * confirm against the definition of gf_clmul_mod_lt.  Fixed registers
 * 10-15 are used as scratch and declared as clobbers.  Only a list of
 * exactly 2 registers is handled; any other length reaches ZFS_ASM_BUG().
 */
474 #define	_MULx2(c, r...)						\
475 {								\
476 	switch (REG_CNT(r)) {					\
477 	case 2:							\
478 		__asm__ __volatile__(				\
479 		/* lts for upper part */			\
480 		"vspltisb 15,15\n"				\
481 		"lvx 10,0,%[lt0]\n"				\
482 		"lvx 11,0,%[lt1]\n"				\
483 		/* upper part */				\
484 		"vand 14," VR0(r) ",15\n"			\
485 		"vand 13," VR1(r) ",15\n"			\
486 		"vspltisb 15,4\n"				\
487 		"vsrab " VR0(r) "," VR0(r) ",15\n"		\
488 		"vsrab " VR1(r) "," VR1(r) ",15\n"		\
489 								\
490 		"vperm 12,10,10," VR0(r) "\n"			\
491 		"vperm 10,10,10," VR1(r) "\n"			\
492 		"vperm 15,11,11," VR0(r) "\n"			\
493 		"vperm 11,11,11," VR1(r) "\n"			\
494 								\
495 		"vxor " VR0(r) ",15,12\n"			\
496 		"vxor " VR1(r) ",11,10\n"			\
497 		/* lts for lower part */			\
498 		"lvx 10,0,%[lt2]\n"				\
499 		"lvx 15,0,%[lt3]\n"				\
500 		/* lower part */				\
501 		"vperm 12,10,10,14\n"				\
502 		"vperm 10,10,10,13\n"				\
503 		"vperm 11,15,15,14\n"				\
504 		"vperm 15,15,15,13\n"				\
505 								\
506 		"vxor " VR0(r) "," VR0(r) ",12\n"		\
507 		"vxor " VR1(r) "," VR1(r) ",10\n"		\
508 		"vxor " VR0(r) "," VR0(r) ",11\n"		\
509 		"vxor " VR1(r) "," VR1(r) ",15\n"		\
510 		: UVR0(r), UVR1(r)				\
511 		: [lt0] "r" (&(gf_clmul_mod_lt[4*(c)+0][0])),	\
512 		[lt1] "r" (&(gf_clmul_mod_lt[4*(c)+1][0])),	\
513 		[lt2] "r" (&(gf_clmul_mod_lt[4*(c)+2][0])),	\
514 		[lt3] "r" (&(gf_clmul_mod_lt[4*(c)+3][0]))	\
515 		: "v10", "v11", "v12", "v13", "v14", "v15");	\
516 		break;						\
517 	default:						\
518 		ZFS_ASM_BUG();					\
519 	}							\
520 }
521 
/*
 * MUL(c, regs...): apply _MULx2(c, ...) to the register list two registers
 * at a time.  For four registers the third/fourth pair is processed first,
 * then the first/second pair; for two registers there is a single call.
 * Any other list length reaches ZFS_ASM_BUG().
 */
522 #define	MUL(c, r...)						\
523 {								\
524 	switch (REG_CNT(r)) {					\
525 	case 4:							\
526 		_MULx2(c, R_23(r));				\
527 		_MULx2(c, R_01(r));				\
528 		break;						\
529 	case 2:							\
530 		_MULx2(c, R_01(r));				\
531 		break;						\
532 	default:						\
533 		ZFS_ASM_BUG();					\
534 	}							\
535 }
536 
/*
 * Bracket for code that uses the vector macros above: these map directly
 * onto kfpu_begin() and kfpu_end() (presumably provided by <sys/simd.h> -
 * confirm).
 */
537 #define	raidz_math_begin()	kfpu_begin()
538 #define	raidz_math_end()	kfpu_end()
539 
/*
 * Declarations of the wN vector variables (16 unsigned chars each) that the
 * asm macros in this file name as operands.  Each GEN_X_DEFINE_a_b()
 * declares wa through wb; GEN_X_DEFINE_ALL() declares w0 through w38.
 *
 * GEN_X_VREG() declares a single variable.  The variant that binds every
 * variable to an explicit register is overkill and is compiled out; the
 * register names are still carried along for reference.  w32 and above
 * have no register of their own and are listed with "31".
 */
#if 0 // defined(_KERNEL)
#define	GEN_X_VREG(var, reg)	\
	register unsigned char var asm(reg) __attribute__((vector_size(16)));
#else
#define	GEN_X_VREG(var, reg)	\
	unsigned char var __attribute__((vector_size(16)));
#endif

#define	GEN_X_DEFINE_0_3()	\
	GEN_X_VREG(w0, "0")	\
	GEN_X_VREG(w1, "1")	\
	GEN_X_VREG(w2, "2")	\
	GEN_X_VREG(w3, "3")
#define	GEN_X_DEFINE_4_5()	\
	GEN_X_VREG(w4, "4")	\
	GEN_X_VREG(w5, "5")
#define	GEN_X_DEFINE_6_7()	\
	GEN_X_VREG(w6, "6")	\
	GEN_X_VREG(w7, "7")
#define	GEN_X_DEFINE_8_9()	\
	GEN_X_VREG(w8, "8")	\
	GEN_X_VREG(w9, "9")
#define	GEN_X_DEFINE_10_11()	\
	GEN_X_VREG(w10, "10")	\
	GEN_X_VREG(w11, "11")
#define	GEN_X_DEFINE_12_15()	\
	GEN_X_VREG(w12, "12")	\
	GEN_X_VREG(w13, "13")	\
	GEN_X_VREG(w14, "14")	\
	GEN_X_VREG(w15, "15")
#define	GEN_X_DEFINE_16()	\
	GEN_X_VREG(w16, "16")
#define	GEN_X_DEFINE_17()	\
	GEN_X_VREG(w17, "17")
#define	GEN_X_DEFINE_18_21()	\
	GEN_X_VREG(w18, "18")	\
	GEN_X_VREG(w19, "19")	\
	GEN_X_VREG(w20, "20")	\
	GEN_X_VREG(w21, "21")
#define	GEN_X_DEFINE_22_23()	\
	GEN_X_VREG(w22, "22")	\
	GEN_X_VREG(w23, "23")
#define	GEN_X_DEFINE_24_27()	\
	GEN_X_VREG(w24, "24")	\
	GEN_X_VREG(w25, "25")	\
	GEN_X_VREG(w26, "26")	\
	GEN_X_VREG(w27, "27")
#define	GEN_X_DEFINE_28_30()	\
	GEN_X_VREG(w28, "28")	\
	GEN_X_VREG(w29, "29")	\
	GEN_X_VREG(w30, "30")
#define	GEN_X_DEFINE_31()	\
	GEN_X_VREG(w31, "31")
#define	GEN_X_DEFINE_32()	\
	GEN_X_VREG(w32, "31")
#define	GEN_X_DEFINE_33_36()	\
	GEN_X_VREG(w33, "31")	\
	GEN_X_VREG(w34, "31")	\
	GEN_X_VREG(w35, "31")	\
	GEN_X_VREG(w36, "31")
#define	GEN_X_DEFINE_37_38()	\
	GEN_X_VREG(w37, "31")	\
	GEN_X_VREG(w38, "31")

#define	GEN_X_DEFINE_ALL()	\
	GEN_X_DEFINE_0_3()	\
	GEN_X_DEFINE_4_5()	\
	GEN_X_DEFINE_6_7()	\
	GEN_X_DEFINE_8_9()	\
	GEN_X_DEFINE_10_11()	\
	GEN_X_DEFINE_12_15()	\
	GEN_X_DEFINE_16()	\
	GEN_X_DEFINE_17()	\
	GEN_X_DEFINE_18_21()	\
	GEN_X_DEFINE_22_23()	\
	GEN_X_DEFINE_24_27()	\
	GEN_X_DEFINE_28_30()	\
	GEN_X_DEFINE_31()	\
	GEN_X_DEFINE_32()	\
	GEN_X_DEFINE_33_36()	\
	GEN_X_DEFINE_37_38()
688