xref: /freebsd/sys/contrib/openzfs/module/zfs/vdev_raidz_math_scalar.c (revision 81ad626541db97eb356e2c1d4a20eb2a26a766ab)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or https://opensource.org/licenses/CDDL-1.0.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
24  */
25 
26 #include <sys/vdev_raidz_impl.h>
27 
/*
 * Provide native CPU scalar routines.
 * Support 32bit and 64bit CPUs.
 *
 * ELEM_SIZE is the number of bytes processed per scalar operation and iv_t
 * is the unsigned integer type of exactly that width.
 *
 * The width is chosen from the pointer width.  Shifting ~0ULL inside an #if
 * cannot be used for this: the preprocessor evaluates every unsigned operand
 * as uintmax_t, so such a test yields the same answer on every target and a
 * 32-bit branch guarded by it is never taken.  When UINTPTR_MAX is not
 * available the 64-bit element is used.
 */
#if defined(UINTPTR_MAX) && (UINTPTR_MAX == 0xffffffffU)
#define	ELEM_SIZE	4
typedef uint32_t iv_t;
#else
#define	ELEM_SIZE	8
typedef uint64_t iv_t;
#endif
39 
/*
 * Vector type used in scalar implementation
 *
 * The union is expected to be of native CPU register size. Since addition
 * uses the XOR operation, it can be performed on all byte elements at once.
 * Multiplication requires per-byte access.
 */
typedef union {
	iv_t e;			/* all ELEM_SIZE bytes viewed as one integer */
	uint8_t b[ELEM_SIZE];	/* the same storage viewed byte by byte */
} v_t;
51 
52 /*
53  * Precomputed lookup tables for multiplication by a constant
54  *
 * The reconstruction path requires multiplication by constant factors.
 * Instead of performing a two-step lookup (log & exp tables), a direct lookup
 * can be used. Multiplication of element 'a' by a constant 'c' is obtained as:
58  *
59  * 	r = vdev_raidz_mul_lt[c_log][a];
60  *
61  * where c_log = vdev_raidz_log2[c]. Log of coefficient factors is used because
62  * they are faster to obtain while solving the syndrome equations.
63  *
64  * PERFORMANCE NOTE:
 * Even though the complete lookup table uses 64 KiB, only a relatively small
 * portion of it is used at the same time. The following shows the number of
 * accessed bytes for different cases:
68  * 	- 1 failed disk: 256B (1 mul. coefficient)
69  * 	- 2 failed disks: 512B (2 mul. coefficients)
70  * 	- 3 failed disks: 1536B (6 mul. coefficients)
71  *
 * The size of the actually accessed lookup table regions is larger than with
 * the traditional log/exp method only for reconstruction of 3 failed disks.
 * But since the result is obtained in one lookup step, performance is
 * doubled.
76  */
/*
 * Direct multiplication table, filled in by raidz_init_scalar().
 * Entry [c][i] holds gf_mul(c, i).
 */
static uint8_t vdev_raidz_mul_lt[256][256] __attribute__((aligned(256)));

/*
 * Populate vdev_raidz_mul_lt.
 *
 * Every (c, i) pair with both values in 0..255 is visited in row-major
 * order and the result of gf_mul(c, i) is stored at [c][i].
 */
static void
raidz_init_scalar(void)
{
	int cell;

	for (cell = 0; cell < 256 * 256; cell++) {
		const int row = cell >> 8;	/* first gf_mul() operand */
		const int col = cell & 0xff;	/* second gf_mul() operand */

		vdev_raidz_mul_lt[row][col] = gf_mul(row, col);
	}
}
88 
/*
 * Prefetch hooks. The scalar implementation issues no prefetches, so both
 * expand to an empty block and ignore their arguments.
 */
#define	PREFETCHNTA(ptr, offset)	{}
#define	PREFETCH(ptr, offset) 		{}

/*
 * Element-wise primitives operating on v_t values.
 *
 *	XOR_ACC(src, acc)	acc ^= the v_t stored at pointer src
 *	XOR(src, acc)		acc ^= src (both v_t values)
 *	ZERO(acc)		clear acc
 *	COPY(src, dst)		dst = src (both v_t values)
 *	LOAD(src, val)		val = the v_t stored at pointer src
 *	STORE(dst, val)		store val to the v_t at pointer dst
 *
 * Value operands are parenthesised so that any lvalue expression (for
 * example *ptr) can be passed, matching how MUL2() treats its operand.
 *
 * NOTE(review): the pointer operands of XOR_ACC/LOAD/STORE are left exactly
 * as before: the (v_t *) cast binds to the first term of the argument, so
 * arithmetic written in the argument is applied after the cast, in v_t
 * units. Callers are expected to pass v_t * expressions; confirm against
 * the call sites before parenthesising these as well.
 */
#define	XOR_ACC(src, acc)	(acc).e ^= ((v_t *)src)[0].e
#define	XOR(src, acc)		(acc).e ^= (src).e
#define	ZERO(acc)		(acc).e = 0
#define	COPY(src, dst)		(dst) = (src)
#define	LOAD(src, val) 		(val) = ((v_t *)src)[0]
#define	STORE(dst, val)		((v_t *)dst)[0] = (val)
98 
99 /*
100  * Constants used for optimized multiplication by 2.
101  */
102 static const struct {
103 	iv_t mod;
104 	iv_t mask;
105 	iv_t msb;
106 } scalar_mul2_consts = {
107 #if ELEM_SIZE == 8
108 	.mod	= 0x1d1d1d1d1d1d1d1dULL,
109 	.mask	= 0xfefefefefefefefeULL,
110 	.msb	= 0x8080808080808080ULL,
111 #else
112 	.mod	= 0x1d1d1d1dULL,
113 	.mask	= 0xfefefefeULL,
114 	.msb	= 0x80808080ULL,
115 #endif
116 };
117 
/* The scalar multiply-by-2 needs no setup; expands to an empty block. */
#define	MUL2_SETUP() {}

/*
 * Multiply each byte of (a) by 2, all bytes of the word at once.
 *
 * _top keeps only the high bit of every byte. (_top << 1) - (_top >> 7)
 * turns each such 0x80 into 0xff within the same byte, giving a per-byte
 * select mask. The word is then shifted left by one with the bit that
 * crossed into the neighbouring byte cleared by .mask, and .mod is XORed
 * into exactly the bytes selected by the mask.
 */
#define	MUL2(a)								\
{									\
	const iv_t _top = (a).e & scalar_mul2_consts.msb;		\
	const iv_t _sel = (_top << 1) - (_top >> 7);			\
	const iv_t _dbl = ((a).e << 1) & scalar_mul2_consts.mask;	\
									\
	(a).e = _dbl ^ (_sel & scalar_mul2_consts.mod);			\
}

/* Multiply each byte of (a) by 4: two consecutive MUL2() steps. */
#define	MUL4(a) 							\
{									\
	MUL2(a);							\
	MUL2(a);							\
}
135 
/*
 * Multiply each byte of a by the constant selected by c.
 *
 * Row c of vdev_raidz_mul_lt is used as a byte-to-byte translation table:
 * every one of the ELEM_SIZE bytes of a is replaced by the table entry it
 * indexes. The bytes are independent of each other, so the order in which
 * they are translated does not matter.
 */
#define	MUL(c, a)							\
{									\
	const uint8_t *_row = vdev_raidz_mul_lt[c];			\
	int _k;								\
									\
	for (_k = 0; _k < ELEM_SIZE; _k++)				\
		a.b[_k] = _row[a.b[_k]];				\
}
154 
/*
 * Configuration macros for the scalar implementation.
 *
 * NOTE(review): none of these names are used in this file; presumably they
 * are consumed by vdev_raidz_math_impl.h, which is included right after
 * this section. Confirm there what each kernel expects.
 *
 * For every kernel prefix:
 *	<PREFIX>_STRIDE		always 1 here
 *	<PREFIX>_DEFINE()	declares the v_t locals the kernel works on
 *	<PREFIX>_<X>		names one of those locals
 */

/* Begin/end hooks: nothing to do for scalar code, both are empty blocks. */
#define	raidz_math_begin()	{}
#define	raidz_math_end()	{}

/* Stride shared by the SYN_* groups below (they define no stride of their own). */
#define	SYN_STRIDE		1

/* ZERO: one local, d0. */
#define	ZERO_DEFINE()		v_t d0
#define	ZERO_STRIDE		1
#define	ZERO_D			d0

/* COPY: one local, d0. */
#define	COPY_DEFINE()		v_t d0
#define	COPY_STRIDE		1
#define	COPY_D			d0

/* ADD: one local, d0. */
#define	ADD_DEFINE()		v_t d0
#define	ADD_STRIDE		1
#define	ADD_D			d0

/* MUL: one local, d0. */
#define	MUL_DEFINE()		v_t d0
#define	MUL_STRIDE		1
#define	MUL_D			d0

/* GEN_P: one local, p0. */
#define	GEN_P_STRIDE		1
#define	GEN_P_DEFINE()		v_t p0
#define	GEN_P_P			p0

/* GEN_PQ: two locals, d0 and c0. */
#define	GEN_PQ_STRIDE		1
#define	GEN_PQ_DEFINE()		v_t d0, c0
#define	GEN_PQ_D		d0
#define	GEN_PQ_C		c0

/* GEN_PQR: two locals, d0 and c0. */
#define	GEN_PQR_STRIDE		1
#define	GEN_PQR_DEFINE()	v_t d0, c0
#define	GEN_PQR_D		d0
#define	GEN_PQR_C		c0

/* SYN_Q: two locals, d0 and x0. */
#define	SYN_Q_DEFINE()		v_t d0, x0
#define	SYN_Q_D			d0
#define	SYN_Q_X			x0


/* SYN_R: two locals, d0 and x0. */
#define	SYN_R_DEFINE()		v_t d0, x0
#define	SYN_R_D			d0
#define	SYN_R_X			x0


/* SYN_PQ: two locals, d0 and x0. */
#define	SYN_PQ_DEFINE()		v_t d0, x0
#define	SYN_PQ_D		d0
#define	SYN_PQ_X		x0


/* REC_PQ: three locals, x0, y0 and t0. */
#define	REC_PQ_STRIDE		1
#define	REC_PQ_DEFINE()		v_t x0, y0, t0
#define	REC_PQ_X		x0
#define	REC_PQ_Y		y0
#define	REC_PQ_T		t0


/* SYN_PR: two locals, d0 and x0. */
#define	SYN_PR_DEFINE()		v_t d0, x0
#define	SYN_PR_D		d0
#define	SYN_PR_X		x0

/* REC_PR: three locals, x0, y0 and t0. */
#define	REC_PR_STRIDE		1
#define	REC_PR_DEFINE()		v_t x0, y0, t0
#define	REC_PR_X		x0
#define	REC_PR_Y		y0
#define	REC_PR_T		t0


/* SYN_QR: two locals, d0 and x0. */
#define	SYN_QR_DEFINE()		v_t d0, x0
#define	SYN_QR_D		d0
#define	SYN_QR_X		x0


/* REC_QR: three locals, x0, y0 and t0. */
#define	REC_QR_STRIDE		1
#define	REC_QR_DEFINE()		v_t x0, y0, t0
#define	REC_QR_X		x0
#define	REC_QR_Y		y0
#define	REC_QR_T		t0


/* SYN_PQR: two locals, d0 and x0. */
#define	SYN_PQR_DEFINE()	v_t d0, x0
#define	SYN_PQR_D		d0
#define	SYN_PQR_X		x0

/* REC_PQR: five locals, x0, y0, z0, xs0 and ys0. */
#define	REC_PQR_STRIDE		1
#define	REC_PQR_DEFINE()	v_t x0, y0, z0, xs0, ys0
#define	REC_PQR_X		x0
#define	REC_PQR_Y		y0
#define	REC_PQR_Z		z0
#define	REC_PQR_XS		xs0
#define	REC_PQR_YS		ys0
246 
247 #include "vdev_raidz_math_impl.h"
248 
/*
 * Instantiate the "scalar" method sets referenced below through
 * RAIDZ_GEN_METHODS(scalar) and RAIDZ_REC_METHODS(scalar).
 *
 * NOTE(review): DEFINE_GEN_METHODS and DEFINE_REC_METHODS are not defined in
 * this file; presumably they come from the RAID-Z headers included above and
 * expand using the macros defined earlier in this file. Confirm there.
 */
DEFINE_GEN_METHODS(scalar);
DEFINE_REC_METHODS(scalar);
251 
252 boolean_t
253 raidz_will_scalar_work(void)
254 {
255 	return (B_TRUE); /* always */
256 }
257 
258 const raidz_impl_ops_t vdev_raidz_scalar_impl = {
259 	.init = raidz_init_scalar,
260 	.fini = NULL,
261 	.gen = RAIDZ_GEN_METHODS(scalar),
262 	.rec = RAIDZ_REC_METHODS(scalar),
263 	.is_supported = &raidz_will_scalar_work,
264 	.name = "scalar"
265 };
266 
/*
 * Powers of 2 in the RAID-Z Galois field.
 *
 * Indexed by exponent: entry 0 is 0x01 and each following entry is the
 * previous one multiplied by 2 in the field (0x80 is followed by 0x1d).
 * The sequence returns to 0x01 at index 255.
 */
const uint8_t vdev_raidz_pow2[256] __attribute__((aligned(256))) = {
	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
};
302 
/*
 * Base-2 logarithms in the RAID-Z Galois field.
 *
 * Indexed by field element; the entry is the exponent at which that element
 * appears in vdev_raidz_pow2 (for example entry 0x02 is 0x01 and entry 0x1d
 * is 0x08). Entry 0 is stored as 0x00, the same value as entry 1.
 */
const uint8_t vdev_raidz_log2[256] __attribute__((aligned(256))) = {
	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
};
338