xref: /freebsd/sys/contrib/openzfs/module/zfs/vdev_raidz_math_scalar.c (revision 61145dc2b94f12f6a47344fb9aac702321880e43)
1 // SPDX-License-Identifier: CDDL-1.0
2 /*
3  * CDDL HEADER START
4  *
5  * The contents of this file are subject to the terms of the
6  * Common Development and Distribution License (the "License").
7  * You may not use this file except in compliance with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or https://opensource.org/licenses/CDDL-1.0.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 
23 /*
24  * Copyright (C) 2016 Gvozden Nešković. All rights reserved.
25  */
26 
27 #include <sys/vdev_raidz_impl.h>
28 
29 /*
30  * Provide native CPU scalar routines.
31  * Support 32bit and 64bit CPUs.
32  */
33 #if ((~(0x0ULL)) >> 24) == 0xffULL
34 #define	ELEM_SIZE	4
35 typedef uint32_t iv_t;
36 #elif ((~(0x0ULL)) >> 56) == 0xffULL
37 #define	ELEM_SIZE	8
38 typedef uint64_t iv_t;
39 #endif
40 
41 /*
42  * Vector type used in scalar implementation
43  *
44  * The union is expected to be of native CPU register size. Since addition
45  * uses XOR operation, it can be performed an all byte elements at once.
46  * Multiplication requires per byte access.
47  */
48 typedef union {
49 	iv_t e;
50 	uint8_t b[ELEM_SIZE];
51 } v_t;
52 
53 /*
54  * Precomputed lookup tables for multiplication by a constant
55  *
56  * Reconstruction path requires multiplication by a constant factors. Instead of
57  * performing two step lookup (log & exp tables), a direct lookup can be used
58  * instead. Multiplication of element 'a' by a constant 'c' is obtained as:
59  *
60  * 	r = vdev_raidz_mul_lt[c_log][a];
61  *
62  * where c_log = vdev_raidz_log2[c]. Log of coefficient factors is used because
63  * they are faster to obtain while solving the syndrome equations.
64  *
65  * PERFORMANCE NOTE:
66  * Even though the complete lookup table uses 64kiB, only relatively small
67  * portion of it is used at the same time. Following shows number of accessed
68  * bytes for different cases:
69  * 	- 1 failed disk: 256B (1 mul. coefficient)
70  * 	- 2 failed disks: 512B (2 mul. coefficients)
71  * 	- 3 failed disks: 1536B (6 mul. coefficients)
72  *
73  * Size of actually accessed lookup table regions is only larger for
74  * reconstruction of 3 failed disks, when compared to traditional log/exp
75  * method. But since the result is obtained in one lookup step performance is
76  * doubled.
77  */
78 static uint8_t vdev_raidz_mul_lt[256][256] __attribute__((aligned(256)));
79 
80 static void
raidz_init_scalar(void)81 raidz_init_scalar(void)
82 {
83 	int c, i;
84 	for (c = 0; c < 256; c++)
85 		for (i = 0; i < 256; i++)
86 			vdev_raidz_mul_lt[c][i] = gf_mul(c, i);
87 
88 }
89 
90 #define	PREFETCHNTA(ptr, offset)	{}
91 #define	PREFETCH(ptr, offset) 		{}
92 
93 #define	XOR_ACC(src, acc)	acc.e ^= ((v_t *)src)[0].e
94 #define	XOR(src, acc)		acc.e ^= src.e
95 #define	ZERO(acc)		acc.e = 0
96 #define	COPY(src, dst)		dst = src
97 #define	LOAD(src, val) 		val = ((v_t *)src)[0]
98 #define	STORE(dst, val)		((v_t *)dst)[0] = val
99 
100 /*
101  * Constants used for optimized multiplication by 2.
102  */
103 static const struct {
104 	iv_t mod;
105 	iv_t mask;
106 	iv_t msb;
107 } scalar_mul2_consts = {
108 #if ELEM_SIZE == 8
109 	.mod	= 0x1d1d1d1d1d1d1d1dULL,
110 	.mask	= 0xfefefefefefefefeULL,
111 	.msb	= 0x8080808080808080ULL,
112 #else
113 	.mod	= 0x1d1d1d1dULL,
114 	.mask	= 0xfefefefeULL,
115 	.msb	= 0x80808080ULL,
116 #endif
117 };
118 
119 #define	MUL2_SETUP() {}
120 
121 #define	MUL2(a)								\
122 {									\
123 	iv_t _mask;							\
124 									\
125 	_mask = (a).e & scalar_mul2_consts.msb;				\
126 	_mask = (_mask << 1) - (_mask >> 7);				\
127 	(a).e = ((a).e << 1) & scalar_mul2_consts.mask;			\
128 	(a).e = (a).e ^ (_mask & scalar_mul2_consts.mod);		\
129 }
130 
131 #define	MUL4(a) 							\
132 {									\
133 	MUL2(a);							\
134 	MUL2(a);							\
135 }
136 
137 #define	MUL(c, a)							\
138 {									\
139 	const uint8_t *mul_lt = vdev_raidz_mul_lt[c];			\
140 	switch (ELEM_SIZE) {						\
141 	case 8:								\
142 		a.b[7] = mul_lt[a.b[7]];				\
143 		a.b[6] = mul_lt[a.b[6]];				\
144 		a.b[5] = mul_lt[a.b[5]];				\
145 		a.b[4] = mul_lt[a.b[4]];				\
146 		zfs_fallthrough;					\
147 	case 4:								\
148 		a.b[3] = mul_lt[a.b[3]];				\
149 		a.b[2] = mul_lt[a.b[2]];				\
150 		a.b[1] = mul_lt[a.b[1]];				\
151 		a.b[0] = mul_lt[a.b[0]];				\
152 		break;							\
153 	}								\
154 }
155 
156 #define	raidz_math_begin()	{}
157 #define	raidz_math_end()	{}
158 
159 #define	SYN_STRIDE		1
160 
161 #define	ZERO_DEFINE()		v_t d0
162 #define	ZERO_STRIDE		1
163 #define	ZERO_D			d0
164 
165 #define	COPY_DEFINE()		v_t d0
166 #define	COPY_STRIDE		1
167 #define	COPY_D			d0
168 
169 #define	ADD_DEFINE()		v_t d0
170 #define	ADD_STRIDE		1
171 #define	ADD_D			d0
172 
173 #define	MUL_DEFINE()		v_t d0
174 #define	MUL_STRIDE		1
175 #define	MUL_D			d0
176 
177 #define	GEN_P_STRIDE		1
178 #define	GEN_P_DEFINE()		v_t p0
179 #define	GEN_P_P			p0
180 
181 #define	GEN_PQ_STRIDE		1
182 #define	GEN_PQ_DEFINE()		v_t d0, c0
183 #define	GEN_PQ_D		d0
184 #define	GEN_PQ_C		c0
185 
186 #define	GEN_PQR_STRIDE		1
187 #define	GEN_PQR_DEFINE()	v_t d0, c0
188 #define	GEN_PQR_D		d0
189 #define	GEN_PQR_C		c0
190 
191 #define	SYN_Q_DEFINE()		v_t d0, x0
192 #define	SYN_Q_D			d0
193 #define	SYN_Q_X			x0
194 
195 
196 #define	SYN_R_DEFINE()		v_t d0, x0
197 #define	SYN_R_D			d0
198 #define	SYN_R_X			x0
199 
200 
201 #define	SYN_PQ_DEFINE()		v_t d0, x0
202 #define	SYN_PQ_D		d0
203 #define	SYN_PQ_X		x0
204 
205 
206 #define	REC_PQ_STRIDE		1
207 #define	REC_PQ_DEFINE()		v_t x0, y0, t0
208 #define	REC_PQ_X		x0
209 #define	REC_PQ_Y		y0
210 #define	REC_PQ_T		t0
211 
212 
213 #define	SYN_PR_DEFINE()		v_t d0, x0
214 #define	SYN_PR_D		d0
215 #define	SYN_PR_X		x0
216 
217 #define	REC_PR_STRIDE		1
218 #define	REC_PR_DEFINE()		v_t x0, y0, t0
219 #define	REC_PR_X		x0
220 #define	REC_PR_Y		y0
221 #define	REC_PR_T		t0
222 
223 
224 #define	SYN_QR_DEFINE()		v_t d0, x0
225 #define	SYN_QR_D		d0
226 #define	SYN_QR_X		x0
227 
228 
229 #define	REC_QR_STRIDE		1
230 #define	REC_QR_DEFINE()		v_t x0, y0, t0
231 #define	REC_QR_X		x0
232 #define	REC_QR_Y		y0
233 #define	REC_QR_T		t0
234 
235 
236 #define	SYN_PQR_DEFINE()	v_t d0, x0
237 #define	SYN_PQR_D		d0
238 #define	SYN_PQR_X		x0
239 
240 #define	REC_PQR_STRIDE		1
241 #define	REC_PQR_DEFINE()	v_t x0, y0, z0, xs0, ys0
242 #define	REC_PQR_X		x0
243 #define	REC_PQR_Y		y0
244 #define	REC_PQR_Z		z0
245 #define	REC_PQR_XS		xs0
246 #define	REC_PQR_YS		ys0
247 
248 #include "vdev_raidz_math_impl.h"
249 
250 DEFINE_GEN_METHODS(scalar);
251 DEFINE_REC_METHODS(scalar);
252 
253 boolean_t
raidz_will_scalar_work(void)254 raidz_will_scalar_work(void)
255 {
256 	return (B_TRUE); /* always */
257 }
258 
259 const raidz_impl_ops_t vdev_raidz_scalar_impl = {
260 	.init = raidz_init_scalar,
261 	.fini = NULL,
262 	.gen = RAIDZ_GEN_METHODS(scalar),
263 	.rec = RAIDZ_REC_METHODS(scalar),
264 	.is_supported = &raidz_will_scalar_work,
265 	.name = "scalar"
266 };
267 
268 /* Powers of 2 in the RAID-Z Galois field. */
269 const uint8_t vdev_raidz_pow2[256] __attribute__((aligned(256))) = {
270 	0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80,
271 	0x1d, 0x3a, 0x74, 0xe8, 0xcd, 0x87, 0x13, 0x26,
272 	0x4c, 0x98, 0x2d, 0x5a, 0xb4, 0x75, 0xea, 0xc9,
273 	0x8f, 0x03, 0x06, 0x0c, 0x18, 0x30, 0x60, 0xc0,
274 	0x9d, 0x27, 0x4e, 0x9c, 0x25, 0x4a, 0x94, 0x35,
275 	0x6a, 0xd4, 0xb5, 0x77, 0xee, 0xc1, 0x9f, 0x23,
276 	0x46, 0x8c, 0x05, 0x0a, 0x14, 0x28, 0x50, 0xa0,
277 	0x5d, 0xba, 0x69, 0xd2, 0xb9, 0x6f, 0xde, 0xa1,
278 	0x5f, 0xbe, 0x61, 0xc2, 0x99, 0x2f, 0x5e, 0xbc,
279 	0x65, 0xca, 0x89, 0x0f, 0x1e, 0x3c, 0x78, 0xf0,
280 	0xfd, 0xe7, 0xd3, 0xbb, 0x6b, 0xd6, 0xb1, 0x7f,
281 	0xfe, 0xe1, 0xdf, 0xa3, 0x5b, 0xb6, 0x71, 0xe2,
282 	0xd9, 0xaf, 0x43, 0x86, 0x11, 0x22, 0x44, 0x88,
283 	0x0d, 0x1a, 0x34, 0x68, 0xd0, 0xbd, 0x67, 0xce,
284 	0x81, 0x1f, 0x3e, 0x7c, 0xf8, 0xed, 0xc7, 0x93,
285 	0x3b, 0x76, 0xec, 0xc5, 0x97, 0x33, 0x66, 0xcc,
286 	0x85, 0x17, 0x2e, 0x5c, 0xb8, 0x6d, 0xda, 0xa9,
287 	0x4f, 0x9e, 0x21, 0x42, 0x84, 0x15, 0x2a, 0x54,
288 	0xa8, 0x4d, 0x9a, 0x29, 0x52, 0xa4, 0x55, 0xaa,
289 	0x49, 0x92, 0x39, 0x72, 0xe4, 0xd5, 0xb7, 0x73,
290 	0xe6, 0xd1, 0xbf, 0x63, 0xc6, 0x91, 0x3f, 0x7e,
291 	0xfc, 0xe5, 0xd7, 0xb3, 0x7b, 0xf6, 0xf1, 0xff,
292 	0xe3, 0xdb, 0xab, 0x4b, 0x96, 0x31, 0x62, 0xc4,
293 	0x95, 0x37, 0x6e, 0xdc, 0xa5, 0x57, 0xae, 0x41,
294 	0x82, 0x19, 0x32, 0x64, 0xc8, 0x8d, 0x07, 0x0e,
295 	0x1c, 0x38, 0x70, 0xe0, 0xdd, 0xa7, 0x53, 0xa6,
296 	0x51, 0xa2, 0x59, 0xb2, 0x79, 0xf2, 0xf9, 0xef,
297 	0xc3, 0x9b, 0x2b, 0x56, 0xac, 0x45, 0x8a, 0x09,
298 	0x12, 0x24, 0x48, 0x90, 0x3d, 0x7a, 0xf4, 0xf5,
299 	0xf7, 0xf3, 0xfb, 0xeb, 0xcb, 0x8b, 0x0b, 0x16,
300 	0x2c, 0x58, 0xb0, 0x7d, 0xfa, 0xe9, 0xcf, 0x83,
301 	0x1b, 0x36, 0x6c, 0xd8, 0xad, 0x47, 0x8e, 0x01
302 };
303 
304 /* Logs of 2 in the RAID-Z Galois field. */
305 const uint8_t vdev_raidz_log2[256] __attribute__((aligned(256))) = {
306 	0x00, 0x00, 0x01, 0x19, 0x02, 0x32, 0x1a, 0xc6,
307 	0x03, 0xdf, 0x33, 0xee, 0x1b, 0x68, 0xc7, 0x4b,
308 	0x04, 0x64, 0xe0, 0x0e, 0x34, 0x8d, 0xef, 0x81,
309 	0x1c, 0xc1, 0x69, 0xf8, 0xc8, 0x08, 0x4c, 0x71,
310 	0x05, 0x8a, 0x65, 0x2f, 0xe1, 0x24, 0x0f, 0x21,
311 	0x35, 0x93, 0x8e, 0xda, 0xf0, 0x12, 0x82, 0x45,
312 	0x1d, 0xb5, 0xc2, 0x7d, 0x6a, 0x27, 0xf9, 0xb9,
313 	0xc9, 0x9a, 0x09, 0x78, 0x4d, 0xe4, 0x72, 0xa6,
314 	0x06, 0xbf, 0x8b, 0x62, 0x66, 0xdd, 0x30, 0xfd,
315 	0xe2, 0x98, 0x25, 0xb3, 0x10, 0x91, 0x22, 0x88,
316 	0x36, 0xd0, 0x94, 0xce, 0x8f, 0x96, 0xdb, 0xbd,
317 	0xf1, 0xd2, 0x13, 0x5c, 0x83, 0x38, 0x46, 0x40,
318 	0x1e, 0x42, 0xb6, 0xa3, 0xc3, 0x48, 0x7e, 0x6e,
319 	0x6b, 0x3a, 0x28, 0x54, 0xfa, 0x85, 0xba, 0x3d,
320 	0xca, 0x5e, 0x9b, 0x9f, 0x0a, 0x15, 0x79, 0x2b,
321 	0x4e, 0xd4, 0xe5, 0xac, 0x73, 0xf3, 0xa7, 0x57,
322 	0x07, 0x70, 0xc0, 0xf7, 0x8c, 0x80, 0x63, 0x0d,
323 	0x67, 0x4a, 0xde, 0xed, 0x31, 0xc5, 0xfe, 0x18,
324 	0xe3, 0xa5, 0x99, 0x77, 0x26, 0xb8, 0xb4, 0x7c,
325 	0x11, 0x44, 0x92, 0xd9, 0x23, 0x20, 0x89, 0x2e,
326 	0x37, 0x3f, 0xd1, 0x5b, 0x95, 0xbc, 0xcf, 0xcd,
327 	0x90, 0x87, 0x97, 0xb2, 0xdc, 0xfc, 0xbe, 0x61,
328 	0xf2, 0x56, 0xd3, 0xab, 0x14, 0x2a, 0x5d, 0x9e,
329 	0x84, 0x3c, 0x39, 0x53, 0x47, 0x6d, 0x41, 0xa2,
330 	0x1f, 0x2d, 0x43, 0xd8, 0xb7, 0x7b, 0xa4, 0x76,
331 	0xc4, 0x17, 0x49, 0xec, 0x7f, 0x0c, 0x6f, 0xf6,
332 	0x6c, 0xa1, 0x3b, 0x52, 0x29, 0x9d, 0x55, 0xaa,
333 	0xfb, 0x60, 0x86, 0xb1, 0xbb, 0xcc, 0x3e, 0x5a,
334 	0xcb, 0x59, 0x5f, 0xb0, 0x9c, 0xa9, 0xa0, 0x51,
335 	0x0b, 0xf5, 0x16, 0xeb, 0x7a, 0x75, 0x2c, 0xd7,
336 	0x4f, 0xae, 0xd5, 0xe9, 0xe6, 0xe7, 0xad, 0xe8,
337 	0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf,
338 };
339