xref: /linux/lib/raid6/recov_avx512.c (revision b77e0ce62d63a761ffb7f7245a215a49f5921c2f)
// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2016 Intel Corporation
 *
 * Author: Gayatri Kammela <gayatri.kammela@intel.com>
 * Author: Megha Dey <megha.dey@linux.intel.com>
 */

#ifdef CONFIG_AS_AVX512

#include <linux/raid/pq.h>
#include "x86.h"

static int raid6_has_avx512(void)
{
	return boot_cpu_has(X86_FEATURE_AVX2) &&
		boot_cpu_has(X86_FEATURE_AVX) &&
		boot_cpu_has(X86_FEATURE_AVX512F) &&
		boot_cpu_has(X86_FEATURE_AVX512BW) &&
		boot_cpu_has(X86_FEATURE_AVX512VL) &&
		boot_cpu_has(X86_FEATURE_AVX512DQ);
}

static void raid6_2data_recov_avx512(int disks, size_t bytes, int faila,
				     int failb, void **ptrs)
{
	u8 *p, *q, *dp, *dq;
	const u8 *pbmul;	/* P multiplier table for B data */
	const u8 *qmul;		/* Q multiplier table (for both) */
	const u8 x0f = 0x0f;

	p = (u8 *)ptrs[disks-2];
	q = (u8 *)ptrs[disks-1];

	/*
	 * Compute syndrome with zero for the missing data pages
	 * Use the dead data pages as temporary storage for
	 * delta p and delta q
	 */

	dp = (u8 *)ptrs[faila];
	ptrs[faila] = (void *)raid6_empty_zero_page;
	ptrs[disks-2] = dp;
	dq = (u8 *)ptrs[failb];
	ptrs[failb] = (void *)raid6_empty_zero_page;
	ptrs[disks-1] = dq;

	raid6_call.gen_syndrome(disks, bytes, ptrs);

	/* Restore pointer table */
	ptrs[faila]   = dp;
	ptrs[failb]   = dq;
	ptrs[disks-2] = p;
	ptrs[disks-1] = q;

	/* Now, pick the proper data tables */
	pbmul = raid6_vgfmul[raid6_gfexi[failb-faila]];
	qmul  = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila] ^
		raid6_gfexp[failb]]];
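	/*
	 * With a = faila, b = failb and "+" denoting GF(2^8) addition
	 * (xor), the vector loop below performs per byte roughly the
	 * same arithmetic as the scalar recovery in lib/raid6/recov.c
	 * (raid6_2data_recov_intx1):
	 *
	 *	px = *p ^ *dp;			= A + B
	 *	qx = qmul(*q ^ *dq);		= (g^a A + g^b B) / (g^a + g^b)
	 *	*dq = db = pbmul(px) ^ qx;	reconstructed B
	 *	*dp = db ^ px;			reconstructed A
	 *
	 * pbmul and qmul are GF(2^8) constant multiplications; each
	 * raid6_vgfmul[] entry holds a 16-byte low-nibble lookup table
	 * followed by a 16-byte high-nibble table for use with vpshufb.
	 */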

	kernel_fpu_begin();

	/* zmm7 = x0f[64] */
	asm volatile("vpbroadcastb %0, %%zmm7" : : "m" (x0f));
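	/*
	 * zmm7 keeps the 0x0f mask in every byte; the loop uses it to
	 * split each byte into its low and high nibble so that vpshufb
	 * can act as a 16-entry GF(2^8) multiplication table lookup.
	 */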

	while (bytes) {
#ifdef CONFIG_X86_64
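		/*
		 * 64-bit mode has zmm8-zmm15 available in addition to
		 * zmm0-zmm7, so each iteration processes two 64-byte
		 * chunks per page; the 32-bit path below is limited to
		 * zmm0-zmm7 and handles one 64-byte chunk.
		 */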
		asm volatile("vmovdqa64 %0, %%zmm1\n\t"
			     "vmovdqa64 %1, %%zmm9\n\t"
			     "vmovdqa64 %2, %%zmm0\n\t"
			     "vmovdqa64 %3, %%zmm8\n\t"
			     "vpxorq %4, %%zmm1, %%zmm1\n\t"
			     "vpxorq %5, %%zmm9, %%zmm9\n\t"
			     "vpxorq %6, %%zmm0, %%zmm0\n\t"
			     "vpxorq %7, %%zmm8, %%zmm8"
			     :
			     : "m" (q[0]), "m" (q[64]), "m" (p[0]),
			       "m" (p[64]), "m" (dq[0]), "m" (dq[64]),
			       "m" (dp[0]), "m" (dp[64]));

		/*
		 * 1 = dq[0]  ^ q[0]
		 * 9 = dq[64] ^ q[64]
		 * 0 = dp[0]  ^ p[0]
		 * 8 = dp[64] ^ p[64]
		 */

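		/*
		 * vbroadcasti64x2 replicates the 16-byte qmul nibble
		 * tables into all four 128-bit lanes of zmm4/zmm5, since
		 * vpshufb looks up bytes independently per 128-bit lane.
		 */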
		asm volatile("vbroadcasti64x2 %0, %%zmm4\n\t"
			     "vbroadcasti64x2 %1, %%zmm5"
			     :
			     : "m" (qmul[0]), "m" (qmul[16]));

		asm volatile("vpsraw $4, %%zmm1, %%zmm3\n\t"
			     "vpsraw $4, %%zmm9, %%zmm12\n\t"
			     "vpandq %%zmm7, %%zmm1, %%zmm1\n\t"
			     "vpandq %%zmm7, %%zmm9, %%zmm9\n\t"
			     "vpandq %%zmm7, %%zmm3, %%zmm3\n\t"
			     "vpandq %%zmm7, %%zmm12, %%zmm12\n\t"
			     "vpshufb %%zmm9, %%zmm4, %%zmm14\n\t"
			     "vpshufb %%zmm1, %%zmm4, %%zmm4\n\t"
			     "vpshufb %%zmm12, %%zmm5, %%zmm15\n\t"
			     "vpshufb %%zmm3, %%zmm5, %%zmm5\n\t"
			     "vpxorq %%zmm14, %%zmm15, %%zmm15\n\t"
			     "vpxorq %%zmm4, %%zmm5, %%zmm5"
			     :
			     : );

		/*
		 * 5 = qx[0]
		 * 15 = qx[64]
		 */

		asm volatile("vbroadcasti64x2 %0, %%zmm4\n\t"
			     "vbroadcasti64x2 %1, %%zmm1\n\t"
			     "vpsraw $4, %%zmm0, %%zmm2\n\t"
			     "vpsraw $4, %%zmm8, %%zmm6\n\t"
			     "vpandq %%zmm7, %%zmm0, %%zmm3\n\t"
			     "vpandq %%zmm7, %%zmm8, %%zmm14\n\t"
			     "vpandq %%zmm7, %%zmm2, %%zmm2\n\t"
			     "vpandq %%zmm7, %%zmm6, %%zmm6\n\t"
			     "vpshufb %%zmm14, %%zmm4, %%zmm12\n\t"
			     "vpshufb %%zmm3, %%zmm4, %%zmm4\n\t"
			     "vpshufb %%zmm6, %%zmm1, %%zmm13\n\t"
			     "vpshufb %%zmm2, %%zmm1, %%zmm1\n\t"
			     "vpxorq %%zmm4, %%zmm1, %%zmm1\n\t"
			     "vpxorq %%zmm12, %%zmm13, %%zmm13"
			     :
			     : "m" (pbmul[0]), "m" (pbmul[16]));

		/*
		 * 1  = pbmul[px[0]]
		 * 13 = pbmul[px[64]]
		 */
		asm volatile("vpxorq %%zmm5, %%zmm1, %%zmm1\n\t"
			     "vpxorq %%zmm15, %%zmm13, %%zmm13"
			     :
			     : );

		/*
		 * 1 = db = DQ
		 * 13 = db[64] = DQ[64]
		 */
		asm volatile("vmovdqa64 %%zmm1, %0\n\t"
			     "vmovdqa64 %%zmm13,%1\n\t"
			     "vpxorq %%zmm1, %%zmm0, %%zmm0\n\t"
			     "vpxorq %%zmm13, %%zmm8, %%zmm8"
			     :
			     : "m" (dq[0]), "m" (dq[64]));

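		/*
		 * 0 = da = DA
		 * 8 = da[64] = DA[64]
		 */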
		asm volatile("vmovdqa64 %%zmm0, %0\n\t"
			     "vmovdqa64 %%zmm8, %1"
			     :
			     : "m" (dp[0]), "m" (dp[64]));

		bytes -= 128;
		p += 128;
		q += 128;
		dp += 128;
		dq += 128;
#else
		asm volatile("vmovdqa64 %0, %%zmm1\n\t"
			     "vmovdqa64 %1, %%zmm0\n\t"
			     "vpxorq %2, %%zmm1, %%zmm1\n\t"
			     "vpxorq %3, %%zmm0, %%zmm0"
			     :
			     : "m" (*q), "m" (*p), "m"(*dq), "m" (*dp));

		/* 1 = dq ^ q;  0 = dp ^ p */

		asm volatile("vbroadcasti64x2 %0, %%zmm4\n\t"
			     "vbroadcasti64x2 %1, %%zmm5"
			     :
			     : "m" (qmul[0]), "m" (qmul[16]));

		/*
		 * 1 = dq ^ q
		 * 3 = (dq ^ q) >> 4
		 */
		asm volatile("vpsraw $4, %%zmm1, %%zmm3\n\t"
			     "vpandq %%zmm7, %%zmm1, %%zmm1\n\t"
			     "vpandq %%zmm7, %%zmm3, %%zmm3\n\t"
			     "vpshufb %%zmm1, %%zmm4, %%zmm4\n\t"
			     "vpshufb %%zmm3, %%zmm5, %%zmm5\n\t"
			     "vpxorq %%zmm4, %%zmm5, %%zmm5"
			     :
			     : );

		/* 5 = qx */

		asm volatile("vbroadcasti64x2 %0, %%zmm4\n\t"
			     "vbroadcasti64x2 %1, %%zmm1"
			     :
			     : "m" (pbmul[0]), "m" (pbmul[16]));

		asm volatile("vpsraw $4, %%zmm0, %%zmm2\n\t"
			     "vpandq %%zmm7, %%zmm0, %%zmm3\n\t"
			     "vpandq %%zmm7, %%zmm2, %%zmm2\n\t"
			     "vpshufb %%zmm3, %%zmm4, %%zmm4\n\t"
			     "vpshufb %%zmm2, %%zmm1, %%zmm1\n\t"
			     "vpxorq %%zmm4, %%zmm1, %%zmm1"
			     :
			     : );

		/* 1 = pbmul[px] */
		asm volatile("vpxorq %%zmm5, %%zmm1, %%zmm1\n\t"
			     /* 1 = db = DQ */
			     "vmovdqa64 %%zmm1, %0\n\t"
			     :
			     : "m" (dq[0]));

		asm volatile("vpxorq %%zmm1, %%zmm0, %%zmm0\n\t"
			     /* 0 = da = DA */
			     "vmovdqa64 %%zmm0, %0"
			     :
			     : "m" (dp[0]));

		bytes -= 64;
		p += 64;
		q += 64;
		dp += 64;
		dq += 64;
#endif
	}

	kernel_fpu_end();
}

static void raid6_datap_recov_avx512(int disks, size_t bytes, int faila,
				     void **ptrs)
{
	u8 *p, *q, *dq;
	const u8 *qmul;		/* Q multiplier table */
	const u8 x0f = 0x0f;

	p = (u8 *)ptrs[disks-2];
	q = (u8 *)ptrs[disks-1];

	/*
	 * Compute syndrome with zero for the missing data page
	 * Use the dead data page as temporary storage for delta q
	 */

	dq = (u8 *)ptrs[faila];
	ptrs[faila] = (void *)raid6_empty_zero_page;
	ptrs[disks-1] = dq;

	raid6_call.gen_syndrome(disks, bytes, ptrs);

	/* Restore pointer table */
	ptrs[faila]   = dq;
	ptrs[disks-1] = q;

	/* Now, pick the proper data tables */
	qmul  = raid6_vgfmul[raid6_gfinv[raid6_gfexp[faila]]];
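	/*
	 * Scalar equivalent of the loop below, as in lib/raid6/recov.c
	 * (raid6_datap_recov_intx1), roughly:
	 *
	 *	*p ^= *dq = qmul(*q ^ *dq);
	 *
	 * q ^ dq is the missing data byte multiplied by g^faila; qmul
	 * multiplies by g^-faila to recover it, and the recovered byte
	 * is then folded back into p to rebuild the lost P.
	 */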

	kernel_fpu_begin();

	asm volatile("vpbroadcastb %0, %%zmm7" : : "m" (x0f));

	while (bytes) {
#ifdef CONFIG_X86_64
		asm volatile("vmovdqa64 %0, %%zmm3\n\t"
			     "vmovdqa64 %1, %%zmm8\n\t"
			     "vpxorq %2, %%zmm3, %%zmm3\n\t"
			     "vpxorq %3, %%zmm8, %%zmm8"
			     :
			     : "m" (dq[0]), "m" (dq[64]), "m" (q[0]),
			       "m" (q[64]));

		/*
		 * 3 = q[0] ^ dq[0]
		 * 8 = q[64] ^ dq[64]
		 */
		asm volatile("vbroadcasti64x2 %0, %%zmm0\n\t"
			     "vmovapd %%zmm0, %%zmm13\n\t"
			     "vbroadcasti64x2 %1, %%zmm1\n\t"
			     "vmovapd %%zmm1, %%zmm14"
			     :
			     : "m" (qmul[0]), "m" (qmul[16]));

		asm volatile("vpsraw $4, %%zmm3, %%zmm6\n\t"
			     "vpsraw $4, %%zmm8, %%zmm12\n\t"
			     "vpandq %%zmm7, %%zmm3, %%zmm3\n\t"
			     "vpandq %%zmm7, %%zmm8, %%zmm8\n\t"
			     "vpandq %%zmm7, %%zmm6, %%zmm6\n\t"
			     "vpandq %%zmm7, %%zmm12, %%zmm12\n\t"
			     "vpshufb %%zmm3, %%zmm0, %%zmm0\n\t"
			     "vpshufb %%zmm8, %%zmm13, %%zmm13\n\t"
			     "vpshufb %%zmm6, %%zmm1, %%zmm1\n\t"
			     "vpshufb %%zmm12, %%zmm14, %%zmm14\n\t"
			     "vpxorq %%zmm0, %%zmm1, %%zmm1\n\t"
			     "vpxorq %%zmm13, %%zmm14, %%zmm14"
			     :
			     : );

		/*
		 * 1  = qmul[q[0]  ^ dq[0]]
		 * 14 = qmul[q[64] ^ dq[64]]
		 */
		asm volatile("vmovdqa64 %0, %%zmm2\n\t"
			     "vmovdqa64 %1, %%zmm12\n\t"
			     "vpxorq %%zmm1, %%zmm2, %%zmm2\n\t"
			     "vpxorq %%zmm14, %%zmm12, %%zmm12"
			     :
			     : "m" (p[0]), "m" (p[64]));

		/*
		 * 2  = p[0]  ^ qmul[q[0]  ^ dq[0]]
		 * 12 = p[64] ^ qmul[q[64] ^ dq[64]]
		 */

		asm volatile("vmovdqa64 %%zmm1, %0\n\t"
			     "vmovdqa64 %%zmm14, %1\n\t"
			     "vmovdqa64 %%zmm2, %2\n\t"
			     "vmovdqa64 %%zmm12,%3"
			     :
			     : "m" (dq[0]), "m" (dq[64]), "m" (p[0]),
			       "m" (p[64]));

		bytes -= 128;
		p += 128;
		q += 128;
		dq += 128;
#else
		asm volatile("vmovdqa64 %0, %%zmm3\n\t"
			     "vpxorq %1, %%zmm3, %%zmm3"
			     :
			     : "m" (dq[0]), "m" (q[0]));

		/* 3 = q ^ dq */

		asm volatile("vbroadcasti64x2 %0, %%zmm0\n\t"
			     "vbroadcasti64x2 %1, %%zmm1"
			     :
			     : "m" (qmul[0]), "m" (qmul[16]));

		asm volatile("vpsraw $4, %%zmm3, %%zmm6\n\t"
			     "vpandq %%zmm7, %%zmm3, %%zmm3\n\t"
			     "vpandq %%zmm7, %%zmm6, %%zmm6\n\t"
			     "vpshufb %%zmm3, %%zmm0, %%zmm0\n\t"
			     "vpshufb %%zmm6, %%zmm1, %%zmm1\n\t"
			     "vpxorq %%zmm0, %%zmm1, %%zmm1"
			     :
			     : );

		/* 1 = qmul[q ^ dq] */

		asm volatile("vmovdqa64 %0, %%zmm2\n\t"
			     "vpxorq %%zmm1, %%zmm2, %%zmm2"
			     :
			     : "m" (p[0]));

		/* 2 = p ^ qmul[q ^ dq] */

		asm volatile("vmovdqa64 %%zmm1, %0\n\t"
			     "vmovdqa64 %%zmm2, %1"
			     :
			     : "m" (dq[0]), "m" (p[0]));

		bytes -= 64;
		p += 64;
		q += 64;
		dq += 64;
#endif
	}

	kernel_fpu_end();
}

const struct raid6_recov_calls raid6_recov_avx512 = {
	.data2 = raid6_2data_recov_avx512,
	.datap = raid6_datap_recov_avx512,
	.valid = raid6_has_avx512,
#ifdef CONFIG_X86_64
	.name = "avx512x2",
#else
	.name = "avx512x1",
#endif
	.priority = 3,
};
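
/*
 * Priority 3 ranks this implementation above the lower-priority AVX2
 * and SSSE3 recovery routines, so the recovery-algorithm selection in
 * lib/raid6/algos.c prefers it whenever raid6_has_avx512() reports
 * the required AVX-512 features.
 */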

#else
#warning "your version of binutils lacks AVX512 support"
#endif