xref: /linux/lib/raid6/avx512.c (revision 161db5d165123a72792e2687ecfd8de146dbae1a)
1 /* -*- linux-c -*- --------------------------------------------------------
2  *
3  *   Copyright (C) 2016 Intel Corporation
4  *
5  *   Author: Gayatri Kammela <gayatri.kammela@intel.com>
6  *   Author: Megha Dey <megha.dey@linux.intel.com>
7  *
8  *   Based on avx2.c: Copyright 2012 Yuanhan Liu All Rights Reserved
9  *   Based on sse2.c: Copyright 2002 H. Peter Anvin - All Rights Reserved
10  *
11  *   This program is free software; you can redistribute it and/or modify
12  *   it under the terms of the GNU General Public License as published by
 *   the Free Software Foundation, Inc., 59 Temple Place Ste 330,
14  *   Boston MA 02111-1307, USA; either version 2 of the License, or
15  *   (at your option) any later version; incorporated herein by reference.
16  *
17  * -----------------------------------------------------------------------
18  */
19 
20 /*
21  * AVX512 implementation of RAID-6 syndrome functions
22  *
23  */
24 
25 #ifdef CONFIG_AS_AVX512
26 
27 #include <linux/raid/pq.h>
28 #include "x86.h"
29 
30 static const struct raid6_avx512_constants {
31 	u64 x1d[8];
32 } raid6_avx512_constants __aligned(512) = {
33 	{ 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
34 	  0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
35 	  0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,
36 	  0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL,},
37 };
38 
39 static int raid6_have_avx512(void)
40 {
41 	return boot_cpu_has(X86_FEATURE_AVX2) &&
42 		boot_cpu_has(X86_FEATURE_AVX) &&
43 		boot_cpu_has(X86_FEATURE_AVX512F) &&
44 		boot_cpu_has(X86_FEATURE_AVX512BW) &&
45 		boot_cpu_has(X86_FEATURE_AVX512VL) &&
46 		boot_cpu_has(X86_FEATURE_AVX512DQ);
47 }
48 
/*
 * Unrolled-by-1 AVX512 syndrome generation: one 64-byte chunk per
 * outer iteration.  P is the plain XOR of all data disks; Q is the
 * Reed-Solomon syndrome, built by repeatedly multiplying the running
 * accumulator by 2 in GF(2^8) before XOR-ing in the next lower disk.
 *
 * @disks: total disk count, including P and Q
 * @bytes: stripe length; vmovdqa64 requires 64-byte-aligned buffers,
 *         and the loop steps by 64, so bytes is assumed a multiple of 64
 * @ptrs:  data-disk buffers first (0..disks-3), then P, then Q
 */
static void raid6_avx5121_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;         /* Highest data disk */
	p = dptr[z0+1];         /* XOR parity */
	q = dptr[z0+2];         /* RS syndrome */

	kernel_fpu_begin();

	/* zmm0 = 64 x 0x1d (GF(2^8) reduction constant), zmm1 = 0 */
	asm volatile("vmovdqa64 %0,%%zmm0\n\t"
		     "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */
		     :
		     : "m" (raid6_avx512_constants.x1d[0]));

	for (d = 0; d < bytes; d += 64) {
		/* Seed P (zmm2) and Q (zmm4) with the highest disk's
		 * chunk; preload the next disk into zmm6 (the loop below
		 * is software-pipelined around zmm6).
		 */
		asm volatile("prefetchnta %0\n\t"
			     "vmovdqa64 %0,%%zmm2\n\t"     /* P[0] */
			     "prefetchnta %1\n\t"
			     "vmovdqa64 %%zmm2,%%zmm4\n\t" /* Q[0] */
			     "vmovdqa64 %1,%%zmm6"
			     :
			     : "m" (dptr[z0][d]), "m" (dptr[z0-1][d]));
		/*
		 * Per remaining disk, highest to lowest: multiply Q by 2
		 * in GF(2^8) —
		 *   vpcmpgtb 0 > Q : k1 marks bytes with the top bit set
		 *   vpmovm2b       : expand k1 to a 0x00/0xff byte mask
		 *   vpaddb  Q+Q    : shift every byte left by one
		 *   vpandq/vpxorq  : XOR 0x1d into bytes that overflowed
		 * — then XOR the preloaded data into P and Q and fetch
		 * the next disk's chunk into zmm6.
		 */
		for (z = z0-2; z >= 0; z--) {
			asm volatile("prefetchnta %0\n\t"
				     "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
				     "vpmovm2b %%k1,%%zmm5\n\t"
				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t"
				     "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t"
				     "vmovdqa64 %0,%%zmm6"
				     :
				     : "m" (dptr[z][d]));
		}
		/* Fold in the last preloaded disk (zmm6): one final
		 * multiply-by-2 and XOR, then stream P and Q out with
		 * non-temporal stores and clear the accumulators.
		 */
		asm volatile("vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
			     "vpmovm2b %%k1,%%zmm5\n\t"
			     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
			     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
			     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
			     "vpxorq %%zmm6,%%zmm2,%%zmm2\n\t"
			     "vpxorq %%zmm6,%%zmm4,%%zmm4\n\t"
			     "vmovntdq %%zmm2,%0\n\t"
			     "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"
			     "vmovntdq %%zmm4,%1\n\t"
			     "vpxorq %%zmm4,%%zmm4,%%zmm4"
			     :
			     : "m" (p[d]), "m" (q[d]));
	}

	/* Order the non-temporal stores before P/Q are consumed */
	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}
105 
106 const struct raid6_calls raid6_avx512x1 = {
107 	raid6_avx5121_gen_syndrome,
108 	NULL,                   /* XOR not yet implemented */
109 	raid6_have_avx512,
110 	"avx512x1",
111 	1                       /* Has cache hints */
112 };
113 
114 /*
115  * Unrolled-by-2 AVX512 implementation
116  */
static void raid6_avx5122_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;	/* data disks 0..z0, then P, then Q */
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;         /* Highest data disk */
	p = dptr[z0+1];         /* XOR parity */
	q = dptr[z0+2];         /* RS syndrome */

	kernel_fpu_begin();

	/* zmm0 = 64 x 0x1d (GF(2^8) reduction constant), zmm1 = 0 */
	asm volatile("vmovdqa64 %0,%%zmm0\n\t"
		     "vpxorq %%zmm1,%%zmm1,%%zmm1" /* Zero temp */
		     :
		     : "m" (raid6_avx512_constants.x1d[0]));

	/* We uniformly assume a single prefetch covers at least 64 bytes */
	for (d = 0; d < bytes; d += 128) {
		/* Two independent 64-byte lanes per iteration:
		 * P in zmm2/zmm3, Q in zmm4/zmm6, each pair seeded from
		 * the highest data disk's chunk.
		 */
		asm volatile("prefetchnta %0\n\t"
			     "prefetchnta %1\n\t"
			     "vmovdqa64 %0,%%zmm2\n\t"      /* P[0] */
			     "vmovdqa64 %1,%%zmm3\n\t"      /* P[1] */
			     "vmovdqa64 %%zmm2,%%zmm4\n\t"  /* Q[0] */
			     "vmovdqa64 %%zmm3,%%zmm6"      /* Q[1] */
			     :
			     : "m" (dptr[z0][d]), "m" (dptr[z0][d+64]));
		/* Per remaining disk, highest to lowest, for each lane:
		 * multiply Q by 2 in GF(2^8) (sign test -> byte mask ->
		 * double -> conditional XOR of 0x1d), then XOR disk z's
		 * data into both P and Q.
		 */
		for (z = z0-1; z >= 0; z--) {
			asm volatile("prefetchnta %0\n\t"
				     "prefetchnta %1\n\t"
				     "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
				     "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t"
				     "vpmovm2b %%k1,%%zmm5\n\t"
				     "vpmovm2b %%k2,%%zmm7\n\t"
				     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
				     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
				     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
				     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
				     "vmovdqa64 %0,%%zmm5\n\t"
				     "vmovdqa64 %1,%%zmm7\n\t"
				     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
				     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
				     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
				     "vpxorq %%zmm7,%%zmm6,%%zmm6"
				     :
				     : "m" (dptr[z][d]), "m" (dptr[z][d+64]));
		}
		/* Stream both lanes of P and Q out non-temporally */
		asm volatile("vmovntdq %%zmm2,%0\n\t"
			     "vmovntdq %%zmm3,%1\n\t"
			     "vmovntdq %%zmm4,%2\n\t"
			     "vmovntdq %%zmm6,%3"
			     :
			     : "m" (p[d]), "m" (p[d+64]), "m" (q[d]),
			       "m" (q[d+64]));
	}

	/* Order the non-temporal stores before P/Q are consumed */
	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}
178 
179 const struct raid6_calls raid6_avx512x2 = {
180 	raid6_avx5122_gen_syndrome,
181 	NULL,                   /* XOR not yet implemented */
182 	raid6_have_avx512,
183 	"avx512x2",
184 	1                       /* Has cache hints */
185 };
186 
187 #ifdef CONFIG_X86_64
188 
189 /*
 * Unrolled-by-4 AVX512 implementation
191  */
static void raid6_avx5124_gen_syndrome(int disks, size_t bytes, void **ptrs)
{
	u8 **dptr = (u8 **)ptrs;	/* data disks 0..z0, then P, then Q */
	u8 *p, *q;
	int d, z, z0;

	z0 = disks - 3;         /* Highest data disk */
	p = dptr[z0+1];         /* XOR parity */
	q = dptr[z0+2];         /* RS syndrome */

	kernel_fpu_begin();

	/* Load the 0x1d reduction constant into zmm0 and clear all
	 * eight accumulators: P in zmm2/3/10/11, Q in zmm4/6/12/14,
	 * one pair per 64-byte lane.  Unlike the x1/x2 variants the
	 * accumulators are zeroed here (and re-zeroed after each store)
	 * rather than seeded from the highest disk, so the inner loop
	 * runs over every disk z0..0.
	 */
	asm volatile("vmovdqa64 %0,%%zmm0\n\t"
		     "vpxorq %%zmm1,%%zmm1,%%zmm1\n\t"       /* Zero temp */
		     "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"       /* P[0] */
		     "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t"       /* P[1] */
		     "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t"       /* Q[0] */
		     "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t"       /* Q[1] */
		     "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t"    /* P[2] */
		     "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t"    /* P[3] */
		     "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t"    /* Q[2] */
		     "vpxorq %%zmm14,%%zmm14,%%zmm14"        /* Q[3] */
		     :
		     : "m" (raid6_avx512_constants.x1d[0]));

	for (d = 0; d < bytes; d += 256) {
		for (z = z0; z >= 0; z--) {
		/* Four 64-byte lanes: multiply each Q lane by 2 in
		 * GF(2^8) (sign test -> byte mask -> double -> XOR of
		 * 0x1d where the top bit was set), then XOR disk z's
		 * data into every P and Q lane.
		 */
		asm volatile("prefetchnta %0\n\t"
			     "prefetchnta %1\n\t"
			     "prefetchnta %2\n\t"
			     "prefetchnta %3\n\t"
			     "vpcmpgtb %%zmm4,%%zmm1,%%k1\n\t"
			     "vpcmpgtb %%zmm6,%%zmm1,%%k2\n\t"
			     "vpcmpgtb %%zmm12,%%zmm1,%%k3\n\t"
			     "vpcmpgtb %%zmm14,%%zmm1,%%k4\n\t"
			     "vpmovm2b %%k1,%%zmm5\n\t"
			     "vpmovm2b %%k2,%%zmm7\n\t"
			     "vpmovm2b %%k3,%%zmm13\n\t"
			     "vpmovm2b %%k4,%%zmm15\n\t"
			     "vpaddb %%zmm4,%%zmm4,%%zmm4\n\t"
			     "vpaddb %%zmm6,%%zmm6,%%zmm6\n\t"
			     "vpaddb %%zmm12,%%zmm12,%%zmm12\n\t"
			     "vpaddb %%zmm14,%%zmm14,%%zmm14\n\t"
			     "vpandq %%zmm0,%%zmm5,%%zmm5\n\t"
			     "vpandq %%zmm0,%%zmm7,%%zmm7\n\t"
			     "vpandq %%zmm0,%%zmm13,%%zmm13\n\t"
			     "vpandq %%zmm0,%%zmm15,%%zmm15\n\t"
			     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
			     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
			     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
			     "vpxorq %%zmm15,%%zmm14,%%zmm14\n\t"
			     "vmovdqa64 %0,%%zmm5\n\t"
			     "vmovdqa64 %1,%%zmm7\n\t"
			     "vmovdqa64 %2,%%zmm13\n\t"
			     "vmovdqa64 %3,%%zmm15\n\t"
			     "vpxorq %%zmm5,%%zmm2,%%zmm2\n\t"
			     "vpxorq %%zmm7,%%zmm3,%%zmm3\n\t"
			     "vpxorq %%zmm13,%%zmm10,%%zmm10\n\t"
			     /* NOTE(review): "\n" (not "\n\t") below is
			      * cosmetic only in the emitted assembly. */
			     "vpxorq %%zmm15,%%zmm11,%%zmm11\n"
			     "vpxorq %%zmm5,%%zmm4,%%zmm4\n\t"
			     "vpxorq %%zmm7,%%zmm6,%%zmm6\n\t"
			     "vpxorq %%zmm13,%%zmm12,%%zmm12\n\t"
			     "vpxorq %%zmm15,%%zmm14,%%zmm14"
			     :
			     : "m" (dptr[z][d]), "m" (dptr[z][d+64]),
			       "m" (dptr[z][d+128]), "m" (dptr[z][d+192]));
		}
		/* Stream all four P and Q lanes out non-temporally and
		 * clear each accumulator for the next 256-byte chunk.
		 */
		asm volatile("vmovntdq %%zmm2,%0\n\t"
			     "vpxorq %%zmm2,%%zmm2,%%zmm2\n\t"
			     "vmovntdq %%zmm3,%1\n\t"
			     "vpxorq %%zmm3,%%zmm3,%%zmm3\n\t"
			     "vmovntdq %%zmm10,%2\n\t"
			     "vpxorq %%zmm10,%%zmm10,%%zmm10\n\t"
			     "vmovntdq %%zmm11,%3\n\t"
			     "vpxorq %%zmm11,%%zmm11,%%zmm11\n\t"
			     "vmovntdq %%zmm4,%4\n\t"
			     "vpxorq %%zmm4,%%zmm4,%%zmm4\n\t"
			     "vmovntdq %%zmm6,%5\n\t"
			     "vpxorq %%zmm6,%%zmm6,%%zmm6\n\t"
			     "vmovntdq %%zmm12,%6\n\t"
			     "vpxorq %%zmm12,%%zmm12,%%zmm12\n\t"
			     "vmovntdq %%zmm14,%7\n\t"
			     "vpxorq %%zmm14,%%zmm14,%%zmm14"
			     :
			     : "m" (p[d]), "m" (p[d+64]), "m" (p[d+128]),
			       "m" (p[d+192]), "m" (q[d]), "m" (q[d+64]),
			       "m" (q[d+128]), "m" (q[d+192]));
	}

	/* Order the non-temporal stores before P/Q are consumed */
	asm volatile("sfence" : : : "memory");
	kernel_fpu_end();
}
284 
285 const struct raid6_calls raid6_avx512x4 = {
286 	raid6_avx5124_gen_syndrome,
287 	NULL,                   /* XOR not yet implemented */
288 	raid6_have_avx512,
289 	"avx512x4",
290 	1                       /* Has cache hints */
291 };
292 #endif
293 
294 #endif /* CONFIG_AS_AVX512 */
295