xref: /linux/lib/raid/raid6/x86/sse2.c (revision 769d603fc44f896e7f61de7f0cdb8b78d46bc8c8)
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /* -*- linux-c -*- ------------------------------------------------------- *
3  *
4  *   Copyright 2002 H. Peter Anvin - All Rights Reserved
5  *
6  * ----------------------------------------------------------------------- */
7 
8 /*
9  * raid6/sse2.c
10  *
11  * SSE-2 implementation of RAID-6 syndrome functions
12  *
13  */
14 
15 #include <asm/cpufeature.h>
16 #include <asm/fpu/api.h>
17 #include "algos.h"
18 
19 static const struct raid6_sse_constants {
20 	u64 x1d[2];
21 } raid6_sse_constants  __attribute__((aligned(16))) = {
22 	{ 0x1d1d1d1d1d1d1d1dULL, 0x1d1d1d1d1d1d1d1dULL },
23 };
24 
25 static int raid6_have_sse2(void)
26 {
27 	/* Not really boot_cpu but "all_cpus" */
28 	return boot_cpu_has(X86_FEATURE_MMX) &&
29 		boot_cpu_has(X86_FEATURE_FXSR) &&
30 		boot_cpu_has(X86_FEATURE_XMM) &&
31 		boot_cpu_has(X86_FEATURE_XMM2);
32 }
33 
34 /*
35  * Plain SSE2 implementation
36  */
37 static void raid6_sse21_gen_syndrome(int disks, size_t bytes, void **ptrs)
38 {
39 	u8 **dptr = (u8 **)ptrs;
40 	u8 *p, *q;
41 	int d, z, z0;
42 
43 	z0 = disks - 3;		/* Highest data disk */
44 	p = dptr[z0+1];		/* XOR parity */
45 	q = dptr[z0+2];		/* RS syndrome */
46 
47 	kernel_fpu_begin();
48 
49 	asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0]));
50 	asm volatile("pxor %xmm5,%xmm5");	/* Zero temp */
51 
52 	for ( d = 0 ; d < bytes ; d += 16 ) {
53 		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
54 		asm volatile("movdqa %0,%%xmm2" : : "m" (dptr[z0][d])); /* P[0] */
55 		asm volatile("prefetchnta %0" : : "m" (dptr[z0-1][d]));
56 		asm volatile("movdqa %xmm2,%xmm4"); /* Q[0] */
57 		asm volatile("movdqa %0,%%xmm6" : : "m" (dptr[z0-1][d]));
58 		for ( z = z0-2 ; z >= 0 ; z-- ) {
59 			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
60 			asm volatile("pcmpgtb %xmm4,%xmm5");
61 			asm volatile("paddb %xmm4,%xmm4");
62 			asm volatile("pand %xmm0,%xmm5");
63 			asm volatile("pxor %xmm5,%xmm4");
64 			asm volatile("pxor %xmm5,%xmm5");
65 			asm volatile("pxor %xmm6,%xmm2");
66 			asm volatile("pxor %xmm6,%xmm4");
67 			asm volatile("movdqa %0,%%xmm6" : : "m" (dptr[z][d]));
68 		}
69 		asm volatile("pcmpgtb %xmm4,%xmm5");
70 		asm volatile("paddb %xmm4,%xmm4");
71 		asm volatile("pand %xmm0,%xmm5");
72 		asm volatile("pxor %xmm5,%xmm4");
73 		asm volatile("pxor %xmm5,%xmm5");
74 		asm volatile("pxor %xmm6,%xmm2");
75 		asm volatile("pxor %xmm6,%xmm4");
76 
77 		asm volatile("movntdq %%xmm2,%0" : "=m" (p[d]));
78 		asm volatile("pxor %xmm2,%xmm2");
79 		asm volatile("movntdq %%xmm4,%0" : "=m" (q[d]));
80 		asm volatile("pxor %xmm4,%xmm4");
81 	}
82 
83 	asm volatile("sfence" : : : "memory");
84 	kernel_fpu_end();
85 }
86 
87 
88 static void raid6_sse21_xor_syndrome(int disks, int start, int stop,
89 				     size_t bytes, void **ptrs)
90 {
91 	u8 **dptr = (u8 **)ptrs;
92 	u8 *p, *q;
93 	int d, z, z0;
94 
95 	z0 = stop;		/* P/Q right side optimization */
96 	p = dptr[disks-2];	/* XOR parity */
97 	q = dptr[disks-1];	/* RS syndrome */
98 
99 	kernel_fpu_begin();
100 
101 	asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0]));
102 
103 	for ( d = 0 ; d < bytes ; d += 16 ) {
104 		asm volatile("movdqa %0,%%xmm4" :: "m" (dptr[z0][d]));
105 		asm volatile("movdqa %0,%%xmm2" : : "m" (p[d]));
106 		asm volatile("pxor %xmm4,%xmm2");
107 		/* P/Q data pages */
108 		for ( z = z0-1 ; z >= start ; z-- ) {
109 			asm volatile("pxor %xmm5,%xmm5");
110 			asm volatile("pcmpgtb %xmm4,%xmm5");
111 			asm volatile("paddb %xmm4,%xmm4");
112 			asm volatile("pand %xmm0,%xmm5");
113 			asm volatile("pxor %xmm5,%xmm4");
114 			asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d]));
115 			asm volatile("pxor %xmm5,%xmm2");
116 			asm volatile("pxor %xmm5,%xmm4");
117 		}
118 		/* P/Q left side optimization */
119 		for ( z = start-1 ; z >= 0 ; z-- ) {
120 			asm volatile("pxor %xmm5,%xmm5");
121 			asm volatile("pcmpgtb %xmm4,%xmm5");
122 			asm volatile("paddb %xmm4,%xmm4");
123 			asm volatile("pand %xmm0,%xmm5");
124 			asm volatile("pxor %xmm5,%xmm4");
125 		}
126 		asm volatile("pxor %0,%%xmm4" : : "m" (q[d]));
127 		/* Don't use movntdq for r/w memory area < cache line */
128 		asm volatile("movdqa %%xmm4,%0" : "=m" (q[d]));
129 		asm volatile("movdqa %%xmm2,%0" : "=m" (p[d]));
130 	}
131 
132 	asm volatile("sfence" : : : "memory");
133 	kernel_fpu_end();
134 }
135 
136 const struct raid6_calls raid6_sse2x1 = {
137 	.gen_syndrome	= raid6_sse21_gen_syndrome,
138 	.xor_syndrome	= raid6_sse21_xor_syndrome,
139 	.valid		= raid6_have_sse2,
140 	.name		= "sse2x1",
141 	.priority	= 1,	/* Has cache hints */
142 };
143 
144 /*
145  * Unrolled-by-2 SSE2 implementation
146  */
147 static void raid6_sse22_gen_syndrome(int disks, size_t bytes, void **ptrs)
148 {
149 	u8 **dptr = (u8 **)ptrs;
150 	u8 *p, *q;
151 	int d, z, z0;
152 
153 	z0 = disks - 3;		/* Highest data disk */
154 	p = dptr[z0+1];		/* XOR parity */
155 	q = dptr[z0+2];		/* RS syndrome */
156 
157 	kernel_fpu_begin();
158 
159 	asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0]));
160 	asm volatile("pxor %xmm5,%xmm5"); /* Zero temp */
161 	asm volatile("pxor %xmm7,%xmm7"); /* Zero temp */
162 
163 	/* We uniformly assume a single prefetch covers at least 32 bytes */
164 	for ( d = 0 ; d < bytes ; d += 32 ) {
165 		asm volatile("prefetchnta %0" : : "m" (dptr[z0][d]));
166 		asm volatile("movdqa %0,%%xmm2" : : "m" (dptr[z0][d]));    /* P[0] */
167 		asm volatile("movdqa %0,%%xmm3" : : "m" (dptr[z0][d+16])); /* P[1] */
168 		asm volatile("movdqa %xmm2,%xmm4"); /* Q[0] */
169 		asm volatile("movdqa %xmm3,%xmm6"); /* Q[1] */
170 		for ( z = z0-1 ; z >= 0 ; z-- ) {
171 			asm volatile("prefetchnta %0" : : "m" (dptr[z][d]));
172 			asm volatile("pcmpgtb %xmm4,%xmm5");
173 			asm volatile("pcmpgtb %xmm6,%xmm7");
174 			asm volatile("paddb %xmm4,%xmm4");
175 			asm volatile("paddb %xmm6,%xmm6");
176 			asm volatile("pand %xmm0,%xmm5");
177 			asm volatile("pand %xmm0,%xmm7");
178 			asm volatile("pxor %xmm5,%xmm4");
179 			asm volatile("pxor %xmm7,%xmm6");
180 			asm volatile("movdqa %0,%%xmm5" : : "m" (dptr[z][d]));
181 			asm volatile("movdqa %0,%%xmm7" : : "m" (dptr[z][d+16]));
182 			asm volatile("pxor %xmm5,%xmm2");
183 			asm volatile("pxor %xmm7,%xmm3");
184 			asm volatile("pxor %xmm5,%xmm4");
185 			asm volatile("pxor %xmm7,%xmm6");
186 			asm volatile("pxor %xmm5,%xmm5");
187 			asm volatile("pxor %xmm7,%xmm7");
188 		}
189 		asm volatile("movntdq %%xmm2,%0" : "=m" (p[d]));
190 		asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16]));
191 		asm volatile("movntdq %%xmm4,%0" : "=m" (q[d]));
192 		asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16]));
193 	}
194 
195 	asm volatile("sfence" : : : "memory");
196 	kernel_fpu_end();
197 }
198 
199 static void raid6_sse22_xor_syndrome(int disks, int start, int stop,
200 				     size_t bytes, void **ptrs)
201 {
202 	u8 **dptr = (u8 **)ptrs;
203 	u8 *p, *q;
204 	int d, z, z0;
205 
206 	z0 = stop;		/* P/Q right side optimization */
207 	p = dptr[disks-2];	/* XOR parity */
208 	q = dptr[disks-1];	/* RS syndrome */
209 
210 	kernel_fpu_begin();
211 
212 	asm volatile("movdqa %0,%%xmm0" : : "m" (raid6_sse_constants.x1d[0]));
213 
214 	for ( d = 0 ; d < bytes ; d += 32 ) {
215 		asm volatile("movdqa %0,%%xmm4" :: "m" (dptr[z0][d]));
216 		asm volatile("movdqa %0,%%xmm6" :: "m" (dptr[z0][d+16]));
217 		asm volatile("movdqa %0,%%xmm2" : : "m" (p[d]));
218 		asm volatile("movdqa %0,%%xmm3" : : "m" (p[d+16]));
219 		asm volatile("pxor %xmm4,%xmm2");
220 		asm volatile("pxor %xmm6,%xmm3");
221 		/* P/Q data pages */
222 		for ( z = z0-1 ; z >= start ; z-- ) {
223 			asm volatile("pxor %xmm5,%xmm5");
224 			asm volatile("pxor %xmm7,%xmm7");
225 			asm volatile("pcmpgtb %xmm4,%xmm5");
226 			asm volatile("pcmpgtb %xmm6,%xmm7");
227 			asm volatile("paddb %xmm4,%xmm4");
228 			asm volatile("paddb %xmm6,%xmm6");
229 			asm volatile("pand %xmm0,%xmm5");
230 			asm volatile("pand %xmm0,%xmm7");
231 			asm volatile("pxor %xmm5,%xmm4");
232 			asm volatile("pxor %xmm7,%xmm6");
233 			asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d]));
234 			asm volatile("movdqa %0,%%xmm7" :: "m" (dptr[z][d+16]));
235 			asm volatile("pxor %xmm5,%xmm2");
236 			asm volatile("pxor %xmm7,%xmm3");
237 			asm volatile("pxor %xmm5,%xmm4");
238 			asm volatile("pxor %xmm7,%xmm6");
239 		}
240 		/* P/Q left side optimization */
241 		for ( z = start-1 ; z >= 0 ; z-- ) {
242 			asm volatile("pxor %xmm5,%xmm5");
243 			asm volatile("pxor %xmm7,%xmm7");
244 			asm volatile("pcmpgtb %xmm4,%xmm5");
245 			asm volatile("pcmpgtb %xmm6,%xmm7");
246 			asm volatile("paddb %xmm4,%xmm4");
247 			asm volatile("paddb %xmm6,%xmm6");
248 			asm volatile("pand %xmm0,%xmm5");
249 			asm volatile("pand %xmm0,%xmm7");
250 			asm volatile("pxor %xmm5,%xmm4");
251 			asm volatile("pxor %xmm7,%xmm6");
252 		}
253 		asm volatile("pxor %0,%%xmm4" : : "m" (q[d]));
254 		asm volatile("pxor %0,%%xmm6" : : "m" (q[d+16]));
255 		/* Don't use movntdq for r/w memory area < cache line */
256 		asm volatile("movdqa %%xmm4,%0" : "=m" (q[d]));
257 		asm volatile("movdqa %%xmm6,%0" : "=m" (q[d+16]));
258 		asm volatile("movdqa %%xmm2,%0" : "=m" (p[d]));
259 		asm volatile("movdqa %%xmm3,%0" : "=m" (p[d+16]));
260 	}
261 
262 	asm volatile("sfence" : : : "memory");
263 	kernel_fpu_end();
264 }
265 
266 const struct raid6_calls raid6_sse2x2 = {
267 	.gen_syndrome	= raid6_sse22_gen_syndrome,
268 	.xor_syndrome	= raid6_sse22_xor_syndrome,
269 	.valid		= raid6_have_sse2,
270 	.name		= "sse2x2",
271 	.priority	= 1,	/* Has cache hints */
272 };
273 
274 #ifdef CONFIG_X86_64
275 
276 /*
277  * Unrolled-by-4 SSE2 implementation
278  */
279 static void raid6_sse24_gen_syndrome(int disks, size_t bytes, void **ptrs)
280 {
281 	u8 **dptr = (u8 **)ptrs;
282 	u8 *p, *q;
283 	int d, z, z0;
284 
285 	z0 = disks - 3;		/* Highest data disk */
286 	p = dptr[z0+1];		/* XOR parity */
287 	q = dptr[z0+2];		/* RS syndrome */
288 
289 	kernel_fpu_begin();
290 
291 	asm volatile("movdqa %0,%%xmm0" :: "m" (raid6_sse_constants.x1d[0]));
292 	asm volatile("pxor %xmm2,%xmm2");	/* P[0] */
293 	asm volatile("pxor %xmm3,%xmm3");	/* P[1] */
294 	asm volatile("pxor %xmm4,%xmm4"); 	/* Q[0] */
295 	asm volatile("pxor %xmm5,%xmm5");	/* Zero temp */
296 	asm volatile("pxor %xmm6,%xmm6"); 	/* Q[1] */
297 	asm volatile("pxor %xmm7,%xmm7"); 	/* Zero temp */
298 	asm volatile("pxor %xmm10,%xmm10");	/* P[2] */
299 	asm volatile("pxor %xmm11,%xmm11");	/* P[3] */
300 	asm volatile("pxor %xmm12,%xmm12"); 	/* Q[2] */
301 	asm volatile("pxor %xmm13,%xmm13");	/* Zero temp */
302 	asm volatile("pxor %xmm14,%xmm14"); 	/* Q[3] */
303 	asm volatile("pxor %xmm15,%xmm15"); 	/* Zero temp */
304 
305 	for ( d = 0 ; d < bytes ; d += 64 ) {
306 		for ( z = z0 ; z >= 0 ; z-- ) {
307 			/* The second prefetch seems to improve performance... */
308 			asm volatile("prefetchnta %0" :: "m" (dptr[z][d]));
309 			asm volatile("prefetchnta %0" :: "m" (dptr[z][d+32]));
310 			asm volatile("pcmpgtb %xmm4,%xmm5");
311 			asm volatile("pcmpgtb %xmm6,%xmm7");
312 			asm volatile("pcmpgtb %xmm12,%xmm13");
313 			asm volatile("pcmpgtb %xmm14,%xmm15");
314 			asm volatile("paddb %xmm4,%xmm4");
315 			asm volatile("paddb %xmm6,%xmm6");
316 			asm volatile("paddb %xmm12,%xmm12");
317 			asm volatile("paddb %xmm14,%xmm14");
318 			asm volatile("pand %xmm0,%xmm5");
319 			asm volatile("pand %xmm0,%xmm7");
320 			asm volatile("pand %xmm0,%xmm13");
321 			asm volatile("pand %xmm0,%xmm15");
322 			asm volatile("pxor %xmm5,%xmm4");
323 			asm volatile("pxor %xmm7,%xmm6");
324 			asm volatile("pxor %xmm13,%xmm12");
325 			asm volatile("pxor %xmm15,%xmm14");
326 			asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d]));
327 			asm volatile("movdqa %0,%%xmm7" :: "m" (dptr[z][d+16]));
328 			asm volatile("movdqa %0,%%xmm13" :: "m" (dptr[z][d+32]));
329 			asm volatile("movdqa %0,%%xmm15" :: "m" (dptr[z][d+48]));
330 			asm volatile("pxor %xmm5,%xmm2");
331 			asm volatile("pxor %xmm7,%xmm3");
332 			asm volatile("pxor %xmm13,%xmm10");
333 			asm volatile("pxor %xmm15,%xmm11");
334 			asm volatile("pxor %xmm5,%xmm4");
335 			asm volatile("pxor %xmm7,%xmm6");
336 			asm volatile("pxor %xmm13,%xmm12");
337 			asm volatile("pxor %xmm15,%xmm14");
338 			asm volatile("pxor %xmm5,%xmm5");
339 			asm volatile("pxor %xmm7,%xmm7");
340 			asm volatile("pxor %xmm13,%xmm13");
341 			asm volatile("pxor %xmm15,%xmm15");
342 		}
343 		asm volatile("movntdq %%xmm2,%0" : "=m" (p[d]));
344 		asm volatile("pxor %xmm2,%xmm2");
345 		asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16]));
346 		asm volatile("pxor %xmm3,%xmm3");
347 		asm volatile("movntdq %%xmm10,%0" : "=m" (p[d+32]));
348 		asm volatile("pxor %xmm10,%xmm10");
349 		asm volatile("movntdq %%xmm11,%0" : "=m" (p[d+48]));
350 		asm volatile("pxor %xmm11,%xmm11");
351 		asm volatile("movntdq %%xmm4,%0" : "=m" (q[d]));
352 		asm volatile("pxor %xmm4,%xmm4");
353 		asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16]));
354 		asm volatile("pxor %xmm6,%xmm6");
355 		asm volatile("movntdq %%xmm12,%0" : "=m" (q[d+32]));
356 		asm volatile("pxor %xmm12,%xmm12");
357 		asm volatile("movntdq %%xmm14,%0" : "=m" (q[d+48]));
358 		asm volatile("pxor %xmm14,%xmm14");
359 	}
360 
361 	asm volatile("sfence" : : : "memory");
362 	kernel_fpu_end();
363 }
364 
365 static void raid6_sse24_xor_syndrome(int disks, int start, int stop,
366 				     size_t bytes, void **ptrs)
367 {
368 	u8 **dptr = (u8 **)ptrs;
369 	u8 *p, *q;
370 	int d, z, z0;
371 
372 	z0 = stop;		/* P/Q right side optimization */
373 	p = dptr[disks-2];	/* XOR parity */
374 	q = dptr[disks-1];	/* RS syndrome */
375 
376 	kernel_fpu_begin();
377 
378 	asm volatile("movdqa %0,%%xmm0" :: "m" (raid6_sse_constants.x1d[0]));
379 
380 	for ( d = 0 ; d < bytes ; d += 64 ) {
381 		asm volatile("movdqa %0,%%xmm4" :: "m" (dptr[z0][d]));
382 		asm volatile("movdqa %0,%%xmm6" :: "m" (dptr[z0][d+16]));
383 		asm volatile("movdqa %0,%%xmm12" :: "m" (dptr[z0][d+32]));
384 		asm volatile("movdqa %0,%%xmm14" :: "m" (dptr[z0][d+48]));
385 		asm volatile("movdqa %0,%%xmm2" : : "m" (p[d]));
386 		asm volatile("movdqa %0,%%xmm3" : : "m" (p[d+16]));
387 		asm volatile("movdqa %0,%%xmm10" : : "m" (p[d+32]));
388 		asm volatile("movdqa %0,%%xmm11" : : "m" (p[d+48]));
389 		asm volatile("pxor %xmm4,%xmm2");
390 		asm volatile("pxor %xmm6,%xmm3");
391 		asm volatile("pxor %xmm12,%xmm10");
392 		asm volatile("pxor %xmm14,%xmm11");
393 		/* P/Q data pages */
394 		for ( z = z0-1 ; z >= start ; z-- ) {
395 			asm volatile("prefetchnta %0" :: "m" (dptr[z][d]));
396 			asm volatile("prefetchnta %0" :: "m" (dptr[z][d+32]));
397 			asm volatile("pxor %xmm5,%xmm5");
398 			asm volatile("pxor %xmm7,%xmm7");
399 			asm volatile("pxor %xmm13,%xmm13");
400 			asm volatile("pxor %xmm15,%xmm15");
401 			asm volatile("pcmpgtb %xmm4,%xmm5");
402 			asm volatile("pcmpgtb %xmm6,%xmm7");
403 			asm volatile("pcmpgtb %xmm12,%xmm13");
404 			asm volatile("pcmpgtb %xmm14,%xmm15");
405 			asm volatile("paddb %xmm4,%xmm4");
406 			asm volatile("paddb %xmm6,%xmm6");
407 			asm volatile("paddb %xmm12,%xmm12");
408 			asm volatile("paddb %xmm14,%xmm14");
409 			asm volatile("pand %xmm0,%xmm5");
410 			asm volatile("pand %xmm0,%xmm7");
411 			asm volatile("pand %xmm0,%xmm13");
412 			asm volatile("pand %xmm0,%xmm15");
413 			asm volatile("pxor %xmm5,%xmm4");
414 			asm volatile("pxor %xmm7,%xmm6");
415 			asm volatile("pxor %xmm13,%xmm12");
416 			asm volatile("pxor %xmm15,%xmm14");
417 			asm volatile("movdqa %0,%%xmm5" :: "m" (dptr[z][d]));
418 			asm volatile("movdqa %0,%%xmm7" :: "m" (dptr[z][d+16]));
419 			asm volatile("movdqa %0,%%xmm13" :: "m" (dptr[z][d+32]));
420 			asm volatile("movdqa %0,%%xmm15" :: "m" (dptr[z][d+48]));
421 			asm volatile("pxor %xmm5,%xmm2");
422 			asm volatile("pxor %xmm7,%xmm3");
423 			asm volatile("pxor %xmm13,%xmm10");
424 			asm volatile("pxor %xmm15,%xmm11");
425 			asm volatile("pxor %xmm5,%xmm4");
426 			asm volatile("pxor %xmm7,%xmm6");
427 			asm volatile("pxor %xmm13,%xmm12");
428 			asm volatile("pxor %xmm15,%xmm14");
429 		}
430 		asm volatile("prefetchnta %0" :: "m" (q[d]));
431 		asm volatile("prefetchnta %0" :: "m" (q[d+32]));
432 		/* P/Q left side optimization */
433 		for ( z = start-1 ; z >= 0 ; z-- ) {
434 			asm volatile("pxor %xmm5,%xmm5");
435 			asm volatile("pxor %xmm7,%xmm7");
436 			asm volatile("pxor %xmm13,%xmm13");
437 			asm volatile("pxor %xmm15,%xmm15");
438 			asm volatile("pcmpgtb %xmm4,%xmm5");
439 			asm volatile("pcmpgtb %xmm6,%xmm7");
440 			asm volatile("pcmpgtb %xmm12,%xmm13");
441 			asm volatile("pcmpgtb %xmm14,%xmm15");
442 			asm volatile("paddb %xmm4,%xmm4");
443 			asm volatile("paddb %xmm6,%xmm6");
444 			asm volatile("paddb %xmm12,%xmm12");
445 			asm volatile("paddb %xmm14,%xmm14");
446 			asm volatile("pand %xmm0,%xmm5");
447 			asm volatile("pand %xmm0,%xmm7");
448 			asm volatile("pand %xmm0,%xmm13");
449 			asm volatile("pand %xmm0,%xmm15");
450 			asm volatile("pxor %xmm5,%xmm4");
451 			asm volatile("pxor %xmm7,%xmm6");
452 			asm volatile("pxor %xmm13,%xmm12");
453 			asm volatile("pxor %xmm15,%xmm14");
454 		}
455 		asm volatile("movntdq %%xmm2,%0" : "=m" (p[d]));
456 		asm volatile("movntdq %%xmm3,%0" : "=m" (p[d+16]));
457 		asm volatile("movntdq %%xmm10,%0" : "=m" (p[d+32]));
458 		asm volatile("movntdq %%xmm11,%0" : "=m" (p[d+48]));
459 		asm volatile("pxor %0,%%xmm4" : : "m" (q[d]));
460 		asm volatile("pxor %0,%%xmm6" : : "m" (q[d+16]));
461 		asm volatile("pxor %0,%%xmm12" : : "m" (q[d+32]));
462 		asm volatile("pxor %0,%%xmm14" : : "m" (q[d+48]));
463 		asm volatile("movntdq %%xmm4,%0" : "=m" (q[d]));
464 		asm volatile("movntdq %%xmm6,%0" : "=m" (q[d+16]));
465 		asm volatile("movntdq %%xmm12,%0" : "=m" (q[d+32]));
466 		asm volatile("movntdq %%xmm14,%0" : "=m" (q[d+48]));
467 	}
468 	asm volatile("sfence" : : : "memory");
469 	kernel_fpu_end();
470 }
471 
472 
473 const struct raid6_calls raid6_sse2x4 = {
474 	.gen_syndrome	= raid6_sse24_gen_syndrome,
475 	.xor_syndrome	= raid6_sse24_xor_syndrome,
476 	.valid		= raid6_have_sse2,
477 	.name		= "sse2x4",
478 	.priority	= 1,	/* Has cache hints */
479 };
480 
481 #endif /* CONFIG_X86_64 */
482