/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef _ASM_X86_XOR_AVX_H
#define _ASM_X86_XOR_AVX_H

/*
 * Optimized RAID-5 checksumming functions for AVX
 *
 * Copyright (C) 2012 Intel Corporation
 * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
 *
 * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines
 */

#include <linux/compiler.h>
#include <asm/fpu/api.h>

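/*
 * Each BLOCK(offset, reg) XORs one 32-byte chunk through YMM register
 * "reg".  BLOCK4() strings four of them together (128 bytes, ymm0-ymm3)
 * and BLOCK16() covers a full 512-byte line per loop iteration.
 */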
#define BLOCK4(i) \
		BLOCK(32 * i, 0) \
		BLOCK(32 * (i + 1), 1) \
		BLOCK(32 * (i + 2), 2) \
		BLOCK(32 * (i + 3), 3)

#define BLOCK16() \
		BLOCK4(0) \
		BLOCK4(4) \
		BLOCK4(8) \
		BLOCK4(12)

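/*
 * XOR one source buffer into the destination, 512 bytes at a time
 * (hence "lines = bytes >> 9").  The YMM registers are only usable
 * between kernel_fpu_begin() and kernel_fpu_end().
 */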
static void xor_avx_2(unsigned long bytes, unsigned long * __restrict p0,
		      const unsigned long * __restrict p1)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
	}

	kernel_fpu_end();
}

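/* As xor_avx_2(), but with two source buffers. */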
static void xor_avx_3(unsigned long bytes, unsigned long * __restrict p0,
		      const unsigned long * __restrict p1,
		      const unsigned long * __restrict p2)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
	}

	kernel_fpu_end();
}

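/* As xor_avx_2(), but with three source buffers. */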
static void xor_avx_4(unsigned long bytes, unsigned long * __restrict p0,
		      const unsigned long * __restrict p1,
		      const unsigned long * __restrict p2,
		      const unsigned long * __restrict p3)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
		p3 = (unsigned long *)((uintptr_t)p3 + 512);
	}

	kernel_fpu_end();
}

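/* As xor_avx_2(), but with four source buffers. */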
static void xor_avx_5(unsigned long bytes, unsigned long * __restrict p0,
		      const unsigned long * __restrict p1,
		      const unsigned long * __restrict p2,
		      const unsigned long * __restrict p3,
		      const unsigned long * __restrict p4)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p3[i / sizeof(*p3)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
		p3 = (unsigned long *)((uintptr_t)p3 + 512);
		p4 = (unsigned long *)((uintptr_t)p4 + 512);
	}

	kernel_fpu_end();
}

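/*
 * Template registered with the kernel's XOR benchmarking code via
 * AVX_XOR_SPEED below; the fastest registered implementation is then
 * used for RAID-5 checksumming.
 */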
static struct xor_block_template xor_block_avx = {
	.name = "avx",
	.do_2 = xor_avx_2,
	.do_3 = xor_avx_3,
	.do_4 = xor_avx_4,
	.do_5 = xor_avx_5,
};

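/*
 * AVX is only usable when the OS has enabled YMM state saving, so both
 * AVX and OSXSAVE must be advertised before this template is benchmarked
 * (AVX_XOR_SPEED) or selected (AVX_SELECT).
 */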
#define AVX_XOR_SPEED \
do { \
	if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE)) \
		xor_speed(&xor_block_avx); \
} while (0)

#define AVX_SELECT(FASTEST) \
	(boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE) ? &xor_block_avx : FASTEST)

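/*
 * Illustrative only: the per-arch <asm/xor.h> wrapper is expected to use
 * these macros roughly as sketched below when building its template list
 * (the other template names here are examples, not part of this header):
 *
 *	#define XOR_TRY_TEMPLATES		\
 *	do {					\
 *		AVX_XOR_SPEED;			\
 *		xor_speed(&xor_block_sse);	\
 *	} while (0)
 *
 *	#define XOR_SELECT_TEMPLATE(FASTEST)	AVX_SELECT(FASTEST)
 */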
#endif