/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef _ASM_X86_XOR_AVX_H
#define _ASM_X86_XOR_AVX_H

/*
 * Optimized RAID-5 checksumming functions for AVX
 *
 * Copyright (C) 2012 Intel Corporation
 * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
 *
 * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines
 */

#ifdef CONFIG_AS_AVX

#include <linux/compiler.h>
#include <asm/fpu/api.h>

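/*
 * BLOCK(off, reg) handles one 32-byte (ymm-sized) chunk at byte offset 'off'
 * using register %ymm<reg>.  BLOCK4() unrolls four such chunks across
 * %ymm0-%ymm3, and BLOCK16() unrolls sixteen, so one expansion covers a
 * full 512-byte line per loop iteration.
 */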
#define BLOCK4(i) \
		BLOCK(32 * i, 0) \
		BLOCK(32 * (i + 1), 1) \
		BLOCK(32 * (i + 2), 2) \
		BLOCK(32 * (i + 3), 3)

#define BLOCK16() \
		BLOCK4(0) \
		BLOCK4(4) \
		BLOCK4(8) \
		BLOCK4(12)

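/*
 * Each xor_avx_N() routine XORs N - 1 source buffers into p0, 512 bytes at
 * a time; 'bytes' is expected to be a multiple of 512 (hence 'bytes >> 9'
 * lines).  The ymm registers are only used between kernel_fpu_begin() and
 * kernel_fpu_end(), which make it safe for kernel code to touch the
 * FPU/SIMD state.
 */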
static void xor_avx_2(unsigned long bytes, unsigned long *p0, unsigned long *p1)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
	}

	kernel_fpu_end();
}

static void xor_avx_3(unsigned long bytes, unsigned long *p0, unsigned long *p1,
	unsigned long *p2)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
	}

	kernel_fpu_end();
}

static void xor_avx_4(unsigned long bytes, unsigned long *p0, unsigned long *p1,
	unsigned long *p2, unsigned long *p3)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
		p3 = (unsigned long *)((uintptr_t)p3 + 512);
	}

	kernel_fpu_end();
}

static void xor_avx_5(unsigned long bytes, unsigned long *p0, unsigned long *p1,
	unsigned long *p2, unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p3[i / sizeof(*p3)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
		p3 = (unsigned long *)((uintptr_t)p3 + 512);
		p4 = (unsigned long *)((uintptr_t)p4 + 512);
	}

	kernel_fpu_end();
}

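/*
 * Template describing the AVX routines to the generic RAID-5 xor code,
 * which benchmarks the registered templates and picks an implementation
 * at boot.
 */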
static struct xor_block_template xor_block_avx = {
	.name = "avx",
	.do_2 = xor_avx_2,
	.do_3 = xor_avx_3,
	.do_4 = xor_avx_4,
	.do_5 = xor_avx_5,
};

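/*
 * AVX is only usable when the CPU advertises it and the OS has enabled the
 * extended register state (OSXSAVE), hence the double feature check.
 * AVX_XOR_SPEED adds the AVX template to the xor calibration run, and
 * AVX_SELECT prefers it over the measured fastest template whenever AVX
 * is available.
 */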
#define AVX_XOR_SPEED \
do { \
	if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE)) \
		xor_speed(&xor_block_avx); \
} while (0)

#define AVX_SELECT(FASTEST) \
	(boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE) ? &xor_block_avx : FASTEST)

#else

/* Assembler lacks AVX support: stub out the AVX hooks. */
#define AVX_XOR_SPEED {}

#define AVX_SELECT(FASTEST) (FASTEST)

#endif /* CONFIG_AS_AVX */
#endif /* _ASM_X86_XOR_AVX_H */