xref: /linux/arch/x86/include/asm/xor_avx.h (revision 26b0d14106954ae46d2f4f7eec3481828a210f7d)
#ifndef _ASM_X86_XOR_AVX_H
#define _ASM_X86_XOR_AVX_H

/*
 * Optimized RAID-5 checksumming functions for AVX
 *
 * Copyright (C) 2012 Intel Corporation
 * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
 *
 * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; version 2
 * of the License.
 */

#ifdef CONFIG_AS_AVX

#include <linux/compiler.h>
#include <asm/i387.h>

#define ALIGN32 __aligned(32)

#define YMM_SAVED_REGS 4

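/*
 * YMMS_SAVE/YMMS_RESTORE bracket the XOR loops: preemption is disabled,
 * CR0.TS is cleared with clts() so that touching the YMM registers cannot
 * fault, and ymm0-ymm3 are spilled to the caller's 32-byte-aligned
 * ymm_save[] buffer.  YMMS_RESTORE undoes this in reverse order after an
 * sfence, writes back the saved CR0 value and re-enables preemption.
 * Both macros expect 'cr0' and 'ymm_save' to be in scope at the
 * expansion site.
 */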
#define YMMS_SAVE \
do { \
	preempt_disable(); \
	cr0 = read_cr0(); \
	clts(); \
	asm volatile("vmovaps %%ymm0, %0" : "=m" (ymm_save[0]) : : "memory"); \
	asm volatile("vmovaps %%ymm1, %0" : "=m" (ymm_save[32]) : : "memory"); \
	asm volatile("vmovaps %%ymm2, %0" : "=m" (ymm_save[64]) : : "memory"); \
	asm volatile("vmovaps %%ymm3, %0" : "=m" (ymm_save[96]) : : "memory"); \
} while (0);

#define YMMS_RESTORE \
do { \
	asm volatile("sfence" : : : "memory"); \
	asm volatile("vmovaps %0, %%ymm3" : : "m" (ymm_save[96])); \
	asm volatile("vmovaps %0, %%ymm2" : : "m" (ymm_save[64])); \
	asm volatile("vmovaps %0, %%ymm1" : : "m" (ymm_save[32])); \
	asm volatile("vmovaps %0, %%ymm0" : : "m" (ymm_save[0])); \
	write_cr0(cr0); \
	preempt_enable(); \
} while (0);

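/*
 * BLOCK(offset, reg) is (re)defined by each xor_avx_N() routine below to
 * XOR 32 bytes at 'offset' through %%ymm<reg>.  BLOCK4()/BLOCK16() simply
 * unroll it: one BLOCK16() expands BLOCK() sixteen times, cycling through
 * ymm0-ymm3, and therefore covers one 512-byte chunk per loop iteration.
 */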
#define BLOCK4(i) \
		BLOCK(32 * i, 0) \
		BLOCK(32 * (i + 1), 1) \
		BLOCK(32 * (i + 2), 2) \
		BLOCK(32 * (i + 3), 3)

#define BLOCK16() \
		BLOCK4(0) \
		BLOCK4(4) \
		BLOCK4(8) \
		BLOCK4(12)

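/*
 * Each xor_avx_N() XORs N buffers together, accumulating the result in p0.
 * The byte count is processed in 512-byte chunks (lines = bytes >> 9); for
 * every 32-byte slice, BLOCK() loads one source into a YMM register, folds
 * the remaining sources in with vxorps and writes the result back to p0.
 */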
static void xor_avx_2(unsigned long bytes, unsigned long *p0, unsigned long *p1)
{
	unsigned long cr0, lines = bytes >> 9;
	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;

	YMMS_SAVE

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
	}

	YMMS_RESTORE
}

static void xor_avx_3(unsigned long bytes, unsigned long *p0, unsigned long *p1,
	unsigned long *p2)
{
	unsigned long cr0, lines = bytes >> 9;
	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;

	YMMS_SAVE

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
	}

	YMMS_RESTORE
}

static void xor_avx_4(unsigned long bytes, unsigned long *p0, unsigned long *p1,
	unsigned long *p2, unsigned long *p3)
{
	unsigned long cr0, lines = bytes >> 9;
	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;

	YMMS_SAVE

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16();

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
		p3 = (unsigned long *)((uintptr_t)p3 + 512);
	}

	YMMS_RESTORE
}

static void xor_avx_5(unsigned long bytes, unsigned long *p0, unsigned long *p1,
	unsigned long *p2, unsigned long *p3, unsigned long *p4)
{
	unsigned long cr0, lines = bytes >> 9;
	char ymm_save[32 * YMM_SAVED_REGS] ALIGN32;

	YMMS_SAVE

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p3[i / sizeof(*p3)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
		p3 = (unsigned long *)((uintptr_t)p3 + 512);
		p4 = (unsigned long *)((uintptr_t)p4 + 512);
	}

	YMMS_RESTORE
}

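/*
 * Template describing these routines to the generic RAID-5 XOR code, so
 * they can be benchmarked against the other registered implementations
 * and selected if fastest.
 */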
static struct xor_block_template xor_block_avx = {
	.name = "avx",
	.do_2 = xor_avx_2,
	.do_3 = xor_avx_3,
	.do_4 = xor_avx_4,
	.do_5 = xor_avx_5,
};

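/*
 * AVX_XOR_SPEED feeds the AVX template into the benchmark run when the CPU
 * advertises AVX; AVX_SELECT() prefers the AVX template over the otherwise
 * fastest choice on such CPUs.  Typical use from an arch <asm/xor.h> looks
 * roughly like this (illustrative sketch, not a literal copy of that
 * header):
 *
 *	#undef XOR_TRY_TEMPLATES
 *	#define XOR_TRY_TEMPLATES		\
 *	do {					\
 *		AVX_XOR_SPEED;			\
 *		...				\
 *	} while (0)
 *
 *	#define XOR_SELECT_TEMPLATE(FASTEST)	AVX_SELECT(FASTEST)
 */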
#define AVX_XOR_SPEED \
do { \
	if (cpu_has_avx) \
		xor_speed(&xor_block_avx); \
} while (0)

#define AVX_SELECT(FASTEST) \
	(cpu_has_avx ? &xor_block_avx : FASTEST)

#else

#define AVX_XOR_SPEED {}

#define AVX_SELECT(FASTEST) (FASTEST)

#endif /* CONFIG_AS_AVX */
#endif /* _ASM_X86_XOR_AVX_H */