/* SPDX-License-Identifier: GPL-2.0-only */
#ifndef _ASM_X86_XOR_AVX_H
#define _ASM_X86_XOR_AVX_H

/*
 * Optimized RAID-5 checksumming functions for AVX
 *
 * Copyright (C) 2012 Intel Corporation
 * Author: Jim Kukunas <james.t.kukunas@linux.intel.com>
 *
 * Based on Ingo Molnar and Zach Brown's respective MMX and SSE routines
 */

#ifdef CONFIG_AS_AVX

#include <linux/compiler.h>
#include <asm/fpu/api.h>

/*
 * BLOCK(i, reg) is (re)defined inside each xor_avx_N() below to XOR one
 * 32-byte chunk at byte offset i into p0, using YMM register <reg>.
 * BLOCK4()/BLOCK16() expand that to 4 resp. 16 consecutive chunks, so a
 * single BLOCK16() pass covers 512 bytes of each source buffer while
 * cycling through only ymm0-ymm3.
 */
#define BLOCK4(i) \
	BLOCK(32 * i, 0) \
	BLOCK(32 * (i + 1), 1) \
	BLOCK(32 * (i + 2), 2) \
	BLOCK(32 * (i + 3), 3)

#define BLOCK16() \
	BLOCK4(0) \
	BLOCK4(4) \
	BLOCK4(8) \
	BLOCK4(12)

/*
 * xor_avx_2 - XOR @p1 into @p0, @bytes at a time.
 *
 * Contract (shared by all xor_avx_N below): @bytes is a multiple of 512
 * (bytes >> 9 full passes are executed; any remainder is ignored) and the
 * buffers are 32-byte aligned, since vmovdqa faults on unaligned operands.
 * All YMM use must sit between kernel_fpu_begin()/kernel_fpu_end(), which
 * save and restore the current task's extended FPU state.
 */
static void xor_avx_2(unsigned long bytes, unsigned long *p0, unsigned long *p1)
{
	unsigned long lines = bytes >> 9;	/* 512 bytes per pass */

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
	}

	kernel_fpu_end();
}

/* xor_avx_3 - XOR @p1 and @p2 into @p0; same contract as xor_avx_2(). */
static void xor_avx_3(unsigned long bytes, unsigned long *p0, unsigned long *p1,
	unsigned long *p2)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
	}

	kernel_fpu_end();
}

/* xor_avx_4 - XOR @p1, @p2 and @p3 into @p0; same contract as xor_avx_2(). */
static void xor_avx_4(unsigned long bytes, unsigned long *p0, unsigned long *p1,
	unsigned long *p2, unsigned long *p3)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p3[i / sizeof(*p3)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
		p3 = (unsigned long *)((uintptr_t)p3 + 512);
	}

	kernel_fpu_end();
}

/* xor_avx_5 - XOR @p1..@p4 into @p0; same contract as xor_avx_2(). */
static void xor_avx_5(unsigned long bytes, unsigned long *p0, unsigned long *p1,
	unsigned long *p2, unsigned long *p3, unsigned long *p4)
{
	unsigned long lines = bytes >> 9;

	kernel_fpu_begin();

	while (lines--) {
#undef BLOCK
#define BLOCK(i, reg) \
do { \
	asm volatile("vmovdqa %0, %%ymm" #reg : : "m" (p4[i / sizeof(*p4)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p3[i / sizeof(*p3)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p2[i / sizeof(*p2)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p1[i / sizeof(*p1)])); \
	asm volatile("vxorps %0, %%ymm" #reg ", %%ymm" #reg : : \
		"m" (p0[i / sizeof(*p0)])); \
	asm volatile("vmovdqa %%ymm" #reg ", %0" : \
		"=m" (p0[i / sizeof(*p0)])); \
} while (0);

		BLOCK16()

		p0 = (unsigned long *)((uintptr_t)p0 + 512);
		p1 = (unsigned long *)((uintptr_t)p1 + 512);
		p2 = (unsigned long *)((uintptr_t)p2 + 512);
		p3 = (unsigned long *)((uintptr_t)p3 + 512);
		p4 = (unsigned long *)((uintptr_t)p4 + 512);
	}

	kernel_fpu_end();
}

/* Template entry registered with the xor benchmark/selection machinery. */
static struct xor_block_template xor_block_avx = {
	.name = "avx",
	.do_2 = xor_avx_2,
	.do_3 = xor_avx_3,
	.do_4 = xor_avx_4,
	.do_5 = xor_avx_5,
};

/*
 * AVX is only usable when the CPU supports it AND the OS has enabled
 * XSAVE/XRSTOR for the YMM state (OSXSAVE), hence the double feature check.
 */
#define AVX_XOR_SPEED \
do { \
	if (boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE)) \
		xor_speed(&xor_block_avx); \
} while (0)

#define AVX_SELECT(FASTEST) \
	(boot_cpu_has(X86_FEATURE_AVX) && boot_cpu_has(X86_FEATURE_OSXSAVE) ? &xor_block_avx : FASTEST)

#else

/* No AVX assembler support: compile to no-ops and keep the prior choice. */
#define AVX_XOR_SPEED {}

#define AVX_SELECT(FASTEST) (FASTEST)

#endif
#endif