1*75ded18aSWANG Xuerui // SPDX-License-Identifier: GPL-2.0-or-later 2*75ded18aSWANG Xuerui /* 3*75ded18aSWANG Xuerui * Copyright (C) 2023 WANG Xuerui <git@xen0n.name> 4*75ded18aSWANG Xuerui * 5*75ded18aSWANG Xuerui * Template for XOR operations, instantiated in xor_simd.c. 6*75ded18aSWANG Xuerui * 7*75ded18aSWANG Xuerui * Expected preprocessor definitions: 8*75ded18aSWANG Xuerui * 9*75ded18aSWANG Xuerui * - LINE_WIDTH 10*75ded18aSWANG Xuerui * - XOR_FUNC_NAME(nr) 11*75ded18aSWANG Xuerui * - LD_INOUT_LINE(buf) 12*75ded18aSWANG Xuerui * - LD_AND_XOR_LINE(buf) 13*75ded18aSWANG Xuerui * - ST_LINE(buf) 14*75ded18aSWANG Xuerui */ 15*75ded18aSWANG Xuerui 16*75ded18aSWANG Xuerui void XOR_FUNC_NAME(2)(unsigned long bytes, 17*75ded18aSWANG Xuerui unsigned long * __restrict v1, 18*75ded18aSWANG Xuerui const unsigned long * __restrict v2) 19*75ded18aSWANG Xuerui { 20*75ded18aSWANG Xuerui unsigned long lines = bytes / LINE_WIDTH; 21*75ded18aSWANG Xuerui 22*75ded18aSWANG Xuerui do { 23*75ded18aSWANG Xuerui __asm__ __volatile__ ( 24*75ded18aSWANG Xuerui LD_INOUT_LINE(v1) 25*75ded18aSWANG Xuerui LD_AND_XOR_LINE(v2) 26*75ded18aSWANG Xuerui ST_LINE(v1) 27*75ded18aSWANG Xuerui : : [v1] "r"(v1), [v2] "r"(v2) : "memory" 28*75ded18aSWANG Xuerui ); 29*75ded18aSWANG Xuerui 30*75ded18aSWANG Xuerui v1 += LINE_WIDTH / sizeof(unsigned long); 31*75ded18aSWANG Xuerui v2 += LINE_WIDTH / sizeof(unsigned long); 32*75ded18aSWANG Xuerui } while (--lines > 0); 33*75ded18aSWANG Xuerui } 34*75ded18aSWANG Xuerui 35*75ded18aSWANG Xuerui void XOR_FUNC_NAME(3)(unsigned long bytes, 36*75ded18aSWANG Xuerui unsigned long * __restrict v1, 37*75ded18aSWANG Xuerui const unsigned long * __restrict v2, 38*75ded18aSWANG Xuerui const unsigned long * __restrict v3) 39*75ded18aSWANG Xuerui { 40*75ded18aSWANG Xuerui unsigned long lines = bytes / LINE_WIDTH; 41*75ded18aSWANG Xuerui 42*75ded18aSWANG Xuerui do { 43*75ded18aSWANG Xuerui __asm__ __volatile__ ( 44*75ded18aSWANG Xuerui LD_INOUT_LINE(v1) 45*75ded18aSWANG Xuerui LD_AND_XOR_LINE(v2) 46*75ded18aSWANG Xuerui LD_AND_XOR_LINE(v3) 47*75ded18aSWANG Xuerui ST_LINE(v1) 48*75ded18aSWANG Xuerui : : [v1] "r"(v1), [v2] "r"(v2), [v3] "r"(v3) : "memory" 49*75ded18aSWANG Xuerui ); 50*75ded18aSWANG Xuerui 51*75ded18aSWANG Xuerui v1 += LINE_WIDTH / sizeof(unsigned long); 52*75ded18aSWANG Xuerui v2 += LINE_WIDTH / sizeof(unsigned long); 53*75ded18aSWANG Xuerui v3 += LINE_WIDTH / sizeof(unsigned long); 54*75ded18aSWANG Xuerui } while (--lines > 0); 55*75ded18aSWANG Xuerui } 56*75ded18aSWANG Xuerui 57*75ded18aSWANG Xuerui void XOR_FUNC_NAME(4)(unsigned long bytes, 58*75ded18aSWANG Xuerui unsigned long * __restrict v1, 59*75ded18aSWANG Xuerui const unsigned long * __restrict v2, 60*75ded18aSWANG Xuerui const unsigned long * __restrict v3, 61*75ded18aSWANG Xuerui const unsigned long * __restrict v4) 62*75ded18aSWANG Xuerui { 63*75ded18aSWANG Xuerui unsigned long lines = bytes / LINE_WIDTH; 64*75ded18aSWANG Xuerui 65*75ded18aSWANG Xuerui do { 66*75ded18aSWANG Xuerui __asm__ __volatile__ ( 67*75ded18aSWANG Xuerui LD_INOUT_LINE(v1) 68*75ded18aSWANG Xuerui LD_AND_XOR_LINE(v2) 69*75ded18aSWANG Xuerui LD_AND_XOR_LINE(v3) 70*75ded18aSWANG Xuerui LD_AND_XOR_LINE(v4) 71*75ded18aSWANG Xuerui ST_LINE(v1) 72*75ded18aSWANG Xuerui : : [v1] "r"(v1), [v2] "r"(v2), [v3] "r"(v3), [v4] "r"(v4) 73*75ded18aSWANG Xuerui : "memory" 74*75ded18aSWANG Xuerui ); 75*75ded18aSWANG Xuerui 76*75ded18aSWANG Xuerui v1 += LINE_WIDTH / sizeof(unsigned long); 77*75ded18aSWANG Xuerui v2 += LINE_WIDTH / sizeof(unsigned long); 78*75ded18aSWANG Xuerui v3 += LINE_WIDTH / sizeof(unsigned long); 79*75ded18aSWANG Xuerui v4 += LINE_WIDTH / sizeof(unsigned long); 80*75ded18aSWANG Xuerui } while (--lines > 0); 81*75ded18aSWANG Xuerui } 82*75ded18aSWANG Xuerui 83*75ded18aSWANG Xuerui void XOR_FUNC_NAME(5)(unsigned long bytes, 84*75ded18aSWANG Xuerui unsigned long * __restrict v1, 85*75ded18aSWANG Xuerui const unsigned long * __restrict v2, 86*75ded18aSWANG Xuerui const unsigned long * __restrict v3, 87*75ded18aSWANG Xuerui const unsigned long * __restrict v4, 88*75ded18aSWANG Xuerui const unsigned long * __restrict v5) 89*75ded18aSWANG Xuerui { 90*75ded18aSWANG Xuerui unsigned long lines = bytes / LINE_WIDTH; 91*75ded18aSWANG Xuerui 92*75ded18aSWANG Xuerui do { 93*75ded18aSWANG Xuerui __asm__ __volatile__ ( 94*75ded18aSWANG Xuerui LD_INOUT_LINE(v1) 95*75ded18aSWANG Xuerui LD_AND_XOR_LINE(v2) 96*75ded18aSWANG Xuerui LD_AND_XOR_LINE(v3) 97*75ded18aSWANG Xuerui LD_AND_XOR_LINE(v4) 98*75ded18aSWANG Xuerui LD_AND_XOR_LINE(v5) 99*75ded18aSWANG Xuerui ST_LINE(v1) 100*75ded18aSWANG Xuerui : : [v1] "r"(v1), [v2] "r"(v2), [v3] "r"(v3), [v4] "r"(v4), 101*75ded18aSWANG Xuerui [v5] "r"(v5) : "memory" 102*75ded18aSWANG Xuerui ); 103*75ded18aSWANG Xuerui 104*75ded18aSWANG Xuerui v1 += LINE_WIDTH / sizeof(unsigned long); 105*75ded18aSWANG Xuerui v2 += LINE_WIDTH / sizeof(unsigned long); 106*75ded18aSWANG Xuerui v3 += LINE_WIDTH / sizeof(unsigned long); 107*75ded18aSWANG Xuerui v4 += LINE_WIDTH / sizeof(unsigned long); 108*75ded18aSWANG Xuerui v5 += LINE_WIDTH / sizeof(unsigned long); 109*75ded18aSWANG Xuerui } while (--lines > 0); 110*75ded18aSWANG Xuerui } 111