1*5777eaedSRobin Murphy // SPDX-License-Identifier: GPL-2.0-only 2*5777eaedSRobin Murphy // Copyright (C) 2019-2020 Arm Ltd. 3*5777eaedSRobin Murphy 4*5777eaedSRobin Murphy #include <linux/compiler.h> 5*5777eaedSRobin Murphy #include <linux/kasan-checks.h> 6*5777eaedSRobin Murphy #include <linux/kernel.h> 7*5777eaedSRobin Murphy 8*5777eaedSRobin Murphy #include <net/checksum.h> 9*5777eaedSRobin Murphy 10*5777eaedSRobin Murphy /* Looks dumb, but generates nice-ish code */ 11*5777eaedSRobin Murphy static u64 accumulate(u64 sum, u64 data) 12*5777eaedSRobin Murphy { 13*5777eaedSRobin Murphy __uint128_t tmp = (__uint128_t)sum + data; 14*5777eaedSRobin Murphy return tmp + (tmp >> 64); 15*5777eaedSRobin Murphy } 16*5777eaedSRobin Murphy 17*5777eaedSRobin Murphy unsigned int do_csum(const unsigned char *buff, int len) 18*5777eaedSRobin Murphy { 19*5777eaedSRobin Murphy unsigned int offset, shift, sum; 20*5777eaedSRobin Murphy const u64 *ptr; 21*5777eaedSRobin Murphy u64 data, sum64 = 0; 22*5777eaedSRobin Murphy 23*5777eaedSRobin Murphy offset = (unsigned long)buff & 7; 24*5777eaedSRobin Murphy /* 25*5777eaedSRobin Murphy * This is to all intents and purposes safe, since rounding down cannot 26*5777eaedSRobin Murphy * result in a different page or cache line being accessed, and @buff 27*5777eaedSRobin Murphy * should absolutely not be pointing to anything read-sensitive. We do, 28*5777eaedSRobin Murphy * however, have to be careful not to piss off KASAN, which means using 29*5777eaedSRobin Murphy * unchecked reads to accommodate the head and tail, for which we'll 30*5777eaedSRobin Murphy * compensate with an explicit check up-front. 31*5777eaedSRobin Murphy */ 32*5777eaedSRobin Murphy kasan_check_read(buff, len); 33*5777eaedSRobin Murphy ptr = (u64 *)(buff - offset); 34*5777eaedSRobin Murphy len = len + offset - 8; 35*5777eaedSRobin Murphy 36*5777eaedSRobin Murphy /* 37*5777eaedSRobin Murphy * Head: zero out any excess leading bytes. Shifting back by the same 38*5777eaedSRobin Murphy * amount should be at least as fast as any other way of handling the 39*5777eaedSRobin Murphy * odd/even alignment, and means we can ignore it until the very end. 40*5777eaedSRobin Murphy */ 41*5777eaedSRobin Murphy shift = offset * 8; 42*5777eaedSRobin Murphy data = READ_ONCE_NOCHECK(*ptr++); 43*5777eaedSRobin Murphy #ifdef __LITTLE_ENDIAN 44*5777eaedSRobin Murphy data = (data >> shift) << shift; 45*5777eaedSRobin Murphy #else 46*5777eaedSRobin Murphy data = (data << shift) >> shift; 47*5777eaedSRobin Murphy #endif 48*5777eaedSRobin Murphy 49*5777eaedSRobin Murphy /* 50*5777eaedSRobin Murphy * Body: straightforward aligned loads from here on (the paired loads 51*5777eaedSRobin Murphy * underlying the quadword type still only need dword alignment). The 52*5777eaedSRobin Murphy * main loop strictly excludes the tail, so the second loop will always 53*5777eaedSRobin Murphy * run at least once. 54*5777eaedSRobin Murphy */ 55*5777eaedSRobin Murphy while (unlikely(len > 64)) { 56*5777eaedSRobin Murphy __uint128_t tmp1, tmp2, tmp3, tmp4; 57*5777eaedSRobin Murphy 58*5777eaedSRobin Murphy tmp1 = READ_ONCE_NOCHECK(*(__uint128_t *)ptr); 59*5777eaedSRobin Murphy tmp2 = READ_ONCE_NOCHECK(*(__uint128_t *)(ptr + 2)); 60*5777eaedSRobin Murphy tmp3 = READ_ONCE_NOCHECK(*(__uint128_t *)(ptr + 4)); 61*5777eaedSRobin Murphy tmp4 = READ_ONCE_NOCHECK(*(__uint128_t *)(ptr + 6)); 62*5777eaedSRobin Murphy 63*5777eaedSRobin Murphy len -= 64; 64*5777eaedSRobin Murphy ptr += 8; 65*5777eaedSRobin Murphy 66*5777eaedSRobin Murphy /* This is the "don't dump the carry flag into a GPR" idiom */ 67*5777eaedSRobin Murphy tmp1 += (tmp1 >> 64) | (tmp1 << 64); 68*5777eaedSRobin Murphy tmp2 += (tmp2 >> 64) | (tmp2 << 64); 69*5777eaedSRobin Murphy tmp3 += (tmp3 >> 64) | (tmp3 << 64); 70*5777eaedSRobin Murphy tmp4 += (tmp4 >> 64) | (tmp4 << 64); 71*5777eaedSRobin Murphy tmp1 = ((tmp1 >> 64) << 64) | (tmp2 >> 64); 72*5777eaedSRobin Murphy tmp1 += (tmp1 >> 64) | (tmp1 << 64); 73*5777eaedSRobin Murphy tmp3 = ((tmp3 >> 64) << 64) | (tmp4 >> 64); 74*5777eaedSRobin Murphy tmp3 += (tmp3 >> 64) | (tmp3 << 64); 75*5777eaedSRobin Murphy tmp1 = ((tmp1 >> 64) << 64) | (tmp3 >> 64); 76*5777eaedSRobin Murphy tmp1 += (tmp1 >> 64) | (tmp1 << 64); 77*5777eaedSRobin Murphy tmp1 = ((tmp1 >> 64) << 64) | sum64; 78*5777eaedSRobin Murphy tmp1 += (tmp1 >> 64) | (tmp1 << 64); 79*5777eaedSRobin Murphy sum64 = tmp1 >> 64; 80*5777eaedSRobin Murphy } 81*5777eaedSRobin Murphy while (len > 8) { 82*5777eaedSRobin Murphy __uint128_t tmp; 83*5777eaedSRobin Murphy 84*5777eaedSRobin Murphy sum64 = accumulate(sum64, data); 85*5777eaedSRobin Murphy tmp = READ_ONCE_NOCHECK(*(__uint128_t *)ptr); 86*5777eaedSRobin Murphy 87*5777eaedSRobin Murphy len -= 16; 88*5777eaedSRobin Murphy ptr += 2; 89*5777eaedSRobin Murphy 90*5777eaedSRobin Murphy #ifdef __LITTLE_ENDIAN 91*5777eaedSRobin Murphy data = tmp >> 64; 92*5777eaedSRobin Murphy sum64 = accumulate(sum64, tmp); 93*5777eaedSRobin Murphy #else 94*5777eaedSRobin Murphy data = tmp; 95*5777eaedSRobin Murphy sum64 = accumulate(sum64, tmp >> 64); 96*5777eaedSRobin Murphy #endif 97*5777eaedSRobin Murphy } 98*5777eaedSRobin Murphy if (len > 0) { 99*5777eaedSRobin Murphy sum64 = accumulate(sum64, data); 100*5777eaedSRobin Murphy data = READ_ONCE_NOCHECK(*ptr); 101*5777eaedSRobin Murphy len -= 8; 102*5777eaedSRobin Murphy } 103*5777eaedSRobin Murphy /* 104*5777eaedSRobin Murphy * Tail: zero any over-read bytes similarly to the head, again 105*5777eaedSRobin Murphy * preserving odd/even alignment. 106*5777eaedSRobin Murphy */ 107*5777eaedSRobin Murphy shift = len * -8; 108*5777eaedSRobin Murphy #ifdef __LITTLE_ENDIAN 109*5777eaedSRobin Murphy data = (data << shift) >> shift; 110*5777eaedSRobin Murphy #else 111*5777eaedSRobin Murphy data = (data >> shift) << shift; 112*5777eaedSRobin Murphy #endif 113*5777eaedSRobin Murphy sum64 = accumulate(sum64, data); 114*5777eaedSRobin Murphy 115*5777eaedSRobin Murphy /* Finally, folding */ 116*5777eaedSRobin Murphy sum64 += (sum64 >> 32) | (sum64 << 32); 117*5777eaedSRobin Murphy sum = sum64 >> 32; 118*5777eaedSRobin Murphy sum += (sum >> 16) | (sum << 16); 119*5777eaedSRobin Murphy if (offset & 1) 120*5777eaedSRobin Murphy return (u16)swab32(sum); 121*5777eaedSRobin Murphy 122*5777eaedSRobin Murphy return sum >> 16; 123*5777eaedSRobin Murphy } 124