// SPDX-License-Identifier: GPL-2.0-only
// Copyright (C) 2019-2020 Arm Ltd.

#include <linux/compiler.h>
#include <linux/kasan-checks.h>
#include <linux/kernel.h>

#include <net/checksum.h>

/* Looks dumb, but generates nice-ish code */
static u64 accumulate(u64 sum, u64 data)
{
	__uint128_t tmp = (__uint128_t)sum + data;
	return tmp + (tmp >> 64);
}

unsigned int do_csum(const unsigned char *buff, int len)
{
	unsigned int offset, shift, sum;
	const u64 *ptr;
	u64 data, sum64 = 0;

	if (unlikely(len == 0))
		return 0;

	offset = (unsigned long)buff & 7;
	/*
	 * This is to all intents and purposes safe, since rounding down cannot
	 * result in a different page or cache line being accessed, and @buff
	 * should absolutely not be pointing to anything read-sensitive. We do,
	 * however, have to be careful not to piss off KASAN, which means using
	 * unchecked reads to accommodate the head and tail, for which we'll
	 * compensate with an explicit check up-front.
	 */
	kasan_check_read(buff, len);
	ptr = (u64 *)(buff - offset);
	len = len + offset - 8;

	/*
	 * Head: zero out any excess leading bytes. Shifting back by the same
	 * amount should be at least as fast as any other way of handling the
	 * odd/even alignment, and means we can ignore it until the very end.
	 */
	shift = offset * 8;
	data = READ_ONCE_NOCHECK(*ptr++);
#ifdef __LITTLE_ENDIAN
	data = (data >> shift) << shift;
#else
	data = (data << shift) >> shift;
#endif

	/*
	 * Body: straightforward aligned loads from here on (the paired loads
	 * underlying the quadword type still only need dword alignment). The
	 * main loop strictly excludes the tail, so the second loop will always
	 * run at least once.
	 */
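	/*
	 * A note on the unrolled loop below: each 128-bit load is folded by
	 * adding it to itself with the halves swapped, which leaves the
	 * end-around-carry sum of its two halves in the upper 64 bits. Those
	 * upper halves are then repacked pairwise and folded again, so the
	 * four loads (plus the running sum64) reduce to a single 64-bit
	 * partial sum per iteration.
	 */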
	while (unlikely(len > 64)) {
		__uint128_t tmp1, tmp2, tmp3, tmp4;

		tmp1 = READ_ONCE_NOCHECK(*(__uint128_t *)ptr);
		tmp2 = READ_ONCE_NOCHECK(*(__uint128_t *)(ptr + 2));
		tmp3 = READ_ONCE_NOCHECK(*(__uint128_t *)(ptr + 4));
		tmp4 = READ_ONCE_NOCHECK(*(__uint128_t *)(ptr + 6));

		len -= 64;
		ptr += 8;

		/* This is the "don't dump the carry flag into a GPR" idiom */
		tmp1 += (tmp1 >> 64) | (tmp1 << 64);
		tmp2 += (tmp2 >> 64) | (tmp2 << 64);
		tmp3 += (tmp3 >> 64) | (tmp3 << 64);
		tmp4 += (tmp4 >> 64) | (tmp4 << 64);
		tmp1 = ((tmp1 >> 64) << 64) | (tmp2 >> 64);
		tmp1 += (tmp1 >> 64) | (tmp1 << 64);
		tmp3 = ((tmp3 >> 64) << 64) | (tmp4 >> 64);
		tmp3 += (tmp3 >> 64) | (tmp3 << 64);
		tmp1 = ((tmp1 >> 64) << 64) | (tmp3 >> 64);
		tmp1 += (tmp1 >> 64) | (tmp1 << 64);
		tmp1 = ((tmp1 >> 64) << 64) | sum64;
		tmp1 += (tmp1 >> 64) | (tmp1 << 64);
		sum64 = tmp1 >> 64;
	}
	while (len > 8) {
		__uint128_t tmp;

		sum64 = accumulate(sum64, data);
		tmp = READ_ONCE_NOCHECK(*(__uint128_t *)ptr);

		len -= 16;
		ptr += 2;

#ifdef __LITTLE_ENDIAN
		data = tmp >> 64;
		sum64 = accumulate(sum64, tmp);
#else
		data = tmp;
		sum64 = accumulate(sum64, tmp >> 64);
#endif
	}
	if (len > 0) {
		sum64 = accumulate(sum64, data);
		data = READ_ONCE_NOCHECK(*ptr);
		len -= 8;
	}
	/*
	 * Tail: zero any over-read bytes similarly to the head, again
	 * preserving odd/even alignment.
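	 * For example, if the final 8-byte load took in two bytes beyond the
	 * end of the buffer, len is -2 at this point, so shift becomes 16 and
	 * the shift pair below clears the 16 bits covering the over-read
	 * bytes (the most significant bits on little-endian, the least
	 * significant on big-endian).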
	 */
	shift = len * -8;
#ifdef __LITTLE_ENDIAN
	data = (data << shift) >> shift;
#else
	data = (data >> shift) << shift;
#endif
	sum64 = accumulate(sum64, data);

	/* Finally, folding */
	sum64 += (sum64 >> 32) | (sum64 << 32);
	sum = sum64 >> 32;
	sum += (sum >> 16) | (sum << 16);
	if (offset & 1)
		return (u16)swab32(sum);

	return sum >> 16;
}

__sum16 csum_ipv6_magic(const struct in6_addr *saddr,
			const struct in6_addr *daddr,
			__u32 len, __u8 proto, __wsum csum)
{
	__uint128_t src, dst;
	u64 sum = (__force u64)csum;

	src = *(const __uint128_t *)saddr->s6_addr;
	dst = *(const __uint128_t *)daddr->s6_addr;

	sum += (__force u32)htonl(len);
#ifdef __LITTLE_ENDIAN
	sum += (u32)proto << 24;
#else
	sum += proto;
#endif
	src += (src >> 64) | (src << 64);
	dst += (dst >> 64) | (dst << 64);

	sum = accumulate(sum, src >> 64);
	sum = accumulate(sum, dst >> 64);

	sum += ((sum >> 32) | (sum << 32));
	return csum_fold((__force __wsum)(sum >> 32));
}
EXPORT_SYMBOL(csum_ipv6_magic);
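
/*
 * Illustrative sketch only (the identifiers ip6h, th and payload_len below
 * are hypothetical): a transport-layer caller typically combines the
 * pseudo-header sum computed by csum_ipv6_magic() with a csum_partial() of
 * the transport header and payload, along the lines of:
 *
 *	th->check = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, payload_len,
 *				    IPPROTO_TCP,
 *				    csum_partial(th, payload_len, 0));
 */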