xref: /linux/arch/arm64/lib/csum.c (revision e9c7ddbf8b4b6a291bf3b5bfa7c883235164d9be)
// SPDX-License-Identifier: GPL-2.0-only
// Copyright (C) 2019-2020 Arm Ltd.

#include <linux/compiler.h>
#include <linux/kasan-checks.h>
#include <linux/kernel.h>

#include <net/checksum.h>

/* Looks dumb, but generates nice-ish code */
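/*
 * 64-bit accumulation with end-around carry: the 128-bit addition captures
 * any carry out of bit 63 in bit 64, and adding (tmp >> 64) back in folds
 * that carry into the low half before the result is truncated to 64 bits.
 */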
static u64 accumulate(u64 sum, u64 data)
{
	__uint128_t tmp = (__uint128_t)sum + data;
	return tmp + (tmp >> 64);
}

unsigned int do_csum(const unsigned char *buff, int len)
{
	unsigned int offset, shift, sum;
	const u64 *ptr;
	u64 data, sum64 = 0;

	if (unlikely(len == 0))
		return 0;

	offset = (unsigned long)buff & 7;
	/*
	 * This is to all intents and purposes safe, since rounding down cannot
	 * result in a different page or cache line being accessed, and @buff
	 * should absolutely not be pointing to anything read-sensitive. We do,
	 * however, have to be careful not to piss off KASAN, which means using
	 * unchecked reads to accommodate the head and tail, for which we'll
	 * compensate with an explicit check up-front.
	 */
	kasan_check_read(buff, len);
	ptr = (u64 *)(buff - offset);
	len = len + offset - 8;
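	/*
	 * len now counts the bytes remaining after the first (aligned) dword;
	 * it may go negative for short buffers, which the tail masking below
	 * compensates for.
	 */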

	/*
	 * Head: zero out any excess leading bytes. Shifting back by the same
	 * amount should be at least as fast as any other way of handling the
	 * odd/even alignment, and means we can ignore it until the very end.
	 */
	shift = offset * 8;
	data = READ_ONCE_NOCHECK(*ptr++);
#ifdef __LITTLE_ENDIAN
	data = (data >> shift) << shift;
#else
	data = (data << shift) >> shift;
#endif
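	/*
	 * e.g. with offset == 2 on a little-endian kernel, shift == 16 and the
	 * two bytes preceding @buff are cleared from the loaded dword while
	 * every other byte keeps its lane.
	 */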

	/*
	 * Body: straightforward aligned loads from here on (the paired loads
	 * underlying the quadword type still only need dword alignment). The
	 * main loop strictly excludes the tail, so the second loop will always
	 * run at least once.
	 */
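	/*
	 * The main loop consumes 64 bytes per iteration as four 16-byte
	 * chunks, folding them pairwise before merging the result into sum64.
	 */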
	while (unlikely(len > 64)) {
		__uint128_t tmp1, tmp2, tmp3, tmp4;

		tmp1 = READ_ONCE_NOCHECK(*(__uint128_t *)ptr);
		tmp2 = READ_ONCE_NOCHECK(*(__uint128_t *)(ptr + 2));
		tmp3 = READ_ONCE_NOCHECK(*(__uint128_t *)(ptr + 4));
		tmp4 = READ_ONCE_NOCHECK(*(__uint128_t *)(ptr + 6));

		len -= 64;
		ptr += 8;

		/* This is the "don't dump the carry flag into a GPR" idiom */
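		/*
		 * Adding each temporary to its halves-swapped self sums the
		 * two 64-bit halves with end-around carry, leaving the folded
		 * value in the high half, so the carry never has to be moved
		 * into a general-purpose register.
		 */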
		tmp1 += (tmp1 >> 64) | (tmp1 << 64);
		tmp2 += (tmp2 >> 64) | (tmp2 << 64);
		tmp3 += (tmp3 >> 64) | (tmp3 << 64);
		tmp4 += (tmp4 >> 64) | (tmp4 << 64);
		tmp1 = ((tmp1 >> 64) << 64) | (tmp2 >> 64);
		tmp1 += (tmp1 >> 64) | (tmp1 << 64);
		tmp3 = ((tmp3 >> 64) << 64) | (tmp4 >> 64);
		tmp3 += (tmp3 >> 64) | (tmp3 << 64);
		tmp1 = ((tmp1 >> 64) << 64) | (tmp3 >> 64);
		tmp1 += (tmp1 >> 64) | (tmp1 << 64);
		tmp1 = ((tmp1 >> 64) << 64) | sum64;
		tmp1 += (tmp1 >> 64) | (tmp1 << 64);
		sum64 = tmp1 >> 64;
	}
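	/*
	 * The second loop handles 16 bytes per iteration but always leaves
	 * the most recently loaded dword pending in @data, so that the tail
	 * masking below can clear any bytes read past the end of the buffer
	 * before they are accumulated.
	 */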
	while (len > 8) {
		__uint128_t tmp;

		sum64 = accumulate(sum64, data);
		tmp = READ_ONCE_NOCHECK(*(__uint128_t *)ptr);

		len -= 16;
		ptr += 2;

#ifdef __LITTLE_ENDIAN
		data = tmp >> 64;
		sum64 = accumulate(sum64, tmp);
#else
		data = tmp;
		sum64 = accumulate(sum64, tmp >> 64);
#endif
	}
	if (len > 0) {
		sum64 = accumulate(sum64, data);
		data = READ_ONCE_NOCHECK(*ptr);
		len -= 8;
	}
	/*
	 * Tail: zero any over-read bytes similarly to the head, again
	 * preserving odd/even alignment.
	 */
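	/* len is <= 0 here, so -len is the over-read byte count */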
	shift = len * -8;
#ifdef __LITTLE_ENDIAN
	data = (data << shift) >> shift;
#else
	data = (data >> shift) << shift;
#endif
	sum64 = accumulate(sum64, data);

	/* Finally, folding */
	sum64 += (sum64 >> 32) | (sum64 << 32);
	sum = sum64 >> 32;
	sum += (sum >> 16) | (sum << 16);
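	/*
	 * A buffer starting at an odd address puts every byte in the opposite
	 * lane of its 16-bit word; byte-swapping the folded sum corrects for
	 * that, since the one's-complement sum is otherwise unaffected.
	 */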
	if (offset & 1)
		return (u16)swab32(sum);

	return sum >> 16;
}

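/*
 * IPv6 pseudo-header checksum: fold the source and destination addresses,
 * the upper-layer length and the next-header protocol into the partial
 * checksum supplied by the caller, then reduce the result to 16 bits.
 */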
__sum16 csum_ipv6_magic(const struct in6_addr *saddr,
			const struct in6_addr *daddr,
			__u32 len, __u8 proto, __wsum csum)
{
	__uint128_t src, dst;
	u64 sum = (__force u64)csum;

	src = *(const __uint128_t *)saddr->s6_addr;
	dst = *(const __uint128_t *)daddr->s6_addr;

	sum += (__force u32)htonl(len);
#ifdef __LITTLE_ENDIAN
	sum += (u32)proto << 24;
#else
	sum += proto;
#endif
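	/*
	 * Fold each 128-bit address in half with end-around carry, as in
	 * do_csum(); the folded 64-bit value ends up in the high half.
	 */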
	src += (src >> 64) | (src << 64);
	dst += (dst >> 64) | (dst << 64);

	sum = accumulate(sum, src >> 64);
	sum = accumulate(sum, dst >> 64);

	sum += ((sum >> 32) | (sum << 32));
	return csum_fold((__force __wsum)(sum >> 32));
}
EXPORT_SYMBOL(csum_ipv6_magic);