xref: /freebsd/sys/netinet/in_rss.c (revision c25290420ee7b9f4a875426380d8ad042a561b9c)
17527624eSRobert Watson /*-
27527624eSRobert Watson  * Copyright (c) 2010-2011 Juniper Networks, Inc.
37527624eSRobert Watson  * All rights reserved.
47527624eSRobert Watson  *
57527624eSRobert Watson  * This software was developed by Robert N. M. Watson under contract
67527624eSRobert Watson  * to Juniper Networks, Inc.
77527624eSRobert Watson  *
87527624eSRobert Watson  * Redistribution and use in source and binary forms, with or without
97527624eSRobert Watson  * modification, are permitted provided that the following conditions
107527624eSRobert Watson  * are met:
117527624eSRobert Watson  * 1. Redistributions of source code must retain the above copyright
127527624eSRobert Watson  *    notice, this list of conditions and the following disclaimer.
137527624eSRobert Watson  * 2. Redistributions in binary form must reproduce the above copyright
147527624eSRobert Watson  *    notice, this list of conditions and the following disclaimer in the
157527624eSRobert Watson  *    documentation and/or other materials provided with the distribution.
167527624eSRobert Watson  *
177527624eSRobert Watson  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
187527624eSRobert Watson  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
197527624eSRobert Watson  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
207527624eSRobert Watson  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
217527624eSRobert Watson  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
227527624eSRobert Watson  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
237527624eSRobert Watson  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
247527624eSRobert Watson  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
257527624eSRobert Watson  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
267527624eSRobert Watson  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
277527624eSRobert Watson  * SUCH DAMAGE.
287527624eSRobert Watson  */
297527624eSRobert Watson 
307527624eSRobert Watson #include <sys/cdefs.h>
317527624eSRobert Watson 
327527624eSRobert Watson __FBSDID("$FreeBSD$");
337527624eSRobert Watson 
347527624eSRobert Watson #include "opt_inet6.h"
357527624eSRobert Watson #include "opt_pcbgroup.h"
367527624eSRobert Watson 
377527624eSRobert Watson #ifndef PCBGROUP
387527624eSRobert Watson #error "options RSS depends on options PCBGROUP"
397527624eSRobert Watson #endif
407527624eSRobert Watson 
417527624eSRobert Watson #include <sys/param.h>
427527624eSRobert Watson #include <sys/mbuf.h>
437527624eSRobert Watson #include <sys/socket.h>
447527624eSRobert Watson #include <sys/priv.h>
457527624eSRobert Watson #include <sys/kernel.h>
467527624eSRobert Watson #include <sys/smp.h>
477527624eSRobert Watson #include <sys/sysctl.h>
488bde802aSAdrian Chadd #include <sys/sbuf.h>
497527624eSRobert Watson 
507527624eSRobert Watson #include <net/if.h>
517527624eSRobert Watson #include <net/if_var.h>
527527624eSRobert Watson #include <net/netisr.h>
537527624eSRobert Watson 
547527624eSRobert Watson #include <netinet/in.h>
557527624eSRobert Watson #include <netinet/in_pcb.h>
567527624eSRobert Watson #include <netinet/in_rss.h>
577527624eSRobert Watson #include <netinet/in_var.h>
587527624eSRobert Watson #include <netinet/toeplitz.h>
597527624eSRobert Watson 
6072d33245SAdrian Chadd /* for software rss hash support */
6172d33245SAdrian Chadd #include <netinet/ip.h>
6272d33245SAdrian Chadd #include <netinet/tcp.h>
6372d33245SAdrian Chadd #include <netinet/udp.h>
6472d33245SAdrian Chadd 
657527624eSRobert Watson /*-
 * Operating system parts of receive-side scaling (RSS), which allows
677527624eSRobert Watson  * network cards to direct flows to particular receive queues based on hashes
687527624eSRobert Watson  * of header tuples.  This implementation aligns RSS buckets with connection
697527624eSRobert Watson  * groups at the TCP/IP layer, so each bucket is associated with exactly one
707527624eSRobert Watson  * group.  As a result, the group lookup structures (and lock) should have an
717527624eSRobert Watson  * effective affinity with exactly one CPU.
727527624eSRobert Watson  *
737527624eSRobert Watson  * Network device drivers needing to configure RSS will query this framework
747527624eSRobert Watson  * for parameters, such as the current RSS key, hashing policies, number of
757527624eSRobert Watson  * bits, and indirection table mapping hashes to buckets and CPUs.  They may
767527624eSRobert Watson  * provide their own supplementary information, such as queue<->CPU bindings.
777527624eSRobert Watson  * It is the responsibility of the network device driver to inject packets
787527624eSRobert Watson  * into the stack on as close to the right CPU as possible, if playing by RSS
797527624eSRobert Watson  * rules.
807527624eSRobert Watson  *
817527624eSRobert Watson  * TODO:
827527624eSRobert Watson  *
837527624eSRobert Watson  * - Synchronization for rss_key and other future-configurable parameters.
847527624eSRobert Watson  * - Event handler drivers can register to pick up RSS configuration changes.
857527624eSRobert Watson  * - Should we allow rss_basecpu to be configured?
867527624eSRobert Watson  * - Randomize key on boot.
877527624eSRobert Watson  * - IPv6 support.
887527624eSRobert Watson  * - Statistics on how often there's a misalignment between hardware
897527624eSRobert Watson  *   placement and pcbgroup expectations.
907527624eSRobert Watson  */
917527624eSRobert Watson 
927527624eSRobert Watson SYSCTL_NODE(_net_inet, OID_AUTO, rss, CTLFLAG_RW, 0, "Receive-side steering");
937527624eSRobert Watson 
947527624eSRobert Watson /*
957527624eSRobert Watson  * Toeplitz is the only required hash function in the RSS spec, so use it by
967527624eSRobert Watson  * default.
977527624eSRobert Watson  */
987527624eSRobert Watson static u_int	rss_hashalgo = RSS_HASH_TOEPLITZ;
99af3b2549SHans Petter Selasky SYSCTL_INT(_net_inet_rss, OID_AUTO, hashalgo, CTLFLAG_RDTUN, &rss_hashalgo, 0,
1007527624eSRobert Watson     "RSS hash algorithm");
1017527624eSRobert Watson 
1027527624eSRobert Watson /*
1037527624eSRobert Watson  * Size of the indirection table; at most 128 entries per the RSS spec.  We
1047527624eSRobert Watson  * size it to at least 2 times the number of CPUs by default to allow useful
1057527624eSRobert Watson  * rebalancing.  If not set explicitly with a loader tunable, we tune based
1067527624eSRobert Watson  * on the number of CPUs present.
1077527624eSRobert Watson  *
1087527624eSRobert Watson  * XXXRW: buckets might be better to use for the tunable than bits.
1097527624eSRobert Watson  */
1107527624eSRobert Watson static u_int	rss_bits;
111af3b2549SHans Petter Selasky SYSCTL_INT(_net_inet_rss, OID_AUTO, bits, CTLFLAG_RDTUN, &rss_bits, 0,
1127527624eSRobert Watson     "RSS bits");
1137527624eSRobert Watson 
1147527624eSRobert Watson static u_int	rss_mask;
1157527624eSRobert Watson SYSCTL_INT(_net_inet_rss, OID_AUTO, mask, CTLFLAG_RD, &rss_mask, 0,
1167527624eSRobert Watson     "RSS mask");
1177527624eSRobert Watson 
1187527624eSRobert Watson static const u_int	rss_maxbits = RSS_MAXBITS;
1197527624eSRobert Watson SYSCTL_INT(_net_inet_rss, OID_AUTO, maxbits, CTLFLAG_RD,
1207527624eSRobert Watson     __DECONST(int *, &rss_maxbits), 0, "RSS maximum bits");
1217527624eSRobert Watson 
1227527624eSRobert Watson /*
1237527624eSRobert Watson  * RSS's own count of the number of CPUs it could be using for processing.
1247527624eSRobert Watson  * Bounded to 64 by RSS constants.
1257527624eSRobert Watson  */
1267527624eSRobert Watson static u_int	rss_ncpus;
1277527624eSRobert Watson SYSCTL_INT(_net_inet_rss, OID_AUTO, ncpus, CTLFLAG_RD, &rss_ncpus, 0,
1287527624eSRobert Watson     "Number of CPUs available to RSS");
1297527624eSRobert Watson 
1307527624eSRobert Watson #define	RSS_MAXCPUS	(1 << (RSS_MAXBITS - 1))
1317527624eSRobert Watson static const u_int	rss_maxcpus = RSS_MAXCPUS;
1327527624eSRobert Watson SYSCTL_INT(_net_inet_rss, OID_AUTO, maxcpus, CTLFLAG_RD,
1337527624eSRobert Watson     __DECONST(int *, &rss_maxcpus), 0, "RSS maximum CPUs that can be used");
1347527624eSRobert Watson 
1357527624eSRobert Watson /*
1367527624eSRobert Watson  * Variable exists just for reporting rss_bits in a user-friendly way.
1377527624eSRobert Watson  */
1387527624eSRobert Watson static u_int	rss_buckets;
1397527624eSRobert Watson SYSCTL_INT(_net_inet_rss, OID_AUTO, buckets, CTLFLAG_RD, &rss_buckets, 0,
1407527624eSRobert Watson     "RSS buckets");
1417527624eSRobert Watson 
1427527624eSRobert Watson /*
1437527624eSRobert Watson  * Base CPU number; devices will add this to all CPU numbers returned by the
1447527624eSRobert Watson  * RSS indirection table.  Currently unmodifable in FreeBSD.
1457527624eSRobert Watson  */
1467527624eSRobert Watson static const u_int	rss_basecpu;
1477527624eSRobert Watson SYSCTL_INT(_net_inet_rss, OID_AUTO, basecpu, CTLFLAG_RD,
1487527624eSRobert Watson     __DECONST(int *, &rss_basecpu), 0, "RSS base CPU");
1497527624eSRobert Watson 
1507527624eSRobert Watson /*
1517527624eSRobert Watson  * RSS secret key, intended to prevent attacks on load-balancing.  Its
1527527624eSRobert Watson  * effectiveness may be limited by algorithm choice and available entropy
1537527624eSRobert Watson  * during the boot.
1547527624eSRobert Watson  *
1557527624eSRobert Watson  * XXXRW: And that we don't randomize it yet!
1567527624eSRobert Watson  *
15785415b47SAdrian Chadd  * This is the default Microsoft RSS specification key which is also
15885415b47SAdrian Chadd  * the Chelsio T5 firmware default key.
1597527624eSRobert Watson  */
1607527624eSRobert Watson static uint8_t rss_key[RSS_KEYSIZE] = {
16107b4e383SPeter Grehan 	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
16207b4e383SPeter Grehan 	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
16307b4e383SPeter Grehan 	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
16407b4e383SPeter Grehan 	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
16507b4e383SPeter Grehan 	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
1667527624eSRobert Watson };
1677527624eSRobert Watson 
1687527624eSRobert Watson /*
1697527624eSRobert Watson  * RSS hash->CPU table, which maps hashed packet headers to particular CPUs.
1707527624eSRobert Watson  * Drivers may supplement this table with a seperate CPU<->queue table when
1717527624eSRobert Watson  * programming devices.
1727527624eSRobert Watson  */
1737527624eSRobert Watson struct rss_table_entry {
1747527624eSRobert Watson 	uint8_t		rte_cpu;	/* CPU affinity of bucket. */
1757527624eSRobert Watson };
1767527624eSRobert Watson static struct rss_table_entry	rss_table[RSS_TABLE_MAXLEN];
1777527624eSRobert Watson 
17872d33245SAdrian Chadd static inline u_int rss_gethashconfig_local(void);
17972d33245SAdrian Chadd 
1807527624eSRobert Watson static void
1817527624eSRobert Watson rss_init(__unused void *arg)
1827527624eSRobert Watson {
1837527624eSRobert Watson 	u_int i;
184bad008ceSAdrian Chadd 	u_int cpuid;
1857527624eSRobert Watson 
1867527624eSRobert Watson 	/*
1877527624eSRobert Watson 	 * Validate tunables, coerce to sensible values.
1887527624eSRobert Watson 	 */
1897527624eSRobert Watson 	switch (rss_hashalgo) {
1907527624eSRobert Watson 	case RSS_HASH_TOEPLITZ:
1917527624eSRobert Watson 	case RSS_HASH_NAIVE:
1927527624eSRobert Watson 		break;
1937527624eSRobert Watson 
1947527624eSRobert Watson 	default:
1957527624eSRobert Watson 		printf("%s: invalid RSS hashalgo %u, coercing to %u",
1967527624eSRobert Watson 		    __func__, rss_hashalgo, RSS_HASH_TOEPLITZ);
1977527624eSRobert Watson 		rss_hashalgo = RSS_HASH_TOEPLITZ;
1987527624eSRobert Watson 	}
1997527624eSRobert Watson 
2007527624eSRobert Watson 	/*
2017527624eSRobert Watson 	 * Count available CPUs.
2027527624eSRobert Watson 	 *
2037527624eSRobert Watson 	 * XXXRW: Note incorrect assumptions regarding contiguity of this set
2047527624eSRobert Watson 	 * elsewhere.
2057527624eSRobert Watson 	 */
2067527624eSRobert Watson 	rss_ncpus = 0;
2077527624eSRobert Watson 	for (i = 0; i <= mp_maxid; i++) {
2087527624eSRobert Watson 		if (CPU_ABSENT(i))
2097527624eSRobert Watson 			continue;
2107527624eSRobert Watson 		rss_ncpus++;
2117527624eSRobert Watson 	}
2127527624eSRobert Watson 	if (rss_ncpus > RSS_MAXCPUS)
2137527624eSRobert Watson 		rss_ncpus = RSS_MAXCPUS;
2147527624eSRobert Watson 
2157527624eSRobert Watson 	/*
2167527624eSRobert Watson 	 * Tune RSS table entries to be no less than 2x the number of CPUs
2177527624eSRobert Watson 	 * -- unless we're running uniprocessor, in which case there's not
2187527624eSRobert Watson 	 * much point in having buckets to rearrange for load-balancing!
2197527624eSRobert Watson 	 */
2207527624eSRobert Watson 	if (rss_ncpus > 1) {
2217527624eSRobert Watson 		if (rss_bits == 0)
2227527624eSRobert Watson 			rss_bits = fls(rss_ncpus - 1) + 1;
2237527624eSRobert Watson 
2247527624eSRobert Watson 		/*
2257527624eSRobert Watson 		 * Microsoft limits RSS table entries to 128, so apply that
2267527624eSRobert Watson 		 * limit to both auto-detected CPU counts and user-configured
2277527624eSRobert Watson 		 * ones.
2287527624eSRobert Watson 		 */
2297527624eSRobert Watson 		if (rss_bits == 0 || rss_bits > RSS_MAXBITS) {
2307527624eSRobert Watson 			printf("%s: RSS bits %u not valid, coercing to  %u",
2317527624eSRobert Watson 			    __func__, rss_bits, RSS_MAXBITS);
2327527624eSRobert Watson 			rss_bits = RSS_MAXBITS;
2337527624eSRobert Watson 		}
2347527624eSRobert Watson 
2357527624eSRobert Watson 		/*
2367527624eSRobert Watson 		 * Figure out how many buckets to use; warn if less than the
2377527624eSRobert Watson 		 * number of configured CPUs, although this is not a fatal
2387527624eSRobert Watson 		 * problem.
2397527624eSRobert Watson 		 */
2407527624eSRobert Watson 		rss_buckets = (1 << rss_bits);
2417527624eSRobert Watson 		if (rss_buckets < rss_ncpus)
2427527624eSRobert Watson 			printf("%s: WARNING: rss_buckets (%u) less than "
2437527624eSRobert Watson 			    "rss_ncpus (%u)\n", __func__, rss_buckets,
2447527624eSRobert Watson 			    rss_ncpus);
2457527624eSRobert Watson 		rss_mask = rss_buckets - 1;
2467527624eSRobert Watson 	} else {
2477527624eSRobert Watson 		rss_bits = 0;
2487527624eSRobert Watson 		rss_buckets = 1;
2497527624eSRobert Watson 		rss_mask = 0;
2507527624eSRobert Watson 	}
2517527624eSRobert Watson 
2527527624eSRobert Watson 	/*
2537527624eSRobert Watson 	 * Set up initial CPU assignments: round-robin by default.
2547527624eSRobert Watson 	 */
255bad008ceSAdrian Chadd 	cpuid = CPU_FIRST();
256bad008ceSAdrian Chadd 	for (i = 0; i < rss_buckets; i++) {
257bad008ceSAdrian Chadd 		rss_table[i].rte_cpu = cpuid;
258bad008ceSAdrian Chadd 		cpuid = CPU_NEXT(cpuid);
259bad008ceSAdrian Chadd 	}
2607527624eSRobert Watson 
2617527624eSRobert Watson 	/*
2627527624eSRobert Watson 	 * Randomize rrs_key.
2637527624eSRobert Watson 	 *
2647527624eSRobert Watson 	 * XXXRW: Not yet.  If nothing else, will require an rss_isbadkey()
2657527624eSRobert Watson 	 * loop to check for "bad" RSS keys.
2667527624eSRobert Watson 	 */
2677527624eSRobert Watson }
2687527624eSRobert Watson SYSINIT(rss_init, SI_SUB_SOFTINTR, SI_ORDER_SECOND, rss_init, NULL);
2697527624eSRobert Watson 
/*
 * Trivial hash: the 32-bit sum (wrapping on overflow) of every byte of the
 * key followed by every byte of the data.  Selected when rss_hashalgo is
 * RSS_HASH_NAIVE.
 */
static uint32_t
rss_naive_hash(u_int keylen, const uint8_t *key, u_int datalen,
    const uint8_t *data)
{
	uint32_t sum = 0;

	/* Walk each buffer front to back, consuming the by-value lengths. */
	while (keylen-- > 0)
		sum += *key++;
	while (datalen-- > 0)
		sum += *data++;
	return (sum);
}
2847527624eSRobert Watson 
2857527624eSRobert Watson static uint32_t
2867527624eSRobert Watson rss_hash(u_int datalen, const uint8_t *data)
2877527624eSRobert Watson {
2887527624eSRobert Watson 
2897527624eSRobert Watson 	switch (rss_hashalgo) {
2907527624eSRobert Watson 	case RSS_HASH_TOEPLITZ:
2917527624eSRobert Watson 		return (toeplitz_hash(sizeof(rss_key), rss_key, datalen,
2927527624eSRobert Watson 		    data));
2937527624eSRobert Watson 
2947527624eSRobert Watson 	case RSS_HASH_NAIVE:
2957527624eSRobert Watson 		return (rss_naive_hash(sizeof(rss_key), rss_key, datalen,
2967527624eSRobert Watson 		    data));
2977527624eSRobert Watson 
2987527624eSRobert Watson 	default:
2997527624eSRobert Watson 		panic("%s: unsupported/unknown hashalgo %d", __func__,
3007527624eSRobert Watson 		    rss_hashalgo);
3017527624eSRobert Watson 	}
3027527624eSRobert Watson }
3037527624eSRobert Watson 
/*
 * Hash an IPv4 2-tuple.
 *
 * The source address and then the destination address are copied, byte for
 * byte as passed in, into one contiguous buffer which is handed to
 * rss_hash().
 */
uint32_t
rss_hash_ip4_2tuple(struct in_addr src, struct in_addr dst)
{
	uint8_t buf[sizeof(src) + sizeof(dst)];
	uint8_t *cursor = buf;

	bcopy(&src, cursor, sizeof(src));
	cursor += sizeof(src);
	bcopy(&dst, cursor, sizeof(dst));
	cursor += sizeof(dst);

	/* cursor now sits one past the last byte written. */
	return (rss_hash((u_int)(cursor - buf), buf));
}
3207527624eSRobert Watson 
/*
 * Hash an IPv4 4-tuple.
 *
 * The hashed byte string is, in order: source address, destination address,
 * source port, destination port.  Each field is copied exactly as passed in
 * (no byte-order conversion is performed here) before being handed to
 * rss_hash().
 */
uint32_t
rss_hash_ip4_4tuple(struct in_addr src, u_short srcport, struct in_addr dst,
    u_short dstport)
{
	uint8_t buf[sizeof(src) + sizeof(dst) + sizeof(srcport) +
	    sizeof(dstport)];
	uint8_t *cursor = buf;

	bcopy(&src, cursor, sizeof(src));
	cursor += sizeof(src);
	bcopy(&dst, cursor, sizeof(dst));
	cursor += sizeof(dst);
	bcopy(&srcport, cursor, sizeof(srcport));
	cursor += sizeof(srcport);
	bcopy(&dstport, cursor, sizeof(dstport));
	cursor += sizeof(dstport);

	/* cursor now sits one past the last byte written. */
	return (rss_hash((u_int)(cursor - buf), buf));
}
3437527624eSRobert Watson 
3447527624eSRobert Watson #ifdef INET6
/*
 * Hash an IPv6 2-tuple.
 *
 * The source address and then the destination address are copied, byte for
 * byte as passed in, into one contiguous buffer which is handed to
 * rss_hash().
 */
uint32_t
rss_hash_ip6_2tuple(struct in6_addr src, struct in6_addr dst)
{
	uint8_t buf[sizeof(src) + sizeof(dst)];
	uint8_t *cursor = buf;

	bcopy(&src, cursor, sizeof(src));
	cursor += sizeof(src);
	bcopy(&dst, cursor, sizeof(dst));
	cursor += sizeof(dst);

	/* cursor now sits one past the last byte written. */
	return (rss_hash((u_int)(cursor - buf), buf));
}
3617527624eSRobert Watson 
/*
 * Hash an IPv6 4-tuple.
 *
 * The hashed byte string is, in order: source address, destination address,
 * source port, destination port.  Each field is copied exactly as passed in
 * (no byte-order conversion is performed here) before being handed to
 * rss_hash().
 */
uint32_t
rss_hash_ip6_4tuple(struct in6_addr src, u_short srcport,
    struct in6_addr dst, u_short dstport)
{
	uint8_t buf[sizeof(src) + sizeof(dst) + sizeof(srcport) +
	    sizeof(dstport)];
	uint8_t *cursor = buf;

	bcopy(&src, cursor, sizeof(src));
	cursor += sizeof(src);
	bcopy(&dst, cursor, sizeof(dst));
	cursor += sizeof(dst);
	bcopy(&srcport, cursor, sizeof(srcport));
	cursor += sizeof(srcport);
	bcopy(&dstport, cursor, sizeof(dstport));
	cursor += sizeof(dstport);

	/* cursor now sits one past the last byte written. */
	return (rss_hash((u_int)(cursor - buf), buf));
}
3847527624eSRobert Watson #endif /* INET6 */
3857527624eSRobert Watson 
3867527624eSRobert Watson /*
3877527624eSRobert Watson  * Query the number of RSS bits in use.
3887527624eSRobert Watson  */
3897527624eSRobert Watson u_int
3907527624eSRobert Watson rss_getbits(void)
3917527624eSRobert Watson {
3927527624eSRobert Watson 
3937527624eSRobert Watson 	return (rss_bits);
3947527624eSRobert Watson }
3957527624eSRobert Watson 
3967527624eSRobert Watson /*
3977527624eSRobert Watson  * Query the RSS bucket associated with an RSS hash.
3987527624eSRobert Watson  */
3997527624eSRobert Watson u_int
4007527624eSRobert Watson rss_getbucket(u_int hash)
4017527624eSRobert Watson {
4027527624eSRobert Watson 
4037527624eSRobert Watson 	return (hash & rss_mask);
4047527624eSRobert Watson }
4057527624eSRobert Watson 
4067527624eSRobert Watson /*
407a6c88ec4SAdrian Chadd  * Query the RSS layer bucket associated with the given
408a6c88ec4SAdrian Chadd  * entry in the RSS hash space.
409a6c88ec4SAdrian Chadd  *
410a6c88ec4SAdrian Chadd  * The RSS indirection table is 0 .. rss_buckets-1,
411a6c88ec4SAdrian Chadd  * covering the low 'rss_bits' of the total 128 slot
412a6c88ec4SAdrian Chadd  * RSS indirection table.  So just mask off rss_bits and
413a6c88ec4SAdrian Chadd  * return that.
414a6c88ec4SAdrian Chadd  *
415a6c88ec4SAdrian Chadd  * NIC drivers can then iterate over the 128 slot RSS
416a6c88ec4SAdrian Chadd  * indirection table and fetch which RSS bucket to
417a6c88ec4SAdrian Chadd  * map it to.  This will typically be a CPU queue
418a6c88ec4SAdrian Chadd  */
419a6c88ec4SAdrian Chadd u_int
420a6c88ec4SAdrian Chadd rss_get_indirection_to_bucket(u_int index)
421a6c88ec4SAdrian Chadd {
422a6c88ec4SAdrian Chadd 
423a6c88ec4SAdrian Chadd 	return (index & rss_mask);
424a6c88ec4SAdrian Chadd }
425a6c88ec4SAdrian Chadd 
426a6c88ec4SAdrian Chadd /*
4277527624eSRobert Watson  * Query the RSS CPU associated with an RSS bucket.
4287527624eSRobert Watson  */
4297527624eSRobert Watson u_int
4307527624eSRobert Watson rss_getcpu(u_int bucket)
4317527624eSRobert Watson {
4327527624eSRobert Watson 
4337527624eSRobert Watson 	return (rss_table[bucket].rte_cpu);
4347527624eSRobert Watson }
4357527624eSRobert Watson 
4367527624eSRobert Watson /*
437cc6c1877SAdrian Chadd  * netisr CPU affinity lookup given just the hash and hashtype.
438cc6c1877SAdrian Chadd  */
439cc6c1877SAdrian Chadd u_int
440cc6c1877SAdrian Chadd rss_hash2cpuid(uint32_t hash_val, uint32_t hash_type)
441cc6c1877SAdrian Chadd {
442cc6c1877SAdrian Chadd 
443cc6c1877SAdrian Chadd 	switch (hash_type) {
444cc6c1877SAdrian Chadd 	case M_HASHTYPE_RSS_IPV4:
445cc6c1877SAdrian Chadd 	case M_HASHTYPE_RSS_TCP_IPV4:
4469870806cSAdrian Chadd 	case M_HASHTYPE_RSS_UDP_IPV4:
447e989b65fSAdrian Chadd 	case M_HASHTYPE_RSS_IPV6:
448e989b65fSAdrian Chadd 	case M_HASHTYPE_RSS_TCP_IPV6:
4499870806cSAdrian Chadd 	case M_HASHTYPE_RSS_UDP_IPV6:
450cc6c1877SAdrian Chadd 		return (rss_getcpu(rss_getbucket(hash_val)));
451cc6c1877SAdrian Chadd 	default:
452cc6c1877SAdrian Chadd 		return (NETISR_CPUID_NONE);
453cc6c1877SAdrian Chadd 	}
454cc6c1877SAdrian Chadd }
455cc6c1877SAdrian Chadd 
456cc6c1877SAdrian Chadd /*
4578bde802aSAdrian Chadd  * Query the RSS bucket associated with the given hash value and
4588bde802aSAdrian Chadd  * type.
4598bde802aSAdrian Chadd  */
4608bde802aSAdrian Chadd int
4618bde802aSAdrian Chadd rss_hash2bucket(uint32_t hash_val, uint32_t hash_type, uint32_t *bucket_id)
4628bde802aSAdrian Chadd {
4638bde802aSAdrian Chadd 
4648bde802aSAdrian Chadd 	switch (hash_type) {
4658bde802aSAdrian Chadd 	case M_HASHTYPE_RSS_IPV4:
4668bde802aSAdrian Chadd 	case M_HASHTYPE_RSS_TCP_IPV4:
4679870806cSAdrian Chadd 	case M_HASHTYPE_RSS_UDP_IPV4:
468e989b65fSAdrian Chadd 	case M_HASHTYPE_RSS_IPV6:
469e989b65fSAdrian Chadd 	case M_HASHTYPE_RSS_TCP_IPV6:
4709870806cSAdrian Chadd 	case M_HASHTYPE_RSS_UDP_IPV6:
4718bde802aSAdrian Chadd 		*bucket_id = rss_getbucket(hash_val);
4728bde802aSAdrian Chadd 		return (0);
4738bde802aSAdrian Chadd 	default:
4748bde802aSAdrian Chadd 		return (-1);
4758bde802aSAdrian Chadd 	}
4768bde802aSAdrian Chadd }
4778bde802aSAdrian Chadd 
4788bde802aSAdrian Chadd /*
4797527624eSRobert Watson  * netisr CPU affinity lookup routine for use by protocols.
4807527624eSRobert Watson  */
4817527624eSRobert Watson struct mbuf *
4827527624eSRobert Watson rss_m2cpuid(struct mbuf *m, uintptr_t source, u_int *cpuid)
4837527624eSRobert Watson {
4847527624eSRobert Watson 
4857527624eSRobert Watson 	M_ASSERTPKTHDR(m);
486cc6c1877SAdrian Chadd 	*cpuid = rss_hash2cpuid(m->m_pkthdr.flowid, M_HASHTYPE_GET(m));
4877527624eSRobert Watson 	return (m);
4887527624eSRobert Watson }
4897527624eSRobert Watson 
4908bde802aSAdrian Chadd int
4918bde802aSAdrian Chadd rss_m2bucket(struct mbuf *m, uint32_t *bucket_id)
4928bde802aSAdrian Chadd {
4938bde802aSAdrian Chadd 
4948bde802aSAdrian Chadd 	M_ASSERTPKTHDR(m);
4958bde802aSAdrian Chadd 
4968bde802aSAdrian Chadd 	return(rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m),
4978bde802aSAdrian Chadd 	    bucket_id));
4988bde802aSAdrian Chadd }
4998bde802aSAdrian Chadd 
5007527624eSRobert Watson /*
50172d33245SAdrian Chadd  * Calculate an appropriate ipv4 2-tuple or 4-tuple given the given
50272d33245SAdrian Chadd  * IPv4 source/destination address, UDP or TCP source/destination ports
50372d33245SAdrian Chadd  * and the protocol type.
50472d33245SAdrian Chadd  *
50572d33245SAdrian Chadd  * The protocol code may wish to do a software hash of the given
50672d33245SAdrian Chadd  * tuple.  This depends upon the currently configured RSS hash types.
50772d33245SAdrian Chadd  *
50872d33245SAdrian Chadd  * This assumes that the packet in question isn't a fragment.
50972d33245SAdrian Chadd  *
51072d33245SAdrian Chadd  * It also assumes the packet source/destination address
51172d33245SAdrian Chadd  * are in "incoming" packet order (ie, source is "far" address.)
51272d33245SAdrian Chadd  */
51372d33245SAdrian Chadd int
51472d33245SAdrian Chadd rss_proto_software_hash_v4(struct in_addr s, struct in_addr d,
51572d33245SAdrian Chadd     u_short sp, u_short dp, int proto,
51672d33245SAdrian Chadd     uint32_t *hashval, uint32_t *hashtype)
51772d33245SAdrian Chadd {
51872d33245SAdrian Chadd 	uint32_t hash;
51972d33245SAdrian Chadd 
52072d33245SAdrian Chadd 	/*
52172d33245SAdrian Chadd 	 * Next, choose the hash type depending upon the protocol
52272d33245SAdrian Chadd 	 * identifier.
52372d33245SAdrian Chadd 	 */
52472d33245SAdrian Chadd 	if ((proto == IPPROTO_TCP) &&
52572d33245SAdrian Chadd 	    (rss_gethashconfig_local() & RSS_HASHTYPE_RSS_TCP_IPV4)) {
52672d33245SAdrian Chadd 		hash = rss_hash_ip4_4tuple(s, sp, d, dp);
52772d33245SAdrian Chadd 		*hashval = hash;
52872d33245SAdrian Chadd 		*hashtype = M_HASHTYPE_RSS_TCP_IPV4;
52972d33245SAdrian Chadd 		return (0);
53072d33245SAdrian Chadd 	} else if ((proto == IPPROTO_UDP) &&
53172d33245SAdrian Chadd 	    (rss_gethashconfig_local() & RSS_HASHTYPE_RSS_UDP_IPV4)) {
53272d33245SAdrian Chadd 		hash = rss_hash_ip4_4tuple(s, sp, d, dp);
53372d33245SAdrian Chadd 		*hashval = hash;
53472d33245SAdrian Chadd 		*hashtype = M_HASHTYPE_RSS_UDP_IPV4;
53572d33245SAdrian Chadd 		return (0);
53672d33245SAdrian Chadd 	} else if (rss_gethashconfig_local() & RSS_HASHTYPE_RSS_IPV4) {
53772d33245SAdrian Chadd 		/* RSS doesn't hash on other protocols like SCTP; so 2-tuple */
53872d33245SAdrian Chadd 		hash = rss_hash_ip4_2tuple(s, d);
53972d33245SAdrian Chadd 		*hashval = hash;
54072d33245SAdrian Chadd 		*hashtype = M_HASHTYPE_RSS_IPV4;
54172d33245SAdrian Chadd 		return (0);
54272d33245SAdrian Chadd 	}
54372d33245SAdrian Chadd 
54472d33245SAdrian Chadd 	/* No configured available hashtypes! */
54572d33245SAdrian Chadd 	printf("%s: no available hashtypes!\n", __func__);
54672d33245SAdrian Chadd 	return (-1);
54772d33245SAdrian Chadd }
54872d33245SAdrian Chadd 
54972d33245SAdrian Chadd /*
55072d33245SAdrian Chadd  * Do a software calculation of the RSS for the given mbuf.
55172d33245SAdrian Chadd  *
55272d33245SAdrian Chadd  * This is typically used by the input path to recalculate the RSS after
55372d33245SAdrian Chadd  * some form of packet processing (eg de-capsulation, IP fragment reassembly.)
55472d33245SAdrian Chadd  *
55572d33245SAdrian Chadd  * dir is the packet direction - RSS_HASH_PKT_INGRESS for incoming and
55672d33245SAdrian Chadd  * RSS_HASH_PKT_EGRESS for outgoing.
55772d33245SAdrian Chadd  *
55872d33245SAdrian Chadd  * Returns 0 if a hash was done, -1 if no hash was done, +1 if
55972d33245SAdrian Chadd  * the mbuf already had a valid RSS flowid.
56072d33245SAdrian Chadd  *
56172d33245SAdrian Chadd  * This function doesn't modify the mbuf.  It's up to the caller to
56272d33245SAdrian Chadd  * assign flowid/flowtype as appropriate.
56372d33245SAdrian Chadd  */
56472d33245SAdrian Chadd int
56572d33245SAdrian Chadd rss_mbuf_software_hash_v4(const struct mbuf *m, int dir, uint32_t *hashval,
56672d33245SAdrian Chadd     uint32_t *hashtype)
56772d33245SAdrian Chadd {
56872d33245SAdrian Chadd 	const struct ip *ip;
56972d33245SAdrian Chadd 	const struct tcphdr *th;
57072d33245SAdrian Chadd 	const struct udphdr *uh;
571*c2529042SHans Petter Selasky 	uint32_t flowid;
572*c2529042SHans Petter Selasky 	uint32_t flowtype;
57372d33245SAdrian Chadd 	uint8_t proto;
57472d33245SAdrian Chadd 	int iphlen;
57572d33245SAdrian Chadd 	int is_frag = 0;
57672d33245SAdrian Chadd 
57772d33245SAdrian Chadd 	/*
57872d33245SAdrian Chadd 	 * XXX For now this only handles hashing on incoming mbufs.
57972d33245SAdrian Chadd 	 */
58072d33245SAdrian Chadd 	if (dir != RSS_HASH_PKT_INGRESS) {
58172d33245SAdrian Chadd 		printf("%s: called on EGRESS packet!\n", __func__);
58272d33245SAdrian Chadd 		return (-1);
58372d33245SAdrian Chadd 	}
58472d33245SAdrian Chadd 
58572d33245SAdrian Chadd 	/*
58672d33245SAdrian Chadd 	 * First, validate that the mbuf we have is long enough
58772d33245SAdrian Chadd 	 * to have an IPv4 header in it.
58872d33245SAdrian Chadd 	 */
58972d33245SAdrian Chadd 	if (m->m_pkthdr.len < (sizeof(struct ip))) {
59072d33245SAdrian Chadd 		printf("%s: short mbuf pkthdr\n", __func__);
59172d33245SAdrian Chadd 		return (-1);
59272d33245SAdrian Chadd 	}
59372d33245SAdrian Chadd 	if (m->m_len < (sizeof(struct ip))) {
59472d33245SAdrian Chadd 		printf("%s: short mbuf len\n", __func__);
59572d33245SAdrian Chadd 		return (-1);
59672d33245SAdrian Chadd 	}
59772d33245SAdrian Chadd 
59872d33245SAdrian Chadd 	/* Ok, let's dereference that */
59972d33245SAdrian Chadd 	ip = mtod(m, struct ip *);
60072d33245SAdrian Chadd 	proto = ip->ip_p;
60172d33245SAdrian Chadd 	iphlen = ip->ip_hl << 2;
60272d33245SAdrian Chadd 
60372d33245SAdrian Chadd 	/*
60472d33245SAdrian Chadd 	 * If this is a fragment then it shouldn't be four-tuple
60572d33245SAdrian Chadd 	 * hashed just yet.  Once it's reassembled into a full
60672d33245SAdrian Chadd 	 * frame it should be re-hashed.
60772d33245SAdrian Chadd 	 */
60872d33245SAdrian Chadd 	if (ip->ip_off & htons(IP_MF | IP_OFFMASK))
60972d33245SAdrian Chadd 		is_frag = 1;
61072d33245SAdrian Chadd 
61172d33245SAdrian Chadd 	/*
61272d33245SAdrian Chadd 	 * If the mbuf flowid/flowtype matches the packet type,
61372d33245SAdrian Chadd 	 * and we don't support the 4-tuple version of the given protocol,
61472d33245SAdrian Chadd 	 * then signal to the owner that it can trust the flowid/flowtype
61572d33245SAdrian Chadd 	 * details.
61672d33245SAdrian Chadd 	 *
61772d33245SAdrian Chadd 	 * This is a little picky - eg, if TCPv4 / UDPv4 hashing
61872d33245SAdrian Chadd 	 * is supported but we got a TCP/UDP frame only 2-tuple hashed,
61972d33245SAdrian Chadd 	 * then we shouldn't just "trust" the 2-tuple hash.  We need
62072d33245SAdrian Chadd 	 * a 4-tuple hash.
62172d33245SAdrian Chadd 	 */
62272d33245SAdrian Chadd 	flowid = m->m_pkthdr.flowid;
62372d33245SAdrian Chadd 	flowtype = M_HASHTYPE_GET(m);
62472d33245SAdrian Chadd 
625*c2529042SHans Petter Selasky 	if (flowtype != M_HASHTYPE_NONE) {
62672d33245SAdrian Chadd 		switch (proto) {
62772d33245SAdrian Chadd 		case IPPROTO_UDP:
62872d33245SAdrian Chadd 			if ((rss_gethashconfig_local() & RSS_HASHTYPE_RSS_UDP_IPV4) &&
62972d33245SAdrian Chadd 			    (flowtype == M_HASHTYPE_RSS_UDP_IPV4) &&
63072d33245SAdrian Chadd 			    (is_frag == 0)) {
63172d33245SAdrian Chadd 				return (1);
63272d33245SAdrian Chadd 			}
63372d33245SAdrian Chadd 			/*
63472d33245SAdrian Chadd 			 * Only allow 2-tuple for UDP frames if we don't also
63572d33245SAdrian Chadd 			 * support 4-tuple for UDP.
63672d33245SAdrian Chadd 			 */
63772d33245SAdrian Chadd 			if ((rss_gethashconfig_local() & RSS_HASHTYPE_RSS_IPV4) &&
63872d33245SAdrian Chadd 			    ((rss_gethashconfig_local() & RSS_HASHTYPE_RSS_UDP_IPV4) == 0) &&
63972d33245SAdrian Chadd 			    flowtype == M_HASHTYPE_RSS_IPV4) {
64072d33245SAdrian Chadd 				return (1);
64172d33245SAdrian Chadd 			}
64272d33245SAdrian Chadd 			break;
64372d33245SAdrian Chadd 		case IPPROTO_TCP:
64472d33245SAdrian Chadd 			if ((rss_gethashconfig_local() & RSS_HASHTYPE_RSS_TCP_IPV4) &&
64572d33245SAdrian Chadd 			    (flowtype == M_HASHTYPE_RSS_TCP_IPV4) &&
64672d33245SAdrian Chadd 			    (is_frag == 0)) {
64772d33245SAdrian Chadd 				return (1);
64872d33245SAdrian Chadd 			}
64972d33245SAdrian Chadd 			/*
65072d33245SAdrian Chadd 			 * Only allow 2-tuple for TCP frames if we don't also
65172d33245SAdrian Chadd 			 * support 4-tuple for TCP.
65272d33245SAdrian Chadd 			 */
65372d33245SAdrian Chadd 			if ((rss_gethashconfig_local() & RSS_HASHTYPE_RSS_IPV4) &&
65472d33245SAdrian Chadd 			    ((rss_gethashconfig_local() & RSS_HASHTYPE_RSS_TCP_IPV4) == 0) &&
65572d33245SAdrian Chadd 			    flowtype == M_HASHTYPE_RSS_IPV4) {
65672d33245SAdrian Chadd 				return (1);
65772d33245SAdrian Chadd 			}
65872d33245SAdrian Chadd 			break;
65972d33245SAdrian Chadd 		default:
66072d33245SAdrian Chadd 			if ((rss_gethashconfig_local() & RSS_HASHTYPE_RSS_IPV4) &&
66172d33245SAdrian Chadd 			    flowtype == M_HASHTYPE_RSS_IPV4) {
66272d33245SAdrian Chadd 				return (1);
66372d33245SAdrian Chadd 			}
66472d33245SAdrian Chadd 			break;
66572d33245SAdrian Chadd 		}
66672d33245SAdrian Chadd 	}
66772d33245SAdrian Chadd 
66872d33245SAdrian Chadd 	/*
66972d33245SAdrian Chadd 	 * Decode enough information to make a hash decision.
67072d33245SAdrian Chadd 	 *
67172d33245SAdrian Chadd 	 * XXX TODO: does the hardware hash on 4-tuple if IP
67272d33245SAdrian Chadd 	 *    options are present?
67372d33245SAdrian Chadd 	 */
674f4659f4cSAdrian Chadd 	if ((rss_gethashconfig_local() & RSS_HASHTYPE_RSS_TCP_IPV4) &&
675f4659f4cSAdrian Chadd 	    (proto == IPPROTO_TCP) &&
676f4659f4cSAdrian Chadd 	    (is_frag == 0)) {
67772d33245SAdrian Chadd 		if (m->m_len < iphlen + sizeof(struct tcphdr)) {
67872d33245SAdrian Chadd 			printf("%s: short TCP frame?\n", __func__);
67972d33245SAdrian Chadd 			return (-1);
68072d33245SAdrian Chadd 		}
68172d33245SAdrian Chadd 		th = (struct tcphdr *)((caddr_t)ip + iphlen);
68272d33245SAdrian Chadd 		return rss_proto_software_hash_v4(ip->ip_src, ip->ip_dst,
68372d33245SAdrian Chadd 		    th->th_sport,
68472d33245SAdrian Chadd 		    th->th_dport,
68572d33245SAdrian Chadd 		    proto,
68672d33245SAdrian Chadd 		    hashval,
68772d33245SAdrian Chadd 		    hashtype);
688f4659f4cSAdrian Chadd 	} else if ((rss_gethashconfig_local() & RSS_HASHTYPE_RSS_UDP_IPV4) &&
689f4659f4cSAdrian Chadd 	    (proto == IPPROTO_UDP) &&
690f4659f4cSAdrian Chadd 	    (is_frag == 0)) {
69172d33245SAdrian Chadd 		uh = (struct udphdr *)((caddr_t)ip + iphlen);
69272d33245SAdrian Chadd 		if (m->m_len < iphlen + sizeof(struct udphdr)) {
69372d33245SAdrian Chadd 			printf("%s: short UDP frame?\n", __func__);
69472d33245SAdrian Chadd 			return (-1);
69572d33245SAdrian Chadd 		}
69672d33245SAdrian Chadd 		return rss_proto_software_hash_v4(ip->ip_src, ip->ip_dst,
69772d33245SAdrian Chadd 		    uh->uh_sport,
69872d33245SAdrian Chadd 		    uh->uh_dport,
69972d33245SAdrian Chadd 		    proto,
70072d33245SAdrian Chadd 		    hashval,
70172d33245SAdrian Chadd 		    hashtype);
702f4659f4cSAdrian Chadd 	} else if (rss_gethashconfig_local() & RSS_HASHTYPE_RSS_IPV4) {
70372d33245SAdrian Chadd 		/* Default to 2-tuple hash */
70472d33245SAdrian Chadd 		return rss_proto_software_hash_v4(ip->ip_src, ip->ip_dst,
70572d33245SAdrian Chadd 		    0,	/* source port */
70672d33245SAdrian Chadd 		    0,	/* destination port */
70772d33245SAdrian Chadd 		    0,	/* IPPROTO_IP */
70872d33245SAdrian Chadd 		    hashval,
70972d33245SAdrian Chadd 		    hashtype);
710f4659f4cSAdrian Chadd 	} else {
711f4659f4cSAdrian Chadd 		printf("%s: no available hashtypes!\n", __func__);
712f4659f4cSAdrian Chadd 		return (-1);
71372d33245SAdrian Chadd 	}
71472d33245SAdrian Chadd }
71572d33245SAdrian Chadd 
71672d33245SAdrian Chadd /*
71772d33245SAdrian Chadd  * Similar to rss_m2cpuid, but designed to be used by the IP NETISR
71872d33245SAdrian Chadd  * on incoming frames.
71972d33245SAdrian Chadd  *
72072d33245SAdrian Chadd  * If an existing RSS hash exists and it matches what the configured
72172d33245SAdrian Chadd  * hashing is, then use it.
72272d33245SAdrian Chadd  *
72372d33245SAdrian Chadd  * If there's an existing RSS hash but the desired hash is different,
72472d33245SAdrian Chadd  * or if there's no useful RSS hash, then calculate it via
72572d33245SAdrian Chadd  * the software path.
72672d33245SAdrian Chadd  *
72772d33245SAdrian Chadd  * XXX TODO: definitely want statistics here!
72872d33245SAdrian Chadd  */
72972d33245SAdrian Chadd struct mbuf *
73072d33245SAdrian Chadd rss_soft_m2cpuid(struct mbuf *m, uintptr_t source, u_int *cpuid)
73172d33245SAdrian Chadd {
73272d33245SAdrian Chadd 	uint32_t hash_val, hash_type;
73372d33245SAdrian Chadd 	int ret;
73472d33245SAdrian Chadd 
73572d33245SAdrian Chadd 	M_ASSERTPKTHDR(m);
73672d33245SAdrian Chadd 
73772d33245SAdrian Chadd 	ret = rss_mbuf_software_hash_v4(m, RSS_HASH_PKT_INGRESS,
73872d33245SAdrian Chadd 	    &hash_val, &hash_type);
73972d33245SAdrian Chadd 	if (ret > 0) {
74072d33245SAdrian Chadd 		/* mbuf has a valid hash already; don't need to modify it */
74172d33245SAdrian Chadd 		*cpuid = rss_hash2cpuid(m->m_pkthdr.flowid, M_HASHTYPE_GET(m));
74272d33245SAdrian Chadd 	} else if (ret == 0) {
74372d33245SAdrian Chadd 		/* hash was done; update */
74472d33245SAdrian Chadd 		m->m_pkthdr.flowid = hash_val;
74572d33245SAdrian Chadd 		M_HASHTYPE_SET(m, hash_type);
74672d33245SAdrian Chadd 		*cpuid = rss_hash2cpuid(m->m_pkthdr.flowid, M_HASHTYPE_GET(m));
74772d33245SAdrian Chadd 	} else { /* ret < 0 */
74872d33245SAdrian Chadd 		/* no hash was done */
74972d33245SAdrian Chadd 		*cpuid = NETISR_CPUID_NONE;
75072d33245SAdrian Chadd 	}
75172d33245SAdrian Chadd 	return (m);
75272d33245SAdrian Chadd }
75372d33245SAdrian Chadd 
75472d33245SAdrian Chadd /*
7557527624eSRobert Watson  * Query the RSS hash algorithm.
7567527624eSRobert Watson  */
7577527624eSRobert Watson u_int
7587527624eSRobert Watson rss_gethashalgo(void)
7597527624eSRobert Watson {
7607527624eSRobert Watson 
7617527624eSRobert Watson 	return (rss_hashalgo);
7627527624eSRobert Watson }
7637527624eSRobert Watson 
7647527624eSRobert Watson /*
7657527624eSRobert Watson  * Query the current RSS key; likely to be used by device drivers when
7667527624eSRobert Watson  * configuring hardware RSS.  Caller must pass an array of size RSS_KEYSIZE.
7677527624eSRobert Watson  *
7687527624eSRobert Watson  * XXXRW: Perhaps we should do the accept-a-length-and-truncate thing?
7697527624eSRobert Watson  */
7707527624eSRobert Watson void
7717527624eSRobert Watson rss_getkey(uint8_t *key)
7727527624eSRobert Watson {
7737527624eSRobert Watson 
7747527624eSRobert Watson 	bcopy(rss_key, key, sizeof(rss_key));
7757527624eSRobert Watson }
7767527624eSRobert Watson 
7777527624eSRobert Watson /*
7787527624eSRobert Watson  * Query the number of buckets; this may be used by both network device
7797527624eSRobert Watson  * drivers, which will need to populate hardware shadows of the software
7807527624eSRobert Watson  * indirection table, and the network stack itself (such as when deciding how
7817527624eSRobert Watson  * many connection groups to allocate).
7827527624eSRobert Watson  */
7837527624eSRobert Watson u_int
7847527624eSRobert Watson rss_getnumbuckets(void)
7857527624eSRobert Watson {
7867527624eSRobert Watson 
7877527624eSRobert Watson 	return (rss_buckets);
7887527624eSRobert Watson }
7897527624eSRobert Watson 
7907527624eSRobert Watson /*
7917527624eSRobert Watson  * Query the number of CPUs in use by RSS; may be useful to device drivers
7927527624eSRobert Watson  * trying to figure out how to map a larger number of CPUs into a smaller
7937527624eSRobert Watson  * number of receive queues.
7947527624eSRobert Watson  */
7957527624eSRobert Watson u_int
7967527624eSRobert Watson rss_getnumcpus(void)
7977527624eSRobert Watson {
7987527624eSRobert Watson 
7997527624eSRobert Watson 	return (rss_ncpus);
8007527624eSRobert Watson }
8017527624eSRobert Watson 
80272d33245SAdrian Chadd static inline u_int
80372d33245SAdrian Chadd rss_gethashconfig_local(void)
80440c753e3SAdrian Chadd {
80572d33245SAdrian Chadd 
80640c753e3SAdrian Chadd 	/* Return 4-tuple for TCP; 2-tuple for others */
80740c753e3SAdrian Chadd 	/*
80840c753e3SAdrian Chadd 	 * UDP may fragment more often than TCP and thus we'll end up with
80940c753e3SAdrian Chadd 	 * NICs returning 2-tuple fragments.
81040c753e3SAdrian Chadd 	 * udp_init() and udplite_init() both currently initialise things
81140c753e3SAdrian Chadd 	 * as 2-tuple.
81240c753e3SAdrian Chadd 	 * So for now disable UDP 4-tuple hashing until all of the other
81340c753e3SAdrian Chadd 	 * pieces are in place.
81440c753e3SAdrian Chadd 	 */
81540c753e3SAdrian Chadd 	return (
81640c753e3SAdrian Chadd 	    RSS_HASHTYPE_RSS_IPV4
81740c753e3SAdrian Chadd 	|    RSS_HASHTYPE_RSS_TCP_IPV4
81840c753e3SAdrian Chadd 	|    RSS_HASHTYPE_RSS_IPV6
81940c753e3SAdrian Chadd 	|    RSS_HASHTYPE_RSS_TCP_IPV6
82040c753e3SAdrian Chadd 	|    RSS_HASHTYPE_RSS_IPV6_EX
82140c753e3SAdrian Chadd 	|    RSS_HASHTYPE_RSS_TCP_IPV6_EX
82240c753e3SAdrian Chadd #if 0
82340c753e3SAdrian Chadd 	|    RSS_HASHTYPE_RSS_UDP_IPV4
82440c753e3SAdrian Chadd 	|    RSS_HASHTYPE_RSS_UDP_IPV4_EX
82540c753e3SAdrian Chadd 	|    RSS_HASHTYPE_RSS_UDP_IPV6
82640c753e3SAdrian Chadd 	|    RSS_HASHTYPE_RSS_UDP_IPV6_EX
82740c753e3SAdrian Chadd #endif
82840c753e3SAdrian Chadd 	);
82940c753e3SAdrian Chadd }
83040c753e3SAdrian Chadd 
83140c753e3SAdrian Chadd /*
83272d33245SAdrian Chadd  * Return the supported RSS hash configuration.
83372d33245SAdrian Chadd  *
83472d33245SAdrian Chadd  * NICs should query this to determine what to configure in their redirection
83572d33245SAdrian Chadd  * matching table.
83672d33245SAdrian Chadd  */
83772d33245SAdrian Chadd u_int
83872d33245SAdrian Chadd rss_gethashconfig(void)
83972d33245SAdrian Chadd {
84072d33245SAdrian Chadd 
84172d33245SAdrian Chadd 	return (rss_gethashconfig_local());
84272d33245SAdrian Chadd }
84372d33245SAdrian Chadd 
84472d33245SAdrian Chadd /*
8457527624eSRobert Watson  * XXXRW: Confirm that sysctl -a won't dump this keying material, don't want
8467527624eSRobert Watson  * it appearing in debugging output unnecessarily.
8477527624eSRobert Watson  */
8487527624eSRobert Watson static int
8497527624eSRobert Watson sysctl_rss_key(SYSCTL_HANDLER_ARGS)
8507527624eSRobert Watson {
8517527624eSRobert Watson 	uint8_t temp_rss_key[RSS_KEYSIZE];
8527527624eSRobert Watson 	int error;
8537527624eSRobert Watson 
8547527624eSRobert Watson 	error = priv_check(req->td, PRIV_NETINET_HASHKEY);
8557527624eSRobert Watson 	if (error)
8567527624eSRobert Watson 		return (error);
8577527624eSRobert Watson 
8587527624eSRobert Watson 	bcopy(rss_key, temp_rss_key, sizeof(temp_rss_key));
8597527624eSRobert Watson 	error = sysctl_handle_opaque(oidp, temp_rss_key,
8607527624eSRobert Watson 	    sizeof(temp_rss_key), req);
8617527624eSRobert Watson 	if (error)
8627527624eSRobert Watson 		return (error);
8637527624eSRobert Watson 	if (req->newptr != NULL) {
8647527624eSRobert Watson 		/* XXXRW: Not yet. */
8657527624eSRobert Watson 		return (EINVAL);
8667527624eSRobert Watson 	}
8677527624eSRobert Watson 	return (0);
8687527624eSRobert Watson }
8697527624eSRobert Watson SYSCTL_PROC(_net_inet_rss, OID_AUTO, key,
8707527624eSRobert Watson     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_rss_key,
8717527624eSRobert Watson     "", "RSS keying material");
8728bde802aSAdrian Chadd 
8738bde802aSAdrian Chadd static int
8748bde802aSAdrian Chadd sysctl_rss_bucket_mapping(SYSCTL_HANDLER_ARGS)
8758bde802aSAdrian Chadd {
8768bde802aSAdrian Chadd 	struct sbuf *sb;
8778bde802aSAdrian Chadd 	int error;
8788bde802aSAdrian Chadd 	int i;
8798bde802aSAdrian Chadd 
8808bde802aSAdrian Chadd 	error = 0;
8818bde802aSAdrian Chadd 	error = sysctl_wire_old_buffer(req, 0);
8828bde802aSAdrian Chadd 	if (error != 0)
8838bde802aSAdrian Chadd 		return (error);
8848bde802aSAdrian Chadd 	sb = sbuf_new_for_sysctl(NULL, NULL, 512, req);
8858bde802aSAdrian Chadd 	if (sb == NULL)
8868bde802aSAdrian Chadd 		return (ENOMEM);
8878bde802aSAdrian Chadd 	for (i = 0; i < rss_buckets; i++) {
8888bde802aSAdrian Chadd 		sbuf_printf(sb, "%s%d:%d", i == 0 ? "" : " ",
8898bde802aSAdrian Chadd 		    i,
8908bde802aSAdrian Chadd 		    rss_getcpu(i));
8918bde802aSAdrian Chadd 	}
8928bde802aSAdrian Chadd 	error = sbuf_finish(sb);
8938bde802aSAdrian Chadd 	sbuf_delete(sb);
8948bde802aSAdrian Chadd 
8958bde802aSAdrian Chadd 	return (error);
8968bde802aSAdrian Chadd }
8978bde802aSAdrian Chadd SYSCTL_PROC(_net_inet_rss, OID_AUTO, bucket_mapping,
8988bde802aSAdrian Chadd     CTLTYPE_STRING | CTLFLAG_RD, NULL, 0,
8998bde802aSAdrian Chadd     sysctl_rss_bucket_mapping, "", "RSS bucket -> CPU mapping");
900