xref: /freebsd/sys/net/rss_config.c (revision e5562eb9343bddcc11e791103c3371f0528fe481)
1b2bdc62aSAdrian Chadd /*-
2b2bdc62aSAdrian Chadd  * Copyright (c) 2010-2011 Juniper Networks, Inc.
3b2bdc62aSAdrian Chadd  * All rights reserved.
4b2bdc62aSAdrian Chadd  *
5b2bdc62aSAdrian Chadd  * This software was developed by Robert N. M. Watson under contract
6b2bdc62aSAdrian Chadd  * to Juniper Networks, Inc.
7b2bdc62aSAdrian Chadd  *
8b2bdc62aSAdrian Chadd  * Redistribution and use in source and binary forms, with or without
9b2bdc62aSAdrian Chadd  * modification, are permitted provided that the following conditions
10b2bdc62aSAdrian Chadd  * are met:
11b2bdc62aSAdrian Chadd  * 1. Redistributions of source code must retain the above copyright
12b2bdc62aSAdrian Chadd  *    notice, this list of conditions and the following disclaimer.
13b2bdc62aSAdrian Chadd  * 2. Redistributions in binary form must reproduce the above copyright
14b2bdc62aSAdrian Chadd  *    notice, this list of conditions and the following disclaimer in the
15b2bdc62aSAdrian Chadd  *    documentation and/or other materials provided with the distribution.
16b2bdc62aSAdrian Chadd  *
17b2bdc62aSAdrian Chadd  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18b2bdc62aSAdrian Chadd  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19b2bdc62aSAdrian Chadd  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20b2bdc62aSAdrian Chadd  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21b2bdc62aSAdrian Chadd  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22b2bdc62aSAdrian Chadd  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23b2bdc62aSAdrian Chadd  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24b2bdc62aSAdrian Chadd  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25b2bdc62aSAdrian Chadd  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26b2bdc62aSAdrian Chadd  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27b2bdc62aSAdrian Chadd  * SUCH DAMAGE.
28b2bdc62aSAdrian Chadd  */
29b2bdc62aSAdrian Chadd 
30b2bdc62aSAdrian Chadd #include <sys/cdefs.h>
31b2bdc62aSAdrian Chadd 
32b2bdc62aSAdrian Chadd __FBSDID("$FreeBSD$");
33b2bdc62aSAdrian Chadd 
34b2bdc62aSAdrian Chadd #include "opt_inet6.h"
35b2bdc62aSAdrian Chadd #include "opt_pcbgroup.h"
36b2bdc62aSAdrian Chadd 
37b2bdc62aSAdrian Chadd #ifndef PCBGROUP
38b2bdc62aSAdrian Chadd #error "options RSS depends on options PCBGROUP"
39b2bdc62aSAdrian Chadd #endif
40b2bdc62aSAdrian Chadd 
41b2bdc62aSAdrian Chadd #include <sys/param.h>
42b2bdc62aSAdrian Chadd #include <sys/mbuf.h>
43b2bdc62aSAdrian Chadd #include <sys/socket.h>
44b2bdc62aSAdrian Chadd #include <sys/priv.h>
45b2bdc62aSAdrian Chadd #include <sys/kernel.h>
46b2bdc62aSAdrian Chadd #include <sys/smp.h>
47b2bdc62aSAdrian Chadd #include <sys/sysctl.h>
48b2bdc62aSAdrian Chadd #include <sys/sbuf.h>
49b2bdc62aSAdrian Chadd 
50b2bdc62aSAdrian Chadd #include <net/if.h>
51b2bdc62aSAdrian Chadd #include <net/if_var.h>
52b2bdc62aSAdrian Chadd #include <net/netisr.h>
53b2bdc62aSAdrian Chadd #include <net/rss_config.h>
54b2bdc62aSAdrian Chadd #include <net/toeplitz.h>
55b2bdc62aSAdrian Chadd 
56b2bdc62aSAdrian Chadd #if 0
57b2bdc62aSAdrian Chadd #include <netinet/in.h>
58b2bdc62aSAdrian Chadd #include <netinet/in_pcb.h>
59b2bdc62aSAdrian Chadd #include <netinet/in_rss.h>
60b2bdc62aSAdrian Chadd #include <netinet/in_var.h>
61b2bdc62aSAdrian Chadd 
62b2bdc62aSAdrian Chadd /* for software rss hash support */
63b2bdc62aSAdrian Chadd #include <netinet/ip.h>
64b2bdc62aSAdrian Chadd #include <netinet/tcp.h>
65b2bdc62aSAdrian Chadd #include <netinet/udp.h>
66b2bdc62aSAdrian Chadd #endif
67b2bdc62aSAdrian Chadd 
68b2bdc62aSAdrian Chadd /*-
69b2bdc62aSAdrian Chadd  * Operating system parts of receiver-side scaling (RSS), which allows
70b2bdc62aSAdrian Chadd  * network cards to direct flows to particular receive queues based on hashes
71b2bdc62aSAdrian Chadd  * of header tuples.  This implementation aligns RSS buckets with connection
72b2bdc62aSAdrian Chadd  * groups at the TCP/IP layer, so each bucket is associated with exactly one
73b2bdc62aSAdrian Chadd  * group.  As a result, the group lookup structures (and lock) should have an
74b2bdc62aSAdrian Chadd  * effective affinity with exactly one CPU.
75b2bdc62aSAdrian Chadd  *
76b2bdc62aSAdrian Chadd  * Network device drivers needing to configure RSS will query this framework
77b2bdc62aSAdrian Chadd  * for parameters, such as the current RSS key, hashing policies, number of
78b2bdc62aSAdrian Chadd  * bits, and indirection table mapping hashes to buckets and CPUs.  They may
79b2bdc62aSAdrian Chadd  * provide their own supplementary information, such as queue<->CPU bindings.
80b2bdc62aSAdrian Chadd  * It is the responsibility of the network device driver to inject packets
81b2bdc62aSAdrian Chadd  * into the stack on as close to the right CPU as possible, if playing by RSS
82b2bdc62aSAdrian Chadd  * rules.
83b2bdc62aSAdrian Chadd  *
84b2bdc62aSAdrian Chadd  * TODO:
85b2bdc62aSAdrian Chadd  *
86b2bdc62aSAdrian Chadd  * - Synchronization for rss_key and other future-configurable parameters.
87b2bdc62aSAdrian Chadd  * - Event handler drivers can register to pick up RSS configuration changes.
88b2bdc62aSAdrian Chadd  * - Should we allow rss_basecpu to be configured?
89b2bdc62aSAdrian Chadd  * - Randomize key on boot.
90b2bdc62aSAdrian Chadd  * - IPv6 support.
91b2bdc62aSAdrian Chadd  * - Statistics on how often there's a misalignment between hardware
92b2bdc62aSAdrian Chadd  *   placement and pcbgroup expectations.
93b2bdc62aSAdrian Chadd  */
94b2bdc62aSAdrian Chadd 
95b2bdc62aSAdrian Chadd SYSCTL_DECL(_net_inet);
96b2bdc62aSAdrian Chadd SYSCTL_NODE(_net_inet, OID_AUTO, rss, CTLFLAG_RW, 0, "Receive-side steering");
97b2bdc62aSAdrian Chadd 
98b2bdc62aSAdrian Chadd /*
99b2bdc62aSAdrian Chadd  * Toeplitz is the only required hash function in the RSS spec, so use it by
100b2bdc62aSAdrian Chadd  * default.
101b2bdc62aSAdrian Chadd  */
102b2bdc62aSAdrian Chadd static u_int	rss_hashalgo = RSS_HASH_TOEPLITZ;
103b2bdc62aSAdrian Chadd SYSCTL_INT(_net_inet_rss, OID_AUTO, hashalgo, CTLFLAG_RDTUN, &rss_hashalgo, 0,
104b2bdc62aSAdrian Chadd     "RSS hash algorithm");
105b2bdc62aSAdrian Chadd 
106b2bdc62aSAdrian Chadd /*
107b2bdc62aSAdrian Chadd  * Size of the indirection table; at most 128 entries per the RSS spec.  We
108b2bdc62aSAdrian Chadd  * size it to at least 2 times the number of CPUs by default to allow useful
109b2bdc62aSAdrian Chadd  * rebalancing.  If not set explicitly with a loader tunable, we tune based
110b2bdc62aSAdrian Chadd  * on the number of CPUs present.
111b2bdc62aSAdrian Chadd  *
112b2bdc62aSAdrian Chadd  * XXXRW: buckets might be better to use for the tunable than bits.
113b2bdc62aSAdrian Chadd  */
114b2bdc62aSAdrian Chadd static u_int	rss_bits;
115b2bdc62aSAdrian Chadd SYSCTL_INT(_net_inet_rss, OID_AUTO, bits, CTLFLAG_RDTUN, &rss_bits, 0,
116b2bdc62aSAdrian Chadd     "RSS bits");
117b2bdc62aSAdrian Chadd 
118b2bdc62aSAdrian Chadd static u_int	rss_mask;
119b2bdc62aSAdrian Chadd SYSCTL_INT(_net_inet_rss, OID_AUTO, mask, CTLFLAG_RD, &rss_mask, 0,
120b2bdc62aSAdrian Chadd     "RSS mask");
121b2bdc62aSAdrian Chadd 
122b2bdc62aSAdrian Chadd static const u_int	rss_maxbits = RSS_MAXBITS;
123b2bdc62aSAdrian Chadd SYSCTL_INT(_net_inet_rss, OID_AUTO, maxbits, CTLFLAG_RD,
124b2bdc62aSAdrian Chadd     __DECONST(int *, &rss_maxbits), 0, "RSS maximum bits");
125b2bdc62aSAdrian Chadd 
126b2bdc62aSAdrian Chadd /*
127b2bdc62aSAdrian Chadd  * RSS's own count of the number of CPUs it could be using for processing.
128b2bdc62aSAdrian Chadd  * Bounded to 64 by RSS constants.
129b2bdc62aSAdrian Chadd  */
130b2bdc62aSAdrian Chadd static u_int	rss_ncpus;
131b2bdc62aSAdrian Chadd SYSCTL_INT(_net_inet_rss, OID_AUTO, ncpus, CTLFLAG_RD, &rss_ncpus, 0,
132b2bdc62aSAdrian Chadd     "Number of CPUs available to RSS");
133b2bdc62aSAdrian Chadd 
134b2bdc62aSAdrian Chadd #define	RSS_MAXCPUS	(1 << (RSS_MAXBITS - 1))
135b2bdc62aSAdrian Chadd static const u_int	rss_maxcpus = RSS_MAXCPUS;
136b2bdc62aSAdrian Chadd SYSCTL_INT(_net_inet_rss, OID_AUTO, maxcpus, CTLFLAG_RD,
137b2bdc62aSAdrian Chadd     __DECONST(int *, &rss_maxcpus), 0, "RSS maximum CPUs that can be used");
138b2bdc62aSAdrian Chadd 
139b2bdc62aSAdrian Chadd /*
140b2bdc62aSAdrian Chadd  * Variable exists just for reporting rss_bits in a user-friendly way.
141b2bdc62aSAdrian Chadd  */
142b2bdc62aSAdrian Chadd static u_int	rss_buckets;
143b2bdc62aSAdrian Chadd SYSCTL_INT(_net_inet_rss, OID_AUTO, buckets, CTLFLAG_RD, &rss_buckets, 0,
144b2bdc62aSAdrian Chadd     "RSS buckets");
145b2bdc62aSAdrian Chadd 
146b2bdc62aSAdrian Chadd /*
147b2bdc62aSAdrian Chadd  * Base CPU number; devices will add this to all CPU numbers returned by the
148b2bdc62aSAdrian Chadd  * RSS indirection table.  Currently unmodifable in FreeBSD.
149b2bdc62aSAdrian Chadd  */
150b2bdc62aSAdrian Chadd static const u_int	rss_basecpu;
151b2bdc62aSAdrian Chadd SYSCTL_INT(_net_inet_rss, OID_AUTO, basecpu, CTLFLAG_RD,
152b2bdc62aSAdrian Chadd     __DECONST(int *, &rss_basecpu), 0, "RSS base CPU");
153b2bdc62aSAdrian Chadd 
154b2bdc62aSAdrian Chadd /*
155*e5562eb9SAdrian Chadd  * Print verbose debugging messages.
156*e5562eb9SAdrian Chadd  * 0 - disable
157*e5562eb9SAdrian Chadd  * non-zero - enable
158*e5562eb9SAdrian Chadd  */
159*e5562eb9SAdrian Chadd int	rss_debug = 0;
160*e5562eb9SAdrian Chadd SYSCTL_INT(_net_inet_rss, OID_AUTO, debug, CTLFLAG_RWTUN, &rss_debug, 0,
161*e5562eb9SAdrian Chadd     "RSS debug level");
162*e5562eb9SAdrian Chadd 
163*e5562eb9SAdrian Chadd /*
164b2bdc62aSAdrian Chadd  * RSS secret key, intended to prevent attacks on load-balancing.  Its
165b2bdc62aSAdrian Chadd  * effectiveness may be limited by algorithm choice and available entropy
166b2bdc62aSAdrian Chadd  * during the boot.
167b2bdc62aSAdrian Chadd  *
168b2bdc62aSAdrian Chadd  * XXXRW: And that we don't randomize it yet!
169b2bdc62aSAdrian Chadd  *
170b2bdc62aSAdrian Chadd  * This is the default Microsoft RSS specification key which is also
171b2bdc62aSAdrian Chadd  * the Chelsio T5 firmware default key.
172b2bdc62aSAdrian Chadd  */
173b2bdc62aSAdrian Chadd static uint8_t rss_key[RSS_KEYSIZE] = {
174b2bdc62aSAdrian Chadd 	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
175b2bdc62aSAdrian Chadd 	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
176b2bdc62aSAdrian Chadd 	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
177b2bdc62aSAdrian Chadd 	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
178b2bdc62aSAdrian Chadd 	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
179b2bdc62aSAdrian Chadd };
180b2bdc62aSAdrian Chadd 
181b2bdc62aSAdrian Chadd /*
182b2bdc62aSAdrian Chadd  * RSS hash->CPU table, which maps hashed packet headers to particular CPUs.
183b2bdc62aSAdrian Chadd  * Drivers may supplement this table with a seperate CPU<->queue table when
184b2bdc62aSAdrian Chadd  * programming devices.
185b2bdc62aSAdrian Chadd  */
186b2bdc62aSAdrian Chadd struct rss_table_entry {
187b2bdc62aSAdrian Chadd 	uint8_t		rte_cpu;	/* CPU affinity of bucket. */
188b2bdc62aSAdrian Chadd };
189b2bdc62aSAdrian Chadd static struct rss_table_entry	rss_table[RSS_TABLE_MAXLEN];
190b2bdc62aSAdrian Chadd 
191b2bdc62aSAdrian Chadd static void
192b2bdc62aSAdrian Chadd rss_init(__unused void *arg)
193b2bdc62aSAdrian Chadd {
194b2bdc62aSAdrian Chadd 	u_int i;
195b2bdc62aSAdrian Chadd 	u_int cpuid;
196b2bdc62aSAdrian Chadd 
197b2bdc62aSAdrian Chadd 	/*
198b2bdc62aSAdrian Chadd 	 * Validate tunables, coerce to sensible values.
199b2bdc62aSAdrian Chadd 	 */
200b2bdc62aSAdrian Chadd 	switch (rss_hashalgo) {
201b2bdc62aSAdrian Chadd 	case RSS_HASH_TOEPLITZ:
202b2bdc62aSAdrian Chadd 	case RSS_HASH_NAIVE:
203b2bdc62aSAdrian Chadd 		break;
204b2bdc62aSAdrian Chadd 
205b2bdc62aSAdrian Chadd 	default:
206*e5562eb9SAdrian Chadd 		RSS_DEBUG("invalid RSS hashalgo %u, coercing to %u\n",
207*e5562eb9SAdrian Chadd 		    rss_hashalgo, RSS_HASH_TOEPLITZ);
208b2bdc62aSAdrian Chadd 		rss_hashalgo = RSS_HASH_TOEPLITZ;
209b2bdc62aSAdrian Chadd 	}
210b2bdc62aSAdrian Chadd 
211b2bdc62aSAdrian Chadd 	/*
212b2bdc62aSAdrian Chadd 	 * Count available CPUs.
213b2bdc62aSAdrian Chadd 	 *
214b2bdc62aSAdrian Chadd 	 * XXXRW: Note incorrect assumptions regarding contiguity of this set
215b2bdc62aSAdrian Chadd 	 * elsewhere.
216b2bdc62aSAdrian Chadd 	 */
217b2bdc62aSAdrian Chadd 	rss_ncpus = 0;
218b2bdc62aSAdrian Chadd 	for (i = 0; i <= mp_maxid; i++) {
219b2bdc62aSAdrian Chadd 		if (CPU_ABSENT(i))
220b2bdc62aSAdrian Chadd 			continue;
221b2bdc62aSAdrian Chadd 		rss_ncpus++;
222b2bdc62aSAdrian Chadd 	}
223b2bdc62aSAdrian Chadd 	if (rss_ncpus > RSS_MAXCPUS)
224b2bdc62aSAdrian Chadd 		rss_ncpus = RSS_MAXCPUS;
225b2bdc62aSAdrian Chadd 
226b2bdc62aSAdrian Chadd 	/*
227b2bdc62aSAdrian Chadd 	 * Tune RSS table entries to be no less than 2x the number of CPUs
228b2bdc62aSAdrian Chadd 	 * -- unless we're running uniprocessor, in which case there's not
229b2bdc62aSAdrian Chadd 	 * much point in having buckets to rearrange for load-balancing!
230b2bdc62aSAdrian Chadd 	 */
231b2bdc62aSAdrian Chadd 	if (rss_ncpus > 1) {
232b2bdc62aSAdrian Chadd 		if (rss_bits == 0)
233b2bdc62aSAdrian Chadd 			rss_bits = fls(rss_ncpus - 1) + 1;
234b2bdc62aSAdrian Chadd 
235b2bdc62aSAdrian Chadd 		/*
236b2bdc62aSAdrian Chadd 		 * Microsoft limits RSS table entries to 128, so apply that
237b2bdc62aSAdrian Chadd 		 * limit to both auto-detected CPU counts and user-configured
238b2bdc62aSAdrian Chadd 		 * ones.
239b2bdc62aSAdrian Chadd 		 */
240b2bdc62aSAdrian Chadd 		if (rss_bits == 0 || rss_bits > RSS_MAXBITS) {
241*e5562eb9SAdrian Chadd 			RSS_DEBUG("RSS bits %u not valid, coercing to %u\n",
242*e5562eb9SAdrian Chadd 			    rss_bits, RSS_MAXBITS);
243b2bdc62aSAdrian Chadd 			rss_bits = RSS_MAXBITS;
244b2bdc62aSAdrian Chadd 		}
245b2bdc62aSAdrian Chadd 
246b2bdc62aSAdrian Chadd 		/*
247b2bdc62aSAdrian Chadd 		 * Figure out how many buckets to use; warn if less than the
248b2bdc62aSAdrian Chadd 		 * number of configured CPUs, although this is not a fatal
249b2bdc62aSAdrian Chadd 		 * problem.
250b2bdc62aSAdrian Chadd 		 */
251b2bdc62aSAdrian Chadd 		rss_buckets = (1 << rss_bits);
252b2bdc62aSAdrian Chadd 		if (rss_buckets < rss_ncpus)
253*e5562eb9SAdrian Chadd 			RSS_DEBUG("WARNING: rss_buckets (%u) less than "
254*e5562eb9SAdrian Chadd 			    "rss_ncpus (%u)\n", rss_buckets, rss_ncpus);
255b2bdc62aSAdrian Chadd 		rss_mask = rss_buckets - 1;
256b2bdc62aSAdrian Chadd 	} else {
257b2bdc62aSAdrian Chadd 		rss_bits = 0;
258b2bdc62aSAdrian Chadd 		rss_buckets = 1;
259b2bdc62aSAdrian Chadd 		rss_mask = 0;
260b2bdc62aSAdrian Chadd 	}
261b2bdc62aSAdrian Chadd 
262b2bdc62aSAdrian Chadd 	/*
263b2bdc62aSAdrian Chadd 	 * Set up initial CPU assignments: round-robin by default.
264b2bdc62aSAdrian Chadd 	 */
265b2bdc62aSAdrian Chadd 	cpuid = CPU_FIRST();
266b2bdc62aSAdrian Chadd 	for (i = 0; i < rss_buckets; i++) {
267b2bdc62aSAdrian Chadd 		rss_table[i].rte_cpu = cpuid;
268b2bdc62aSAdrian Chadd 		cpuid = CPU_NEXT(cpuid);
269b2bdc62aSAdrian Chadd 	}
270b2bdc62aSAdrian Chadd 
271b2bdc62aSAdrian Chadd 	/*
272b2bdc62aSAdrian Chadd 	 * Randomize rrs_key.
273b2bdc62aSAdrian Chadd 	 *
274b2bdc62aSAdrian Chadd 	 * XXXRW: Not yet.  If nothing else, will require an rss_isbadkey()
275b2bdc62aSAdrian Chadd 	 * loop to check for "bad" RSS keys.
276b2bdc62aSAdrian Chadd 	 */
277b2bdc62aSAdrian Chadd }
278b2bdc62aSAdrian Chadd SYSINIT(rss_init, SI_SUB_SOFTINTR, SI_ORDER_SECOND, rss_init, NULL);
279b2bdc62aSAdrian Chadd 
/*
 * Trivial "hash": the 32-bit wrapping sum of every key byte followed by
 * every data byte.
 */
static uint32_t
rss_naive_hash(u_int keylen, const uint8_t *key, u_int datalen,
    const uint8_t *data)
{
	uint32_t sum = 0;

	while (keylen-- != 0)
		sum += *key++;
	while (datalen-- != 0)
		sum += *data++;
	return (sum);
}
294b2bdc62aSAdrian Chadd 
295b2bdc62aSAdrian Chadd uint32_t
296b2bdc62aSAdrian Chadd rss_hash(u_int datalen, const uint8_t *data)
297b2bdc62aSAdrian Chadd {
298b2bdc62aSAdrian Chadd 
299b2bdc62aSAdrian Chadd 	switch (rss_hashalgo) {
300b2bdc62aSAdrian Chadd 	case RSS_HASH_TOEPLITZ:
301b2bdc62aSAdrian Chadd 		return (toeplitz_hash(sizeof(rss_key), rss_key, datalen,
302b2bdc62aSAdrian Chadd 		    data));
303b2bdc62aSAdrian Chadd 
304b2bdc62aSAdrian Chadd 	case RSS_HASH_NAIVE:
305b2bdc62aSAdrian Chadd 		return (rss_naive_hash(sizeof(rss_key), rss_key, datalen,
306b2bdc62aSAdrian Chadd 		    data));
307b2bdc62aSAdrian Chadd 
308b2bdc62aSAdrian Chadd 	default:
309b2bdc62aSAdrian Chadd 		panic("%s: unsupported/unknown hashalgo %d", __func__,
310b2bdc62aSAdrian Chadd 		    rss_hashalgo);
311b2bdc62aSAdrian Chadd 	}
312b2bdc62aSAdrian Chadd }
313b2bdc62aSAdrian Chadd 
314b2bdc62aSAdrian Chadd /*
315b2bdc62aSAdrian Chadd  * Query the number of RSS bits in use.
316b2bdc62aSAdrian Chadd  */
317b2bdc62aSAdrian Chadd u_int
318b2bdc62aSAdrian Chadd rss_getbits(void)
319b2bdc62aSAdrian Chadd {
320b2bdc62aSAdrian Chadd 
321b2bdc62aSAdrian Chadd 	return (rss_bits);
322b2bdc62aSAdrian Chadd }
323b2bdc62aSAdrian Chadd 
324b2bdc62aSAdrian Chadd /*
325b2bdc62aSAdrian Chadd  * Query the RSS bucket associated with an RSS hash.
326b2bdc62aSAdrian Chadd  */
327b2bdc62aSAdrian Chadd u_int
328b2bdc62aSAdrian Chadd rss_getbucket(u_int hash)
329b2bdc62aSAdrian Chadd {
330b2bdc62aSAdrian Chadd 
331b2bdc62aSAdrian Chadd 	return (hash & rss_mask);
332b2bdc62aSAdrian Chadd }
333b2bdc62aSAdrian Chadd 
334b2bdc62aSAdrian Chadd /*
335b2bdc62aSAdrian Chadd  * Query the RSS layer bucket associated with the given
336b2bdc62aSAdrian Chadd  * entry in the RSS hash space.
337b2bdc62aSAdrian Chadd  *
338b2bdc62aSAdrian Chadd  * The RSS indirection table is 0 .. rss_buckets-1,
339b2bdc62aSAdrian Chadd  * covering the low 'rss_bits' of the total 128 slot
340b2bdc62aSAdrian Chadd  * RSS indirection table.  So just mask off rss_bits and
341b2bdc62aSAdrian Chadd  * return that.
342b2bdc62aSAdrian Chadd  *
343b2bdc62aSAdrian Chadd  * NIC drivers can then iterate over the 128 slot RSS
344b2bdc62aSAdrian Chadd  * indirection table and fetch which RSS bucket to
345b2bdc62aSAdrian Chadd  * map it to.  This will typically be a CPU queue
346b2bdc62aSAdrian Chadd  */
347b2bdc62aSAdrian Chadd u_int
348b2bdc62aSAdrian Chadd rss_get_indirection_to_bucket(u_int index)
349b2bdc62aSAdrian Chadd {
350b2bdc62aSAdrian Chadd 
351b2bdc62aSAdrian Chadd 	return (index & rss_mask);
352b2bdc62aSAdrian Chadd }
353b2bdc62aSAdrian Chadd 
354b2bdc62aSAdrian Chadd /*
355b2bdc62aSAdrian Chadd  * Query the RSS CPU associated with an RSS bucket.
356b2bdc62aSAdrian Chadd  */
357b2bdc62aSAdrian Chadd u_int
358b2bdc62aSAdrian Chadd rss_getcpu(u_int bucket)
359b2bdc62aSAdrian Chadd {
360b2bdc62aSAdrian Chadd 
361b2bdc62aSAdrian Chadd 	return (rss_table[bucket].rte_cpu);
362b2bdc62aSAdrian Chadd }
363b2bdc62aSAdrian Chadd 
364b2bdc62aSAdrian Chadd /*
365b2bdc62aSAdrian Chadd  * netisr CPU affinity lookup given just the hash and hashtype.
366b2bdc62aSAdrian Chadd  */
367b2bdc62aSAdrian Chadd u_int
368b2bdc62aSAdrian Chadd rss_hash2cpuid(uint32_t hash_val, uint32_t hash_type)
369b2bdc62aSAdrian Chadd {
370b2bdc62aSAdrian Chadd 
371b2bdc62aSAdrian Chadd 	switch (hash_type) {
372b2bdc62aSAdrian Chadd 	case M_HASHTYPE_RSS_IPV4:
373b2bdc62aSAdrian Chadd 	case M_HASHTYPE_RSS_TCP_IPV4:
374b2bdc62aSAdrian Chadd 	case M_HASHTYPE_RSS_UDP_IPV4:
375b2bdc62aSAdrian Chadd 	case M_HASHTYPE_RSS_IPV6:
376b2bdc62aSAdrian Chadd 	case M_HASHTYPE_RSS_TCP_IPV6:
377b2bdc62aSAdrian Chadd 	case M_HASHTYPE_RSS_UDP_IPV6:
378b2bdc62aSAdrian Chadd 		return (rss_getcpu(rss_getbucket(hash_val)));
379b2bdc62aSAdrian Chadd 	default:
380b2bdc62aSAdrian Chadd 		return (NETISR_CPUID_NONE);
381b2bdc62aSAdrian Chadd 	}
382b2bdc62aSAdrian Chadd }
383b2bdc62aSAdrian Chadd 
384b2bdc62aSAdrian Chadd /*
385b2bdc62aSAdrian Chadd  * Query the RSS bucket associated with the given hash value and
386b2bdc62aSAdrian Chadd  * type.
387b2bdc62aSAdrian Chadd  */
388b2bdc62aSAdrian Chadd int
389b2bdc62aSAdrian Chadd rss_hash2bucket(uint32_t hash_val, uint32_t hash_type, uint32_t *bucket_id)
390b2bdc62aSAdrian Chadd {
391b2bdc62aSAdrian Chadd 
392b2bdc62aSAdrian Chadd 	switch (hash_type) {
393b2bdc62aSAdrian Chadd 	case M_HASHTYPE_RSS_IPV4:
394b2bdc62aSAdrian Chadd 	case M_HASHTYPE_RSS_TCP_IPV4:
395b2bdc62aSAdrian Chadd 	case M_HASHTYPE_RSS_UDP_IPV4:
396b2bdc62aSAdrian Chadd 	case M_HASHTYPE_RSS_IPV6:
397b2bdc62aSAdrian Chadd 	case M_HASHTYPE_RSS_TCP_IPV6:
398b2bdc62aSAdrian Chadd 	case M_HASHTYPE_RSS_UDP_IPV6:
399b2bdc62aSAdrian Chadd 		*bucket_id = rss_getbucket(hash_val);
400b2bdc62aSAdrian Chadd 		return (0);
401b2bdc62aSAdrian Chadd 	default:
402b2bdc62aSAdrian Chadd 		return (-1);
403b2bdc62aSAdrian Chadd 	}
404b2bdc62aSAdrian Chadd }
405b2bdc62aSAdrian Chadd 
406b2bdc62aSAdrian Chadd /*
407b2bdc62aSAdrian Chadd  * netisr CPU affinity lookup routine for use by protocols.
408b2bdc62aSAdrian Chadd  */
409b2bdc62aSAdrian Chadd struct mbuf *
410b2bdc62aSAdrian Chadd rss_m2cpuid(struct mbuf *m, uintptr_t source, u_int *cpuid)
411b2bdc62aSAdrian Chadd {
412b2bdc62aSAdrian Chadd 
413b2bdc62aSAdrian Chadd 	M_ASSERTPKTHDR(m);
414b2bdc62aSAdrian Chadd 	*cpuid = rss_hash2cpuid(m->m_pkthdr.flowid, M_HASHTYPE_GET(m));
415b2bdc62aSAdrian Chadd 	return (m);
416b2bdc62aSAdrian Chadd }
417b2bdc62aSAdrian Chadd 
418b2bdc62aSAdrian Chadd int
419b2bdc62aSAdrian Chadd rss_m2bucket(struct mbuf *m, uint32_t *bucket_id)
420b2bdc62aSAdrian Chadd {
421b2bdc62aSAdrian Chadd 
422b2bdc62aSAdrian Chadd 	M_ASSERTPKTHDR(m);
423b2bdc62aSAdrian Chadd 
424b2bdc62aSAdrian Chadd 	return(rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m),
425b2bdc62aSAdrian Chadd 	    bucket_id));
426b2bdc62aSAdrian Chadd }
427b2bdc62aSAdrian Chadd 
428b2bdc62aSAdrian Chadd /*
429b2bdc62aSAdrian Chadd  * Query the RSS hash algorithm.
430b2bdc62aSAdrian Chadd  */
431b2bdc62aSAdrian Chadd u_int
432b2bdc62aSAdrian Chadd rss_gethashalgo(void)
433b2bdc62aSAdrian Chadd {
434b2bdc62aSAdrian Chadd 
435b2bdc62aSAdrian Chadd 	return (rss_hashalgo);
436b2bdc62aSAdrian Chadd }
437b2bdc62aSAdrian Chadd 
438b2bdc62aSAdrian Chadd /*
439b2bdc62aSAdrian Chadd  * Query the current RSS key; likely to be used by device drivers when
440b2bdc62aSAdrian Chadd  * configuring hardware RSS.  Caller must pass an array of size RSS_KEYSIZE.
441b2bdc62aSAdrian Chadd  *
442b2bdc62aSAdrian Chadd  * XXXRW: Perhaps we should do the accept-a-length-and-truncate thing?
443b2bdc62aSAdrian Chadd  */
444b2bdc62aSAdrian Chadd void
445b2bdc62aSAdrian Chadd rss_getkey(uint8_t *key)
446b2bdc62aSAdrian Chadd {
447b2bdc62aSAdrian Chadd 
448b2bdc62aSAdrian Chadd 	bcopy(rss_key, key, sizeof(rss_key));
449b2bdc62aSAdrian Chadd }
450b2bdc62aSAdrian Chadd 
451b2bdc62aSAdrian Chadd /*
452b2bdc62aSAdrian Chadd  * Query the number of buckets; this may be used by both network device
453b2bdc62aSAdrian Chadd  * drivers, which will need to populate hardware shadows of the software
454b2bdc62aSAdrian Chadd  * indirection table, and the network stack itself (such as when deciding how
455b2bdc62aSAdrian Chadd  * many connection groups to allocate).
456b2bdc62aSAdrian Chadd  */
457b2bdc62aSAdrian Chadd u_int
458b2bdc62aSAdrian Chadd rss_getnumbuckets(void)
459b2bdc62aSAdrian Chadd {
460b2bdc62aSAdrian Chadd 
461b2bdc62aSAdrian Chadd 	return (rss_buckets);
462b2bdc62aSAdrian Chadd }
463b2bdc62aSAdrian Chadd 
464b2bdc62aSAdrian Chadd /*
465b2bdc62aSAdrian Chadd  * Query the number of CPUs in use by RSS; may be useful to device drivers
466b2bdc62aSAdrian Chadd  * trying to figure out how to map a larger number of CPUs into a smaller
467b2bdc62aSAdrian Chadd  * number of receive queues.
468b2bdc62aSAdrian Chadd  */
469b2bdc62aSAdrian Chadd u_int
470b2bdc62aSAdrian Chadd rss_getnumcpus(void)
471b2bdc62aSAdrian Chadd {
472b2bdc62aSAdrian Chadd 
473b2bdc62aSAdrian Chadd 	return (rss_ncpus);
474b2bdc62aSAdrian Chadd }
475b2bdc62aSAdrian Chadd 
476b2bdc62aSAdrian Chadd /*
477b2bdc62aSAdrian Chadd  * Return the supported RSS hash configuration.
478b2bdc62aSAdrian Chadd  *
479b2bdc62aSAdrian Chadd  * NICs should query this to determine what to configure in their redirection
480b2bdc62aSAdrian Chadd  * matching table.
481b2bdc62aSAdrian Chadd  */
482b2bdc62aSAdrian Chadd inline u_int
483b2bdc62aSAdrian Chadd rss_gethashconfig(void)
484b2bdc62aSAdrian Chadd {
485b2bdc62aSAdrian Chadd 
486b2bdc62aSAdrian Chadd 	/* Return 4-tuple for TCP; 2-tuple for others */
487b2bdc62aSAdrian Chadd 	/*
488b2bdc62aSAdrian Chadd 	 * UDP may fragment more often than TCP and thus we'll end up with
489b2bdc62aSAdrian Chadd 	 * NICs returning 2-tuple fragments.
490b2bdc62aSAdrian Chadd 	 * udp_init() and udplite_init() both currently initialise things
491b2bdc62aSAdrian Chadd 	 * as 2-tuple.
492b2bdc62aSAdrian Chadd 	 * So for now disable UDP 4-tuple hashing until all of the other
493b2bdc62aSAdrian Chadd 	 * pieces are in place.
494b2bdc62aSAdrian Chadd 	 */
495b2bdc62aSAdrian Chadd 	return (
496b2bdc62aSAdrian Chadd 	    RSS_HASHTYPE_RSS_IPV4
497b2bdc62aSAdrian Chadd 	|    RSS_HASHTYPE_RSS_TCP_IPV4
498b2bdc62aSAdrian Chadd 	|    RSS_HASHTYPE_RSS_IPV6
499b2bdc62aSAdrian Chadd 	|    RSS_HASHTYPE_RSS_TCP_IPV6
500b2bdc62aSAdrian Chadd 	|    RSS_HASHTYPE_RSS_IPV6_EX
501b2bdc62aSAdrian Chadd 	|    RSS_HASHTYPE_RSS_TCP_IPV6_EX
502b2bdc62aSAdrian Chadd #if 0
503b2bdc62aSAdrian Chadd 	|    RSS_HASHTYPE_RSS_UDP_IPV4
504b2bdc62aSAdrian Chadd 	|    RSS_HASHTYPE_RSS_UDP_IPV4_EX
505b2bdc62aSAdrian Chadd 	|    RSS_HASHTYPE_RSS_UDP_IPV6
506b2bdc62aSAdrian Chadd 	|    RSS_HASHTYPE_RSS_UDP_IPV6_EX
507b2bdc62aSAdrian Chadd #endif
508b2bdc62aSAdrian Chadd 	);
509b2bdc62aSAdrian Chadd }
510b2bdc62aSAdrian Chadd 
511b2bdc62aSAdrian Chadd /*
512b2bdc62aSAdrian Chadd  * XXXRW: Confirm that sysctl -a won't dump this keying material, don't want
513b2bdc62aSAdrian Chadd  * it appearing in debugging output unnecessarily.
514b2bdc62aSAdrian Chadd  */
515b2bdc62aSAdrian Chadd static int
516b2bdc62aSAdrian Chadd sysctl_rss_key(SYSCTL_HANDLER_ARGS)
517b2bdc62aSAdrian Chadd {
518b2bdc62aSAdrian Chadd 	uint8_t temp_rss_key[RSS_KEYSIZE];
519b2bdc62aSAdrian Chadd 	int error;
520b2bdc62aSAdrian Chadd 
521b2bdc62aSAdrian Chadd 	error = priv_check(req->td, PRIV_NETINET_HASHKEY);
522b2bdc62aSAdrian Chadd 	if (error)
523b2bdc62aSAdrian Chadd 		return (error);
524b2bdc62aSAdrian Chadd 
525b2bdc62aSAdrian Chadd 	bcopy(rss_key, temp_rss_key, sizeof(temp_rss_key));
526b2bdc62aSAdrian Chadd 	error = sysctl_handle_opaque(oidp, temp_rss_key,
527b2bdc62aSAdrian Chadd 	    sizeof(temp_rss_key), req);
528b2bdc62aSAdrian Chadd 	if (error)
529b2bdc62aSAdrian Chadd 		return (error);
530b2bdc62aSAdrian Chadd 	if (req->newptr != NULL) {
531b2bdc62aSAdrian Chadd 		/* XXXRW: Not yet. */
532b2bdc62aSAdrian Chadd 		return (EINVAL);
533b2bdc62aSAdrian Chadd 	}
534b2bdc62aSAdrian Chadd 	return (0);
535b2bdc62aSAdrian Chadd }
536b2bdc62aSAdrian Chadd SYSCTL_PROC(_net_inet_rss, OID_AUTO, key,
537b2bdc62aSAdrian Chadd     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_rss_key,
538b2bdc62aSAdrian Chadd     "", "RSS keying material");
539b2bdc62aSAdrian Chadd 
540b2bdc62aSAdrian Chadd static int
541b2bdc62aSAdrian Chadd sysctl_rss_bucket_mapping(SYSCTL_HANDLER_ARGS)
542b2bdc62aSAdrian Chadd {
543b2bdc62aSAdrian Chadd 	struct sbuf *sb;
544b2bdc62aSAdrian Chadd 	int error;
545b2bdc62aSAdrian Chadd 	int i;
546b2bdc62aSAdrian Chadd 
547b2bdc62aSAdrian Chadd 	error = 0;
548b2bdc62aSAdrian Chadd 	error = sysctl_wire_old_buffer(req, 0);
549b2bdc62aSAdrian Chadd 	if (error != 0)
550b2bdc62aSAdrian Chadd 		return (error);
551b2bdc62aSAdrian Chadd 	sb = sbuf_new_for_sysctl(NULL, NULL, 512, req);
552b2bdc62aSAdrian Chadd 	if (sb == NULL)
553b2bdc62aSAdrian Chadd 		return (ENOMEM);
554b2bdc62aSAdrian Chadd 	for (i = 0; i < rss_buckets; i++) {
555b2bdc62aSAdrian Chadd 		sbuf_printf(sb, "%s%d:%d", i == 0 ? "" : " ",
556b2bdc62aSAdrian Chadd 		    i,
557b2bdc62aSAdrian Chadd 		    rss_getcpu(i));
558b2bdc62aSAdrian Chadd 	}
559b2bdc62aSAdrian Chadd 	error = sbuf_finish(sb);
560b2bdc62aSAdrian Chadd 	sbuf_delete(sb);
561b2bdc62aSAdrian Chadd 
562b2bdc62aSAdrian Chadd 	return (error);
563b2bdc62aSAdrian Chadd }
564b2bdc62aSAdrian Chadd SYSCTL_PROC(_net_inet_rss, OID_AUTO, bucket_mapping,
565b2bdc62aSAdrian Chadd     CTLTYPE_STRING | CTLFLAG_RD, NULL, 0,
566b2bdc62aSAdrian Chadd     sysctl_rss_bucket_mapping, "", "RSS bucket -> CPU mapping");
567