xref: /freebsd/sys/net/rss_config.c (revision 7029da5c36f2d3cf6bb6c81bf551229f416399e8)
1b2bdc62aSAdrian Chadd /*-
2b2bdc62aSAdrian Chadd  * Copyright (c) 2010-2011 Juniper Networks, Inc.
3b2bdc62aSAdrian Chadd  * All rights reserved.
4b2bdc62aSAdrian Chadd  *
5b2bdc62aSAdrian Chadd  * This software was developed by Robert N. M. Watson under contract
6b2bdc62aSAdrian Chadd  * to Juniper Networks, Inc.
7b2bdc62aSAdrian Chadd  *
8b2bdc62aSAdrian Chadd  * Redistribution and use in source and binary forms, with or without
9b2bdc62aSAdrian Chadd  * modification, are permitted provided that the following conditions
10b2bdc62aSAdrian Chadd  * are met:
11b2bdc62aSAdrian Chadd  * 1. Redistributions of source code must retain the above copyright
12b2bdc62aSAdrian Chadd  *    notice, this list of conditions and the following disclaimer.
13b2bdc62aSAdrian Chadd  * 2. Redistributions in binary form must reproduce the above copyright
14b2bdc62aSAdrian Chadd  *    notice, this list of conditions and the following disclaimer in the
15b2bdc62aSAdrian Chadd  *    documentation and/or other materials provided with the distribution.
16b2bdc62aSAdrian Chadd  *
17b2bdc62aSAdrian Chadd  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18b2bdc62aSAdrian Chadd  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19b2bdc62aSAdrian Chadd  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20b2bdc62aSAdrian Chadd  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21b2bdc62aSAdrian Chadd  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22b2bdc62aSAdrian Chadd  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23b2bdc62aSAdrian Chadd  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24b2bdc62aSAdrian Chadd  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25b2bdc62aSAdrian Chadd  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26b2bdc62aSAdrian Chadd  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27b2bdc62aSAdrian Chadd  * SUCH DAMAGE.
28b2bdc62aSAdrian Chadd  */
29b2bdc62aSAdrian Chadd 
30b2bdc62aSAdrian Chadd #include <sys/cdefs.h>
31b2bdc62aSAdrian Chadd 
32b2bdc62aSAdrian Chadd __FBSDID("$FreeBSD$");
33b2bdc62aSAdrian Chadd 
34b2bdc62aSAdrian Chadd #include "opt_inet6.h"
35b2bdc62aSAdrian Chadd #include "opt_pcbgroup.h"
36b2bdc62aSAdrian Chadd 
37b2bdc62aSAdrian Chadd #ifndef PCBGROUP
38b2bdc62aSAdrian Chadd #error "options RSS depends on options PCBGROUP"
39b2bdc62aSAdrian Chadd #endif
40b2bdc62aSAdrian Chadd 
41b2bdc62aSAdrian Chadd #include <sys/param.h>
42b2bdc62aSAdrian Chadd #include <sys/mbuf.h>
43b2bdc62aSAdrian Chadd #include <sys/socket.h>
44b2bdc62aSAdrian Chadd #include <sys/priv.h>
45b2bdc62aSAdrian Chadd #include <sys/kernel.h>
46b2bdc62aSAdrian Chadd #include <sys/smp.h>
47b2bdc62aSAdrian Chadd #include <sys/sysctl.h>
48b2bdc62aSAdrian Chadd #include <sys/sbuf.h>
49b2bdc62aSAdrian Chadd 
50b2bdc62aSAdrian Chadd #include <net/if.h>
51b2bdc62aSAdrian Chadd #include <net/if_var.h>
52b2bdc62aSAdrian Chadd #include <net/netisr.h>
53b2bdc62aSAdrian Chadd #include <net/rss_config.h>
54b2bdc62aSAdrian Chadd #include <net/toeplitz.h>
55b2bdc62aSAdrian Chadd 
56b2bdc62aSAdrian Chadd /*-
57b2bdc62aSAdrian Chadd  * Operating system parts of receiver-side scaling (RSS), which allows
58b2bdc62aSAdrian Chadd  * network cards to direct flows to particular receive queues based on hashes
59b2bdc62aSAdrian Chadd  * of header tuples.  This implementation aligns RSS buckets with connection
60b2bdc62aSAdrian Chadd  * groups at the TCP/IP layer, so each bucket is associated with exactly one
61b2bdc62aSAdrian Chadd  * group.  As a result, the group lookup structures (and lock) should have an
62b2bdc62aSAdrian Chadd  * effective affinity with exactly one CPU.
63b2bdc62aSAdrian Chadd  *
64b2bdc62aSAdrian Chadd  * Network device drivers needing to configure RSS will query this framework
65b2bdc62aSAdrian Chadd  * for parameters, such as the current RSS key, hashing policies, number of
66b2bdc62aSAdrian Chadd  * bits, and indirection table mapping hashes to buckets and CPUs.  They may
67b2bdc62aSAdrian Chadd  * provide their own supplementary information, such as queue<->CPU bindings.
68b2bdc62aSAdrian Chadd  * It is the responsibility of the network device driver to inject packets
69b2bdc62aSAdrian Chadd  * into the stack on as close to the right CPU as possible, if playing by RSS
70b2bdc62aSAdrian Chadd  * rules.
71b2bdc62aSAdrian Chadd  *
72b2bdc62aSAdrian Chadd  * TODO:
73b2bdc62aSAdrian Chadd  *
74b2bdc62aSAdrian Chadd  * - Synchronization for rss_key and other future-configurable parameters.
75b2bdc62aSAdrian Chadd  * - Event handler drivers can register to pick up RSS configuration changes.
76b2bdc62aSAdrian Chadd  * - Should we allow rss_basecpu to be configured?
77b2bdc62aSAdrian Chadd  * - Randomize key on boot.
78b2bdc62aSAdrian Chadd  * - IPv6 support.
79b2bdc62aSAdrian Chadd  * - Statistics on how often there's a misalignment between hardware
80b2bdc62aSAdrian Chadd  *   placement and pcbgroup expectations.
81b2bdc62aSAdrian Chadd  */
82b2bdc62aSAdrian Chadd 
83b2bdc62aSAdrian Chadd SYSCTL_DECL(_net_inet);
84*7029da5cSPawel Biernacki SYSCTL_NODE(_net_inet, OID_AUTO, rss, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
85*7029da5cSPawel Biernacki     "Receive-side steering");
86b2bdc62aSAdrian Chadd 
87b2bdc62aSAdrian Chadd /*
88b2bdc62aSAdrian Chadd  * Toeplitz is the only required hash function in the RSS spec, so use it by
89b2bdc62aSAdrian Chadd  * default.
90b2bdc62aSAdrian Chadd  */
91b2bdc62aSAdrian Chadd static u_int	rss_hashalgo = RSS_HASH_TOEPLITZ;
92b2bdc62aSAdrian Chadd SYSCTL_INT(_net_inet_rss, OID_AUTO, hashalgo, CTLFLAG_RDTUN, &rss_hashalgo, 0,
93b2bdc62aSAdrian Chadd     "RSS hash algorithm");
94b2bdc62aSAdrian Chadd 
95b2bdc62aSAdrian Chadd /*
96b2bdc62aSAdrian Chadd  * Size of the indirection table; at most 128 entries per the RSS spec.  We
97b2bdc62aSAdrian Chadd  * size it to at least 2 times the number of CPUs by default to allow useful
98b2bdc62aSAdrian Chadd  * rebalancing.  If not set explicitly with a loader tunable, we tune based
99b2bdc62aSAdrian Chadd  * on the number of CPUs present.
100b2bdc62aSAdrian Chadd  *
101b2bdc62aSAdrian Chadd  * XXXRW: buckets might be better to use for the tunable than bits.
102b2bdc62aSAdrian Chadd  */
103b2bdc62aSAdrian Chadd static u_int	rss_bits;
104b2bdc62aSAdrian Chadd SYSCTL_INT(_net_inet_rss, OID_AUTO, bits, CTLFLAG_RDTUN, &rss_bits, 0,
105b2bdc62aSAdrian Chadd     "RSS bits");
106b2bdc62aSAdrian Chadd 
107b2bdc62aSAdrian Chadd static u_int	rss_mask;
108b2bdc62aSAdrian Chadd SYSCTL_INT(_net_inet_rss, OID_AUTO, mask, CTLFLAG_RD, &rss_mask, 0,
109b2bdc62aSAdrian Chadd     "RSS mask");
110b2bdc62aSAdrian Chadd 
111b2bdc62aSAdrian Chadd static const u_int	rss_maxbits = RSS_MAXBITS;
112b2bdc62aSAdrian Chadd SYSCTL_INT(_net_inet_rss, OID_AUTO, maxbits, CTLFLAG_RD,
113b2bdc62aSAdrian Chadd     __DECONST(int *, &rss_maxbits), 0, "RSS maximum bits");
114b2bdc62aSAdrian Chadd 
115b2bdc62aSAdrian Chadd /*
116b2bdc62aSAdrian Chadd  * RSS's own count of the number of CPUs it could be using for processing.
117b2bdc62aSAdrian Chadd  * Bounded to 64 by RSS constants.
118b2bdc62aSAdrian Chadd  */
119b2bdc62aSAdrian Chadd static u_int	rss_ncpus;
120b2bdc62aSAdrian Chadd SYSCTL_INT(_net_inet_rss, OID_AUTO, ncpus, CTLFLAG_RD, &rss_ncpus, 0,
121b2bdc62aSAdrian Chadd     "Number of CPUs available to RSS");
122b2bdc62aSAdrian Chadd 
123b2bdc62aSAdrian Chadd #define	RSS_MAXCPUS	(1 << (RSS_MAXBITS - 1))
124b2bdc62aSAdrian Chadd static const u_int	rss_maxcpus = RSS_MAXCPUS;
125b2bdc62aSAdrian Chadd SYSCTL_INT(_net_inet_rss, OID_AUTO, maxcpus, CTLFLAG_RD,
126b2bdc62aSAdrian Chadd     __DECONST(int *, &rss_maxcpus), 0, "RSS maximum CPUs that can be used");
127b2bdc62aSAdrian Chadd 
128b2bdc62aSAdrian Chadd /*
129b2bdc62aSAdrian Chadd  * Variable exists just for reporting rss_bits in a user-friendly way.
130b2bdc62aSAdrian Chadd  */
131b2bdc62aSAdrian Chadd static u_int	rss_buckets;
132b2bdc62aSAdrian Chadd SYSCTL_INT(_net_inet_rss, OID_AUTO, buckets, CTLFLAG_RD, &rss_buckets, 0,
133b2bdc62aSAdrian Chadd     "RSS buckets");
134b2bdc62aSAdrian Chadd 
135b2bdc62aSAdrian Chadd /*
136b2bdc62aSAdrian Chadd  * Base CPU number; devices will add this to all CPU numbers returned by the
137b2bdc62aSAdrian Chadd  * RSS indirection table.  Currently unmodifable in FreeBSD.
138b2bdc62aSAdrian Chadd  */
139b2bdc62aSAdrian Chadd static const u_int	rss_basecpu;
140b2bdc62aSAdrian Chadd SYSCTL_INT(_net_inet_rss, OID_AUTO, basecpu, CTLFLAG_RD,
141b2bdc62aSAdrian Chadd     __DECONST(int *, &rss_basecpu), 0, "RSS base CPU");
142b2bdc62aSAdrian Chadd 
143b2bdc62aSAdrian Chadd /*
144e5562eb9SAdrian Chadd  * Print verbose debugging messages.
145e5562eb9SAdrian Chadd  * 0 - disable
146e5562eb9SAdrian Chadd  * non-zero - enable
147e5562eb9SAdrian Chadd  */
148e5562eb9SAdrian Chadd int	rss_debug = 0;
149e5562eb9SAdrian Chadd SYSCTL_INT(_net_inet_rss, OID_AUTO, debug, CTLFLAG_RWTUN, &rss_debug, 0,
150e5562eb9SAdrian Chadd     "RSS debug level");
151e5562eb9SAdrian Chadd 
152e5562eb9SAdrian Chadd /*
153b2bdc62aSAdrian Chadd  * RSS secret key, intended to prevent attacks on load-balancing.  Its
154b2bdc62aSAdrian Chadd  * effectiveness may be limited by algorithm choice and available entropy
155b2bdc62aSAdrian Chadd  * during the boot.
156b2bdc62aSAdrian Chadd  *
157b2bdc62aSAdrian Chadd  * XXXRW: And that we don't randomize it yet!
158b2bdc62aSAdrian Chadd  *
159b2bdc62aSAdrian Chadd  * This is the default Microsoft RSS specification key which is also
160b2bdc62aSAdrian Chadd  * the Chelsio T5 firmware default key.
161b2bdc62aSAdrian Chadd  */
162b2bdc62aSAdrian Chadd static uint8_t rss_key[RSS_KEYSIZE] = {
163b2bdc62aSAdrian Chadd 	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
164b2bdc62aSAdrian Chadd 	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
165b2bdc62aSAdrian Chadd 	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
166b2bdc62aSAdrian Chadd 	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
167b2bdc62aSAdrian Chadd 	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
168b2bdc62aSAdrian Chadd };
169b2bdc62aSAdrian Chadd 
170b2bdc62aSAdrian Chadd /*
171b2bdc62aSAdrian Chadd  * RSS hash->CPU table, which maps hashed packet headers to particular CPUs.
172a4641f4eSPedro F. Giffuni  * Drivers may supplement this table with a separate CPU<->queue table when
173b2bdc62aSAdrian Chadd  * programming devices.
174b2bdc62aSAdrian Chadd  */
175b2bdc62aSAdrian Chadd struct rss_table_entry {
176b2bdc62aSAdrian Chadd 	uint8_t		rte_cpu;	/* CPU affinity of bucket. */
177b2bdc62aSAdrian Chadd };
178b2bdc62aSAdrian Chadd static struct rss_table_entry	rss_table[RSS_TABLE_MAXLEN];
179b2bdc62aSAdrian Chadd 
180b2bdc62aSAdrian Chadd static void
181b2bdc62aSAdrian Chadd rss_init(__unused void *arg)
182b2bdc62aSAdrian Chadd {
183b2bdc62aSAdrian Chadd 	u_int i;
184b2bdc62aSAdrian Chadd 	u_int cpuid;
185b2bdc62aSAdrian Chadd 
186b2bdc62aSAdrian Chadd 	/*
187b2bdc62aSAdrian Chadd 	 * Validate tunables, coerce to sensible values.
188b2bdc62aSAdrian Chadd 	 */
189b2bdc62aSAdrian Chadd 	switch (rss_hashalgo) {
190b2bdc62aSAdrian Chadd 	case RSS_HASH_TOEPLITZ:
191b2bdc62aSAdrian Chadd 	case RSS_HASH_NAIVE:
192b2bdc62aSAdrian Chadd 		break;
193b2bdc62aSAdrian Chadd 
194b2bdc62aSAdrian Chadd 	default:
195e5562eb9SAdrian Chadd 		RSS_DEBUG("invalid RSS hashalgo %u, coercing to %u\n",
196e5562eb9SAdrian Chadd 		    rss_hashalgo, RSS_HASH_TOEPLITZ);
197b2bdc62aSAdrian Chadd 		rss_hashalgo = RSS_HASH_TOEPLITZ;
198b2bdc62aSAdrian Chadd 	}
199b2bdc62aSAdrian Chadd 
200b2bdc62aSAdrian Chadd 	/*
201b2bdc62aSAdrian Chadd 	 * Count available CPUs.
202b2bdc62aSAdrian Chadd 	 *
203b2bdc62aSAdrian Chadd 	 * XXXRW: Note incorrect assumptions regarding contiguity of this set
204b2bdc62aSAdrian Chadd 	 * elsewhere.
205b2bdc62aSAdrian Chadd 	 */
206b2bdc62aSAdrian Chadd 	rss_ncpus = 0;
207b2bdc62aSAdrian Chadd 	for (i = 0; i <= mp_maxid; i++) {
208b2bdc62aSAdrian Chadd 		if (CPU_ABSENT(i))
209b2bdc62aSAdrian Chadd 			continue;
210b2bdc62aSAdrian Chadd 		rss_ncpus++;
211b2bdc62aSAdrian Chadd 	}
212b2bdc62aSAdrian Chadd 	if (rss_ncpus > RSS_MAXCPUS)
213b2bdc62aSAdrian Chadd 		rss_ncpus = RSS_MAXCPUS;
214b2bdc62aSAdrian Chadd 
215b2bdc62aSAdrian Chadd 	/*
216b2bdc62aSAdrian Chadd 	 * Tune RSS table entries to be no less than 2x the number of CPUs
217b2bdc62aSAdrian Chadd 	 * -- unless we're running uniprocessor, in which case there's not
218b2bdc62aSAdrian Chadd 	 * much point in having buckets to rearrange for load-balancing!
219b2bdc62aSAdrian Chadd 	 */
220b2bdc62aSAdrian Chadd 	if (rss_ncpus > 1) {
221b2bdc62aSAdrian Chadd 		if (rss_bits == 0)
222b2bdc62aSAdrian Chadd 			rss_bits = fls(rss_ncpus - 1) + 1;
223b2bdc62aSAdrian Chadd 
224b2bdc62aSAdrian Chadd 		/*
225b2bdc62aSAdrian Chadd 		 * Microsoft limits RSS table entries to 128, so apply that
226b2bdc62aSAdrian Chadd 		 * limit to both auto-detected CPU counts and user-configured
227b2bdc62aSAdrian Chadd 		 * ones.
228b2bdc62aSAdrian Chadd 		 */
229b2bdc62aSAdrian Chadd 		if (rss_bits == 0 || rss_bits > RSS_MAXBITS) {
230e5562eb9SAdrian Chadd 			RSS_DEBUG("RSS bits %u not valid, coercing to %u\n",
231e5562eb9SAdrian Chadd 			    rss_bits, RSS_MAXBITS);
232b2bdc62aSAdrian Chadd 			rss_bits = RSS_MAXBITS;
233b2bdc62aSAdrian Chadd 		}
234b2bdc62aSAdrian Chadd 
235b2bdc62aSAdrian Chadd 		/*
236b2bdc62aSAdrian Chadd 		 * Figure out how many buckets to use; warn if less than the
237b2bdc62aSAdrian Chadd 		 * number of configured CPUs, although this is not a fatal
238b2bdc62aSAdrian Chadd 		 * problem.
239b2bdc62aSAdrian Chadd 		 */
240b2bdc62aSAdrian Chadd 		rss_buckets = (1 << rss_bits);
241b2bdc62aSAdrian Chadd 		if (rss_buckets < rss_ncpus)
242e5562eb9SAdrian Chadd 			RSS_DEBUG("WARNING: rss_buckets (%u) less than "
243e5562eb9SAdrian Chadd 			    "rss_ncpus (%u)\n", rss_buckets, rss_ncpus);
244b2bdc62aSAdrian Chadd 		rss_mask = rss_buckets - 1;
245b2bdc62aSAdrian Chadd 	} else {
246b2bdc62aSAdrian Chadd 		rss_bits = 0;
247b2bdc62aSAdrian Chadd 		rss_buckets = 1;
248b2bdc62aSAdrian Chadd 		rss_mask = 0;
249b2bdc62aSAdrian Chadd 	}
250b2bdc62aSAdrian Chadd 
251b2bdc62aSAdrian Chadd 	/*
252b2bdc62aSAdrian Chadd 	 * Set up initial CPU assignments: round-robin by default.
253b2bdc62aSAdrian Chadd 	 */
254b2bdc62aSAdrian Chadd 	cpuid = CPU_FIRST();
255b2bdc62aSAdrian Chadd 	for (i = 0; i < rss_buckets; i++) {
256b2bdc62aSAdrian Chadd 		rss_table[i].rte_cpu = cpuid;
257b2bdc62aSAdrian Chadd 		cpuid = CPU_NEXT(cpuid);
258b2bdc62aSAdrian Chadd 	}
259b2bdc62aSAdrian Chadd 
260b2bdc62aSAdrian Chadd 	/*
261b2bdc62aSAdrian Chadd 	 * Randomize rrs_key.
262b2bdc62aSAdrian Chadd 	 *
263b2bdc62aSAdrian Chadd 	 * XXXRW: Not yet.  If nothing else, will require an rss_isbadkey()
264b2bdc62aSAdrian Chadd 	 * loop to check for "bad" RSS keys.
265b2bdc62aSAdrian Chadd 	 */
266b2bdc62aSAdrian Chadd }
267b2bdc62aSAdrian Chadd SYSINIT(rss_init, SI_SUB_SOFTINTR, SI_ORDER_SECOND, rss_init, NULL);
268b2bdc62aSAdrian Chadd 
/*
 * Trivial "hash": the 32-bit sum of every key byte followed by every data
 * byte.  'keylen' and 'datalen' are byte counts; a zero-length buffer is not
 * dereferenced.
 */
static uint32_t
rss_naive_hash(u_int keylen, const uint8_t *key, u_int datalen,
    const uint8_t *data)
{
	uint32_t sum = 0;

	while (keylen-- > 0)
		sum += *key++;
	while (datalen-- > 0)
		sum += *data++;
	return (sum);
}
283b2bdc62aSAdrian Chadd 
284b2bdc62aSAdrian Chadd uint32_t
285b2bdc62aSAdrian Chadd rss_hash(u_int datalen, const uint8_t *data)
286b2bdc62aSAdrian Chadd {
287b2bdc62aSAdrian Chadd 
288b2bdc62aSAdrian Chadd 	switch (rss_hashalgo) {
289b2bdc62aSAdrian Chadd 	case RSS_HASH_TOEPLITZ:
290b2bdc62aSAdrian Chadd 		return (toeplitz_hash(sizeof(rss_key), rss_key, datalen,
291b2bdc62aSAdrian Chadd 		    data));
292b2bdc62aSAdrian Chadd 
293b2bdc62aSAdrian Chadd 	case RSS_HASH_NAIVE:
294b2bdc62aSAdrian Chadd 		return (rss_naive_hash(sizeof(rss_key), rss_key, datalen,
295b2bdc62aSAdrian Chadd 		    data));
296b2bdc62aSAdrian Chadd 
297b2bdc62aSAdrian Chadd 	default:
298b2bdc62aSAdrian Chadd 		panic("%s: unsupported/unknown hashalgo %d", __func__,
299b2bdc62aSAdrian Chadd 		    rss_hashalgo);
300b2bdc62aSAdrian Chadd 	}
301b2bdc62aSAdrian Chadd }
302b2bdc62aSAdrian Chadd 
303b2bdc62aSAdrian Chadd /*
304b2bdc62aSAdrian Chadd  * Query the number of RSS bits in use.
305b2bdc62aSAdrian Chadd  */
306b2bdc62aSAdrian Chadd u_int
307b2bdc62aSAdrian Chadd rss_getbits(void)
308b2bdc62aSAdrian Chadd {
309b2bdc62aSAdrian Chadd 
310b2bdc62aSAdrian Chadd 	return (rss_bits);
311b2bdc62aSAdrian Chadd }
312b2bdc62aSAdrian Chadd 
313b2bdc62aSAdrian Chadd /*
314b2bdc62aSAdrian Chadd  * Query the RSS bucket associated with an RSS hash.
315b2bdc62aSAdrian Chadd  */
316b2bdc62aSAdrian Chadd u_int
317b2bdc62aSAdrian Chadd rss_getbucket(u_int hash)
318b2bdc62aSAdrian Chadd {
319b2bdc62aSAdrian Chadd 
320b2bdc62aSAdrian Chadd 	return (hash & rss_mask);
321b2bdc62aSAdrian Chadd }
322b2bdc62aSAdrian Chadd 
323b2bdc62aSAdrian Chadd /*
324b2bdc62aSAdrian Chadd  * Query the RSS layer bucket associated with the given
325b2bdc62aSAdrian Chadd  * entry in the RSS hash space.
326b2bdc62aSAdrian Chadd  *
327b2bdc62aSAdrian Chadd  * The RSS indirection table is 0 .. rss_buckets-1,
328b2bdc62aSAdrian Chadd  * covering the low 'rss_bits' of the total 128 slot
329b2bdc62aSAdrian Chadd  * RSS indirection table.  So just mask off rss_bits and
330b2bdc62aSAdrian Chadd  * return that.
331b2bdc62aSAdrian Chadd  *
332b2bdc62aSAdrian Chadd  * NIC drivers can then iterate over the 128 slot RSS
333b2bdc62aSAdrian Chadd  * indirection table and fetch which RSS bucket to
334b2bdc62aSAdrian Chadd  * map it to.  This will typically be a CPU queue
335b2bdc62aSAdrian Chadd  */
336b2bdc62aSAdrian Chadd u_int
337b2bdc62aSAdrian Chadd rss_get_indirection_to_bucket(u_int index)
338b2bdc62aSAdrian Chadd {
339b2bdc62aSAdrian Chadd 
340b2bdc62aSAdrian Chadd 	return (index & rss_mask);
341b2bdc62aSAdrian Chadd }
342b2bdc62aSAdrian Chadd 
343b2bdc62aSAdrian Chadd /*
344b2bdc62aSAdrian Chadd  * Query the RSS CPU associated with an RSS bucket.
345b2bdc62aSAdrian Chadd  */
346b2bdc62aSAdrian Chadd u_int
347b2bdc62aSAdrian Chadd rss_getcpu(u_int bucket)
348b2bdc62aSAdrian Chadd {
349b2bdc62aSAdrian Chadd 
350b2bdc62aSAdrian Chadd 	return (rss_table[bucket].rte_cpu);
351b2bdc62aSAdrian Chadd }
352b2bdc62aSAdrian Chadd 
353b2bdc62aSAdrian Chadd /*
354b2bdc62aSAdrian Chadd  * netisr CPU affinity lookup given just the hash and hashtype.
355b2bdc62aSAdrian Chadd  */
356b2bdc62aSAdrian Chadd u_int
357b2bdc62aSAdrian Chadd rss_hash2cpuid(uint32_t hash_val, uint32_t hash_type)
358b2bdc62aSAdrian Chadd {
359b2bdc62aSAdrian Chadd 
360b2bdc62aSAdrian Chadd 	switch (hash_type) {
361b2bdc62aSAdrian Chadd 	case M_HASHTYPE_RSS_IPV4:
362b2bdc62aSAdrian Chadd 	case M_HASHTYPE_RSS_TCP_IPV4:
363b2bdc62aSAdrian Chadd 	case M_HASHTYPE_RSS_UDP_IPV4:
364b2bdc62aSAdrian Chadd 	case M_HASHTYPE_RSS_IPV6:
365b2bdc62aSAdrian Chadd 	case M_HASHTYPE_RSS_TCP_IPV6:
366b2bdc62aSAdrian Chadd 	case M_HASHTYPE_RSS_UDP_IPV6:
367b2bdc62aSAdrian Chadd 		return (rss_getcpu(rss_getbucket(hash_val)));
368b2bdc62aSAdrian Chadd 	default:
369b2bdc62aSAdrian Chadd 		return (NETISR_CPUID_NONE);
370b2bdc62aSAdrian Chadd 	}
371b2bdc62aSAdrian Chadd }
372b2bdc62aSAdrian Chadd 
373b2bdc62aSAdrian Chadd /*
374b2bdc62aSAdrian Chadd  * Query the RSS bucket associated with the given hash value and
375b2bdc62aSAdrian Chadd  * type.
376b2bdc62aSAdrian Chadd  */
377b2bdc62aSAdrian Chadd int
378b2bdc62aSAdrian Chadd rss_hash2bucket(uint32_t hash_val, uint32_t hash_type, uint32_t *bucket_id)
379b2bdc62aSAdrian Chadd {
380b2bdc62aSAdrian Chadd 
381b2bdc62aSAdrian Chadd 	switch (hash_type) {
382b2bdc62aSAdrian Chadd 	case M_HASHTYPE_RSS_IPV4:
383b2bdc62aSAdrian Chadd 	case M_HASHTYPE_RSS_TCP_IPV4:
384b2bdc62aSAdrian Chadd 	case M_HASHTYPE_RSS_UDP_IPV4:
385b2bdc62aSAdrian Chadd 	case M_HASHTYPE_RSS_IPV6:
386b2bdc62aSAdrian Chadd 	case M_HASHTYPE_RSS_TCP_IPV6:
387b2bdc62aSAdrian Chadd 	case M_HASHTYPE_RSS_UDP_IPV6:
388b2bdc62aSAdrian Chadd 		*bucket_id = rss_getbucket(hash_val);
389b2bdc62aSAdrian Chadd 		return (0);
390b2bdc62aSAdrian Chadd 	default:
391b2bdc62aSAdrian Chadd 		return (-1);
392b2bdc62aSAdrian Chadd 	}
393b2bdc62aSAdrian Chadd }
394b2bdc62aSAdrian Chadd 
395b2bdc62aSAdrian Chadd /*
396b2bdc62aSAdrian Chadd  * netisr CPU affinity lookup routine for use by protocols.
397b2bdc62aSAdrian Chadd  */
398b2bdc62aSAdrian Chadd struct mbuf *
399b2bdc62aSAdrian Chadd rss_m2cpuid(struct mbuf *m, uintptr_t source, u_int *cpuid)
400b2bdc62aSAdrian Chadd {
401b2bdc62aSAdrian Chadd 
402b2bdc62aSAdrian Chadd 	M_ASSERTPKTHDR(m);
403b2bdc62aSAdrian Chadd 	*cpuid = rss_hash2cpuid(m->m_pkthdr.flowid, M_HASHTYPE_GET(m));
404b2bdc62aSAdrian Chadd 	return (m);
405b2bdc62aSAdrian Chadd }
406b2bdc62aSAdrian Chadd 
407b2bdc62aSAdrian Chadd int
408b2bdc62aSAdrian Chadd rss_m2bucket(struct mbuf *m, uint32_t *bucket_id)
409b2bdc62aSAdrian Chadd {
410b2bdc62aSAdrian Chadd 
411b2bdc62aSAdrian Chadd 	M_ASSERTPKTHDR(m);
412b2bdc62aSAdrian Chadd 
413b2bdc62aSAdrian Chadd 	return(rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m),
414b2bdc62aSAdrian Chadd 	    bucket_id));
415b2bdc62aSAdrian Chadd }
416b2bdc62aSAdrian Chadd 
417b2bdc62aSAdrian Chadd /*
418b2bdc62aSAdrian Chadd  * Query the RSS hash algorithm.
419b2bdc62aSAdrian Chadd  */
420b2bdc62aSAdrian Chadd u_int
421b2bdc62aSAdrian Chadd rss_gethashalgo(void)
422b2bdc62aSAdrian Chadd {
423b2bdc62aSAdrian Chadd 
424b2bdc62aSAdrian Chadd 	return (rss_hashalgo);
425b2bdc62aSAdrian Chadd }
426b2bdc62aSAdrian Chadd 
427b2bdc62aSAdrian Chadd /*
428b2bdc62aSAdrian Chadd  * Query the current RSS key; likely to be used by device drivers when
429b2bdc62aSAdrian Chadd  * configuring hardware RSS.  Caller must pass an array of size RSS_KEYSIZE.
430b2bdc62aSAdrian Chadd  *
431b2bdc62aSAdrian Chadd  * XXXRW: Perhaps we should do the accept-a-length-and-truncate thing?
432b2bdc62aSAdrian Chadd  */
433b2bdc62aSAdrian Chadd void
434b2bdc62aSAdrian Chadd rss_getkey(uint8_t *key)
435b2bdc62aSAdrian Chadd {
436b2bdc62aSAdrian Chadd 
437b2bdc62aSAdrian Chadd 	bcopy(rss_key, key, sizeof(rss_key));
438b2bdc62aSAdrian Chadd }
439b2bdc62aSAdrian Chadd 
440b2bdc62aSAdrian Chadd /*
441b2bdc62aSAdrian Chadd  * Query the number of buckets; this may be used by both network device
442b2bdc62aSAdrian Chadd  * drivers, which will need to populate hardware shadows of the software
443b2bdc62aSAdrian Chadd  * indirection table, and the network stack itself (such as when deciding how
444b2bdc62aSAdrian Chadd  * many connection groups to allocate).
445b2bdc62aSAdrian Chadd  */
446b2bdc62aSAdrian Chadd u_int
447b2bdc62aSAdrian Chadd rss_getnumbuckets(void)
448b2bdc62aSAdrian Chadd {
449b2bdc62aSAdrian Chadd 
450b2bdc62aSAdrian Chadd 	return (rss_buckets);
451b2bdc62aSAdrian Chadd }
452b2bdc62aSAdrian Chadd 
453b2bdc62aSAdrian Chadd /*
454b2bdc62aSAdrian Chadd  * Query the number of CPUs in use by RSS; may be useful to device drivers
455b2bdc62aSAdrian Chadd  * trying to figure out how to map a larger number of CPUs into a smaller
456b2bdc62aSAdrian Chadd  * number of receive queues.
457b2bdc62aSAdrian Chadd  */
458b2bdc62aSAdrian Chadd u_int
459b2bdc62aSAdrian Chadd rss_getnumcpus(void)
460b2bdc62aSAdrian Chadd {
461b2bdc62aSAdrian Chadd 
462b2bdc62aSAdrian Chadd 	return (rss_ncpus);
463b2bdc62aSAdrian Chadd }
464b2bdc62aSAdrian Chadd 
465b2bdc62aSAdrian Chadd /*
466b2bdc62aSAdrian Chadd  * Return the supported RSS hash configuration.
467b2bdc62aSAdrian Chadd  *
468b2bdc62aSAdrian Chadd  * NICs should query this to determine what to configure in their redirection
469b2bdc62aSAdrian Chadd  * matching table.
470b2bdc62aSAdrian Chadd  */
471b2bdc62aSAdrian Chadd inline u_int
472b2bdc62aSAdrian Chadd rss_gethashconfig(void)
473b2bdc62aSAdrian Chadd {
474b2bdc62aSAdrian Chadd 
475b2bdc62aSAdrian Chadd 	/* Return 4-tuple for TCP; 2-tuple for others */
476b2bdc62aSAdrian Chadd 	/*
477b2bdc62aSAdrian Chadd 	 * UDP may fragment more often than TCP and thus we'll end up with
478b2bdc62aSAdrian Chadd 	 * NICs returning 2-tuple fragments.
479b2bdc62aSAdrian Chadd 	 * udp_init() and udplite_init() both currently initialise things
480b2bdc62aSAdrian Chadd 	 * as 2-tuple.
481b2bdc62aSAdrian Chadd 	 * So for now disable UDP 4-tuple hashing until all of the other
482b2bdc62aSAdrian Chadd 	 * pieces are in place.
483b2bdc62aSAdrian Chadd 	 */
484b2bdc62aSAdrian Chadd 	return (
485b2bdc62aSAdrian Chadd 	    RSS_HASHTYPE_RSS_IPV4
486b2bdc62aSAdrian Chadd 	|    RSS_HASHTYPE_RSS_TCP_IPV4
487b2bdc62aSAdrian Chadd 	|    RSS_HASHTYPE_RSS_IPV6
488b2bdc62aSAdrian Chadd 	|    RSS_HASHTYPE_RSS_TCP_IPV6
489b2bdc62aSAdrian Chadd 	|    RSS_HASHTYPE_RSS_IPV6_EX
490b2bdc62aSAdrian Chadd 	|    RSS_HASHTYPE_RSS_TCP_IPV6_EX
491b2bdc62aSAdrian Chadd #if 0
492b2bdc62aSAdrian Chadd 	|    RSS_HASHTYPE_RSS_UDP_IPV4
493b2bdc62aSAdrian Chadd 	|    RSS_HASHTYPE_RSS_UDP_IPV6
494b2bdc62aSAdrian Chadd 	|    RSS_HASHTYPE_RSS_UDP_IPV6_EX
495b2bdc62aSAdrian Chadd #endif
496b2bdc62aSAdrian Chadd 	);
497b2bdc62aSAdrian Chadd }
498b2bdc62aSAdrian Chadd 
499b2bdc62aSAdrian Chadd /*
500b2bdc62aSAdrian Chadd  * XXXRW: Confirm that sysctl -a won't dump this keying material, don't want
501b2bdc62aSAdrian Chadd  * it appearing in debugging output unnecessarily.
502b2bdc62aSAdrian Chadd  */
503b2bdc62aSAdrian Chadd static int
504b2bdc62aSAdrian Chadd sysctl_rss_key(SYSCTL_HANDLER_ARGS)
505b2bdc62aSAdrian Chadd {
506b2bdc62aSAdrian Chadd 	uint8_t temp_rss_key[RSS_KEYSIZE];
507b2bdc62aSAdrian Chadd 	int error;
508b2bdc62aSAdrian Chadd 
509b2bdc62aSAdrian Chadd 	error = priv_check(req->td, PRIV_NETINET_HASHKEY);
510b2bdc62aSAdrian Chadd 	if (error)
511b2bdc62aSAdrian Chadd 		return (error);
512b2bdc62aSAdrian Chadd 
513b2bdc62aSAdrian Chadd 	bcopy(rss_key, temp_rss_key, sizeof(temp_rss_key));
514b2bdc62aSAdrian Chadd 	error = sysctl_handle_opaque(oidp, temp_rss_key,
515b2bdc62aSAdrian Chadd 	    sizeof(temp_rss_key), req);
516b2bdc62aSAdrian Chadd 	if (error)
517b2bdc62aSAdrian Chadd 		return (error);
518b2bdc62aSAdrian Chadd 	if (req->newptr != NULL) {
519b2bdc62aSAdrian Chadd 		/* XXXRW: Not yet. */
520b2bdc62aSAdrian Chadd 		return (EINVAL);
521b2bdc62aSAdrian Chadd 	}
522b2bdc62aSAdrian Chadd 	return (0);
523b2bdc62aSAdrian Chadd }
524b2bdc62aSAdrian Chadd SYSCTL_PROC(_net_inet_rss, OID_AUTO, key,
525b2bdc62aSAdrian Chadd     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_rss_key,
526b2bdc62aSAdrian Chadd     "", "RSS keying material");
527b2bdc62aSAdrian Chadd 
528b2bdc62aSAdrian Chadd static int
529b2bdc62aSAdrian Chadd sysctl_rss_bucket_mapping(SYSCTL_HANDLER_ARGS)
530b2bdc62aSAdrian Chadd {
531b2bdc62aSAdrian Chadd 	struct sbuf *sb;
532b2bdc62aSAdrian Chadd 	int error;
533b2bdc62aSAdrian Chadd 	int i;
534b2bdc62aSAdrian Chadd 
535b2bdc62aSAdrian Chadd 	error = 0;
536b2bdc62aSAdrian Chadd 	error = sysctl_wire_old_buffer(req, 0);
537b2bdc62aSAdrian Chadd 	if (error != 0)
538b2bdc62aSAdrian Chadd 		return (error);
539b2bdc62aSAdrian Chadd 	sb = sbuf_new_for_sysctl(NULL, NULL, 512, req);
540b2bdc62aSAdrian Chadd 	if (sb == NULL)
541b2bdc62aSAdrian Chadd 		return (ENOMEM);
542b2bdc62aSAdrian Chadd 	for (i = 0; i < rss_buckets; i++) {
543b2bdc62aSAdrian Chadd 		sbuf_printf(sb, "%s%d:%d", i == 0 ? "" : " ",
544b2bdc62aSAdrian Chadd 		    i,
545b2bdc62aSAdrian Chadd 		    rss_getcpu(i));
546b2bdc62aSAdrian Chadd 	}
547b2bdc62aSAdrian Chadd 	error = sbuf_finish(sb);
548b2bdc62aSAdrian Chadd 	sbuf_delete(sb);
549b2bdc62aSAdrian Chadd 
550b2bdc62aSAdrian Chadd 	return (error);
551b2bdc62aSAdrian Chadd }
552b2bdc62aSAdrian Chadd SYSCTL_PROC(_net_inet_rss, OID_AUTO, bucket_mapping,
553*7029da5cSPawel Biernacki     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
554b2bdc62aSAdrian Chadd     sysctl_rss_bucket_mapping, "", "RSS bucket -> CPU mapping");
555