xref: /freebsd/sys/net/rss_config.c (revision dfc016587a1e11191676c42672aeeee5eb8cd64b)
1b2bdc62aSAdrian Chadd /*-
2b2bdc62aSAdrian Chadd  * Copyright (c) 2010-2011 Juniper Networks, Inc.
3b2bdc62aSAdrian Chadd  * All rights reserved.
4b2bdc62aSAdrian Chadd  *
5b2bdc62aSAdrian Chadd  * This software was developed by Robert N. M. Watson under contract
6b2bdc62aSAdrian Chadd  * to Juniper Networks, Inc.
7b2bdc62aSAdrian Chadd  *
8b2bdc62aSAdrian Chadd  * Redistribution and use in source and binary forms, with or without
9b2bdc62aSAdrian Chadd  * modification, are permitted provided that the following conditions
10b2bdc62aSAdrian Chadd  * are met:
11b2bdc62aSAdrian Chadd  * 1. Redistributions of source code must retain the above copyright
12b2bdc62aSAdrian Chadd  *    notice, this list of conditions and the following disclaimer.
13b2bdc62aSAdrian Chadd  * 2. Redistributions in binary form must reproduce the above copyright
14b2bdc62aSAdrian Chadd  *    notice, this list of conditions and the following disclaimer in the
15b2bdc62aSAdrian Chadd  *    documentation and/or other materials provided with the distribution.
16b2bdc62aSAdrian Chadd  *
17b2bdc62aSAdrian Chadd  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18b2bdc62aSAdrian Chadd  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19b2bdc62aSAdrian Chadd  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20b2bdc62aSAdrian Chadd  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21b2bdc62aSAdrian Chadd  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22b2bdc62aSAdrian Chadd  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23b2bdc62aSAdrian Chadd  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24b2bdc62aSAdrian Chadd  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25b2bdc62aSAdrian Chadd  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26b2bdc62aSAdrian Chadd  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27b2bdc62aSAdrian Chadd  * SUCH DAMAGE.
28b2bdc62aSAdrian Chadd  */
29b2bdc62aSAdrian Chadd 
30b2bdc62aSAdrian Chadd 
31b2bdc62aSAdrian Chadd #include "opt_inet6.h"
32b2bdc62aSAdrian Chadd 
33b2bdc62aSAdrian Chadd #include <sys/param.h>
34b2bdc62aSAdrian Chadd #include <sys/mbuf.h>
35b2bdc62aSAdrian Chadd #include <sys/socket.h>
36b2bdc62aSAdrian Chadd #include <sys/priv.h>
37b2bdc62aSAdrian Chadd #include <sys/kernel.h>
38b2bdc62aSAdrian Chadd #include <sys/smp.h>
39b2bdc62aSAdrian Chadd #include <sys/sysctl.h>
40b2bdc62aSAdrian Chadd #include <sys/sbuf.h>
41b2bdc62aSAdrian Chadd 
42b2bdc62aSAdrian Chadd #include <net/if.h>
43b2bdc62aSAdrian Chadd #include <net/if_var.h>
44b2bdc62aSAdrian Chadd #include <net/netisr.h>
45b2bdc62aSAdrian Chadd #include <net/rss_config.h>
46b2bdc62aSAdrian Chadd #include <net/toeplitz.h>
47b2bdc62aSAdrian Chadd 
48b2bdc62aSAdrian Chadd /*-
49b2bdc62aSAdrian Chadd  * Operating system parts of receiver-side scaling (RSS), which allows
50b2bdc62aSAdrian Chadd  * network cards to direct flows to particular receive queues based on hashes
51b2bdc62aSAdrian Chadd  * of header tuples.  This implementation aligns RSS buckets with connection
52b2bdc62aSAdrian Chadd  * groups at the TCP/IP layer, so each bucket is associated with exactly one
53b2bdc62aSAdrian Chadd  * group.  As a result, the group lookup structures (and lock) should have an
54b2bdc62aSAdrian Chadd  * effective affinity with exactly one CPU.
55b2bdc62aSAdrian Chadd  *
56b2bdc62aSAdrian Chadd  * Network device drivers needing to configure RSS will query this framework
57b2bdc62aSAdrian Chadd  * for parameters, such as the current RSS key, hashing policies, number of
58b2bdc62aSAdrian Chadd  * bits, and indirection table mapping hashes to buckets and CPUs.  They may
59b2bdc62aSAdrian Chadd  * provide their own supplementary information, such as queue<->CPU bindings.
60b2bdc62aSAdrian Chadd  * It is the responsibility of the network device driver to inject packets
61b2bdc62aSAdrian Chadd  * into the stack on as close to the right CPU as possible, if playing by RSS
62b2bdc62aSAdrian Chadd  * rules.
63b2bdc62aSAdrian Chadd  *
64b2bdc62aSAdrian Chadd  * TODO:
65b2bdc62aSAdrian Chadd  *
66b2bdc62aSAdrian Chadd  * - Synchronization for rss_key and other future-configurable parameters.
67b2bdc62aSAdrian Chadd  * - Event handler drivers can register to pick up RSS configuration changes.
68b2bdc62aSAdrian Chadd  * - Should we allow rss_basecpu to be configured?
69b2bdc62aSAdrian Chadd  * - Randomize key on boot.
70b2bdc62aSAdrian Chadd  * - IPv6 support.
71b2bdc62aSAdrian Chadd  * - Statistics on how often there's a misalignment between hardware
72b2bdc62aSAdrian Chadd  *   placement and pcbgroup expectations.
73b2bdc62aSAdrian Chadd  */
74b2bdc62aSAdrian Chadd 
75b2bdc62aSAdrian Chadd SYSCTL_DECL(_net_inet);
76*7029da5cSPawel Biernacki SYSCTL_NODE(_net_inet, OID_AUTO, rss, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
77*7029da5cSPawel Biernacki     "Receive-side steering");
78b2bdc62aSAdrian Chadd 
79b2bdc62aSAdrian Chadd /*
80b2bdc62aSAdrian Chadd  * Toeplitz is the only required hash function in the RSS spec, so use it by
81b2bdc62aSAdrian Chadd  * default.
82b2bdc62aSAdrian Chadd  */
83b2bdc62aSAdrian Chadd static u_int	rss_hashalgo = RSS_HASH_TOEPLITZ;
84b2bdc62aSAdrian Chadd SYSCTL_INT(_net_inet_rss, OID_AUTO, hashalgo, CTLFLAG_RDTUN, &rss_hashalgo, 0,
85b2bdc62aSAdrian Chadd     "RSS hash algorithm");
86b2bdc62aSAdrian Chadd 
87b2bdc62aSAdrian Chadd /*
88b2bdc62aSAdrian Chadd  * Size of the indirection table; at most 128 entries per the RSS spec.  We
89b2bdc62aSAdrian Chadd  * size it to at least 2 times the number of CPUs by default to allow useful
90b2bdc62aSAdrian Chadd  * rebalancing.  If not set explicitly with a loader tunable, we tune based
91b2bdc62aSAdrian Chadd  * on the number of CPUs present.
92b2bdc62aSAdrian Chadd  *
93b2bdc62aSAdrian Chadd  * XXXRW: buckets might be better to use for the tunable than bits.
94b2bdc62aSAdrian Chadd  */
95b2bdc62aSAdrian Chadd static u_int	rss_bits;
96b2bdc62aSAdrian Chadd SYSCTL_INT(_net_inet_rss, OID_AUTO, bits, CTLFLAG_RDTUN, &rss_bits, 0,
97b2bdc62aSAdrian Chadd     "RSS bits");
98b2bdc62aSAdrian Chadd 
99b2bdc62aSAdrian Chadd static u_int	rss_mask;
100b2bdc62aSAdrian Chadd SYSCTL_INT(_net_inet_rss, OID_AUTO, mask, CTLFLAG_RD, &rss_mask, 0,
101b2bdc62aSAdrian Chadd     "RSS mask");
102b2bdc62aSAdrian Chadd 
103b2bdc62aSAdrian Chadd static const u_int	rss_maxbits = RSS_MAXBITS;
104b2bdc62aSAdrian Chadd SYSCTL_INT(_net_inet_rss, OID_AUTO, maxbits, CTLFLAG_RD,
105b2bdc62aSAdrian Chadd     __DECONST(int *, &rss_maxbits), 0, "RSS maximum bits");
106b2bdc62aSAdrian Chadd 
107b2bdc62aSAdrian Chadd /*
108b2bdc62aSAdrian Chadd  * RSS's own count of the number of CPUs it could be using for processing.
109b2bdc62aSAdrian Chadd  * Bounded to 64 by RSS constants.
110b2bdc62aSAdrian Chadd  */
111b2bdc62aSAdrian Chadd static u_int	rss_ncpus;
112b2bdc62aSAdrian Chadd SYSCTL_INT(_net_inet_rss, OID_AUTO, ncpus, CTLFLAG_RD, &rss_ncpus, 0,
113b2bdc62aSAdrian Chadd     "Number of CPUs available to RSS");
114b2bdc62aSAdrian Chadd 
115b2bdc62aSAdrian Chadd #define	RSS_MAXCPUS	(1 << (RSS_MAXBITS - 1))
116b2bdc62aSAdrian Chadd static const u_int	rss_maxcpus = RSS_MAXCPUS;
117b2bdc62aSAdrian Chadd SYSCTL_INT(_net_inet_rss, OID_AUTO, maxcpus, CTLFLAG_RD,
118b2bdc62aSAdrian Chadd     __DECONST(int *, &rss_maxcpus), 0, "RSS maximum CPUs that can be used");
119b2bdc62aSAdrian Chadd 
120b2bdc62aSAdrian Chadd /*
121b2bdc62aSAdrian Chadd  * Variable exists just for reporting rss_bits in a user-friendly way.
122b2bdc62aSAdrian Chadd  */
123b2bdc62aSAdrian Chadd static u_int	rss_buckets;
124b2bdc62aSAdrian Chadd SYSCTL_INT(_net_inet_rss, OID_AUTO, buckets, CTLFLAG_RD, &rss_buckets, 0,
125b2bdc62aSAdrian Chadd     "RSS buckets");
126b2bdc62aSAdrian Chadd 
127b2bdc62aSAdrian Chadd /*
128b2bdc62aSAdrian Chadd  * Base CPU number; devices will add this to all CPU numbers returned by the
129b2bdc62aSAdrian Chadd  * RSS indirection table.  Currently unmodifable in FreeBSD.
130b2bdc62aSAdrian Chadd  */
131b2bdc62aSAdrian Chadd static const u_int	rss_basecpu;
132b2bdc62aSAdrian Chadd SYSCTL_INT(_net_inet_rss, OID_AUTO, basecpu, CTLFLAG_RD,
133b2bdc62aSAdrian Chadd     __DECONST(int *, &rss_basecpu), 0, "RSS base CPU");
134b2bdc62aSAdrian Chadd 
135b2bdc62aSAdrian Chadd /*
136e5562eb9SAdrian Chadd  * Print verbose debugging messages.
137e5562eb9SAdrian Chadd  * 0 - disable
138e5562eb9SAdrian Chadd  * non-zero - enable
139e5562eb9SAdrian Chadd  */
140e5562eb9SAdrian Chadd int	rss_debug = 0;
141e5562eb9SAdrian Chadd SYSCTL_INT(_net_inet_rss, OID_AUTO, debug, CTLFLAG_RWTUN, &rss_debug, 0,
142e5562eb9SAdrian Chadd     "RSS debug level");
143e5562eb9SAdrian Chadd 
144e5562eb9SAdrian Chadd /*
145b2bdc62aSAdrian Chadd  * RSS secret key, intended to prevent attacks on load-balancing.  Its
146b2bdc62aSAdrian Chadd  * effectiveness may be limited by algorithm choice and available entropy
147b2bdc62aSAdrian Chadd  * during the boot.
148b2bdc62aSAdrian Chadd  *
149b2bdc62aSAdrian Chadd  * XXXRW: And that we don't randomize it yet!
150b2bdc62aSAdrian Chadd  *
151b2bdc62aSAdrian Chadd  * This is the default Microsoft RSS specification key which is also
152b2bdc62aSAdrian Chadd  * the Chelsio T5 firmware default key.
153b2bdc62aSAdrian Chadd  */
154b2bdc62aSAdrian Chadd static uint8_t rss_key[RSS_KEYSIZE] = {
155b2bdc62aSAdrian Chadd 	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
156b2bdc62aSAdrian Chadd 	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
157b2bdc62aSAdrian Chadd 	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
158b2bdc62aSAdrian Chadd 	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
159b2bdc62aSAdrian Chadd 	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
160b2bdc62aSAdrian Chadd };
161b2bdc62aSAdrian Chadd 
162b2bdc62aSAdrian Chadd /*
163b2bdc62aSAdrian Chadd  * RSS hash->CPU table, which maps hashed packet headers to particular CPUs.
164a4641f4eSPedro F. Giffuni  * Drivers may supplement this table with a separate CPU<->queue table when
165b2bdc62aSAdrian Chadd  * programming devices.
166b2bdc62aSAdrian Chadd  */
167b2bdc62aSAdrian Chadd struct rss_table_entry {
168b2bdc62aSAdrian Chadd 	uint8_t		rte_cpu;	/* CPU affinity of bucket. */
169b2bdc62aSAdrian Chadd };
170b2bdc62aSAdrian Chadd static struct rss_table_entry	rss_table[RSS_TABLE_MAXLEN];
171b2bdc62aSAdrian Chadd 
172b2bdc62aSAdrian Chadd static void
rss_init(__unused void * arg)173b2bdc62aSAdrian Chadd rss_init(__unused void *arg)
174b2bdc62aSAdrian Chadd {
175b2bdc62aSAdrian Chadd 	u_int i;
176b2bdc62aSAdrian Chadd 	u_int cpuid;
177b2bdc62aSAdrian Chadd 
178b2bdc62aSAdrian Chadd 	/*
179b2bdc62aSAdrian Chadd 	 * Validate tunables, coerce to sensible values.
180b2bdc62aSAdrian Chadd 	 */
181b2bdc62aSAdrian Chadd 	switch (rss_hashalgo) {
182b2bdc62aSAdrian Chadd 	case RSS_HASH_TOEPLITZ:
183b2bdc62aSAdrian Chadd 	case RSS_HASH_NAIVE:
184b2bdc62aSAdrian Chadd 		break;
185b2bdc62aSAdrian Chadd 
186b2bdc62aSAdrian Chadd 	default:
187e5562eb9SAdrian Chadd 		RSS_DEBUG("invalid RSS hashalgo %u, coercing to %u\n",
188e5562eb9SAdrian Chadd 		    rss_hashalgo, RSS_HASH_TOEPLITZ);
189b2bdc62aSAdrian Chadd 		rss_hashalgo = RSS_HASH_TOEPLITZ;
190b2bdc62aSAdrian Chadd 	}
191b2bdc62aSAdrian Chadd 
192b2bdc62aSAdrian Chadd 	/*
193b2bdc62aSAdrian Chadd 	 * Count available CPUs.
194b2bdc62aSAdrian Chadd 	 *
195b2bdc62aSAdrian Chadd 	 * XXXRW: Note incorrect assumptions regarding contiguity of this set
196b2bdc62aSAdrian Chadd 	 * elsewhere.
197b2bdc62aSAdrian Chadd 	 */
198b2bdc62aSAdrian Chadd 	rss_ncpus = 0;
199b2bdc62aSAdrian Chadd 	for (i = 0; i <= mp_maxid; i++) {
200b2bdc62aSAdrian Chadd 		if (CPU_ABSENT(i))
201b2bdc62aSAdrian Chadd 			continue;
202b2bdc62aSAdrian Chadd 		rss_ncpus++;
203b2bdc62aSAdrian Chadd 	}
204b2bdc62aSAdrian Chadd 	if (rss_ncpus > RSS_MAXCPUS)
205b2bdc62aSAdrian Chadd 		rss_ncpus = RSS_MAXCPUS;
206b2bdc62aSAdrian Chadd 
207b2bdc62aSAdrian Chadd 	/*
208b2bdc62aSAdrian Chadd 	 * Tune RSS table entries to be no less than 2x the number of CPUs
209b2bdc62aSAdrian Chadd 	 * -- unless we're running uniprocessor, in which case there's not
210b2bdc62aSAdrian Chadd 	 * much point in having buckets to rearrange for load-balancing!
211b2bdc62aSAdrian Chadd 	 */
212b2bdc62aSAdrian Chadd 	if (rss_ncpus > 1) {
213b2bdc62aSAdrian Chadd 		if (rss_bits == 0)
214b2bdc62aSAdrian Chadd 			rss_bits = fls(rss_ncpus - 1) + 1;
215b2bdc62aSAdrian Chadd 
216b2bdc62aSAdrian Chadd 		/*
217b2bdc62aSAdrian Chadd 		 * Microsoft limits RSS table entries to 128, so apply that
218b2bdc62aSAdrian Chadd 		 * limit to both auto-detected CPU counts and user-configured
219b2bdc62aSAdrian Chadd 		 * ones.
220b2bdc62aSAdrian Chadd 		 */
221b2bdc62aSAdrian Chadd 		if (rss_bits == 0 || rss_bits > RSS_MAXBITS) {
222e5562eb9SAdrian Chadd 			RSS_DEBUG("RSS bits %u not valid, coercing to %u\n",
223e5562eb9SAdrian Chadd 			    rss_bits, RSS_MAXBITS);
224b2bdc62aSAdrian Chadd 			rss_bits = RSS_MAXBITS;
225b2bdc62aSAdrian Chadd 		}
226b2bdc62aSAdrian Chadd 
227b2bdc62aSAdrian Chadd 		/*
228b2bdc62aSAdrian Chadd 		 * Figure out how many buckets to use; warn if less than the
229b2bdc62aSAdrian Chadd 		 * number of configured CPUs, although this is not a fatal
230b2bdc62aSAdrian Chadd 		 * problem.
231b2bdc62aSAdrian Chadd 		 */
232b2bdc62aSAdrian Chadd 		rss_buckets = (1 << rss_bits);
233b2bdc62aSAdrian Chadd 		if (rss_buckets < rss_ncpus)
234e5562eb9SAdrian Chadd 			RSS_DEBUG("WARNING: rss_buckets (%u) less than "
235e5562eb9SAdrian Chadd 			    "rss_ncpus (%u)\n", rss_buckets, rss_ncpus);
236b2bdc62aSAdrian Chadd 		rss_mask = rss_buckets - 1;
237b2bdc62aSAdrian Chadd 	} else {
238b2bdc62aSAdrian Chadd 		rss_bits = 0;
239b2bdc62aSAdrian Chadd 		rss_buckets = 1;
240b2bdc62aSAdrian Chadd 		rss_mask = 0;
241b2bdc62aSAdrian Chadd 	}
242b2bdc62aSAdrian Chadd 
243b2bdc62aSAdrian Chadd 	/*
244b2bdc62aSAdrian Chadd 	 * Set up initial CPU assignments: round-robin by default.
245b2bdc62aSAdrian Chadd 	 */
246b2bdc62aSAdrian Chadd 	cpuid = CPU_FIRST();
247b2bdc62aSAdrian Chadd 	for (i = 0; i < rss_buckets; i++) {
248b2bdc62aSAdrian Chadd 		rss_table[i].rte_cpu = cpuid;
249b2bdc62aSAdrian Chadd 		cpuid = CPU_NEXT(cpuid);
250b2bdc62aSAdrian Chadd 	}
251b2bdc62aSAdrian Chadd 
252b2bdc62aSAdrian Chadd 	/*
253b2bdc62aSAdrian Chadd 	 * Randomize rrs_key.
254b2bdc62aSAdrian Chadd 	 *
255b2bdc62aSAdrian Chadd 	 * XXXRW: Not yet.  If nothing else, will require an rss_isbadkey()
256b2bdc62aSAdrian Chadd 	 * loop to check for "bad" RSS keys.
257b2bdc62aSAdrian Chadd 	 */
258b2bdc62aSAdrian Chadd }
259b2bdc62aSAdrian Chadd SYSINIT(rss_init, SI_SUB_SOFTINTR, SI_ORDER_SECOND, rss_init, NULL);
260b2bdc62aSAdrian Chadd 
/*
 * Trivial "hash": the 32-bit wrapping sum of every key byte followed by
 * every data byte.  Either length may be zero.
 */
static uint32_t
rss_naive_hash(u_int keylen, const uint8_t *key, u_int datalen,
    const uint8_t *data)
{
	const uint8_t *p;
	uint32_t sum = 0;

	for (p = key; p != key + keylen; p++)
		sum += *p;
	for (p = data; p != data + datalen; p++)
		sum += *p;

	return (sum);
}
275b2bdc62aSAdrian Chadd 
276b2bdc62aSAdrian Chadd uint32_t
rss_hash(u_int datalen,const uint8_t * data)277b2bdc62aSAdrian Chadd rss_hash(u_int datalen, const uint8_t *data)
278b2bdc62aSAdrian Chadd {
279b2bdc62aSAdrian Chadd 
280b2bdc62aSAdrian Chadd 	switch (rss_hashalgo) {
281b2bdc62aSAdrian Chadd 	case RSS_HASH_TOEPLITZ:
282b2bdc62aSAdrian Chadd 		return (toeplitz_hash(sizeof(rss_key), rss_key, datalen,
283b2bdc62aSAdrian Chadd 		    data));
284b2bdc62aSAdrian Chadd 
285b2bdc62aSAdrian Chadd 	case RSS_HASH_NAIVE:
286b2bdc62aSAdrian Chadd 		return (rss_naive_hash(sizeof(rss_key), rss_key, datalen,
287b2bdc62aSAdrian Chadd 		    data));
288b2bdc62aSAdrian Chadd 
289b2bdc62aSAdrian Chadd 	default:
290b2bdc62aSAdrian Chadd 		panic("%s: unsupported/unknown hashalgo %d", __func__,
291b2bdc62aSAdrian Chadd 		    rss_hashalgo);
292b2bdc62aSAdrian Chadd 	}
293b2bdc62aSAdrian Chadd }
294b2bdc62aSAdrian Chadd 
295b2bdc62aSAdrian Chadd /*
296b2bdc62aSAdrian Chadd  * Query the number of RSS bits in use.
297b2bdc62aSAdrian Chadd  */
298b2bdc62aSAdrian Chadd u_int
rss_getbits(void)299b2bdc62aSAdrian Chadd rss_getbits(void)
300b2bdc62aSAdrian Chadd {
301b2bdc62aSAdrian Chadd 
302b2bdc62aSAdrian Chadd 	return (rss_bits);
303b2bdc62aSAdrian Chadd }
304b2bdc62aSAdrian Chadd 
305b2bdc62aSAdrian Chadd /*
306b2bdc62aSAdrian Chadd  * Query the RSS bucket associated with an RSS hash.
307b2bdc62aSAdrian Chadd  */
308b2bdc62aSAdrian Chadd u_int
rss_getbucket(u_int hash)309b2bdc62aSAdrian Chadd rss_getbucket(u_int hash)
310b2bdc62aSAdrian Chadd {
311b2bdc62aSAdrian Chadd 
312b2bdc62aSAdrian Chadd 	return (hash & rss_mask);
313b2bdc62aSAdrian Chadd }
314b2bdc62aSAdrian Chadd 
315b2bdc62aSAdrian Chadd /*
316b2bdc62aSAdrian Chadd  * Query the RSS layer bucket associated with the given
317b2bdc62aSAdrian Chadd  * entry in the RSS hash space.
318b2bdc62aSAdrian Chadd  *
319b2bdc62aSAdrian Chadd  * The RSS indirection table is 0 .. rss_buckets-1,
320b2bdc62aSAdrian Chadd  * covering the low 'rss_bits' of the total 128 slot
321b2bdc62aSAdrian Chadd  * RSS indirection table.  So just mask off rss_bits and
322b2bdc62aSAdrian Chadd  * return that.
323b2bdc62aSAdrian Chadd  *
324b2bdc62aSAdrian Chadd  * NIC drivers can then iterate over the 128 slot RSS
325b2bdc62aSAdrian Chadd  * indirection table and fetch which RSS bucket to
326b2bdc62aSAdrian Chadd  * map it to.  This will typically be a CPU queue
327b2bdc62aSAdrian Chadd  */
328b2bdc62aSAdrian Chadd u_int
rss_get_indirection_to_bucket(u_int index)329b2bdc62aSAdrian Chadd rss_get_indirection_to_bucket(u_int index)
330b2bdc62aSAdrian Chadd {
331b2bdc62aSAdrian Chadd 
332b2bdc62aSAdrian Chadd 	return (index & rss_mask);
333b2bdc62aSAdrian Chadd }
334b2bdc62aSAdrian Chadd 
335b2bdc62aSAdrian Chadd /*
336b2bdc62aSAdrian Chadd  * Query the RSS CPU associated with an RSS bucket.
337b2bdc62aSAdrian Chadd  */
338b2bdc62aSAdrian Chadd u_int
rss_getcpu(u_int bucket)339b2bdc62aSAdrian Chadd rss_getcpu(u_int bucket)
340b2bdc62aSAdrian Chadd {
341b2bdc62aSAdrian Chadd 
342b2bdc62aSAdrian Chadd 	return (rss_table[bucket].rte_cpu);
343b2bdc62aSAdrian Chadd }
344b2bdc62aSAdrian Chadd 
345b2bdc62aSAdrian Chadd /*
346b2bdc62aSAdrian Chadd  * netisr CPU affinity lookup given just the hash and hashtype.
347b2bdc62aSAdrian Chadd  */
348b2bdc62aSAdrian Chadd u_int
rss_hash2cpuid(uint32_t hash_val,uint32_t hash_type)349b2bdc62aSAdrian Chadd rss_hash2cpuid(uint32_t hash_val, uint32_t hash_type)
350b2bdc62aSAdrian Chadd {
351b2bdc62aSAdrian Chadd 
352b2bdc62aSAdrian Chadd 	switch (hash_type) {
353b2bdc62aSAdrian Chadd 	case M_HASHTYPE_RSS_IPV4:
354b2bdc62aSAdrian Chadd 	case M_HASHTYPE_RSS_TCP_IPV4:
355b2bdc62aSAdrian Chadd 	case M_HASHTYPE_RSS_UDP_IPV4:
356b2bdc62aSAdrian Chadd 	case M_HASHTYPE_RSS_IPV6:
357b2bdc62aSAdrian Chadd 	case M_HASHTYPE_RSS_TCP_IPV6:
358b2bdc62aSAdrian Chadd 	case M_HASHTYPE_RSS_UDP_IPV6:
359b2bdc62aSAdrian Chadd 		return (rss_getcpu(rss_getbucket(hash_val)));
360b2bdc62aSAdrian Chadd 	default:
361b2bdc62aSAdrian Chadd 		return (NETISR_CPUID_NONE);
362b2bdc62aSAdrian Chadd 	}
363b2bdc62aSAdrian Chadd }
364b2bdc62aSAdrian Chadd 
365b2bdc62aSAdrian Chadd /*
366b2bdc62aSAdrian Chadd  * Query the RSS bucket associated with the given hash value and
367b2bdc62aSAdrian Chadd  * type.
368b2bdc62aSAdrian Chadd  */
369b2bdc62aSAdrian Chadd int
rss_hash2bucket(uint32_t hash_val,uint32_t hash_type,uint32_t * bucket_id)370b2bdc62aSAdrian Chadd rss_hash2bucket(uint32_t hash_val, uint32_t hash_type, uint32_t *bucket_id)
371b2bdc62aSAdrian Chadd {
372b2bdc62aSAdrian Chadd 
373b2bdc62aSAdrian Chadd 	switch (hash_type) {
374b2bdc62aSAdrian Chadd 	case M_HASHTYPE_RSS_IPV4:
375b2bdc62aSAdrian Chadd 	case M_HASHTYPE_RSS_TCP_IPV4:
376b2bdc62aSAdrian Chadd 	case M_HASHTYPE_RSS_UDP_IPV4:
377b2bdc62aSAdrian Chadd 	case M_HASHTYPE_RSS_IPV6:
378b2bdc62aSAdrian Chadd 	case M_HASHTYPE_RSS_TCP_IPV6:
379b2bdc62aSAdrian Chadd 	case M_HASHTYPE_RSS_UDP_IPV6:
380b2bdc62aSAdrian Chadd 		*bucket_id = rss_getbucket(hash_val);
381b2bdc62aSAdrian Chadd 		return (0);
382b2bdc62aSAdrian Chadd 	default:
383b2bdc62aSAdrian Chadd 		return (-1);
384b2bdc62aSAdrian Chadd 	}
385b2bdc62aSAdrian Chadd }
386b2bdc62aSAdrian Chadd 
387b2bdc62aSAdrian Chadd /*
388b2bdc62aSAdrian Chadd  * netisr CPU affinity lookup routine for use by protocols.
389b2bdc62aSAdrian Chadd  */
390b2bdc62aSAdrian Chadd struct mbuf *
rss_m2cpuid(struct mbuf * m,uintptr_t source,u_int * cpuid)391b2bdc62aSAdrian Chadd rss_m2cpuid(struct mbuf *m, uintptr_t source, u_int *cpuid)
392b2bdc62aSAdrian Chadd {
393b2bdc62aSAdrian Chadd 
394b2bdc62aSAdrian Chadd 	M_ASSERTPKTHDR(m);
395b2bdc62aSAdrian Chadd 	*cpuid = rss_hash2cpuid(m->m_pkthdr.flowid, M_HASHTYPE_GET(m));
396b2bdc62aSAdrian Chadd 	return (m);
397b2bdc62aSAdrian Chadd }
398b2bdc62aSAdrian Chadd 
399b2bdc62aSAdrian Chadd int
rss_m2bucket(struct mbuf * m,uint32_t * bucket_id)400b2bdc62aSAdrian Chadd rss_m2bucket(struct mbuf *m, uint32_t *bucket_id)
401b2bdc62aSAdrian Chadd {
402b2bdc62aSAdrian Chadd 
403b2bdc62aSAdrian Chadd 	M_ASSERTPKTHDR(m);
404b2bdc62aSAdrian Chadd 
405b2bdc62aSAdrian Chadd 	return(rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m),
406b2bdc62aSAdrian Chadd 	    bucket_id));
407b2bdc62aSAdrian Chadd }
408b2bdc62aSAdrian Chadd 
409b2bdc62aSAdrian Chadd /*
410b2bdc62aSAdrian Chadd  * Query the RSS hash algorithm.
411b2bdc62aSAdrian Chadd  */
412b2bdc62aSAdrian Chadd u_int
rss_gethashalgo(void)413b2bdc62aSAdrian Chadd rss_gethashalgo(void)
414b2bdc62aSAdrian Chadd {
415b2bdc62aSAdrian Chadd 
416b2bdc62aSAdrian Chadd 	return (rss_hashalgo);
417b2bdc62aSAdrian Chadd }
418b2bdc62aSAdrian Chadd 
419b2bdc62aSAdrian Chadd /*
420b2bdc62aSAdrian Chadd  * Query the current RSS key; likely to be used by device drivers when
421b2bdc62aSAdrian Chadd  * configuring hardware RSS.  Caller must pass an array of size RSS_KEYSIZE.
422b2bdc62aSAdrian Chadd  *
423b2bdc62aSAdrian Chadd  * XXXRW: Perhaps we should do the accept-a-length-and-truncate thing?
424b2bdc62aSAdrian Chadd  */
425b2bdc62aSAdrian Chadd void
rss_getkey(uint8_t * key)426b2bdc62aSAdrian Chadd rss_getkey(uint8_t *key)
427b2bdc62aSAdrian Chadd {
428b2bdc62aSAdrian Chadd 
429b2bdc62aSAdrian Chadd 	bcopy(rss_key, key, sizeof(rss_key));
430b2bdc62aSAdrian Chadd }
431b2bdc62aSAdrian Chadd 
432b2bdc62aSAdrian Chadd /*
433b2bdc62aSAdrian Chadd  * Query the number of buckets; this may be used by both network device
434b2bdc62aSAdrian Chadd  * drivers, which will need to populate hardware shadows of the software
435b2bdc62aSAdrian Chadd  * indirection table, and the network stack itself (such as when deciding how
436b2bdc62aSAdrian Chadd  * many connection groups to allocate).
437b2bdc62aSAdrian Chadd  */
438b2bdc62aSAdrian Chadd u_int
rss_getnumbuckets(void)439b2bdc62aSAdrian Chadd rss_getnumbuckets(void)
440b2bdc62aSAdrian Chadd {
441b2bdc62aSAdrian Chadd 
442b2bdc62aSAdrian Chadd 	return (rss_buckets);
443b2bdc62aSAdrian Chadd }
444b2bdc62aSAdrian Chadd 
445b2bdc62aSAdrian Chadd /*
446b2bdc62aSAdrian Chadd  * Query the number of CPUs in use by RSS; may be useful to device drivers
447b2bdc62aSAdrian Chadd  * trying to figure out how to map a larger number of CPUs into a smaller
448b2bdc62aSAdrian Chadd  * number of receive queues.
449b2bdc62aSAdrian Chadd  */
450b2bdc62aSAdrian Chadd u_int
rss_getnumcpus(void)451b2bdc62aSAdrian Chadd rss_getnumcpus(void)
452b2bdc62aSAdrian Chadd {
453b2bdc62aSAdrian Chadd 
454b2bdc62aSAdrian Chadd 	return (rss_ncpus);
455b2bdc62aSAdrian Chadd }
456b2bdc62aSAdrian Chadd 
457b2bdc62aSAdrian Chadd /*
458b2bdc62aSAdrian Chadd  * Return the supported RSS hash configuration.
459b2bdc62aSAdrian Chadd  *
460b2bdc62aSAdrian Chadd  * NICs should query this to determine what to configure in their redirection
461b2bdc62aSAdrian Chadd  * matching table.
462b2bdc62aSAdrian Chadd  */
463b2bdc62aSAdrian Chadd inline u_int
rss_gethashconfig(void)464b2bdc62aSAdrian Chadd rss_gethashconfig(void)
465b2bdc62aSAdrian Chadd {
466b2bdc62aSAdrian Chadd 
467b2bdc62aSAdrian Chadd 	/* Return 4-tuple for TCP; 2-tuple for others */
468b2bdc62aSAdrian Chadd 	/*
469b2bdc62aSAdrian Chadd 	 * UDP may fragment more often than TCP and thus we'll end up with
470b2bdc62aSAdrian Chadd 	 * NICs returning 2-tuple fragments.
471b2bdc62aSAdrian Chadd 	 * udp_init() and udplite_init() both currently initialise things
472b2bdc62aSAdrian Chadd 	 * as 2-tuple.
473b2bdc62aSAdrian Chadd 	 * So for now disable UDP 4-tuple hashing until all of the other
474b2bdc62aSAdrian Chadd 	 * pieces are in place.
475b2bdc62aSAdrian Chadd 	 */
476b2bdc62aSAdrian Chadd 	return (
477b2bdc62aSAdrian Chadd 	    RSS_HASHTYPE_RSS_IPV4
478b2bdc62aSAdrian Chadd 	|    RSS_HASHTYPE_RSS_TCP_IPV4
479b2bdc62aSAdrian Chadd 	|    RSS_HASHTYPE_RSS_IPV6
480b2bdc62aSAdrian Chadd 	|    RSS_HASHTYPE_RSS_TCP_IPV6
481b2bdc62aSAdrian Chadd 	|    RSS_HASHTYPE_RSS_IPV6_EX
482b2bdc62aSAdrian Chadd 	|    RSS_HASHTYPE_RSS_TCP_IPV6_EX
483b2bdc62aSAdrian Chadd #if 0
484b2bdc62aSAdrian Chadd 	|    RSS_HASHTYPE_RSS_UDP_IPV4
485b2bdc62aSAdrian Chadd 	|    RSS_HASHTYPE_RSS_UDP_IPV6
486b2bdc62aSAdrian Chadd 	|    RSS_HASHTYPE_RSS_UDP_IPV6_EX
487b2bdc62aSAdrian Chadd #endif
488b2bdc62aSAdrian Chadd 	);
489b2bdc62aSAdrian Chadd }
490b2bdc62aSAdrian Chadd 
491b2bdc62aSAdrian Chadd /*
492b2bdc62aSAdrian Chadd  * XXXRW: Confirm that sysctl -a won't dump this keying material, don't want
493b2bdc62aSAdrian Chadd  * it appearing in debugging output unnecessarily.
494b2bdc62aSAdrian Chadd  */
495b2bdc62aSAdrian Chadd static int
sysctl_rss_key(SYSCTL_HANDLER_ARGS)496b2bdc62aSAdrian Chadd sysctl_rss_key(SYSCTL_HANDLER_ARGS)
497b2bdc62aSAdrian Chadd {
498b2bdc62aSAdrian Chadd 	uint8_t temp_rss_key[RSS_KEYSIZE];
499b2bdc62aSAdrian Chadd 	int error;
500b2bdc62aSAdrian Chadd 
501b2bdc62aSAdrian Chadd 	error = priv_check(req->td, PRIV_NETINET_HASHKEY);
502b2bdc62aSAdrian Chadd 	if (error)
503b2bdc62aSAdrian Chadd 		return (error);
504b2bdc62aSAdrian Chadd 
505b2bdc62aSAdrian Chadd 	bcopy(rss_key, temp_rss_key, sizeof(temp_rss_key));
506b2bdc62aSAdrian Chadd 	error = sysctl_handle_opaque(oidp, temp_rss_key,
507b2bdc62aSAdrian Chadd 	    sizeof(temp_rss_key), req);
508b2bdc62aSAdrian Chadd 	if (error)
509b2bdc62aSAdrian Chadd 		return (error);
510b2bdc62aSAdrian Chadd 	if (req->newptr != NULL) {
511b2bdc62aSAdrian Chadd 		/* XXXRW: Not yet. */
512b2bdc62aSAdrian Chadd 		return (EINVAL);
513b2bdc62aSAdrian Chadd 	}
514b2bdc62aSAdrian Chadd 	return (0);
515b2bdc62aSAdrian Chadd }
516b2bdc62aSAdrian Chadd SYSCTL_PROC(_net_inet_rss, OID_AUTO, key,
517b2bdc62aSAdrian Chadd     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_rss_key,
518b2bdc62aSAdrian Chadd     "", "RSS keying material");
519b2bdc62aSAdrian Chadd 
520b2bdc62aSAdrian Chadd static int
sysctl_rss_bucket_mapping(SYSCTL_HANDLER_ARGS)521b2bdc62aSAdrian Chadd sysctl_rss_bucket_mapping(SYSCTL_HANDLER_ARGS)
522b2bdc62aSAdrian Chadd {
523b2bdc62aSAdrian Chadd 	struct sbuf *sb;
524b2bdc62aSAdrian Chadd 	int error;
525b2bdc62aSAdrian Chadd 	int i;
526b2bdc62aSAdrian Chadd 
527b2bdc62aSAdrian Chadd 	error = 0;
528b2bdc62aSAdrian Chadd 	error = sysctl_wire_old_buffer(req, 0);
529b2bdc62aSAdrian Chadd 	if (error != 0)
530b2bdc62aSAdrian Chadd 		return (error);
531b2bdc62aSAdrian Chadd 	sb = sbuf_new_for_sysctl(NULL, NULL, 512, req);
532b2bdc62aSAdrian Chadd 	if (sb == NULL)
533b2bdc62aSAdrian Chadd 		return (ENOMEM);
534b2bdc62aSAdrian Chadd 	for (i = 0; i < rss_buckets; i++) {
535b2bdc62aSAdrian Chadd 		sbuf_printf(sb, "%s%d:%d", i == 0 ? "" : " ",
536b2bdc62aSAdrian Chadd 		    i,
537b2bdc62aSAdrian Chadd 		    rss_getcpu(i));
538b2bdc62aSAdrian Chadd 	}
539b2bdc62aSAdrian Chadd 	error = sbuf_finish(sb);
540b2bdc62aSAdrian Chadd 	sbuf_delete(sb);
541b2bdc62aSAdrian Chadd 
542b2bdc62aSAdrian Chadd 	return (error);
543b2bdc62aSAdrian Chadd }
544b2bdc62aSAdrian Chadd SYSCTL_PROC(_net_inet_rss, OID_AUTO, bucket_mapping,
545*7029da5cSPawel Biernacki     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
546b2bdc62aSAdrian Chadd     sysctl_rss_bucket_mapping, "", "RSS bucket -> CPU mapping");
547