xref: /freebsd/sys/net/rss_config.c (revision b2bdc62a95a0241981d79c46eb79e3208eeec5f8)
1*b2bdc62aSAdrian Chadd /*-
2*b2bdc62aSAdrian Chadd  * Copyright (c) 2010-2011 Juniper Networks, Inc.
3*b2bdc62aSAdrian Chadd  * All rights reserved.
4*b2bdc62aSAdrian Chadd  *
5*b2bdc62aSAdrian Chadd  * This software was developed by Robert N. M. Watson under contract
6*b2bdc62aSAdrian Chadd  * to Juniper Networks, Inc.
7*b2bdc62aSAdrian Chadd  *
8*b2bdc62aSAdrian Chadd  * Redistribution and use in source and binary forms, with or without
9*b2bdc62aSAdrian Chadd  * modification, are permitted provided that the following conditions
10*b2bdc62aSAdrian Chadd  * are met:
11*b2bdc62aSAdrian Chadd  * 1. Redistributions of source code must retain the above copyright
12*b2bdc62aSAdrian Chadd  *    notice, this list of conditions and the following disclaimer.
13*b2bdc62aSAdrian Chadd  * 2. Redistributions in binary form must reproduce the above copyright
14*b2bdc62aSAdrian Chadd  *    notice, this list of conditions and the following disclaimer in the
15*b2bdc62aSAdrian Chadd  *    documentation and/or other materials provided with the distribution.
16*b2bdc62aSAdrian Chadd  *
17*b2bdc62aSAdrian Chadd  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18*b2bdc62aSAdrian Chadd  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19*b2bdc62aSAdrian Chadd  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20*b2bdc62aSAdrian Chadd  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21*b2bdc62aSAdrian Chadd  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22*b2bdc62aSAdrian Chadd  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23*b2bdc62aSAdrian Chadd  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24*b2bdc62aSAdrian Chadd  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25*b2bdc62aSAdrian Chadd  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26*b2bdc62aSAdrian Chadd  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27*b2bdc62aSAdrian Chadd  * SUCH DAMAGE.
28*b2bdc62aSAdrian Chadd  */
29*b2bdc62aSAdrian Chadd 
30*b2bdc62aSAdrian Chadd #include <sys/cdefs.h>
31*b2bdc62aSAdrian Chadd 
32*b2bdc62aSAdrian Chadd __FBSDID("$FreeBSD$");
33*b2bdc62aSAdrian Chadd 
34*b2bdc62aSAdrian Chadd #include "opt_inet6.h"
35*b2bdc62aSAdrian Chadd #include "opt_pcbgroup.h"
36*b2bdc62aSAdrian Chadd 
37*b2bdc62aSAdrian Chadd #ifndef PCBGROUP
38*b2bdc62aSAdrian Chadd #error "options RSS depends on options PCBGROUP"
39*b2bdc62aSAdrian Chadd #endif
40*b2bdc62aSAdrian Chadd 
41*b2bdc62aSAdrian Chadd #include <sys/param.h>
42*b2bdc62aSAdrian Chadd #include <sys/mbuf.h>
43*b2bdc62aSAdrian Chadd #include <sys/socket.h>
44*b2bdc62aSAdrian Chadd #include <sys/priv.h>
45*b2bdc62aSAdrian Chadd #include <sys/kernel.h>
46*b2bdc62aSAdrian Chadd #include <sys/smp.h>
47*b2bdc62aSAdrian Chadd #include <sys/sysctl.h>
48*b2bdc62aSAdrian Chadd #include <sys/sbuf.h>
49*b2bdc62aSAdrian Chadd 
50*b2bdc62aSAdrian Chadd #include <net/if.h>
51*b2bdc62aSAdrian Chadd #include <net/if_var.h>
52*b2bdc62aSAdrian Chadd #include <net/netisr.h>
53*b2bdc62aSAdrian Chadd #include <net/rss_config.h>
54*b2bdc62aSAdrian Chadd #include <net/toeplitz.h>
55*b2bdc62aSAdrian Chadd 
56*b2bdc62aSAdrian Chadd #if 0
57*b2bdc62aSAdrian Chadd #include <netinet/in.h>
58*b2bdc62aSAdrian Chadd #include <netinet/in_pcb.h>
59*b2bdc62aSAdrian Chadd #include <netinet/in_rss.h>
60*b2bdc62aSAdrian Chadd #include <netinet/in_var.h>
61*b2bdc62aSAdrian Chadd 
62*b2bdc62aSAdrian Chadd /* for software rss hash support */
63*b2bdc62aSAdrian Chadd #include <netinet/ip.h>
64*b2bdc62aSAdrian Chadd #include <netinet/tcp.h>
65*b2bdc62aSAdrian Chadd #include <netinet/udp.h>
66*b2bdc62aSAdrian Chadd #endif
67*b2bdc62aSAdrian Chadd 
68*b2bdc62aSAdrian Chadd /*-
69*b2bdc62aSAdrian Chadd  * Operating system parts of receiver-side scaling (RSS), which allows
70*b2bdc62aSAdrian Chadd  * network cards to direct flows to particular receive queues based on hashes
71*b2bdc62aSAdrian Chadd  * of header tuples.  This implementation aligns RSS buckets with connection
72*b2bdc62aSAdrian Chadd  * groups at the TCP/IP layer, so each bucket is associated with exactly one
73*b2bdc62aSAdrian Chadd  * group.  As a result, the group lookup structures (and lock) should have an
74*b2bdc62aSAdrian Chadd  * effective affinity with exactly one CPU.
75*b2bdc62aSAdrian Chadd  *
76*b2bdc62aSAdrian Chadd  * Network device drivers needing to configure RSS will query this framework
77*b2bdc62aSAdrian Chadd  * for parameters, such as the current RSS key, hashing policies, number of
78*b2bdc62aSAdrian Chadd  * bits, and indirection table mapping hashes to buckets and CPUs.  They may
79*b2bdc62aSAdrian Chadd  * provide their own supplementary information, such as queue<->CPU bindings.
80*b2bdc62aSAdrian Chadd  * It is the responsibility of the network device driver to inject packets
81*b2bdc62aSAdrian Chadd  * into the stack on as close to the right CPU as possible, if playing by RSS
82*b2bdc62aSAdrian Chadd  * rules.
83*b2bdc62aSAdrian Chadd  *
84*b2bdc62aSAdrian Chadd  * TODO:
85*b2bdc62aSAdrian Chadd  *
86*b2bdc62aSAdrian Chadd  * - Synchronization for rss_key and other future-configurable parameters.
87*b2bdc62aSAdrian Chadd  * - Event handler drivers can register to pick up RSS configuration changes.
88*b2bdc62aSAdrian Chadd  * - Should we allow rss_basecpu to be configured?
89*b2bdc62aSAdrian Chadd  * - Randomize key on boot.
90*b2bdc62aSAdrian Chadd  * - IPv6 support.
91*b2bdc62aSAdrian Chadd  * - Statistics on how often there's a misalignment between hardware
92*b2bdc62aSAdrian Chadd  *   placement and pcbgroup expectations.
93*b2bdc62aSAdrian Chadd  */
94*b2bdc62aSAdrian Chadd 
95*b2bdc62aSAdrian Chadd SYSCTL_DECL(_net_inet);
96*b2bdc62aSAdrian Chadd SYSCTL_NODE(_net_inet, OID_AUTO, rss, CTLFLAG_RW, 0, "Receive-side steering");
97*b2bdc62aSAdrian Chadd 
98*b2bdc62aSAdrian Chadd /*
99*b2bdc62aSAdrian Chadd  * Toeplitz is the only required hash function in the RSS spec, so use it by
100*b2bdc62aSAdrian Chadd  * default.
101*b2bdc62aSAdrian Chadd  */
102*b2bdc62aSAdrian Chadd static u_int	rss_hashalgo = RSS_HASH_TOEPLITZ;
103*b2bdc62aSAdrian Chadd SYSCTL_INT(_net_inet_rss, OID_AUTO, hashalgo, CTLFLAG_RDTUN, &rss_hashalgo, 0,
104*b2bdc62aSAdrian Chadd     "RSS hash algorithm");
105*b2bdc62aSAdrian Chadd 
106*b2bdc62aSAdrian Chadd /*
107*b2bdc62aSAdrian Chadd  * Size of the indirection table; at most 128 entries per the RSS spec.  We
108*b2bdc62aSAdrian Chadd  * size it to at least 2 times the number of CPUs by default to allow useful
109*b2bdc62aSAdrian Chadd  * rebalancing.  If not set explicitly with a loader tunable, we tune based
110*b2bdc62aSAdrian Chadd  * on the number of CPUs present.
111*b2bdc62aSAdrian Chadd  *
112*b2bdc62aSAdrian Chadd  * XXXRW: buckets might be better to use for the tunable than bits.
113*b2bdc62aSAdrian Chadd  */
114*b2bdc62aSAdrian Chadd static u_int	rss_bits;
115*b2bdc62aSAdrian Chadd SYSCTL_INT(_net_inet_rss, OID_AUTO, bits, CTLFLAG_RDTUN, &rss_bits, 0,
116*b2bdc62aSAdrian Chadd     "RSS bits");
117*b2bdc62aSAdrian Chadd 
118*b2bdc62aSAdrian Chadd static u_int	rss_mask;
119*b2bdc62aSAdrian Chadd SYSCTL_INT(_net_inet_rss, OID_AUTO, mask, CTLFLAG_RD, &rss_mask, 0,
120*b2bdc62aSAdrian Chadd     "RSS mask");
121*b2bdc62aSAdrian Chadd 
122*b2bdc62aSAdrian Chadd static const u_int	rss_maxbits = RSS_MAXBITS;
123*b2bdc62aSAdrian Chadd SYSCTL_INT(_net_inet_rss, OID_AUTO, maxbits, CTLFLAG_RD,
124*b2bdc62aSAdrian Chadd     __DECONST(int *, &rss_maxbits), 0, "RSS maximum bits");
125*b2bdc62aSAdrian Chadd 
126*b2bdc62aSAdrian Chadd /*
127*b2bdc62aSAdrian Chadd  * RSS's own count of the number of CPUs it could be using for processing.
128*b2bdc62aSAdrian Chadd  * Bounded to 64 by RSS constants.
129*b2bdc62aSAdrian Chadd  */
130*b2bdc62aSAdrian Chadd static u_int	rss_ncpus;
131*b2bdc62aSAdrian Chadd SYSCTL_INT(_net_inet_rss, OID_AUTO, ncpus, CTLFLAG_RD, &rss_ncpus, 0,
132*b2bdc62aSAdrian Chadd     "Number of CPUs available to RSS");
133*b2bdc62aSAdrian Chadd 
134*b2bdc62aSAdrian Chadd #define	RSS_MAXCPUS	(1 << (RSS_MAXBITS - 1))
135*b2bdc62aSAdrian Chadd static const u_int	rss_maxcpus = RSS_MAXCPUS;
136*b2bdc62aSAdrian Chadd SYSCTL_INT(_net_inet_rss, OID_AUTO, maxcpus, CTLFLAG_RD,
137*b2bdc62aSAdrian Chadd     __DECONST(int *, &rss_maxcpus), 0, "RSS maximum CPUs that can be used");
138*b2bdc62aSAdrian Chadd 
139*b2bdc62aSAdrian Chadd /*
140*b2bdc62aSAdrian Chadd  * Variable exists just for reporting rss_bits in a user-friendly way.
141*b2bdc62aSAdrian Chadd  */
142*b2bdc62aSAdrian Chadd static u_int	rss_buckets;
143*b2bdc62aSAdrian Chadd SYSCTL_INT(_net_inet_rss, OID_AUTO, buckets, CTLFLAG_RD, &rss_buckets, 0,
144*b2bdc62aSAdrian Chadd     "RSS buckets");
145*b2bdc62aSAdrian Chadd 
146*b2bdc62aSAdrian Chadd /*
147*b2bdc62aSAdrian Chadd  * Base CPU number; devices will add this to all CPU numbers returned by the
148*b2bdc62aSAdrian Chadd  * RSS indirection table.  Currently unmodifable in FreeBSD.
149*b2bdc62aSAdrian Chadd  */
150*b2bdc62aSAdrian Chadd static const u_int	rss_basecpu;
151*b2bdc62aSAdrian Chadd SYSCTL_INT(_net_inet_rss, OID_AUTO, basecpu, CTLFLAG_RD,
152*b2bdc62aSAdrian Chadd     __DECONST(int *, &rss_basecpu), 0, "RSS base CPU");
153*b2bdc62aSAdrian Chadd 
154*b2bdc62aSAdrian Chadd /*
155*b2bdc62aSAdrian Chadd  * RSS secret key, intended to prevent attacks on load-balancing.  Its
156*b2bdc62aSAdrian Chadd  * effectiveness may be limited by algorithm choice and available entropy
157*b2bdc62aSAdrian Chadd  * during the boot.
158*b2bdc62aSAdrian Chadd  *
159*b2bdc62aSAdrian Chadd  * XXXRW: And that we don't randomize it yet!
160*b2bdc62aSAdrian Chadd  *
161*b2bdc62aSAdrian Chadd  * This is the default Microsoft RSS specification key which is also
162*b2bdc62aSAdrian Chadd  * the Chelsio T5 firmware default key.
163*b2bdc62aSAdrian Chadd  */
164*b2bdc62aSAdrian Chadd static uint8_t rss_key[RSS_KEYSIZE] = {
165*b2bdc62aSAdrian Chadd 	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
166*b2bdc62aSAdrian Chadd 	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
167*b2bdc62aSAdrian Chadd 	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
168*b2bdc62aSAdrian Chadd 	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
169*b2bdc62aSAdrian Chadd 	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
170*b2bdc62aSAdrian Chadd };
171*b2bdc62aSAdrian Chadd 
172*b2bdc62aSAdrian Chadd /*
173*b2bdc62aSAdrian Chadd  * RSS hash->CPU table, which maps hashed packet headers to particular CPUs.
174*b2bdc62aSAdrian Chadd  * Drivers may supplement this table with a seperate CPU<->queue table when
175*b2bdc62aSAdrian Chadd  * programming devices.
176*b2bdc62aSAdrian Chadd  */
177*b2bdc62aSAdrian Chadd struct rss_table_entry {
178*b2bdc62aSAdrian Chadd 	uint8_t		rte_cpu;	/* CPU affinity of bucket. */
179*b2bdc62aSAdrian Chadd };
180*b2bdc62aSAdrian Chadd static struct rss_table_entry	rss_table[RSS_TABLE_MAXLEN];
181*b2bdc62aSAdrian Chadd 
182*b2bdc62aSAdrian Chadd static void
183*b2bdc62aSAdrian Chadd rss_init(__unused void *arg)
184*b2bdc62aSAdrian Chadd {
185*b2bdc62aSAdrian Chadd 	u_int i;
186*b2bdc62aSAdrian Chadd 	u_int cpuid;
187*b2bdc62aSAdrian Chadd 
188*b2bdc62aSAdrian Chadd 	/*
189*b2bdc62aSAdrian Chadd 	 * Validate tunables, coerce to sensible values.
190*b2bdc62aSAdrian Chadd 	 */
191*b2bdc62aSAdrian Chadd 	switch (rss_hashalgo) {
192*b2bdc62aSAdrian Chadd 	case RSS_HASH_TOEPLITZ:
193*b2bdc62aSAdrian Chadd 	case RSS_HASH_NAIVE:
194*b2bdc62aSAdrian Chadd 		break;
195*b2bdc62aSAdrian Chadd 
196*b2bdc62aSAdrian Chadd 	default:
197*b2bdc62aSAdrian Chadd 		printf("%s: invalid RSS hashalgo %u, coercing to %u",
198*b2bdc62aSAdrian Chadd 		    __func__, rss_hashalgo, RSS_HASH_TOEPLITZ);
199*b2bdc62aSAdrian Chadd 		rss_hashalgo = RSS_HASH_TOEPLITZ;
200*b2bdc62aSAdrian Chadd 	}
201*b2bdc62aSAdrian Chadd 
202*b2bdc62aSAdrian Chadd 	/*
203*b2bdc62aSAdrian Chadd 	 * Count available CPUs.
204*b2bdc62aSAdrian Chadd 	 *
205*b2bdc62aSAdrian Chadd 	 * XXXRW: Note incorrect assumptions regarding contiguity of this set
206*b2bdc62aSAdrian Chadd 	 * elsewhere.
207*b2bdc62aSAdrian Chadd 	 */
208*b2bdc62aSAdrian Chadd 	rss_ncpus = 0;
209*b2bdc62aSAdrian Chadd 	for (i = 0; i <= mp_maxid; i++) {
210*b2bdc62aSAdrian Chadd 		if (CPU_ABSENT(i))
211*b2bdc62aSAdrian Chadd 			continue;
212*b2bdc62aSAdrian Chadd 		rss_ncpus++;
213*b2bdc62aSAdrian Chadd 	}
214*b2bdc62aSAdrian Chadd 	if (rss_ncpus > RSS_MAXCPUS)
215*b2bdc62aSAdrian Chadd 		rss_ncpus = RSS_MAXCPUS;
216*b2bdc62aSAdrian Chadd 
217*b2bdc62aSAdrian Chadd 	/*
218*b2bdc62aSAdrian Chadd 	 * Tune RSS table entries to be no less than 2x the number of CPUs
219*b2bdc62aSAdrian Chadd 	 * -- unless we're running uniprocessor, in which case there's not
220*b2bdc62aSAdrian Chadd 	 * much point in having buckets to rearrange for load-balancing!
221*b2bdc62aSAdrian Chadd 	 */
222*b2bdc62aSAdrian Chadd 	if (rss_ncpus > 1) {
223*b2bdc62aSAdrian Chadd 		if (rss_bits == 0)
224*b2bdc62aSAdrian Chadd 			rss_bits = fls(rss_ncpus - 1) + 1;
225*b2bdc62aSAdrian Chadd 
226*b2bdc62aSAdrian Chadd 		/*
227*b2bdc62aSAdrian Chadd 		 * Microsoft limits RSS table entries to 128, so apply that
228*b2bdc62aSAdrian Chadd 		 * limit to both auto-detected CPU counts and user-configured
229*b2bdc62aSAdrian Chadd 		 * ones.
230*b2bdc62aSAdrian Chadd 		 */
231*b2bdc62aSAdrian Chadd 		if (rss_bits == 0 || rss_bits > RSS_MAXBITS) {
232*b2bdc62aSAdrian Chadd 			printf("%s: RSS bits %u not valid, coercing to  %u",
233*b2bdc62aSAdrian Chadd 			    __func__, rss_bits, RSS_MAXBITS);
234*b2bdc62aSAdrian Chadd 			rss_bits = RSS_MAXBITS;
235*b2bdc62aSAdrian Chadd 		}
236*b2bdc62aSAdrian Chadd 
237*b2bdc62aSAdrian Chadd 		/*
238*b2bdc62aSAdrian Chadd 		 * Figure out how many buckets to use; warn if less than the
239*b2bdc62aSAdrian Chadd 		 * number of configured CPUs, although this is not a fatal
240*b2bdc62aSAdrian Chadd 		 * problem.
241*b2bdc62aSAdrian Chadd 		 */
242*b2bdc62aSAdrian Chadd 		rss_buckets = (1 << rss_bits);
243*b2bdc62aSAdrian Chadd 		if (rss_buckets < rss_ncpus)
244*b2bdc62aSAdrian Chadd 			printf("%s: WARNING: rss_buckets (%u) less than "
245*b2bdc62aSAdrian Chadd 			    "rss_ncpus (%u)\n", __func__, rss_buckets,
246*b2bdc62aSAdrian Chadd 			    rss_ncpus);
247*b2bdc62aSAdrian Chadd 		rss_mask = rss_buckets - 1;
248*b2bdc62aSAdrian Chadd 	} else {
249*b2bdc62aSAdrian Chadd 		rss_bits = 0;
250*b2bdc62aSAdrian Chadd 		rss_buckets = 1;
251*b2bdc62aSAdrian Chadd 		rss_mask = 0;
252*b2bdc62aSAdrian Chadd 	}
253*b2bdc62aSAdrian Chadd 
254*b2bdc62aSAdrian Chadd 	/*
255*b2bdc62aSAdrian Chadd 	 * Set up initial CPU assignments: round-robin by default.
256*b2bdc62aSAdrian Chadd 	 */
257*b2bdc62aSAdrian Chadd 	cpuid = CPU_FIRST();
258*b2bdc62aSAdrian Chadd 	for (i = 0; i < rss_buckets; i++) {
259*b2bdc62aSAdrian Chadd 		rss_table[i].rte_cpu = cpuid;
260*b2bdc62aSAdrian Chadd 		cpuid = CPU_NEXT(cpuid);
261*b2bdc62aSAdrian Chadd 	}
262*b2bdc62aSAdrian Chadd 
263*b2bdc62aSAdrian Chadd 	/*
264*b2bdc62aSAdrian Chadd 	 * Randomize rrs_key.
265*b2bdc62aSAdrian Chadd 	 *
266*b2bdc62aSAdrian Chadd 	 * XXXRW: Not yet.  If nothing else, will require an rss_isbadkey()
267*b2bdc62aSAdrian Chadd 	 * loop to check for "bad" RSS keys.
268*b2bdc62aSAdrian Chadd 	 */
269*b2bdc62aSAdrian Chadd }
270*b2bdc62aSAdrian Chadd SYSINIT(rss_init, SI_SUB_SOFTINTR, SI_ORDER_SECOND, rss_init, NULL);
271*b2bdc62aSAdrian Chadd 
/*
 * Trivial "hash": the 32-bit wrapping sum of every key byte followed by
 * every data byte.
 */
static uint32_t
rss_naive_hash(u_int keylen, const uint8_t *key, u_int datalen,
    const uint8_t *data)
{
	uint32_t sum = 0;

	/* Fold in the key bytes ... */
	for (; keylen != 0; keylen--)
		sum += *key++;
	/* ... then the payload bytes. */
	for (; datalen != 0; datalen--)
		sum += *data++;
	return (sum);
}
286*b2bdc62aSAdrian Chadd 
287*b2bdc62aSAdrian Chadd uint32_t
288*b2bdc62aSAdrian Chadd rss_hash(u_int datalen, const uint8_t *data)
289*b2bdc62aSAdrian Chadd {
290*b2bdc62aSAdrian Chadd 
291*b2bdc62aSAdrian Chadd 	switch (rss_hashalgo) {
292*b2bdc62aSAdrian Chadd 	case RSS_HASH_TOEPLITZ:
293*b2bdc62aSAdrian Chadd 		return (toeplitz_hash(sizeof(rss_key), rss_key, datalen,
294*b2bdc62aSAdrian Chadd 		    data));
295*b2bdc62aSAdrian Chadd 
296*b2bdc62aSAdrian Chadd 	case RSS_HASH_NAIVE:
297*b2bdc62aSAdrian Chadd 		return (rss_naive_hash(sizeof(rss_key), rss_key, datalen,
298*b2bdc62aSAdrian Chadd 		    data));
299*b2bdc62aSAdrian Chadd 
300*b2bdc62aSAdrian Chadd 	default:
301*b2bdc62aSAdrian Chadd 		panic("%s: unsupported/unknown hashalgo %d", __func__,
302*b2bdc62aSAdrian Chadd 		    rss_hashalgo);
303*b2bdc62aSAdrian Chadd 	}
304*b2bdc62aSAdrian Chadd }
305*b2bdc62aSAdrian Chadd 
306*b2bdc62aSAdrian Chadd /*
307*b2bdc62aSAdrian Chadd  * Query the number of RSS bits in use.
308*b2bdc62aSAdrian Chadd  */
309*b2bdc62aSAdrian Chadd u_int
310*b2bdc62aSAdrian Chadd rss_getbits(void)
311*b2bdc62aSAdrian Chadd {
312*b2bdc62aSAdrian Chadd 
313*b2bdc62aSAdrian Chadd 	return (rss_bits);
314*b2bdc62aSAdrian Chadd }
315*b2bdc62aSAdrian Chadd 
316*b2bdc62aSAdrian Chadd /*
317*b2bdc62aSAdrian Chadd  * Query the RSS bucket associated with an RSS hash.
318*b2bdc62aSAdrian Chadd  */
319*b2bdc62aSAdrian Chadd u_int
320*b2bdc62aSAdrian Chadd rss_getbucket(u_int hash)
321*b2bdc62aSAdrian Chadd {
322*b2bdc62aSAdrian Chadd 
323*b2bdc62aSAdrian Chadd 	return (hash & rss_mask);
324*b2bdc62aSAdrian Chadd }
325*b2bdc62aSAdrian Chadd 
326*b2bdc62aSAdrian Chadd /*
327*b2bdc62aSAdrian Chadd  * Query the RSS layer bucket associated with the given
328*b2bdc62aSAdrian Chadd  * entry in the RSS hash space.
329*b2bdc62aSAdrian Chadd  *
330*b2bdc62aSAdrian Chadd  * The RSS indirection table is 0 .. rss_buckets-1,
331*b2bdc62aSAdrian Chadd  * covering the low 'rss_bits' of the total 128 slot
332*b2bdc62aSAdrian Chadd  * RSS indirection table.  So just mask off rss_bits and
333*b2bdc62aSAdrian Chadd  * return that.
334*b2bdc62aSAdrian Chadd  *
335*b2bdc62aSAdrian Chadd  * NIC drivers can then iterate over the 128 slot RSS
336*b2bdc62aSAdrian Chadd  * indirection table and fetch which RSS bucket to
337*b2bdc62aSAdrian Chadd  * map it to.  This will typically be a CPU queue
338*b2bdc62aSAdrian Chadd  */
339*b2bdc62aSAdrian Chadd u_int
340*b2bdc62aSAdrian Chadd rss_get_indirection_to_bucket(u_int index)
341*b2bdc62aSAdrian Chadd {
342*b2bdc62aSAdrian Chadd 
343*b2bdc62aSAdrian Chadd 	return (index & rss_mask);
344*b2bdc62aSAdrian Chadd }
345*b2bdc62aSAdrian Chadd 
346*b2bdc62aSAdrian Chadd /*
347*b2bdc62aSAdrian Chadd  * Query the RSS CPU associated with an RSS bucket.
348*b2bdc62aSAdrian Chadd  */
349*b2bdc62aSAdrian Chadd u_int
350*b2bdc62aSAdrian Chadd rss_getcpu(u_int bucket)
351*b2bdc62aSAdrian Chadd {
352*b2bdc62aSAdrian Chadd 
353*b2bdc62aSAdrian Chadd 	return (rss_table[bucket].rte_cpu);
354*b2bdc62aSAdrian Chadd }
355*b2bdc62aSAdrian Chadd 
356*b2bdc62aSAdrian Chadd /*
357*b2bdc62aSAdrian Chadd  * netisr CPU affinity lookup given just the hash and hashtype.
358*b2bdc62aSAdrian Chadd  */
359*b2bdc62aSAdrian Chadd u_int
360*b2bdc62aSAdrian Chadd rss_hash2cpuid(uint32_t hash_val, uint32_t hash_type)
361*b2bdc62aSAdrian Chadd {
362*b2bdc62aSAdrian Chadd 
363*b2bdc62aSAdrian Chadd 	switch (hash_type) {
364*b2bdc62aSAdrian Chadd 	case M_HASHTYPE_RSS_IPV4:
365*b2bdc62aSAdrian Chadd 	case M_HASHTYPE_RSS_TCP_IPV4:
366*b2bdc62aSAdrian Chadd 	case M_HASHTYPE_RSS_UDP_IPV4:
367*b2bdc62aSAdrian Chadd 	case M_HASHTYPE_RSS_IPV6:
368*b2bdc62aSAdrian Chadd 	case M_HASHTYPE_RSS_TCP_IPV6:
369*b2bdc62aSAdrian Chadd 	case M_HASHTYPE_RSS_UDP_IPV6:
370*b2bdc62aSAdrian Chadd 		return (rss_getcpu(rss_getbucket(hash_val)));
371*b2bdc62aSAdrian Chadd 	default:
372*b2bdc62aSAdrian Chadd 		return (NETISR_CPUID_NONE);
373*b2bdc62aSAdrian Chadd 	}
374*b2bdc62aSAdrian Chadd }
375*b2bdc62aSAdrian Chadd 
376*b2bdc62aSAdrian Chadd /*
377*b2bdc62aSAdrian Chadd  * Query the RSS bucket associated with the given hash value and
378*b2bdc62aSAdrian Chadd  * type.
379*b2bdc62aSAdrian Chadd  */
380*b2bdc62aSAdrian Chadd int
381*b2bdc62aSAdrian Chadd rss_hash2bucket(uint32_t hash_val, uint32_t hash_type, uint32_t *bucket_id)
382*b2bdc62aSAdrian Chadd {
383*b2bdc62aSAdrian Chadd 
384*b2bdc62aSAdrian Chadd 	switch (hash_type) {
385*b2bdc62aSAdrian Chadd 	case M_HASHTYPE_RSS_IPV4:
386*b2bdc62aSAdrian Chadd 	case M_HASHTYPE_RSS_TCP_IPV4:
387*b2bdc62aSAdrian Chadd 	case M_HASHTYPE_RSS_UDP_IPV4:
388*b2bdc62aSAdrian Chadd 	case M_HASHTYPE_RSS_IPV6:
389*b2bdc62aSAdrian Chadd 	case M_HASHTYPE_RSS_TCP_IPV6:
390*b2bdc62aSAdrian Chadd 	case M_HASHTYPE_RSS_UDP_IPV6:
391*b2bdc62aSAdrian Chadd 		*bucket_id = rss_getbucket(hash_val);
392*b2bdc62aSAdrian Chadd 		return (0);
393*b2bdc62aSAdrian Chadd 	default:
394*b2bdc62aSAdrian Chadd 		return (-1);
395*b2bdc62aSAdrian Chadd 	}
396*b2bdc62aSAdrian Chadd }
397*b2bdc62aSAdrian Chadd 
398*b2bdc62aSAdrian Chadd /*
399*b2bdc62aSAdrian Chadd  * netisr CPU affinity lookup routine for use by protocols.
400*b2bdc62aSAdrian Chadd  */
401*b2bdc62aSAdrian Chadd struct mbuf *
402*b2bdc62aSAdrian Chadd rss_m2cpuid(struct mbuf *m, uintptr_t source, u_int *cpuid)
403*b2bdc62aSAdrian Chadd {
404*b2bdc62aSAdrian Chadd 
405*b2bdc62aSAdrian Chadd 	M_ASSERTPKTHDR(m);
406*b2bdc62aSAdrian Chadd 	*cpuid = rss_hash2cpuid(m->m_pkthdr.flowid, M_HASHTYPE_GET(m));
407*b2bdc62aSAdrian Chadd 	return (m);
408*b2bdc62aSAdrian Chadd }
409*b2bdc62aSAdrian Chadd 
410*b2bdc62aSAdrian Chadd int
411*b2bdc62aSAdrian Chadd rss_m2bucket(struct mbuf *m, uint32_t *bucket_id)
412*b2bdc62aSAdrian Chadd {
413*b2bdc62aSAdrian Chadd 
414*b2bdc62aSAdrian Chadd 	M_ASSERTPKTHDR(m);
415*b2bdc62aSAdrian Chadd 
416*b2bdc62aSAdrian Chadd 	return(rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m),
417*b2bdc62aSAdrian Chadd 	    bucket_id));
418*b2bdc62aSAdrian Chadd }
419*b2bdc62aSAdrian Chadd 
420*b2bdc62aSAdrian Chadd /*
421*b2bdc62aSAdrian Chadd  * Query the RSS hash algorithm.
422*b2bdc62aSAdrian Chadd  */
423*b2bdc62aSAdrian Chadd u_int
424*b2bdc62aSAdrian Chadd rss_gethashalgo(void)
425*b2bdc62aSAdrian Chadd {
426*b2bdc62aSAdrian Chadd 
427*b2bdc62aSAdrian Chadd 	return (rss_hashalgo);
428*b2bdc62aSAdrian Chadd }
429*b2bdc62aSAdrian Chadd 
430*b2bdc62aSAdrian Chadd /*
431*b2bdc62aSAdrian Chadd  * Query the current RSS key; likely to be used by device drivers when
432*b2bdc62aSAdrian Chadd  * configuring hardware RSS.  Caller must pass an array of size RSS_KEYSIZE.
433*b2bdc62aSAdrian Chadd  *
434*b2bdc62aSAdrian Chadd  * XXXRW: Perhaps we should do the accept-a-length-and-truncate thing?
435*b2bdc62aSAdrian Chadd  */
436*b2bdc62aSAdrian Chadd void
437*b2bdc62aSAdrian Chadd rss_getkey(uint8_t *key)
438*b2bdc62aSAdrian Chadd {
439*b2bdc62aSAdrian Chadd 
440*b2bdc62aSAdrian Chadd 	bcopy(rss_key, key, sizeof(rss_key));
441*b2bdc62aSAdrian Chadd }
442*b2bdc62aSAdrian Chadd 
443*b2bdc62aSAdrian Chadd /*
444*b2bdc62aSAdrian Chadd  * Query the number of buckets; this may be used by both network device
445*b2bdc62aSAdrian Chadd  * drivers, which will need to populate hardware shadows of the software
446*b2bdc62aSAdrian Chadd  * indirection table, and the network stack itself (such as when deciding how
447*b2bdc62aSAdrian Chadd  * many connection groups to allocate).
448*b2bdc62aSAdrian Chadd  */
449*b2bdc62aSAdrian Chadd u_int
450*b2bdc62aSAdrian Chadd rss_getnumbuckets(void)
451*b2bdc62aSAdrian Chadd {
452*b2bdc62aSAdrian Chadd 
453*b2bdc62aSAdrian Chadd 	return (rss_buckets);
454*b2bdc62aSAdrian Chadd }
455*b2bdc62aSAdrian Chadd 
456*b2bdc62aSAdrian Chadd /*
457*b2bdc62aSAdrian Chadd  * Query the number of CPUs in use by RSS; may be useful to device drivers
458*b2bdc62aSAdrian Chadd  * trying to figure out how to map a larger number of CPUs into a smaller
459*b2bdc62aSAdrian Chadd  * number of receive queues.
460*b2bdc62aSAdrian Chadd  */
461*b2bdc62aSAdrian Chadd u_int
462*b2bdc62aSAdrian Chadd rss_getnumcpus(void)
463*b2bdc62aSAdrian Chadd {
464*b2bdc62aSAdrian Chadd 
465*b2bdc62aSAdrian Chadd 	return (rss_ncpus);
466*b2bdc62aSAdrian Chadd }
467*b2bdc62aSAdrian Chadd 
468*b2bdc62aSAdrian Chadd /*
469*b2bdc62aSAdrian Chadd  * Return the supported RSS hash configuration.
470*b2bdc62aSAdrian Chadd  *
471*b2bdc62aSAdrian Chadd  * NICs should query this to determine what to configure in their redirection
472*b2bdc62aSAdrian Chadd  * matching table.
473*b2bdc62aSAdrian Chadd  */
474*b2bdc62aSAdrian Chadd inline u_int
475*b2bdc62aSAdrian Chadd rss_gethashconfig(void)
476*b2bdc62aSAdrian Chadd {
477*b2bdc62aSAdrian Chadd 
478*b2bdc62aSAdrian Chadd 	/* Return 4-tuple for TCP; 2-tuple for others */
479*b2bdc62aSAdrian Chadd 	/*
480*b2bdc62aSAdrian Chadd 	 * UDP may fragment more often than TCP and thus we'll end up with
481*b2bdc62aSAdrian Chadd 	 * NICs returning 2-tuple fragments.
482*b2bdc62aSAdrian Chadd 	 * udp_init() and udplite_init() both currently initialise things
483*b2bdc62aSAdrian Chadd 	 * as 2-tuple.
484*b2bdc62aSAdrian Chadd 	 * So for now disable UDP 4-tuple hashing until all of the other
485*b2bdc62aSAdrian Chadd 	 * pieces are in place.
486*b2bdc62aSAdrian Chadd 	 */
487*b2bdc62aSAdrian Chadd 	return (
488*b2bdc62aSAdrian Chadd 	    RSS_HASHTYPE_RSS_IPV4
489*b2bdc62aSAdrian Chadd 	|    RSS_HASHTYPE_RSS_TCP_IPV4
490*b2bdc62aSAdrian Chadd 	|    RSS_HASHTYPE_RSS_IPV6
491*b2bdc62aSAdrian Chadd 	|    RSS_HASHTYPE_RSS_TCP_IPV6
492*b2bdc62aSAdrian Chadd 	|    RSS_HASHTYPE_RSS_IPV6_EX
493*b2bdc62aSAdrian Chadd 	|    RSS_HASHTYPE_RSS_TCP_IPV6_EX
494*b2bdc62aSAdrian Chadd #if 0
495*b2bdc62aSAdrian Chadd 	|    RSS_HASHTYPE_RSS_UDP_IPV4
496*b2bdc62aSAdrian Chadd 	|    RSS_HASHTYPE_RSS_UDP_IPV4_EX
497*b2bdc62aSAdrian Chadd 	|    RSS_HASHTYPE_RSS_UDP_IPV6
498*b2bdc62aSAdrian Chadd 	|    RSS_HASHTYPE_RSS_UDP_IPV6_EX
499*b2bdc62aSAdrian Chadd #endif
500*b2bdc62aSAdrian Chadd 	);
501*b2bdc62aSAdrian Chadd }
502*b2bdc62aSAdrian Chadd 
503*b2bdc62aSAdrian Chadd /*
504*b2bdc62aSAdrian Chadd  * XXXRW: Confirm that sysctl -a won't dump this keying material, don't want
505*b2bdc62aSAdrian Chadd  * it appearing in debugging output unnecessarily.
506*b2bdc62aSAdrian Chadd  */
507*b2bdc62aSAdrian Chadd static int
508*b2bdc62aSAdrian Chadd sysctl_rss_key(SYSCTL_HANDLER_ARGS)
509*b2bdc62aSAdrian Chadd {
510*b2bdc62aSAdrian Chadd 	uint8_t temp_rss_key[RSS_KEYSIZE];
511*b2bdc62aSAdrian Chadd 	int error;
512*b2bdc62aSAdrian Chadd 
513*b2bdc62aSAdrian Chadd 	error = priv_check(req->td, PRIV_NETINET_HASHKEY);
514*b2bdc62aSAdrian Chadd 	if (error)
515*b2bdc62aSAdrian Chadd 		return (error);
516*b2bdc62aSAdrian Chadd 
517*b2bdc62aSAdrian Chadd 	bcopy(rss_key, temp_rss_key, sizeof(temp_rss_key));
518*b2bdc62aSAdrian Chadd 	error = sysctl_handle_opaque(oidp, temp_rss_key,
519*b2bdc62aSAdrian Chadd 	    sizeof(temp_rss_key), req);
520*b2bdc62aSAdrian Chadd 	if (error)
521*b2bdc62aSAdrian Chadd 		return (error);
522*b2bdc62aSAdrian Chadd 	if (req->newptr != NULL) {
523*b2bdc62aSAdrian Chadd 		/* XXXRW: Not yet. */
524*b2bdc62aSAdrian Chadd 		return (EINVAL);
525*b2bdc62aSAdrian Chadd 	}
526*b2bdc62aSAdrian Chadd 	return (0);
527*b2bdc62aSAdrian Chadd }
528*b2bdc62aSAdrian Chadd SYSCTL_PROC(_net_inet_rss, OID_AUTO, key,
529*b2bdc62aSAdrian Chadd     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_rss_key,
530*b2bdc62aSAdrian Chadd     "", "RSS keying material");
531*b2bdc62aSAdrian Chadd 
532*b2bdc62aSAdrian Chadd static int
533*b2bdc62aSAdrian Chadd sysctl_rss_bucket_mapping(SYSCTL_HANDLER_ARGS)
534*b2bdc62aSAdrian Chadd {
535*b2bdc62aSAdrian Chadd 	struct sbuf *sb;
536*b2bdc62aSAdrian Chadd 	int error;
537*b2bdc62aSAdrian Chadd 	int i;
538*b2bdc62aSAdrian Chadd 
539*b2bdc62aSAdrian Chadd 	error = 0;
540*b2bdc62aSAdrian Chadd 	error = sysctl_wire_old_buffer(req, 0);
541*b2bdc62aSAdrian Chadd 	if (error != 0)
542*b2bdc62aSAdrian Chadd 		return (error);
543*b2bdc62aSAdrian Chadd 	sb = sbuf_new_for_sysctl(NULL, NULL, 512, req);
544*b2bdc62aSAdrian Chadd 	if (sb == NULL)
545*b2bdc62aSAdrian Chadd 		return (ENOMEM);
546*b2bdc62aSAdrian Chadd 	for (i = 0; i < rss_buckets; i++) {
547*b2bdc62aSAdrian Chadd 		sbuf_printf(sb, "%s%d:%d", i == 0 ? "" : " ",
548*b2bdc62aSAdrian Chadd 		    i,
549*b2bdc62aSAdrian Chadd 		    rss_getcpu(i));
550*b2bdc62aSAdrian Chadd 	}
551*b2bdc62aSAdrian Chadd 	error = sbuf_finish(sb);
552*b2bdc62aSAdrian Chadd 	sbuf_delete(sb);
553*b2bdc62aSAdrian Chadd 
554*b2bdc62aSAdrian Chadd 	return (error);
555*b2bdc62aSAdrian Chadd }
556*b2bdc62aSAdrian Chadd SYSCTL_PROC(_net_inet_rss, OID_AUTO, bucket_mapping,
557*b2bdc62aSAdrian Chadd     CTLTYPE_STRING | CTLFLAG_RD, NULL, 0,
558*b2bdc62aSAdrian Chadd     sysctl_rss_bucket_mapping, "", "RSS bucket -> CPU mapping");
559