xref: /freebsd/sys/netinet/in_rss.c (revision 7527624efaae989d115cd61a6d519ba29ac929c3)
1*7527624eSRobert Watson /*-
2*7527624eSRobert Watson  * Copyright (c) 2010-2011 Juniper Networks, Inc.
3*7527624eSRobert Watson  * All rights reserved.
4*7527624eSRobert Watson  *
5*7527624eSRobert Watson  * This software was developed by Robert N. M. Watson under contract
6*7527624eSRobert Watson  * to Juniper Networks, Inc.
7*7527624eSRobert Watson  *
8*7527624eSRobert Watson  * Redistribution and use in source and binary forms, with or without
9*7527624eSRobert Watson  * modification, are permitted provided that the following conditions
10*7527624eSRobert Watson  * are met:
11*7527624eSRobert Watson  * 1. Redistributions of source code must retain the above copyright
12*7527624eSRobert Watson  *    notice, this list of conditions and the following disclaimer.
13*7527624eSRobert Watson  * 2. Redistributions in binary form must reproduce the above copyright
14*7527624eSRobert Watson  *    notice, this list of conditions and the following disclaimer in the
15*7527624eSRobert Watson  *    documentation and/or other materials provided with the distribution.
16*7527624eSRobert Watson  *
17*7527624eSRobert Watson  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18*7527624eSRobert Watson  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19*7527624eSRobert Watson  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20*7527624eSRobert Watson  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21*7527624eSRobert Watson  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22*7527624eSRobert Watson  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23*7527624eSRobert Watson  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24*7527624eSRobert Watson  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25*7527624eSRobert Watson  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26*7527624eSRobert Watson  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27*7527624eSRobert Watson  * SUCH DAMAGE.
28*7527624eSRobert Watson  */
29*7527624eSRobert Watson 
30*7527624eSRobert Watson #include <sys/cdefs.h>
31*7527624eSRobert Watson 
32*7527624eSRobert Watson __FBSDID("$FreeBSD$");
33*7527624eSRobert Watson 
34*7527624eSRobert Watson #include "opt_inet6.h"
35*7527624eSRobert Watson #include "opt_pcbgroup.h"
36*7527624eSRobert Watson 
37*7527624eSRobert Watson #ifndef PCBGROUP
38*7527624eSRobert Watson #error "options RSS depends on options PCBGROUP"
39*7527624eSRobert Watson #endif
40*7527624eSRobert Watson 
41*7527624eSRobert Watson #include <sys/param.h>
42*7527624eSRobert Watson #include <sys/mbuf.h>
43*7527624eSRobert Watson #include <sys/socket.h>
44*7527624eSRobert Watson #include <sys/priv.h>
45*7527624eSRobert Watson #include <sys/kernel.h>
46*7527624eSRobert Watson #include <sys/smp.h>
47*7527624eSRobert Watson #include <sys/sysctl.h>
48*7527624eSRobert Watson 
49*7527624eSRobert Watson #include <net/if.h>
50*7527624eSRobert Watson #include <net/if_var.h>
51*7527624eSRobert Watson #include <net/netisr.h>
52*7527624eSRobert Watson 
53*7527624eSRobert Watson #include <netinet/in.h>
54*7527624eSRobert Watson #include <netinet/in_pcb.h>
55*7527624eSRobert Watson #include <netinet/in_rss.h>
56*7527624eSRobert Watson #include <netinet/in_var.h>
57*7527624eSRobert Watson #include <netinet/toeplitz.h>
58*7527624eSRobert Watson 
59*7527624eSRobert Watson /*-
60*7527624eSRobert Watson  * Operating system parts of receiver-side scaling (RSS), which allows
61*7527624eSRobert Watson  * network cards to direct flows to particular receive queues based on hashes
62*7527624eSRobert Watson  * of header tuples.  This implementation aligns RSS buckets with connection
63*7527624eSRobert Watson  * groups at the TCP/IP layer, so each bucket is associated with exactly one
64*7527624eSRobert Watson  * group.  As a result, the group lookup structures (and lock) should have an
65*7527624eSRobert Watson  * effective affinity with exactly one CPU.
66*7527624eSRobert Watson  *
67*7527624eSRobert Watson  * Network device drivers needing to configure RSS will query this framework
68*7527624eSRobert Watson  * for parameters, such as the current RSS key, hashing policies, number of
69*7527624eSRobert Watson  * bits, and indirection table mapping hashes to buckets and CPUs.  They may
70*7527624eSRobert Watson  * provide their own supplementary information, such as queue<->CPU bindings.
71*7527624eSRobert Watson  * It is the responsibility of the network device driver to inject packets
72*7527624eSRobert Watson  * into the stack on as close to the right CPU as possible, if playing by RSS
73*7527624eSRobert Watson  * rules.
74*7527624eSRobert Watson  *
75*7527624eSRobert Watson  * TODO:
76*7527624eSRobert Watson  *
77*7527624eSRobert Watson  * - Synchronization for rss_key and other future-configurable parameters.
78*7527624eSRobert Watson  * - Event handler drivers can register to pick up RSS configuration changes.
79*7527624eSRobert Watson  * - Should we allow rss_basecpu to be configured?
80*7527624eSRobert Watson  * - Randomize key on boot.
81*7527624eSRobert Watson  * - IPv6 support.
82*7527624eSRobert Watson  * - Statistics on how often there's a misalignment between hardware
83*7527624eSRobert Watson  *   placement and pcbgroup expectations.
84*7527624eSRobert Watson  */
85*7527624eSRobert Watson 
86*7527624eSRobert Watson SYSCTL_NODE(_net_inet, OID_AUTO, rss, CTLFLAG_RW, 0, "Receive-side steering");
87*7527624eSRobert Watson 
88*7527624eSRobert Watson /*
89*7527624eSRobert Watson  * Toeplitz is the only required hash function in the RSS spec, so use it by
90*7527624eSRobert Watson  * default.
91*7527624eSRobert Watson  */
92*7527624eSRobert Watson static u_int	rss_hashalgo = RSS_HASH_TOEPLITZ;
93*7527624eSRobert Watson SYSCTL_INT(_net_inet_rss, OID_AUTO, hashalgo, CTLFLAG_RD, &rss_hashalgo, 0,
94*7527624eSRobert Watson     "RSS hash algorithm");
95*7527624eSRobert Watson TUNABLE_INT("net.inet.rss.hashalgo", &rss_hashalgo);
96*7527624eSRobert Watson 
97*7527624eSRobert Watson /*
98*7527624eSRobert Watson  * Size of the indirection table; at most 128 entries per the RSS spec.  We
99*7527624eSRobert Watson  * size it to at least 2 times the number of CPUs by default to allow useful
100*7527624eSRobert Watson  * rebalancing.  If not set explicitly with a loader tunable, we tune based
101*7527624eSRobert Watson  * on the number of CPUs present.
102*7527624eSRobert Watson  *
103*7527624eSRobert Watson  * XXXRW: buckets might be better to use for the tunable than bits.
104*7527624eSRobert Watson  */
105*7527624eSRobert Watson static u_int	rss_bits;
106*7527624eSRobert Watson SYSCTL_INT(_net_inet_rss, OID_AUTO, bits, CTLFLAG_RD, &rss_bits, 0,
107*7527624eSRobert Watson     "RSS bits");
108*7527624eSRobert Watson TUNABLE_INT("net.inet.rss.bits", &rss_bits);
109*7527624eSRobert Watson 
110*7527624eSRobert Watson static u_int	rss_mask;
111*7527624eSRobert Watson SYSCTL_INT(_net_inet_rss, OID_AUTO, mask, CTLFLAG_RD, &rss_mask, 0,
112*7527624eSRobert Watson     "RSS mask");
113*7527624eSRobert Watson 
114*7527624eSRobert Watson static const u_int	rss_maxbits = RSS_MAXBITS;
115*7527624eSRobert Watson SYSCTL_INT(_net_inet_rss, OID_AUTO, maxbits, CTLFLAG_RD,
116*7527624eSRobert Watson     __DECONST(int *, &rss_maxbits), 0, "RSS maximum bits");
117*7527624eSRobert Watson 
118*7527624eSRobert Watson /*
119*7527624eSRobert Watson  * RSS's own count of the number of CPUs it could be using for processing.
120*7527624eSRobert Watson  * Bounded to 64 by RSS constants.
121*7527624eSRobert Watson  */
122*7527624eSRobert Watson static u_int	rss_ncpus;
123*7527624eSRobert Watson SYSCTL_INT(_net_inet_rss, OID_AUTO, ncpus, CTLFLAG_RD, &rss_ncpus, 0,
124*7527624eSRobert Watson     "Number of CPUs available to RSS");
125*7527624eSRobert Watson 
126*7527624eSRobert Watson #define	RSS_MAXCPUS	(1 << (RSS_MAXBITS - 1))
127*7527624eSRobert Watson static const u_int	rss_maxcpus = RSS_MAXCPUS;
128*7527624eSRobert Watson SYSCTL_INT(_net_inet_rss, OID_AUTO, maxcpus, CTLFLAG_RD,
129*7527624eSRobert Watson     __DECONST(int *, &rss_maxcpus), 0, "RSS maximum CPUs that can be used");
130*7527624eSRobert Watson 
131*7527624eSRobert Watson /*
132*7527624eSRobert Watson  * Variable exists just for reporting rss_bits in a user-friendly way.
133*7527624eSRobert Watson  */
134*7527624eSRobert Watson static u_int	rss_buckets;
135*7527624eSRobert Watson SYSCTL_INT(_net_inet_rss, OID_AUTO, buckets, CTLFLAG_RD, &rss_buckets, 0,
136*7527624eSRobert Watson     "RSS buckets");
137*7527624eSRobert Watson 
138*7527624eSRobert Watson /*
139*7527624eSRobert Watson  * Base CPU number; devices will add this to all CPU numbers returned by the
140*7527624eSRobert Watson  * RSS indirection table.  Currently unmodifable in FreeBSD.
141*7527624eSRobert Watson  */
142*7527624eSRobert Watson static const u_int	rss_basecpu;
143*7527624eSRobert Watson SYSCTL_INT(_net_inet_rss, OID_AUTO, basecpu, CTLFLAG_RD,
144*7527624eSRobert Watson     __DECONST(int *, &rss_basecpu), 0, "RSS base CPU");
145*7527624eSRobert Watson 
146*7527624eSRobert Watson /*
147*7527624eSRobert Watson  * RSS secret key, intended to prevent attacks on load-balancing.  Its
148*7527624eSRobert Watson  * effectiveness may be limited by algorithm choice and available entropy
149*7527624eSRobert Watson  * during the boot.
150*7527624eSRobert Watson  *
151*7527624eSRobert Watson  * XXXRW: And that we don't randomize it yet!
152*7527624eSRobert Watson  *
153*7527624eSRobert Watson  * XXXRW: This default is actually the default key from Chelsio T3 cards, as
154*7527624eSRobert Watson  * it offers reasonable distribution, unlike all-0 keys which always
155*7527624eSRobert Watson  * generate a hash of 0 (upsettingly).
156*7527624eSRobert Watson  */
157*7527624eSRobert Watson static uint8_t	rss_key[RSS_KEYSIZE] = {
158*7527624eSRobert Watson 	0x43, 0xa3, 0x8f, 0xb0, 0x41, 0x67, 0x25, 0x3d,
159*7527624eSRobert Watson 	0x25, 0x5b, 0x0e, 0xc2, 0x6d, 0x5a, 0x56, 0xda,
160*7527624eSRobert Watson 	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
161*7527624eSRobert Watson 	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
162*7527624eSRobert Watson 	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
163*7527624eSRobert Watson };
164*7527624eSRobert Watson 
165*7527624eSRobert Watson /*
166*7527624eSRobert Watson  * RSS hash->CPU table, which maps hashed packet headers to particular CPUs.
167*7527624eSRobert Watson  * Drivers may supplement this table with a seperate CPU<->queue table when
168*7527624eSRobert Watson  * programming devices.
169*7527624eSRobert Watson  */
170*7527624eSRobert Watson struct rss_table_entry {
171*7527624eSRobert Watson 	uint8_t		rte_cpu;	/* CPU affinity of bucket. */
172*7527624eSRobert Watson };
173*7527624eSRobert Watson static struct rss_table_entry	rss_table[RSS_TABLE_MAXLEN];
174*7527624eSRobert Watson 
175*7527624eSRobert Watson static void
176*7527624eSRobert Watson rss_init(__unused void *arg)
177*7527624eSRobert Watson {
178*7527624eSRobert Watson 	u_int i;
179*7527624eSRobert Watson 
180*7527624eSRobert Watson 	/*
181*7527624eSRobert Watson 	 * Validate tunables, coerce to sensible values.
182*7527624eSRobert Watson 	 */
183*7527624eSRobert Watson 	switch (rss_hashalgo) {
184*7527624eSRobert Watson 	case RSS_HASH_TOEPLITZ:
185*7527624eSRobert Watson 	case RSS_HASH_NAIVE:
186*7527624eSRobert Watson 		break;
187*7527624eSRobert Watson 
188*7527624eSRobert Watson 	default:
189*7527624eSRobert Watson 		printf("%s: invalid RSS hashalgo %u, coercing to %u",
190*7527624eSRobert Watson 		    __func__, rss_hashalgo, RSS_HASH_TOEPLITZ);
191*7527624eSRobert Watson 		rss_hashalgo = RSS_HASH_TOEPLITZ;
192*7527624eSRobert Watson 	}
193*7527624eSRobert Watson 
194*7527624eSRobert Watson 	/*
195*7527624eSRobert Watson 	 * Count available CPUs.
196*7527624eSRobert Watson 	 *
197*7527624eSRobert Watson 	 * XXXRW: Note incorrect assumptions regarding contiguity of this set
198*7527624eSRobert Watson 	 * elsewhere.
199*7527624eSRobert Watson 	 */
200*7527624eSRobert Watson 	rss_ncpus = 0;
201*7527624eSRobert Watson 	for (i = 0; i <= mp_maxid; i++) {
202*7527624eSRobert Watson 		if (CPU_ABSENT(i))
203*7527624eSRobert Watson 			continue;
204*7527624eSRobert Watson 		rss_ncpus++;
205*7527624eSRobert Watson 	}
206*7527624eSRobert Watson 	if (rss_ncpus > RSS_MAXCPUS)
207*7527624eSRobert Watson 		rss_ncpus = RSS_MAXCPUS;
208*7527624eSRobert Watson 
209*7527624eSRobert Watson 	/*
210*7527624eSRobert Watson 	 * Tune RSS table entries to be no less than 2x the number of CPUs
211*7527624eSRobert Watson 	 * -- unless we're running uniprocessor, in which case there's not
212*7527624eSRobert Watson 	 * much point in having buckets to rearrange for load-balancing!
213*7527624eSRobert Watson 	 */
214*7527624eSRobert Watson 	if (rss_ncpus > 1) {
215*7527624eSRobert Watson 		if (rss_bits == 0)
216*7527624eSRobert Watson 			rss_bits = fls(rss_ncpus - 1) + 1;
217*7527624eSRobert Watson 
218*7527624eSRobert Watson 		/*
219*7527624eSRobert Watson 		 * Microsoft limits RSS table entries to 128, so apply that
220*7527624eSRobert Watson 		 * limit to both auto-detected CPU counts and user-configured
221*7527624eSRobert Watson 		 * ones.
222*7527624eSRobert Watson 		 */
223*7527624eSRobert Watson 		if (rss_bits == 0 || rss_bits > RSS_MAXBITS) {
224*7527624eSRobert Watson 			printf("%s: RSS bits %u not valid, coercing to  %u",
225*7527624eSRobert Watson 			    __func__, rss_bits, RSS_MAXBITS);
226*7527624eSRobert Watson 			rss_bits = RSS_MAXBITS;
227*7527624eSRobert Watson 		}
228*7527624eSRobert Watson 
229*7527624eSRobert Watson 		/*
230*7527624eSRobert Watson 		 * Figure out how many buckets to use; warn if less than the
231*7527624eSRobert Watson 		 * number of configured CPUs, although this is not a fatal
232*7527624eSRobert Watson 		 * problem.
233*7527624eSRobert Watson 		 */
234*7527624eSRobert Watson 		rss_buckets = (1 << rss_bits);
235*7527624eSRobert Watson 		if (rss_buckets < rss_ncpus)
236*7527624eSRobert Watson 			printf("%s: WARNING: rss_buckets (%u) less than "
237*7527624eSRobert Watson 			    "rss_ncpus (%u)\n", __func__, rss_buckets,
238*7527624eSRobert Watson 			    rss_ncpus);
239*7527624eSRobert Watson 		rss_mask = rss_buckets - 1;
240*7527624eSRobert Watson 	} else {
241*7527624eSRobert Watson 		rss_bits = 0;
242*7527624eSRobert Watson 		rss_buckets = 1;
243*7527624eSRobert Watson 		rss_mask = 0;
244*7527624eSRobert Watson 	}
245*7527624eSRobert Watson 
246*7527624eSRobert Watson 	/*
247*7527624eSRobert Watson 	 * Set up initial CPU assignments: round-robin by default.
248*7527624eSRobert Watson 	 *
249*7527624eSRobert Watson 	 * XXXRW: Need a mapping to non-contiguous IDs here.
250*7527624eSRobert Watson 	 */
251*7527624eSRobert Watson 	for (i = 0; i < rss_buckets; i++)
252*7527624eSRobert Watson 		rss_table[i].rte_cpu = i % rss_ncpus;
253*7527624eSRobert Watson 
254*7527624eSRobert Watson 	/*
255*7527624eSRobert Watson 	 * Randomize rrs_key.
256*7527624eSRobert Watson 	 *
257*7527624eSRobert Watson 	 * XXXRW: Not yet.  If nothing else, will require an rss_isbadkey()
258*7527624eSRobert Watson 	 * loop to check for "bad" RSS keys.
259*7527624eSRobert Watson 	 */
260*7527624eSRobert Watson }
261*7527624eSRobert Watson SYSINIT(rss_init, SI_SUB_SOFTINTR, SI_ORDER_SECOND, rss_init, NULL);
262*7527624eSRobert Watson 
/*
 * Trivial "hash": the 32-bit sum of every key byte followed by every data
 * byte.  Either length may be zero, in which case the corresponding buffer
 * is not read.
 */
static uint32_t
rss_naive_hash(u_int keylen, const uint8_t *key, u_int datalen,
    const uint8_t *data)
{
	uint32_t sum;

	sum = 0;
	for (; keylen > 0; keylen--)
		sum += *key++;
	for (; datalen > 0; datalen--)
		sum += *data++;
	return (sum);
}
277*7527624eSRobert Watson 
278*7527624eSRobert Watson static uint32_t
279*7527624eSRobert Watson rss_hash(u_int datalen, const uint8_t *data)
280*7527624eSRobert Watson {
281*7527624eSRobert Watson 
282*7527624eSRobert Watson 	switch (rss_hashalgo) {
283*7527624eSRobert Watson 	case RSS_HASH_TOEPLITZ:
284*7527624eSRobert Watson 		return (toeplitz_hash(sizeof(rss_key), rss_key, datalen,
285*7527624eSRobert Watson 		    data));
286*7527624eSRobert Watson 
287*7527624eSRobert Watson 	case RSS_HASH_NAIVE:
288*7527624eSRobert Watson 		return (rss_naive_hash(sizeof(rss_key), rss_key, datalen,
289*7527624eSRobert Watson 		    data));
290*7527624eSRobert Watson 
291*7527624eSRobert Watson 	default:
292*7527624eSRobert Watson 		panic("%s: unsupported/unknown hashalgo %d", __func__,
293*7527624eSRobert Watson 		    rss_hashalgo);
294*7527624eSRobert Watson 	}
295*7527624eSRobert Watson }
296*7527624eSRobert Watson 
297*7527624eSRobert Watson /*
298*7527624eSRobert Watson  * Hash an IPv4 2-tuple.
299*7527624eSRobert Watson  */
300*7527624eSRobert Watson uint32_t
301*7527624eSRobert Watson rss_hash_ip4_2tuple(struct in_addr src, struct in_addr dst)
302*7527624eSRobert Watson {
303*7527624eSRobert Watson 	uint8_t data[sizeof(src) + sizeof(dst)];
304*7527624eSRobert Watson 	u_int datalen;
305*7527624eSRobert Watson 
306*7527624eSRobert Watson 	datalen = 0;
307*7527624eSRobert Watson 	bcopy(&src, &data[datalen], sizeof(src));
308*7527624eSRobert Watson 	datalen += sizeof(src);
309*7527624eSRobert Watson 	bcopy(&dst, &data[datalen], sizeof(dst));
310*7527624eSRobert Watson 	datalen += sizeof(dst);
311*7527624eSRobert Watson 	return (rss_hash(datalen, data));
312*7527624eSRobert Watson }
313*7527624eSRobert Watson 
314*7527624eSRobert Watson /*
315*7527624eSRobert Watson  * Hash an IPv4 4-tuple.
316*7527624eSRobert Watson  */
317*7527624eSRobert Watson uint32_t
318*7527624eSRobert Watson rss_hash_ip4_4tuple(struct in_addr src, u_short srcport, struct in_addr dst,
319*7527624eSRobert Watson     u_short dstport)
320*7527624eSRobert Watson {
321*7527624eSRobert Watson 	uint8_t data[sizeof(src) + sizeof(dst) + sizeof(srcport) +
322*7527624eSRobert Watson 	    sizeof(dstport)];
323*7527624eSRobert Watson 	u_int datalen;
324*7527624eSRobert Watson 
325*7527624eSRobert Watson 	datalen = 0;
326*7527624eSRobert Watson 	bcopy(&src, &data[datalen], sizeof(src));
327*7527624eSRobert Watson 	datalen += sizeof(src);
328*7527624eSRobert Watson 	bcopy(&dst, &data[datalen], sizeof(dst));
329*7527624eSRobert Watson 	datalen += sizeof(dst);
330*7527624eSRobert Watson 	bcopy(&srcport, &data[datalen], sizeof(srcport));
331*7527624eSRobert Watson 	datalen += sizeof(srcport);
332*7527624eSRobert Watson 	bcopy(&dstport, &data[datalen], sizeof(dstport));
333*7527624eSRobert Watson 	datalen += sizeof(dstport);
334*7527624eSRobert Watson 	return (rss_hash(datalen, data));
335*7527624eSRobert Watson }
336*7527624eSRobert Watson 
337*7527624eSRobert Watson #ifdef INET6
338*7527624eSRobert Watson /*
339*7527624eSRobert Watson  * Hash an IPv6 2-tuple.
340*7527624eSRobert Watson  */
341*7527624eSRobert Watson uint32_t
342*7527624eSRobert Watson rss_hash_ip6_2tuple(struct in6_addr src, struct in6_addr dst)
343*7527624eSRobert Watson {
344*7527624eSRobert Watson 	uint8_t data[sizeof(src) + sizeof(dst)];
345*7527624eSRobert Watson 	u_int datalen;
346*7527624eSRobert Watson 
347*7527624eSRobert Watson 	datalen = 0;
348*7527624eSRobert Watson 	bcopy(&src, &data[datalen], sizeof(src));
349*7527624eSRobert Watson 	datalen += sizeof(src);
350*7527624eSRobert Watson 	bcopy(&dst, &data[datalen], sizeof(dst));
351*7527624eSRobert Watson 	datalen += sizeof(dst);
352*7527624eSRobert Watson 	return (rss_hash(datalen, data));
353*7527624eSRobert Watson }
354*7527624eSRobert Watson 
355*7527624eSRobert Watson /*
356*7527624eSRobert Watson  * Hash an IPv6 4-tuple.
357*7527624eSRobert Watson  */
358*7527624eSRobert Watson uint32_t
359*7527624eSRobert Watson rss_hash_ip6_4tuple(struct in6_addr src, u_short srcport,
360*7527624eSRobert Watson     struct in6_addr dst, u_short dstport)
361*7527624eSRobert Watson {
362*7527624eSRobert Watson 	uint8_t data[sizeof(src) + sizeof(dst) + sizeof(srcport) +
363*7527624eSRobert Watson 	    sizeof(dstport)];
364*7527624eSRobert Watson 	u_int datalen;
365*7527624eSRobert Watson 
366*7527624eSRobert Watson 	datalen = 0;
367*7527624eSRobert Watson 	bcopy(&src, &data[datalen], sizeof(src));
368*7527624eSRobert Watson 	datalen += sizeof(src);
369*7527624eSRobert Watson 	bcopy(&dst, &data[datalen], sizeof(dst));
370*7527624eSRobert Watson 	datalen += sizeof(dst);
371*7527624eSRobert Watson 	bcopy(&srcport, &data[datalen], sizeof(srcport));
372*7527624eSRobert Watson 	datalen += sizeof(srcport);
373*7527624eSRobert Watson 	bcopy(&dstport, &data[datalen], sizeof(dstport));
374*7527624eSRobert Watson 	datalen += sizeof(dstport);
375*7527624eSRobert Watson 	return (rss_hash(datalen, data));
376*7527624eSRobert Watson }
377*7527624eSRobert Watson #endif /* INET6 */
378*7527624eSRobert Watson 
379*7527624eSRobert Watson /*
380*7527624eSRobert Watson  * Query the number of RSS bits in use.
381*7527624eSRobert Watson  */
382*7527624eSRobert Watson u_int
383*7527624eSRobert Watson rss_getbits(void)
384*7527624eSRobert Watson {
385*7527624eSRobert Watson 
386*7527624eSRobert Watson 	return (rss_bits);
387*7527624eSRobert Watson }
388*7527624eSRobert Watson 
389*7527624eSRobert Watson /*
390*7527624eSRobert Watson  * Query the RSS bucket associated with an RSS hash.
391*7527624eSRobert Watson  */
392*7527624eSRobert Watson u_int
393*7527624eSRobert Watson rss_getbucket(u_int hash)
394*7527624eSRobert Watson {
395*7527624eSRobert Watson 
396*7527624eSRobert Watson 	return (hash & rss_mask);
397*7527624eSRobert Watson }
398*7527624eSRobert Watson 
399*7527624eSRobert Watson /*
400*7527624eSRobert Watson  * Query the RSS CPU associated with an RSS bucket.
401*7527624eSRobert Watson  */
402*7527624eSRobert Watson u_int
403*7527624eSRobert Watson rss_getcpu(u_int bucket)
404*7527624eSRobert Watson {
405*7527624eSRobert Watson 
406*7527624eSRobert Watson 	return (rss_table[bucket].rte_cpu);
407*7527624eSRobert Watson }
408*7527624eSRobert Watson 
409*7527624eSRobert Watson /*
410*7527624eSRobert Watson  * netisr CPU affinity lookup routine for use by protocols.
411*7527624eSRobert Watson  */
412*7527624eSRobert Watson struct mbuf *
413*7527624eSRobert Watson rss_m2cpuid(struct mbuf *m, uintptr_t source, u_int *cpuid)
414*7527624eSRobert Watson {
415*7527624eSRobert Watson 
416*7527624eSRobert Watson 	M_ASSERTPKTHDR(m);
417*7527624eSRobert Watson 
418*7527624eSRobert Watson 	switch (M_HASHTYPE_GET(m)) {
419*7527624eSRobert Watson 	case M_HASHTYPE_RSS_IPV4:
420*7527624eSRobert Watson 	case M_HASHTYPE_RSS_TCP_IPV4:
421*7527624eSRobert Watson 		*cpuid = rss_getcpu(rss_getbucket(m->m_pkthdr.flowid));
422*7527624eSRobert Watson 		return (m);
423*7527624eSRobert Watson 
424*7527624eSRobert Watson 	default:
425*7527624eSRobert Watson 		*cpuid = NETISR_CPUID_NONE;
426*7527624eSRobert Watson 		return (m);
427*7527624eSRobert Watson 	}
428*7527624eSRobert Watson }
429*7527624eSRobert Watson 
430*7527624eSRobert Watson /*
431*7527624eSRobert Watson  * Query the RSS hash algorithm.
432*7527624eSRobert Watson  */
433*7527624eSRobert Watson u_int
434*7527624eSRobert Watson rss_gethashalgo(void)
435*7527624eSRobert Watson {
436*7527624eSRobert Watson 
437*7527624eSRobert Watson 	return (rss_hashalgo);
438*7527624eSRobert Watson }
439*7527624eSRobert Watson 
440*7527624eSRobert Watson /*
441*7527624eSRobert Watson  * Query the current RSS key; likely to be used by device drivers when
442*7527624eSRobert Watson  * configuring hardware RSS.  Caller must pass an array of size RSS_KEYSIZE.
443*7527624eSRobert Watson  *
444*7527624eSRobert Watson  * XXXRW: Perhaps we should do the accept-a-length-and-truncate thing?
445*7527624eSRobert Watson  */
446*7527624eSRobert Watson void
447*7527624eSRobert Watson rss_getkey(uint8_t *key)
448*7527624eSRobert Watson {
449*7527624eSRobert Watson 
450*7527624eSRobert Watson 	bcopy(rss_key, key, sizeof(rss_key));
451*7527624eSRobert Watson }
452*7527624eSRobert Watson 
453*7527624eSRobert Watson /*
454*7527624eSRobert Watson  * Query the number of buckets; this may be used by both network device
455*7527624eSRobert Watson  * drivers, which will need to populate hardware shadows of the software
456*7527624eSRobert Watson  * indirection table, and the network stack itself (such as when deciding how
457*7527624eSRobert Watson  * many connection groups to allocate).
458*7527624eSRobert Watson  */
459*7527624eSRobert Watson u_int
460*7527624eSRobert Watson rss_getnumbuckets(void)
461*7527624eSRobert Watson {
462*7527624eSRobert Watson 
463*7527624eSRobert Watson 	return (rss_buckets);
464*7527624eSRobert Watson }
465*7527624eSRobert Watson 
466*7527624eSRobert Watson /*
467*7527624eSRobert Watson  * Query the number of CPUs in use by RSS; may be useful to device drivers
468*7527624eSRobert Watson  * trying to figure out how to map a larger number of CPUs into a smaller
469*7527624eSRobert Watson  * number of receive queues.
470*7527624eSRobert Watson  */
471*7527624eSRobert Watson u_int
472*7527624eSRobert Watson rss_getnumcpus(void)
473*7527624eSRobert Watson {
474*7527624eSRobert Watson 
475*7527624eSRobert Watson 	return (rss_ncpus);
476*7527624eSRobert Watson }
477*7527624eSRobert Watson 
478*7527624eSRobert Watson /*
479*7527624eSRobert Watson  * XXXRW: Confirm that sysctl -a won't dump this keying material, don't want
480*7527624eSRobert Watson  * it appearing in debugging output unnecessarily.
481*7527624eSRobert Watson  */
482*7527624eSRobert Watson static int
483*7527624eSRobert Watson sysctl_rss_key(SYSCTL_HANDLER_ARGS)
484*7527624eSRobert Watson {
485*7527624eSRobert Watson 	uint8_t temp_rss_key[RSS_KEYSIZE];
486*7527624eSRobert Watson 	int error;
487*7527624eSRobert Watson 
488*7527624eSRobert Watson 	error = priv_check(req->td, PRIV_NETINET_HASHKEY);
489*7527624eSRobert Watson 	if (error)
490*7527624eSRobert Watson 		return (error);
491*7527624eSRobert Watson 
492*7527624eSRobert Watson 	bcopy(rss_key, temp_rss_key, sizeof(temp_rss_key));
493*7527624eSRobert Watson 	error = sysctl_handle_opaque(oidp, temp_rss_key,
494*7527624eSRobert Watson 	    sizeof(temp_rss_key), req);
495*7527624eSRobert Watson 	if (error)
496*7527624eSRobert Watson 		return (error);
497*7527624eSRobert Watson 	if (req->newptr != NULL) {
498*7527624eSRobert Watson 		/* XXXRW: Not yet. */
499*7527624eSRobert Watson 		return (EINVAL);
500*7527624eSRobert Watson 	}
501*7527624eSRobert Watson 	return (0);
502*7527624eSRobert Watson }
503*7527624eSRobert Watson SYSCTL_PROC(_net_inet_rss, OID_AUTO, key,
504*7527624eSRobert Watson     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_rss_key,
505*7527624eSRobert Watson     "", "RSS keying material");
506