xref: /freebsd/sys/netinet/in_rss.c (revision 0e97acdf58fe27b09c4824a474b0344daf997c5f)
1 /*-
2  * Copyright (c) 2010-2011 Juniper Networks, Inc.
3  * All rights reserved.
4  *
5  * This software was developed by Robert N. M. Watson under contract
6  * to Juniper Networks, Inc.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include <sys/cdefs.h>
31 
32 __FBSDID("$FreeBSD$");
33 
34 #include "opt_inet6.h"
35 #include "opt_pcbgroup.h"
36 
37 #ifndef PCBGROUP
38 #error "options RSS depends on options PCBGROUP"
39 #endif
40 
41 #include <sys/param.h>
42 #include <sys/mbuf.h>
43 #include <sys/socket.h>
44 #include <sys/priv.h>
45 #include <sys/kernel.h>
46 #include <sys/smp.h>
47 #include <sys/sysctl.h>
48 #include <sys/sbuf.h>
49 
50 #include <net/if.h>
51 #include <net/if_var.h>
52 #include <net/netisr.h>
53 
54 #include <netinet/in.h>
55 #include <netinet/in_pcb.h>
56 #include <netinet/in_rss.h>
57 #include <netinet/in_var.h>
58 #include <netinet/toeplitz.h>
59 
60 /* for software rss hash support */
61 #include <netinet/ip.h>
62 #include <netinet/tcp.h>
63 #include <netinet/udp.h>
64 
65 /*-
66  * Operating system parts of receiver-side scaling (RSS), which allows
67  * network cards to direct flows to particular receive queues based on hashes
68  * of header tuples.  This implementation aligns RSS buckets with connection
69  * groups at the TCP/IP layer, so each bucket is associated with exactly one
70  * group.  As a result, the group lookup structures (and lock) should have an
71  * effective affinity with exactly one CPU.
72  *
73  * Network device drivers needing to configure RSS will query this framework
74  * for parameters, such as the current RSS key, hashing policies, number of
75  * bits, and indirection table mapping hashes to buckets and CPUs.  They may
76  * provide their own supplementary information, such as queue<->CPU bindings.
77  * It is the responsibility of the network device driver to inject packets
78  * into the stack on as close to the right CPU as possible, if playing by RSS
79  * rules.
80  *
81  * TODO:
82  *
83  * - Synchronization for rss_key and other future-configurable parameters.
84  * - Event handler drivers can register to pick up RSS configuration changes.
85  * - Should we allow rss_basecpu to be configured?
86  * - Randomize key on boot.
87  * - IPv6 support.
88  * - Statistics on how often there's a misalignment between hardware
89  *   placement and pcbgroup expectations.
90  */
91 
/*
 * Sysctl node under _net_inet that all of the RSS knobs below attach to.
 */
SYSCTL_NODE(_net_inet, OID_AUTO, rss, CTLFLAG_RW, 0, "Receive-side steering");

/*
 * Toeplitz is the only required hash function in the RSS spec, so use it by
 * default.  rss_init() coerces any unrecognised value back to Toeplitz.
 */
static u_int	rss_hashalgo = RSS_HASH_TOEPLITZ;
SYSCTL_INT(_net_inet_rss, OID_AUTO, hashalgo, CTLFLAG_RDTUN, &rss_hashalgo, 0,
    "RSS hash algorithm");

/*
 * Size of the indirection table; at most 128 entries per the RSS spec.  We
 * size it to at least 2 times the number of CPUs by default to allow useful
 * rebalancing.  If not set explicitly with a loader tunable, we tune based
 * on the number of CPUs present.
 *
 * XXXRW: buckets might be better to use for the tunable than bits.
 */
static u_int	rss_bits;
SYSCTL_INT(_net_inet_rss, OID_AUTO, bits, CTLFLAG_RDTUN, &rss_bits, 0,
    "RSS bits");

/*
 * Mask applied to a hash to obtain its bucket; rss_init() sets it to
 * rss_buckets - 1.
 */
static u_int	rss_mask;
SYSCTL_INT(_net_inet_rss, OID_AUTO, mask, CTLFLAG_RD, &rss_mask, 0,
    "RSS mask");

/*
 * Largest rss_bits value rss_init() will accept, exported read-only.
 */
static const u_int	rss_maxbits = RSS_MAXBITS;
SYSCTL_INT(_net_inet_rss, OID_AUTO, maxbits, CTLFLAG_RD,
    __DECONST(int *, &rss_maxbits), 0, "RSS maximum bits");
121 
/*
 * RSS's own count of the number of CPUs it could be using for processing.
 * Bounded to 64 by RSS constants.  Computed in rss_init() and clamped to
 * RSS_MAXCPUS.
 */
static u_int	rss_ncpus;
SYSCTL_INT(_net_inet_rss, OID_AUTO, ncpus, CTLFLAG_RD, &rss_ncpus, 0,
    "Number of CPUs available to RSS");

/*
 * Upper bound on rss_ncpus: half of the largest possible bucket count, so
 * that every CPU can still be given two buckets.
 */
#define	RSS_MAXCPUS	(1 << (RSS_MAXBITS - 1))
static const u_int	rss_maxcpus = RSS_MAXCPUS;
SYSCTL_INT(_net_inet_rss, OID_AUTO, maxcpus, CTLFLAG_RD,
    __DECONST(int *, &rss_maxcpus), 0, "RSS maximum CPUs that can be used");

/*
 * Variable exists just for reporting rss_bits in a user-friendly way.
 */
static u_int	rss_buckets;
SYSCTL_INT(_net_inet_rss, OID_AUTO, buckets, CTLFLAG_RD, &rss_buckets, 0,
    "RSS buckets");

/*
 * Base CPU number; devices will add this to all CPU numbers returned by the
 * RSS indirection table.  Currently unmodifiable in FreeBSD; as a static
 * object without an initializer it is always zero.
 */
static const u_int	rss_basecpu;
SYSCTL_INT(_net_inet_rss, OID_AUTO, basecpu, CTLFLAG_RD,
    __DECONST(int *, &rss_basecpu), 0, "RSS base CPU");
149 
/*
 * RSS secret key, intended to prevent attacks on load-balancing.  Its
 * effectiveness may be limited by algorithm choice and available entropy
 * during the boot.
 *
 * XXXRW: And that we don't randomize it yet!
 *
 * This is the default Microsoft RSS specification key which is also
 * the Chelsio T5 firmware default key.
 *
 * The key is fed to the selected hash function by rss_hash() and is copied
 * out to callers by rss_getkey() and by the sysctl_rss_key() handler.
 */
static uint8_t rss_key[RSS_KEYSIZE] = {
	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
};
167 
/*
 * RSS hash->CPU table, which maps hashed packet headers to particular CPUs.
 * Drivers may supplement this table with a separate CPU<->queue table when
 * programming devices.
 *
 * Only the first rss_buckets entries are populated by rss_init().
 */
struct rss_table_entry {
	uint8_t		rte_cpu;	/* CPU affinity of bucket. */
};
static struct rss_table_entry	rss_table[RSS_TABLE_MAXLEN];

/*
 * Defined towards the end of this file; declared here because the software
 * hashing routines call it before its definition.
 */
static inline u_int rss_gethashconfig_local(void);
179 
180 static void
181 rss_init(__unused void *arg)
182 {
183 	u_int i;
184 	u_int cpuid;
185 
186 	/*
187 	 * Validate tunables, coerce to sensible values.
188 	 */
189 	switch (rss_hashalgo) {
190 	case RSS_HASH_TOEPLITZ:
191 	case RSS_HASH_NAIVE:
192 		break;
193 
194 	default:
195 		printf("%s: invalid RSS hashalgo %u, coercing to %u",
196 		    __func__, rss_hashalgo, RSS_HASH_TOEPLITZ);
197 		rss_hashalgo = RSS_HASH_TOEPLITZ;
198 	}
199 
200 	/*
201 	 * Count available CPUs.
202 	 *
203 	 * XXXRW: Note incorrect assumptions regarding contiguity of this set
204 	 * elsewhere.
205 	 */
206 	rss_ncpus = 0;
207 	for (i = 0; i <= mp_maxid; i++) {
208 		if (CPU_ABSENT(i))
209 			continue;
210 		rss_ncpus++;
211 	}
212 	if (rss_ncpus > RSS_MAXCPUS)
213 		rss_ncpus = RSS_MAXCPUS;
214 
215 	/*
216 	 * Tune RSS table entries to be no less than 2x the number of CPUs
217 	 * -- unless we're running uniprocessor, in which case there's not
218 	 * much point in having buckets to rearrange for load-balancing!
219 	 */
220 	if (rss_ncpus > 1) {
221 		if (rss_bits == 0)
222 			rss_bits = fls(rss_ncpus - 1) + 1;
223 
224 		/*
225 		 * Microsoft limits RSS table entries to 128, so apply that
226 		 * limit to both auto-detected CPU counts and user-configured
227 		 * ones.
228 		 */
229 		if (rss_bits == 0 || rss_bits > RSS_MAXBITS) {
230 			printf("%s: RSS bits %u not valid, coercing to  %u",
231 			    __func__, rss_bits, RSS_MAXBITS);
232 			rss_bits = RSS_MAXBITS;
233 		}
234 
235 		/*
236 		 * Figure out how many buckets to use; warn if less than the
237 		 * number of configured CPUs, although this is not a fatal
238 		 * problem.
239 		 */
240 		rss_buckets = (1 << rss_bits);
241 		if (rss_buckets < rss_ncpus)
242 			printf("%s: WARNING: rss_buckets (%u) less than "
243 			    "rss_ncpus (%u)\n", __func__, rss_buckets,
244 			    rss_ncpus);
245 		rss_mask = rss_buckets - 1;
246 	} else {
247 		rss_bits = 0;
248 		rss_buckets = 1;
249 		rss_mask = 0;
250 	}
251 
252 	/*
253 	 * Set up initial CPU assignments: round-robin by default.
254 	 */
255 	cpuid = CPU_FIRST();
256 	for (i = 0; i < rss_buckets; i++) {
257 		rss_table[i].rte_cpu = cpuid;
258 		cpuid = CPU_NEXT(cpuid);
259 	}
260 
261 	/*
262 	 * Randomize rrs_key.
263 	 *
264 	 * XXXRW: Not yet.  If nothing else, will require an rss_isbadkey()
265 	 * loop to check for "bad" RSS keys.
266 	 */
267 }
268 SYSINIT(rss_init, SI_SUB_SOFTINTR, SI_ORDER_SECOND, rss_init, NULL);
269 
/*
 * Trivial additive hash: the 32-bit sum of every key byte followed by every
 * data byte.  Neither buffer is dereferenced when its length is zero.
 */
static uint32_t
rss_naive_hash(u_int keylen, const uint8_t *key, u_int datalen,
    const uint8_t *data)
{
	uint32_t sum = 0;

	while (keylen > 0) {
		sum += *key++;
		keylen--;
	}
	while (datalen > 0) {
		sum += *data++;
		datalen--;
	}
	return (sum);
}
284 
285 static uint32_t
286 rss_hash(u_int datalen, const uint8_t *data)
287 {
288 
289 	switch (rss_hashalgo) {
290 	case RSS_HASH_TOEPLITZ:
291 		return (toeplitz_hash(sizeof(rss_key), rss_key, datalen,
292 		    data));
293 
294 	case RSS_HASH_NAIVE:
295 		return (rss_naive_hash(sizeof(rss_key), rss_key, datalen,
296 		    data));
297 
298 	default:
299 		panic("%s: unsupported/unknown hashalgo %d", __func__,
300 		    rss_hashalgo);
301 	}
302 }
303 
304 /*
305  * Hash an IPv4 2-tuple.
306  */
307 uint32_t
308 rss_hash_ip4_2tuple(struct in_addr src, struct in_addr dst)
309 {
310 	uint8_t data[sizeof(src) + sizeof(dst)];
311 	u_int datalen;
312 
313 	datalen = 0;
314 	bcopy(&src, &data[datalen], sizeof(src));
315 	datalen += sizeof(src);
316 	bcopy(&dst, &data[datalen], sizeof(dst));
317 	datalen += sizeof(dst);
318 	return (rss_hash(datalen, data));
319 }
320 
321 /*
322  * Hash an IPv4 4-tuple.
323  */
324 uint32_t
325 rss_hash_ip4_4tuple(struct in_addr src, u_short srcport, struct in_addr dst,
326     u_short dstport)
327 {
328 	uint8_t data[sizeof(src) + sizeof(dst) + sizeof(srcport) +
329 	    sizeof(dstport)];
330 	u_int datalen;
331 
332 	datalen = 0;
333 	bcopy(&src, &data[datalen], sizeof(src));
334 	datalen += sizeof(src);
335 	bcopy(&dst, &data[datalen], sizeof(dst));
336 	datalen += sizeof(dst);
337 	bcopy(&srcport, &data[datalen], sizeof(srcport));
338 	datalen += sizeof(srcport);
339 	bcopy(&dstport, &data[datalen], sizeof(dstport));
340 	datalen += sizeof(dstport);
341 	return (rss_hash(datalen, data));
342 }
343 
344 #ifdef INET6
345 /*
346  * Hash an IPv6 2-tuple.
347  */
348 uint32_t
349 rss_hash_ip6_2tuple(struct in6_addr src, struct in6_addr dst)
350 {
351 	uint8_t data[sizeof(src) + sizeof(dst)];
352 	u_int datalen;
353 
354 	datalen = 0;
355 	bcopy(&src, &data[datalen], sizeof(src));
356 	datalen += sizeof(src);
357 	bcopy(&dst, &data[datalen], sizeof(dst));
358 	datalen += sizeof(dst);
359 	return (rss_hash(datalen, data));
360 }
361 
362 /*
363  * Hash an IPv6 4-tuple.
364  */
365 uint32_t
366 rss_hash_ip6_4tuple(struct in6_addr src, u_short srcport,
367     struct in6_addr dst, u_short dstport)
368 {
369 	uint8_t data[sizeof(src) + sizeof(dst) + sizeof(srcport) +
370 	    sizeof(dstport)];
371 	u_int datalen;
372 
373 	datalen = 0;
374 	bcopy(&src, &data[datalen], sizeof(src));
375 	datalen += sizeof(src);
376 	bcopy(&dst, &data[datalen], sizeof(dst));
377 	datalen += sizeof(dst);
378 	bcopy(&srcport, &data[datalen], sizeof(srcport));
379 	datalen += sizeof(srcport);
380 	bcopy(&dstport, &data[datalen], sizeof(dstport));
381 	datalen += sizeof(dstport);
382 	return (rss_hash(datalen, data));
383 }
384 #endif /* INET6 */
385 
386 /*
387  * Query the number of RSS bits in use.
388  */
389 u_int
390 rss_getbits(void)
391 {
392 
393 	return (rss_bits);
394 }
395 
396 /*
397  * Query the RSS bucket associated with an RSS hash.
398  */
399 u_int
400 rss_getbucket(u_int hash)
401 {
402 
403 	return (hash & rss_mask);
404 }
405 
406 /*
407  * Query the RSS layer bucket associated with the given
408  * entry in the RSS hash space.
409  *
410  * The RSS indirection table is 0 .. rss_buckets-1,
411  * covering the low 'rss_bits' of the total 128 slot
412  * RSS indirection table.  So just mask off rss_bits and
413  * return that.
414  *
415  * NIC drivers can then iterate over the 128 slot RSS
416  * indirection table and fetch which RSS bucket to
417  * map it to.  This will typically be a CPU queue
418  */
419 u_int
420 rss_get_indirection_to_bucket(u_int index)
421 {
422 
423 	return (index & rss_mask);
424 }
425 
426 /*
427  * Query the RSS CPU associated with an RSS bucket.
428  */
429 u_int
430 rss_getcpu(u_int bucket)
431 {
432 
433 	return (rss_table[bucket].rte_cpu);
434 }
435 
436 /*
437  * netisr CPU affinity lookup given just the hash and hashtype.
438  */
439 u_int
440 rss_hash2cpuid(uint32_t hash_val, uint32_t hash_type)
441 {
442 
443 	switch (hash_type) {
444 	case M_HASHTYPE_RSS_IPV4:
445 	case M_HASHTYPE_RSS_TCP_IPV4:
446 	case M_HASHTYPE_RSS_UDP_IPV4:
447 	case M_HASHTYPE_RSS_IPV6:
448 	case M_HASHTYPE_RSS_TCP_IPV6:
449 	case M_HASHTYPE_RSS_UDP_IPV6:
450 		return (rss_getcpu(rss_getbucket(hash_val)));
451 	default:
452 		return (NETISR_CPUID_NONE);
453 	}
454 }
455 
456 /*
457  * Query the RSS bucket associated with the given hash value and
458  * type.
459  */
460 int
461 rss_hash2bucket(uint32_t hash_val, uint32_t hash_type, uint32_t *bucket_id)
462 {
463 
464 	switch (hash_type) {
465 	case M_HASHTYPE_RSS_IPV4:
466 	case M_HASHTYPE_RSS_TCP_IPV4:
467 	case M_HASHTYPE_RSS_UDP_IPV4:
468 	case M_HASHTYPE_RSS_IPV6:
469 	case M_HASHTYPE_RSS_TCP_IPV6:
470 	case M_HASHTYPE_RSS_UDP_IPV6:
471 		*bucket_id = rss_getbucket(hash_val);
472 		return (0);
473 	default:
474 		return (-1);
475 	}
476 }
477 
478 /*
479  * netisr CPU affinity lookup routine for use by protocols.
480  */
481 struct mbuf *
482 rss_m2cpuid(struct mbuf *m, uintptr_t source, u_int *cpuid)
483 {
484 
485 	M_ASSERTPKTHDR(m);
486 	*cpuid = rss_hash2cpuid(m->m_pkthdr.flowid, M_HASHTYPE_GET(m));
487 	return (m);
488 }
489 
490 int
491 rss_m2bucket(struct mbuf *m, uint32_t *bucket_id)
492 {
493 
494 	M_ASSERTPKTHDR(m);
495 
496 	return(rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m),
497 	    bucket_id));
498 }
499 
500 /*
501  * Calculate an appropriate ipv4 2-tuple or 4-tuple given the given
502  * IPv4 source/destination address, UDP or TCP source/destination ports
503  * and the protocol type.
504  *
505  * The protocol code may wish to do a software hash of the given
506  * tuple.  This depends upon the currently configured RSS hash types.
507  *
508  * This assumes that the packet in question isn't a fragment.
509  *
510  * It also assumes the packet source/destination address
511  * are in "incoming" packet order (ie, source is "far" address.)
512  */
513 int
514 rss_proto_software_hash_v4(struct in_addr s, struct in_addr d,
515     u_short sp, u_short dp, int proto,
516     uint32_t *hashval, uint32_t *hashtype)
517 {
518 	uint32_t hash;
519 
520 	/*
521 	 * Next, choose the hash type depending upon the protocol
522 	 * identifier.
523 	 */
524 	if ((proto == IPPROTO_TCP) &&
525 	    (rss_gethashconfig_local() & RSS_HASHTYPE_RSS_TCP_IPV4)) {
526 		hash = rss_hash_ip4_4tuple(s, sp, d, dp);
527 		*hashval = hash;
528 		*hashtype = M_HASHTYPE_RSS_TCP_IPV4;
529 		return (0);
530 	} else if ((proto == IPPROTO_UDP) &&
531 	    (rss_gethashconfig_local() & RSS_HASHTYPE_RSS_UDP_IPV4)) {
532 		hash = rss_hash_ip4_4tuple(s, sp, d, dp);
533 		*hashval = hash;
534 		*hashtype = M_HASHTYPE_RSS_UDP_IPV4;
535 		return (0);
536 	} else if (rss_gethashconfig_local() & RSS_HASHTYPE_RSS_IPV4) {
537 		/* RSS doesn't hash on other protocols like SCTP; so 2-tuple */
538 		hash = rss_hash_ip4_2tuple(s, d);
539 		*hashval = hash;
540 		*hashtype = M_HASHTYPE_RSS_IPV4;
541 		return (0);
542 	}
543 
544 	/* No configured available hashtypes! */
545 	printf("%s: no available hashtypes!\n", __func__);
546 	return (-1);
547 }
548 
549 /*
550  * Do a software calculation of the RSS for the given mbuf.
551  *
552  * This is typically used by the input path to recalculate the RSS after
553  * some form of packet processing (eg de-capsulation, IP fragment reassembly.)
554  *
555  * dir is the packet direction - RSS_HASH_PKT_INGRESS for incoming and
556  * RSS_HASH_PKT_EGRESS for outgoing.
557  *
558  * Returns 0 if a hash was done, -1 if no hash was done, +1 if
559  * the mbuf already had a valid RSS flowid.
560  *
561  * This function doesn't modify the mbuf.  It's up to the caller to
562  * assign flowid/flowtype as appropriate.
563  */
564 int
565 rss_mbuf_software_hash_v4(const struct mbuf *m, int dir, uint32_t *hashval,
566     uint32_t *hashtype)
567 {
568 	const struct ip *ip;
569 	const struct tcphdr *th;
570 	const struct udphdr *uh;
571 	uint8_t proto;
572 	int iphlen;
573 	int is_frag = 0;
574 
575 	/*
576 	 * XXX For now this only handles hashing on incoming mbufs.
577 	 */
578 	if (dir != RSS_HASH_PKT_INGRESS) {
579 		printf("%s: called on EGRESS packet!\n", __func__);
580 		return (-1);
581 	}
582 
583 	/*
584 	 * First, validate that the mbuf we have is long enough
585 	 * to have an IPv4 header in it.
586 	 */
587 	if (m->m_pkthdr.len < (sizeof(struct ip))) {
588 		printf("%s: short mbuf pkthdr\n", __func__);
589 		return (-1);
590 	}
591 	if (m->m_len < (sizeof(struct ip))) {
592 		printf("%s: short mbuf len\n", __func__);
593 		return (-1);
594 	}
595 
596 	/* Ok, let's dereference that */
597 	ip = mtod(m, struct ip *);
598 	proto = ip->ip_p;
599 	iphlen = ip->ip_hl << 2;
600 
601 	/*
602 	 * If this is a fragment then it shouldn't be four-tuple
603 	 * hashed just yet.  Once it's reassembled into a full
604 	 * frame it should be re-hashed.
605 	 */
606 	if (ip->ip_off & htons(IP_MF | IP_OFFMASK))
607 		is_frag = 1;
608 
609 	/*
610 	 * If the mbuf flowid/flowtype matches the packet type,
611 	 * and we don't support the 4-tuple version of the given protocol,
612 	 * then signal to the owner that it can trust the flowid/flowtype
613 	 * details.
614 	 *
615 	 * This is a little picky - eg, if TCPv4 / UDPv4 hashing
616 	 * is supported but we got a TCP/UDP frame only 2-tuple hashed,
617 	 * then we shouldn't just "trust" the 2-tuple hash.  We need
618 	 * a 4-tuple hash.
619 	 */
620 	if (m->m_flags & M_FLOWID) {
621 		uint32_t flowid, flowtype;
622 
623 		flowid = m->m_pkthdr.flowid;
624 		flowtype = M_HASHTYPE_GET(m);
625 
626 		switch (proto) {
627 		case IPPROTO_UDP:
628 			if ((rss_gethashconfig_local() & RSS_HASHTYPE_RSS_UDP_IPV4) &&
629 			    (flowtype == M_HASHTYPE_RSS_UDP_IPV4) &&
630 			    (is_frag == 0)) {
631 				return (1);
632 			}
633 			/*
634 			 * Only allow 2-tuple for UDP frames if we don't also
635 			 * support 4-tuple for UDP.
636 			 */
637 			if ((rss_gethashconfig_local() & RSS_HASHTYPE_RSS_IPV4) &&
638 			    ((rss_gethashconfig_local() & RSS_HASHTYPE_RSS_UDP_IPV4) == 0) &&
639 			    flowtype == M_HASHTYPE_RSS_IPV4) {
640 				return (1);
641 			}
642 			break;
643 		case IPPROTO_TCP:
644 			if ((rss_gethashconfig_local() & RSS_HASHTYPE_RSS_TCP_IPV4) &&
645 			    (flowtype == M_HASHTYPE_RSS_TCP_IPV4) &&
646 			    (is_frag == 0)) {
647 				return (1);
648 			}
649 			/*
650 			 * Only allow 2-tuple for TCP frames if we don't also
651 			 * support 2-tuple for TCP.
652 			 */
653 			if ((rss_gethashconfig_local() & RSS_HASHTYPE_RSS_IPV4) &&
654 			    ((rss_gethashconfig_local() & RSS_HASHTYPE_RSS_TCP_IPV4) == 0) &&
655 			    flowtype == M_HASHTYPE_RSS_IPV4) {
656 				return (1);
657 			}
658 			break;
659 		default:
660 			if ((rss_gethashconfig_local() & RSS_HASHTYPE_RSS_IPV4) &&
661 			    flowtype == M_HASHTYPE_RSS_IPV4) {
662 				return (1);
663 			}
664 			break;
665 		}
666 	}
667 
668 	/*
669 	 * Decode enough information to make a hash decision.
670 	 *
671 	 * XXX TODO: does the hardware hash on 4-tuple if IP
672 	 *    options are present?
673 	 */
674 	if ((rss_gethashconfig_local() & RSS_HASHTYPE_RSS_TCP_IPV4) &&
675 	    (proto == IPPROTO_TCP) &&
676 	    (is_frag == 0)) {
677 		if (m->m_len < iphlen + sizeof(struct tcphdr)) {
678 			printf("%s: short TCP frame?\n", __func__);
679 			return (-1);
680 		}
681 		th = (struct tcphdr *)((caddr_t)ip + iphlen);
682 		return rss_proto_software_hash_v4(ip->ip_src, ip->ip_dst,
683 		    th->th_sport,
684 		    th->th_dport,
685 		    proto,
686 		    hashval,
687 		    hashtype);
688 	} else if ((rss_gethashconfig_local() & RSS_HASHTYPE_RSS_UDP_IPV4) &&
689 	    (proto == IPPROTO_UDP) &&
690 	    (is_frag == 0)) {
691 		uh = (struct udphdr *)((caddr_t)ip + iphlen);
692 		if (m->m_len < iphlen + sizeof(struct udphdr)) {
693 			printf("%s: short UDP frame?\n", __func__);
694 			return (-1);
695 		}
696 		return rss_proto_software_hash_v4(ip->ip_src, ip->ip_dst,
697 		    uh->uh_sport,
698 		    uh->uh_dport,
699 		    proto,
700 		    hashval,
701 		    hashtype);
702 	} else if (rss_gethashconfig_local() & RSS_HASHTYPE_RSS_IPV4) {
703 		/* Default to 2-tuple hash */
704 		return rss_proto_software_hash_v4(ip->ip_src, ip->ip_dst,
705 		    0,	/* source port */
706 		    0,	/* destination port */
707 		    0,	/* IPPROTO_IP */
708 		    hashval,
709 		    hashtype);
710 	} else {
711 		printf("%s: no available hashtypes!\n", __func__);
712 		return (-1);
713 	}
714 }
715 
716 /*
717  * Similar to rss_m2cpuid, but designed to be used by the IP NETISR
718  * on incoming frames.
719  *
720  * If an existing RSS hash exists and it matches what the configured
721  * hashing is, then use it.
722  *
723  * If there's an existing RSS hash but the desired hash is different,
724  * or if there's no useful RSS hash, then calculate it via
725  * the software path.
726  *
727  * XXX TODO: definitely want statistics here!
728  */
729 struct mbuf *
730 rss_soft_m2cpuid(struct mbuf *m, uintptr_t source, u_int *cpuid)
731 {
732 	uint32_t hash_val, hash_type;
733 	int ret;
734 
735 	M_ASSERTPKTHDR(m);
736 
737 	ret = rss_mbuf_software_hash_v4(m, RSS_HASH_PKT_INGRESS,
738 	    &hash_val, &hash_type);
739 	if (ret > 0) {
740 		/* mbuf has a valid hash already; don't need to modify it */
741 		*cpuid = rss_hash2cpuid(m->m_pkthdr.flowid, M_HASHTYPE_GET(m));
742 	} else if (ret == 0) {
743 		/* hash was done; update */
744 		m->m_pkthdr.flowid = hash_val;
745 		M_HASHTYPE_SET(m, hash_type);
746 		m->m_flags |= M_FLOWID;
747 		*cpuid = rss_hash2cpuid(m->m_pkthdr.flowid, M_HASHTYPE_GET(m));
748 	} else { /* ret < 0 */
749 		/* no hash was done */
750 		*cpuid = NETISR_CPUID_NONE;
751 	}
752 	return (m);
753 }
754 
755 /*
756  * Query the RSS hash algorithm.
757  */
758 u_int
759 rss_gethashalgo(void)
760 {
761 
762 	return (rss_hashalgo);
763 }
764 
765 /*
766  * Query the current RSS key; likely to be used by device drivers when
767  * configuring hardware RSS.  Caller must pass an array of size RSS_KEYSIZE.
768  *
769  * XXXRW: Perhaps we should do the accept-a-length-and-truncate thing?
770  */
771 void
772 rss_getkey(uint8_t *key)
773 {
774 
775 	bcopy(rss_key, key, sizeof(rss_key));
776 }
777 
778 /*
779  * Query the number of buckets; this may be used by both network device
780  * drivers, which will need to populate hardware shadows of the software
781  * indirection table, and the network stack itself (such as when deciding how
782  * many connection groups to allocate).
783  */
784 u_int
785 rss_getnumbuckets(void)
786 {
787 
788 	return (rss_buckets);
789 }
790 
791 /*
792  * Query the number of CPUs in use by RSS; may be useful to device drivers
793  * trying to figure out how to map a larger number of CPUs into a smaller
794  * number of receive queues.
795  */
796 u_int
797 rss_getnumcpus(void)
798 {
799 
800 	return (rss_ncpus);
801 }
802 
803 static inline u_int
804 rss_gethashconfig_local(void)
805 {
806 
807 	/* Return 4-tuple for TCP; 2-tuple for others */
808 	/*
809 	 * UDP may fragment more often than TCP and thus we'll end up with
810 	 * NICs returning 2-tuple fragments.
811 	 * udp_init() and udplite_init() both currently initialise things
812 	 * as 2-tuple.
813 	 * So for now disable UDP 4-tuple hashing until all of the other
814 	 * pieces are in place.
815 	 */
816 	return (
817 	    RSS_HASHTYPE_RSS_IPV4
818 	|    RSS_HASHTYPE_RSS_TCP_IPV4
819 	|    RSS_HASHTYPE_RSS_IPV6
820 	|    RSS_HASHTYPE_RSS_TCP_IPV6
821 	|    RSS_HASHTYPE_RSS_IPV6_EX
822 	|    RSS_HASHTYPE_RSS_TCP_IPV6_EX
823 #if 0
824 	|    RSS_HASHTYPE_RSS_UDP_IPV4
825 	|    RSS_HASHTYPE_RSS_UDP_IPV4_EX
826 	|    RSS_HASHTYPE_RSS_UDP_IPV6
827 	|    RSS_HASHTYPE_RSS_UDP_IPV6_EX
828 #endif
829 	);
830 }
831 
832 /*
833  * Return the supported RSS hash configuration.
834  *
835  * NICs should query this to determine what to configure in their redirection
836  * matching table.
837  */
838 u_int
839 rss_gethashconfig(void)
840 {
841 
842 	return (rss_gethashconfig_local());
843 }
844 
845 /*
846  * XXXRW: Confirm that sysctl -a won't dump this keying material, don't want
847  * it appearing in debugging output unnecessarily.
848  */
849 static int
850 sysctl_rss_key(SYSCTL_HANDLER_ARGS)
851 {
852 	uint8_t temp_rss_key[RSS_KEYSIZE];
853 	int error;
854 
855 	error = priv_check(req->td, PRIV_NETINET_HASHKEY);
856 	if (error)
857 		return (error);
858 
859 	bcopy(rss_key, temp_rss_key, sizeof(temp_rss_key));
860 	error = sysctl_handle_opaque(oidp, temp_rss_key,
861 	    sizeof(temp_rss_key), req);
862 	if (error)
863 		return (error);
864 	if (req->newptr != NULL) {
865 		/* XXXRW: Not yet. */
866 		return (EINVAL);
867 	}
868 	return (0);
869 }
870 SYSCTL_PROC(_net_inet_rss, OID_AUTO, key,
871     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_rss_key,
872     "", "RSS keying material");
873 
874 static int
875 sysctl_rss_bucket_mapping(SYSCTL_HANDLER_ARGS)
876 {
877 	struct sbuf *sb;
878 	int error;
879 	int i;
880 
881 	error = 0;
882 	error = sysctl_wire_old_buffer(req, 0);
883 	if (error != 0)
884 		return (error);
885 	sb = sbuf_new_for_sysctl(NULL, NULL, 512, req);
886 	if (sb == NULL)
887 		return (ENOMEM);
888 	for (i = 0; i < rss_buckets; i++) {
889 		sbuf_printf(sb, "%s%d:%d", i == 0 ? "" : " ",
890 		    i,
891 		    rss_getcpu(i));
892 	}
893 	error = sbuf_finish(sb);
894 	sbuf_delete(sb);
895 
896 	return (error);
897 }
898 SYSCTL_PROC(_net_inet_rss, OID_AUTO, bucket_mapping,
899     CTLTYPE_STRING | CTLFLAG_RD, NULL, 0,
900     sysctl_rss_bucket_mapping, "", "RSS bucket -> CPU mapping");
901