xref: /freebsd/sys/netinet/in_rss.c (revision db3cb3640f547c063293e9fdc4db69e9dc120951)
1 /*-
2  * Copyright (c) 2010-2011 Juniper Networks, Inc.
3  * All rights reserved.
4  *
5  * This software was developed by Robert N. M. Watson under contract
6  * to Juniper Networks, Inc.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include <sys/cdefs.h>
31 
32 __FBSDID("$FreeBSD$");
33 
34 #include "opt_inet6.h"
35 #include "opt_pcbgroup.h"
36 
37 #ifndef PCBGROUP
38 #error "options RSS depends on options PCBGROUP"
39 #endif
40 
41 #include <sys/param.h>
42 #include <sys/mbuf.h>
43 #include <sys/socket.h>
44 #include <sys/priv.h>
45 #include <sys/kernel.h>
46 #include <sys/smp.h>
47 #include <sys/sysctl.h>
48 #include <sys/sbuf.h>
49 
50 #include <net/if.h>
51 #include <net/if_var.h>
52 #include <net/netisr.h>
53 
54 #include <netinet/in.h>
55 #include <netinet/in_pcb.h>
56 #include <netinet/in_rss.h>
57 #include <netinet/in_var.h>
58 #include <netinet/toeplitz.h>
59 
60 /* for software rss hash support */
61 #include <netinet/ip.h>
62 #include <netinet/tcp.h>
63 #include <netinet/udp.h>
64 
65 /*-
66  * Operating system parts of receiver-side scaling (RSS), which allows
67  * network cards to direct flows to particular receive queues based on hashes
68  * of header tuples.  This implementation aligns RSS buckets with connection
69  * groups at the TCP/IP layer, so each bucket is associated with exactly one
70  * group.  As a result, the group lookup structures (and lock) should have an
71  * effective affinity with exactly one CPU.
72  *
73  * Network device drivers needing to configure RSS will query this framework
74  * for parameters, such as the current RSS key, hashing policies, number of
75  * bits, and indirection table mapping hashes to buckets and CPUs.  They may
76  * provide their own supplementary information, such as queue<->CPU bindings.
77  * It is the responsibility of the network device driver to inject packets
78  * into the stack on as close to the right CPU as possible, if playing by RSS
79  * rules.
80  *
81  * TODO:
82  *
83  * - Synchronization for rss_key and other future-configurable parameters.
84  * - Event handler drivers can register to pick up RSS configuration changes.
85  * - Should we allow rss_basecpu to be configured?
86  * - Randomize key on boot.
87  * - IPv6 support.
88  * - Statistics on how often there's a misalignment between hardware
89  *   placement and pcbgroup expectations.
90  */
91 
92 SYSCTL_NODE(_net_inet, OID_AUTO, rss, CTLFLAG_RW, 0, "Receive-side steering");
93 
94 /*
95  * Toeplitz is the only required hash function in the RSS spec, so use it by
96  * default.
97  */
98 static u_int	rss_hashalgo = RSS_HASH_TOEPLITZ;
99 SYSCTL_INT(_net_inet_rss, OID_AUTO, hashalgo, CTLFLAG_RDTUN, &rss_hashalgo, 0,
100     "RSS hash algorithm");
101 
102 /*
103  * Size of the indirection table; at most 128 entries per the RSS spec.  We
104  * size it to at least 2 times the number of CPUs by default to allow useful
105  * rebalancing.  If not set explicitly with a loader tunable, we tune based
106  * on the number of CPUs present.
107  *
108  * XXXRW: buckets might be better to use for the tunable than bits.
109  */
110 static u_int	rss_bits;
111 SYSCTL_INT(_net_inet_rss, OID_AUTO, bits, CTLFLAG_RDTUN, &rss_bits, 0,
112     "RSS bits");
113 
114 static u_int	rss_mask;
115 SYSCTL_INT(_net_inet_rss, OID_AUTO, mask, CTLFLAG_RD, &rss_mask, 0,
116     "RSS mask");
117 
118 static const u_int	rss_maxbits = RSS_MAXBITS;
119 SYSCTL_INT(_net_inet_rss, OID_AUTO, maxbits, CTLFLAG_RD,
120     __DECONST(int *, &rss_maxbits), 0, "RSS maximum bits");
121 
122 /*
123  * RSS's own count of the number of CPUs it could be using for processing.
124  * Bounded to 64 by RSS constants.
125  */
126 static u_int	rss_ncpus;
127 SYSCTL_INT(_net_inet_rss, OID_AUTO, ncpus, CTLFLAG_RD, &rss_ncpus, 0,
128     "Number of CPUs available to RSS");
129 
130 #define	RSS_MAXCPUS	(1 << (RSS_MAXBITS - 1))
131 static const u_int	rss_maxcpus = RSS_MAXCPUS;
132 SYSCTL_INT(_net_inet_rss, OID_AUTO, maxcpus, CTLFLAG_RD,
133     __DECONST(int *, &rss_maxcpus), 0, "RSS maximum CPUs that can be used");
134 
135 /*
136  * Variable exists just for reporting rss_bits in a user-friendly way.
137  */
138 static u_int	rss_buckets;
139 SYSCTL_INT(_net_inet_rss, OID_AUTO, buckets, CTLFLAG_RD, &rss_buckets, 0,
140     "RSS buckets");
141 
142 /*
143  * Base CPU number; devices will add this to all CPU numbers returned by the
144  * RSS indirection table.  Currently unmodifiable in FreeBSD.
145  */
146 static const u_int	rss_basecpu;
147 SYSCTL_INT(_net_inet_rss, OID_AUTO, basecpu, CTLFLAG_RD,
148     __DECONST(int *, &rss_basecpu), 0, "RSS base CPU");
149 
150 /*
151  * RSS secret key, intended to prevent attacks on load-balancing.  Its
152  * effectiveness may be limited by algorithm choice and available entropy
153  * during the boot.
154  *
155  * XXXRW: And that we don't randomize it yet!
156  *
157  * This is the default Microsoft RSS specification key which is also
158  * the Chelsio T5 firmware default key.
159  */
160 static uint8_t rss_key[RSS_KEYSIZE] = {
161 	0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2,
162 	0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0,
163 	0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4,
164 	0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c,
165 	0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa,
166 };
167 
168 /*
169  * RSS hash->CPU table, which maps hashed packet headers to particular CPUs.
170  * Drivers may supplement this table with a separate CPU<->queue table when
171  * programming devices.
172  */
173 struct rss_table_entry {
174 	uint8_t		rte_cpu;	/* CPU affinity of bucket. */
175 };
176 static struct rss_table_entry	rss_table[RSS_TABLE_MAXLEN];
177 
178 static inline u_int rss_gethashconfig_local(void);
179 
180 static void
181 rss_init(__unused void *arg)
182 {
183 	u_int i;
184 	u_int cpuid;
185 
186 	/*
187 	 * Validate tunables, coerce to sensible values.
188 	 */
189 	switch (rss_hashalgo) {
190 	case RSS_HASH_TOEPLITZ:
191 	case RSS_HASH_NAIVE:
192 		break;
193 
194 	default:
195 		printf("%s: invalid RSS hashalgo %u, coercing to %u",
196 		    __func__, rss_hashalgo, RSS_HASH_TOEPLITZ);
197 		rss_hashalgo = RSS_HASH_TOEPLITZ;
198 	}
199 
200 	/*
201 	 * Count available CPUs.
202 	 *
203 	 * XXXRW: Note incorrect assumptions regarding contiguity of this set
204 	 * elsewhere.
205 	 */
206 	rss_ncpus = 0;
207 	for (i = 0; i <= mp_maxid; i++) {
208 		if (CPU_ABSENT(i))
209 			continue;
210 		rss_ncpus++;
211 	}
212 	if (rss_ncpus > RSS_MAXCPUS)
213 		rss_ncpus = RSS_MAXCPUS;
214 
215 	/*
216 	 * Tune RSS table entries to be no less than 2x the number of CPUs
217 	 * -- unless we're running uniprocessor, in which case there's not
218 	 * much point in having buckets to rearrange for load-balancing!
219 	 */
220 	if (rss_ncpus > 1) {
221 		if (rss_bits == 0)
222 			rss_bits = fls(rss_ncpus - 1) + 1;
223 
224 		/*
225 		 * Microsoft limits RSS table entries to 128, so apply that
226 		 * limit to both auto-detected CPU counts and user-configured
227 		 * ones.
228 		 */
229 		if (rss_bits == 0 || rss_bits > RSS_MAXBITS) {
230 			printf("%s: RSS bits %u not valid, coercing to  %u",
231 			    __func__, rss_bits, RSS_MAXBITS);
232 			rss_bits = RSS_MAXBITS;
233 		}
234 
235 		/*
236 		 * Figure out how many buckets to use; warn if less than the
237 		 * number of configured CPUs, although this is not a fatal
238 		 * problem.
239 		 */
240 		rss_buckets = (1 << rss_bits);
241 		if (rss_buckets < rss_ncpus)
242 			printf("%s: WARNING: rss_buckets (%u) less than "
243 			    "rss_ncpus (%u)\n", __func__, rss_buckets,
244 			    rss_ncpus);
245 		rss_mask = rss_buckets - 1;
246 	} else {
247 		rss_bits = 0;
248 		rss_buckets = 1;
249 		rss_mask = 0;
250 	}
251 
252 	/*
253 	 * Set up initial CPU assignments: round-robin by default.
254 	 */
255 	cpuid = CPU_FIRST();
256 	for (i = 0; i < rss_buckets; i++) {
257 		rss_table[i].rte_cpu = cpuid;
258 		cpuid = CPU_NEXT(cpuid);
259 	}
260 
261 	/*
262 	 * Randomize rrs_key.
263 	 *
264 	 * XXXRW: Not yet.  If nothing else, will require an rss_isbadkey()
265 	 * loop to check for "bad" RSS keys.
266 	 */
267 }
268 SYSINIT(rss_init, SI_SUB_SOFTINTR, SI_ORDER_SECOND, rss_init, NULL);
269 
/*
 * Trivial additive hash: the 32-bit wrapping sum of every key byte followed
 * by every data byte.
 */
static uint32_t
rss_naive_hash(u_int keylen, const uint8_t *key, u_int datalen,
    const uint8_t *data)
{
	const uint8_t *p;
	uint32_t sum = 0;

	for (p = key; p != key + keylen; p++)
		sum += *p;
	for (p = data; p != data + datalen; p++)
		sum += *p;
	return (sum);
}
284 
285 static uint32_t
286 rss_hash(u_int datalen, const uint8_t *data)
287 {
288 
289 	switch (rss_hashalgo) {
290 	case RSS_HASH_TOEPLITZ:
291 		return (toeplitz_hash(sizeof(rss_key), rss_key, datalen,
292 		    data));
293 
294 	case RSS_HASH_NAIVE:
295 		return (rss_naive_hash(sizeof(rss_key), rss_key, datalen,
296 		    data));
297 
298 	default:
299 		panic("%s: unsupported/unknown hashalgo %d", __func__,
300 		    rss_hashalgo);
301 	}
302 }
303 
304 /*
305  * Hash an IPv4 2-tuple.
306  */
307 uint32_t
308 rss_hash_ip4_2tuple(struct in_addr src, struct in_addr dst)
309 {
310 	uint8_t data[sizeof(src) + sizeof(dst)];
311 	u_int datalen;
312 
313 	datalen = 0;
314 	bcopy(&src, &data[datalen], sizeof(src));
315 	datalen += sizeof(src);
316 	bcopy(&dst, &data[datalen], sizeof(dst));
317 	datalen += sizeof(dst);
318 	return (rss_hash(datalen, data));
319 }
320 
321 /*
322  * Hash an IPv4 4-tuple.
323  */
324 uint32_t
325 rss_hash_ip4_4tuple(struct in_addr src, u_short srcport, struct in_addr dst,
326     u_short dstport)
327 {
328 	uint8_t data[sizeof(src) + sizeof(dst) + sizeof(srcport) +
329 	    sizeof(dstport)];
330 	u_int datalen;
331 
332 	datalen = 0;
333 	bcopy(&src, &data[datalen], sizeof(src));
334 	datalen += sizeof(src);
335 	bcopy(&dst, &data[datalen], sizeof(dst));
336 	datalen += sizeof(dst);
337 	bcopy(&srcport, &data[datalen], sizeof(srcport));
338 	datalen += sizeof(srcport);
339 	bcopy(&dstport, &data[datalen], sizeof(dstport));
340 	datalen += sizeof(dstport);
341 	return (rss_hash(datalen, data));
342 }
343 
344 #ifdef INET6
345 /*
346  * Hash an IPv6 2-tuple.
347  */
348 uint32_t
349 rss_hash_ip6_2tuple(const struct in6_addr *src, const struct in6_addr *dst)
350 {
351 	uint8_t data[sizeof(*src) + sizeof(*dst)];
352 	u_int datalen;
353 
354 	datalen = 0;
355 	bcopy(src, &data[datalen], sizeof(*src));
356 	datalen += sizeof(*src);
357 	bcopy(dst, &data[datalen], sizeof(*dst));
358 	datalen += sizeof(*dst);
359 	return (rss_hash(datalen, data));
360 }
361 
362 /*
363  * Hash an IPv6 4-tuple.
364  */
365 uint32_t
366 rss_hash_ip6_4tuple(const struct in6_addr *src, u_short srcport,
367     const struct in6_addr *dst, u_short dstport)
368 {
369 	uint8_t data[sizeof(*src) + sizeof(*dst) + sizeof(srcport) +
370 	    sizeof(dstport)];
371 	u_int datalen;
372 
373 	datalen = 0;
374 	bcopy(src, &data[datalen], sizeof(*src));
375 	datalen += sizeof(*src);
376 	bcopy(dst, &data[datalen], sizeof(*dst));
377 	datalen += sizeof(*dst);
378 	bcopy(&srcport, &data[datalen], sizeof(srcport));
379 	datalen += sizeof(srcport);
380 	bcopy(&dstport, &data[datalen], sizeof(dstport));
381 	datalen += sizeof(dstport);
382 	return (rss_hash(datalen, data));
383 }
384 #endif /* INET6 */
385 
386 /*
387  * Query the number of RSS bits in use.
388  */
389 u_int
390 rss_getbits(void)
391 {
392 
393 	return (rss_bits);
394 }
395 
396 /*
397  * Query the RSS bucket associated with an RSS hash.
398  */
399 u_int
400 rss_getbucket(u_int hash)
401 {
402 
403 	return (hash & rss_mask);
404 }
405 
406 /*
407  * Query the RSS layer bucket associated with the given
408  * entry in the RSS hash space.
409  *
410  * The RSS indirection table is 0 .. rss_buckets-1,
411  * covering the low 'rss_bits' of the total 128 slot
412  * RSS indirection table.  So just mask off rss_bits and
413  * return that.
414  *
415  * NIC drivers can then iterate over the 128 slot RSS
416  * indirection table and fetch which RSS bucket to
417  * map it to.  This will typically be a CPU queue
418  */
419 u_int
420 rss_get_indirection_to_bucket(u_int index)
421 {
422 
423 	return (index & rss_mask);
424 }
425 
426 /*
427  * Query the RSS CPU associated with an RSS bucket.
428  */
429 u_int
430 rss_getcpu(u_int bucket)
431 {
432 
433 	return (rss_table[bucket].rte_cpu);
434 }
435 
436 /*
437  * netisr CPU affinity lookup given just the hash and hashtype.
438  */
439 u_int
440 rss_hash2cpuid(uint32_t hash_val, uint32_t hash_type)
441 {
442 
443 	switch (hash_type) {
444 	case M_HASHTYPE_RSS_IPV4:
445 	case M_HASHTYPE_RSS_TCP_IPV4:
446 	case M_HASHTYPE_RSS_UDP_IPV4:
447 	case M_HASHTYPE_RSS_IPV6:
448 	case M_HASHTYPE_RSS_TCP_IPV6:
449 	case M_HASHTYPE_RSS_UDP_IPV6:
450 		return (rss_getcpu(rss_getbucket(hash_val)));
451 	default:
452 		return (NETISR_CPUID_NONE);
453 	}
454 }
455 
456 /*
457  * Query the RSS bucket associated with the given hash value and
458  * type.
459  */
460 int
461 rss_hash2bucket(uint32_t hash_val, uint32_t hash_type, uint32_t *bucket_id)
462 {
463 
464 	switch (hash_type) {
465 	case M_HASHTYPE_RSS_IPV4:
466 	case M_HASHTYPE_RSS_TCP_IPV4:
467 	case M_HASHTYPE_RSS_UDP_IPV4:
468 	case M_HASHTYPE_RSS_IPV6:
469 	case M_HASHTYPE_RSS_TCP_IPV6:
470 	case M_HASHTYPE_RSS_UDP_IPV6:
471 		*bucket_id = rss_getbucket(hash_val);
472 		return (0);
473 	default:
474 		return (-1);
475 	}
476 }
477 
478 /*
479  * netisr CPU affinity lookup routine for use by protocols.
480  */
481 struct mbuf *
482 rss_m2cpuid(struct mbuf *m, uintptr_t source, u_int *cpuid)
483 {
484 
485 	M_ASSERTPKTHDR(m);
486 	*cpuid = rss_hash2cpuid(m->m_pkthdr.flowid, M_HASHTYPE_GET(m));
487 	return (m);
488 }
489 
490 int
491 rss_m2bucket(struct mbuf *m, uint32_t *bucket_id)
492 {
493 
494 	M_ASSERTPKTHDR(m);
495 
496 	return(rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m),
497 	    bucket_id));
498 }
499 
500 /*
501  * Calculate an appropriate ipv4 2-tuple or 4-tuple given the given
502  * IPv4 source/destination address, UDP or TCP source/destination ports
503  * and the protocol type.
504  *
505  * The protocol code may wish to do a software hash of the given
506  * tuple.  This depends upon the currently configured RSS hash types.
507  *
508  * This assumes that the packet in question isn't a fragment.
509  *
510  * It also assumes the packet source/destination address
511  * are in "incoming" packet order (ie, source is "far" address.)
512  */
513 int
514 rss_proto_software_hash_v4(struct in_addr s, struct in_addr d,
515     u_short sp, u_short dp, int proto,
516     uint32_t *hashval, uint32_t *hashtype)
517 {
518 	uint32_t hash;
519 
520 	/*
521 	 * Next, choose the hash type depending upon the protocol
522 	 * identifier.
523 	 */
524 	if ((proto == IPPROTO_TCP) &&
525 	    (rss_gethashconfig_local() & RSS_HASHTYPE_RSS_TCP_IPV4)) {
526 		hash = rss_hash_ip4_4tuple(s, sp, d, dp);
527 		*hashval = hash;
528 		*hashtype = M_HASHTYPE_RSS_TCP_IPV4;
529 		return (0);
530 	} else if ((proto == IPPROTO_UDP) &&
531 	    (rss_gethashconfig_local() & RSS_HASHTYPE_RSS_UDP_IPV4)) {
532 		hash = rss_hash_ip4_4tuple(s, sp, d, dp);
533 		*hashval = hash;
534 		*hashtype = M_HASHTYPE_RSS_UDP_IPV4;
535 		return (0);
536 	} else if (rss_gethashconfig_local() & RSS_HASHTYPE_RSS_IPV4) {
537 		/* RSS doesn't hash on other protocols like SCTP; so 2-tuple */
538 		hash = rss_hash_ip4_2tuple(s, d);
539 		*hashval = hash;
540 		*hashtype = M_HASHTYPE_RSS_IPV4;
541 		return (0);
542 	}
543 
544 	/* No configured available hashtypes! */
545 	printf("%s: no available hashtypes!\n", __func__);
546 	return (-1);
547 }
548 
549 /*
550  * Do a software calculation of the RSS for the given mbuf.
551  *
552  * This is typically used by the input path to recalculate the RSS after
553  * some form of packet processing (eg de-capsulation, IP fragment reassembly.)
554  *
555  * dir is the packet direction - RSS_HASH_PKT_INGRESS for incoming and
556  * RSS_HASH_PKT_EGRESS for outgoing.
557  *
558  * Returns 0 if a hash was done, -1 if no hash was done, +1 if
559  * the mbuf already had a valid RSS flowid.
560  *
561  * This function doesn't modify the mbuf.  It's up to the caller to
562  * assign flowid/flowtype as appropriate.
563  */
564 int
565 rss_mbuf_software_hash_v4(const struct mbuf *m, int dir, uint32_t *hashval,
566     uint32_t *hashtype)
567 {
568 	const struct ip *ip;
569 	const struct tcphdr *th;
570 	const struct udphdr *uh;
571 	uint32_t flowid;
572 	uint32_t flowtype;
573 	uint8_t proto;
574 	int iphlen;
575 	int is_frag = 0;
576 
577 	/*
578 	 * XXX For now this only handles hashing on incoming mbufs.
579 	 */
580 	if (dir != RSS_HASH_PKT_INGRESS) {
581 		printf("%s: called on EGRESS packet!\n", __func__);
582 		return (-1);
583 	}
584 
585 	/*
586 	 * First, validate that the mbuf we have is long enough
587 	 * to have an IPv4 header in it.
588 	 */
589 	if (m->m_pkthdr.len < (sizeof(struct ip))) {
590 		printf("%s: short mbuf pkthdr\n", __func__);
591 		return (-1);
592 	}
593 	if (m->m_len < (sizeof(struct ip))) {
594 		printf("%s: short mbuf len\n", __func__);
595 		return (-1);
596 	}
597 
598 	/* Ok, let's dereference that */
599 	ip = mtod(m, struct ip *);
600 	proto = ip->ip_p;
601 	iphlen = ip->ip_hl << 2;
602 
603 	/*
604 	 * If this is a fragment then it shouldn't be four-tuple
605 	 * hashed just yet.  Once it's reassembled into a full
606 	 * frame it should be re-hashed.
607 	 */
608 	if (ip->ip_off & htons(IP_MF | IP_OFFMASK))
609 		is_frag = 1;
610 
611 	/*
612 	 * If the mbuf flowid/flowtype matches the packet type,
613 	 * and we don't support the 4-tuple version of the given protocol,
614 	 * then signal to the owner that it can trust the flowid/flowtype
615 	 * details.
616 	 *
617 	 * This is a little picky - eg, if TCPv4 / UDPv4 hashing
618 	 * is supported but we got a TCP/UDP frame only 2-tuple hashed,
619 	 * then we shouldn't just "trust" the 2-tuple hash.  We need
620 	 * a 4-tuple hash.
621 	 */
622 	flowid = m->m_pkthdr.flowid;
623 	flowtype = M_HASHTYPE_GET(m);
624 
625 	if (flowtype != M_HASHTYPE_NONE) {
626 		switch (proto) {
627 		case IPPROTO_UDP:
628 			if ((rss_gethashconfig_local() & RSS_HASHTYPE_RSS_UDP_IPV4) &&
629 			    (flowtype == M_HASHTYPE_RSS_UDP_IPV4) &&
630 			    (is_frag == 0)) {
631 				return (1);
632 			}
633 			/*
634 			 * Only allow 2-tuple for UDP frames if we don't also
635 			 * support 4-tuple for UDP.
636 			 */
637 			if ((rss_gethashconfig_local() & RSS_HASHTYPE_RSS_IPV4) &&
638 			    ((rss_gethashconfig_local() & RSS_HASHTYPE_RSS_UDP_IPV4) == 0) &&
639 			    flowtype == M_HASHTYPE_RSS_IPV4) {
640 				return (1);
641 			}
642 			break;
643 		case IPPROTO_TCP:
644 			if ((rss_gethashconfig_local() & RSS_HASHTYPE_RSS_TCP_IPV4) &&
645 			    (flowtype == M_HASHTYPE_RSS_TCP_IPV4) &&
646 			    (is_frag == 0)) {
647 				return (1);
648 			}
649 			/*
650 			 * Only allow 2-tuple for TCP frames if we don't also
651 			 * support 2-tuple for TCP.
652 			 */
653 			if ((rss_gethashconfig_local() & RSS_HASHTYPE_RSS_IPV4) &&
654 			    ((rss_gethashconfig_local() & RSS_HASHTYPE_RSS_TCP_IPV4) == 0) &&
655 			    flowtype == M_HASHTYPE_RSS_IPV4) {
656 				return (1);
657 			}
658 			break;
659 		default:
660 			if ((rss_gethashconfig_local() & RSS_HASHTYPE_RSS_IPV4) &&
661 			    flowtype == M_HASHTYPE_RSS_IPV4) {
662 				return (1);
663 			}
664 			break;
665 		}
666 	}
667 
668 	/*
669 	 * Decode enough information to make a hash decision.
670 	 *
671 	 * XXX TODO: does the hardware hash on 4-tuple if IP
672 	 *    options are present?
673 	 */
674 	if ((rss_gethashconfig_local() & RSS_HASHTYPE_RSS_TCP_IPV4) &&
675 	    (proto == IPPROTO_TCP) &&
676 	    (is_frag == 0)) {
677 		if (m->m_len < iphlen + sizeof(struct tcphdr)) {
678 			printf("%s: short TCP frame?\n", __func__);
679 			return (-1);
680 		}
681 		th = (struct tcphdr *)((caddr_t)ip + iphlen);
682 		return rss_proto_software_hash_v4(ip->ip_src, ip->ip_dst,
683 		    th->th_sport,
684 		    th->th_dport,
685 		    proto,
686 		    hashval,
687 		    hashtype);
688 	} else if ((rss_gethashconfig_local() & RSS_HASHTYPE_RSS_UDP_IPV4) &&
689 	    (proto == IPPROTO_UDP) &&
690 	    (is_frag == 0)) {
691 		uh = (struct udphdr *)((caddr_t)ip + iphlen);
692 		if (m->m_len < iphlen + sizeof(struct udphdr)) {
693 			printf("%s: short UDP frame?\n", __func__);
694 			return (-1);
695 		}
696 		return rss_proto_software_hash_v4(ip->ip_src, ip->ip_dst,
697 		    uh->uh_sport,
698 		    uh->uh_dport,
699 		    proto,
700 		    hashval,
701 		    hashtype);
702 	} else if (rss_gethashconfig_local() & RSS_HASHTYPE_RSS_IPV4) {
703 		/* Default to 2-tuple hash */
704 		return rss_proto_software_hash_v4(ip->ip_src, ip->ip_dst,
705 		    0,	/* source port */
706 		    0,	/* destination port */
707 		    0,	/* IPPROTO_IP */
708 		    hashval,
709 		    hashtype);
710 	} else {
711 		printf("%s: no available hashtypes!\n", __func__);
712 		return (-1);
713 	}
714 }
715 
716 /*
717  * Similar to rss_m2cpuid, but designed to be used by the IP NETISR
718  * on incoming frames.
719  *
720  * If an existing RSS hash exists and it matches what the configured
721  * hashing is, then use it.
722  *
723  * If there's an existing RSS hash but the desired hash is different,
724  * or if there's no useful RSS hash, then calculate it via
725  * the software path.
726  *
727  * XXX TODO: definitely want statistics here!
728  */
729 struct mbuf *
730 rss_soft_m2cpuid(struct mbuf *m, uintptr_t source, u_int *cpuid)
731 {
732 	uint32_t hash_val, hash_type;
733 	int ret;
734 
735 	M_ASSERTPKTHDR(m);
736 
737 	ret = rss_mbuf_software_hash_v4(m, RSS_HASH_PKT_INGRESS,
738 	    &hash_val, &hash_type);
739 	if (ret > 0) {
740 		/* mbuf has a valid hash already; don't need to modify it */
741 		*cpuid = rss_hash2cpuid(m->m_pkthdr.flowid, M_HASHTYPE_GET(m));
742 	} else if (ret == 0) {
743 		/* hash was done; update */
744 		m->m_pkthdr.flowid = hash_val;
745 		M_HASHTYPE_SET(m, hash_type);
746 		*cpuid = rss_hash2cpuid(m->m_pkthdr.flowid, M_HASHTYPE_GET(m));
747 	} else { /* ret < 0 */
748 		/* no hash was done */
749 		*cpuid = NETISR_CPUID_NONE;
750 	}
751 	return (m);
752 }
753 
754 /*
755  * Query the RSS hash algorithm.
756  */
757 u_int
758 rss_gethashalgo(void)
759 {
760 
761 	return (rss_hashalgo);
762 }
763 
764 /*
765  * Query the current RSS key; likely to be used by device drivers when
766  * configuring hardware RSS.  Caller must pass an array of size RSS_KEYSIZE.
767  *
768  * XXXRW: Perhaps we should do the accept-a-length-and-truncate thing?
769  */
770 void
771 rss_getkey(uint8_t *key)
772 {
773 
774 	bcopy(rss_key, key, sizeof(rss_key));
775 }
776 
777 /*
778  * Query the number of buckets; this may be used by both network device
779  * drivers, which will need to populate hardware shadows of the software
780  * indirection table, and the network stack itself (such as when deciding how
781  * many connection groups to allocate).
782  */
783 u_int
784 rss_getnumbuckets(void)
785 {
786 
787 	return (rss_buckets);
788 }
789 
790 /*
791  * Query the number of CPUs in use by RSS; may be useful to device drivers
792  * trying to figure out how to map a larger number of CPUs into a smaller
793  * number of receive queues.
794  */
795 u_int
796 rss_getnumcpus(void)
797 {
798 
799 	return (rss_ncpus);
800 }
801 
802 static inline u_int
803 rss_gethashconfig_local(void)
804 {
805 
806 	/* Return 4-tuple for TCP; 2-tuple for others */
807 	/*
808 	 * UDP may fragment more often than TCP and thus we'll end up with
809 	 * NICs returning 2-tuple fragments.
810 	 * udp_init() and udplite_init() both currently initialise things
811 	 * as 2-tuple.
812 	 * So for now disable UDP 4-tuple hashing until all of the other
813 	 * pieces are in place.
814 	 */
815 	return (
816 	    RSS_HASHTYPE_RSS_IPV4
817 	|    RSS_HASHTYPE_RSS_TCP_IPV4
818 	|    RSS_HASHTYPE_RSS_IPV6
819 	|    RSS_HASHTYPE_RSS_TCP_IPV6
820 	|    RSS_HASHTYPE_RSS_IPV6_EX
821 	|    RSS_HASHTYPE_RSS_TCP_IPV6_EX
822 #if 0
823 	|    RSS_HASHTYPE_RSS_UDP_IPV4
824 	|    RSS_HASHTYPE_RSS_UDP_IPV4_EX
825 	|    RSS_HASHTYPE_RSS_UDP_IPV6
826 	|    RSS_HASHTYPE_RSS_UDP_IPV6_EX
827 #endif
828 	);
829 }
830 
831 /*
832  * Return the supported RSS hash configuration.
833  *
834  * NICs should query this to determine what to configure in their redirection
835  * matching table.
836  */
837 u_int
838 rss_gethashconfig(void)
839 {
840 
841 	return (rss_gethashconfig_local());
842 }
843 
844 /*
845  * XXXRW: Confirm that sysctl -a won't dump this keying material, don't want
846  * it appearing in debugging output unnecessarily.
847  */
848 static int
849 sysctl_rss_key(SYSCTL_HANDLER_ARGS)
850 {
851 	uint8_t temp_rss_key[RSS_KEYSIZE];
852 	int error;
853 
854 	error = priv_check(req->td, PRIV_NETINET_HASHKEY);
855 	if (error)
856 		return (error);
857 
858 	bcopy(rss_key, temp_rss_key, sizeof(temp_rss_key));
859 	error = sysctl_handle_opaque(oidp, temp_rss_key,
860 	    sizeof(temp_rss_key), req);
861 	if (error)
862 		return (error);
863 	if (req->newptr != NULL) {
864 		/* XXXRW: Not yet. */
865 		return (EINVAL);
866 	}
867 	return (0);
868 }
869 SYSCTL_PROC(_net_inet_rss, OID_AUTO, key,
870     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_rss_key,
871     "", "RSS keying material");
872 
873 static int
874 sysctl_rss_bucket_mapping(SYSCTL_HANDLER_ARGS)
875 {
876 	struct sbuf *sb;
877 	int error;
878 	int i;
879 
880 	error = 0;
881 	error = sysctl_wire_old_buffer(req, 0);
882 	if (error != 0)
883 		return (error);
884 	sb = sbuf_new_for_sysctl(NULL, NULL, 512, req);
885 	if (sb == NULL)
886 		return (ENOMEM);
887 	for (i = 0; i < rss_buckets; i++) {
888 		sbuf_printf(sb, "%s%d:%d", i == 0 ? "" : " ",
889 		    i,
890 		    rss_getcpu(i));
891 	}
892 	error = sbuf_finish(sb);
893 	sbuf_delete(sb);
894 
895 	return (error);
896 }
897 SYSCTL_PROC(_net_inet_rss, OID_AUTO, bucket_mapping,
898     CTLTYPE_STRING | CTLFLAG_RD, NULL, 0,
899     sysctl_rss_bucket_mapping, "", "RSS bucket -> CPU mapping");
900