xref: /freebsd/sys/netinet/in_rss.c (revision 864c53ead899f7838cd2e1cca3b485a4a82f5cdc)
1 /*-
2  * Copyright (c) 2010-2011 Juniper Networks, Inc.
3  * All rights reserved.
4  *
5  * This software was developed by Robert N. M. Watson under contract
6  * to Juniper Networks, Inc.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 #include <sys/cdefs.h>
31 
32 __FBSDID("$FreeBSD$");
33 
34 #include "opt_inet6.h"
35 #include "opt_pcbgroup.h"
36 
37 #ifndef PCBGROUP
38 #error "options RSS depends on options PCBGROUP"
39 #endif
40 
41 #include <sys/param.h>
42 #include <sys/mbuf.h>
43 #include <sys/socket.h>
44 #include <sys/priv.h>
45 #include <sys/kernel.h>
46 #include <sys/smp.h>
47 #include <sys/sysctl.h>
48 #include <sys/sbuf.h>
49 
50 #include <net/if.h>
51 #include <net/if_var.h>
52 #include <net/netisr.h>
53 
54 #include <netinet/in.h>
55 #include <netinet/in_pcb.h>
56 #include <netinet/in_rss.h>
57 #include <netinet/in_var.h>
58 #include <netinet/toeplitz.h>
59 
60 /*-
61  * Operating system parts of receiver-side scaling (RSS), which allows
62  * network cards to direct flows to particular receive queues based on hashes
63  * of header tuples.  This implementation aligns RSS buckets with connection
64  * groups at the TCP/IP layer, so each bucket is associated with exactly one
65  * group.  As a result, the group lookup structures (and lock) should have an
66  * effective affinity with exactly one CPU.
67  *
68  * Network device drivers needing to configure RSS will query this framework
69  * for parameters, such as the current RSS key, hashing policies, number of
70  * bits, and indirection table mapping hashes to buckets and CPUs.  They may
71  * provide their own supplementary information, such as queue<->CPU bindings.
72  * It is the responsibility of the network device driver to inject packets
73  * into the stack on as close to the right CPU as possible, if playing by RSS
74  * rules.
75  *
76  * TODO:
77  *
78  * - Synchronization for rss_key and other future-configurable parameters.
79  * - Event handler drivers can register to pick up RSS configuration changes.
80  * - Should we allow rss_basecpu to be configured?
81  * - Randomize key on boot.
82  * - IPv6 support.
83  * - Statistics on how often there's a misalignment between hardware
84  *   placement and pcbgroup expectations.
85  */
86 
/* Root of the net.inet.rss sysctl subtree; the RSS OIDs in this file attach to it. */
SYSCTL_NODE(_net_inet, OID_AUTO, rss, CTLFLAG_RW, 0, "Receive-side steering");
88 
/*
 * Hash algorithm selected for rss_hash().  Toeplitz is the only required
 * hash function in the RSS spec, so use it by default.
 *
 * Exported read-only as net.inet.rss.hashalgo and settable at boot through
 * the loader tunable of the same name.  rss_init() coerces any value other
 * than RSS_HASH_TOEPLITZ or RSS_HASH_NAIVE back to RSS_HASH_TOEPLITZ.
 */
static u_int	rss_hashalgo = RSS_HASH_TOEPLITZ;
SYSCTL_INT(_net_inet_rss, OID_AUTO, hashalgo, CTLFLAG_RD, &rss_hashalgo, 0,
    "RSS hash algorithm");
TUNABLE_INT("net.inet.rss.hashalgo", &rss_hashalgo);
97 
/*
 * Size of the indirection table; at most 128 entries per the RSS spec.  We
 * size it to at least 2 times the number of CPUs by default to allow useful
 * rebalancing.  If not set explicitly with a loader tunable, we tune based
 * on the number of CPUs present.
 *
 * A value of 0 means "not set": on a multi-CPU system rss_init() then picks
 * fls(rss_ncpus - 1) + 1 bits, and coerces anything above RSS_MAXBITS down
 * to RSS_MAXBITS.  On a single-CPU system rss_init() forces this to 0.
 *
 * XXXRW: buckets might be better to use for the tunable than bits.
 */
static u_int	rss_bits;
SYSCTL_INT(_net_inet_rss, OID_AUTO, bits, CTLFLAG_RD, &rss_bits, 0,
    "RSS bits");
TUNABLE_INT("net.inet.rss.bits", &rss_bits);
110 
/*
 * Mask ANDed with a hash value to select a bucket (see rss_getbucket()).
 * rss_init() sets it to rss_buckets - 1, or 0 on a single-CPU system.
 */
static u_int	rss_mask;
SYSCTL_INT(_net_inet_rss, OID_AUTO, mask, CTLFLAG_RD, &rss_mask, 0,
    "RSS mask");

/*
 * Read-only export of the compile-time RSS_MAXBITS limit.  __DECONST strips
 * the const qualifier so the address can be handed to SYSCTL_INT.
 */
static const u_int	rss_maxbits = RSS_MAXBITS;
SYSCTL_INT(_net_inet_rss, OID_AUTO, maxbits, CTLFLAG_RD,
    __DECONST(int *, &rss_maxbits), 0, "RSS maximum bits");
118 
/*
 * RSS's own count of the number of CPUs it could be using for processing.
 * Bounded to 64 by RSS constants.
 *
 * rss_init() counts the non-absent CPUs in [0, mp_maxid] and clamps the
 * result to RSS_MAXCPUS.
 */
static u_int	rss_ncpus;
SYSCTL_INT(_net_inet_rss, OID_AUTO, ncpus, CTLFLAG_RD, &rss_ncpus, 0,
    "Number of CPUs available to RSS");

/*
 * Upper bound on rss_ncpus: half the number of buckets addressable with
 * RSS_MAXBITS bits, i.e. a full-size table has two buckets per CPU at this
 * limit.  Exported read-only as net.inet.rss.maxcpus.
 */
#define	RSS_MAXCPUS	(1 << (RSS_MAXBITS - 1))
static const u_int	rss_maxcpus = RSS_MAXCPUS;
SYSCTL_INT(_net_inet_rss, OID_AUTO, maxcpus, CTLFLAG_RD,
    __DECONST(int *, &rss_maxcpus), 0, "RSS maximum CPUs that can be used");
131 
/*
 * Number of buckets in use, i.e. rss_bits reported in a user-friendly way.
 * rss_init() sets it to 1 << rss_bits on multi-CPU systems and to 1
 * otherwise.  Besides the sysctl it is also the loop bound when filling and
 * dumping rss_table and the value returned by rss_getnumbuckets().
 */
static u_int	rss_buckets;
SYSCTL_INT(_net_inet_rss, OID_AUTO, buckets, CTLFLAG_RD, &rss_buckets, 0,
    "RSS buckets");
138 
/*
 * Base CPU number; devices will add this to all CPU numbers returned by the
 * RSS indirection table.  Currently unmodifiable in FreeBSD: the variable is
 * a static const with no initializer, so it is always 0.
 */
static const u_int	rss_basecpu;
SYSCTL_INT(_net_inet_rss, OID_AUTO, basecpu, CTLFLAG_RD,
    __DECONST(int *, &rss_basecpu), 0, "RSS base CPU");
146 
/*
 * RSS secret key, intended to prevent attacks on load-balancing.  Its
 * effectiveness may be limited by algorithm choice and available entropy
 * during the boot.
 *
 * XXXRW: And that we don't randomize it yet!
 *
 * XXXRW: This default is actually the default key from Chelsio T3 cards, as
 * it offers reasonable distribution, unlike all-0 keys which always
 * generate a hash of 0 (upsettingly).
 *
 * Only the first 16 bytes of the initializer are non-zero; the remaining
 * bytes are explicit zeros.  The key is passed to the hash functions by
 * rss_hash() and copied out by rss_getkey() and the net.inet.rss.key sysctl.
 */
static uint8_t	rss_key[RSS_KEYSIZE] = {
	0x43, 0xa3, 0x8f, 0xb0, 0x41, 0x67, 0x25, 0x3d,
	0x25, 0x5b, 0x0e, 0xc2, 0x6d, 0x5a, 0x56, 0xda,
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
	0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
};
165 
/*
 * RSS hash->CPU table, which maps hashed packet headers to particular CPUs.
 * Drivers may supplement this table with a separate CPU<->queue table when
 * programming devices.
 *
 * The first rss_buckets entries are filled round-robin by rss_init() and
 * read back through rss_getcpu().  Note that the CPU ID is stored in a
 * uint8_t.
 */
struct rss_table_entry {
	uint8_t		rte_cpu;	/* CPU affinity of bucket. */
};
static struct rss_table_entry	rss_table[RSS_TABLE_MAXLEN];
175 
176 static void
177 rss_init(__unused void *arg)
178 {
179 	u_int i;
180 	u_int cpuid;
181 
182 	/*
183 	 * Validate tunables, coerce to sensible values.
184 	 */
185 	switch (rss_hashalgo) {
186 	case RSS_HASH_TOEPLITZ:
187 	case RSS_HASH_NAIVE:
188 		break;
189 
190 	default:
191 		printf("%s: invalid RSS hashalgo %u, coercing to %u",
192 		    __func__, rss_hashalgo, RSS_HASH_TOEPLITZ);
193 		rss_hashalgo = RSS_HASH_TOEPLITZ;
194 	}
195 
196 	/*
197 	 * Count available CPUs.
198 	 *
199 	 * XXXRW: Note incorrect assumptions regarding contiguity of this set
200 	 * elsewhere.
201 	 */
202 	rss_ncpus = 0;
203 	for (i = 0; i <= mp_maxid; i++) {
204 		if (CPU_ABSENT(i))
205 			continue;
206 		rss_ncpus++;
207 	}
208 	if (rss_ncpus > RSS_MAXCPUS)
209 		rss_ncpus = RSS_MAXCPUS;
210 
211 	/*
212 	 * Tune RSS table entries to be no less than 2x the number of CPUs
213 	 * -- unless we're running uniprocessor, in which case there's not
214 	 * much point in having buckets to rearrange for load-balancing!
215 	 */
216 	if (rss_ncpus > 1) {
217 		if (rss_bits == 0)
218 			rss_bits = fls(rss_ncpus - 1) + 1;
219 
220 		/*
221 		 * Microsoft limits RSS table entries to 128, so apply that
222 		 * limit to both auto-detected CPU counts and user-configured
223 		 * ones.
224 		 */
225 		if (rss_bits == 0 || rss_bits > RSS_MAXBITS) {
226 			printf("%s: RSS bits %u not valid, coercing to  %u",
227 			    __func__, rss_bits, RSS_MAXBITS);
228 			rss_bits = RSS_MAXBITS;
229 		}
230 
231 		/*
232 		 * Figure out how many buckets to use; warn if less than the
233 		 * number of configured CPUs, although this is not a fatal
234 		 * problem.
235 		 */
236 		rss_buckets = (1 << rss_bits);
237 		if (rss_buckets < rss_ncpus)
238 			printf("%s: WARNING: rss_buckets (%u) less than "
239 			    "rss_ncpus (%u)\n", __func__, rss_buckets,
240 			    rss_ncpus);
241 		rss_mask = rss_buckets - 1;
242 	} else {
243 		rss_bits = 0;
244 		rss_buckets = 1;
245 		rss_mask = 0;
246 	}
247 
248 	/*
249 	 * Set up initial CPU assignments: round-robin by default.
250 	 */
251 	cpuid = CPU_FIRST();
252 	for (i = 0; i < rss_buckets; i++) {
253 		rss_table[i].rte_cpu = cpuid;
254 		cpuid = CPU_NEXT(cpuid);
255 	}
256 
257 	/*
258 	 * Randomize rrs_key.
259 	 *
260 	 * XXXRW: Not yet.  If nothing else, will require an rss_isbadkey()
261 	 * loop to check for "bad" RSS keys.
262 	 */
263 }
264 SYSINIT(rss_init, SI_SUB_SOFTINTR, SI_ORDER_SECOND, rss_init, NULL);
265 
/*
 * Trivial additive hash: the 32-bit sum of every key byte followed by every
 * data byte.  The unsigned accumulator simply wraps on overflow.
 */
static uint32_t
rss_naive_hash(u_int keylen, const uint8_t *key, u_int datalen,
    const uint8_t *data)
{
	uint32_t sum = 0;

	/* Fold in the key bytes first ... */
	while (keylen-- > 0)
		sum += *key++;
	/* ... then the packet-derived data bytes. */
	while (datalen-- > 0)
		sum += *data++;
	return (sum);
}
280 
281 static uint32_t
282 rss_hash(u_int datalen, const uint8_t *data)
283 {
284 
285 	switch (rss_hashalgo) {
286 	case RSS_HASH_TOEPLITZ:
287 		return (toeplitz_hash(sizeof(rss_key), rss_key, datalen,
288 		    data));
289 
290 	case RSS_HASH_NAIVE:
291 		return (rss_naive_hash(sizeof(rss_key), rss_key, datalen,
292 		    data));
293 
294 	default:
295 		panic("%s: unsupported/unknown hashalgo %d", __func__,
296 		    rss_hashalgo);
297 	}
298 }
299 
300 /*
301  * Hash an IPv4 2-tuple.
302  */
303 uint32_t
304 rss_hash_ip4_2tuple(struct in_addr src, struct in_addr dst)
305 {
306 	uint8_t data[sizeof(src) + sizeof(dst)];
307 	u_int datalen;
308 
309 	datalen = 0;
310 	bcopy(&src, &data[datalen], sizeof(src));
311 	datalen += sizeof(src);
312 	bcopy(&dst, &data[datalen], sizeof(dst));
313 	datalen += sizeof(dst);
314 	return (rss_hash(datalen, data));
315 }
316 
317 /*
318  * Hash an IPv4 4-tuple.
319  */
320 uint32_t
321 rss_hash_ip4_4tuple(struct in_addr src, u_short srcport, struct in_addr dst,
322     u_short dstport)
323 {
324 	uint8_t data[sizeof(src) + sizeof(dst) + sizeof(srcport) +
325 	    sizeof(dstport)];
326 	u_int datalen;
327 
328 	datalen = 0;
329 	bcopy(&src, &data[datalen], sizeof(src));
330 	datalen += sizeof(src);
331 	bcopy(&dst, &data[datalen], sizeof(dst));
332 	datalen += sizeof(dst);
333 	bcopy(&srcport, &data[datalen], sizeof(srcport));
334 	datalen += sizeof(srcport);
335 	bcopy(&dstport, &data[datalen], sizeof(dstport));
336 	datalen += sizeof(dstport);
337 	return (rss_hash(datalen, data));
338 }
339 
340 #ifdef INET6
341 /*
342  * Hash an IPv6 2-tuple.
343  */
344 uint32_t
345 rss_hash_ip6_2tuple(struct in6_addr src, struct in6_addr dst)
346 {
347 	uint8_t data[sizeof(src) + sizeof(dst)];
348 	u_int datalen;
349 
350 	datalen = 0;
351 	bcopy(&src, &data[datalen], sizeof(src));
352 	datalen += sizeof(src);
353 	bcopy(&dst, &data[datalen], sizeof(dst));
354 	datalen += sizeof(dst);
355 	return (rss_hash(datalen, data));
356 }
357 
358 /*
359  * Hash an IPv6 4-tuple.
360  */
361 uint32_t
362 rss_hash_ip6_4tuple(struct in6_addr src, u_short srcport,
363     struct in6_addr dst, u_short dstport)
364 {
365 	uint8_t data[sizeof(src) + sizeof(dst) + sizeof(srcport) +
366 	    sizeof(dstport)];
367 	u_int datalen;
368 
369 	datalen = 0;
370 	bcopy(&src, &data[datalen], sizeof(src));
371 	datalen += sizeof(src);
372 	bcopy(&dst, &data[datalen], sizeof(dst));
373 	datalen += sizeof(dst);
374 	bcopy(&srcport, &data[datalen], sizeof(srcport));
375 	datalen += sizeof(srcport);
376 	bcopy(&dstport, &data[datalen], sizeof(dstport));
377 	datalen += sizeof(dstport);
378 	return (rss_hash(datalen, data));
379 }
380 #endif /* INET6 */
381 
382 /*
383  * Query the number of RSS bits in use.
384  */
385 u_int
386 rss_getbits(void)
387 {
388 
389 	return (rss_bits);
390 }
391 
392 /*
393  * Query the RSS bucket associated with an RSS hash.
394  */
395 u_int
396 rss_getbucket(u_int hash)
397 {
398 
399 	return (hash & rss_mask);
400 }
401 
402 /*
403  * Query the RSS CPU associated with an RSS bucket.
404  */
405 u_int
406 rss_getcpu(u_int bucket)
407 {
408 
409 	return (rss_table[bucket].rte_cpu);
410 }
411 
412 /*
413  * netisr CPU affinity lookup given just the hash and hashtype.
414  */
415 u_int
416 rss_hash2cpuid(uint32_t hash_val, uint32_t hash_type)
417 {
418 
419 	switch (hash_type) {
420 	case M_HASHTYPE_RSS_IPV4:
421 	case M_HASHTYPE_RSS_TCP_IPV4:
422 		return (rss_getcpu(rss_getbucket(hash_val)));
423 	default:
424 		return (NETISR_CPUID_NONE);
425 	}
426 }
427 
428 /*
429  * Query the RSS bucket associated with the given hash value and
430  * type.
431  */
432 int
433 rss_hash2bucket(uint32_t hash_val, uint32_t hash_type, uint32_t *bucket_id)
434 {
435 
436 	switch (hash_type) {
437 	case M_HASHTYPE_RSS_IPV4:
438 	case M_HASHTYPE_RSS_TCP_IPV4:
439 		*bucket_id = rss_getbucket(hash_val);
440 		return (0);
441 	default:
442 		return (-1);
443 	}
444 }
445 
446 /*
447  * netisr CPU affinity lookup routine for use by protocols.
448  */
449 struct mbuf *
450 rss_m2cpuid(struct mbuf *m, uintptr_t source, u_int *cpuid)
451 {
452 
453 	M_ASSERTPKTHDR(m);
454 	*cpuid = rss_hash2cpuid(m->m_pkthdr.flowid, M_HASHTYPE_GET(m));
455 	return (m);
456 }
457 
458 int
459 rss_m2bucket(struct mbuf *m, uint32_t *bucket_id)
460 {
461 
462 	M_ASSERTPKTHDR(m);
463 
464 	return(rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m),
465 	    bucket_id));
466 }
467 
468 /*
469  * Query the RSS hash algorithm.
470  */
471 u_int
472 rss_gethashalgo(void)
473 {
474 
475 	return (rss_hashalgo);
476 }
477 
478 /*
479  * Query the current RSS key; likely to be used by device drivers when
480  * configuring hardware RSS.  Caller must pass an array of size RSS_KEYSIZE.
481  *
482  * XXXRW: Perhaps we should do the accept-a-length-and-truncate thing?
483  */
484 void
485 rss_getkey(uint8_t *key)
486 {
487 
488 	bcopy(rss_key, key, sizeof(rss_key));
489 }
490 
491 /*
492  * Query the number of buckets; this may be used by both network device
493  * drivers, which will need to populate hardware shadows of the software
494  * indirection table, and the network stack itself (such as when deciding how
495  * many connection groups to allocate).
496  */
497 u_int
498 rss_getnumbuckets(void)
499 {
500 
501 	return (rss_buckets);
502 }
503 
504 /*
505  * Query the number of CPUs in use by RSS; may be useful to device drivers
506  * trying to figure out how to map a larger number of CPUs into a smaller
507  * number of receive queues.
508  */
509 u_int
510 rss_getnumcpus(void)
511 {
512 
513 	return (rss_ncpus);
514 }
515 
516 /*
517  * XXXRW: Confirm that sysctl -a won't dump this keying material, don't want
518  * it appearing in debugging output unnecessarily.
519  */
520 static int
521 sysctl_rss_key(SYSCTL_HANDLER_ARGS)
522 {
523 	uint8_t temp_rss_key[RSS_KEYSIZE];
524 	int error;
525 
526 	error = priv_check(req->td, PRIV_NETINET_HASHKEY);
527 	if (error)
528 		return (error);
529 
530 	bcopy(rss_key, temp_rss_key, sizeof(temp_rss_key));
531 	error = sysctl_handle_opaque(oidp, temp_rss_key,
532 	    sizeof(temp_rss_key), req);
533 	if (error)
534 		return (error);
535 	if (req->newptr != NULL) {
536 		/* XXXRW: Not yet. */
537 		return (EINVAL);
538 	}
539 	return (0);
540 }
541 SYSCTL_PROC(_net_inet_rss, OID_AUTO, key,
542     CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_rss_key,
543     "", "RSS keying material");
544 
545 static int
546 sysctl_rss_bucket_mapping(SYSCTL_HANDLER_ARGS)
547 {
548 	struct sbuf *sb;
549 	int error;
550 	int i;
551 
552 	error = 0;
553 	error = sysctl_wire_old_buffer(req, 0);
554 	if (error != 0)
555 		return (error);
556 	sb = sbuf_new_for_sysctl(NULL, NULL, 512, req);
557 	if (sb == NULL)
558 		return (ENOMEM);
559 	for (i = 0; i < rss_buckets; i++) {
560 		sbuf_printf(sb, "%s%d:%d", i == 0 ? "" : " ",
561 		    i,
562 		    rss_getcpu(i));
563 	}
564 	error = sbuf_finish(sb);
565 	sbuf_delete(sb);
566 
567 	return (error);
568 }
569 SYSCTL_PROC(_net_inet_rss, OID_AUTO, bucket_mapping,
570     CTLTYPE_STRING | CTLFLAG_RD, NULL, 0,
571     sysctl_rss_bucket_mapping, "", "RSS bucket -> CPU mapping");
572