1 /*- 2 * Copyright (c) 2010-2011 Juniper Networks, Inc. 3 * All rights reserved. 4 * 5 * This software was developed by Robert N. M. Watson under contract 6 * to Juniper Networks, Inc. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 */ 29 30 #include <sys/cdefs.h> 31 32 __FBSDID("$FreeBSD$"); 33 34 #include "opt_inet6.h" 35 #include "opt_pcbgroup.h" 36 37 #ifndef PCBGROUP 38 #error "options RSS depends on options PCBGROUP" 39 #endif 40 41 #include <sys/param.h> 42 #include <sys/mbuf.h> 43 #include <sys/socket.h> 44 #include <sys/priv.h> 45 #include <sys/kernel.h> 46 #include <sys/smp.h> 47 #include <sys/sysctl.h> 48 #include <sys/sbuf.h> 49 50 #include <net/if.h> 51 #include <net/if_var.h> 52 #include <net/netisr.h> 53 54 #include <netinet/in.h> 55 #include <netinet/in_pcb.h> 56 #include <netinet/in_rss.h> 57 #include <netinet/in_var.h> 58 #include <netinet/toeplitz.h> 59 60 /*- 61 * Operating system parts of receiver-side scaling (RSS), which allows 62 * network cards to direct flows to particular receive queues based on hashes 63 * of header tuples. This implementation aligns RSS buckets with connection 64 * groups at the TCP/IP layer, so each bucket is associated with exactly one 65 * group. As a result, the group lookup structures (and lock) should have an 66 * effective affinity with exactly one CPU. 67 * 68 * Network device drivers needing to configure RSS will query this framework 69 * for parameters, such as the current RSS key, hashing policies, number of 70 * bits, and indirection table mapping hashes to buckets and CPUs. They may 71 * provide their own supplementary information, such as queue<->CPU bindings. 72 * It is the responsibility of the network device driver to inject packets 73 * into the stack on as close to the right CPU as possible, if playing by RSS 74 * rules. 75 * 76 * TODO: 77 * 78 * - Synchronization for rss_key and other future-configurable parameters. 79 * - Event handler drivers can register to pick up RSS configuration changes. 80 * - Should we allow rss_basecpu to be configured? 81 * - Randomize key on boot. 82 * - IPv6 support. 83 * - Statistics on how often there's a misalignment between hardware 84 * placement and pcbgroup expectations. 85 */ 86 87 SYSCTL_NODE(_net_inet, OID_AUTO, rss, CTLFLAG_RW, 0, "Receive-side steering"); 88 89 /* 90 * Toeplitz is the only required hash function in the RSS spec, so use it by 91 * default. 92 */ 93 static u_int rss_hashalgo = RSS_HASH_TOEPLITZ; 94 SYSCTL_INT(_net_inet_rss, OID_AUTO, hashalgo, CTLFLAG_RD, &rss_hashalgo, 0, 95 "RSS hash algorithm"); 96 TUNABLE_INT("net.inet.rss.hashalgo", &rss_hashalgo); 97 98 /* 99 * Size of the indirection table; at most 128 entries per the RSS spec. We 100 * size it to at least 2 times the number of CPUs by default to allow useful 101 * rebalancing. If not set explicitly with a loader tunable, we tune based 102 * on the number of CPUs present. 103 * 104 * XXXRW: buckets might be better to use for the tunable than bits. 105 */ 106 static u_int rss_bits; 107 SYSCTL_INT(_net_inet_rss, OID_AUTO, bits, CTLFLAG_RD, &rss_bits, 0, 108 "RSS bits"); 109 TUNABLE_INT("net.inet.rss.bits", &rss_bits); 110 111 static u_int rss_mask; 112 SYSCTL_INT(_net_inet_rss, OID_AUTO, mask, CTLFLAG_RD, &rss_mask, 0, 113 "RSS mask"); 114 115 static const u_int rss_maxbits = RSS_MAXBITS; 116 SYSCTL_INT(_net_inet_rss, OID_AUTO, maxbits, CTLFLAG_RD, 117 __DECONST(int *, &rss_maxbits), 0, "RSS maximum bits"); 118 119 /* 120 * RSS's own count of the number of CPUs it could be using for processing. 121 * Bounded to 64 by RSS constants. 122 */ 123 static u_int rss_ncpus; 124 SYSCTL_INT(_net_inet_rss, OID_AUTO, ncpus, CTLFLAG_RD, &rss_ncpus, 0, 125 "Number of CPUs available to RSS"); 126 127 #define RSS_MAXCPUS (1 << (RSS_MAXBITS - 1)) 128 static const u_int rss_maxcpus = RSS_MAXCPUS; 129 SYSCTL_INT(_net_inet_rss, OID_AUTO, maxcpus, CTLFLAG_RD, 130 __DECONST(int *, &rss_maxcpus), 0, "RSS maximum CPUs that can be used"); 131 132 /* 133 * Variable exists just for reporting rss_bits in a user-friendly way. 134 */ 135 static u_int rss_buckets; 136 SYSCTL_INT(_net_inet_rss, OID_AUTO, buckets, CTLFLAG_RD, &rss_buckets, 0, 137 "RSS buckets"); 138 139 /* 140 * Base CPU number; devices will add this to all CPU numbers returned by the 141 * RSS indirection table. Currently unmodifable in FreeBSD. 142 */ 143 static const u_int rss_basecpu; 144 SYSCTL_INT(_net_inet_rss, OID_AUTO, basecpu, CTLFLAG_RD, 145 __DECONST(int *, &rss_basecpu), 0, "RSS base CPU"); 146 147 /* 148 * RSS secret key, intended to prevent attacks on load-balancing. Its 149 * effectiveness may be limited by algorithm choice and available entropy 150 * during the boot. 151 * 152 * XXXRW: And that we don't randomize it yet! 153 * 154 * XXXRW: This default is actually the default key from Chelsio T3 cards, as 155 * it offers reasonable distribution, unlike all-0 keys which always 156 * generate a hash of 0 (upsettingly). 157 */ 158 static uint8_t rss_key[RSS_KEYSIZE] = { 159 0x43, 0xa3, 0x8f, 0xb0, 0x41, 0x67, 0x25, 0x3d, 160 0x25, 0x5b, 0x0e, 0xc2, 0x6d, 0x5a, 0x56, 0xda, 161 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 162 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 163 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 164 }; 165 166 /* 167 * RSS hash->CPU table, which maps hashed packet headers to particular CPUs. 168 * Drivers may supplement this table with a seperate CPU<->queue table when 169 * programming devices. 170 */ 171 struct rss_table_entry { 172 uint8_t rte_cpu; /* CPU affinity of bucket. */ 173 }; 174 static struct rss_table_entry rss_table[RSS_TABLE_MAXLEN]; 175 176 static void 177 rss_init(__unused void *arg) 178 { 179 u_int i; 180 u_int cpuid; 181 182 /* 183 * Validate tunables, coerce to sensible values. 184 */ 185 switch (rss_hashalgo) { 186 case RSS_HASH_TOEPLITZ: 187 case RSS_HASH_NAIVE: 188 break; 189 190 default: 191 printf("%s: invalid RSS hashalgo %u, coercing to %u", 192 __func__, rss_hashalgo, RSS_HASH_TOEPLITZ); 193 rss_hashalgo = RSS_HASH_TOEPLITZ; 194 } 195 196 /* 197 * Count available CPUs. 198 * 199 * XXXRW: Note incorrect assumptions regarding contiguity of this set 200 * elsewhere. 201 */ 202 rss_ncpus = 0; 203 for (i = 0; i <= mp_maxid; i++) { 204 if (CPU_ABSENT(i)) 205 continue; 206 rss_ncpus++; 207 } 208 if (rss_ncpus > RSS_MAXCPUS) 209 rss_ncpus = RSS_MAXCPUS; 210 211 /* 212 * Tune RSS table entries to be no less than 2x the number of CPUs 213 * -- unless we're running uniprocessor, in which case there's not 214 * much point in having buckets to rearrange for load-balancing! 215 */ 216 if (rss_ncpus > 1) { 217 if (rss_bits == 0) 218 rss_bits = fls(rss_ncpus - 1) + 1; 219 220 /* 221 * Microsoft limits RSS table entries to 128, so apply that 222 * limit to both auto-detected CPU counts and user-configured 223 * ones. 224 */ 225 if (rss_bits == 0 || rss_bits > RSS_MAXBITS) { 226 printf("%s: RSS bits %u not valid, coercing to %u", 227 __func__, rss_bits, RSS_MAXBITS); 228 rss_bits = RSS_MAXBITS; 229 } 230 231 /* 232 * Figure out how many buckets to use; warn if less than the 233 * number of configured CPUs, although this is not a fatal 234 * problem. 235 */ 236 rss_buckets = (1 << rss_bits); 237 if (rss_buckets < rss_ncpus) 238 printf("%s: WARNING: rss_buckets (%u) less than " 239 "rss_ncpus (%u)\n", __func__, rss_buckets, 240 rss_ncpus); 241 rss_mask = rss_buckets - 1; 242 } else { 243 rss_bits = 0; 244 rss_buckets = 1; 245 rss_mask = 0; 246 } 247 248 /* 249 * Set up initial CPU assignments: round-robin by default. 250 */ 251 cpuid = CPU_FIRST(); 252 for (i = 0; i < rss_buckets; i++) { 253 rss_table[i].rte_cpu = cpuid; 254 cpuid = CPU_NEXT(cpuid); 255 } 256 257 /* 258 * Randomize rrs_key. 259 * 260 * XXXRW: Not yet. If nothing else, will require an rss_isbadkey() 261 * loop to check for "bad" RSS keys. 262 */ 263 } 264 SYSINIT(rss_init, SI_SUB_SOFTINTR, SI_ORDER_SECOND, rss_init, NULL); 265 266 static uint32_t 267 rss_naive_hash(u_int keylen, const uint8_t *key, u_int datalen, 268 const uint8_t *data) 269 { 270 uint32_t v; 271 u_int i; 272 273 v = 0; 274 for (i = 0; i < keylen; i++) 275 v += key[i]; 276 for (i = 0; i < datalen; i++) 277 v += data[i]; 278 return (v); 279 } 280 281 static uint32_t 282 rss_hash(u_int datalen, const uint8_t *data) 283 { 284 285 switch (rss_hashalgo) { 286 case RSS_HASH_TOEPLITZ: 287 return (toeplitz_hash(sizeof(rss_key), rss_key, datalen, 288 data)); 289 290 case RSS_HASH_NAIVE: 291 return (rss_naive_hash(sizeof(rss_key), rss_key, datalen, 292 data)); 293 294 default: 295 panic("%s: unsupported/unknown hashalgo %d", __func__, 296 rss_hashalgo); 297 } 298 } 299 300 /* 301 * Hash an IPv4 2-tuple. 302 */ 303 uint32_t 304 rss_hash_ip4_2tuple(struct in_addr src, struct in_addr dst) 305 { 306 uint8_t data[sizeof(src) + sizeof(dst)]; 307 u_int datalen; 308 309 datalen = 0; 310 bcopy(&src, &data[datalen], sizeof(src)); 311 datalen += sizeof(src); 312 bcopy(&dst, &data[datalen], sizeof(dst)); 313 datalen += sizeof(dst); 314 return (rss_hash(datalen, data)); 315 } 316 317 /* 318 * Hash an IPv4 4-tuple. 319 */ 320 uint32_t 321 rss_hash_ip4_4tuple(struct in_addr src, u_short srcport, struct in_addr dst, 322 u_short dstport) 323 { 324 uint8_t data[sizeof(src) + sizeof(dst) + sizeof(srcport) + 325 sizeof(dstport)]; 326 u_int datalen; 327 328 datalen = 0; 329 bcopy(&src, &data[datalen], sizeof(src)); 330 datalen += sizeof(src); 331 bcopy(&dst, &data[datalen], sizeof(dst)); 332 datalen += sizeof(dst); 333 bcopy(&srcport, &data[datalen], sizeof(srcport)); 334 datalen += sizeof(srcport); 335 bcopy(&dstport, &data[datalen], sizeof(dstport)); 336 datalen += sizeof(dstport); 337 return (rss_hash(datalen, data)); 338 } 339 340 #ifdef INET6 341 /* 342 * Hash an IPv6 2-tuple. 343 */ 344 uint32_t 345 rss_hash_ip6_2tuple(struct in6_addr src, struct in6_addr dst) 346 { 347 uint8_t data[sizeof(src) + sizeof(dst)]; 348 u_int datalen; 349 350 datalen = 0; 351 bcopy(&src, &data[datalen], sizeof(src)); 352 datalen += sizeof(src); 353 bcopy(&dst, &data[datalen], sizeof(dst)); 354 datalen += sizeof(dst); 355 return (rss_hash(datalen, data)); 356 } 357 358 /* 359 * Hash an IPv6 4-tuple. 360 */ 361 uint32_t 362 rss_hash_ip6_4tuple(struct in6_addr src, u_short srcport, 363 struct in6_addr dst, u_short dstport) 364 { 365 uint8_t data[sizeof(src) + sizeof(dst) + sizeof(srcport) + 366 sizeof(dstport)]; 367 u_int datalen; 368 369 datalen = 0; 370 bcopy(&src, &data[datalen], sizeof(src)); 371 datalen += sizeof(src); 372 bcopy(&dst, &data[datalen], sizeof(dst)); 373 datalen += sizeof(dst); 374 bcopy(&srcport, &data[datalen], sizeof(srcport)); 375 datalen += sizeof(srcport); 376 bcopy(&dstport, &data[datalen], sizeof(dstport)); 377 datalen += sizeof(dstport); 378 return (rss_hash(datalen, data)); 379 } 380 #endif /* INET6 */ 381 382 /* 383 * Query the number of RSS bits in use. 384 */ 385 u_int 386 rss_getbits(void) 387 { 388 389 return (rss_bits); 390 } 391 392 /* 393 * Query the RSS bucket associated with an RSS hash. 394 */ 395 u_int 396 rss_getbucket(u_int hash) 397 { 398 399 return (hash & rss_mask); 400 } 401 402 /* 403 * Query the RSS CPU associated with an RSS bucket. 404 */ 405 u_int 406 rss_getcpu(u_int bucket) 407 { 408 409 return (rss_table[bucket].rte_cpu); 410 } 411 412 /* 413 * netisr CPU affinity lookup given just the hash and hashtype. 414 */ 415 u_int 416 rss_hash2cpuid(uint32_t hash_val, uint32_t hash_type) 417 { 418 419 switch (hash_type) { 420 case M_HASHTYPE_RSS_IPV4: 421 case M_HASHTYPE_RSS_TCP_IPV4: 422 return (rss_getcpu(rss_getbucket(hash_val))); 423 default: 424 return (NETISR_CPUID_NONE); 425 } 426 } 427 428 /* 429 * Query the RSS bucket associated with the given hash value and 430 * type. 431 */ 432 int 433 rss_hash2bucket(uint32_t hash_val, uint32_t hash_type, uint32_t *bucket_id) 434 { 435 436 switch (hash_type) { 437 case M_HASHTYPE_RSS_IPV4: 438 case M_HASHTYPE_RSS_TCP_IPV4: 439 *bucket_id = rss_getbucket(hash_val); 440 return (0); 441 default: 442 return (-1); 443 } 444 } 445 446 /* 447 * netisr CPU affinity lookup routine for use by protocols. 448 */ 449 struct mbuf * 450 rss_m2cpuid(struct mbuf *m, uintptr_t source, u_int *cpuid) 451 { 452 453 M_ASSERTPKTHDR(m); 454 *cpuid = rss_hash2cpuid(m->m_pkthdr.flowid, M_HASHTYPE_GET(m)); 455 return (m); 456 } 457 458 int 459 rss_m2bucket(struct mbuf *m, uint32_t *bucket_id) 460 { 461 462 M_ASSERTPKTHDR(m); 463 464 return(rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m), 465 bucket_id)); 466 } 467 468 /* 469 * Query the RSS hash algorithm. 470 */ 471 u_int 472 rss_gethashalgo(void) 473 { 474 475 return (rss_hashalgo); 476 } 477 478 /* 479 * Query the current RSS key; likely to be used by device drivers when 480 * configuring hardware RSS. Caller must pass an array of size RSS_KEYSIZE. 481 * 482 * XXXRW: Perhaps we should do the accept-a-length-and-truncate thing? 483 */ 484 void 485 rss_getkey(uint8_t *key) 486 { 487 488 bcopy(rss_key, key, sizeof(rss_key)); 489 } 490 491 /* 492 * Query the number of buckets; this may be used by both network device 493 * drivers, which will need to populate hardware shadows of the software 494 * indirection table, and the network stack itself (such as when deciding how 495 * many connection groups to allocate). 496 */ 497 u_int 498 rss_getnumbuckets(void) 499 { 500 501 return (rss_buckets); 502 } 503 504 /* 505 * Query the number of CPUs in use by RSS; may be useful to device drivers 506 * trying to figure out how to map a larger number of CPUs into a smaller 507 * number of receive queues. 508 */ 509 u_int 510 rss_getnumcpus(void) 511 { 512 513 return (rss_ncpus); 514 } 515 516 /* 517 * XXXRW: Confirm that sysctl -a won't dump this keying material, don't want 518 * it appearing in debugging output unnecessarily. 519 */ 520 static int 521 sysctl_rss_key(SYSCTL_HANDLER_ARGS) 522 { 523 uint8_t temp_rss_key[RSS_KEYSIZE]; 524 int error; 525 526 error = priv_check(req->td, PRIV_NETINET_HASHKEY); 527 if (error) 528 return (error); 529 530 bcopy(rss_key, temp_rss_key, sizeof(temp_rss_key)); 531 error = sysctl_handle_opaque(oidp, temp_rss_key, 532 sizeof(temp_rss_key), req); 533 if (error) 534 return (error); 535 if (req->newptr != NULL) { 536 /* XXXRW: Not yet. */ 537 return (EINVAL); 538 } 539 return (0); 540 } 541 SYSCTL_PROC(_net_inet_rss, OID_AUTO, key, 542 CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_rss_key, 543 "", "RSS keying material"); 544 545 static int 546 sysctl_rss_bucket_mapping(SYSCTL_HANDLER_ARGS) 547 { 548 struct sbuf *sb; 549 int error; 550 int i; 551 552 error = 0; 553 error = sysctl_wire_old_buffer(req, 0); 554 if (error != 0) 555 return (error); 556 sb = sbuf_new_for_sysctl(NULL, NULL, 512, req); 557 if (sb == NULL) 558 return (ENOMEM); 559 for (i = 0; i < rss_buckets; i++) { 560 sbuf_printf(sb, "%s%d:%d", i == 0 ? "" : " ", 561 i, 562 rss_getcpu(i)); 563 } 564 error = sbuf_finish(sb); 565 sbuf_delete(sb); 566 567 return (error); 568 } 569 SYSCTL_PROC(_net_inet_rss, OID_AUTO, bucket_mapping, 570 CTLTYPE_STRING | CTLFLAG_RD, NULL, 0, 571 sysctl_rss_bucket_mapping, "", "RSS bucket -> CPU mapping"); 572