1 /*- 2 * Copyright (c) 2010-2011 Juniper Networks, Inc. 3 * All rights reserved. 4 * 5 * This software was developed by Robert N. M. Watson under contract 6 * to Juniper Networks, Inc. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 */ 29 30 #include <sys/cdefs.h> 31 32 __FBSDID("$FreeBSD$"); 33 34 #include "opt_inet6.h" 35 #include "opt_pcbgroup.h" 36 37 #ifndef PCBGROUP 38 #error "options RSS depends on options PCBGROUP" 39 #endif 40 41 #include <sys/param.h> 42 #include <sys/mbuf.h> 43 #include <sys/socket.h> 44 #include <sys/priv.h> 45 #include <sys/kernel.h> 46 #include <sys/smp.h> 47 #include <sys/sysctl.h> 48 #include <sys/sbuf.h> 49 50 #include <net/if.h> 51 #include <net/if_var.h> 52 #include <net/netisr.h> 53 54 #include <netinet/in.h> 55 #include <netinet/in_pcb.h> 56 #include <netinet/in_rss.h> 57 #include <netinet/in_var.h> 58 #include <netinet/toeplitz.h> 59 60 /*- 61 * Operating system parts of receiver-side scaling (RSS), which allows 62 * network cards to direct flows to particular receive queues based on hashes 63 * of header tuples. This implementation aligns RSS buckets with connection 64 * groups at the TCP/IP layer, so each bucket is associated with exactly one 65 * group. As a result, the group lookup structures (and lock) should have an 66 * effective affinity with exactly one CPU. 67 * 68 * Network device drivers needing to configure RSS will query this framework 69 * for parameters, such as the current RSS key, hashing policies, number of 70 * bits, and indirection table mapping hashes to buckets and CPUs. They may 71 * provide their own supplementary information, such as queue<->CPU bindings. 72 * It is the responsibility of the network device driver to inject packets 73 * into the stack on as close to the right CPU as possible, if playing by RSS 74 * rules. 75 * 76 * TODO: 77 * 78 * - Synchronization for rss_key and other future-configurable parameters. 79 * - Event handler drivers can register to pick up RSS configuration changes. 80 * - Should we allow rss_basecpu to be configured? 81 * - Randomize key on boot. 82 * - IPv6 support. 83 * - Statistics on how often there's a misalignment between hardware 84 * placement and pcbgroup expectations. 85 */ 86 87 SYSCTL_NODE(_net_inet, OID_AUTO, rss, CTLFLAG_RW, 0, "Receive-side steering"); 88 89 /* 90 * Toeplitz is the only required hash function in the RSS spec, so use it by 91 * default. 92 */ 93 static u_int rss_hashalgo = RSS_HASH_TOEPLITZ; 94 SYSCTL_INT(_net_inet_rss, OID_AUTO, hashalgo, CTLFLAG_RDTUN, &rss_hashalgo, 0, 95 "RSS hash algorithm"); 96 97 /* 98 * Size of the indirection table; at most 128 entries per the RSS spec. We 99 * size it to at least 2 times the number of CPUs by default to allow useful 100 * rebalancing. If not set explicitly with a loader tunable, we tune based 101 * on the number of CPUs present. 102 * 103 * XXXRW: buckets might be better to use for the tunable than bits. 104 */ 105 static u_int rss_bits; 106 SYSCTL_INT(_net_inet_rss, OID_AUTO, bits, CTLFLAG_RDTUN, &rss_bits, 0, 107 "RSS bits"); 108 109 static u_int rss_mask; 110 SYSCTL_INT(_net_inet_rss, OID_AUTO, mask, CTLFLAG_RD, &rss_mask, 0, 111 "RSS mask"); 112 113 static const u_int rss_maxbits = RSS_MAXBITS; 114 SYSCTL_INT(_net_inet_rss, OID_AUTO, maxbits, CTLFLAG_RD, 115 __DECONST(int *, &rss_maxbits), 0, "RSS maximum bits"); 116 117 /* 118 * RSS's own count of the number of CPUs it could be using for processing. 119 * Bounded to 64 by RSS constants. 120 */ 121 static u_int rss_ncpus; 122 SYSCTL_INT(_net_inet_rss, OID_AUTO, ncpus, CTLFLAG_RD, &rss_ncpus, 0, 123 "Number of CPUs available to RSS"); 124 125 #define RSS_MAXCPUS (1 << (RSS_MAXBITS - 1)) 126 static const u_int rss_maxcpus = RSS_MAXCPUS; 127 SYSCTL_INT(_net_inet_rss, OID_AUTO, maxcpus, CTLFLAG_RD, 128 __DECONST(int *, &rss_maxcpus), 0, "RSS maximum CPUs that can be used"); 129 130 /* 131 * Variable exists just for reporting rss_bits in a user-friendly way. 132 */ 133 static u_int rss_buckets; 134 SYSCTL_INT(_net_inet_rss, OID_AUTO, buckets, CTLFLAG_RD, &rss_buckets, 0, 135 "RSS buckets"); 136 137 /* 138 * Base CPU number; devices will add this to all CPU numbers returned by the 139 * RSS indirection table. Currently unmodifable in FreeBSD. 140 */ 141 static const u_int rss_basecpu; 142 SYSCTL_INT(_net_inet_rss, OID_AUTO, basecpu, CTLFLAG_RD, 143 __DECONST(int *, &rss_basecpu), 0, "RSS base CPU"); 144 145 /* 146 * RSS secret key, intended to prevent attacks on load-balancing. Its 147 * effectiveness may be limited by algorithm choice and available entropy 148 * during the boot. 149 * 150 * XXXRW: And that we don't randomize it yet! 151 * 152 * XXXRW: This default is actually the default key from Chelsio T3 cards, as 153 * it offers reasonable distribution, unlike all-0 keys which always 154 * generate a hash of 0 (upsettingly). 155 */ 156 static uint8_t rss_key[RSS_KEYSIZE] = { 157 0x43, 0xa3, 0x8f, 0xb0, 0x41, 0x67, 0x25, 0x3d, 158 0x25, 0x5b, 0x0e, 0xc2, 0x6d, 0x5a, 0x56, 0xda, 159 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 160 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 161 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 162 }; 163 164 /* 165 * RSS hash->CPU table, which maps hashed packet headers to particular CPUs. 166 * Drivers may supplement this table with a seperate CPU<->queue table when 167 * programming devices. 168 */ 169 struct rss_table_entry { 170 uint8_t rte_cpu; /* CPU affinity of bucket. */ 171 }; 172 static struct rss_table_entry rss_table[RSS_TABLE_MAXLEN]; 173 174 static void 175 rss_init(__unused void *arg) 176 { 177 u_int i; 178 u_int cpuid; 179 180 /* 181 * Validate tunables, coerce to sensible values. 182 */ 183 switch (rss_hashalgo) { 184 case RSS_HASH_TOEPLITZ: 185 case RSS_HASH_NAIVE: 186 break; 187 188 default: 189 printf("%s: invalid RSS hashalgo %u, coercing to %u", 190 __func__, rss_hashalgo, RSS_HASH_TOEPLITZ); 191 rss_hashalgo = RSS_HASH_TOEPLITZ; 192 } 193 194 /* 195 * Count available CPUs. 196 * 197 * XXXRW: Note incorrect assumptions regarding contiguity of this set 198 * elsewhere. 199 */ 200 rss_ncpus = 0; 201 for (i = 0; i <= mp_maxid; i++) { 202 if (CPU_ABSENT(i)) 203 continue; 204 rss_ncpus++; 205 } 206 if (rss_ncpus > RSS_MAXCPUS) 207 rss_ncpus = RSS_MAXCPUS; 208 209 /* 210 * Tune RSS table entries to be no less than 2x the number of CPUs 211 * -- unless we're running uniprocessor, in which case there's not 212 * much point in having buckets to rearrange for load-balancing! 213 */ 214 if (rss_ncpus > 1) { 215 if (rss_bits == 0) 216 rss_bits = fls(rss_ncpus - 1) + 1; 217 218 /* 219 * Microsoft limits RSS table entries to 128, so apply that 220 * limit to both auto-detected CPU counts and user-configured 221 * ones. 222 */ 223 if (rss_bits == 0 || rss_bits > RSS_MAXBITS) { 224 printf("%s: RSS bits %u not valid, coercing to %u", 225 __func__, rss_bits, RSS_MAXBITS); 226 rss_bits = RSS_MAXBITS; 227 } 228 229 /* 230 * Figure out how many buckets to use; warn if less than the 231 * number of configured CPUs, although this is not a fatal 232 * problem. 233 */ 234 rss_buckets = (1 << rss_bits); 235 if (rss_buckets < rss_ncpus) 236 printf("%s: WARNING: rss_buckets (%u) less than " 237 "rss_ncpus (%u)\n", __func__, rss_buckets, 238 rss_ncpus); 239 rss_mask = rss_buckets - 1; 240 } else { 241 rss_bits = 0; 242 rss_buckets = 1; 243 rss_mask = 0; 244 } 245 246 /* 247 * Set up initial CPU assignments: round-robin by default. 248 */ 249 cpuid = CPU_FIRST(); 250 for (i = 0; i < rss_buckets; i++) { 251 rss_table[i].rte_cpu = cpuid; 252 cpuid = CPU_NEXT(cpuid); 253 } 254 255 /* 256 * Randomize rrs_key. 257 * 258 * XXXRW: Not yet. If nothing else, will require an rss_isbadkey() 259 * loop to check for "bad" RSS keys. 260 */ 261 } 262 SYSINIT(rss_init, SI_SUB_SOFTINTR, SI_ORDER_SECOND, rss_init, NULL); 263 264 static uint32_t 265 rss_naive_hash(u_int keylen, const uint8_t *key, u_int datalen, 266 const uint8_t *data) 267 { 268 uint32_t v; 269 u_int i; 270 271 v = 0; 272 for (i = 0; i < keylen; i++) 273 v += key[i]; 274 for (i = 0; i < datalen; i++) 275 v += data[i]; 276 return (v); 277 } 278 279 static uint32_t 280 rss_hash(u_int datalen, const uint8_t *data) 281 { 282 283 switch (rss_hashalgo) { 284 case RSS_HASH_TOEPLITZ: 285 return (toeplitz_hash(sizeof(rss_key), rss_key, datalen, 286 data)); 287 288 case RSS_HASH_NAIVE: 289 return (rss_naive_hash(sizeof(rss_key), rss_key, datalen, 290 data)); 291 292 default: 293 panic("%s: unsupported/unknown hashalgo %d", __func__, 294 rss_hashalgo); 295 } 296 } 297 298 /* 299 * Hash an IPv4 2-tuple. 300 */ 301 uint32_t 302 rss_hash_ip4_2tuple(struct in_addr src, struct in_addr dst) 303 { 304 uint8_t data[sizeof(src) + sizeof(dst)]; 305 u_int datalen; 306 307 datalen = 0; 308 bcopy(&src, &data[datalen], sizeof(src)); 309 datalen += sizeof(src); 310 bcopy(&dst, &data[datalen], sizeof(dst)); 311 datalen += sizeof(dst); 312 return (rss_hash(datalen, data)); 313 } 314 315 /* 316 * Hash an IPv4 4-tuple. 317 */ 318 uint32_t 319 rss_hash_ip4_4tuple(struct in_addr src, u_short srcport, struct in_addr dst, 320 u_short dstport) 321 { 322 uint8_t data[sizeof(src) + sizeof(dst) + sizeof(srcport) + 323 sizeof(dstport)]; 324 u_int datalen; 325 326 datalen = 0; 327 bcopy(&src, &data[datalen], sizeof(src)); 328 datalen += sizeof(src); 329 bcopy(&dst, &data[datalen], sizeof(dst)); 330 datalen += sizeof(dst); 331 bcopy(&srcport, &data[datalen], sizeof(srcport)); 332 datalen += sizeof(srcport); 333 bcopy(&dstport, &data[datalen], sizeof(dstport)); 334 datalen += sizeof(dstport); 335 return (rss_hash(datalen, data)); 336 } 337 338 #ifdef INET6 339 /* 340 * Hash an IPv6 2-tuple. 341 */ 342 uint32_t 343 rss_hash_ip6_2tuple(struct in6_addr src, struct in6_addr dst) 344 { 345 uint8_t data[sizeof(src) + sizeof(dst)]; 346 u_int datalen; 347 348 datalen = 0; 349 bcopy(&src, &data[datalen], sizeof(src)); 350 datalen += sizeof(src); 351 bcopy(&dst, &data[datalen], sizeof(dst)); 352 datalen += sizeof(dst); 353 return (rss_hash(datalen, data)); 354 } 355 356 /* 357 * Hash an IPv6 4-tuple. 358 */ 359 uint32_t 360 rss_hash_ip6_4tuple(struct in6_addr src, u_short srcport, 361 struct in6_addr dst, u_short dstport) 362 { 363 uint8_t data[sizeof(src) + sizeof(dst) + sizeof(srcport) + 364 sizeof(dstport)]; 365 u_int datalen; 366 367 datalen = 0; 368 bcopy(&src, &data[datalen], sizeof(src)); 369 datalen += sizeof(src); 370 bcopy(&dst, &data[datalen], sizeof(dst)); 371 datalen += sizeof(dst); 372 bcopy(&srcport, &data[datalen], sizeof(srcport)); 373 datalen += sizeof(srcport); 374 bcopy(&dstport, &data[datalen], sizeof(dstport)); 375 datalen += sizeof(dstport); 376 return (rss_hash(datalen, data)); 377 } 378 #endif /* INET6 */ 379 380 /* 381 * Query the number of RSS bits in use. 382 */ 383 u_int 384 rss_getbits(void) 385 { 386 387 return (rss_bits); 388 } 389 390 /* 391 * Query the RSS bucket associated with an RSS hash. 392 */ 393 u_int 394 rss_getbucket(u_int hash) 395 { 396 397 return (hash & rss_mask); 398 } 399 400 /* 401 * Query the RSS layer bucket associated with the given 402 * entry in the RSS hash space. 403 * 404 * The RSS indirection table is 0 .. rss_buckets-1, 405 * covering the low 'rss_bits' of the total 128 slot 406 * RSS indirection table. So just mask off rss_bits and 407 * return that. 408 * 409 * NIC drivers can then iterate over the 128 slot RSS 410 * indirection table and fetch which RSS bucket to 411 * map it to. This will typically be a CPU queue 412 */ 413 u_int 414 rss_get_indirection_to_bucket(u_int index) 415 { 416 417 return (index & rss_mask); 418 } 419 420 /* 421 * Query the RSS CPU associated with an RSS bucket. 422 */ 423 u_int 424 rss_getcpu(u_int bucket) 425 { 426 427 return (rss_table[bucket].rte_cpu); 428 } 429 430 /* 431 * netisr CPU affinity lookup given just the hash and hashtype. 432 */ 433 u_int 434 rss_hash2cpuid(uint32_t hash_val, uint32_t hash_type) 435 { 436 437 switch (hash_type) { 438 case M_HASHTYPE_RSS_IPV4: 439 case M_HASHTYPE_RSS_TCP_IPV4: 440 return (rss_getcpu(rss_getbucket(hash_val))); 441 default: 442 return (NETISR_CPUID_NONE); 443 } 444 } 445 446 /* 447 * Query the RSS bucket associated with the given hash value and 448 * type. 449 */ 450 int 451 rss_hash2bucket(uint32_t hash_val, uint32_t hash_type, uint32_t *bucket_id) 452 { 453 454 switch (hash_type) { 455 case M_HASHTYPE_RSS_IPV4: 456 case M_HASHTYPE_RSS_TCP_IPV4: 457 *bucket_id = rss_getbucket(hash_val); 458 return (0); 459 default: 460 return (-1); 461 } 462 } 463 464 /* 465 * netisr CPU affinity lookup routine for use by protocols. 466 */ 467 struct mbuf * 468 rss_m2cpuid(struct mbuf *m, uintptr_t source, u_int *cpuid) 469 { 470 471 M_ASSERTPKTHDR(m); 472 *cpuid = rss_hash2cpuid(m->m_pkthdr.flowid, M_HASHTYPE_GET(m)); 473 return (m); 474 } 475 476 int 477 rss_m2bucket(struct mbuf *m, uint32_t *bucket_id) 478 { 479 480 M_ASSERTPKTHDR(m); 481 482 return(rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m), 483 bucket_id)); 484 } 485 486 /* 487 * Query the RSS hash algorithm. 488 */ 489 u_int 490 rss_gethashalgo(void) 491 { 492 493 return (rss_hashalgo); 494 } 495 496 /* 497 * Query the current RSS key; likely to be used by device drivers when 498 * configuring hardware RSS. Caller must pass an array of size RSS_KEYSIZE. 499 * 500 * XXXRW: Perhaps we should do the accept-a-length-and-truncate thing? 501 */ 502 void 503 rss_getkey(uint8_t *key) 504 { 505 506 bcopy(rss_key, key, sizeof(rss_key)); 507 } 508 509 /* 510 * Query the number of buckets; this may be used by both network device 511 * drivers, which will need to populate hardware shadows of the software 512 * indirection table, and the network stack itself (such as when deciding how 513 * many connection groups to allocate). 514 */ 515 u_int 516 rss_getnumbuckets(void) 517 { 518 519 return (rss_buckets); 520 } 521 522 /* 523 * Query the number of CPUs in use by RSS; may be useful to device drivers 524 * trying to figure out how to map a larger number of CPUs into a smaller 525 * number of receive queues. 526 */ 527 u_int 528 rss_getnumcpus(void) 529 { 530 531 return (rss_ncpus); 532 } 533 534 /* 535 * XXXRW: Confirm that sysctl -a won't dump this keying material, don't want 536 * it appearing in debugging output unnecessarily. 537 */ 538 static int 539 sysctl_rss_key(SYSCTL_HANDLER_ARGS) 540 { 541 uint8_t temp_rss_key[RSS_KEYSIZE]; 542 int error; 543 544 error = priv_check(req->td, PRIV_NETINET_HASHKEY); 545 if (error) 546 return (error); 547 548 bcopy(rss_key, temp_rss_key, sizeof(temp_rss_key)); 549 error = sysctl_handle_opaque(oidp, temp_rss_key, 550 sizeof(temp_rss_key), req); 551 if (error) 552 return (error); 553 if (req->newptr != NULL) { 554 /* XXXRW: Not yet. */ 555 return (EINVAL); 556 } 557 return (0); 558 } 559 SYSCTL_PROC(_net_inet_rss, OID_AUTO, key, 560 CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_rss_key, 561 "", "RSS keying material"); 562 563 static int 564 sysctl_rss_bucket_mapping(SYSCTL_HANDLER_ARGS) 565 { 566 struct sbuf *sb; 567 int error; 568 int i; 569 570 error = 0; 571 error = sysctl_wire_old_buffer(req, 0); 572 if (error != 0) 573 return (error); 574 sb = sbuf_new_for_sysctl(NULL, NULL, 512, req); 575 if (sb == NULL) 576 return (ENOMEM); 577 for (i = 0; i < rss_buckets; i++) { 578 sbuf_printf(sb, "%s%d:%d", i == 0 ? "" : " ", 579 i, 580 rss_getcpu(i)); 581 } 582 error = sbuf_finish(sb); 583 sbuf_delete(sb); 584 585 return (error); 586 } 587 SYSCTL_PROC(_net_inet_rss, OID_AUTO, bucket_mapping, 588 CTLTYPE_STRING | CTLFLAG_RD, NULL, 0, 589 sysctl_rss_bucket_mapping, "", "RSS bucket -> CPU mapping"); 590