1 /*- 2 * Copyright (c) 2010-2011 Juniper Networks, Inc. 3 * All rights reserved. 4 * 5 * This software was developed by Robert N. M. Watson under contract 6 * to Juniper Networks, Inc. 7 * 8 * Redistribution and use in source and binary forms, with or without 9 * modification, are permitted provided that the following conditions 10 * are met: 11 * 1. Redistributions of source code must retain the above copyright 12 * notice, this list of conditions and the following disclaimer. 13 * 2. Redistributions in binary form must reproduce the above copyright 14 * notice, this list of conditions and the following disclaimer in the 15 * documentation and/or other materials provided with the distribution. 16 * 17 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27 * SUCH DAMAGE. 28 */ 29 30 #include <sys/cdefs.h> 31 32 __FBSDID("$FreeBSD$"); 33 34 #include "opt_inet6.h" 35 #include "opt_pcbgroup.h" 36 37 #ifndef PCBGROUP 38 #error "options RSS depends on options PCBGROUP" 39 #endif 40 41 #include <sys/param.h> 42 #include <sys/mbuf.h> 43 #include <sys/socket.h> 44 #include <sys/priv.h> 45 #include <sys/kernel.h> 46 #include <sys/smp.h> 47 #include <sys/sysctl.h> 48 #include <sys/sbuf.h> 49 50 #include <net/if.h> 51 #include <net/if_var.h> 52 #include <net/netisr.h> 53 54 #include <netinet/in.h> 55 #include <netinet/in_pcb.h> 56 #include <netinet/in_rss.h> 57 #include <netinet/in_var.h> 58 #include <netinet/toeplitz.h> 59 60 /* for software rss hash support */ 61 #include <netinet/ip.h> 62 #include <netinet/tcp.h> 63 #include <netinet/udp.h> 64 65 /*- 66 * Operating system parts of receiver-side scaling (RSS), which allows 67 * network cards to direct flows to particular receive queues based on hashes 68 * of header tuples. This implementation aligns RSS buckets with connection 69 * groups at the TCP/IP layer, so each bucket is associated with exactly one 70 * group. As a result, the group lookup structures (and lock) should have an 71 * effective affinity with exactly one CPU. 72 * 73 * Network device drivers needing to configure RSS will query this framework 74 * for parameters, such as the current RSS key, hashing policies, number of 75 * bits, and indirection table mapping hashes to buckets and CPUs. They may 76 * provide their own supplementary information, such as queue<->CPU bindings. 77 * It is the responsibility of the network device driver to inject packets 78 * into the stack on as close to the right CPU as possible, if playing by RSS 79 * rules. 80 * 81 * TODO: 82 * 83 * - Synchronization for rss_key and other future-configurable parameters. 84 * - Event handler drivers can register to pick up RSS configuration changes. 85 * - Should we allow rss_basecpu to be configured? 86 * - Randomize key on boot. 87 * - IPv6 support. 88 * - Statistics on how often there's a misalignment between hardware 89 * placement and pcbgroup expectations. 90 */ 91 92 SYSCTL_NODE(_net_inet, OID_AUTO, rss, CTLFLAG_RW, 0, "Receive-side steering"); 93 94 /* 95 * Toeplitz is the only required hash function in the RSS spec, so use it by 96 * default. 97 */ 98 static u_int rss_hashalgo = RSS_HASH_TOEPLITZ; 99 SYSCTL_INT(_net_inet_rss, OID_AUTO, hashalgo, CTLFLAG_RDTUN, &rss_hashalgo, 0, 100 "RSS hash algorithm"); 101 102 /* 103 * Size of the indirection table; at most 128 entries per the RSS spec. We 104 * size it to at least 2 times the number of CPUs by default to allow useful 105 * rebalancing. If not set explicitly with a loader tunable, we tune based 106 * on the number of CPUs present. 107 * 108 * XXXRW: buckets might be better to use for the tunable than bits. 109 */ 110 static u_int rss_bits; 111 SYSCTL_INT(_net_inet_rss, OID_AUTO, bits, CTLFLAG_RDTUN, &rss_bits, 0, 112 "RSS bits"); 113 114 static u_int rss_mask; 115 SYSCTL_INT(_net_inet_rss, OID_AUTO, mask, CTLFLAG_RD, &rss_mask, 0, 116 "RSS mask"); 117 118 static const u_int rss_maxbits = RSS_MAXBITS; 119 SYSCTL_INT(_net_inet_rss, OID_AUTO, maxbits, CTLFLAG_RD, 120 __DECONST(int *, &rss_maxbits), 0, "RSS maximum bits"); 121 122 /* 123 * RSS's own count of the number of CPUs it could be using for processing. 124 * Bounded to 64 by RSS constants. 125 */ 126 static u_int rss_ncpus; 127 SYSCTL_INT(_net_inet_rss, OID_AUTO, ncpus, CTLFLAG_RD, &rss_ncpus, 0, 128 "Number of CPUs available to RSS"); 129 130 #define RSS_MAXCPUS (1 << (RSS_MAXBITS - 1)) 131 static const u_int rss_maxcpus = RSS_MAXCPUS; 132 SYSCTL_INT(_net_inet_rss, OID_AUTO, maxcpus, CTLFLAG_RD, 133 __DECONST(int *, &rss_maxcpus), 0, "RSS maximum CPUs that can be used"); 134 135 /* 136 * Variable exists just for reporting rss_bits in a user-friendly way. 137 */ 138 static u_int rss_buckets; 139 SYSCTL_INT(_net_inet_rss, OID_AUTO, buckets, CTLFLAG_RD, &rss_buckets, 0, 140 "RSS buckets"); 141 142 /* 143 * Base CPU number; devices will add this to all CPU numbers returned by the 144 * RSS indirection table. Currently unmodifable in FreeBSD. 145 */ 146 static const u_int rss_basecpu; 147 SYSCTL_INT(_net_inet_rss, OID_AUTO, basecpu, CTLFLAG_RD, 148 __DECONST(int *, &rss_basecpu), 0, "RSS base CPU"); 149 150 /* 151 * RSS secret key, intended to prevent attacks on load-balancing. Its 152 * effectiveness may be limited by algorithm choice and available entropy 153 * during the boot. 154 * 155 * XXXRW: And that we don't randomize it yet! 156 * 157 * This is the default Microsoft RSS specification key which is also 158 * the Chelsio T5 firmware default key. 159 */ 160 static uint8_t rss_key[RSS_KEYSIZE] = { 161 0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2, 162 0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0, 163 0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4, 164 0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c, 165 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa, 166 }; 167 168 /* 169 * RSS hash->CPU table, which maps hashed packet headers to particular CPUs. 170 * Drivers may supplement this table with a seperate CPU<->queue table when 171 * programming devices. 172 */ 173 struct rss_table_entry { 174 uint8_t rte_cpu; /* CPU affinity of bucket. */ 175 }; 176 static struct rss_table_entry rss_table[RSS_TABLE_MAXLEN]; 177 178 static inline u_int rss_gethashconfig_local(void); 179 180 static void 181 rss_init(__unused void *arg) 182 { 183 u_int i; 184 u_int cpuid; 185 186 /* 187 * Validate tunables, coerce to sensible values. 188 */ 189 switch (rss_hashalgo) { 190 case RSS_HASH_TOEPLITZ: 191 case RSS_HASH_NAIVE: 192 break; 193 194 default: 195 printf("%s: invalid RSS hashalgo %u, coercing to %u", 196 __func__, rss_hashalgo, RSS_HASH_TOEPLITZ); 197 rss_hashalgo = RSS_HASH_TOEPLITZ; 198 } 199 200 /* 201 * Count available CPUs. 202 * 203 * XXXRW: Note incorrect assumptions regarding contiguity of this set 204 * elsewhere. 205 */ 206 rss_ncpus = 0; 207 for (i = 0; i <= mp_maxid; i++) { 208 if (CPU_ABSENT(i)) 209 continue; 210 rss_ncpus++; 211 } 212 if (rss_ncpus > RSS_MAXCPUS) 213 rss_ncpus = RSS_MAXCPUS; 214 215 /* 216 * Tune RSS table entries to be no less than 2x the number of CPUs 217 * -- unless we're running uniprocessor, in which case there's not 218 * much point in having buckets to rearrange for load-balancing! 219 */ 220 if (rss_ncpus > 1) { 221 if (rss_bits == 0) 222 rss_bits = fls(rss_ncpus - 1) + 1; 223 224 /* 225 * Microsoft limits RSS table entries to 128, so apply that 226 * limit to both auto-detected CPU counts and user-configured 227 * ones. 228 */ 229 if (rss_bits == 0 || rss_bits > RSS_MAXBITS) { 230 printf("%s: RSS bits %u not valid, coercing to %u", 231 __func__, rss_bits, RSS_MAXBITS); 232 rss_bits = RSS_MAXBITS; 233 } 234 235 /* 236 * Figure out how many buckets to use; warn if less than the 237 * number of configured CPUs, although this is not a fatal 238 * problem. 239 */ 240 rss_buckets = (1 << rss_bits); 241 if (rss_buckets < rss_ncpus) 242 printf("%s: WARNING: rss_buckets (%u) less than " 243 "rss_ncpus (%u)\n", __func__, rss_buckets, 244 rss_ncpus); 245 rss_mask = rss_buckets - 1; 246 } else { 247 rss_bits = 0; 248 rss_buckets = 1; 249 rss_mask = 0; 250 } 251 252 /* 253 * Set up initial CPU assignments: round-robin by default. 254 */ 255 cpuid = CPU_FIRST(); 256 for (i = 0; i < rss_buckets; i++) { 257 rss_table[i].rte_cpu = cpuid; 258 cpuid = CPU_NEXT(cpuid); 259 } 260 261 /* 262 * Randomize rrs_key. 263 * 264 * XXXRW: Not yet. If nothing else, will require an rss_isbadkey() 265 * loop to check for "bad" RSS keys. 266 */ 267 } 268 SYSINIT(rss_init, SI_SUB_SOFTINTR, SI_ORDER_SECOND, rss_init, NULL); 269 270 static uint32_t 271 rss_naive_hash(u_int keylen, const uint8_t *key, u_int datalen, 272 const uint8_t *data) 273 { 274 uint32_t v; 275 u_int i; 276 277 v = 0; 278 for (i = 0; i < keylen; i++) 279 v += key[i]; 280 for (i = 0; i < datalen; i++) 281 v += data[i]; 282 return (v); 283 } 284 285 static uint32_t 286 rss_hash(u_int datalen, const uint8_t *data) 287 { 288 289 switch (rss_hashalgo) { 290 case RSS_HASH_TOEPLITZ: 291 return (toeplitz_hash(sizeof(rss_key), rss_key, datalen, 292 data)); 293 294 case RSS_HASH_NAIVE: 295 return (rss_naive_hash(sizeof(rss_key), rss_key, datalen, 296 data)); 297 298 default: 299 panic("%s: unsupported/unknown hashalgo %d", __func__, 300 rss_hashalgo); 301 } 302 } 303 304 /* 305 * Hash an IPv4 2-tuple. 306 */ 307 uint32_t 308 rss_hash_ip4_2tuple(struct in_addr src, struct in_addr dst) 309 { 310 uint8_t data[sizeof(src) + sizeof(dst)]; 311 u_int datalen; 312 313 datalen = 0; 314 bcopy(&src, &data[datalen], sizeof(src)); 315 datalen += sizeof(src); 316 bcopy(&dst, &data[datalen], sizeof(dst)); 317 datalen += sizeof(dst); 318 return (rss_hash(datalen, data)); 319 } 320 321 /* 322 * Hash an IPv4 4-tuple. 323 */ 324 uint32_t 325 rss_hash_ip4_4tuple(struct in_addr src, u_short srcport, struct in_addr dst, 326 u_short dstport) 327 { 328 uint8_t data[sizeof(src) + sizeof(dst) + sizeof(srcport) + 329 sizeof(dstport)]; 330 u_int datalen; 331 332 datalen = 0; 333 bcopy(&src, &data[datalen], sizeof(src)); 334 datalen += sizeof(src); 335 bcopy(&dst, &data[datalen], sizeof(dst)); 336 datalen += sizeof(dst); 337 bcopy(&srcport, &data[datalen], sizeof(srcport)); 338 datalen += sizeof(srcport); 339 bcopy(&dstport, &data[datalen], sizeof(dstport)); 340 datalen += sizeof(dstport); 341 return (rss_hash(datalen, data)); 342 } 343 344 #ifdef INET6 345 /* 346 * Hash an IPv6 2-tuple. 347 */ 348 uint32_t 349 rss_hash_ip6_2tuple(struct in6_addr src, struct in6_addr dst) 350 { 351 uint8_t data[sizeof(src) + sizeof(dst)]; 352 u_int datalen; 353 354 datalen = 0; 355 bcopy(&src, &data[datalen], sizeof(src)); 356 datalen += sizeof(src); 357 bcopy(&dst, &data[datalen], sizeof(dst)); 358 datalen += sizeof(dst); 359 return (rss_hash(datalen, data)); 360 } 361 362 /* 363 * Hash an IPv6 4-tuple. 364 */ 365 uint32_t 366 rss_hash_ip6_4tuple(struct in6_addr src, u_short srcport, 367 struct in6_addr dst, u_short dstport) 368 { 369 uint8_t data[sizeof(src) + sizeof(dst) + sizeof(srcport) + 370 sizeof(dstport)]; 371 u_int datalen; 372 373 datalen = 0; 374 bcopy(&src, &data[datalen], sizeof(src)); 375 datalen += sizeof(src); 376 bcopy(&dst, &data[datalen], sizeof(dst)); 377 datalen += sizeof(dst); 378 bcopy(&srcport, &data[datalen], sizeof(srcport)); 379 datalen += sizeof(srcport); 380 bcopy(&dstport, &data[datalen], sizeof(dstport)); 381 datalen += sizeof(dstport); 382 return (rss_hash(datalen, data)); 383 } 384 #endif /* INET6 */ 385 386 /* 387 * Query the number of RSS bits in use. 388 */ 389 u_int 390 rss_getbits(void) 391 { 392 393 return (rss_bits); 394 } 395 396 /* 397 * Query the RSS bucket associated with an RSS hash. 398 */ 399 u_int 400 rss_getbucket(u_int hash) 401 { 402 403 return (hash & rss_mask); 404 } 405 406 /* 407 * Query the RSS layer bucket associated with the given 408 * entry in the RSS hash space. 409 * 410 * The RSS indirection table is 0 .. rss_buckets-1, 411 * covering the low 'rss_bits' of the total 128 slot 412 * RSS indirection table. So just mask off rss_bits and 413 * return that. 414 * 415 * NIC drivers can then iterate over the 128 slot RSS 416 * indirection table and fetch which RSS bucket to 417 * map it to. This will typically be a CPU queue 418 */ 419 u_int 420 rss_get_indirection_to_bucket(u_int index) 421 { 422 423 return (index & rss_mask); 424 } 425 426 /* 427 * Query the RSS CPU associated with an RSS bucket. 428 */ 429 u_int 430 rss_getcpu(u_int bucket) 431 { 432 433 return (rss_table[bucket].rte_cpu); 434 } 435 436 /* 437 * netisr CPU affinity lookup given just the hash and hashtype. 438 */ 439 u_int 440 rss_hash2cpuid(uint32_t hash_val, uint32_t hash_type) 441 { 442 443 switch (hash_type) { 444 case M_HASHTYPE_RSS_IPV4: 445 case M_HASHTYPE_RSS_TCP_IPV4: 446 case M_HASHTYPE_RSS_UDP_IPV4: 447 case M_HASHTYPE_RSS_IPV6: 448 case M_HASHTYPE_RSS_TCP_IPV6: 449 case M_HASHTYPE_RSS_UDP_IPV6: 450 return (rss_getcpu(rss_getbucket(hash_val))); 451 default: 452 return (NETISR_CPUID_NONE); 453 } 454 } 455 456 /* 457 * Query the RSS bucket associated with the given hash value and 458 * type. 459 */ 460 int 461 rss_hash2bucket(uint32_t hash_val, uint32_t hash_type, uint32_t *bucket_id) 462 { 463 464 switch (hash_type) { 465 case M_HASHTYPE_RSS_IPV4: 466 case M_HASHTYPE_RSS_TCP_IPV4: 467 case M_HASHTYPE_RSS_UDP_IPV4: 468 case M_HASHTYPE_RSS_IPV6: 469 case M_HASHTYPE_RSS_TCP_IPV6: 470 case M_HASHTYPE_RSS_UDP_IPV6: 471 *bucket_id = rss_getbucket(hash_val); 472 return (0); 473 default: 474 return (-1); 475 } 476 } 477 478 /* 479 * netisr CPU affinity lookup routine for use by protocols. 480 */ 481 struct mbuf * 482 rss_m2cpuid(struct mbuf *m, uintptr_t source, u_int *cpuid) 483 { 484 485 M_ASSERTPKTHDR(m); 486 *cpuid = rss_hash2cpuid(m->m_pkthdr.flowid, M_HASHTYPE_GET(m)); 487 return (m); 488 } 489 490 int 491 rss_m2bucket(struct mbuf *m, uint32_t *bucket_id) 492 { 493 494 M_ASSERTPKTHDR(m); 495 496 return(rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m), 497 bucket_id)); 498 } 499 500 /* 501 * Calculate an appropriate ipv4 2-tuple or 4-tuple given the given 502 * IPv4 source/destination address, UDP or TCP source/destination ports 503 * and the protocol type. 504 * 505 * The protocol code may wish to do a software hash of the given 506 * tuple. This depends upon the currently configured RSS hash types. 507 * 508 * This assumes that the packet in question isn't a fragment. 509 * 510 * It also assumes the packet source/destination address 511 * are in "incoming" packet order (ie, source is "far" address.) 512 */ 513 int 514 rss_proto_software_hash_v4(struct in_addr s, struct in_addr d, 515 u_short sp, u_short dp, int proto, 516 uint32_t *hashval, uint32_t *hashtype) 517 { 518 uint32_t hash; 519 520 /* 521 * Next, choose the hash type depending upon the protocol 522 * identifier. 523 */ 524 if ((proto == IPPROTO_TCP) && 525 (rss_gethashconfig_local() & RSS_HASHTYPE_RSS_TCP_IPV4)) { 526 hash = rss_hash_ip4_4tuple(s, sp, d, dp); 527 *hashval = hash; 528 *hashtype = M_HASHTYPE_RSS_TCP_IPV4; 529 return (0); 530 } else if ((proto == IPPROTO_UDP) && 531 (rss_gethashconfig_local() & RSS_HASHTYPE_RSS_UDP_IPV4)) { 532 hash = rss_hash_ip4_4tuple(s, sp, d, dp); 533 *hashval = hash; 534 *hashtype = M_HASHTYPE_RSS_UDP_IPV4; 535 return (0); 536 } else if (rss_gethashconfig_local() & RSS_HASHTYPE_RSS_IPV4) { 537 /* RSS doesn't hash on other protocols like SCTP; so 2-tuple */ 538 hash = rss_hash_ip4_2tuple(s, d); 539 *hashval = hash; 540 *hashtype = M_HASHTYPE_RSS_IPV4; 541 return (0); 542 } 543 544 /* No configured available hashtypes! */ 545 printf("%s: no available hashtypes!\n", __func__); 546 return (-1); 547 } 548 549 /* 550 * Do a software calculation of the RSS for the given mbuf. 551 * 552 * This is typically used by the input path to recalculate the RSS after 553 * some form of packet processing (eg de-capsulation, IP fragment reassembly.) 554 * 555 * dir is the packet direction - RSS_HASH_PKT_INGRESS for incoming and 556 * RSS_HASH_PKT_EGRESS for outgoing. 557 * 558 * Returns 0 if a hash was done, -1 if no hash was done, +1 if 559 * the mbuf already had a valid RSS flowid. 560 * 561 * This function doesn't modify the mbuf. It's up to the caller to 562 * assign flowid/flowtype as appropriate. 563 */ 564 int 565 rss_mbuf_software_hash_v4(const struct mbuf *m, int dir, uint32_t *hashval, 566 uint32_t *hashtype) 567 { 568 const struct ip *ip; 569 const struct tcphdr *th; 570 const struct udphdr *uh; 571 uint32_t flowid; 572 uint32_t flowtype; 573 uint8_t proto; 574 int iphlen; 575 int is_frag = 0; 576 577 /* 578 * XXX For now this only handles hashing on incoming mbufs. 579 */ 580 if (dir != RSS_HASH_PKT_INGRESS) { 581 printf("%s: called on EGRESS packet!\n", __func__); 582 return (-1); 583 } 584 585 /* 586 * First, validate that the mbuf we have is long enough 587 * to have an IPv4 header in it. 588 */ 589 if (m->m_pkthdr.len < (sizeof(struct ip))) { 590 printf("%s: short mbuf pkthdr\n", __func__); 591 return (-1); 592 } 593 if (m->m_len < (sizeof(struct ip))) { 594 printf("%s: short mbuf len\n", __func__); 595 return (-1); 596 } 597 598 /* Ok, let's dereference that */ 599 ip = mtod(m, struct ip *); 600 proto = ip->ip_p; 601 iphlen = ip->ip_hl << 2; 602 603 /* 604 * If this is a fragment then it shouldn't be four-tuple 605 * hashed just yet. Once it's reassembled into a full 606 * frame it should be re-hashed. 607 */ 608 if (ip->ip_off & htons(IP_MF | IP_OFFMASK)) 609 is_frag = 1; 610 611 /* 612 * If the mbuf flowid/flowtype matches the packet type, 613 * and we don't support the 4-tuple version of the given protocol, 614 * then signal to the owner that it can trust the flowid/flowtype 615 * details. 616 * 617 * This is a little picky - eg, if TCPv4 / UDPv4 hashing 618 * is supported but we got a TCP/UDP frame only 2-tuple hashed, 619 * then we shouldn't just "trust" the 2-tuple hash. We need 620 * a 4-tuple hash. 621 */ 622 flowid = m->m_pkthdr.flowid; 623 flowtype = M_HASHTYPE_GET(m); 624 625 if (flowtype != M_HASHTYPE_NONE) { 626 switch (proto) { 627 case IPPROTO_UDP: 628 if ((rss_gethashconfig_local() & RSS_HASHTYPE_RSS_UDP_IPV4) && 629 (flowtype == M_HASHTYPE_RSS_UDP_IPV4) && 630 (is_frag == 0)) { 631 return (1); 632 } 633 /* 634 * Only allow 2-tuple for UDP frames if we don't also 635 * support 4-tuple for UDP. 636 */ 637 if ((rss_gethashconfig_local() & RSS_HASHTYPE_RSS_IPV4) && 638 ((rss_gethashconfig_local() & RSS_HASHTYPE_RSS_UDP_IPV4) == 0) && 639 flowtype == M_HASHTYPE_RSS_IPV4) { 640 return (1); 641 } 642 break; 643 case IPPROTO_TCP: 644 if ((rss_gethashconfig_local() & RSS_HASHTYPE_RSS_TCP_IPV4) && 645 (flowtype == M_HASHTYPE_RSS_TCP_IPV4) && 646 (is_frag == 0)) { 647 return (1); 648 } 649 /* 650 * Only allow 2-tuple for TCP frames if we don't also 651 * support 2-tuple for TCP. 652 */ 653 if ((rss_gethashconfig_local() & RSS_HASHTYPE_RSS_IPV4) && 654 ((rss_gethashconfig_local() & RSS_HASHTYPE_RSS_TCP_IPV4) == 0) && 655 flowtype == M_HASHTYPE_RSS_IPV4) { 656 return (1); 657 } 658 break; 659 default: 660 if ((rss_gethashconfig_local() & RSS_HASHTYPE_RSS_IPV4) && 661 flowtype == M_HASHTYPE_RSS_IPV4) { 662 return (1); 663 } 664 break; 665 } 666 } 667 668 /* 669 * Decode enough information to make a hash decision. 670 * 671 * XXX TODO: does the hardware hash on 4-tuple if IP 672 * options are present? 673 */ 674 if ((rss_gethashconfig_local() & RSS_HASHTYPE_RSS_TCP_IPV4) && 675 (proto == IPPROTO_TCP) && 676 (is_frag == 0)) { 677 if (m->m_len < iphlen + sizeof(struct tcphdr)) { 678 printf("%s: short TCP frame?\n", __func__); 679 return (-1); 680 } 681 th = (struct tcphdr *)((caddr_t)ip + iphlen); 682 return rss_proto_software_hash_v4(ip->ip_src, ip->ip_dst, 683 th->th_sport, 684 th->th_dport, 685 proto, 686 hashval, 687 hashtype); 688 } else if ((rss_gethashconfig_local() & RSS_HASHTYPE_RSS_UDP_IPV4) && 689 (proto == IPPROTO_UDP) && 690 (is_frag == 0)) { 691 uh = (struct udphdr *)((caddr_t)ip + iphlen); 692 if (m->m_len < iphlen + sizeof(struct udphdr)) { 693 printf("%s: short UDP frame?\n", __func__); 694 return (-1); 695 } 696 return rss_proto_software_hash_v4(ip->ip_src, ip->ip_dst, 697 uh->uh_sport, 698 uh->uh_dport, 699 proto, 700 hashval, 701 hashtype); 702 } else if (rss_gethashconfig_local() & RSS_HASHTYPE_RSS_IPV4) { 703 /* Default to 2-tuple hash */ 704 return rss_proto_software_hash_v4(ip->ip_src, ip->ip_dst, 705 0, /* source port */ 706 0, /* destination port */ 707 0, /* IPPROTO_IP */ 708 hashval, 709 hashtype); 710 } else { 711 printf("%s: no available hashtypes!\n", __func__); 712 return (-1); 713 } 714 } 715 716 /* 717 * Similar to rss_m2cpuid, but designed to be used by the IP NETISR 718 * on incoming frames. 719 * 720 * If an existing RSS hash exists and it matches what the configured 721 * hashing is, then use it. 722 * 723 * If there's an existing RSS hash but the desired hash is different, 724 * or if there's no useful RSS hash, then calculate it via 725 * the software path. 726 * 727 * XXX TODO: definitely want statistics here! 728 */ 729 struct mbuf * 730 rss_soft_m2cpuid(struct mbuf *m, uintptr_t source, u_int *cpuid) 731 { 732 uint32_t hash_val, hash_type; 733 int ret; 734 735 M_ASSERTPKTHDR(m); 736 737 ret = rss_mbuf_software_hash_v4(m, RSS_HASH_PKT_INGRESS, 738 &hash_val, &hash_type); 739 if (ret > 0) { 740 /* mbuf has a valid hash already; don't need to modify it */ 741 *cpuid = rss_hash2cpuid(m->m_pkthdr.flowid, M_HASHTYPE_GET(m)); 742 } else if (ret == 0) { 743 /* hash was done; update */ 744 m->m_pkthdr.flowid = hash_val; 745 M_HASHTYPE_SET(m, hash_type); 746 *cpuid = rss_hash2cpuid(m->m_pkthdr.flowid, M_HASHTYPE_GET(m)); 747 } else { /* ret < 0 */ 748 /* no hash was done */ 749 *cpuid = NETISR_CPUID_NONE; 750 } 751 return (m); 752 } 753 754 /* 755 * Query the RSS hash algorithm. 756 */ 757 u_int 758 rss_gethashalgo(void) 759 { 760 761 return (rss_hashalgo); 762 } 763 764 /* 765 * Query the current RSS key; likely to be used by device drivers when 766 * configuring hardware RSS. Caller must pass an array of size RSS_KEYSIZE. 767 * 768 * XXXRW: Perhaps we should do the accept-a-length-and-truncate thing? 769 */ 770 void 771 rss_getkey(uint8_t *key) 772 { 773 774 bcopy(rss_key, key, sizeof(rss_key)); 775 } 776 777 /* 778 * Query the number of buckets; this may be used by both network device 779 * drivers, which will need to populate hardware shadows of the software 780 * indirection table, and the network stack itself (such as when deciding how 781 * many connection groups to allocate). 782 */ 783 u_int 784 rss_getnumbuckets(void) 785 { 786 787 return (rss_buckets); 788 } 789 790 /* 791 * Query the number of CPUs in use by RSS; may be useful to device drivers 792 * trying to figure out how to map a larger number of CPUs into a smaller 793 * number of receive queues. 794 */ 795 u_int 796 rss_getnumcpus(void) 797 { 798 799 return (rss_ncpus); 800 } 801 802 static inline u_int 803 rss_gethashconfig_local(void) 804 { 805 806 /* Return 4-tuple for TCP; 2-tuple for others */ 807 /* 808 * UDP may fragment more often than TCP and thus we'll end up with 809 * NICs returning 2-tuple fragments. 810 * udp_init() and udplite_init() both currently initialise things 811 * as 2-tuple. 812 * So for now disable UDP 4-tuple hashing until all of the other 813 * pieces are in place. 814 */ 815 return ( 816 RSS_HASHTYPE_RSS_IPV4 817 | RSS_HASHTYPE_RSS_TCP_IPV4 818 | RSS_HASHTYPE_RSS_IPV6 819 | RSS_HASHTYPE_RSS_TCP_IPV6 820 | RSS_HASHTYPE_RSS_IPV6_EX 821 | RSS_HASHTYPE_RSS_TCP_IPV6_EX 822 #if 0 823 | RSS_HASHTYPE_RSS_UDP_IPV4 824 | RSS_HASHTYPE_RSS_UDP_IPV4_EX 825 | RSS_HASHTYPE_RSS_UDP_IPV6 826 | RSS_HASHTYPE_RSS_UDP_IPV6_EX 827 #endif 828 ); 829 } 830 831 /* 832 * Return the supported RSS hash configuration. 833 * 834 * NICs should query this to determine what to configure in their redirection 835 * matching table. 836 */ 837 u_int 838 rss_gethashconfig(void) 839 { 840 841 return (rss_gethashconfig_local()); 842 } 843 844 /* 845 * XXXRW: Confirm that sysctl -a won't dump this keying material, don't want 846 * it appearing in debugging output unnecessarily. 847 */ 848 static int 849 sysctl_rss_key(SYSCTL_HANDLER_ARGS) 850 { 851 uint8_t temp_rss_key[RSS_KEYSIZE]; 852 int error; 853 854 error = priv_check(req->td, PRIV_NETINET_HASHKEY); 855 if (error) 856 return (error); 857 858 bcopy(rss_key, temp_rss_key, sizeof(temp_rss_key)); 859 error = sysctl_handle_opaque(oidp, temp_rss_key, 860 sizeof(temp_rss_key), req); 861 if (error) 862 return (error); 863 if (req->newptr != NULL) { 864 /* XXXRW: Not yet. */ 865 return (EINVAL); 866 } 867 return (0); 868 } 869 SYSCTL_PROC(_net_inet_rss, OID_AUTO, key, 870 CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_rss_key, 871 "", "RSS keying material"); 872 873 static int 874 sysctl_rss_bucket_mapping(SYSCTL_HANDLER_ARGS) 875 { 876 struct sbuf *sb; 877 int error; 878 int i; 879 880 error = 0; 881 error = sysctl_wire_old_buffer(req, 0); 882 if (error != 0) 883 return (error); 884 sb = sbuf_new_for_sysctl(NULL, NULL, 512, req); 885 if (sb == NULL) 886 return (ENOMEM); 887 for (i = 0; i < rss_buckets; i++) { 888 sbuf_printf(sb, "%s%d:%d", i == 0 ? "" : " ", 889 i, 890 rss_getcpu(i)); 891 } 892 error = sbuf_finish(sb); 893 sbuf_delete(sb); 894 895 return (error); 896 } 897 SYSCTL_PROC(_net_inet_rss, OID_AUTO, bucket_mapping, 898 CTLTYPE_STRING | CTLFLAG_RD, NULL, 0, 899 sysctl_rss_bucket_mapping, "", "RSS bucket -> CPU mapping"); 900