1b2bdc62aSAdrian Chadd /*- 2b2bdc62aSAdrian Chadd * Copyright (c) 2010-2011 Juniper Networks, Inc. 3b2bdc62aSAdrian Chadd * All rights reserved. 4b2bdc62aSAdrian Chadd * 5b2bdc62aSAdrian Chadd * This software was developed by Robert N. M. Watson under contract 6b2bdc62aSAdrian Chadd * to Juniper Networks, Inc. 7b2bdc62aSAdrian Chadd * 8b2bdc62aSAdrian Chadd * Redistribution and use in source and binary forms, with or without 9b2bdc62aSAdrian Chadd * modification, are permitted provided that the following conditions 10b2bdc62aSAdrian Chadd * are met: 11b2bdc62aSAdrian Chadd * 1. Redistributions of source code must retain the above copyright 12b2bdc62aSAdrian Chadd * notice, this list of conditions and the following disclaimer. 13b2bdc62aSAdrian Chadd * 2. Redistributions in binary form must reproduce the above copyright 14b2bdc62aSAdrian Chadd * notice, this list of conditions and the following disclaimer in the 15b2bdc62aSAdrian Chadd * documentation and/or other materials provided with the distribution. 16b2bdc62aSAdrian Chadd * 17b2bdc62aSAdrian Chadd * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18b2bdc62aSAdrian Chadd * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19b2bdc62aSAdrian Chadd * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20b2bdc62aSAdrian Chadd * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21b2bdc62aSAdrian Chadd * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22b2bdc62aSAdrian Chadd * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23b2bdc62aSAdrian Chadd * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24b2bdc62aSAdrian Chadd * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25b2bdc62aSAdrian Chadd * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26b2bdc62aSAdrian Chadd * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27b2bdc62aSAdrian Chadd * SUCH DAMAGE. 28b2bdc62aSAdrian Chadd */ 29b2bdc62aSAdrian Chadd 30b2bdc62aSAdrian Chadd #include <sys/cdefs.h> 31b2bdc62aSAdrian Chadd 32b2bdc62aSAdrian Chadd __FBSDID("$FreeBSD$"); 33b2bdc62aSAdrian Chadd 34b2bdc62aSAdrian Chadd #include "opt_inet6.h" 35b2bdc62aSAdrian Chadd #include "opt_pcbgroup.h" 36b2bdc62aSAdrian Chadd 37b2bdc62aSAdrian Chadd #ifndef PCBGROUP 38b2bdc62aSAdrian Chadd #error "options RSS depends on options PCBGROUP" 39b2bdc62aSAdrian Chadd #endif 40b2bdc62aSAdrian Chadd 41b2bdc62aSAdrian Chadd #include <sys/param.h> 42b2bdc62aSAdrian Chadd #include <sys/mbuf.h> 43b2bdc62aSAdrian Chadd #include <sys/socket.h> 44b2bdc62aSAdrian Chadd #include <sys/priv.h> 45b2bdc62aSAdrian Chadd #include <sys/kernel.h> 46b2bdc62aSAdrian Chadd #include <sys/smp.h> 47b2bdc62aSAdrian Chadd #include <sys/sysctl.h> 48b2bdc62aSAdrian Chadd #include <sys/sbuf.h> 49b2bdc62aSAdrian Chadd 50b2bdc62aSAdrian Chadd #include <net/if.h> 51b2bdc62aSAdrian Chadd #include <net/if_var.h> 52b2bdc62aSAdrian Chadd #include <net/netisr.h> 53b2bdc62aSAdrian Chadd #include <net/rss_config.h> 54b2bdc62aSAdrian Chadd #include <net/toeplitz.h> 55b2bdc62aSAdrian Chadd 56b2bdc62aSAdrian Chadd #if 0 57b2bdc62aSAdrian Chadd #include <netinet/in.h> 58b2bdc62aSAdrian Chadd #include <netinet/in_pcb.h> 59b2bdc62aSAdrian Chadd #include <netinet/in_rss.h> 60b2bdc62aSAdrian Chadd #include <netinet/in_var.h> 61b2bdc62aSAdrian Chadd 62b2bdc62aSAdrian Chadd /* for software rss hash support */ 63b2bdc62aSAdrian Chadd #include <netinet/ip.h> 64b2bdc62aSAdrian Chadd #include <netinet/tcp.h> 65b2bdc62aSAdrian Chadd #include <netinet/udp.h> 66b2bdc62aSAdrian Chadd #endif 67b2bdc62aSAdrian Chadd 68b2bdc62aSAdrian Chadd /*- 69b2bdc62aSAdrian Chadd * Operating system parts of receiver-side scaling (RSS), which allows 70b2bdc62aSAdrian Chadd * network cards to direct flows to particular receive queues based on hashes 71b2bdc62aSAdrian Chadd * of header tuples. This implementation aligns RSS buckets with connection 72b2bdc62aSAdrian Chadd * groups at the TCP/IP layer, so each bucket is associated with exactly one 73b2bdc62aSAdrian Chadd * group. As a result, the group lookup structures (and lock) should have an 74b2bdc62aSAdrian Chadd * effective affinity with exactly one CPU. 75b2bdc62aSAdrian Chadd * 76b2bdc62aSAdrian Chadd * Network device drivers needing to configure RSS will query this framework 77b2bdc62aSAdrian Chadd * for parameters, such as the current RSS key, hashing policies, number of 78b2bdc62aSAdrian Chadd * bits, and indirection table mapping hashes to buckets and CPUs. They may 79b2bdc62aSAdrian Chadd * provide their own supplementary information, such as queue<->CPU bindings. 80b2bdc62aSAdrian Chadd * It is the responsibility of the network device driver to inject packets 81b2bdc62aSAdrian Chadd * into the stack on as close to the right CPU as possible, if playing by RSS 82b2bdc62aSAdrian Chadd * rules. 83b2bdc62aSAdrian Chadd * 84b2bdc62aSAdrian Chadd * TODO: 85b2bdc62aSAdrian Chadd * 86b2bdc62aSAdrian Chadd * - Synchronization for rss_key and other future-configurable parameters. 87b2bdc62aSAdrian Chadd * - Event handler drivers can register to pick up RSS configuration changes. 88b2bdc62aSAdrian Chadd * - Should we allow rss_basecpu to be configured? 89b2bdc62aSAdrian Chadd * - Randomize key on boot. 90b2bdc62aSAdrian Chadd * - IPv6 support. 91b2bdc62aSAdrian Chadd * - Statistics on how often there's a misalignment between hardware 92b2bdc62aSAdrian Chadd * placement and pcbgroup expectations. 93b2bdc62aSAdrian Chadd */ 94b2bdc62aSAdrian Chadd 95b2bdc62aSAdrian Chadd SYSCTL_DECL(_net_inet); 96b2bdc62aSAdrian Chadd SYSCTL_NODE(_net_inet, OID_AUTO, rss, CTLFLAG_RW, 0, "Receive-side steering"); 97b2bdc62aSAdrian Chadd 98b2bdc62aSAdrian Chadd /* 99b2bdc62aSAdrian Chadd * Toeplitz is the only required hash function in the RSS spec, so use it by 100b2bdc62aSAdrian Chadd * default. 101b2bdc62aSAdrian Chadd */ 102b2bdc62aSAdrian Chadd static u_int rss_hashalgo = RSS_HASH_TOEPLITZ; 103b2bdc62aSAdrian Chadd SYSCTL_INT(_net_inet_rss, OID_AUTO, hashalgo, CTLFLAG_RDTUN, &rss_hashalgo, 0, 104b2bdc62aSAdrian Chadd "RSS hash algorithm"); 105b2bdc62aSAdrian Chadd 106b2bdc62aSAdrian Chadd /* 107b2bdc62aSAdrian Chadd * Size of the indirection table; at most 128 entries per the RSS spec. We 108b2bdc62aSAdrian Chadd * size it to at least 2 times the number of CPUs by default to allow useful 109b2bdc62aSAdrian Chadd * rebalancing. If not set explicitly with a loader tunable, we tune based 110b2bdc62aSAdrian Chadd * on the number of CPUs present. 111b2bdc62aSAdrian Chadd * 112b2bdc62aSAdrian Chadd * XXXRW: buckets might be better to use for the tunable than bits. 113b2bdc62aSAdrian Chadd */ 114b2bdc62aSAdrian Chadd static u_int rss_bits; 115b2bdc62aSAdrian Chadd SYSCTL_INT(_net_inet_rss, OID_AUTO, bits, CTLFLAG_RDTUN, &rss_bits, 0, 116b2bdc62aSAdrian Chadd "RSS bits"); 117b2bdc62aSAdrian Chadd 118b2bdc62aSAdrian Chadd static u_int rss_mask; 119b2bdc62aSAdrian Chadd SYSCTL_INT(_net_inet_rss, OID_AUTO, mask, CTLFLAG_RD, &rss_mask, 0, 120b2bdc62aSAdrian Chadd "RSS mask"); 121b2bdc62aSAdrian Chadd 122b2bdc62aSAdrian Chadd static const u_int rss_maxbits = RSS_MAXBITS; 123b2bdc62aSAdrian Chadd SYSCTL_INT(_net_inet_rss, OID_AUTO, maxbits, CTLFLAG_RD, 124b2bdc62aSAdrian Chadd __DECONST(int *, &rss_maxbits), 0, "RSS maximum bits"); 125b2bdc62aSAdrian Chadd 126b2bdc62aSAdrian Chadd /* 127b2bdc62aSAdrian Chadd * RSS's own count of the number of CPUs it could be using for processing. 128b2bdc62aSAdrian Chadd * Bounded to 64 by RSS constants. 129b2bdc62aSAdrian Chadd */ 130b2bdc62aSAdrian Chadd static u_int rss_ncpus; 131b2bdc62aSAdrian Chadd SYSCTL_INT(_net_inet_rss, OID_AUTO, ncpus, CTLFLAG_RD, &rss_ncpus, 0, 132b2bdc62aSAdrian Chadd "Number of CPUs available to RSS"); 133b2bdc62aSAdrian Chadd 134b2bdc62aSAdrian Chadd #define RSS_MAXCPUS (1 << (RSS_MAXBITS - 1)) 135b2bdc62aSAdrian Chadd static const u_int rss_maxcpus = RSS_MAXCPUS; 136b2bdc62aSAdrian Chadd SYSCTL_INT(_net_inet_rss, OID_AUTO, maxcpus, CTLFLAG_RD, 137b2bdc62aSAdrian Chadd __DECONST(int *, &rss_maxcpus), 0, "RSS maximum CPUs that can be used"); 138b2bdc62aSAdrian Chadd 139b2bdc62aSAdrian Chadd /* 140b2bdc62aSAdrian Chadd * Variable exists just for reporting rss_bits in a user-friendly way. 141b2bdc62aSAdrian Chadd */ 142b2bdc62aSAdrian Chadd static u_int rss_buckets; 143b2bdc62aSAdrian Chadd SYSCTL_INT(_net_inet_rss, OID_AUTO, buckets, CTLFLAG_RD, &rss_buckets, 0, 144b2bdc62aSAdrian Chadd "RSS buckets"); 145b2bdc62aSAdrian Chadd 146b2bdc62aSAdrian Chadd /* 147b2bdc62aSAdrian Chadd * Base CPU number; devices will add this to all CPU numbers returned by the 148b2bdc62aSAdrian Chadd * RSS indirection table. Currently unmodifable in FreeBSD. 149b2bdc62aSAdrian Chadd */ 150b2bdc62aSAdrian Chadd static const u_int rss_basecpu; 151b2bdc62aSAdrian Chadd SYSCTL_INT(_net_inet_rss, OID_AUTO, basecpu, CTLFLAG_RD, 152b2bdc62aSAdrian Chadd __DECONST(int *, &rss_basecpu), 0, "RSS base CPU"); 153b2bdc62aSAdrian Chadd 154b2bdc62aSAdrian Chadd /* 155*e5562eb9SAdrian Chadd * Print verbose debugging messages. 156*e5562eb9SAdrian Chadd * 0 - disable 157*e5562eb9SAdrian Chadd * non-zero - enable 158*e5562eb9SAdrian Chadd */ 159*e5562eb9SAdrian Chadd int rss_debug = 0; 160*e5562eb9SAdrian Chadd SYSCTL_INT(_net_inet_rss, OID_AUTO, debug, CTLFLAG_RWTUN, &rss_debug, 0, 161*e5562eb9SAdrian Chadd "RSS debug level"); 162*e5562eb9SAdrian Chadd 163*e5562eb9SAdrian Chadd /* 164b2bdc62aSAdrian Chadd * RSS secret key, intended to prevent attacks on load-balancing. Its 165b2bdc62aSAdrian Chadd * effectiveness may be limited by algorithm choice and available entropy 166b2bdc62aSAdrian Chadd * during the boot. 167b2bdc62aSAdrian Chadd * 168b2bdc62aSAdrian Chadd * XXXRW: And that we don't randomize it yet! 169b2bdc62aSAdrian Chadd * 170b2bdc62aSAdrian Chadd * This is the default Microsoft RSS specification key which is also 171b2bdc62aSAdrian Chadd * the Chelsio T5 firmware default key. 172b2bdc62aSAdrian Chadd */ 173b2bdc62aSAdrian Chadd static uint8_t rss_key[RSS_KEYSIZE] = { 174b2bdc62aSAdrian Chadd 0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2, 175b2bdc62aSAdrian Chadd 0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0, 176b2bdc62aSAdrian Chadd 0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4, 177b2bdc62aSAdrian Chadd 0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c, 178b2bdc62aSAdrian Chadd 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa, 179b2bdc62aSAdrian Chadd }; 180b2bdc62aSAdrian Chadd 181b2bdc62aSAdrian Chadd /* 182b2bdc62aSAdrian Chadd * RSS hash->CPU table, which maps hashed packet headers to particular CPUs. 183b2bdc62aSAdrian Chadd * Drivers may supplement this table with a seperate CPU<->queue table when 184b2bdc62aSAdrian Chadd * programming devices. 185b2bdc62aSAdrian Chadd */ 186b2bdc62aSAdrian Chadd struct rss_table_entry { 187b2bdc62aSAdrian Chadd uint8_t rte_cpu; /* CPU affinity of bucket. */ 188b2bdc62aSAdrian Chadd }; 189b2bdc62aSAdrian Chadd static struct rss_table_entry rss_table[RSS_TABLE_MAXLEN]; 190b2bdc62aSAdrian Chadd 191b2bdc62aSAdrian Chadd static void 192b2bdc62aSAdrian Chadd rss_init(__unused void *arg) 193b2bdc62aSAdrian Chadd { 194b2bdc62aSAdrian Chadd u_int i; 195b2bdc62aSAdrian Chadd u_int cpuid; 196b2bdc62aSAdrian Chadd 197b2bdc62aSAdrian Chadd /* 198b2bdc62aSAdrian Chadd * Validate tunables, coerce to sensible values. 199b2bdc62aSAdrian Chadd */ 200b2bdc62aSAdrian Chadd switch (rss_hashalgo) { 201b2bdc62aSAdrian Chadd case RSS_HASH_TOEPLITZ: 202b2bdc62aSAdrian Chadd case RSS_HASH_NAIVE: 203b2bdc62aSAdrian Chadd break; 204b2bdc62aSAdrian Chadd 205b2bdc62aSAdrian Chadd default: 206*e5562eb9SAdrian Chadd RSS_DEBUG("invalid RSS hashalgo %u, coercing to %u\n", 207*e5562eb9SAdrian Chadd rss_hashalgo, RSS_HASH_TOEPLITZ); 208b2bdc62aSAdrian Chadd rss_hashalgo = RSS_HASH_TOEPLITZ; 209b2bdc62aSAdrian Chadd } 210b2bdc62aSAdrian Chadd 211b2bdc62aSAdrian Chadd /* 212b2bdc62aSAdrian Chadd * Count available CPUs. 213b2bdc62aSAdrian Chadd * 214b2bdc62aSAdrian Chadd * XXXRW: Note incorrect assumptions regarding contiguity of this set 215b2bdc62aSAdrian Chadd * elsewhere. 216b2bdc62aSAdrian Chadd */ 217b2bdc62aSAdrian Chadd rss_ncpus = 0; 218b2bdc62aSAdrian Chadd for (i = 0; i <= mp_maxid; i++) { 219b2bdc62aSAdrian Chadd if (CPU_ABSENT(i)) 220b2bdc62aSAdrian Chadd continue; 221b2bdc62aSAdrian Chadd rss_ncpus++; 222b2bdc62aSAdrian Chadd } 223b2bdc62aSAdrian Chadd if (rss_ncpus > RSS_MAXCPUS) 224b2bdc62aSAdrian Chadd rss_ncpus = RSS_MAXCPUS; 225b2bdc62aSAdrian Chadd 226b2bdc62aSAdrian Chadd /* 227b2bdc62aSAdrian Chadd * Tune RSS table entries to be no less than 2x the number of CPUs 228b2bdc62aSAdrian Chadd * -- unless we're running uniprocessor, in which case there's not 229b2bdc62aSAdrian Chadd * much point in having buckets to rearrange for load-balancing! 230b2bdc62aSAdrian Chadd */ 231b2bdc62aSAdrian Chadd if (rss_ncpus > 1) { 232b2bdc62aSAdrian Chadd if (rss_bits == 0) 233b2bdc62aSAdrian Chadd rss_bits = fls(rss_ncpus - 1) + 1; 234b2bdc62aSAdrian Chadd 235b2bdc62aSAdrian Chadd /* 236b2bdc62aSAdrian Chadd * Microsoft limits RSS table entries to 128, so apply that 237b2bdc62aSAdrian Chadd * limit to both auto-detected CPU counts and user-configured 238b2bdc62aSAdrian Chadd * ones. 239b2bdc62aSAdrian Chadd */ 240b2bdc62aSAdrian Chadd if (rss_bits == 0 || rss_bits > RSS_MAXBITS) { 241*e5562eb9SAdrian Chadd RSS_DEBUG("RSS bits %u not valid, coercing to %u\n", 242*e5562eb9SAdrian Chadd rss_bits, RSS_MAXBITS); 243b2bdc62aSAdrian Chadd rss_bits = RSS_MAXBITS; 244b2bdc62aSAdrian Chadd } 245b2bdc62aSAdrian Chadd 246b2bdc62aSAdrian Chadd /* 247b2bdc62aSAdrian Chadd * Figure out how many buckets to use; warn if less than the 248b2bdc62aSAdrian Chadd * number of configured CPUs, although this is not a fatal 249b2bdc62aSAdrian Chadd * problem. 250b2bdc62aSAdrian Chadd */ 251b2bdc62aSAdrian Chadd rss_buckets = (1 << rss_bits); 252b2bdc62aSAdrian Chadd if (rss_buckets < rss_ncpus) 253*e5562eb9SAdrian Chadd RSS_DEBUG("WARNING: rss_buckets (%u) less than " 254*e5562eb9SAdrian Chadd "rss_ncpus (%u)\n", rss_buckets, rss_ncpus); 255b2bdc62aSAdrian Chadd rss_mask = rss_buckets - 1; 256b2bdc62aSAdrian Chadd } else { 257b2bdc62aSAdrian Chadd rss_bits = 0; 258b2bdc62aSAdrian Chadd rss_buckets = 1; 259b2bdc62aSAdrian Chadd rss_mask = 0; 260b2bdc62aSAdrian Chadd } 261b2bdc62aSAdrian Chadd 262b2bdc62aSAdrian Chadd /* 263b2bdc62aSAdrian Chadd * Set up initial CPU assignments: round-robin by default. 264b2bdc62aSAdrian Chadd */ 265b2bdc62aSAdrian Chadd cpuid = CPU_FIRST(); 266b2bdc62aSAdrian Chadd for (i = 0; i < rss_buckets; i++) { 267b2bdc62aSAdrian Chadd rss_table[i].rte_cpu = cpuid; 268b2bdc62aSAdrian Chadd cpuid = CPU_NEXT(cpuid); 269b2bdc62aSAdrian Chadd } 270b2bdc62aSAdrian Chadd 271b2bdc62aSAdrian Chadd /* 272b2bdc62aSAdrian Chadd * Randomize rrs_key. 273b2bdc62aSAdrian Chadd * 274b2bdc62aSAdrian Chadd * XXXRW: Not yet. If nothing else, will require an rss_isbadkey() 275b2bdc62aSAdrian Chadd * loop to check for "bad" RSS keys. 276b2bdc62aSAdrian Chadd */ 277b2bdc62aSAdrian Chadd } 278b2bdc62aSAdrian Chadd SYSINIT(rss_init, SI_SUB_SOFTINTR, SI_ORDER_SECOND, rss_init, NULL); 279b2bdc62aSAdrian Chadd 280b2bdc62aSAdrian Chadd static uint32_t 281b2bdc62aSAdrian Chadd rss_naive_hash(u_int keylen, const uint8_t *key, u_int datalen, 282b2bdc62aSAdrian Chadd const uint8_t *data) 283b2bdc62aSAdrian Chadd { 284b2bdc62aSAdrian Chadd uint32_t v; 285b2bdc62aSAdrian Chadd u_int i; 286b2bdc62aSAdrian Chadd 287b2bdc62aSAdrian Chadd v = 0; 288b2bdc62aSAdrian Chadd for (i = 0; i < keylen; i++) 289b2bdc62aSAdrian Chadd v += key[i]; 290b2bdc62aSAdrian Chadd for (i = 0; i < datalen; i++) 291b2bdc62aSAdrian Chadd v += data[i]; 292b2bdc62aSAdrian Chadd return (v); 293b2bdc62aSAdrian Chadd } 294b2bdc62aSAdrian Chadd 295b2bdc62aSAdrian Chadd uint32_t 296b2bdc62aSAdrian Chadd rss_hash(u_int datalen, const uint8_t *data) 297b2bdc62aSAdrian Chadd { 298b2bdc62aSAdrian Chadd 299b2bdc62aSAdrian Chadd switch (rss_hashalgo) { 300b2bdc62aSAdrian Chadd case RSS_HASH_TOEPLITZ: 301b2bdc62aSAdrian Chadd return (toeplitz_hash(sizeof(rss_key), rss_key, datalen, 302b2bdc62aSAdrian Chadd data)); 303b2bdc62aSAdrian Chadd 304b2bdc62aSAdrian Chadd case RSS_HASH_NAIVE: 305b2bdc62aSAdrian Chadd return (rss_naive_hash(sizeof(rss_key), rss_key, datalen, 306b2bdc62aSAdrian Chadd data)); 307b2bdc62aSAdrian Chadd 308b2bdc62aSAdrian Chadd default: 309b2bdc62aSAdrian Chadd panic("%s: unsupported/unknown hashalgo %d", __func__, 310b2bdc62aSAdrian Chadd rss_hashalgo); 311b2bdc62aSAdrian Chadd } 312b2bdc62aSAdrian Chadd } 313b2bdc62aSAdrian Chadd 314b2bdc62aSAdrian Chadd /* 315b2bdc62aSAdrian Chadd * Query the number of RSS bits in use. 316b2bdc62aSAdrian Chadd */ 317b2bdc62aSAdrian Chadd u_int 318b2bdc62aSAdrian Chadd rss_getbits(void) 319b2bdc62aSAdrian Chadd { 320b2bdc62aSAdrian Chadd 321b2bdc62aSAdrian Chadd return (rss_bits); 322b2bdc62aSAdrian Chadd } 323b2bdc62aSAdrian Chadd 324b2bdc62aSAdrian Chadd /* 325b2bdc62aSAdrian Chadd * Query the RSS bucket associated with an RSS hash. 326b2bdc62aSAdrian Chadd */ 327b2bdc62aSAdrian Chadd u_int 328b2bdc62aSAdrian Chadd rss_getbucket(u_int hash) 329b2bdc62aSAdrian Chadd { 330b2bdc62aSAdrian Chadd 331b2bdc62aSAdrian Chadd return (hash & rss_mask); 332b2bdc62aSAdrian Chadd } 333b2bdc62aSAdrian Chadd 334b2bdc62aSAdrian Chadd /* 335b2bdc62aSAdrian Chadd * Query the RSS layer bucket associated with the given 336b2bdc62aSAdrian Chadd * entry in the RSS hash space. 337b2bdc62aSAdrian Chadd * 338b2bdc62aSAdrian Chadd * The RSS indirection table is 0 .. rss_buckets-1, 339b2bdc62aSAdrian Chadd * covering the low 'rss_bits' of the total 128 slot 340b2bdc62aSAdrian Chadd * RSS indirection table. So just mask off rss_bits and 341b2bdc62aSAdrian Chadd * return that. 342b2bdc62aSAdrian Chadd * 343b2bdc62aSAdrian Chadd * NIC drivers can then iterate over the 128 slot RSS 344b2bdc62aSAdrian Chadd * indirection table and fetch which RSS bucket to 345b2bdc62aSAdrian Chadd * map it to. This will typically be a CPU queue 346b2bdc62aSAdrian Chadd */ 347b2bdc62aSAdrian Chadd u_int 348b2bdc62aSAdrian Chadd rss_get_indirection_to_bucket(u_int index) 349b2bdc62aSAdrian Chadd { 350b2bdc62aSAdrian Chadd 351b2bdc62aSAdrian Chadd return (index & rss_mask); 352b2bdc62aSAdrian Chadd } 353b2bdc62aSAdrian Chadd 354b2bdc62aSAdrian Chadd /* 355b2bdc62aSAdrian Chadd * Query the RSS CPU associated with an RSS bucket. 356b2bdc62aSAdrian Chadd */ 357b2bdc62aSAdrian Chadd u_int 358b2bdc62aSAdrian Chadd rss_getcpu(u_int bucket) 359b2bdc62aSAdrian Chadd { 360b2bdc62aSAdrian Chadd 361b2bdc62aSAdrian Chadd return (rss_table[bucket].rte_cpu); 362b2bdc62aSAdrian Chadd } 363b2bdc62aSAdrian Chadd 364b2bdc62aSAdrian Chadd /* 365b2bdc62aSAdrian Chadd * netisr CPU affinity lookup given just the hash and hashtype. 366b2bdc62aSAdrian Chadd */ 367b2bdc62aSAdrian Chadd u_int 368b2bdc62aSAdrian Chadd rss_hash2cpuid(uint32_t hash_val, uint32_t hash_type) 369b2bdc62aSAdrian Chadd { 370b2bdc62aSAdrian Chadd 371b2bdc62aSAdrian Chadd switch (hash_type) { 372b2bdc62aSAdrian Chadd case M_HASHTYPE_RSS_IPV4: 373b2bdc62aSAdrian Chadd case M_HASHTYPE_RSS_TCP_IPV4: 374b2bdc62aSAdrian Chadd case M_HASHTYPE_RSS_UDP_IPV4: 375b2bdc62aSAdrian Chadd case M_HASHTYPE_RSS_IPV6: 376b2bdc62aSAdrian Chadd case M_HASHTYPE_RSS_TCP_IPV6: 377b2bdc62aSAdrian Chadd case M_HASHTYPE_RSS_UDP_IPV6: 378b2bdc62aSAdrian Chadd return (rss_getcpu(rss_getbucket(hash_val))); 379b2bdc62aSAdrian Chadd default: 380b2bdc62aSAdrian Chadd return (NETISR_CPUID_NONE); 381b2bdc62aSAdrian Chadd } 382b2bdc62aSAdrian Chadd } 383b2bdc62aSAdrian Chadd 384b2bdc62aSAdrian Chadd /* 385b2bdc62aSAdrian Chadd * Query the RSS bucket associated with the given hash value and 386b2bdc62aSAdrian Chadd * type. 387b2bdc62aSAdrian Chadd */ 388b2bdc62aSAdrian Chadd int 389b2bdc62aSAdrian Chadd rss_hash2bucket(uint32_t hash_val, uint32_t hash_type, uint32_t *bucket_id) 390b2bdc62aSAdrian Chadd { 391b2bdc62aSAdrian Chadd 392b2bdc62aSAdrian Chadd switch (hash_type) { 393b2bdc62aSAdrian Chadd case M_HASHTYPE_RSS_IPV4: 394b2bdc62aSAdrian Chadd case M_HASHTYPE_RSS_TCP_IPV4: 395b2bdc62aSAdrian Chadd case M_HASHTYPE_RSS_UDP_IPV4: 396b2bdc62aSAdrian Chadd case M_HASHTYPE_RSS_IPV6: 397b2bdc62aSAdrian Chadd case M_HASHTYPE_RSS_TCP_IPV6: 398b2bdc62aSAdrian Chadd case M_HASHTYPE_RSS_UDP_IPV6: 399b2bdc62aSAdrian Chadd *bucket_id = rss_getbucket(hash_val); 400b2bdc62aSAdrian Chadd return (0); 401b2bdc62aSAdrian Chadd default: 402b2bdc62aSAdrian Chadd return (-1); 403b2bdc62aSAdrian Chadd } 404b2bdc62aSAdrian Chadd } 405b2bdc62aSAdrian Chadd 406b2bdc62aSAdrian Chadd /* 407b2bdc62aSAdrian Chadd * netisr CPU affinity lookup routine for use by protocols. 408b2bdc62aSAdrian Chadd */ 409b2bdc62aSAdrian Chadd struct mbuf * 410b2bdc62aSAdrian Chadd rss_m2cpuid(struct mbuf *m, uintptr_t source, u_int *cpuid) 411b2bdc62aSAdrian Chadd { 412b2bdc62aSAdrian Chadd 413b2bdc62aSAdrian Chadd M_ASSERTPKTHDR(m); 414b2bdc62aSAdrian Chadd *cpuid = rss_hash2cpuid(m->m_pkthdr.flowid, M_HASHTYPE_GET(m)); 415b2bdc62aSAdrian Chadd return (m); 416b2bdc62aSAdrian Chadd } 417b2bdc62aSAdrian Chadd 418b2bdc62aSAdrian Chadd int 419b2bdc62aSAdrian Chadd rss_m2bucket(struct mbuf *m, uint32_t *bucket_id) 420b2bdc62aSAdrian Chadd { 421b2bdc62aSAdrian Chadd 422b2bdc62aSAdrian Chadd M_ASSERTPKTHDR(m); 423b2bdc62aSAdrian Chadd 424b2bdc62aSAdrian Chadd return(rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m), 425b2bdc62aSAdrian Chadd bucket_id)); 426b2bdc62aSAdrian Chadd } 427b2bdc62aSAdrian Chadd 428b2bdc62aSAdrian Chadd /* 429b2bdc62aSAdrian Chadd * Query the RSS hash algorithm. 430b2bdc62aSAdrian Chadd */ 431b2bdc62aSAdrian Chadd u_int 432b2bdc62aSAdrian Chadd rss_gethashalgo(void) 433b2bdc62aSAdrian Chadd { 434b2bdc62aSAdrian Chadd 435b2bdc62aSAdrian Chadd return (rss_hashalgo); 436b2bdc62aSAdrian Chadd } 437b2bdc62aSAdrian Chadd 438b2bdc62aSAdrian Chadd /* 439b2bdc62aSAdrian Chadd * Query the current RSS key; likely to be used by device drivers when 440b2bdc62aSAdrian Chadd * configuring hardware RSS. Caller must pass an array of size RSS_KEYSIZE. 441b2bdc62aSAdrian Chadd * 442b2bdc62aSAdrian Chadd * XXXRW: Perhaps we should do the accept-a-length-and-truncate thing? 443b2bdc62aSAdrian Chadd */ 444b2bdc62aSAdrian Chadd void 445b2bdc62aSAdrian Chadd rss_getkey(uint8_t *key) 446b2bdc62aSAdrian Chadd { 447b2bdc62aSAdrian Chadd 448b2bdc62aSAdrian Chadd bcopy(rss_key, key, sizeof(rss_key)); 449b2bdc62aSAdrian Chadd } 450b2bdc62aSAdrian Chadd 451b2bdc62aSAdrian Chadd /* 452b2bdc62aSAdrian Chadd * Query the number of buckets; this may be used by both network device 453b2bdc62aSAdrian Chadd * drivers, which will need to populate hardware shadows of the software 454b2bdc62aSAdrian Chadd * indirection table, and the network stack itself (such as when deciding how 455b2bdc62aSAdrian Chadd * many connection groups to allocate). 456b2bdc62aSAdrian Chadd */ 457b2bdc62aSAdrian Chadd u_int 458b2bdc62aSAdrian Chadd rss_getnumbuckets(void) 459b2bdc62aSAdrian Chadd { 460b2bdc62aSAdrian Chadd 461b2bdc62aSAdrian Chadd return (rss_buckets); 462b2bdc62aSAdrian Chadd } 463b2bdc62aSAdrian Chadd 464b2bdc62aSAdrian Chadd /* 465b2bdc62aSAdrian Chadd * Query the number of CPUs in use by RSS; may be useful to device drivers 466b2bdc62aSAdrian Chadd * trying to figure out how to map a larger number of CPUs into a smaller 467b2bdc62aSAdrian Chadd * number of receive queues. 468b2bdc62aSAdrian Chadd */ 469b2bdc62aSAdrian Chadd u_int 470b2bdc62aSAdrian Chadd rss_getnumcpus(void) 471b2bdc62aSAdrian Chadd { 472b2bdc62aSAdrian Chadd 473b2bdc62aSAdrian Chadd return (rss_ncpus); 474b2bdc62aSAdrian Chadd } 475b2bdc62aSAdrian Chadd 476b2bdc62aSAdrian Chadd /* 477b2bdc62aSAdrian Chadd * Return the supported RSS hash configuration. 478b2bdc62aSAdrian Chadd * 479b2bdc62aSAdrian Chadd * NICs should query this to determine what to configure in their redirection 480b2bdc62aSAdrian Chadd * matching table. 481b2bdc62aSAdrian Chadd */ 482b2bdc62aSAdrian Chadd inline u_int 483b2bdc62aSAdrian Chadd rss_gethashconfig(void) 484b2bdc62aSAdrian Chadd { 485b2bdc62aSAdrian Chadd 486b2bdc62aSAdrian Chadd /* Return 4-tuple for TCP; 2-tuple for others */ 487b2bdc62aSAdrian Chadd /* 488b2bdc62aSAdrian Chadd * UDP may fragment more often than TCP and thus we'll end up with 489b2bdc62aSAdrian Chadd * NICs returning 2-tuple fragments. 490b2bdc62aSAdrian Chadd * udp_init() and udplite_init() both currently initialise things 491b2bdc62aSAdrian Chadd * as 2-tuple. 492b2bdc62aSAdrian Chadd * So for now disable UDP 4-tuple hashing until all of the other 493b2bdc62aSAdrian Chadd * pieces are in place. 494b2bdc62aSAdrian Chadd */ 495b2bdc62aSAdrian Chadd return ( 496b2bdc62aSAdrian Chadd RSS_HASHTYPE_RSS_IPV4 497b2bdc62aSAdrian Chadd | RSS_HASHTYPE_RSS_TCP_IPV4 498b2bdc62aSAdrian Chadd | RSS_HASHTYPE_RSS_IPV6 499b2bdc62aSAdrian Chadd | RSS_HASHTYPE_RSS_TCP_IPV6 500b2bdc62aSAdrian Chadd | RSS_HASHTYPE_RSS_IPV6_EX 501b2bdc62aSAdrian Chadd | RSS_HASHTYPE_RSS_TCP_IPV6_EX 502b2bdc62aSAdrian Chadd #if 0 503b2bdc62aSAdrian Chadd | RSS_HASHTYPE_RSS_UDP_IPV4 504b2bdc62aSAdrian Chadd | RSS_HASHTYPE_RSS_UDP_IPV4_EX 505b2bdc62aSAdrian Chadd | RSS_HASHTYPE_RSS_UDP_IPV6 506b2bdc62aSAdrian Chadd | RSS_HASHTYPE_RSS_UDP_IPV6_EX 507b2bdc62aSAdrian Chadd #endif 508b2bdc62aSAdrian Chadd ); 509b2bdc62aSAdrian Chadd } 510b2bdc62aSAdrian Chadd 511b2bdc62aSAdrian Chadd /* 512b2bdc62aSAdrian Chadd * XXXRW: Confirm that sysctl -a won't dump this keying material, don't want 513b2bdc62aSAdrian Chadd * it appearing in debugging output unnecessarily. 514b2bdc62aSAdrian Chadd */ 515b2bdc62aSAdrian Chadd static int 516b2bdc62aSAdrian Chadd sysctl_rss_key(SYSCTL_HANDLER_ARGS) 517b2bdc62aSAdrian Chadd { 518b2bdc62aSAdrian Chadd uint8_t temp_rss_key[RSS_KEYSIZE]; 519b2bdc62aSAdrian Chadd int error; 520b2bdc62aSAdrian Chadd 521b2bdc62aSAdrian Chadd error = priv_check(req->td, PRIV_NETINET_HASHKEY); 522b2bdc62aSAdrian Chadd if (error) 523b2bdc62aSAdrian Chadd return (error); 524b2bdc62aSAdrian Chadd 525b2bdc62aSAdrian Chadd bcopy(rss_key, temp_rss_key, sizeof(temp_rss_key)); 526b2bdc62aSAdrian Chadd error = sysctl_handle_opaque(oidp, temp_rss_key, 527b2bdc62aSAdrian Chadd sizeof(temp_rss_key), req); 528b2bdc62aSAdrian Chadd if (error) 529b2bdc62aSAdrian Chadd return (error); 530b2bdc62aSAdrian Chadd if (req->newptr != NULL) { 531b2bdc62aSAdrian Chadd /* XXXRW: Not yet. */ 532b2bdc62aSAdrian Chadd return (EINVAL); 533b2bdc62aSAdrian Chadd } 534b2bdc62aSAdrian Chadd return (0); 535b2bdc62aSAdrian Chadd } 536b2bdc62aSAdrian Chadd SYSCTL_PROC(_net_inet_rss, OID_AUTO, key, 537b2bdc62aSAdrian Chadd CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_rss_key, 538b2bdc62aSAdrian Chadd "", "RSS keying material"); 539b2bdc62aSAdrian Chadd 540b2bdc62aSAdrian Chadd static int 541b2bdc62aSAdrian Chadd sysctl_rss_bucket_mapping(SYSCTL_HANDLER_ARGS) 542b2bdc62aSAdrian Chadd { 543b2bdc62aSAdrian Chadd struct sbuf *sb; 544b2bdc62aSAdrian Chadd int error; 545b2bdc62aSAdrian Chadd int i; 546b2bdc62aSAdrian Chadd 547b2bdc62aSAdrian Chadd error = 0; 548b2bdc62aSAdrian Chadd error = sysctl_wire_old_buffer(req, 0); 549b2bdc62aSAdrian Chadd if (error != 0) 550b2bdc62aSAdrian Chadd return (error); 551b2bdc62aSAdrian Chadd sb = sbuf_new_for_sysctl(NULL, NULL, 512, req); 552b2bdc62aSAdrian Chadd if (sb == NULL) 553b2bdc62aSAdrian Chadd return (ENOMEM); 554b2bdc62aSAdrian Chadd for (i = 0; i < rss_buckets; i++) { 555b2bdc62aSAdrian Chadd sbuf_printf(sb, "%s%d:%d", i == 0 ? "" : " ", 556b2bdc62aSAdrian Chadd i, 557b2bdc62aSAdrian Chadd rss_getcpu(i)); 558b2bdc62aSAdrian Chadd } 559b2bdc62aSAdrian Chadd error = sbuf_finish(sb); 560b2bdc62aSAdrian Chadd sbuf_delete(sb); 561b2bdc62aSAdrian Chadd 562b2bdc62aSAdrian Chadd return (error); 563b2bdc62aSAdrian Chadd } 564b2bdc62aSAdrian Chadd SYSCTL_PROC(_net_inet_rss, OID_AUTO, bucket_mapping, 565b2bdc62aSAdrian Chadd CTLTYPE_STRING | CTLFLAG_RD, NULL, 0, 566b2bdc62aSAdrian Chadd sysctl_rss_bucket_mapping, "", "RSS bucket -> CPU mapping"); 567