1b2bdc62aSAdrian Chadd /*- 2b2bdc62aSAdrian Chadd * Copyright (c) 2010-2011 Juniper Networks, Inc. 3b2bdc62aSAdrian Chadd * All rights reserved. 4b2bdc62aSAdrian Chadd * 5b2bdc62aSAdrian Chadd * This software was developed by Robert N. M. Watson under contract 6b2bdc62aSAdrian Chadd * to Juniper Networks, Inc. 7b2bdc62aSAdrian Chadd * 8b2bdc62aSAdrian Chadd * Redistribution and use in source and binary forms, with or without 9b2bdc62aSAdrian Chadd * modification, are permitted provided that the following conditions 10b2bdc62aSAdrian Chadd * are met: 11b2bdc62aSAdrian Chadd * 1. Redistributions of source code must retain the above copyright 12b2bdc62aSAdrian Chadd * notice, this list of conditions and the following disclaimer. 13b2bdc62aSAdrian Chadd * 2. Redistributions in binary form must reproduce the above copyright 14b2bdc62aSAdrian Chadd * notice, this list of conditions and the following disclaimer in the 15b2bdc62aSAdrian Chadd * documentation and/or other materials provided with the distribution. 16b2bdc62aSAdrian Chadd * 17b2bdc62aSAdrian Chadd * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18b2bdc62aSAdrian Chadd * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19b2bdc62aSAdrian Chadd * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20b2bdc62aSAdrian Chadd * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21b2bdc62aSAdrian Chadd * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22b2bdc62aSAdrian Chadd * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23b2bdc62aSAdrian Chadd * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24b2bdc62aSAdrian Chadd * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25b2bdc62aSAdrian Chadd * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26b2bdc62aSAdrian Chadd * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27b2bdc62aSAdrian Chadd * SUCH DAMAGE. 28b2bdc62aSAdrian Chadd */ 29b2bdc62aSAdrian Chadd 30b2bdc62aSAdrian Chadd #include <sys/cdefs.h> 31b2bdc62aSAdrian Chadd 32b2bdc62aSAdrian Chadd __FBSDID("$FreeBSD$"); 33b2bdc62aSAdrian Chadd 34b2bdc62aSAdrian Chadd #include "opt_inet6.h" 35b2bdc62aSAdrian Chadd #include "opt_pcbgroup.h" 36b2bdc62aSAdrian Chadd 37b2bdc62aSAdrian Chadd #ifndef PCBGROUP 38b2bdc62aSAdrian Chadd #error "options RSS depends on options PCBGROUP" 39b2bdc62aSAdrian Chadd #endif 40b2bdc62aSAdrian Chadd 41b2bdc62aSAdrian Chadd #include <sys/param.h> 42b2bdc62aSAdrian Chadd #include <sys/mbuf.h> 43b2bdc62aSAdrian Chadd #include <sys/socket.h> 44b2bdc62aSAdrian Chadd #include <sys/priv.h> 45b2bdc62aSAdrian Chadd #include <sys/kernel.h> 46b2bdc62aSAdrian Chadd #include <sys/smp.h> 47b2bdc62aSAdrian Chadd #include <sys/sysctl.h> 48b2bdc62aSAdrian Chadd #include <sys/sbuf.h> 49b2bdc62aSAdrian Chadd 50b2bdc62aSAdrian Chadd #include <net/if.h> 51b2bdc62aSAdrian Chadd #include <net/if_var.h> 52b2bdc62aSAdrian Chadd #include <net/netisr.h> 53b2bdc62aSAdrian Chadd #include <net/rss_config.h> 54b2bdc62aSAdrian Chadd #include <net/toeplitz.h> 55b2bdc62aSAdrian Chadd 56b2bdc62aSAdrian Chadd /*- 57b2bdc62aSAdrian Chadd * Operating system parts of receiver-side scaling (RSS), which allows 58b2bdc62aSAdrian Chadd * network cards to 
direct flows to particular receive queues based on hashes
 * of header tuples. This implementation aligns RSS buckets with connection
 * groups at the TCP/IP layer, so each bucket is associated with exactly one
 * group. As a result, the group lookup structures (and lock) should have an
 * effective affinity with exactly one CPU.
 *
 * Network device drivers needing to configure RSS will query this framework
 * for parameters, such as the current RSS key, hashing policies, number of
 * bits, and indirection table mapping hashes to buckets and CPUs. They may
 * provide their own supplementary information, such as queue<->CPU bindings.
 * It is the responsibility of the network device driver to inject packets
 * into the stack on as close to the right CPU as possible, if playing by RSS
 * rules.
 *
 * TODO:
 *
 * - Synchronization for rss_key and other future-configurable parameters.
 * - Event handler drivers can register to pick up RSS configuration changes.
 * - Should we allow rss_basecpu to be configured?
 * - Randomize key on boot.
 * - IPv6 support.
 * - Statistics on how often there's a misalignment between hardware
 *   placement and pcbgroup expectations.
81b2bdc62aSAdrian Chadd */ 82b2bdc62aSAdrian Chadd 83b2bdc62aSAdrian Chadd SYSCTL_DECL(_net_inet); 84*7029da5cSPawel Biernacki SYSCTL_NODE(_net_inet, OID_AUTO, rss, CTLFLAG_RW | CTLFLAG_MPSAFE, 0, 85*7029da5cSPawel Biernacki "Receive-side steering"); 86b2bdc62aSAdrian Chadd 87b2bdc62aSAdrian Chadd /* 88b2bdc62aSAdrian Chadd * Toeplitz is the only required hash function in the RSS spec, so use it by 89b2bdc62aSAdrian Chadd * default. 90b2bdc62aSAdrian Chadd */ 91b2bdc62aSAdrian Chadd static u_int rss_hashalgo = RSS_HASH_TOEPLITZ; 92b2bdc62aSAdrian Chadd SYSCTL_INT(_net_inet_rss, OID_AUTO, hashalgo, CTLFLAG_RDTUN, &rss_hashalgo, 0, 93b2bdc62aSAdrian Chadd "RSS hash algorithm"); 94b2bdc62aSAdrian Chadd 95b2bdc62aSAdrian Chadd /* 96b2bdc62aSAdrian Chadd * Size of the indirection table; at most 128 entries per the RSS spec. We 97b2bdc62aSAdrian Chadd * size it to at least 2 times the number of CPUs by default to allow useful 98b2bdc62aSAdrian Chadd * rebalancing. If not set explicitly with a loader tunable, we tune based 99b2bdc62aSAdrian Chadd * on the number of CPUs present. 100b2bdc62aSAdrian Chadd * 101b2bdc62aSAdrian Chadd * XXXRW: buckets might be better to use for the tunable than bits. 
102b2bdc62aSAdrian Chadd */ 103b2bdc62aSAdrian Chadd static u_int rss_bits; 104b2bdc62aSAdrian Chadd SYSCTL_INT(_net_inet_rss, OID_AUTO, bits, CTLFLAG_RDTUN, &rss_bits, 0, 105b2bdc62aSAdrian Chadd "RSS bits"); 106b2bdc62aSAdrian Chadd 107b2bdc62aSAdrian Chadd static u_int rss_mask; 108b2bdc62aSAdrian Chadd SYSCTL_INT(_net_inet_rss, OID_AUTO, mask, CTLFLAG_RD, &rss_mask, 0, 109b2bdc62aSAdrian Chadd "RSS mask"); 110b2bdc62aSAdrian Chadd 111b2bdc62aSAdrian Chadd static const u_int rss_maxbits = RSS_MAXBITS; 112b2bdc62aSAdrian Chadd SYSCTL_INT(_net_inet_rss, OID_AUTO, maxbits, CTLFLAG_RD, 113b2bdc62aSAdrian Chadd __DECONST(int *, &rss_maxbits), 0, "RSS maximum bits"); 114b2bdc62aSAdrian Chadd 115b2bdc62aSAdrian Chadd /* 116b2bdc62aSAdrian Chadd * RSS's own count of the number of CPUs it could be using for processing. 117b2bdc62aSAdrian Chadd * Bounded to 64 by RSS constants. 118b2bdc62aSAdrian Chadd */ 119b2bdc62aSAdrian Chadd static u_int rss_ncpus; 120b2bdc62aSAdrian Chadd SYSCTL_INT(_net_inet_rss, OID_AUTO, ncpus, CTLFLAG_RD, &rss_ncpus, 0, 121b2bdc62aSAdrian Chadd "Number of CPUs available to RSS"); 122b2bdc62aSAdrian Chadd 123b2bdc62aSAdrian Chadd #define RSS_MAXCPUS (1 << (RSS_MAXBITS - 1)) 124b2bdc62aSAdrian Chadd static const u_int rss_maxcpus = RSS_MAXCPUS; 125b2bdc62aSAdrian Chadd SYSCTL_INT(_net_inet_rss, OID_AUTO, maxcpus, CTLFLAG_RD, 126b2bdc62aSAdrian Chadd __DECONST(int *, &rss_maxcpus), 0, "RSS maximum CPUs that can be used"); 127b2bdc62aSAdrian Chadd 128b2bdc62aSAdrian Chadd /* 129b2bdc62aSAdrian Chadd * Variable exists just for reporting rss_bits in a user-friendly way. 
130b2bdc62aSAdrian Chadd */ 131b2bdc62aSAdrian Chadd static u_int rss_buckets; 132b2bdc62aSAdrian Chadd SYSCTL_INT(_net_inet_rss, OID_AUTO, buckets, CTLFLAG_RD, &rss_buckets, 0, 133b2bdc62aSAdrian Chadd "RSS buckets"); 134b2bdc62aSAdrian Chadd 135b2bdc62aSAdrian Chadd /* 136b2bdc62aSAdrian Chadd * Base CPU number; devices will add this to all CPU numbers returned by the 137b2bdc62aSAdrian Chadd * RSS indirection table. Currently unmodifable in FreeBSD. 138b2bdc62aSAdrian Chadd */ 139b2bdc62aSAdrian Chadd static const u_int rss_basecpu; 140b2bdc62aSAdrian Chadd SYSCTL_INT(_net_inet_rss, OID_AUTO, basecpu, CTLFLAG_RD, 141b2bdc62aSAdrian Chadd __DECONST(int *, &rss_basecpu), 0, "RSS base CPU"); 142b2bdc62aSAdrian Chadd 143b2bdc62aSAdrian Chadd /* 144e5562eb9SAdrian Chadd * Print verbose debugging messages. 145e5562eb9SAdrian Chadd * 0 - disable 146e5562eb9SAdrian Chadd * non-zero - enable 147e5562eb9SAdrian Chadd */ 148e5562eb9SAdrian Chadd int rss_debug = 0; 149e5562eb9SAdrian Chadd SYSCTL_INT(_net_inet_rss, OID_AUTO, debug, CTLFLAG_RWTUN, &rss_debug, 0, 150e5562eb9SAdrian Chadd "RSS debug level"); 151e5562eb9SAdrian Chadd 152e5562eb9SAdrian Chadd /* 153b2bdc62aSAdrian Chadd * RSS secret key, intended to prevent attacks on load-balancing. Its 154b2bdc62aSAdrian Chadd * effectiveness may be limited by algorithm choice and available entropy 155b2bdc62aSAdrian Chadd * during the boot. 156b2bdc62aSAdrian Chadd * 157b2bdc62aSAdrian Chadd * XXXRW: And that we don't randomize it yet! 158b2bdc62aSAdrian Chadd * 159b2bdc62aSAdrian Chadd * This is the default Microsoft RSS specification key which is also 160b2bdc62aSAdrian Chadd * the Chelsio T5 firmware default key. 
161b2bdc62aSAdrian Chadd */ 162b2bdc62aSAdrian Chadd static uint8_t rss_key[RSS_KEYSIZE] = { 163b2bdc62aSAdrian Chadd 0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2, 164b2bdc62aSAdrian Chadd 0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0, 165b2bdc62aSAdrian Chadd 0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4, 166b2bdc62aSAdrian Chadd 0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c, 167b2bdc62aSAdrian Chadd 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa, 168b2bdc62aSAdrian Chadd }; 169b2bdc62aSAdrian Chadd 170b2bdc62aSAdrian Chadd /* 171b2bdc62aSAdrian Chadd * RSS hash->CPU table, which maps hashed packet headers to particular CPUs. 172a4641f4eSPedro F. Giffuni * Drivers may supplement this table with a separate CPU<->queue table when 173b2bdc62aSAdrian Chadd * programming devices. 174b2bdc62aSAdrian Chadd */ 175b2bdc62aSAdrian Chadd struct rss_table_entry { 176b2bdc62aSAdrian Chadd uint8_t rte_cpu; /* CPU affinity of bucket. */ 177b2bdc62aSAdrian Chadd }; 178b2bdc62aSAdrian Chadd static struct rss_table_entry rss_table[RSS_TABLE_MAXLEN]; 179b2bdc62aSAdrian Chadd 180b2bdc62aSAdrian Chadd static void 181b2bdc62aSAdrian Chadd rss_init(__unused void *arg) 182b2bdc62aSAdrian Chadd { 183b2bdc62aSAdrian Chadd u_int i; 184b2bdc62aSAdrian Chadd u_int cpuid; 185b2bdc62aSAdrian Chadd 186b2bdc62aSAdrian Chadd /* 187b2bdc62aSAdrian Chadd * Validate tunables, coerce to sensible values. 188b2bdc62aSAdrian Chadd */ 189b2bdc62aSAdrian Chadd switch (rss_hashalgo) { 190b2bdc62aSAdrian Chadd case RSS_HASH_TOEPLITZ: 191b2bdc62aSAdrian Chadd case RSS_HASH_NAIVE: 192b2bdc62aSAdrian Chadd break; 193b2bdc62aSAdrian Chadd 194b2bdc62aSAdrian Chadd default: 195e5562eb9SAdrian Chadd RSS_DEBUG("invalid RSS hashalgo %u, coercing to %u\n", 196e5562eb9SAdrian Chadd rss_hashalgo, RSS_HASH_TOEPLITZ); 197b2bdc62aSAdrian Chadd rss_hashalgo = RSS_HASH_TOEPLITZ; 198b2bdc62aSAdrian Chadd } 199b2bdc62aSAdrian Chadd 200b2bdc62aSAdrian Chadd /* 201b2bdc62aSAdrian Chadd * Count available CPUs. 
202b2bdc62aSAdrian Chadd * 203b2bdc62aSAdrian Chadd * XXXRW: Note incorrect assumptions regarding contiguity of this set 204b2bdc62aSAdrian Chadd * elsewhere. 205b2bdc62aSAdrian Chadd */ 206b2bdc62aSAdrian Chadd rss_ncpus = 0; 207b2bdc62aSAdrian Chadd for (i = 0; i <= mp_maxid; i++) { 208b2bdc62aSAdrian Chadd if (CPU_ABSENT(i)) 209b2bdc62aSAdrian Chadd continue; 210b2bdc62aSAdrian Chadd rss_ncpus++; 211b2bdc62aSAdrian Chadd } 212b2bdc62aSAdrian Chadd if (rss_ncpus > RSS_MAXCPUS) 213b2bdc62aSAdrian Chadd rss_ncpus = RSS_MAXCPUS; 214b2bdc62aSAdrian Chadd 215b2bdc62aSAdrian Chadd /* 216b2bdc62aSAdrian Chadd * Tune RSS table entries to be no less than 2x the number of CPUs 217b2bdc62aSAdrian Chadd * -- unless we're running uniprocessor, in which case there's not 218b2bdc62aSAdrian Chadd * much point in having buckets to rearrange for load-balancing! 219b2bdc62aSAdrian Chadd */ 220b2bdc62aSAdrian Chadd if (rss_ncpus > 1) { 221b2bdc62aSAdrian Chadd if (rss_bits == 0) 222b2bdc62aSAdrian Chadd rss_bits = fls(rss_ncpus - 1) + 1; 223b2bdc62aSAdrian Chadd 224b2bdc62aSAdrian Chadd /* 225b2bdc62aSAdrian Chadd * Microsoft limits RSS table entries to 128, so apply that 226b2bdc62aSAdrian Chadd * limit to both auto-detected CPU counts and user-configured 227b2bdc62aSAdrian Chadd * ones. 228b2bdc62aSAdrian Chadd */ 229b2bdc62aSAdrian Chadd if (rss_bits == 0 || rss_bits > RSS_MAXBITS) { 230e5562eb9SAdrian Chadd RSS_DEBUG("RSS bits %u not valid, coercing to %u\n", 231e5562eb9SAdrian Chadd rss_bits, RSS_MAXBITS); 232b2bdc62aSAdrian Chadd rss_bits = RSS_MAXBITS; 233b2bdc62aSAdrian Chadd } 234b2bdc62aSAdrian Chadd 235b2bdc62aSAdrian Chadd /* 236b2bdc62aSAdrian Chadd * Figure out how many buckets to use; warn if less than the 237b2bdc62aSAdrian Chadd * number of configured CPUs, although this is not a fatal 238b2bdc62aSAdrian Chadd * problem. 
239b2bdc62aSAdrian Chadd */ 240b2bdc62aSAdrian Chadd rss_buckets = (1 << rss_bits); 241b2bdc62aSAdrian Chadd if (rss_buckets < rss_ncpus) 242e5562eb9SAdrian Chadd RSS_DEBUG("WARNING: rss_buckets (%u) less than " 243e5562eb9SAdrian Chadd "rss_ncpus (%u)\n", rss_buckets, rss_ncpus); 244b2bdc62aSAdrian Chadd rss_mask = rss_buckets - 1; 245b2bdc62aSAdrian Chadd } else { 246b2bdc62aSAdrian Chadd rss_bits = 0; 247b2bdc62aSAdrian Chadd rss_buckets = 1; 248b2bdc62aSAdrian Chadd rss_mask = 0; 249b2bdc62aSAdrian Chadd } 250b2bdc62aSAdrian Chadd 251b2bdc62aSAdrian Chadd /* 252b2bdc62aSAdrian Chadd * Set up initial CPU assignments: round-robin by default. 253b2bdc62aSAdrian Chadd */ 254b2bdc62aSAdrian Chadd cpuid = CPU_FIRST(); 255b2bdc62aSAdrian Chadd for (i = 0; i < rss_buckets; i++) { 256b2bdc62aSAdrian Chadd rss_table[i].rte_cpu = cpuid; 257b2bdc62aSAdrian Chadd cpuid = CPU_NEXT(cpuid); 258b2bdc62aSAdrian Chadd } 259b2bdc62aSAdrian Chadd 260b2bdc62aSAdrian Chadd /* 261b2bdc62aSAdrian Chadd * Randomize rrs_key. 262b2bdc62aSAdrian Chadd * 263b2bdc62aSAdrian Chadd * XXXRW: Not yet. If nothing else, will require an rss_isbadkey() 264b2bdc62aSAdrian Chadd * loop to check for "bad" RSS keys. 
265b2bdc62aSAdrian Chadd */ 266b2bdc62aSAdrian Chadd } 267b2bdc62aSAdrian Chadd SYSINIT(rss_init, SI_SUB_SOFTINTR, SI_ORDER_SECOND, rss_init, NULL); 268b2bdc62aSAdrian Chadd 269b2bdc62aSAdrian Chadd static uint32_t 270b2bdc62aSAdrian Chadd rss_naive_hash(u_int keylen, const uint8_t *key, u_int datalen, 271b2bdc62aSAdrian Chadd const uint8_t *data) 272b2bdc62aSAdrian Chadd { 273b2bdc62aSAdrian Chadd uint32_t v; 274b2bdc62aSAdrian Chadd u_int i; 275b2bdc62aSAdrian Chadd 276b2bdc62aSAdrian Chadd v = 0; 277b2bdc62aSAdrian Chadd for (i = 0; i < keylen; i++) 278b2bdc62aSAdrian Chadd v += key[i]; 279b2bdc62aSAdrian Chadd for (i = 0; i < datalen; i++) 280b2bdc62aSAdrian Chadd v += data[i]; 281b2bdc62aSAdrian Chadd return (v); 282b2bdc62aSAdrian Chadd } 283b2bdc62aSAdrian Chadd 284b2bdc62aSAdrian Chadd uint32_t 285b2bdc62aSAdrian Chadd rss_hash(u_int datalen, const uint8_t *data) 286b2bdc62aSAdrian Chadd { 287b2bdc62aSAdrian Chadd 288b2bdc62aSAdrian Chadd switch (rss_hashalgo) { 289b2bdc62aSAdrian Chadd case RSS_HASH_TOEPLITZ: 290b2bdc62aSAdrian Chadd return (toeplitz_hash(sizeof(rss_key), rss_key, datalen, 291b2bdc62aSAdrian Chadd data)); 292b2bdc62aSAdrian Chadd 293b2bdc62aSAdrian Chadd case RSS_HASH_NAIVE: 294b2bdc62aSAdrian Chadd return (rss_naive_hash(sizeof(rss_key), rss_key, datalen, 295b2bdc62aSAdrian Chadd data)); 296b2bdc62aSAdrian Chadd 297b2bdc62aSAdrian Chadd default: 298b2bdc62aSAdrian Chadd panic("%s: unsupported/unknown hashalgo %d", __func__, 299b2bdc62aSAdrian Chadd rss_hashalgo); 300b2bdc62aSAdrian Chadd } 301b2bdc62aSAdrian Chadd } 302b2bdc62aSAdrian Chadd 303b2bdc62aSAdrian Chadd /* 304b2bdc62aSAdrian Chadd * Query the number of RSS bits in use. 
305b2bdc62aSAdrian Chadd */ 306b2bdc62aSAdrian Chadd u_int 307b2bdc62aSAdrian Chadd rss_getbits(void) 308b2bdc62aSAdrian Chadd { 309b2bdc62aSAdrian Chadd 310b2bdc62aSAdrian Chadd return (rss_bits); 311b2bdc62aSAdrian Chadd } 312b2bdc62aSAdrian Chadd 313b2bdc62aSAdrian Chadd /* 314b2bdc62aSAdrian Chadd * Query the RSS bucket associated with an RSS hash. 315b2bdc62aSAdrian Chadd */ 316b2bdc62aSAdrian Chadd u_int 317b2bdc62aSAdrian Chadd rss_getbucket(u_int hash) 318b2bdc62aSAdrian Chadd { 319b2bdc62aSAdrian Chadd 320b2bdc62aSAdrian Chadd return (hash & rss_mask); 321b2bdc62aSAdrian Chadd } 322b2bdc62aSAdrian Chadd 323b2bdc62aSAdrian Chadd /* 324b2bdc62aSAdrian Chadd * Query the RSS layer bucket associated with the given 325b2bdc62aSAdrian Chadd * entry in the RSS hash space. 326b2bdc62aSAdrian Chadd * 327b2bdc62aSAdrian Chadd * The RSS indirection table is 0 .. rss_buckets-1, 328b2bdc62aSAdrian Chadd * covering the low 'rss_bits' of the total 128 slot 329b2bdc62aSAdrian Chadd * RSS indirection table. So just mask off rss_bits and 330b2bdc62aSAdrian Chadd * return that. 331b2bdc62aSAdrian Chadd * 332b2bdc62aSAdrian Chadd * NIC drivers can then iterate over the 128 slot RSS 333b2bdc62aSAdrian Chadd * indirection table and fetch which RSS bucket to 334b2bdc62aSAdrian Chadd * map it to. This will typically be a CPU queue 335b2bdc62aSAdrian Chadd */ 336b2bdc62aSAdrian Chadd u_int 337b2bdc62aSAdrian Chadd rss_get_indirection_to_bucket(u_int index) 338b2bdc62aSAdrian Chadd { 339b2bdc62aSAdrian Chadd 340b2bdc62aSAdrian Chadd return (index & rss_mask); 341b2bdc62aSAdrian Chadd } 342b2bdc62aSAdrian Chadd 343b2bdc62aSAdrian Chadd /* 344b2bdc62aSAdrian Chadd * Query the RSS CPU associated with an RSS bucket. 
345b2bdc62aSAdrian Chadd */ 346b2bdc62aSAdrian Chadd u_int 347b2bdc62aSAdrian Chadd rss_getcpu(u_int bucket) 348b2bdc62aSAdrian Chadd { 349b2bdc62aSAdrian Chadd 350b2bdc62aSAdrian Chadd return (rss_table[bucket].rte_cpu); 351b2bdc62aSAdrian Chadd } 352b2bdc62aSAdrian Chadd 353b2bdc62aSAdrian Chadd /* 354b2bdc62aSAdrian Chadd * netisr CPU affinity lookup given just the hash and hashtype. 355b2bdc62aSAdrian Chadd */ 356b2bdc62aSAdrian Chadd u_int 357b2bdc62aSAdrian Chadd rss_hash2cpuid(uint32_t hash_val, uint32_t hash_type) 358b2bdc62aSAdrian Chadd { 359b2bdc62aSAdrian Chadd 360b2bdc62aSAdrian Chadd switch (hash_type) { 361b2bdc62aSAdrian Chadd case M_HASHTYPE_RSS_IPV4: 362b2bdc62aSAdrian Chadd case M_HASHTYPE_RSS_TCP_IPV4: 363b2bdc62aSAdrian Chadd case M_HASHTYPE_RSS_UDP_IPV4: 364b2bdc62aSAdrian Chadd case M_HASHTYPE_RSS_IPV6: 365b2bdc62aSAdrian Chadd case M_HASHTYPE_RSS_TCP_IPV6: 366b2bdc62aSAdrian Chadd case M_HASHTYPE_RSS_UDP_IPV6: 367b2bdc62aSAdrian Chadd return (rss_getcpu(rss_getbucket(hash_val))); 368b2bdc62aSAdrian Chadd default: 369b2bdc62aSAdrian Chadd return (NETISR_CPUID_NONE); 370b2bdc62aSAdrian Chadd } 371b2bdc62aSAdrian Chadd } 372b2bdc62aSAdrian Chadd 373b2bdc62aSAdrian Chadd /* 374b2bdc62aSAdrian Chadd * Query the RSS bucket associated with the given hash value and 375b2bdc62aSAdrian Chadd * type. 
376b2bdc62aSAdrian Chadd */ 377b2bdc62aSAdrian Chadd int 378b2bdc62aSAdrian Chadd rss_hash2bucket(uint32_t hash_val, uint32_t hash_type, uint32_t *bucket_id) 379b2bdc62aSAdrian Chadd { 380b2bdc62aSAdrian Chadd 381b2bdc62aSAdrian Chadd switch (hash_type) { 382b2bdc62aSAdrian Chadd case M_HASHTYPE_RSS_IPV4: 383b2bdc62aSAdrian Chadd case M_HASHTYPE_RSS_TCP_IPV4: 384b2bdc62aSAdrian Chadd case M_HASHTYPE_RSS_UDP_IPV4: 385b2bdc62aSAdrian Chadd case M_HASHTYPE_RSS_IPV6: 386b2bdc62aSAdrian Chadd case M_HASHTYPE_RSS_TCP_IPV6: 387b2bdc62aSAdrian Chadd case M_HASHTYPE_RSS_UDP_IPV6: 388b2bdc62aSAdrian Chadd *bucket_id = rss_getbucket(hash_val); 389b2bdc62aSAdrian Chadd return (0); 390b2bdc62aSAdrian Chadd default: 391b2bdc62aSAdrian Chadd return (-1); 392b2bdc62aSAdrian Chadd } 393b2bdc62aSAdrian Chadd } 394b2bdc62aSAdrian Chadd 395b2bdc62aSAdrian Chadd /* 396b2bdc62aSAdrian Chadd * netisr CPU affinity lookup routine for use by protocols. 397b2bdc62aSAdrian Chadd */ 398b2bdc62aSAdrian Chadd struct mbuf * 399b2bdc62aSAdrian Chadd rss_m2cpuid(struct mbuf *m, uintptr_t source, u_int *cpuid) 400b2bdc62aSAdrian Chadd { 401b2bdc62aSAdrian Chadd 402b2bdc62aSAdrian Chadd M_ASSERTPKTHDR(m); 403b2bdc62aSAdrian Chadd *cpuid = rss_hash2cpuid(m->m_pkthdr.flowid, M_HASHTYPE_GET(m)); 404b2bdc62aSAdrian Chadd return (m); 405b2bdc62aSAdrian Chadd } 406b2bdc62aSAdrian Chadd 407b2bdc62aSAdrian Chadd int 408b2bdc62aSAdrian Chadd rss_m2bucket(struct mbuf *m, uint32_t *bucket_id) 409b2bdc62aSAdrian Chadd { 410b2bdc62aSAdrian Chadd 411b2bdc62aSAdrian Chadd M_ASSERTPKTHDR(m); 412b2bdc62aSAdrian Chadd 413b2bdc62aSAdrian Chadd return(rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m), 414b2bdc62aSAdrian Chadd bucket_id)); 415b2bdc62aSAdrian Chadd } 416b2bdc62aSAdrian Chadd 417b2bdc62aSAdrian Chadd /* 418b2bdc62aSAdrian Chadd * Query the RSS hash algorithm. 
419b2bdc62aSAdrian Chadd */ 420b2bdc62aSAdrian Chadd u_int 421b2bdc62aSAdrian Chadd rss_gethashalgo(void) 422b2bdc62aSAdrian Chadd { 423b2bdc62aSAdrian Chadd 424b2bdc62aSAdrian Chadd return (rss_hashalgo); 425b2bdc62aSAdrian Chadd } 426b2bdc62aSAdrian Chadd 427b2bdc62aSAdrian Chadd /* 428b2bdc62aSAdrian Chadd * Query the current RSS key; likely to be used by device drivers when 429b2bdc62aSAdrian Chadd * configuring hardware RSS. Caller must pass an array of size RSS_KEYSIZE. 430b2bdc62aSAdrian Chadd * 431b2bdc62aSAdrian Chadd * XXXRW: Perhaps we should do the accept-a-length-and-truncate thing? 432b2bdc62aSAdrian Chadd */ 433b2bdc62aSAdrian Chadd void 434b2bdc62aSAdrian Chadd rss_getkey(uint8_t *key) 435b2bdc62aSAdrian Chadd { 436b2bdc62aSAdrian Chadd 437b2bdc62aSAdrian Chadd bcopy(rss_key, key, sizeof(rss_key)); 438b2bdc62aSAdrian Chadd } 439b2bdc62aSAdrian Chadd 440b2bdc62aSAdrian Chadd /* 441b2bdc62aSAdrian Chadd * Query the number of buckets; this may be used by both network device 442b2bdc62aSAdrian Chadd * drivers, which will need to populate hardware shadows of the software 443b2bdc62aSAdrian Chadd * indirection table, and the network stack itself (such as when deciding how 444b2bdc62aSAdrian Chadd * many connection groups to allocate). 445b2bdc62aSAdrian Chadd */ 446b2bdc62aSAdrian Chadd u_int 447b2bdc62aSAdrian Chadd rss_getnumbuckets(void) 448b2bdc62aSAdrian Chadd { 449b2bdc62aSAdrian Chadd 450b2bdc62aSAdrian Chadd return (rss_buckets); 451b2bdc62aSAdrian Chadd } 452b2bdc62aSAdrian Chadd 453b2bdc62aSAdrian Chadd /* 454b2bdc62aSAdrian Chadd * Query the number of CPUs in use by RSS; may be useful to device drivers 455b2bdc62aSAdrian Chadd * trying to figure out how to map a larger number of CPUs into a smaller 456b2bdc62aSAdrian Chadd * number of receive queues. 
457b2bdc62aSAdrian Chadd */ 458b2bdc62aSAdrian Chadd u_int 459b2bdc62aSAdrian Chadd rss_getnumcpus(void) 460b2bdc62aSAdrian Chadd { 461b2bdc62aSAdrian Chadd 462b2bdc62aSAdrian Chadd return (rss_ncpus); 463b2bdc62aSAdrian Chadd } 464b2bdc62aSAdrian Chadd 465b2bdc62aSAdrian Chadd /* 466b2bdc62aSAdrian Chadd * Return the supported RSS hash configuration. 467b2bdc62aSAdrian Chadd * 468b2bdc62aSAdrian Chadd * NICs should query this to determine what to configure in their redirection 469b2bdc62aSAdrian Chadd * matching table. 470b2bdc62aSAdrian Chadd */ 471b2bdc62aSAdrian Chadd inline u_int 472b2bdc62aSAdrian Chadd rss_gethashconfig(void) 473b2bdc62aSAdrian Chadd { 474b2bdc62aSAdrian Chadd 475b2bdc62aSAdrian Chadd /* Return 4-tuple for TCP; 2-tuple for others */ 476b2bdc62aSAdrian Chadd /* 477b2bdc62aSAdrian Chadd * UDP may fragment more often than TCP and thus we'll end up with 478b2bdc62aSAdrian Chadd * NICs returning 2-tuple fragments. 479b2bdc62aSAdrian Chadd * udp_init() and udplite_init() both currently initialise things 480b2bdc62aSAdrian Chadd * as 2-tuple. 481b2bdc62aSAdrian Chadd * So for now disable UDP 4-tuple hashing until all of the other 482b2bdc62aSAdrian Chadd * pieces are in place. 
483b2bdc62aSAdrian Chadd */ 484b2bdc62aSAdrian Chadd return ( 485b2bdc62aSAdrian Chadd RSS_HASHTYPE_RSS_IPV4 486b2bdc62aSAdrian Chadd | RSS_HASHTYPE_RSS_TCP_IPV4 487b2bdc62aSAdrian Chadd | RSS_HASHTYPE_RSS_IPV6 488b2bdc62aSAdrian Chadd | RSS_HASHTYPE_RSS_TCP_IPV6 489b2bdc62aSAdrian Chadd | RSS_HASHTYPE_RSS_IPV6_EX 490b2bdc62aSAdrian Chadd | RSS_HASHTYPE_RSS_TCP_IPV6_EX 491b2bdc62aSAdrian Chadd #if 0 492b2bdc62aSAdrian Chadd | RSS_HASHTYPE_RSS_UDP_IPV4 493b2bdc62aSAdrian Chadd | RSS_HASHTYPE_RSS_UDP_IPV6 494b2bdc62aSAdrian Chadd | RSS_HASHTYPE_RSS_UDP_IPV6_EX 495b2bdc62aSAdrian Chadd #endif 496b2bdc62aSAdrian Chadd ); 497b2bdc62aSAdrian Chadd } 498b2bdc62aSAdrian Chadd 499b2bdc62aSAdrian Chadd /* 500b2bdc62aSAdrian Chadd * XXXRW: Confirm that sysctl -a won't dump this keying material, don't want 501b2bdc62aSAdrian Chadd * it appearing in debugging output unnecessarily. 502b2bdc62aSAdrian Chadd */ 503b2bdc62aSAdrian Chadd static int 504b2bdc62aSAdrian Chadd sysctl_rss_key(SYSCTL_HANDLER_ARGS) 505b2bdc62aSAdrian Chadd { 506b2bdc62aSAdrian Chadd uint8_t temp_rss_key[RSS_KEYSIZE]; 507b2bdc62aSAdrian Chadd int error; 508b2bdc62aSAdrian Chadd 509b2bdc62aSAdrian Chadd error = priv_check(req->td, PRIV_NETINET_HASHKEY); 510b2bdc62aSAdrian Chadd if (error) 511b2bdc62aSAdrian Chadd return (error); 512b2bdc62aSAdrian Chadd 513b2bdc62aSAdrian Chadd bcopy(rss_key, temp_rss_key, sizeof(temp_rss_key)); 514b2bdc62aSAdrian Chadd error = sysctl_handle_opaque(oidp, temp_rss_key, 515b2bdc62aSAdrian Chadd sizeof(temp_rss_key), req); 516b2bdc62aSAdrian Chadd if (error) 517b2bdc62aSAdrian Chadd return (error); 518b2bdc62aSAdrian Chadd if (req->newptr != NULL) { 519b2bdc62aSAdrian Chadd /* XXXRW: Not yet. 
*/ 520b2bdc62aSAdrian Chadd return (EINVAL); 521b2bdc62aSAdrian Chadd } 522b2bdc62aSAdrian Chadd return (0); 523b2bdc62aSAdrian Chadd } 524b2bdc62aSAdrian Chadd SYSCTL_PROC(_net_inet_rss, OID_AUTO, key, 525b2bdc62aSAdrian Chadd CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_rss_key, 526b2bdc62aSAdrian Chadd "", "RSS keying material"); 527b2bdc62aSAdrian Chadd 528b2bdc62aSAdrian Chadd static int 529b2bdc62aSAdrian Chadd sysctl_rss_bucket_mapping(SYSCTL_HANDLER_ARGS) 530b2bdc62aSAdrian Chadd { 531b2bdc62aSAdrian Chadd struct sbuf *sb; 532b2bdc62aSAdrian Chadd int error; 533b2bdc62aSAdrian Chadd int i; 534b2bdc62aSAdrian Chadd 535b2bdc62aSAdrian Chadd error = 0; 536b2bdc62aSAdrian Chadd error = sysctl_wire_old_buffer(req, 0); 537b2bdc62aSAdrian Chadd if (error != 0) 538b2bdc62aSAdrian Chadd return (error); 539b2bdc62aSAdrian Chadd sb = sbuf_new_for_sysctl(NULL, NULL, 512, req); 540b2bdc62aSAdrian Chadd if (sb == NULL) 541b2bdc62aSAdrian Chadd return (ENOMEM); 542b2bdc62aSAdrian Chadd for (i = 0; i < rss_buckets; i++) { 543b2bdc62aSAdrian Chadd sbuf_printf(sb, "%s%d:%d", i == 0 ? "" : " ", 544b2bdc62aSAdrian Chadd i, 545b2bdc62aSAdrian Chadd rss_getcpu(i)); 546b2bdc62aSAdrian Chadd } 547b2bdc62aSAdrian Chadd error = sbuf_finish(sb); 548b2bdc62aSAdrian Chadd sbuf_delete(sb); 549b2bdc62aSAdrian Chadd 550b2bdc62aSAdrian Chadd return (error); 551b2bdc62aSAdrian Chadd } 552b2bdc62aSAdrian Chadd SYSCTL_PROC(_net_inet_rss, OID_AUTO, bucket_mapping, 553*7029da5cSPawel Biernacki CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, 554b2bdc62aSAdrian Chadd sysctl_rss_bucket_mapping, "", "RSS bucket -> CPU mapping"); 555