1*b2bdc62aSAdrian Chadd /*- 2*b2bdc62aSAdrian Chadd * Copyright (c) 2010-2011 Juniper Networks, Inc. 3*b2bdc62aSAdrian Chadd * All rights reserved. 4*b2bdc62aSAdrian Chadd * 5*b2bdc62aSAdrian Chadd * This software was developed by Robert N. M. Watson under contract 6*b2bdc62aSAdrian Chadd * to Juniper Networks, Inc. 7*b2bdc62aSAdrian Chadd * 8*b2bdc62aSAdrian Chadd * Redistribution and use in source and binary forms, with or without 9*b2bdc62aSAdrian Chadd * modification, are permitted provided that the following conditions 10*b2bdc62aSAdrian Chadd * are met: 11*b2bdc62aSAdrian Chadd * 1. Redistributions of source code must retain the above copyright 12*b2bdc62aSAdrian Chadd * notice, this list of conditions and the following disclaimer. 13*b2bdc62aSAdrian Chadd * 2. Redistributions in binary form must reproduce the above copyright 14*b2bdc62aSAdrian Chadd * notice, this list of conditions and the following disclaimer in the 15*b2bdc62aSAdrian Chadd * documentation and/or other materials provided with the distribution. 16*b2bdc62aSAdrian Chadd * 17*b2bdc62aSAdrian Chadd * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 18*b2bdc62aSAdrian Chadd * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 19*b2bdc62aSAdrian Chadd * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 20*b2bdc62aSAdrian Chadd * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 21*b2bdc62aSAdrian Chadd * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 22*b2bdc62aSAdrian Chadd * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 23*b2bdc62aSAdrian Chadd * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 24*b2bdc62aSAdrian Chadd * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 25*b2bdc62aSAdrian Chadd * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 26*b2bdc62aSAdrian Chadd * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 27*b2bdc62aSAdrian Chadd * SUCH DAMAGE. 28*b2bdc62aSAdrian Chadd */ 29*b2bdc62aSAdrian Chadd 30*b2bdc62aSAdrian Chadd #include <sys/cdefs.h> 31*b2bdc62aSAdrian Chadd 32*b2bdc62aSAdrian Chadd __FBSDID("$FreeBSD$"); 33*b2bdc62aSAdrian Chadd 34*b2bdc62aSAdrian Chadd #include "opt_inet6.h" 35*b2bdc62aSAdrian Chadd #include "opt_pcbgroup.h" 36*b2bdc62aSAdrian Chadd 37*b2bdc62aSAdrian Chadd #ifndef PCBGROUP 38*b2bdc62aSAdrian Chadd #error "options RSS depends on options PCBGROUP" 39*b2bdc62aSAdrian Chadd #endif 40*b2bdc62aSAdrian Chadd 41*b2bdc62aSAdrian Chadd #include <sys/param.h> 42*b2bdc62aSAdrian Chadd #include <sys/mbuf.h> 43*b2bdc62aSAdrian Chadd #include <sys/socket.h> 44*b2bdc62aSAdrian Chadd #include <sys/priv.h> 45*b2bdc62aSAdrian Chadd #include <sys/kernel.h> 46*b2bdc62aSAdrian Chadd #include <sys/smp.h> 47*b2bdc62aSAdrian Chadd #include <sys/sysctl.h> 48*b2bdc62aSAdrian Chadd #include <sys/sbuf.h> 49*b2bdc62aSAdrian Chadd 50*b2bdc62aSAdrian Chadd #include <net/if.h> 51*b2bdc62aSAdrian Chadd #include <net/if_var.h> 52*b2bdc62aSAdrian Chadd #include <net/netisr.h> 53*b2bdc62aSAdrian Chadd #include <net/rss_config.h> 54*b2bdc62aSAdrian Chadd #include <net/toeplitz.h> 55*b2bdc62aSAdrian Chadd 56*b2bdc62aSAdrian Chadd #if 0 57*b2bdc62aSAdrian Chadd #include <netinet/in.h> 58*b2bdc62aSAdrian Chadd #include <netinet/in_pcb.h> 59*b2bdc62aSAdrian Chadd #include <netinet/in_rss.h> 60*b2bdc62aSAdrian Chadd #include <netinet/in_var.h> 61*b2bdc62aSAdrian Chadd 62*b2bdc62aSAdrian Chadd /* for software rss hash support */ 63*b2bdc62aSAdrian Chadd #include <netinet/ip.h> 64*b2bdc62aSAdrian Chadd #include <netinet/tcp.h> 65*b2bdc62aSAdrian Chadd #include <netinet/udp.h> 66*b2bdc62aSAdrian Chadd #endif 67*b2bdc62aSAdrian Chadd 68*b2bdc62aSAdrian Chadd /*- 69*b2bdc62aSAdrian Chadd * Operating system parts of receiver-side scaling (RSS), which allows 70*b2bdc62aSAdrian Chadd * network cards to direct flows to particular receive queues based on hashes 71*b2bdc62aSAdrian Chadd * of header tuples. This implementation aligns RSS buckets with connection 72*b2bdc62aSAdrian Chadd * groups at the TCP/IP layer, so each bucket is associated with exactly one 73*b2bdc62aSAdrian Chadd * group. As a result, the group lookup structures (and lock) should have an 74*b2bdc62aSAdrian Chadd * effective affinity with exactly one CPU. 75*b2bdc62aSAdrian Chadd * 76*b2bdc62aSAdrian Chadd * Network device drivers needing to configure RSS will query this framework 77*b2bdc62aSAdrian Chadd * for parameters, such as the current RSS key, hashing policies, number of 78*b2bdc62aSAdrian Chadd * bits, and indirection table mapping hashes to buckets and CPUs. They may 79*b2bdc62aSAdrian Chadd * provide their own supplementary information, such as queue<->CPU bindings. 80*b2bdc62aSAdrian Chadd * It is the responsibility of the network device driver to inject packets 81*b2bdc62aSAdrian Chadd * into the stack on as close to the right CPU as possible, if playing by RSS 82*b2bdc62aSAdrian Chadd * rules. 83*b2bdc62aSAdrian Chadd * 84*b2bdc62aSAdrian Chadd * TODO: 85*b2bdc62aSAdrian Chadd * 86*b2bdc62aSAdrian Chadd * - Synchronization for rss_key and other future-configurable parameters. 87*b2bdc62aSAdrian Chadd * - Event handler drivers can register to pick up RSS configuration changes. 88*b2bdc62aSAdrian Chadd * - Should we allow rss_basecpu to be configured? 89*b2bdc62aSAdrian Chadd * - Randomize key on boot. 90*b2bdc62aSAdrian Chadd * - IPv6 support. 91*b2bdc62aSAdrian Chadd * - Statistics on how often there's a misalignment between hardware 92*b2bdc62aSAdrian Chadd * placement and pcbgroup expectations. 93*b2bdc62aSAdrian Chadd */ 94*b2bdc62aSAdrian Chadd 95*b2bdc62aSAdrian Chadd SYSCTL_DECL(_net_inet); 96*b2bdc62aSAdrian Chadd SYSCTL_NODE(_net_inet, OID_AUTO, rss, CTLFLAG_RW, 0, "Receive-side steering"); 97*b2bdc62aSAdrian Chadd 98*b2bdc62aSAdrian Chadd /* 99*b2bdc62aSAdrian Chadd * Toeplitz is the only required hash function in the RSS spec, so use it by 100*b2bdc62aSAdrian Chadd * default. 101*b2bdc62aSAdrian Chadd */ 102*b2bdc62aSAdrian Chadd static u_int rss_hashalgo = RSS_HASH_TOEPLITZ; 103*b2bdc62aSAdrian Chadd SYSCTL_INT(_net_inet_rss, OID_AUTO, hashalgo, CTLFLAG_RDTUN, &rss_hashalgo, 0, 104*b2bdc62aSAdrian Chadd "RSS hash algorithm"); 105*b2bdc62aSAdrian Chadd 106*b2bdc62aSAdrian Chadd /* 107*b2bdc62aSAdrian Chadd * Size of the indirection table; at most 128 entries per the RSS spec. We 108*b2bdc62aSAdrian Chadd * size it to at least 2 times the number of CPUs by default to allow useful 109*b2bdc62aSAdrian Chadd * rebalancing. If not set explicitly with a loader tunable, we tune based 110*b2bdc62aSAdrian Chadd * on the number of CPUs present. 111*b2bdc62aSAdrian Chadd * 112*b2bdc62aSAdrian Chadd * XXXRW: buckets might be better to use for the tunable than bits. 113*b2bdc62aSAdrian Chadd */ 114*b2bdc62aSAdrian Chadd static u_int rss_bits; 115*b2bdc62aSAdrian Chadd SYSCTL_INT(_net_inet_rss, OID_AUTO, bits, CTLFLAG_RDTUN, &rss_bits, 0, 116*b2bdc62aSAdrian Chadd "RSS bits"); 117*b2bdc62aSAdrian Chadd 118*b2bdc62aSAdrian Chadd static u_int rss_mask; 119*b2bdc62aSAdrian Chadd SYSCTL_INT(_net_inet_rss, OID_AUTO, mask, CTLFLAG_RD, &rss_mask, 0, 120*b2bdc62aSAdrian Chadd "RSS mask"); 121*b2bdc62aSAdrian Chadd 122*b2bdc62aSAdrian Chadd static const u_int rss_maxbits = RSS_MAXBITS; 123*b2bdc62aSAdrian Chadd SYSCTL_INT(_net_inet_rss, OID_AUTO, maxbits, CTLFLAG_RD, 124*b2bdc62aSAdrian Chadd __DECONST(int *, &rss_maxbits), 0, "RSS maximum bits"); 125*b2bdc62aSAdrian Chadd 126*b2bdc62aSAdrian Chadd /* 127*b2bdc62aSAdrian Chadd * RSS's own count of the number of CPUs it could be using for processing. 128*b2bdc62aSAdrian Chadd * Bounded to 64 by RSS constants. 129*b2bdc62aSAdrian Chadd */ 130*b2bdc62aSAdrian Chadd static u_int rss_ncpus; 131*b2bdc62aSAdrian Chadd SYSCTL_INT(_net_inet_rss, OID_AUTO, ncpus, CTLFLAG_RD, &rss_ncpus, 0, 132*b2bdc62aSAdrian Chadd "Number of CPUs available to RSS"); 133*b2bdc62aSAdrian Chadd 134*b2bdc62aSAdrian Chadd #define RSS_MAXCPUS (1 << (RSS_MAXBITS - 1)) 135*b2bdc62aSAdrian Chadd static const u_int rss_maxcpus = RSS_MAXCPUS; 136*b2bdc62aSAdrian Chadd SYSCTL_INT(_net_inet_rss, OID_AUTO, maxcpus, CTLFLAG_RD, 137*b2bdc62aSAdrian Chadd __DECONST(int *, &rss_maxcpus), 0, "RSS maximum CPUs that can be used"); 138*b2bdc62aSAdrian Chadd 139*b2bdc62aSAdrian Chadd /* 140*b2bdc62aSAdrian Chadd * Variable exists just for reporting rss_bits in a user-friendly way. 141*b2bdc62aSAdrian Chadd */ 142*b2bdc62aSAdrian Chadd static u_int rss_buckets; 143*b2bdc62aSAdrian Chadd SYSCTL_INT(_net_inet_rss, OID_AUTO, buckets, CTLFLAG_RD, &rss_buckets, 0, 144*b2bdc62aSAdrian Chadd "RSS buckets"); 145*b2bdc62aSAdrian Chadd 146*b2bdc62aSAdrian Chadd /* 147*b2bdc62aSAdrian Chadd * Base CPU number; devices will add this to all CPU numbers returned by the 148*b2bdc62aSAdrian Chadd * RSS indirection table. Currently unmodifable in FreeBSD. 149*b2bdc62aSAdrian Chadd */ 150*b2bdc62aSAdrian Chadd static const u_int rss_basecpu; 151*b2bdc62aSAdrian Chadd SYSCTL_INT(_net_inet_rss, OID_AUTO, basecpu, CTLFLAG_RD, 152*b2bdc62aSAdrian Chadd __DECONST(int *, &rss_basecpu), 0, "RSS base CPU"); 153*b2bdc62aSAdrian Chadd 154*b2bdc62aSAdrian Chadd /* 155*b2bdc62aSAdrian Chadd * RSS secret key, intended to prevent attacks on load-balancing. Its 156*b2bdc62aSAdrian Chadd * effectiveness may be limited by algorithm choice and available entropy 157*b2bdc62aSAdrian Chadd * during the boot. 158*b2bdc62aSAdrian Chadd * 159*b2bdc62aSAdrian Chadd * XXXRW: And that we don't randomize it yet! 160*b2bdc62aSAdrian Chadd * 161*b2bdc62aSAdrian Chadd * This is the default Microsoft RSS specification key which is also 162*b2bdc62aSAdrian Chadd * the Chelsio T5 firmware default key. 163*b2bdc62aSAdrian Chadd */ 164*b2bdc62aSAdrian Chadd static uint8_t rss_key[RSS_KEYSIZE] = { 165*b2bdc62aSAdrian Chadd 0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2, 166*b2bdc62aSAdrian Chadd 0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0, 167*b2bdc62aSAdrian Chadd 0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4, 168*b2bdc62aSAdrian Chadd 0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c, 169*b2bdc62aSAdrian Chadd 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa, 170*b2bdc62aSAdrian Chadd }; 171*b2bdc62aSAdrian Chadd 172*b2bdc62aSAdrian Chadd /* 173*b2bdc62aSAdrian Chadd * RSS hash->CPU table, which maps hashed packet headers to particular CPUs. 174*b2bdc62aSAdrian Chadd * Drivers may supplement this table with a seperate CPU<->queue table when 175*b2bdc62aSAdrian Chadd * programming devices. 176*b2bdc62aSAdrian Chadd */ 177*b2bdc62aSAdrian Chadd struct rss_table_entry { 178*b2bdc62aSAdrian Chadd uint8_t rte_cpu; /* CPU affinity of bucket. */ 179*b2bdc62aSAdrian Chadd }; 180*b2bdc62aSAdrian Chadd static struct rss_table_entry rss_table[RSS_TABLE_MAXLEN]; 181*b2bdc62aSAdrian Chadd 182*b2bdc62aSAdrian Chadd static void 183*b2bdc62aSAdrian Chadd rss_init(__unused void *arg) 184*b2bdc62aSAdrian Chadd { 185*b2bdc62aSAdrian Chadd u_int i; 186*b2bdc62aSAdrian Chadd u_int cpuid; 187*b2bdc62aSAdrian Chadd 188*b2bdc62aSAdrian Chadd /* 189*b2bdc62aSAdrian Chadd * Validate tunables, coerce to sensible values. 190*b2bdc62aSAdrian Chadd */ 191*b2bdc62aSAdrian Chadd switch (rss_hashalgo) { 192*b2bdc62aSAdrian Chadd case RSS_HASH_TOEPLITZ: 193*b2bdc62aSAdrian Chadd case RSS_HASH_NAIVE: 194*b2bdc62aSAdrian Chadd break; 195*b2bdc62aSAdrian Chadd 196*b2bdc62aSAdrian Chadd default: 197*b2bdc62aSAdrian Chadd printf("%s: invalid RSS hashalgo %u, coercing to %u", 198*b2bdc62aSAdrian Chadd __func__, rss_hashalgo, RSS_HASH_TOEPLITZ); 199*b2bdc62aSAdrian Chadd rss_hashalgo = RSS_HASH_TOEPLITZ; 200*b2bdc62aSAdrian Chadd } 201*b2bdc62aSAdrian Chadd 202*b2bdc62aSAdrian Chadd /* 203*b2bdc62aSAdrian Chadd * Count available CPUs. 204*b2bdc62aSAdrian Chadd * 205*b2bdc62aSAdrian Chadd * XXXRW: Note incorrect assumptions regarding contiguity of this set 206*b2bdc62aSAdrian Chadd * elsewhere. 207*b2bdc62aSAdrian Chadd */ 208*b2bdc62aSAdrian Chadd rss_ncpus = 0; 209*b2bdc62aSAdrian Chadd for (i = 0; i <= mp_maxid; i++) { 210*b2bdc62aSAdrian Chadd if (CPU_ABSENT(i)) 211*b2bdc62aSAdrian Chadd continue; 212*b2bdc62aSAdrian Chadd rss_ncpus++; 213*b2bdc62aSAdrian Chadd } 214*b2bdc62aSAdrian Chadd if (rss_ncpus > RSS_MAXCPUS) 215*b2bdc62aSAdrian Chadd rss_ncpus = RSS_MAXCPUS; 216*b2bdc62aSAdrian Chadd 217*b2bdc62aSAdrian Chadd /* 218*b2bdc62aSAdrian Chadd * Tune RSS table entries to be no less than 2x the number of CPUs 219*b2bdc62aSAdrian Chadd * -- unless we're running uniprocessor, in which case there's not 220*b2bdc62aSAdrian Chadd * much point in having buckets to rearrange for load-balancing! 221*b2bdc62aSAdrian Chadd */ 222*b2bdc62aSAdrian Chadd if (rss_ncpus > 1) { 223*b2bdc62aSAdrian Chadd if (rss_bits == 0) 224*b2bdc62aSAdrian Chadd rss_bits = fls(rss_ncpus - 1) + 1; 225*b2bdc62aSAdrian Chadd 226*b2bdc62aSAdrian Chadd /* 227*b2bdc62aSAdrian Chadd * Microsoft limits RSS table entries to 128, so apply that 228*b2bdc62aSAdrian Chadd * limit to both auto-detected CPU counts and user-configured 229*b2bdc62aSAdrian Chadd * ones. 230*b2bdc62aSAdrian Chadd */ 231*b2bdc62aSAdrian Chadd if (rss_bits == 0 || rss_bits > RSS_MAXBITS) { 232*b2bdc62aSAdrian Chadd printf("%s: RSS bits %u not valid, coercing to %u", 233*b2bdc62aSAdrian Chadd __func__, rss_bits, RSS_MAXBITS); 234*b2bdc62aSAdrian Chadd rss_bits = RSS_MAXBITS; 235*b2bdc62aSAdrian Chadd } 236*b2bdc62aSAdrian Chadd 237*b2bdc62aSAdrian Chadd /* 238*b2bdc62aSAdrian Chadd * Figure out how many buckets to use; warn if less than the 239*b2bdc62aSAdrian Chadd * number of configured CPUs, although this is not a fatal 240*b2bdc62aSAdrian Chadd * problem. 241*b2bdc62aSAdrian Chadd */ 242*b2bdc62aSAdrian Chadd rss_buckets = (1 << rss_bits); 243*b2bdc62aSAdrian Chadd if (rss_buckets < rss_ncpus) 244*b2bdc62aSAdrian Chadd printf("%s: WARNING: rss_buckets (%u) less than " 245*b2bdc62aSAdrian Chadd "rss_ncpus (%u)\n", __func__, rss_buckets, 246*b2bdc62aSAdrian Chadd rss_ncpus); 247*b2bdc62aSAdrian Chadd rss_mask = rss_buckets - 1; 248*b2bdc62aSAdrian Chadd } else { 249*b2bdc62aSAdrian Chadd rss_bits = 0; 250*b2bdc62aSAdrian Chadd rss_buckets = 1; 251*b2bdc62aSAdrian Chadd rss_mask = 0; 252*b2bdc62aSAdrian Chadd } 253*b2bdc62aSAdrian Chadd 254*b2bdc62aSAdrian Chadd /* 255*b2bdc62aSAdrian Chadd * Set up initial CPU assignments: round-robin by default. 256*b2bdc62aSAdrian Chadd */ 257*b2bdc62aSAdrian Chadd cpuid = CPU_FIRST(); 258*b2bdc62aSAdrian Chadd for (i = 0; i < rss_buckets; i++) { 259*b2bdc62aSAdrian Chadd rss_table[i].rte_cpu = cpuid; 260*b2bdc62aSAdrian Chadd cpuid = CPU_NEXT(cpuid); 261*b2bdc62aSAdrian Chadd } 262*b2bdc62aSAdrian Chadd 263*b2bdc62aSAdrian Chadd /* 264*b2bdc62aSAdrian Chadd * Randomize rrs_key. 265*b2bdc62aSAdrian Chadd * 266*b2bdc62aSAdrian Chadd * XXXRW: Not yet. If nothing else, will require an rss_isbadkey() 267*b2bdc62aSAdrian Chadd * loop to check for "bad" RSS keys. 268*b2bdc62aSAdrian Chadd */ 269*b2bdc62aSAdrian Chadd } 270*b2bdc62aSAdrian Chadd SYSINIT(rss_init, SI_SUB_SOFTINTR, SI_ORDER_SECOND, rss_init, NULL); 271*b2bdc62aSAdrian Chadd 272*b2bdc62aSAdrian Chadd static uint32_t 273*b2bdc62aSAdrian Chadd rss_naive_hash(u_int keylen, const uint8_t *key, u_int datalen, 274*b2bdc62aSAdrian Chadd const uint8_t *data) 275*b2bdc62aSAdrian Chadd { 276*b2bdc62aSAdrian Chadd uint32_t v; 277*b2bdc62aSAdrian Chadd u_int i; 278*b2bdc62aSAdrian Chadd 279*b2bdc62aSAdrian Chadd v = 0; 280*b2bdc62aSAdrian Chadd for (i = 0; i < keylen; i++) 281*b2bdc62aSAdrian Chadd v += key[i]; 282*b2bdc62aSAdrian Chadd for (i = 0; i < datalen; i++) 283*b2bdc62aSAdrian Chadd v += data[i]; 284*b2bdc62aSAdrian Chadd return (v); 285*b2bdc62aSAdrian Chadd } 286*b2bdc62aSAdrian Chadd 287*b2bdc62aSAdrian Chadd uint32_t 288*b2bdc62aSAdrian Chadd rss_hash(u_int datalen, const uint8_t *data) 289*b2bdc62aSAdrian Chadd { 290*b2bdc62aSAdrian Chadd 291*b2bdc62aSAdrian Chadd switch (rss_hashalgo) { 292*b2bdc62aSAdrian Chadd case RSS_HASH_TOEPLITZ: 293*b2bdc62aSAdrian Chadd return (toeplitz_hash(sizeof(rss_key), rss_key, datalen, 294*b2bdc62aSAdrian Chadd data)); 295*b2bdc62aSAdrian Chadd 296*b2bdc62aSAdrian Chadd case RSS_HASH_NAIVE: 297*b2bdc62aSAdrian Chadd return (rss_naive_hash(sizeof(rss_key), rss_key, datalen, 298*b2bdc62aSAdrian Chadd data)); 299*b2bdc62aSAdrian Chadd 300*b2bdc62aSAdrian Chadd default: 301*b2bdc62aSAdrian Chadd panic("%s: unsupported/unknown hashalgo %d", __func__, 302*b2bdc62aSAdrian Chadd rss_hashalgo); 303*b2bdc62aSAdrian Chadd } 304*b2bdc62aSAdrian Chadd } 305*b2bdc62aSAdrian Chadd 306*b2bdc62aSAdrian Chadd /* 307*b2bdc62aSAdrian Chadd * Query the number of RSS bits in use. 308*b2bdc62aSAdrian Chadd */ 309*b2bdc62aSAdrian Chadd u_int 310*b2bdc62aSAdrian Chadd rss_getbits(void) 311*b2bdc62aSAdrian Chadd { 312*b2bdc62aSAdrian Chadd 313*b2bdc62aSAdrian Chadd return (rss_bits); 314*b2bdc62aSAdrian Chadd } 315*b2bdc62aSAdrian Chadd 316*b2bdc62aSAdrian Chadd /* 317*b2bdc62aSAdrian Chadd * Query the RSS bucket associated with an RSS hash. 318*b2bdc62aSAdrian Chadd */ 319*b2bdc62aSAdrian Chadd u_int 320*b2bdc62aSAdrian Chadd rss_getbucket(u_int hash) 321*b2bdc62aSAdrian Chadd { 322*b2bdc62aSAdrian Chadd 323*b2bdc62aSAdrian Chadd return (hash & rss_mask); 324*b2bdc62aSAdrian Chadd } 325*b2bdc62aSAdrian Chadd 326*b2bdc62aSAdrian Chadd /* 327*b2bdc62aSAdrian Chadd * Query the RSS layer bucket associated with the given 328*b2bdc62aSAdrian Chadd * entry in the RSS hash space. 329*b2bdc62aSAdrian Chadd * 330*b2bdc62aSAdrian Chadd * The RSS indirection table is 0 .. rss_buckets-1, 331*b2bdc62aSAdrian Chadd * covering the low 'rss_bits' of the total 128 slot 332*b2bdc62aSAdrian Chadd * RSS indirection table. So just mask off rss_bits and 333*b2bdc62aSAdrian Chadd * return that. 334*b2bdc62aSAdrian Chadd * 335*b2bdc62aSAdrian Chadd * NIC drivers can then iterate over the 128 slot RSS 336*b2bdc62aSAdrian Chadd * indirection table and fetch which RSS bucket to 337*b2bdc62aSAdrian Chadd * map it to. This will typically be a CPU queue 338*b2bdc62aSAdrian Chadd */ 339*b2bdc62aSAdrian Chadd u_int 340*b2bdc62aSAdrian Chadd rss_get_indirection_to_bucket(u_int index) 341*b2bdc62aSAdrian Chadd { 342*b2bdc62aSAdrian Chadd 343*b2bdc62aSAdrian Chadd return (index & rss_mask); 344*b2bdc62aSAdrian Chadd } 345*b2bdc62aSAdrian Chadd 346*b2bdc62aSAdrian Chadd /* 347*b2bdc62aSAdrian Chadd * Query the RSS CPU associated with an RSS bucket. 348*b2bdc62aSAdrian Chadd */ 349*b2bdc62aSAdrian Chadd u_int 350*b2bdc62aSAdrian Chadd rss_getcpu(u_int bucket) 351*b2bdc62aSAdrian Chadd { 352*b2bdc62aSAdrian Chadd 353*b2bdc62aSAdrian Chadd return (rss_table[bucket].rte_cpu); 354*b2bdc62aSAdrian Chadd } 355*b2bdc62aSAdrian Chadd 356*b2bdc62aSAdrian Chadd /* 357*b2bdc62aSAdrian Chadd * netisr CPU affinity lookup given just the hash and hashtype. 358*b2bdc62aSAdrian Chadd */ 359*b2bdc62aSAdrian Chadd u_int 360*b2bdc62aSAdrian Chadd rss_hash2cpuid(uint32_t hash_val, uint32_t hash_type) 361*b2bdc62aSAdrian Chadd { 362*b2bdc62aSAdrian Chadd 363*b2bdc62aSAdrian Chadd switch (hash_type) { 364*b2bdc62aSAdrian Chadd case M_HASHTYPE_RSS_IPV4: 365*b2bdc62aSAdrian Chadd case M_HASHTYPE_RSS_TCP_IPV4: 366*b2bdc62aSAdrian Chadd case M_HASHTYPE_RSS_UDP_IPV4: 367*b2bdc62aSAdrian Chadd case M_HASHTYPE_RSS_IPV6: 368*b2bdc62aSAdrian Chadd case M_HASHTYPE_RSS_TCP_IPV6: 369*b2bdc62aSAdrian Chadd case M_HASHTYPE_RSS_UDP_IPV6: 370*b2bdc62aSAdrian Chadd return (rss_getcpu(rss_getbucket(hash_val))); 371*b2bdc62aSAdrian Chadd default: 372*b2bdc62aSAdrian Chadd return (NETISR_CPUID_NONE); 373*b2bdc62aSAdrian Chadd } 374*b2bdc62aSAdrian Chadd } 375*b2bdc62aSAdrian Chadd 376*b2bdc62aSAdrian Chadd /* 377*b2bdc62aSAdrian Chadd * Query the RSS bucket associated with the given hash value and 378*b2bdc62aSAdrian Chadd * type. 379*b2bdc62aSAdrian Chadd */ 380*b2bdc62aSAdrian Chadd int 381*b2bdc62aSAdrian Chadd rss_hash2bucket(uint32_t hash_val, uint32_t hash_type, uint32_t *bucket_id) 382*b2bdc62aSAdrian Chadd { 383*b2bdc62aSAdrian Chadd 384*b2bdc62aSAdrian Chadd switch (hash_type) { 385*b2bdc62aSAdrian Chadd case M_HASHTYPE_RSS_IPV4: 386*b2bdc62aSAdrian Chadd case M_HASHTYPE_RSS_TCP_IPV4: 387*b2bdc62aSAdrian Chadd case M_HASHTYPE_RSS_UDP_IPV4: 388*b2bdc62aSAdrian Chadd case M_HASHTYPE_RSS_IPV6: 389*b2bdc62aSAdrian Chadd case M_HASHTYPE_RSS_TCP_IPV6: 390*b2bdc62aSAdrian Chadd case M_HASHTYPE_RSS_UDP_IPV6: 391*b2bdc62aSAdrian Chadd *bucket_id = rss_getbucket(hash_val); 392*b2bdc62aSAdrian Chadd return (0); 393*b2bdc62aSAdrian Chadd default: 394*b2bdc62aSAdrian Chadd return (-1); 395*b2bdc62aSAdrian Chadd } 396*b2bdc62aSAdrian Chadd } 397*b2bdc62aSAdrian Chadd 398*b2bdc62aSAdrian Chadd /* 399*b2bdc62aSAdrian Chadd * netisr CPU affinity lookup routine for use by protocols. 400*b2bdc62aSAdrian Chadd */ 401*b2bdc62aSAdrian Chadd struct mbuf * 402*b2bdc62aSAdrian Chadd rss_m2cpuid(struct mbuf *m, uintptr_t source, u_int *cpuid) 403*b2bdc62aSAdrian Chadd { 404*b2bdc62aSAdrian Chadd 405*b2bdc62aSAdrian Chadd M_ASSERTPKTHDR(m); 406*b2bdc62aSAdrian Chadd *cpuid = rss_hash2cpuid(m->m_pkthdr.flowid, M_HASHTYPE_GET(m)); 407*b2bdc62aSAdrian Chadd return (m); 408*b2bdc62aSAdrian Chadd } 409*b2bdc62aSAdrian Chadd 410*b2bdc62aSAdrian Chadd int 411*b2bdc62aSAdrian Chadd rss_m2bucket(struct mbuf *m, uint32_t *bucket_id) 412*b2bdc62aSAdrian Chadd { 413*b2bdc62aSAdrian Chadd 414*b2bdc62aSAdrian Chadd M_ASSERTPKTHDR(m); 415*b2bdc62aSAdrian Chadd 416*b2bdc62aSAdrian Chadd return(rss_hash2bucket(m->m_pkthdr.flowid, M_HASHTYPE_GET(m), 417*b2bdc62aSAdrian Chadd bucket_id)); 418*b2bdc62aSAdrian Chadd } 419*b2bdc62aSAdrian Chadd 420*b2bdc62aSAdrian Chadd /* 421*b2bdc62aSAdrian Chadd * Query the RSS hash algorithm. 422*b2bdc62aSAdrian Chadd */ 423*b2bdc62aSAdrian Chadd u_int 424*b2bdc62aSAdrian Chadd rss_gethashalgo(void) 425*b2bdc62aSAdrian Chadd { 426*b2bdc62aSAdrian Chadd 427*b2bdc62aSAdrian Chadd return (rss_hashalgo); 428*b2bdc62aSAdrian Chadd } 429*b2bdc62aSAdrian Chadd 430*b2bdc62aSAdrian Chadd /* 431*b2bdc62aSAdrian Chadd * Query the current RSS key; likely to be used by device drivers when 432*b2bdc62aSAdrian Chadd * configuring hardware RSS. Caller must pass an array of size RSS_KEYSIZE. 433*b2bdc62aSAdrian Chadd * 434*b2bdc62aSAdrian Chadd * XXXRW: Perhaps we should do the accept-a-length-and-truncate thing? 435*b2bdc62aSAdrian Chadd */ 436*b2bdc62aSAdrian Chadd void 437*b2bdc62aSAdrian Chadd rss_getkey(uint8_t *key) 438*b2bdc62aSAdrian Chadd { 439*b2bdc62aSAdrian Chadd 440*b2bdc62aSAdrian Chadd bcopy(rss_key, key, sizeof(rss_key)); 441*b2bdc62aSAdrian Chadd } 442*b2bdc62aSAdrian Chadd 443*b2bdc62aSAdrian Chadd /* 444*b2bdc62aSAdrian Chadd * Query the number of buckets; this may be used by both network device 445*b2bdc62aSAdrian Chadd * drivers, which will need to populate hardware shadows of the software 446*b2bdc62aSAdrian Chadd * indirection table, and the network stack itself (such as when deciding how 447*b2bdc62aSAdrian Chadd * many connection groups to allocate). 448*b2bdc62aSAdrian Chadd */ 449*b2bdc62aSAdrian Chadd u_int 450*b2bdc62aSAdrian Chadd rss_getnumbuckets(void) 451*b2bdc62aSAdrian Chadd { 452*b2bdc62aSAdrian Chadd 453*b2bdc62aSAdrian Chadd return (rss_buckets); 454*b2bdc62aSAdrian Chadd } 455*b2bdc62aSAdrian Chadd 456*b2bdc62aSAdrian Chadd /* 457*b2bdc62aSAdrian Chadd * Query the number of CPUs in use by RSS; may be useful to device drivers 458*b2bdc62aSAdrian Chadd * trying to figure out how to map a larger number of CPUs into a smaller 459*b2bdc62aSAdrian Chadd * number of receive queues. 460*b2bdc62aSAdrian Chadd */ 461*b2bdc62aSAdrian Chadd u_int 462*b2bdc62aSAdrian Chadd rss_getnumcpus(void) 463*b2bdc62aSAdrian Chadd { 464*b2bdc62aSAdrian Chadd 465*b2bdc62aSAdrian Chadd return (rss_ncpus); 466*b2bdc62aSAdrian Chadd } 467*b2bdc62aSAdrian Chadd 468*b2bdc62aSAdrian Chadd /* 469*b2bdc62aSAdrian Chadd * Return the supported RSS hash configuration. 470*b2bdc62aSAdrian Chadd * 471*b2bdc62aSAdrian Chadd * NICs should query this to determine what to configure in their redirection 472*b2bdc62aSAdrian Chadd * matching table. 473*b2bdc62aSAdrian Chadd */ 474*b2bdc62aSAdrian Chadd inline u_int 475*b2bdc62aSAdrian Chadd rss_gethashconfig(void) 476*b2bdc62aSAdrian Chadd { 477*b2bdc62aSAdrian Chadd 478*b2bdc62aSAdrian Chadd /* Return 4-tuple for TCP; 2-tuple for others */ 479*b2bdc62aSAdrian Chadd /* 480*b2bdc62aSAdrian Chadd * UDP may fragment more often than TCP and thus we'll end up with 481*b2bdc62aSAdrian Chadd * NICs returning 2-tuple fragments. 482*b2bdc62aSAdrian Chadd * udp_init() and udplite_init() both currently initialise things 483*b2bdc62aSAdrian Chadd * as 2-tuple. 484*b2bdc62aSAdrian Chadd * So for now disable UDP 4-tuple hashing until all of the other 485*b2bdc62aSAdrian Chadd * pieces are in place. 486*b2bdc62aSAdrian Chadd */ 487*b2bdc62aSAdrian Chadd return ( 488*b2bdc62aSAdrian Chadd RSS_HASHTYPE_RSS_IPV4 489*b2bdc62aSAdrian Chadd | RSS_HASHTYPE_RSS_TCP_IPV4 490*b2bdc62aSAdrian Chadd | RSS_HASHTYPE_RSS_IPV6 491*b2bdc62aSAdrian Chadd | RSS_HASHTYPE_RSS_TCP_IPV6 492*b2bdc62aSAdrian Chadd | RSS_HASHTYPE_RSS_IPV6_EX 493*b2bdc62aSAdrian Chadd | RSS_HASHTYPE_RSS_TCP_IPV6_EX 494*b2bdc62aSAdrian Chadd #if 0 495*b2bdc62aSAdrian Chadd | RSS_HASHTYPE_RSS_UDP_IPV4 496*b2bdc62aSAdrian Chadd | RSS_HASHTYPE_RSS_UDP_IPV4_EX 497*b2bdc62aSAdrian Chadd | RSS_HASHTYPE_RSS_UDP_IPV6 498*b2bdc62aSAdrian Chadd | RSS_HASHTYPE_RSS_UDP_IPV6_EX 499*b2bdc62aSAdrian Chadd #endif 500*b2bdc62aSAdrian Chadd ); 501*b2bdc62aSAdrian Chadd } 502*b2bdc62aSAdrian Chadd 503*b2bdc62aSAdrian Chadd /* 504*b2bdc62aSAdrian Chadd * XXXRW: Confirm that sysctl -a won't dump this keying material, don't want 505*b2bdc62aSAdrian Chadd * it appearing in debugging output unnecessarily. 506*b2bdc62aSAdrian Chadd */ 507*b2bdc62aSAdrian Chadd static int 508*b2bdc62aSAdrian Chadd sysctl_rss_key(SYSCTL_HANDLER_ARGS) 509*b2bdc62aSAdrian Chadd { 510*b2bdc62aSAdrian Chadd uint8_t temp_rss_key[RSS_KEYSIZE]; 511*b2bdc62aSAdrian Chadd int error; 512*b2bdc62aSAdrian Chadd 513*b2bdc62aSAdrian Chadd error = priv_check(req->td, PRIV_NETINET_HASHKEY); 514*b2bdc62aSAdrian Chadd if (error) 515*b2bdc62aSAdrian Chadd return (error); 516*b2bdc62aSAdrian Chadd 517*b2bdc62aSAdrian Chadd bcopy(rss_key, temp_rss_key, sizeof(temp_rss_key)); 518*b2bdc62aSAdrian Chadd error = sysctl_handle_opaque(oidp, temp_rss_key, 519*b2bdc62aSAdrian Chadd sizeof(temp_rss_key), req); 520*b2bdc62aSAdrian Chadd if (error) 521*b2bdc62aSAdrian Chadd return (error); 522*b2bdc62aSAdrian Chadd if (req->newptr != NULL) { 523*b2bdc62aSAdrian Chadd /* XXXRW: Not yet. */ 524*b2bdc62aSAdrian Chadd return (EINVAL); 525*b2bdc62aSAdrian Chadd } 526*b2bdc62aSAdrian Chadd return (0); 527*b2bdc62aSAdrian Chadd } 528*b2bdc62aSAdrian Chadd SYSCTL_PROC(_net_inet_rss, OID_AUTO, key, 529*b2bdc62aSAdrian Chadd CTLTYPE_OPAQUE | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0, sysctl_rss_key, 530*b2bdc62aSAdrian Chadd "", "RSS keying material"); 531*b2bdc62aSAdrian Chadd 532*b2bdc62aSAdrian Chadd static int 533*b2bdc62aSAdrian Chadd sysctl_rss_bucket_mapping(SYSCTL_HANDLER_ARGS) 534*b2bdc62aSAdrian Chadd { 535*b2bdc62aSAdrian Chadd struct sbuf *sb; 536*b2bdc62aSAdrian Chadd int error; 537*b2bdc62aSAdrian Chadd int i; 538*b2bdc62aSAdrian Chadd 539*b2bdc62aSAdrian Chadd error = 0; 540*b2bdc62aSAdrian Chadd error = sysctl_wire_old_buffer(req, 0); 541*b2bdc62aSAdrian Chadd if (error != 0) 542*b2bdc62aSAdrian Chadd return (error); 543*b2bdc62aSAdrian Chadd sb = sbuf_new_for_sysctl(NULL, NULL, 512, req); 544*b2bdc62aSAdrian Chadd if (sb == NULL) 545*b2bdc62aSAdrian Chadd return (ENOMEM); 546*b2bdc62aSAdrian Chadd for (i = 0; i < rss_buckets; i++) { 547*b2bdc62aSAdrian Chadd sbuf_printf(sb, "%s%d:%d", i == 0 ? "" : " ", 548*b2bdc62aSAdrian Chadd i, 549*b2bdc62aSAdrian Chadd rss_getcpu(i)); 550*b2bdc62aSAdrian Chadd } 551*b2bdc62aSAdrian Chadd error = sbuf_finish(sb); 552*b2bdc62aSAdrian Chadd sbuf_delete(sb); 553*b2bdc62aSAdrian Chadd 554*b2bdc62aSAdrian Chadd return (error); 555*b2bdc62aSAdrian Chadd } 556*b2bdc62aSAdrian Chadd SYSCTL_PROC(_net_inet_rss, OID_AUTO, bucket_mapping, 557*b2bdc62aSAdrian Chadd CTLTYPE_STRING | CTLFLAG_RD, NULL, 0, 558*b2bdc62aSAdrian Chadd sysctl_rss_bucket_mapping, "", "RSS bucket -> CPU mapping"); 559