/*-
 * Copyright (c) 2007-2009 Robert N. M. Watson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

/*
 * netisr is a packet dispatch service, allowing synchronous (directly
 * dispatched) and asynchronous (deferred dispatch) processing of packets by
 * registered protocol handlers.  Callers pass a protocol identifier and
 * packet to netisr, along with a direct dispatch hint, and work will either
 * be immediately processed by the registered handler, or passed to a
 * software interrupt (SWI) thread for deferred dispatch.  Callers will
 * generally select one or the other based on:
 *
 * - Whether directly dispatching a netisr handler would lead to code
 *   reentrance or lock recursion, such as entering the socket code from the
 *   socket code.
 * - Whether directly dispatching a netisr handler would lead to recursive
 *   processing, such as when decapsulating several wrapped layers of tunnel
 *   information (IPSEC within IPSEC within ...).
 *
 * Maintaining ordering for protocol streams is a critical design concern.
 * Enforcing ordering limits the opportunity for concurrency, but maintains
 * the strong ordering requirements found in some protocols, such as TCP.  Of
 * related concern is CPU affinity--it is desirable to process all data
 * associated with a particular stream on the same CPU over time in order to
 * avoid acquiring locks associated with the connection on different CPUs,
 * keep connection data in one cache, and to generally encourage associated
 * user threads to live on the same CPU as the stream.  It's also desirable
 * to avoid lock migration and contention where locks are associated with
 * more than one flow.
 *
 * netisr supports several policy variations, represented by the
 * NETISR_POLICY_* constants, allowing protocols to play various roles in
 * identifying flows, assigning work to CPUs, etc.  These are described in
 * netisr.h.
 */
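
/*
 * Illustrative example (not part of this file's logic): assuming the
 * NETISR_IP handler has been registered, an input path could hand a packet
 * to netisr with either of the following calls:
 *
 *	netisr_dispatch(NETISR_IP, m);	(may run the handler inline)
 *	netisr_queue(NETISR_IP, m);	(always defers to the SWI thread)
 *
 * netisr_dispatch() falls back to queueing when direct dispatch is
 * administratively disabled or would violate per-stream ordering.
 */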

#include "opt_ddb.h"
#include "opt_device_polling.h"

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/interrupt.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/rmlock.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/systm.h>

#ifdef DDB
#include <ddb/ddb.h>
#endif

#include <net/if.h>
#include <net/if_var.h>
#include <net/netisr.h>
#include <net/vnet.h>

/*-
 * Synchronize use and modification of the registered netisr data structures;
 * a write lock is acquired while modifying the set of registered protocols,
 * preventing partially registered or unregistered protocols from being run.
 *
 * The following data structures and fields are protected by this lock:
 *
 * - The np array, including all fields of struct netisr_proto.
 * - The nws array, including all fields of struct netisr_workstream.
 * - The nws_array array.
 *
 * Note: the NETISR_LOCKING define controls whether read locks are acquired
 * in packet processing paths requiring netisr registration stability.  This
 * is disabled by default as it can lead to measurable performance
 * degradation even with rmlocks (3%-6% for loopback ping-pong traffic), and
 * because netisr registration and unregistration are extremely rare at
 * runtime.  If it becomes more common, this decision should be revisited.
 *
 * XXXRW: rmlocks don't support assertions.
 */
static struct rmlock	netisr_rmlock;
#define	NETISR_LOCK_INIT()	rm_init_flags(&netisr_rmlock, "netisr", \
				    RM_NOWITNESS)
#define	NETISR_LOCK_ASSERT()
#define	NETISR_RLOCK(tracker)	rm_rlock(&netisr_rmlock, (tracker))
#define	NETISR_RUNLOCK(tracker)	rm_runlock(&netisr_rmlock, (tracker))
#define	NETISR_WLOCK()		rm_wlock(&netisr_rmlock)
#define	NETISR_WUNLOCK()	rm_wunlock(&netisr_rmlock)
/* #define	NETISR_LOCKING */

SYSCTL_NODE(_net, OID_AUTO, isr, CTLFLAG_RW, 0, "netisr");

/*-
 * Three direct dispatch policies are supported:
 *
 * - Always defer: all work is scheduled for a netisr, regardless of context.
 *   (!direct)
 *
 * - Hybrid: if the executing context allows direct dispatch, and we're
 *   running on the CPU the work would be done on, then direct dispatch if it
 *   wouldn't violate ordering constraints on the workstream.
 *   (direct && !direct_force)
 *
 * - Always direct: if the executing context allows direct dispatch, always
 *   direct dispatch.  (direct && direct_force)
 *
 * Notice that changing the global policy could lead to short periods of
 * misordered processing, but this is considered acceptable as compared to
 * the complexity of enforcing ordering during policy changes.
 */
static int	netisr_direct_force = 1;	/* Always direct dispatch. */
TUNABLE_INT("net.isr.direct_force", &netisr_direct_force);
SYSCTL_INT(_net_isr, OID_AUTO, direct_force, CTLFLAG_RW,
    &netisr_direct_force, 0, "Force direct dispatch");

static int	netisr_direct = 1;	/* Enable direct dispatch. */
TUNABLE_INT("net.isr.direct", &netisr_direct);
SYSCTL_INT(_net_isr, OID_AUTO, direct, CTLFLAG_RW,
    &netisr_direct, 0, "Enable direct dispatch");
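
/*
 * For reference (an illustrative summary of the settings above, not new
 * configuration): the three policies correspond to these combinations:
 *
 *	net.isr.direct=0				Always defer
 *	net.isr.direct=1, net.isr.direct_force=0	Hybrid
 *	net.isr.direct=1, net.isr.direct_force=1	Always direct (default)
 */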

/*
 * Allow the administrator to limit the number of threads (CPUs) to use for
 * netisr.  We don't check netisr_maxthreads before creating the thread for
 * CPU 0, so in practice we ignore values <= 1.  This must be set at boot.
 * We will create at most one thread per CPU.
 */
static int	netisr_maxthreads = -1;		/* Max number of threads. */
TUNABLE_INT("net.isr.maxthreads", &netisr_maxthreads);
SYSCTL_INT(_net_isr, OID_AUTO, maxthreads, CTLFLAG_RD,
    &netisr_maxthreads, 0,
    "Use at most this many CPUs for netisr processing");

static int	netisr_bindthreads = 0;		/* Bind threads to CPUs. */
TUNABLE_INT("net.isr.bindthreads", &netisr_bindthreads);
SYSCTL_INT(_net_isr, OID_AUTO, bindthreads, CTLFLAG_RD,
    &netisr_bindthreads, 0, "Bind netisr threads to CPUs.");

/*
 * Limit per-workstream mbuf queue limits to at most net.isr.maxqlimit, both
 * for initial configuration and later modification using
 * netisr_setqlimit().
 */
#define	NETISR_DEFAULT_MAXQLIMIT	10240
static u_int	netisr_maxqlimit = NETISR_DEFAULT_MAXQLIMIT;
TUNABLE_INT("net.isr.maxqlimit", &netisr_maxqlimit);
SYSCTL_INT(_net_isr, OID_AUTO, maxqlimit, CTLFLAG_RD,
    &netisr_maxqlimit, 0,
    "Maximum netisr per-protocol, per-CPU queue depth.");

/*
 * The default per-workstream mbuf queue limit for protocols that don't
 * initialize the nh_qlimit field of their struct netisr_handler.  If this is
 * set above netisr_maxqlimit, we truncate it to the maximum during boot.
 */
#define	NETISR_DEFAULT_DEFAULTQLIMIT	256
static u_int	netisr_defaultqlimit = NETISR_DEFAULT_DEFAULTQLIMIT;
TUNABLE_INT("net.isr.defaultqlimit", &netisr_defaultqlimit);
SYSCTL_INT(_net_isr, OID_AUTO, defaultqlimit, CTLFLAG_RD,
    &netisr_defaultqlimit, 0,
    "Default netisr per-protocol, per-CPU queue limit if not set by protocol");

/*
 * Each protocol is described by a struct netisr_proto, which holds all
 * global per-protocol information.  This data structure is set up by
 * netisr_register(), and derived from the public struct netisr_handler.
 */
struct netisr_proto {
	const char	*np_name;	/* Character string protocol name. */
	netisr_handler_t *np_handler;	/* Protocol handler. */
	netisr_m2flow_t	*np_m2flow;	/* Query flow for untagged packet. */
	netisr_m2cpuid_t *np_m2cpuid;	/* Query CPU to process packet on. */
	netisr_drainedcpu_t *np_drainedcpu; /* Callback when drained a queue. */
	u_int		 np_qlimit;	/* Maximum per-CPU queue depth. */
	u_int		 np_policy;	/* Work placement policy. */
};

#define	NETISR_MAXPROT		16	/* Compile-time limit. */

/*
 * The np array describes all registered protocols, indexed by protocol
 * number.
 */
static struct netisr_proto	np[NETISR_MAXPROT];

/*
 * Protocol-specific work for each workstream is described by struct
 * netisr_work.  Each work descriptor consists of an mbuf queue and
 * statistics.
 */
struct netisr_work {
	/*
	 * Packet queue, linked by m_nextpkt.
	 */
	struct mbuf	*nw_head;
	struct mbuf	*nw_tail;
	u_int		 nw_len;
	u_int		 nw_qlimit;
	u_int		 nw_watermark;

	/*
	 * Statistics -- written unlocked, but mostly from curcpu.
	 */
	u_int64_t	 nw_dispatched; /* Number of direct dispatches. */
	u_int64_t	 nw_hybrid_dispatched; /* "" hybrid dispatches. */
	u_int64_t	 nw_qdrops;	/* "" drops. */
	u_int64_t	 nw_queued;	/* "" enqueues. */
	u_int64_t	 nw_handled;	/* "" handled in worker. */
};

/*
 * Workstreams hold a queue of ordered work across each protocol, and are
 * described by netisr_workstream.  Each workstream is associated with a
 * worker thread, which in turn is pinned to a CPU.  Work associated with a
 * workstream can be processed in other threads during direct dispatch;
 * concurrent processing is prevented by the NWS_RUNNING flag, which
 * indicates that a thread is already processing the work queue.  It is
 * important to prevent a directly dispatched packet from "skipping ahead" of
 * work already in the workstream queue.
 */
struct netisr_workstream {
	struct intr_event *nws_intr_event;	/* Handler for stream. */
	void		*nws_swi_cookie;	/* swi(9) cookie for stream. */
	struct mtx	 nws_mtx;		/* Synchronize work. */
	u_int		 nws_cpu;		/* CPU pinning. */
	u_int		 nws_flags;		/* Wakeup flags. */
	u_int		 nws_pendingbits;	/* Scheduled protocols. */

	/*
	 * Each protocol has per-workstream data.
	 */
	struct netisr_work	nws_work[NETISR_MAXPROT];
} __aligned(CACHE_LINE_SIZE);

/*
 * Per-CPU workstream data.
 */
DPCPU_DEFINE(struct netisr_workstream, nws);

/*
 * Map contiguous values between 0 and nws_count into CPU IDs appropriate for
 * accessing workstreams.  This allows constructions of the form
 * DPCPU_ID_GET(nws_array[arbitraryvalue % nws_count], nws).
 */
static u_int			 nws_array[MAXCPU];

/*
 * Number of registered workstreams.  Will be at most the number of running
 * CPUs once fully started.
 */
static u_int			 nws_count;
SYSCTL_INT(_net_isr, OID_AUTO, numthreads, CTLFLAG_RD,
    &nws_count, 0, "Number of extant netisr threads.");

/*
 * Per-workstream flags.
 */
#define	NWS_RUNNING	0x00000001	/* Currently running in a thread. */
#define	NWS_DISPATCHING	0x00000002	/* Currently being direct-dispatched. */
#define	NWS_SCHEDULED	0x00000004	/* Signal issued. */

/*
 * Synchronization for each workstream: a mutex protects all mutable fields
 * in each stream, including per-protocol state (mbuf queues).  The SWI is
 * woken up if asynchronous dispatch is required.
 */
#define	NWS_LOCK(s)		mtx_lock(&(s)->nws_mtx)
#define	NWS_LOCK_ASSERT(s)	mtx_assert(&(s)->nws_mtx, MA_OWNED)
#define	NWS_UNLOCK(s)		mtx_unlock(&(s)->nws_mtx)
#define	NWS_SIGNAL(s)		swi_sched((s)->nws_swi_cookie, 0)

/*
 * Utility routines for protocols that implement their own mapping of flows
 * to CPUs.
 */
u_int
netisr_get_cpucount(void)
{

	return (nws_count);
}

u_int
netisr_get_cpuid(u_int cpunumber)
{

	KASSERT(cpunumber < nws_count, ("%s: %u > %u", __func__, cpunumber,
	    nws_count));

	return (nws_array[cpunumber]);
}

/*
 * The default implementation of flow -> CPU ID mapping.
 *
 * Non-static so that protocols can use it to map their own work to specific
 * CPUs in a manner consistent with netisr for affinity purposes.
 */
u_int
netisr_default_flow2cpu(u_int flowid)
{

	return (nws_array[flowid % nws_count]);
}

/*
 * Register a new netisr handler, which requires initializing per-protocol
 * fields for each workstream.  All netisr work is briefly suspended while
 * the protocol is installed.
 */
void
netisr_register(const struct netisr_handler *nhp)
{
	struct netisr_work *npwp;
	const char *name;
	u_int i, proto;

	proto = nhp->nh_proto;
	name = nhp->nh_name;

	/*
	 * Test that the requested registration is valid.
	 */
	KASSERT(nhp->nh_name != NULL,
	    ("%s: nh_name NULL for %u", __func__, proto));
	KASSERT(nhp->nh_handler != NULL,
	    ("%s: nh_handler NULL for %s", __func__, name));
	KASSERT(nhp->nh_policy == NETISR_POLICY_SOURCE ||
	    nhp->nh_policy == NETISR_POLICY_FLOW ||
	    nhp->nh_policy == NETISR_POLICY_CPU,
	    ("%s: unsupported nh_policy %u for %s", __func__,
	    nhp->nh_policy, name));
	KASSERT(nhp->nh_policy == NETISR_POLICY_FLOW ||
	    nhp->nh_m2flow == NULL,
	    ("%s: nh_policy != FLOW but m2flow defined for %s", __func__,
	    name));
	KASSERT(nhp->nh_policy == NETISR_POLICY_CPU || nhp->nh_m2cpuid == NULL,
	    ("%s: nh_policy != CPU but m2cpuid defined for %s", __func__,
	    name));
	KASSERT(nhp->nh_policy != NETISR_POLICY_CPU || nhp->nh_m2cpuid != NULL,
	    ("%s: nh_policy == CPU but m2cpuid not defined for %s", __func__,
	    name));
	KASSERT(proto < NETISR_MAXPROT,
	    ("%s(%u, %s): protocol too big", __func__, proto, name));

	/*
	 * Test that no existing registration exists for this protocol.
	 */
	NETISR_WLOCK();
	KASSERT(np[proto].np_name == NULL,
	    ("%s(%u, %s): name present", __func__, proto, name));
	KASSERT(np[proto].np_handler == NULL,
	    ("%s(%u, %s): handler present", __func__, proto, name));

	np[proto].np_name = name;
	np[proto].np_handler = nhp->nh_handler;
	np[proto].np_m2flow = nhp->nh_m2flow;
	np[proto].np_m2cpuid = nhp->nh_m2cpuid;
	np[proto].np_drainedcpu = nhp->nh_drainedcpu;
	if (nhp->nh_qlimit == 0)
		np[proto].np_qlimit = netisr_defaultqlimit;
	else if (nhp->nh_qlimit > netisr_maxqlimit) {
		printf("%s: %s requested queue limit %u capped to "
		    "net.isr.maxqlimit %u\n", __func__, name, nhp->nh_qlimit,
		    netisr_maxqlimit);
		np[proto].np_qlimit = netisr_maxqlimit;
	} else
		np[proto].np_qlimit = nhp->nh_qlimit;
	np[proto].np_policy = nhp->nh_policy;
	for (i = 0; i <= mp_maxid; i++) {
		if (CPU_ABSENT(i))
			continue;
		npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto];
		bzero(npwp, sizeof(*npwp));
		npwp->nw_qlimit = np[proto].np_qlimit;
	}
	NETISR_WUNLOCK();
}
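
/*
 * Illustrative example (hypothetical protocol): a protocol module would
 * typically describe itself with a struct netisr_handler and register it
 * during initialization, for instance:
 *
 *	static struct netisr_handler example_nh = {
 *		.nh_name = "example",
 *		.nh_handler = example_input,
 *		.nh_proto = NETISR_EXAMPLE,
 *		.nh_policy = NETISR_POLICY_FLOW,
 *	};
 *
 *	netisr_register(&example_nh);
 *
 * Leaving nh_qlimit as 0 selects net.isr.defaultqlimit; NETISR_EXAMPLE and
 * example_input() are placeholder names, not symbols defined by netisr.
 */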

/*
 * Clear drop counters across all workstreams for a protocol.
 */
void
netisr_clearqdrops(const struct netisr_handler *nhp)
{
	struct netisr_work *npwp;
#ifdef INVARIANTS
	const char *name;
#endif
	u_int i, proto;

	proto = nhp->nh_proto;
#ifdef INVARIANTS
	name = nhp->nh_name;
#endif
	KASSERT(proto < NETISR_MAXPROT,
	    ("%s(%u): protocol too big for %s", __func__, proto, name));

	NETISR_WLOCK();
	KASSERT(np[proto].np_handler != NULL,
	    ("%s(%u): protocol not registered for %s", __func__, proto,
	    name));

	for (i = 0; i <= mp_maxid; i++) {
		if (CPU_ABSENT(i))
			continue;
		npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto];
		npwp->nw_qdrops = 0;
	}
	NETISR_WUNLOCK();
}

/*
 * Query current drop counters across all workstreams for a protocol.
 */
void
netisr_getqdrops(const struct netisr_handler *nhp, u_int64_t *qdropp)
{
	struct netisr_work *npwp;
	struct rm_priotracker tracker;
#ifdef INVARIANTS
	const char *name;
#endif
	u_int i, proto;

	*qdropp = 0;
	proto = nhp->nh_proto;
#ifdef INVARIANTS
	name = nhp->nh_name;
#endif
	KASSERT(proto < NETISR_MAXPROT,
	    ("%s(%u): protocol too big for %s", __func__, proto, name));

	NETISR_RLOCK(&tracker);
	KASSERT(np[proto].np_handler != NULL,
	    ("%s(%u): protocol not registered for %s", __func__, proto,
	    name));

	for (i = 0; i <= mp_maxid; i++) {
		if (CPU_ABSENT(i))
			continue;
		npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto];
		*qdropp += npwp->nw_qdrops;
	}
	NETISR_RUNLOCK(&tracker);
}

/*
 * Query current per-workstream queue limit for a protocol.
 */
void
netisr_getqlimit(const struct netisr_handler *nhp, u_int *qlimitp)
{
	struct rm_priotracker tracker;
#ifdef INVARIANTS
	const char *name;
#endif
	u_int proto;

	proto = nhp->nh_proto;
#ifdef INVARIANTS
	name = nhp->nh_name;
#endif
	KASSERT(proto < NETISR_MAXPROT,
	    ("%s(%u): protocol too big for %s", __func__, proto, name));

	NETISR_RLOCK(&tracker);
	KASSERT(np[proto].np_handler != NULL,
	    ("%s(%u): protocol not registered for %s", __func__, proto,
	    name));
	*qlimitp = np[proto].np_qlimit;
	NETISR_RUNLOCK(&tracker);
}

/*
 * Update the queue limit across per-workstream queues for a protocol.  We
 * simply change the limits, and don't drain overflowed packets as they will
 * (hopefully) take care of themselves shortly.
 */
int
netisr_setqlimit(const struct netisr_handler *nhp, u_int qlimit)
{
	struct netisr_work *npwp;
#ifdef INVARIANTS
	const char *name;
#endif
	u_int i, proto;

	if (qlimit > netisr_maxqlimit)
		return (EINVAL);

	proto = nhp->nh_proto;
#ifdef INVARIANTS
	name = nhp->nh_name;
#endif
	KASSERT(proto < NETISR_MAXPROT,
	    ("%s(%u): protocol too big for %s", __func__, proto, name));

	NETISR_WLOCK();
	KASSERT(np[proto].np_handler != NULL,
	    ("%s(%u): protocol not registered for %s", __func__, proto,
	    name));

	np[proto].np_qlimit = qlimit;
	for (i = 0; i <= mp_maxid; i++) {
		if (CPU_ABSENT(i))
			continue;
		npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto];
		npwp->nw_qlimit = qlimit;
	}
	NETISR_WUNLOCK();
	return (0);
}

/*
 * Drain all packets currently held in a particular protocol work queue.
 */
static void
netisr_drain_proto(struct netisr_work *npwp)
{
	struct mbuf *m;

	/*
	 * We would assert the lock on the workstream but it's not passed in.
	 */
	while ((m = npwp->nw_head) != NULL) {
		npwp->nw_head = m->m_nextpkt;
		m->m_nextpkt = NULL;
		if (npwp->nw_head == NULL)
			npwp->nw_tail = NULL;
		npwp->nw_len--;
		m_freem(m);
	}
	KASSERT(npwp->nw_tail == NULL, ("%s: tail", __func__));
	KASSERT(npwp->nw_len == 0, ("%s: len", __func__));
}

/*
 * Remove the registration of a network protocol, which requires clearing
 * per-protocol fields across all workstreams, including freeing all mbufs in
 * the queues at time of unregister.  All work in netisr is briefly suspended
 * while this takes place.
 */
void
netisr_unregister(const struct netisr_handler *nhp)
{
	struct netisr_work *npwp;
#ifdef INVARIANTS
	const char *name;
#endif
	u_int i, proto;

	proto = nhp->nh_proto;
#ifdef INVARIANTS
	name = nhp->nh_name;
#endif
	KASSERT(proto < NETISR_MAXPROT,
	    ("%s(%u): protocol too big for %s", __func__, proto, name));

	NETISR_WLOCK();
	KASSERT(np[proto].np_handler != NULL,
	    ("%s(%u): protocol not registered for %s", __func__, proto,
	    name));

	np[proto].np_name = NULL;
	np[proto].np_handler = NULL;
	np[proto].np_m2flow = NULL;
	np[proto].np_m2cpuid = NULL;
	np[proto].np_qlimit = 0;
	np[proto].np_policy = 0;
	for (i = 0; i <= mp_maxid; i++) {
		if (CPU_ABSENT(i))
			continue;
		npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto];
		netisr_drain_proto(npwp);
		bzero(npwp, sizeof(*npwp));
	}
	NETISR_WUNLOCK();
}

/*
 * Look up the workstream given a packet and source identifier.  Do this by
 * checking the protocol's policy, and optionally calling out to the protocol
 * for assistance if required.
 */
static struct mbuf *
netisr_select_cpuid(struct netisr_proto *npp, uintptr_t source,
    struct mbuf *m, u_int *cpuidp)
{
	struct ifnet *ifp;

	NETISR_LOCK_ASSERT();

	/*
	 * In the event we have only one worker, shortcut and deliver to it
	 * without further ado.
	 */
	if (nws_count == 1) {
		*cpuidp = nws_array[0];
		return (m);
	}

	/*
	 * What happens next depends on the policy selected by the protocol.
	 * If we want to support per-interface policies, we should do that
	 * here first.
	 */
	switch (npp->np_policy) {
	case NETISR_POLICY_CPU:
		return (npp->np_m2cpuid(m, source, cpuidp));

	case NETISR_POLICY_FLOW:
		if (!(m->m_flags & M_FLOWID) && npp->np_m2flow != NULL) {
			m = npp->np_m2flow(m, source);
			if (m == NULL)
				return (NULL);
		}
		if (m->m_flags & M_FLOWID) {
			*cpuidp =
			    netisr_default_flow2cpu(m->m_pkthdr.flowid);
			return (m);
		}
		/* FALLTHROUGH */

	case NETISR_POLICY_SOURCE:
		ifp = m->m_pkthdr.rcvif;
		if (ifp != NULL)
			*cpuidp = nws_array[(ifp->if_index + source) %
			    nws_count];
		else
			*cpuidp = nws_array[source % nws_count];
		return (m);

	default:
		panic("%s: invalid policy %u for %s", __func__,
		    npp->np_policy, npp->np_name);
	}
}

/*
 * Process packets associated with a workstream and protocol.  For reasons of
 * fairness, we process up to one complete netisr queue at a time, moving the
 * queue to a stack-local queue for processing, but do not loop refreshing
 * from the global queue.  The caller is responsible for deciding whether to
 * loop, and for setting the NWS_RUNNING flag.  The passed workstream will be
 * locked on entry and relocked before return, but will be released while
 * processing.  The number of packets processed is returned.
 */
static u_int
netisr_process_workstream_proto(struct netisr_workstream *nwsp, u_int proto)
{
	struct netisr_work local_npw, *npwp;
	u_int handled;
	struct mbuf *m;

	NETISR_LOCK_ASSERT();
	NWS_LOCK_ASSERT(nwsp);

	KASSERT(nwsp->nws_flags & NWS_RUNNING,
	    ("%s(%u): not running", __func__, proto));
	KASSERT(proto >= 0 && proto < NETISR_MAXPROT,
	    ("%s(%u): invalid proto\n", __func__, proto));

	npwp = &nwsp->nws_work[proto];
	if (npwp->nw_len == 0)
		return (0);

	/*
	 * Move the global work queue to a thread-local work queue.
	 *
	 * Notice that this means the effective maximum length of the queue
	 * is actually twice that of the maximum queue length specified in
	 * the protocol registration call.
	 */
	handled = npwp->nw_len;
	local_npw = *npwp;
	npwp->nw_head = NULL;
	npwp->nw_tail = NULL;
	npwp->nw_len = 0;
	nwsp->nws_pendingbits &= ~(1 << proto);
	NWS_UNLOCK(nwsp);
	while ((m = local_npw.nw_head) != NULL) {
		local_npw.nw_head = m->m_nextpkt;
		m->m_nextpkt = NULL;
		if (local_npw.nw_head == NULL)
			local_npw.nw_tail = NULL;
		local_npw.nw_len--;
		VNET_ASSERT(m->m_pkthdr.rcvif != NULL);
		CURVNET_SET(m->m_pkthdr.rcvif->if_vnet);
		np[proto].np_handler(m);
		CURVNET_RESTORE();
	}
	KASSERT(local_npw.nw_len == 0,
	    ("%s(%u): len %u", __func__, proto, local_npw.nw_len));
	if (np[proto].np_drainedcpu)
		np[proto].np_drainedcpu(nwsp->nws_cpu);
	NWS_LOCK(nwsp);
	npwp->nw_handled += handled;
	return (handled);
}

/*
 * SWI handler for netisr -- processes packets in a set of workstreams that
 * it owns, woken up by calls to NWS_SIGNAL().  If this workstream is already
 * being direct dispatched, go back to sleep and wait for the dispatching
 * thread to wake us up again.
 */
static void
swi_net(void *arg)
{
#ifdef NETISR_LOCKING
	struct rm_priotracker tracker;
#endif
	struct netisr_workstream *nwsp;
	u_int bits, prot;

	nwsp = arg;

#ifdef DEVICE_POLLING
	KASSERT(nws_count == 1,
	    ("%s: device_polling but nws_count != 1", __func__));
	netisr_poll();
#endif
#ifdef NETISR_LOCKING
	NETISR_RLOCK(&tracker);
#endif
	NWS_LOCK(nwsp);
	KASSERT(!(nwsp->nws_flags & NWS_RUNNING), ("swi_net: running"));
	if (nwsp->nws_flags & NWS_DISPATCHING)
		goto out;
	nwsp->nws_flags |= NWS_RUNNING;
	nwsp->nws_flags &= ~NWS_SCHEDULED;
	while ((bits = nwsp->nws_pendingbits) != 0) {
		while ((prot = ffs(bits)) != 0) {
			prot--;
			bits &= ~(1 << prot);
			(void)netisr_process_workstream_proto(nwsp, prot);
		}
	}
	nwsp->nws_flags &= ~NWS_RUNNING;
out:
	NWS_UNLOCK(nwsp);
#ifdef NETISR_LOCKING
	NETISR_RUNLOCK(&tracker);
#endif
#ifdef DEVICE_POLLING
	netisr_pollmore();
#endif
}

static int
netisr_queue_workstream(struct netisr_workstream *nwsp, u_int proto,
    struct netisr_work *npwp, struct mbuf *m, int *dosignalp)
{

	NWS_LOCK_ASSERT(nwsp);

	*dosignalp = 0;
	if (npwp->nw_len < npwp->nw_qlimit) {
		m->m_nextpkt = NULL;
		if (npwp->nw_head == NULL) {
			npwp->nw_head = m;
			npwp->nw_tail = m;
		} else {
			npwp->nw_tail->m_nextpkt = m;
			npwp->nw_tail = m;
		}
		npwp->nw_len++;
		if (npwp->nw_len > npwp->nw_watermark)
			npwp->nw_watermark = npwp->nw_len;

		/*
		 * We must set the bit regardless of NWS_RUNNING, so that
		 * swi_net() keeps calling netisr_process_workstream_proto().
		 */
		nwsp->nws_pendingbits |= (1 << proto);
		if (!(nwsp->nws_flags &
		    (NWS_RUNNING | NWS_DISPATCHING | NWS_SCHEDULED))) {
			nwsp->nws_flags |= NWS_SCHEDULED;
			*dosignalp = 1;	/* Defer until unlocked. */
		}
		npwp->nw_queued++;
		return (0);
	} else {
		m_freem(m);
		npwp->nw_qdrops++;
		return (ENOBUFS);
	}
}

static int
netisr_queue_internal(u_int proto, struct mbuf *m, u_int cpuid)
{
	struct netisr_workstream *nwsp;
	struct netisr_work *npwp;
	int dosignal, error;

#ifdef NETISR_LOCKING
	NETISR_LOCK_ASSERT();
#endif
	KASSERT(cpuid <= mp_maxid, ("%s: cpuid too big (%u, %u)", __func__,
	    cpuid, mp_maxid));
	KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__, cpuid));

	dosignal = 0;
	error = 0;
	nwsp = DPCPU_ID_PTR(cpuid, nws);
	npwp = &nwsp->nws_work[proto];
	NWS_LOCK(nwsp);
	error = netisr_queue_workstream(nwsp, proto, npwp, m, &dosignal);
	NWS_UNLOCK(nwsp);
	if (dosignal)
		NWS_SIGNAL(nwsp);
	return (error);
}

int
netisr_queue_src(u_int proto, uintptr_t source, struct mbuf *m)
{
#ifdef NETISR_LOCKING
	struct rm_priotracker tracker;
#endif
	u_int cpuid;
	int error;

	KASSERT(proto < NETISR_MAXPROT,
	    ("%s: invalid proto %u", __func__, proto));

#ifdef NETISR_LOCKING
	NETISR_RLOCK(&tracker);
#endif
	KASSERT(np[proto].np_handler != NULL,
	    ("%s: invalid proto %u", __func__, proto));

	m = netisr_select_cpuid(&np[proto], source, m, &cpuid);
	if (m != NULL) {
		KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__,
		    cpuid));
		error = netisr_queue_internal(proto, m, cpuid);
	} else
		error = ENOBUFS;
#ifdef NETISR_LOCKING
	NETISR_RUNLOCK(&tracker);
#endif
	return (error);
}

int
netisr_queue(u_int proto, struct mbuf *m)
{

	return (netisr_queue_src(proto, 0, m));
}

/*
 * Dispatch a packet for netisr processing; direct dispatch is permitted by
 * calling context.
 */
int
netisr_dispatch_src(u_int proto, uintptr_t source, struct mbuf *m)
{
#ifdef NETISR_LOCKING
	struct rm_priotracker tracker;
#endif
	struct netisr_workstream *nwsp;
	struct netisr_work *npwp;
	int dosignal, error;
	u_int cpuid;

	/*
	 * If direct dispatch is entirely disabled, fall back on queueing.
	 */
	if (!netisr_direct)
		return (netisr_queue_src(proto, source, m));

	KASSERT(proto < NETISR_MAXPROT,
	    ("%s: invalid proto %u", __func__, proto));
#ifdef NETISR_LOCKING
	NETISR_RLOCK(&tracker);
#endif
	KASSERT(np[proto].np_handler != NULL,
	    ("%s: invalid proto %u", __func__, proto));

	/*
	 * If direct dispatch is forced, then unconditionally dispatch
	 * without a formal CPU selection.  Borrow the current CPU's stats,
	 * even if there's no worker on it.  In this case we don't update
	 * nws_flags because all netisr processing will be source ordered due
	 * to always being forced to directly dispatch.
	 */
	if (netisr_direct_force) {
		nwsp = DPCPU_PTR(nws);
		npwp = &nwsp->nws_work[proto];
		npwp->nw_dispatched++;
		npwp->nw_handled++;
		np[proto].np_handler(m);
		error = 0;
		goto out_unlock;
	}

	/*
	 * Otherwise, we execute in a hybrid mode where we will try to direct
	 * dispatch if we're on the right CPU and the netisr worker isn't
	 * already running.
	 */
	m = netisr_select_cpuid(&np[proto], source, m, &cpuid);
	if (m == NULL) {
		error = ENOBUFS;
		goto out_unlock;
	}
	KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__, cpuid));
	sched_pin();
	if (cpuid != curcpu)
		goto queue_fallback;
	nwsp = DPCPU_PTR(nws);
	npwp = &nwsp->nws_work[proto];

	/*-
	 * We are willing to direct dispatch only if three conditions hold:
	 *
	 * (1) The netisr worker isn't already running,
	 * (2) Another thread isn't already directly dispatching, and
	 * (3) The netisr hasn't already been woken up.
	 */
	NWS_LOCK(nwsp);
	if (nwsp->nws_flags & (NWS_RUNNING | NWS_DISPATCHING | NWS_SCHEDULED)) {
		error = netisr_queue_workstream(nwsp, proto, npwp, m,
		    &dosignal);
		NWS_UNLOCK(nwsp);
		if (dosignal)
			NWS_SIGNAL(nwsp);
		goto out_unpin;
	}

	/*
	 * The current thread is now effectively the netisr worker, so set
	 * the dispatching flag to prevent concurrent processing of the
	 * stream from another thread (even the netisr worker), which could
	 * otherwise lead to effective misordering of the stream.
	 */
	nwsp->nws_flags |= NWS_DISPATCHING;
	NWS_UNLOCK(nwsp);
	np[proto].np_handler(m);
	NWS_LOCK(nwsp);
	nwsp->nws_flags &= ~NWS_DISPATCHING;
	npwp->nw_handled++;
	npwp->nw_hybrid_dispatched++;

	/*
	 * If other work was enqueued by another thread while we were direct
	 * dispatching, we need to signal the netisr worker to do that work.
	 * In the future, we might want to do some of that work in the
	 * current thread, rather than trigger further context switches.  If
	 * so, we'll want to establish a reasonable bound on the work done in
	 * the "borrowed" context.
	 */
	if (nwsp->nws_pendingbits != 0) {
		nwsp->nws_flags |= NWS_SCHEDULED;
		dosignal = 1;
	} else
		dosignal = 0;
	NWS_UNLOCK(nwsp);
	if (dosignal)
		NWS_SIGNAL(nwsp);
	error = 0;
	goto out_unpin;

queue_fallback:
	error = netisr_queue_internal(proto, m, cpuid);
out_unpin:
	sched_unpin();
out_unlock:
#ifdef NETISR_LOCKING
	NETISR_RUNLOCK(&tracker);
#endif
	return (error);
}

int
netisr_dispatch(u_int proto, struct mbuf *m)
{

	return (netisr_dispatch_src(proto, 0, m));
}

#ifdef DEVICE_POLLING
/*
 * Kernel polling borrows a netisr thread to run interface polling in; this
 * function allows kernel polling to request that the netisr thread be
 * scheduled even if no packets are pending for protocols.
 */
void
netisr_sched_poll(void)
{
	struct netisr_workstream *nwsp;

	nwsp = DPCPU_ID_PTR(nws_array[0], nws);
	NWS_SIGNAL(nwsp);
}
#endif

static void
netisr_start_swi(u_int cpuid, struct pcpu *pc)
{
	char swiname[12];
	struct netisr_workstream *nwsp;
	int error;

	KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__, cpuid));

	nwsp = DPCPU_ID_PTR(cpuid, nws);
	mtx_init(&nwsp->nws_mtx, "netisr_mtx", NULL, MTX_DEF);
	nwsp->nws_cpu = cpuid;
	snprintf(swiname, sizeof(swiname), "netisr %u", cpuid);
	error = swi_add(&nwsp->nws_intr_event, swiname, swi_net, nwsp,
	    SWI_NET, INTR_MPSAFE, &nwsp->nws_swi_cookie);
	if (error)
		panic("%s: swi_add %d", __func__, error);
	pc->pc_netisr = nwsp->nws_intr_event;
	if (netisr_bindthreads) {
		error = intr_event_bind(nwsp->nws_intr_event, cpuid);
		if (error != 0)
			printf("%s: cpu %u: intr_event_bind: %d", __func__,
			    cpuid, error);
	}
	NETISR_WLOCK();
	nws_array[nws_count] = nwsp->nws_cpu;
	nws_count++;
	NETISR_WUNLOCK();
}

/*
 * Initialize the netisr subsystem.  We rely on BSS and static initialization
 * of most fields in global data structures.
 *
 * Start a worker thread for the boot CPU so that we can support network
 * traffic immediately in case the network stack is used before additional
 * CPUs are started (for example, diskless boot).
 */
static void
netisr_init(void *arg)
{

	KASSERT(curcpu == 0, ("%s: not on CPU 0", __func__));

	NETISR_LOCK_INIT();
	if (netisr_maxthreads < 1)
		netisr_maxthreads = 1;
	if (netisr_maxthreads > mp_ncpus) {
		printf("netisr_init: forcing maxthreads from %d to %d\n",
		    netisr_maxthreads, mp_ncpus);
		netisr_maxthreads = mp_ncpus;
	}
	if (netisr_defaultqlimit > netisr_maxqlimit) {
		printf("netisr_init: forcing defaultqlimit from %d to %d\n",
		    netisr_defaultqlimit, netisr_maxqlimit);
		netisr_defaultqlimit = netisr_maxqlimit;
	}
#ifdef DEVICE_POLLING
	/*
	 * The device polling code is not yet aware of how to deal with
	 * multiple netisr threads, so for the time being compiling in device
	 * polling disables parallel netisr workers.
	 */
	if (netisr_maxthreads != 1 || netisr_bindthreads != 0) {
		printf("netisr_init: forcing maxthreads to 1 and "
		    "bindthreads to 0 for device polling\n");
		netisr_maxthreads = 1;
		netisr_bindthreads = 0;
	}
#endif

	netisr_start_swi(curcpu, pcpu_find(curcpu));
}
SYSINIT(netisr_init, SI_SUB_SOFTINTR, SI_ORDER_FIRST, netisr_init, NULL);

/*
 * Start worker threads for additional CPUs.  No attempt is made to
 * gracefully handle work reassignment; we don't yet support dynamic
 * reconfiguration.
 */
static void
netisr_start(void *arg)
{
	struct pcpu *pc;

	SLIST_FOREACH(pc, &cpuhead, pc_allcpu) {
		if (nws_count >= netisr_maxthreads)
			break;
		/* XXXRW: Is skipping absent CPUs still required here? */
		if (CPU_ABSENT(pc->pc_cpuid))
			continue;
		/* Worker will already be present for boot CPU. */
		if (pc->pc_netisr != NULL)
			continue;
		netisr_start_swi(pc->pc_cpuid, pc);
	}
}
SYSINIT(netisr_start, SI_SUB_SMP, SI_ORDER_MIDDLE, netisr_start, NULL);

#ifdef DDB
DB_SHOW_COMMAND(netisr, db_show_netisr)
{
	struct netisr_workstream *nwsp;
	struct netisr_work *nwp;
	int first, proto;
	u_int cpuid;

	db_printf("%3s %6s %5s %5s %5s %8s %8s %8s %8s\n", "CPU", "Proto",
	    "Len", "WMark", "Max", "Disp", "HDisp", "Drop", "Queue");
	for (cpuid = 0; cpuid <= mp_maxid; cpuid++) {
		if (CPU_ABSENT(cpuid))
			continue;
		nwsp = DPCPU_ID_PTR(cpuid, nws);
		if (nwsp->nws_intr_event == NULL)
			continue;
		first = 1;
		for (proto = 0; proto < NETISR_MAXPROT; proto++) {
			if (np[proto].np_handler == NULL)
				continue;
			nwp = &nwsp->nws_work[proto];
			if (first) {
				db_printf("%3d ", cpuid);
				first = 0;
			} else
				db_printf("%3s ", "");
			db_printf(
			    "%6s %5d %5d %5d %8ju %8ju %8ju %8ju\n",
			    np[proto].np_name, nwp->nw_len,
			    nwp->nw_watermark, nwp->nw_qlimit,
			    nwp->nw_dispatched, nwp->nw_hybrid_dispatched,
			    nwp->nw_qdrops, nwp->nw_queued);
		}
	}
}
#endif