/*-
 * Copyright (c) 2007-2009 Robert N. M. Watson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__FBSDID("$FreeBSD$");

/*
 * netisr is a packet dispatch service, allowing synchronous (directly
 * dispatched) and asynchronous (deferred dispatch) processing of packets by
 * registered protocol handlers.  Callers pass a protocol identifier and
 * packet to netisr, along with a direct dispatch hint, and work will either
 * be immediately processed by the registered handler, or passed to a
 * kernel software interrupt (SWI) thread for deferred dispatch.  Callers
 * will generally select one or the other based on:
 *
 * - Whether directly dispatching a netisr handler might lead to code
 *   reentrance or lock recursion, such as re-entering the socket code from
 *   within the socket code.
 * - Whether directly dispatching a netisr handler might lead to recursive
 *   processing, such as when decapsulating several wrapped layers of tunnel
 *   information (IPSEC within IPSEC within ...).
 *
 * Maintaining ordering for protocol streams is a critical design concern.
 * Enforcing ordering limits the opportunity for concurrency, but maintains
 * the strong ordering requirements found in some protocols, such as TCP.  Of
 * related concern is CPU affinity--it is desirable to process all data
 * associated with a particular stream on the same CPU over time in order to
 * avoid acquiring locks associated with the connection on different CPUs,
 * keep connection data in one cache, and to generally encourage associated
 * user threads to live on the same CPU as the stream.  It's also desirable
 * to avoid lock migration and contention where locks are associated with
 * more than one flow.
 *
 * netisr supports several policy variations, represented by the
 * NETISR_POLICY_* constants, allowing protocols to play a varying role in
 * identifying flows, assigning work to CPUs, etc.  These are described in
 * detail in netisr.h.
 */
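
/*
 * Illustrative sketch (not part of this file) of how a protocol is expected
 * to consume this interface, assuming the declarations in netisr.h and a
 * protocol number such as NETISR_IP; the handler name and policy choice
 * below are hypothetical:
 *
 *	static void
 *	example_proto_input(struct mbuf *m)
 *	{
 *		// Protocol input routine; runs either in the caller's
 *		// context (direct dispatch) or in the netisr SWI thread.
 *		...
 *	}
 *
 *	static const struct netisr_handler example_nh = {
 *		.nh_name = "example",
 *		.nh_handler = example_proto_input,
 *		.nh_proto = NETISR_IP,
 *		.nh_policy = NETISR_POLICY_FLOW,
 *		// nh_qlimit of 0 selects net.isr.defaultqlimit.
 *	};
 *
 *	netisr_register(&example_nh);
 *	...
 *	// Per packet: netisr_dispatch() when the calling context permits
 *	// direct dispatch, or netisr_queue() to force deferred processing.
 *	netisr_dispatch(NETISR_IP, m);
 */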

#include "opt_ddb.h"
#include "opt_device_polling.h"

#include <sys/param.h>
#include <sys/bus.h>
#include <sys/kernel.h>
#include <sys/kthread.h>
#include <sys/interrupt.h>
#include <sys/lock.h>
#include <sys/mbuf.h>
#include <sys/mutex.h>
#include <sys/pcpu.h>
#include <sys/proc.h>
#include <sys/rmlock.h>
#include <sys/sched.h>
#include <sys/smp.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/systm.h>

#ifdef DDB
#include <ddb/ddb.h>
#endif

#include <net/if.h>
#include <net/if_var.h>
#include <net/netisr.h>
#include <net/vnet.h>

/*-
 * Synchronize use and modification of the registered netisr data structures;
 * acquire the write lock while modifying the set of registered protocols so
 * that read-locked packet-processing paths never see partially registered or
 * unregistered protocols.
 *
 * The following data structures and fields are protected by this lock:
 *
 * - The np array, including all fields of struct netisr_proto.
 * - The nws array, including all fields of struct netisr_worker.
 * - The nws_array array.
 *
 * Note: the NETISR_LOCKING define controls whether read locks are acquired
 * in packet processing paths requiring netisr registration stability.  This
 * is disabled by default as it can lead to a measurable performance
 * degradation even with rmlocks (3%-6% for loopback ping-pong traffic), and
 * because netisr registration and unregistration is extremely rare at
 * runtime.  If it becomes more common, this decision should be revisited.
 *
 * XXXRW: rmlocks don't support assertions.
 */
static struct rmlock	netisr_rmlock;
#define	NETISR_LOCK_INIT()	rm_init_flags(&netisr_rmlock, "netisr", \
				    RM_NOWITNESS)
#define	NETISR_LOCK_ASSERT()
#define	NETISR_RLOCK(tracker)	rm_rlock(&netisr_rmlock, (tracker))
#define	NETISR_RUNLOCK(tracker)	rm_runlock(&netisr_rmlock, (tracker))
#define	NETISR_WLOCK()		rm_wlock(&netisr_rmlock)
#define	NETISR_WUNLOCK()	rm_wunlock(&netisr_rmlock)
/* #define	NETISR_LOCKING */

SYSCTL_NODE(_net, OID_AUTO, isr, CTLFLAG_RW, 0, "netisr");

/*-
 * Three direct dispatch policies are supported:
 *
 * - Always defer: all work is scheduled for a netisr, regardless of context.
 *   (!direct)
 *
 * - Hybrid: if the executing context allows direct dispatch, and we're
 *   running on the CPU the work would be done on, then direct dispatch if it
 *   wouldn't violate ordering constraints on the workstream.
 *   (direct && !direct_force)
 *
 * - Always direct: if the executing context allows direct dispatch, always
 *   direct dispatch.  (direct && direct_force)
 *
 * Notice that changing the global policy could lead to short periods of
 * misordered processing, but this is considered acceptable as compared to
 * the complexity of enforcing ordering during policy changes.
 */
static int	netisr_direct_force = 1;	/* Always direct dispatch. */
TUNABLE_INT("net.isr.direct_force", &netisr_direct_force);
SYSCTL_INT(_net_isr, OID_AUTO, direct_force, CTLFLAG_RW,
    &netisr_direct_force, 0, "Force direct dispatch");

static int	netisr_direct = 1;	/* Enable direct dispatch. */
TUNABLE_INT("net.isr.direct", &netisr_direct);
SYSCTL_INT(_net_isr, OID_AUTO, direct, CTLFLAG_RW,
    &netisr_direct, 0, "Enable direct dispatch");
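
/*
 * For reference, the three policies above correspond to the following
 * combinations of the tunables/sysctls declared here (a sketch, settable
 * from loader.conf or with sysctl(8); not normative documentation):
 *
 *	Always defer:	net.isr.direct=0
 *	Hybrid:		net.isr.direct=1, net.isr.direct_force=0
 *	Always direct:	net.isr.direct=1, net.isr.direct_force=1
 */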

/*
 * Allow the administrator to limit the number of threads (CPUs) to use for
 * netisr.  We don't check netisr_maxthreads before creating the thread for
 * CPU 0, so in practice we ignore values <= 1.  This must be set at boot.
 * We will create at most one thread per CPU.
 */
static int	netisr_maxthreads = -1;		/* Max number of threads. */
TUNABLE_INT("net.isr.maxthreads", &netisr_maxthreads);
SYSCTL_INT(_net_isr, OID_AUTO, maxthreads, CTLFLAG_RD,
    &netisr_maxthreads, 0,
    "Use at most this many CPUs for netisr processing");

static int	netisr_bindthreads = 0;		/* Bind threads to CPUs. */
TUNABLE_INT("net.isr.bindthreads", &netisr_bindthreads);
SYSCTL_INT(_net_isr, OID_AUTO, bindthreads, CTLFLAG_RD,
    &netisr_bindthreads, 0, "Bind netisr threads to CPUs.");

/*
 * Limit per-workstream queues to at most net.isr.maxqlimit, both for initial
 * configuration and later modification using netisr_setqlimit().
 */
#define	NETISR_DEFAULT_MAXQLIMIT	10240
static u_int	netisr_maxqlimit = NETISR_DEFAULT_MAXQLIMIT;
TUNABLE_INT("net.isr.maxqlimit", &netisr_maxqlimit);
SYSCTL_INT(_net_isr, OID_AUTO, maxqlimit, CTLFLAG_RD,
    &netisr_maxqlimit, 0,
    "Maximum netisr per-protocol, per-CPU queue depth.");

/*
 * The default per-workstream queue limit for protocols that don't initialize
 * the nh_qlimit field of their struct netisr_handler.  If this is set above
 * netisr_maxqlimit, we truncate it to the maximum during boot.
 */
#define	NETISR_DEFAULT_DEFAULTQLIMIT	256
static u_int	netisr_defaultqlimit = NETISR_DEFAULT_DEFAULTQLIMIT;
TUNABLE_INT("net.isr.defaultqlimit", &netisr_defaultqlimit);
SYSCTL_INT(_net_isr, OID_AUTO, defaultqlimit, CTLFLAG_RD,
    &netisr_defaultqlimit, 0,
    "Default netisr per-protocol, per-CPU queue limit if not set by protocol");

/*
 * Each protocol is described by a struct netisr_proto, which holds all
 * global per-protocol information.  This data structure is set up by
 * netisr_register(), and derived from the public struct netisr_handler.
 */
struct netisr_proto {
	const char	*np_name;	/* Character string protocol name. */
	netisr_handler_t *np_handler;	/* Protocol handler. */
	netisr_m2flow_t	*np_m2flow;	/* Query flow for untagged packet. */
	netisr_m2cpuid_t *np_m2cpuid;	/* Query CPU to process packet on. */
	netisr_drainedcpu_t *np_drainedcpu; /* Callback when drained a queue. */
	u_int		 np_qlimit;	/* Maximum per-CPU queue depth. */
	u_int		 np_policy;	/* Work placement policy. */
};

#define	NETISR_MAXPROT		16	/* Compile-time limit. */

/*
 * The np array describes all registered protocols, indexed by protocol
 * number.
 */
static struct netisr_proto	np[NETISR_MAXPROT];

/*
 * Protocol-specific work for each workstream is described by struct
 * netisr_work.  Each work descriptor consists of an mbuf queue and
 * statistics.
 */
struct netisr_work {
	/*
	 * Packet queue, linked by m_nextpkt.
	 */
	struct mbuf	*nw_head;
	struct mbuf	*nw_tail;
	u_int		 nw_len;
	u_int		 nw_qlimit;
	u_int		 nw_watermark;

	/*
	 * Statistics -- written unlocked, but mostly from curcpu.
	 */
	u_int64_t	 nw_dispatched;	/* Number of direct dispatches. */
	u_int64_t	 nw_hybrid_dispatched; /* "" hybrid dispatches. */
	u_int64_t	 nw_qdrops;	/* "" drops. */
	u_int64_t	 nw_queued;	/* "" enqueues. */
	u_int64_t	 nw_handled;	/* "" handled in worker. */
};

/*
 * Workstreams hold a set of ordered work across each protocol, and are
 * described by netisr_workstream.  Each workstream is associated with a
 * worker thread, which in turn is pinned to a CPU.  Work associated with a
 * workstream can be processed in other threads during direct dispatch;
 * concurrent processing is prevented by the NWS_RUNNING flag, which
 * indicates that a thread is already processing the work queue.
 */
struct netisr_workstream {
	struct intr_event *nws_intr_event;	/* Handler for stream. */
	void		*nws_swi_cookie;	/* swi(9) cookie for stream. */
	struct mtx	 nws_mtx;		/* Synchronize work. */
	u_int		 nws_cpu;		/* CPU pinning. */
	u_int		 nws_flags;		/* Wakeup flags. */
	u_int		 nws_pendingbits;	/* Scheduled protocols. */

	/*
	 * Each protocol has per-workstream data.
	 */
	struct netisr_work	nws_work[NETISR_MAXPROT];
} __aligned(CACHE_LINE_SIZE);

/*
 * Per-CPU workstream data.
 */
DPCPU_DEFINE(struct netisr_workstream, nws);

/*
 * Map contiguous values between 0 and nws_count into CPU IDs appropriate for
 * accessing workstreams.  This allows constructions of the form
 * DPCPU_ID_GET(nws_array[arbitraryvalue % nws_count], nws).
 */
static u_int			 nws_array[MAXCPU];

/*
 * Number of registered workstreams.  Will be at most the number of running
 * CPUs once fully started.
 */
static u_int			 nws_count;
SYSCTL_INT(_net_isr, OID_AUTO, numthreads, CTLFLAG_RD,
    &nws_count, 0, "Number of extant netisr threads.");

/*
 * Per-workstream flags.
 */
#define	NWS_RUNNING	0x00000001	/* Currently running in a thread. */
#define	NWS_DISPATCHING	0x00000002	/* Currently being direct-dispatched. */
#define	NWS_SCHEDULED	0x00000004	/* Signal issued. */

/*
 * Synchronization for each workstream: a mutex protects all mutable fields
 * in each stream, including per-protocol state (mbuf queues).  The SWI is
 * woken up if asynchronous dispatch is required.
 */
#define	NWS_LOCK(s)		mtx_lock(&(s)->nws_mtx)
#define	NWS_LOCK_ASSERT(s)	mtx_assert(&(s)->nws_mtx, MA_OWNED)
#define	NWS_UNLOCK(s)		mtx_unlock(&(s)->nws_mtx)
#define	NWS_SIGNAL(s)		swi_sched((s)->nws_swi_cookie, 0)

/*
 * Utility routines for protocols that implement their own mapping of flows
 * to CPUs.
 */
u_int
netisr_get_cpucount(void)
{

	return (nws_count);
}

u_int
netisr_get_cpuid(u_int cpunumber)
{

	KASSERT(cpunumber < nws_count, ("%s: %u > %u", __func__, cpunumber,
	    nws_count));

	return (nws_array[cpunumber]);
}

/*
 * The default implementation of flow -> CPU ID mapping.
 *
 * Non-static so that protocols can use it to map their own work to specific
 * CPUs in a manner consistent with netisr for affinity purposes.
 */
u_int
netisr_default_flow2cpu(u_int flowid)
{

	return (nws_array[flowid % nws_count]);
}
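
/*
 * Illustrative sketch (not part of this file) of a NETISR_POLICY_CPU
 * protocol supplying its own nh_m2cpuid callback using the utility routines
 * above; example_hash() is hypothetical and the callback signature is
 * assumed to match netisr_m2cpuid_t in netisr.h:
 *
 *	static struct mbuf *
 *	example_m2cpuid(struct mbuf *m, uintptr_t source, u_int *cpuid)
 *	{
 *
 *		// Spread work across the extant workers, translating a
 *		// worker index into a CPU ID exactly as netisr itself would.
 *		*cpuid = netisr_get_cpuid(example_hash(m, source) %
 *		    netisr_get_cpucount());
 *		return (m);
 *	}
 */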

/*
 * Register a new netisr handler, which requires initializing per-protocol
 * fields for each workstream.  All netisr work is briefly suspended while
 * the protocol is installed.
 */
void
netisr_register(const struct netisr_handler *nhp)
{
	struct netisr_work *npwp;
	const char *name;
	u_int i, proto;

	proto = nhp->nh_proto;
	name = nhp->nh_name;

	/*
	 * Test that the requested registration is valid.
	 */
	KASSERT(nhp->nh_name != NULL,
	    ("%s: nh_name NULL for %u", __func__, proto));
	KASSERT(nhp->nh_handler != NULL,
	    ("%s: nh_handler NULL for %s", __func__, name));
	KASSERT(nhp->nh_policy == NETISR_POLICY_SOURCE ||
	    nhp->nh_policy == NETISR_POLICY_FLOW ||
	    nhp->nh_policy == NETISR_POLICY_CPU,
	    ("%s: unsupported nh_policy %u for %s", __func__,
	    nhp->nh_policy, name));
	KASSERT(nhp->nh_policy == NETISR_POLICY_FLOW ||
	    nhp->nh_m2flow == NULL,
	    ("%s: nh_policy != FLOW but m2flow defined for %s", __func__,
	    name));
	KASSERT(nhp->nh_policy == NETISR_POLICY_CPU || nhp->nh_m2cpuid == NULL,
	    ("%s: nh_policy != CPU but m2cpuid defined for %s", __func__,
	    name));
	KASSERT(nhp->nh_policy != NETISR_POLICY_CPU || nhp->nh_m2cpuid != NULL,
	    ("%s: nh_policy == CPU but m2cpuid not defined for %s", __func__,
	    name));
	KASSERT(proto < NETISR_MAXPROT,
	    ("%s(%u, %s): protocol too big", __func__, proto, name));

	/*
	 * Test that no existing registration exists for this protocol.
	 */
	NETISR_WLOCK();
	KASSERT(np[proto].np_name == NULL,
	    ("%s(%u, %s): name present", __func__, proto, name));
	KASSERT(np[proto].np_handler == NULL,
	    ("%s(%u, %s): handler present", __func__, proto, name));

	np[proto].np_name = name;
	np[proto].np_handler = nhp->nh_handler;
	np[proto].np_m2flow = nhp->nh_m2flow;
	np[proto].np_m2cpuid = nhp->nh_m2cpuid;
	np[proto].np_drainedcpu = nhp->nh_drainedcpu;
	if (nhp->nh_qlimit == 0)
		np[proto].np_qlimit = netisr_defaultqlimit;
	else if (nhp->nh_qlimit > netisr_maxqlimit) {
		printf("%s: %s requested queue limit %u capped to "
		    "net.isr.maxqlimit %u\n", __func__, name, nhp->nh_qlimit,
		    netisr_maxqlimit);
		np[proto].np_qlimit = netisr_maxqlimit;
	} else
		np[proto].np_qlimit = nhp->nh_qlimit;
	np[proto].np_policy = nhp->nh_policy;
	for (i = 0; i <= mp_maxid; i++) {
		if (CPU_ABSENT(i))
			continue;
		npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto];
		bzero(npwp, sizeof(*npwp));
		npwp->nw_qlimit = np[proto].np_qlimit;
	}
	NETISR_WUNLOCK();
}

/*
 * Clear drop counters across all workstreams for a protocol.
 */
void
netisr_clearqdrops(const struct netisr_handler *nhp)
{
	struct netisr_work *npwp;
#ifdef INVARIANTS
	const char *name;
#endif
	u_int i, proto;

	proto = nhp->nh_proto;
#ifdef INVARIANTS
	name = nhp->nh_name;
#endif
	KASSERT(proto < NETISR_MAXPROT,
	    ("%s(%u): protocol too big for %s", __func__, proto, name));

	NETISR_WLOCK();
	KASSERT(np[proto].np_handler != NULL,
	    ("%s(%u): protocol not registered for %s", __func__, proto,
	    name));

	for (i = 0; i <= mp_maxid; i++) {
		if (CPU_ABSENT(i))
			continue;
		npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto];
		npwp->nw_qdrops = 0;
	}
	NETISR_WUNLOCK();
}

/*
 * Query the current drop counters across all workstreams for a protocol.
 */
void
netisr_getqdrops(const struct netisr_handler *nhp, u_int64_t *qdropp)
{
	struct netisr_work *npwp;
	struct rm_priotracker tracker;
#ifdef INVARIANTS
	const char *name;
#endif
	u_int i, proto;

	*qdropp = 0;
	proto = nhp->nh_proto;
#ifdef INVARIANTS
	name = nhp->nh_name;
#endif
	KASSERT(proto < NETISR_MAXPROT,
	    ("%s(%u): protocol too big for %s", __func__, proto, name));

	NETISR_RLOCK(&tracker);
	KASSERT(np[proto].np_handler != NULL,
	    ("%s(%u): protocol not registered for %s", __func__, proto,
	    name));

	for (i = 0; i <= mp_maxid; i++) {
		if (CPU_ABSENT(i))
			continue;
		npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto];
		*qdropp += npwp->nw_qdrops;
	}
	NETISR_RUNLOCK(&tracker);
}

/*
 * Query the current queue limit for per-workstream queues for a protocol.
 */
void
netisr_getqlimit(const struct netisr_handler *nhp, u_int *qlimitp)
{
	struct rm_priotracker tracker;
#ifdef INVARIANTS
	const char *name;
#endif
	u_int proto;

	proto = nhp->nh_proto;
#ifdef INVARIANTS
	name = nhp->nh_name;
#endif
	KASSERT(proto < NETISR_MAXPROT,
	    ("%s(%u): protocol too big for %s", __func__, proto, name));

	NETISR_RLOCK(&tracker);
	KASSERT(np[proto].np_handler != NULL,
	    ("%s(%u): protocol not registered for %s", __func__, proto,
	    name));
	*qlimitp = np[proto].np_qlimit;
	NETISR_RUNLOCK(&tracker);
}

/*
 * Update the queue limit across per-workstream queues for a protocol.  We
 * simply change the limits, and don't drain overflowed packets as they will
 * (hopefully) take care of themselves shortly.
 */
int
netisr_setqlimit(const struct netisr_handler *nhp, u_int qlimit)
{
	struct netisr_work *npwp;
#ifdef INVARIANTS
	const char *name;
#endif
	u_int i, proto;

	if (qlimit > netisr_maxqlimit)
		return (EINVAL);

	proto = nhp->nh_proto;
#ifdef INVARIANTS
	name = nhp->nh_name;
#endif
	KASSERT(proto < NETISR_MAXPROT,
	    ("%s(%u): protocol too big for %s", __func__, proto, name));

	NETISR_WLOCK();
	KASSERT(np[proto].np_handler != NULL,
	    ("%s(%u): protocol not registered for %s", __func__, proto,
	    name));

	np[proto].np_qlimit = qlimit;
	for (i = 0; i <= mp_maxid; i++) {
		if (CPU_ABSENT(i))
			continue;
		npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto];
		npwp->nw_qlimit = qlimit;
	}
	NETISR_WUNLOCK();
	return (0);
}

/*
 * Drain all packets currently held in a particular protocol work queue.
 */
static void
netisr_drain_proto(struct netisr_work *npwp)
{
	struct mbuf *m;

	/*
	 * We would assert the lock on the workstream but it's not passed in.
	 */
	while ((m = npwp->nw_head) != NULL) {
		npwp->nw_head = m->m_nextpkt;
		m->m_nextpkt = NULL;
		if (npwp->nw_head == NULL)
			npwp->nw_tail = NULL;
		npwp->nw_len--;
		m_freem(m);
	}
	KASSERT(npwp->nw_tail == NULL, ("%s: tail", __func__));
	KASSERT(npwp->nw_len == 0, ("%s: len", __func__));
}
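
/*
 * Illustrative sketch (not part of this file): a protocol that wants deeper
 * per-CPU queues than its registration requested can raise the limit later,
 * subject to net.isr.maxqlimit; "example_nh" is hypothetical:
 *
 *	u_int qlimit;
 *
 *	netisr_getqlimit(&example_nh, &qlimit);
 *	if (qlimit < 2048 && netisr_setqlimit(&example_nh, 2048) != 0)
 *		printf("example: could not raise netisr queue limit\n");
 */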

/*
 * Remove the registration of a network protocol, which requires clearing
 * per-protocol fields across all workstreams, including freeing all mbufs in
 * the queues at time of unregister.  All work in netisr is briefly suspended
 * while this takes place.
 */
void
netisr_unregister(const struct netisr_handler *nhp)
{
	struct netisr_work *npwp;
#ifdef INVARIANTS
	const char *name;
#endif
	u_int i, proto;

	proto = nhp->nh_proto;
#ifdef INVARIANTS
	name = nhp->nh_name;
#endif
	KASSERT(proto < NETISR_MAXPROT,
	    ("%s(%u): protocol too big for %s", __func__, proto, name));

	NETISR_WLOCK();
	KASSERT(np[proto].np_handler != NULL,
	    ("%s(%u): protocol not registered for %s", __func__, proto,
	    name));

	np[proto].np_name = NULL;
	np[proto].np_handler = NULL;
	np[proto].np_m2flow = NULL;
	np[proto].np_m2cpuid = NULL;
	np[proto].np_qlimit = 0;
	np[proto].np_policy = 0;
	for (i = 0; i <= mp_maxid; i++) {
		if (CPU_ABSENT(i))
			continue;
		npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto];
		netisr_drain_proto(npwp);
		bzero(npwp, sizeof(*npwp));
	}
	NETISR_WUNLOCK();
}

/*
 * Look up the workstream given a packet and source identifier.  Do this by
 * checking the protocol's policy, and optionally calling out to the protocol
 * for assistance if required.
 */
static struct mbuf *
netisr_select_cpuid(struct netisr_proto *npp, uintptr_t source,
    struct mbuf *m, u_int *cpuidp)
{
	struct ifnet *ifp;

	NETISR_LOCK_ASSERT();

	/*
	 * In the event we have only one worker, shortcut and deliver to it
	 * without further ado.
	 */
	if (nws_count == 1) {
		*cpuidp = nws_array[0];
		return (m);
	}

	/*
	 * What happens next depends on the policy selected by the protocol.
	 * If we want to support per-interface policies, we should do that
	 * here first.
	 */
	switch (npp->np_policy) {
	case NETISR_POLICY_CPU:
		return (npp->np_m2cpuid(m, source, cpuidp));

	case NETISR_POLICY_FLOW:
		if (!(m->m_flags & M_FLOWID) && npp->np_m2flow != NULL) {
			m = npp->np_m2flow(m, source);
			if (m == NULL)
				return (NULL);
		}
		if (m->m_flags & M_FLOWID) {
			*cpuidp =
			    netisr_default_flow2cpu(m->m_pkthdr.flowid);
			return (m);
		}
		/* FALLTHROUGH */

	case NETISR_POLICY_SOURCE:
		ifp = m->m_pkthdr.rcvif;
		if (ifp != NULL)
			*cpuidp = nws_array[(ifp->if_index + source) %
			    nws_count];
		else
			*cpuidp = nws_array[source % nws_count];
		return (m);

	default:
		panic("%s: invalid policy %u for %s", __func__,
		    npp->np_policy, npp->np_name);
	}
}
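
/*
 * Illustrative sketch (not part of this file) of the nh_m2flow callback used
 * by the NETISR_POLICY_FLOW case above for packets arriving without M_FLOWID
 * set; example_flow_hash() is hypothetical and the signature is assumed to
 * match netisr_m2flow_t in netisr.h:
 *
 *	static struct mbuf *
 *	example_m2flow(struct mbuf *m, uintptr_t source)
 *	{
 *
 *		// Parse the headers and derive a flow ID; on failure, free
 *		// the chain and return NULL so the caller reports ENOBUFS.
 *		if (!example_flow_hash(m, &m->m_pkthdr.flowid)) {
 *			m_freem(m);
 *			return (NULL);
 *		}
 *		m->m_flags |= M_FLOWID;
 *		return (m);
 *	}
 */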

/*
 * Process packets associated with a workstream and protocol.  For reasons of
 * fairness, we process up to one complete netisr queue at a time, moving the
 * queue to a stack-local queue for processing, but do not loop refreshing
 * from the global queue.  The caller is responsible for deciding whether to
 * loop, and for setting the NWS_RUNNING flag.  The passed workstream will be
 * locked on entry and relocked before return, but will be released while
 * processing.  The number of packets processed is returned.
 */
static u_int
netisr_process_workstream_proto(struct netisr_workstream *nwsp, u_int proto)
{
	struct netisr_work local_npw, *npwp;
	u_int handled;
	struct mbuf *m;

	NETISR_LOCK_ASSERT();
	NWS_LOCK_ASSERT(nwsp);

	KASSERT(nwsp->nws_flags & NWS_RUNNING,
	    ("%s(%u): not running", __func__, proto));
	KASSERT(proto >= 0 && proto < NETISR_MAXPROT,
	    ("%s(%u): invalid proto\n", __func__, proto));

	npwp = &nwsp->nws_work[proto];
	if (npwp->nw_len == 0)
		return (0);

	/*
	 * Move the global work queue to a thread-local work queue.
	 *
	 * Notice that this means the effective maximum length of the queue
	 * is actually twice that of the maximum queue length specified in
	 * the protocol registration call.
	 */
	handled = npwp->nw_len;
	local_npw = *npwp;
	npwp->nw_head = NULL;
	npwp->nw_tail = NULL;
	npwp->nw_len = 0;
	nwsp->nws_pendingbits &= ~(1 << proto);
	NWS_UNLOCK(nwsp);
	while ((m = local_npw.nw_head) != NULL) {
		local_npw.nw_head = m->m_nextpkt;
		m->m_nextpkt = NULL;
		if (local_npw.nw_head == NULL)
			local_npw.nw_tail = NULL;
		local_npw.nw_len--;
		VNET_ASSERT(m->m_pkthdr.rcvif != NULL);
		CURVNET_SET(m->m_pkthdr.rcvif->if_vnet);
		np[proto].np_handler(m);
		CURVNET_RESTORE();
	}
	KASSERT(local_npw.nw_len == 0,
	    ("%s(%u): len %u", __func__, proto, local_npw.nw_len));
	if (np[proto].np_drainedcpu)
		np[proto].np_drainedcpu(nwsp->nws_cpu);
	NWS_LOCK(nwsp);
	npwp->nw_handled += handled;
	return (handled);
}

/*
 * SWI handler for netisr -- processes packets in a set of workstreams that
 * it owns, woken up by calls to NWS_SIGNAL().  If this workstream is already
 * being direct dispatched, go back to sleep and wait for the dispatching
 * thread to wake us up again.
 */
static void
swi_net(void *arg)
{
#ifdef NETISR_LOCKING
	struct rm_priotracker tracker;
#endif
	struct netisr_workstream *nwsp;
	u_int bits, prot;

	nwsp = arg;

#ifdef DEVICE_POLLING
	KASSERT(nws_count == 1,
	    ("%s: device_polling but nws_count != 1", __func__));
	netisr_poll();
#endif
#ifdef NETISR_LOCKING
	NETISR_RLOCK(&tracker);
#endif
	NWS_LOCK(nwsp);
	KASSERT(!(nwsp->nws_flags & NWS_RUNNING), ("swi_net: running"));
	if (nwsp->nws_flags & NWS_DISPATCHING)
		goto out;
	nwsp->nws_flags |= NWS_RUNNING;
	nwsp->nws_flags &= ~NWS_SCHEDULED;
	while ((bits = nwsp->nws_pendingbits) != 0) {
		while ((prot = ffs(bits)) != 0) {
			prot--;
			bits &= ~(1 << prot);
			(void)netisr_process_workstream_proto(nwsp, prot);
		}
	}
	nwsp->nws_flags &= ~NWS_RUNNING;
out:
	NWS_UNLOCK(nwsp);
#ifdef NETISR_LOCKING
	NETISR_RUNLOCK(&tracker);
#endif
#ifdef DEVICE_POLLING
	netisr_pollmore();
#endif
}

static int
netisr_queue_workstream(struct netisr_workstream *nwsp, u_int proto,
    struct netisr_work *npwp, struct mbuf *m, int *dosignalp)
{

	NWS_LOCK_ASSERT(nwsp);

	*dosignalp = 0;
	if (npwp->nw_len < npwp->nw_qlimit) {
		m->m_nextpkt = NULL;
		if (npwp->nw_head == NULL) {
			npwp->nw_head = m;
			npwp->nw_tail = m;
		} else {
			npwp->nw_tail->m_nextpkt = m;
			npwp->nw_tail = m;
		}
		npwp->nw_len++;
		if (npwp->nw_len > npwp->nw_watermark)
			npwp->nw_watermark = npwp->nw_len;
		nwsp->nws_pendingbits |= (1 << proto);
		if (!(nwsp->nws_flags &
		    (NWS_RUNNING | NWS_DISPATCHING | NWS_SCHEDULED))) {
			nwsp->nws_flags |= NWS_SCHEDULED;
			*dosignalp = 1;	/* Defer until unlocked. */
		}
		npwp->nw_queued++;
		return (0);
	} else {
		m_freem(m);
		npwp->nw_qdrops++;
		return (ENOBUFS);
	}
}

static int
netisr_queue_internal(u_int proto, struct mbuf *m, u_int cpuid)
{
	struct netisr_workstream *nwsp;
	struct netisr_work *npwp;
	int dosignal, error;

#ifdef NETISR_LOCKING
	NETISR_LOCK_ASSERT();
#endif
	KASSERT(cpuid <= mp_maxid, ("%s: cpuid too big (%u, %u)", __func__,
	    cpuid, mp_maxid));
	KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__, cpuid));

	dosignal = 0;
	error = 0;
	nwsp = DPCPU_ID_PTR(cpuid, nws);
	npwp = &nwsp->nws_work[proto];
	NWS_LOCK(nwsp);
	error = netisr_queue_workstream(nwsp, proto, npwp, m, &dosignal);
	NWS_UNLOCK(nwsp);
	if (dosignal)
		NWS_SIGNAL(nwsp);
	return (error);
}

int
netisr_queue_src(u_int proto, uintptr_t source, struct mbuf *m)
{
#ifdef NETISR_LOCKING
	struct rm_priotracker tracker;
#endif
	u_int cpuid;
	int error;

	KASSERT(proto < NETISR_MAXPROT,
	    ("%s: invalid proto %u", __func__, proto));

#ifdef NETISR_LOCKING
	NETISR_RLOCK(&tracker);
#endif
	KASSERT(np[proto].np_handler != NULL,
	    ("%s: invalid proto %u", __func__, proto));

	m = netisr_select_cpuid(&np[proto], source, m, &cpuid);
	if (m != NULL) {
		KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__,
		    cpuid));
		error = netisr_queue_internal(proto, m, cpuid);
	} else
		error = ENOBUFS;
#ifdef NETISR_LOCKING
	NETISR_RUNLOCK(&tracker);
#endif
	return (error);
}

int
netisr_queue(u_int proto, struct mbuf *m)
{

	return (netisr_queue_src(proto, 0, m));
}
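
/*
 * Illustrative sketch (not part of this file): a decapsulating handler that
 * wants to avoid recursing into itself (see the tunnel example in the header
 * comment) re-queues the inner packet rather than dispatching it directly;
 * NETISR_IP is assumed to come from netisr.h and tunnel_drops is
 * hypothetical:
 *
 *	// Inside a hypothetical tunnel input path, after stripping the
 *	// outer header from 'm':
 *	if (netisr_queue(NETISR_IP, m) != 0) {
 *		// On ENOBUFS the mbuf has already been freed for us.
 *		tunnel_drops++;
 *	}
 */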

/*
 * Dispatch a packet for netisr processing; direct dispatch is permitted by
 * calling context.
 */
int
netisr_dispatch_src(u_int proto, uintptr_t source, struct mbuf *m)
{
#ifdef NETISR_LOCKING
	struct rm_priotracker tracker;
#endif
	struct netisr_workstream *nwsp;
	struct netisr_work *npwp;
	int dosignal, error;
	u_int cpuid;

	/*
	 * If direct dispatch is entirely disabled, fall back on queueing.
	 */
	if (!netisr_direct)
		return (netisr_queue_src(proto, source, m));

	KASSERT(proto < NETISR_MAXPROT,
	    ("%s: invalid proto %u", __func__, proto));
#ifdef NETISR_LOCKING
	NETISR_RLOCK(&tracker);
#endif
	KASSERT(np[proto].np_handler != NULL,
	    ("%s: invalid proto %u", __func__, proto));

	/*
	 * If direct dispatch is forced, then unconditionally dispatch
	 * without a formal CPU selection.  Borrow the current CPU's stats,
	 * even if there's no worker on it.  In this case we don't update
	 * nws_flags because all netisr processing will be source ordered due
	 * to always being forced to directly dispatch.
	 */
	if (netisr_direct_force) {
		nwsp = DPCPU_PTR(nws);
		npwp = &nwsp->nws_work[proto];
		npwp->nw_dispatched++;
		npwp->nw_handled++;
		np[proto].np_handler(m);
		error = 0;
		goto out_unlock;
	}

	/*
	 * Otherwise, we execute in a hybrid mode where we will try to direct
	 * dispatch if we're on the right CPU and the netisr worker isn't
	 * already running.
	 */
	m = netisr_select_cpuid(&np[proto], source, m, &cpuid);
	if (m == NULL) {
		error = ENOBUFS;
		goto out_unlock;
	}
	KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__, cpuid));
	sched_pin();
	if (cpuid != curcpu)
		goto queue_fallback;
	nwsp = DPCPU_PTR(nws);
	npwp = &nwsp->nws_work[proto];

	/*-
	 * We are willing to direct dispatch only if three conditions hold:
	 *
	 * (1) The netisr worker isn't already running,
	 * (2) Another thread isn't already directly dispatching, and
	 * (3) The netisr hasn't already been woken up.
	 */
	NWS_LOCK(nwsp);
	if (nwsp->nws_flags & (NWS_RUNNING | NWS_DISPATCHING | NWS_SCHEDULED)) {
		error = netisr_queue_workstream(nwsp, proto, npwp, m,
		    &dosignal);
		NWS_UNLOCK(nwsp);
		if (dosignal)
			NWS_SIGNAL(nwsp);
		goto out_unpin;
	}

	/*
	 * The current thread is now effectively the netisr worker, so set
	 * the dispatching flag to prevent concurrent processing of the
	 * stream from another thread (even the netisr worker), which could
	 * otherwise lead to effective misordering of the stream.
	 */
	nwsp->nws_flags |= NWS_DISPATCHING;
	NWS_UNLOCK(nwsp);
	np[proto].np_handler(m);
	NWS_LOCK(nwsp);
	nwsp->nws_flags &= ~NWS_DISPATCHING;
	npwp->nw_handled++;
	npwp->nw_hybrid_dispatched++;

	/*
	 * If other work was enqueued by another thread while we were direct
	 * dispatching, we need to signal the netisr worker to do that work.
	 * In the future, we might want to do some of that work in the
	 * current thread, rather than trigger further context switches.  If
	 * so, we'll want to establish a reasonable bound on the work done in
	 * the "borrowed" context.
	 */
	if (nwsp->nws_pendingbits != 0) {
		nwsp->nws_flags |= NWS_SCHEDULED;
		dosignal = 1;
	} else
		dosignal = 0;
	NWS_UNLOCK(nwsp);
	if (dosignal)
		NWS_SIGNAL(nwsp);
	error = 0;
	goto out_unpin;

queue_fallback:
	error = netisr_queue_internal(proto, m, cpuid);
out_unpin:
	sched_unpin();
out_unlock:
#ifdef NETISR_LOCKING
	NETISR_RUNLOCK(&tracker);
#endif
	return (error);
}

int
netisr_dispatch(u_int proto, struct mbuf *m)
{

	return (netisr_dispatch_src(proto, 0, m));
}

#ifdef DEVICE_POLLING
/*
 * Kernel polling borrows a netisr thread to run interface polling in; this
 * function allows kernel polling to request that the netisr thread be
 * scheduled even if no packets are pending for protocols.
 */
void
netisr_sched_poll(void)
{
	struct netisr_workstream *nwsp;

	nwsp = DPCPU_ID_PTR(nws_array[0], nws);
	NWS_SIGNAL(nwsp);
}
#endif

static void
netisr_start_swi(u_int cpuid, struct pcpu *pc)
{
	char swiname[12];
	struct netisr_workstream *nwsp;
	int error;

	KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__, cpuid));

	nwsp = DPCPU_ID_PTR(cpuid, nws);
	mtx_init(&nwsp->nws_mtx, "netisr_mtx", NULL, MTX_DEF);
	nwsp->nws_cpu = cpuid;
	snprintf(swiname, sizeof(swiname), "netisr %u", cpuid);
	error = swi_add(&nwsp->nws_intr_event, swiname, swi_net, nwsp,
	    SWI_NET, INTR_MPSAFE, &nwsp->nws_swi_cookie);
	if (error)
		panic("%s: swi_add %d", __func__, error);
	pc->pc_netisr = nwsp->nws_intr_event;
	if (netisr_bindthreads) {
		error = intr_event_bind(nwsp->nws_intr_event, cpuid);
		if (error != 0)
			printf("%s: cpu %u: intr_event_bind: %d", __func__,
			    cpuid, error);
	}
	NETISR_WLOCK();
	nws_array[nws_count] = nwsp->nws_cpu;
	nws_count++;
	NETISR_WUNLOCK();
}

/*
 * Initialize the netisr subsystem.  We rely on BSS and static initialization
 * of most fields in global data structures.
 *
 * Start a worker thread for the boot CPU so that we can support network
 * traffic immediately in case the network stack is used before additional
 * CPUs are started (for example, diskless boot).
 */
static void
netisr_init(void *arg)
{

	KASSERT(curcpu == 0, ("%s: not on CPU 0", __func__));

	NETISR_LOCK_INIT();
	if (netisr_maxthreads < 1)
		netisr_maxthreads = 1;
	if (netisr_maxthreads > mp_ncpus) {
		printf("netisr2: forcing maxthreads from %d to %d\n",
		    netisr_maxthreads, mp_ncpus);
		netisr_maxthreads = mp_ncpus;
	}
	if (netisr_defaultqlimit > netisr_maxqlimit) {
		printf("netisr2: forcing defaultqlimit from %d to %d\n",
		    netisr_defaultqlimit, netisr_maxqlimit);
		netisr_defaultqlimit = netisr_maxqlimit;
	}
#ifdef DEVICE_POLLING
	/*
	 * The device polling code is not yet aware of how to deal with
	 * multiple netisr threads, so for the time being compiling in device
	 * polling disables parallel netisr workers.
	 */
	if (netisr_maxthreads != 1 || netisr_bindthreads != 0) {
		printf("netisr2: forcing maxthreads to 1 and bindthreads to "
		    "0 for device polling\n");
		netisr_maxthreads = 1;
		netisr_bindthreads = 0;
	}
#endif

	netisr_start_swi(curcpu, pcpu_find(curcpu));
}
SYSINIT(netisr_init, SI_SUB_SOFTINTR, SI_ORDER_FIRST, netisr_init, NULL);

/*
 * Start worker threads for additional CPUs.  No attempt is made to
 * gracefully handle work reassignment, as we don't yet support dynamic
 * reconfiguration.
 */
static void
netisr_start(void *arg)
{
	struct pcpu *pc;

	SLIST_FOREACH(pc, &cpuhead, pc_allcpu) {
		if (nws_count >= netisr_maxthreads)
			break;
		/* XXXRW: Is skipping absent CPUs still required here? */
		if (CPU_ABSENT(pc->pc_cpuid))
			continue;
		/* Worker will already be present for boot CPU. */
		if (pc->pc_netisr != NULL)
			continue;
		netisr_start_swi(pc->pc_cpuid, pc);
	}
}
SYSINIT(netisr_start, SI_SUB_SMP, SI_ORDER_MIDDLE, netisr_start, NULL);

#ifdef DDB
DB_SHOW_COMMAND(netisr, db_show_netisr)
{
	struct netisr_workstream *nwsp;
	struct netisr_work *nwp;
	int first, proto;
	u_int cpuid;

	db_printf("%3s %6s %5s %5s %5s %8s %8s %8s %8s\n", "CPU", "Proto",
	    "Len", "WMark", "Max", "Disp", "HDisp", "Drop", "Queue");
	for (cpuid = 0; cpuid <= mp_maxid; cpuid++) {
		if (CPU_ABSENT(cpuid))
			continue;
		nwsp = DPCPU_ID_PTR(cpuid, nws);
		if (nwsp->nws_intr_event == NULL)
			continue;
		first = 1;
		for (proto = 0; proto < NETISR_MAXPROT; proto++) {
			if (np[proto].np_handler == NULL)
				continue;
			nwp = &nwsp->nws_work[proto];
			if (first) {
				db_printf("%3d ", cpuid);
				first = 0;
			} else
				db_printf("%3s ", "");
			db_printf(
			    "%6s %5d %5d %5d %8ju %8ju %8ju %8ju\n",
			    np[proto].np_name, nwp->nw_len,
			    nwp->nw_watermark, nwp->nw_qlimit,
			    nwp->nw_dispatched, nwp->nw_hybrid_dispatched,
			    nwp->nw_qdrops, nwp->nw_queued);
		}
	}
}
#endif