1 /*- 2 * Copyright (c) 2007-2009 Robert N. M. Watson 3 * All rights reserved. 4 * 5 * Redistribution and use in source and binary forms, with or without 6 * modification, are permitted provided that the following conditions 7 * are met: 8 * 1. Redistributions of source code must retain the above copyright 9 * notice, this list of conditions and the following disclaimer. 10 * 2. Redistributions in binary form must reproduce the above copyright 11 * notice, this list of conditions and the following disclaimer in the 12 * documentation and/or other materials provided with the distribution. 13 * 14 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND 15 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 16 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 17 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE 18 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 19 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 20 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 21 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 22 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 23 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 24 * SUCH DAMAGE. 25 */ 26 27 #include <sys/cdefs.h> 28 __FBSDID("$FreeBSD$"); 29 30 /* 31 * netisr is a packet dispatch service, allowing synchronous (directly 32 * dispatched) and asynchronous (deferred dispatch) processing of packets by 33 * registered protocol handlers. Callers pass a protocol identifier and 34 * packet to netisr, along with a direct dispatch hint, and work will either 35 * be immediately processed with the registered handler, or passed to a 36 * kernel software interrupt (SWI) thread for deferred dispatch. 
Callers 37 * will generally select one or the other based on: 38 * 39 * - Might directly dispatching a netisr handler lead to code reentrance or 40 * lock recursion, such as entering the socket code from the socket code. 41 * - Might directly dispatching a netisr handler lead to recursive 42 * processing, such as when decapsulating several wrapped layers of tunnel 43 * information (IPSEC within IPSEC within ...). 44 * 45 * Maintaining ordering for protocol streams is a critical design concern. 46 * Enforcing ordering limits the opportunity for concurrency, but maintains 47 * the strong ordering requirements found in some protocols, such as TCP. Of 48 * related concern is CPU affinity--it is desirable to process all data 49 * associated with a particular stream on the same CPU over time in order to 50 * avoid acquiring locks associated with the connection on different CPUs, 51 * keep connection data in one cache, and to generally encourage associated 52 * user threads to live on the same CPU as the stream. It's also desirable 53 * to avoid lock migration and contention where locks are associated with 54 * more than one flow. 55 * 56 * netisr supports several policy variations, represented by the 57 * NETISR_POLICY_* constants, allowing protocols to play a varying role in 58 * identifying flows, assigning work to CPUs, etc. These are described in 59 * detail in netisr.h. 
60 */ 61 62 #include "opt_ddb.h" 63 #include "opt_device_polling.h" 64 65 #include <sys/param.h> 66 #include <sys/bus.h> 67 #include <sys/kernel.h> 68 #include <sys/kthread.h> 69 #include <sys/interrupt.h> 70 #include <sys/lock.h> 71 #include <sys/mbuf.h> 72 #include <sys/mutex.h> 73 #include <sys/pcpu.h> 74 #include <sys/proc.h> 75 #include <sys/rmlock.h> 76 #include <sys/sched.h> 77 #include <sys/smp.h> 78 #include <sys/socket.h> 79 #include <sys/sysctl.h> 80 #include <sys/systm.h> 81 #include <sys/vimage.h> 82 83 #ifdef DDB 84 #include <ddb/ddb.h> 85 #endif 86 87 #include <net/if.h> 88 #include <net/if_var.h> 89 #include <net/netisr.h> 90 91 /*- 92 * Synchronize use and modification of the registered netisr data structures; 93 * acquire a write lock while modifying the set of registered protocols to 94 * prevent partially registered or unregistered protocols from being run. 95 * 96 * The following data structures and fields are protected by this lock: 97 * 98 * - The np array, including all fields of struct netisr_proto. 99 * - The nws array, including all fields of struct netisr_workstream. 100 * - The nws_array array. 101 * 102 * Note: the NETISR_LOCKING define controls whether read locks are acquired 103 * in packet processing paths requiring netisr registration stability. This 104 * is disabled by default as it can lead to a measurable performance 105 * degradation even with rmlocks (3%-6% for loopback ping-pong traffic), and 106 * because netisr registration and unregistration is extremely rare at 107 * runtime. If it becomes more common, this decision should be revisited. 108 * 109 * XXXRW: rmlocks don't support assertions.
110 */ 111 static struct rmlock netisr_rmlock; 112 #define NETISR_LOCK_INIT() rm_init_flags(&netisr_rmlock, "netisr", \ 113 RM_NOWITNESS) 114 #define NETISR_LOCK_ASSERT() 115 #define NETISR_RLOCK(tracker) rm_rlock(&netisr_rmlock, (tracker)) 116 #define NETISR_RUNLOCK(tracker) rm_runlock(&netisr_rmlock, (tracker)) 117 #define NETISR_WLOCK() rm_wlock(&netisr_rmlock) 118 #define NETISR_WUNLOCK() rm_wunlock(&netisr_rmlock) 119 /* #define NETISR_LOCKING */ 120 121 SYSCTL_NODE(_net, OID_AUTO, isr, CTLFLAG_RW, 0, "netisr"); 122 123 /*- 124 * Three direct dispatch policies are supported: 125 * 126 * - Always defer: all work is scheduled for a netisr, regardless of context. 127 * (!direct) 128 * 129 * - Hybrid: if the executing context allows direct dispatch, and we're 130 * running on the CPU the work would be done on, then direct dispatch if it 131 * wouldn't violate ordering constraints on the workstream. 132 * (direct && !direct_force) 133 * 134 * - Always direct: if the executing context allows direct dispatch, always 135 * direct dispatch. (direct && direct_force) 136 * 137 * Notice that changing the global policy could lead to short periods of 138 * misordered processing, but this is considered acceptable as compared to 139 * the complexity of enforcing ordering during policy changes. 140 */ 141 static int netisr_direct_force = 1; /* Always direct dispatch. */ 142 TUNABLE_INT("net.isr.direct_force", &netisr_direct_force); 143 SYSCTL_INT(_net_isr, OID_AUTO, direct_force, CTLFLAG_RW, 144 &netisr_direct_force, 0, "Force direct dispatch"); 145 146 static int netisr_direct = 1; /* Enable direct dispatch. */ 147 TUNABLE_INT("net.isr.direct", &netisr_direct); 148 SYSCTL_INT(_net_isr, OID_AUTO, direct, CTLFLAG_RW, 149 &netisr_direct, 0, "Enable direct dispatch"); 150 151 /* 152 * Allow the administrator to limit the number of threads (CPUs) to use for 153 * netisr. 
We don't check netisr_maxthreads before creating the thread for 154 * CPU 0, so in practice we ignore values <= 1. This must be set at boot. 155 * We will create at most one thread per CPU. 156 */ 157 static int netisr_maxthreads = -1; /* Max number of threads. */ 158 TUNABLE_INT("net.isr.maxthreads", &netisr_maxthreads); 159 SYSCTL_INT(_net_isr, OID_AUTO, maxthreads, CTLFLAG_RD, 160 &netisr_maxthreads, 0, 161 "Use at most this many CPUs for netisr processing"); 162 163 static int netisr_bindthreads = 0; /* Bind threads to CPUs. */ 164 TUNABLE_INT("net.isr.bindthreads", &netisr_bindthreads); 165 SYSCTL_INT(_net_isr, OID_AUTO, bindthreads, CTLFLAG_RD, 166 &netisr_bindthreads, 0, "Bind netisr threads to CPUs."); 167 168 /* 169 * Limit per-workstream queues to at most net.isr.maxqlimit, both for initial 170 * configuration and later modification using netisr_setqlimit(). 171 */ 172 #define NETISR_DEFAULT_MAXQLIMIT 10240 173 static u_int netisr_maxqlimit = NETISR_DEFAULT_MAXQLIMIT; 174 TUNABLE_INT("net.isr.maxqlimit", &netisr_maxqlimit); 175 SYSCTL_INT(_net_isr, OID_AUTO, maxqlimit, CTLFLAG_RD, 176 &netisr_maxqlimit, 0, 177 "Maximum netisr per-protocol, per-CPU queue depth."); 178 179 /* 180 * The default per-workstream queue limit for protocols that don't initialize 181 * the nh_qlimit field of their struct netisr_handler. If this is set above 182 * netisr_maxqlimit, we truncate it to the maximum during boot. 183 */ 184 #define NETISR_DEFAULT_DEFAULTQLIMIT 256 185 static u_int netisr_defaultqlimit = NETISR_DEFAULT_DEFAULTQLIMIT; 186 TUNABLE_INT("net.isr.defaultqlimit", &netisr_defaultqlimit); 187 SYSCTL_INT(_net_isr, OID_AUTO, defaultqlimit, CTLFLAG_RD, 188 &netisr_defaultqlimit, 0, 189 "Default netisr per-protocol, per-CPU queue limit if not set by protocol"); 190 191 /* 192 * Each protocol is described by a struct netisr_proto, which holds all 193 * global per-protocol information. 
This data structure is set up by 194 * netisr_register(), and derived from the public struct netisr_handler. 195 */ 196 struct netisr_proto { 197 const char *np_name; /* Character string protocol name. */ 198 netisr_handler_t *np_handler; /* Protocol handler. */ 199 netisr_m2flow_t *np_m2flow; /* Query flow for untagged packet. */ 200 netisr_m2cpuid_t *np_m2cpuid; /* Query CPU to process packet on. */ 201 netisr_drainedcpu_t *np_drainedcpu; /* Callback when drained a queue. */ 202 u_int np_qlimit; /* Maximum per-CPU queue depth. */ 203 u_int np_policy; /* Work placement policy. */ 204 }; 205 206 #define NETISR_MAXPROT 16 /* Compile-time limit. */ 207 208 /* 209 * The np array describes all registered protocols, indexed by protocol 210 * number. 211 */ 212 static struct netisr_proto np[NETISR_MAXPROT]; 213 214 /* 215 * Protocol-specific work for each workstream is described by struct 216 * netisr_work. Each work descriptor consists of an mbuf queue and 217 * statistics. 218 */ 219 struct netisr_work { 220 /* 221 * Packet queue, linked by m_nextpkt. 222 */ 223 struct mbuf *nw_head; 224 struct mbuf *nw_tail; 225 u_int nw_len; 226 u_int nw_qlimit; 227 u_int nw_watermark; 228 229 /* 230 * Statistics -- written unlocked, but mostly from curcpu. 231 */ 232 u_int64_t nw_dispatched; /* Number of direct dispatches. */ 233 u_int64_t nw_hybrid_dispatched; /* "" hybrid dispatches. */ 234 u_int64_t nw_qdrops; /* "" drops. */ 235 u_int64_t nw_queued; /* "" enqueues. */ 236 u_int64_t nw_handled; /* "" handled in worker. */ 237 }; 238 239 /* 240 * Workstreams hold a set of ordered work across each protocol, and are 241 * described by netisr_workstream. Each workstream is associated with a 242 * worker thread, which in turn is pinned to a CPU. Work associated with a 243 * workstream can be processd in other threads during direct dispatch; 244 * concurrent processing is prevented by the NWS_RUNNING flag, which 245 * indicates that a thread is already processing the work queue. 
246 */ 247 struct netisr_workstream { 248 struct intr_event *nws_intr_event; /* Handler for stream. */ 249 void *nws_swi_cookie; /* swi(9) cookie for stream. */ 250 struct mtx nws_mtx; /* Synchronize work. */ 251 u_int nws_cpu; /* CPU pinning. */ 252 u_int nws_flags; /* Wakeup flags. */ 253 u_int nws_pendingbits; /* Scheduled protocols. */ 254 255 /* 256 * Each protocol has per-workstream data. 257 */ 258 struct netisr_work nws_work[NETISR_MAXPROT]; 259 } __aligned(CACHE_LINE_SIZE); 260 261 /* 262 * Per-CPU workstream data. 263 */ 264 DPCPU_DEFINE(struct netisr_workstream, nws); 265 266 /* 267 * Map contiguous values between 0 and nws_count into CPU IDs appropriate for 268 * accessing workstreams. This allows constructions of the form 269 * DPCPU_ID_GET(nws_array[arbitraryvalue % nws_count], nws). 270 */ 271 static u_int nws_array[MAXCPU]; 272 273 /* 274 * Number of registered workstreams. Will be at most the number of running 275 * CPUs once fully started. 276 */ 277 static u_int nws_count; 278 SYSCTL_INT(_net_isr, OID_AUTO, numthreads, CTLFLAG_RD, 279 &nws_count, 0, "Number of extant netisr threads."); 280 281 /* 282 * Per-workstream flags. 283 */ 284 #define NWS_RUNNING 0x00000001 /* Currently running in a thread. */ 285 #define NWS_DISPATCHING 0x00000002 /* Currently being direct-dispatched. */ 286 #define NWS_SCHEDULED 0x00000004 /* Signal issued. */ 287 288 /* 289 * Synchronization for each workstream: a mutex protects all mutable fields 290 * in each stream, including per-protocol state (mbuf queues). The SWI is 291 * woken up if asynchronous dispatch is required. 292 */ 293 #define NWS_LOCK(s) mtx_lock(&(s)->nws_mtx) 294 #define NWS_LOCK_ASSERT(s) mtx_assert(&(s)->nws_mtx, MA_OWNED) 295 #define NWS_UNLOCK(s) mtx_unlock(&(s)->nws_mtx) 296 #define NWS_SIGNAL(s) swi_sched((s)->nws_swi_cookie, 0) 297 298 /* 299 * Utility routines for protocols that implement their own mapping of flows 300 * to CPUs. 
301 */ 302 u_int 303 netisr_get_cpucount(void) 304 { 305 306 return (nws_count); 307 } 308 309 u_int 310 netisr_get_cpuid(u_int cpunumber) 311 { 312 313 KASSERT(cpunumber < nws_count, ("%s: %u > %u", __func__, cpunumber, 314 nws_count)); 315 316 return (nws_array[cpunumber]); 317 } 318 319 /* 320 * The default implementation of -> CPU ID mapping. 321 * 322 * Non-static so that protocols can use it to map their own work to specific 323 * CPUs in a manner consistent to netisr for affinity purposes. 324 */ 325 u_int 326 netisr_default_flow2cpu(u_int flowid) 327 { 328 329 return (nws_array[flowid % nws_count]); 330 } 331 332 /* 333 * Register a new netisr handler, which requires initializing per-protocol 334 * fields for each workstream. All netisr work is briefly suspended while 335 * the protocol is installed. 336 */ 337 void 338 netisr_register(const struct netisr_handler *nhp) 339 { 340 struct netisr_work *npwp; 341 const char *name; 342 u_int i, proto; 343 344 proto = nhp->nh_proto; 345 name = nhp->nh_name; 346 347 /* 348 * Test that the requested registration is valid. 
349 */ 350 KASSERT(nhp->nh_name != NULL, 351 ("%s: nh_name NULL for %u", __func__, proto)); 352 KASSERT(nhp->nh_handler != NULL, 353 ("%s: nh_handler NULL for %s", __func__, name)); 354 KASSERT(nhp->nh_policy == NETISR_POLICY_SOURCE || 355 nhp->nh_policy == NETISR_POLICY_FLOW || 356 nhp->nh_policy == NETISR_POLICY_CPU, 357 ("%s: unsupported nh_policy %u for %s", __func__, 358 nhp->nh_policy, name)); 359 KASSERT(nhp->nh_policy == NETISR_POLICY_FLOW || 360 nhp->nh_m2flow == NULL, 361 ("%s: nh_policy != FLOW but m2flow defined for %s", __func__, 362 name)); 363 KASSERT(nhp->nh_policy == NETISR_POLICY_CPU || nhp->nh_m2cpuid == NULL, 364 ("%s: nh_policy != CPU but m2cpuid defined for %s", __func__, 365 name)); 366 KASSERT(nhp->nh_policy != NETISR_POLICY_CPU || nhp->nh_m2cpuid != NULL, 367 ("%s: nh_policy == CPU but m2cpuid not defined for %s", __func__, 368 name)); 369 KASSERT(proto < NETISR_MAXPROT, 370 ("%s(%u, %s): protocol too big", __func__, proto, name)); 371 372 /* 373 * Test that no existing registration exists for this protocol. 
374 */ 375 NETISR_WLOCK(); 376 KASSERT(np[proto].np_name == NULL, 377 ("%s(%u, %s): name present", __func__, proto, name)); 378 KASSERT(np[proto].np_handler == NULL, 379 ("%s(%u, %s): handler present", __func__, proto, name)); 380 381 np[proto].np_name = name; 382 np[proto].np_handler = nhp->nh_handler; 383 np[proto].np_m2flow = nhp->nh_m2flow; 384 np[proto].np_m2cpuid = nhp->nh_m2cpuid; 385 np[proto].np_drainedcpu = nhp->nh_drainedcpu; 386 if (nhp->nh_qlimit == 0) 387 np[proto].np_qlimit = netisr_defaultqlimit; 388 else if (nhp->nh_qlimit > netisr_maxqlimit) { 389 printf("%s: %s requested queue limit %u capped to " 390 "net.isr.maxqlimit %u\n", __func__, name, nhp->nh_qlimit, 391 netisr_maxqlimit); 392 np[proto].np_qlimit = netisr_maxqlimit; 393 } else 394 np[proto].np_qlimit = nhp->nh_qlimit; 395 np[proto].np_policy = nhp->nh_policy; 396 for (i = 0; i <= mp_maxid; i++) { 397 if (CPU_ABSENT(i)) 398 continue; 399 npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto]; 400 bzero(npwp, sizeof(*npwp)); 401 npwp->nw_qlimit = np[proto].np_qlimit; 402 } 403 NETISR_WUNLOCK(); 404 } 405 406 /* 407 * Clear drop counters across all workstreams for a protocol. 408 */ 409 void 410 netisr_clearqdrops(const struct netisr_handler *nhp) 411 { 412 struct netisr_work *npwp; 413 #ifdef INVARIANTS 414 const char *name; 415 #endif 416 u_int i, proto; 417 418 proto = nhp->nh_proto; 419 #ifdef INVARIANTS 420 name = nhp->nh_name; 421 #endif 422 KASSERT(proto < NETISR_MAXPROT, 423 ("%s(%u): protocol too big for %s", __func__, proto, name)); 424 425 NETISR_WLOCK(); 426 KASSERT(np[proto].np_handler != NULL, 427 ("%s(%u): protocol not registered for %s", __func__, proto, 428 name)); 429 430 for (i = 0; i <= mp_maxid; i++) { 431 if (CPU_ABSENT(i)) 432 continue; 433 npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto]; 434 npwp->nw_qdrops = 0; 435 } 436 NETISR_WUNLOCK(); 437 } 438 439 /* 440 * Query the current drop counters across all workstreams for a protocol. 
441 */ 442 void 443 netisr_getqdrops(const struct netisr_handler *nhp, u_int64_t *qdropp) 444 { 445 struct netisr_work *npwp; 446 struct rm_priotracker tracker; 447 #ifdef INVARIANTS 448 const char *name; 449 #endif 450 u_int i, proto; 451 452 *qdropp = 0; 453 proto = nhp->nh_proto; 454 #ifdef INVARIANTS 455 name = nhp->nh_name; 456 #endif 457 KASSERT(proto < NETISR_MAXPROT, 458 ("%s(%u): protocol too big for %s", __func__, proto, name)); 459 460 NETISR_RLOCK(&tracker); 461 KASSERT(np[proto].np_handler != NULL, 462 ("%s(%u): protocol not registered for %s", __func__, proto, 463 name)); 464 465 for (i = 0; i <= mp_maxid; i++) { 466 if (CPU_ABSENT(i)) 467 continue; 468 npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto]; 469 *qdropp += npwp->nw_qdrops; 470 } 471 NETISR_RUNLOCK(&tracker); 472 } 473 474 /* 475 * Query the current queue limit for per-workstream queues for a protocol. 476 */ 477 void 478 netisr_getqlimit(const struct netisr_handler *nhp, u_int *qlimitp) 479 { 480 struct rm_priotracker tracker; 481 #ifdef INVARIANTS 482 const char *name; 483 #endif 484 u_int proto; 485 486 proto = nhp->nh_proto; 487 #ifdef INVARIANTS 488 name = nhp->nh_name; 489 #endif 490 KASSERT(proto < NETISR_MAXPROT, 491 ("%s(%u): protocol too big for %s", __func__, proto, name)); 492 493 NETISR_RLOCK(&tracker); 494 KASSERT(np[proto].np_handler != NULL, 495 ("%s(%u): protocol not registered for %s", __func__, proto, 496 name)); 497 *qlimitp = np[proto].np_qlimit; 498 NETISR_RUNLOCK(&tracker); 499 } 500 501 /* 502 * Update the queue limit across per-workstream queues for a protocol. We 503 * simply change the limits, and don't drain overflowed packets as they will 504 * (hopefully) take care of themselves shortly. 
505 */ 506 int 507 netisr_setqlimit(const struct netisr_handler *nhp, u_int qlimit) 508 { 509 struct netisr_work *npwp; 510 #ifdef INVARIANTS 511 const char *name; 512 #endif 513 u_int i, proto; 514 515 if (qlimit > netisr_maxqlimit) 516 return (EINVAL); 517 518 proto = nhp->nh_proto; 519 #ifdef INVARIANTS 520 name = nhp->nh_name; 521 #endif 522 KASSERT(proto < NETISR_MAXPROT, 523 ("%s(%u): protocol too big for %s", __func__, proto, name)); 524 525 NETISR_WLOCK(); 526 KASSERT(np[proto].np_handler != NULL, 527 ("%s(%u): protocol not registered for %s", __func__, proto, 528 name)); 529 530 np[proto].np_qlimit = qlimit; 531 for (i = 0; i <= mp_maxid; i++) { 532 if (CPU_ABSENT(i)) 533 continue; 534 npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto]; 535 npwp->nw_qlimit = qlimit; 536 } 537 NETISR_WUNLOCK(); 538 return (0); 539 } 540 541 /* 542 * Drain all packets currently held in a particular protocol work queue. 543 */ 544 static void 545 netisr_drain_proto(struct netisr_work *npwp) 546 { 547 struct mbuf *m; 548 549 /* 550 * We would assert the lock on the workstream but it's not passed in. 551 */ 552 while ((m = npwp->nw_head) != NULL) { 553 npwp->nw_head = m->m_nextpkt; 554 m->m_nextpkt = NULL; 555 if (npwp->nw_head == NULL) 556 npwp->nw_tail = NULL; 557 npwp->nw_len--; 558 m_freem(m); 559 } 560 KASSERT(npwp->nw_tail == NULL, ("%s: tail", __func__)); 561 KASSERT(npwp->nw_len == 0, ("%s: len", __func__)); 562 } 563 564 /* 565 * Remove the registration of a network protocol, which requires clearing 566 * per-protocol fields across all workstreams, including freeing all mbufs in 567 * the queues at time of unregister. All work in netisr is briefly suspended 568 * while this takes place. 
569 */ 570 void 571 netisr_unregister(const struct netisr_handler *nhp) 572 { 573 struct netisr_work *npwp; 574 #ifdef INVARIANTS 575 const char *name; 576 #endif 577 u_int i, proto; 578 579 proto = nhp->nh_proto; 580 #ifdef INVARIANTS 581 name = nhp->nh_name; 582 #endif 583 KASSERT(proto < NETISR_MAXPROT, 584 ("%s(%u): protocol too big for %s", __func__, proto, name)); 585 586 NETISR_WLOCK(); 587 KASSERT(np[proto].np_handler != NULL, 588 ("%s(%u): protocol not registered for %s", __func__, proto, 589 name)); 590 591 np[proto].np_name = NULL; 592 np[proto].np_handler = NULL; 593 np[proto].np_m2flow = NULL; 594 np[proto].np_m2cpuid = NULL; 595 np[proto].np_qlimit = 0; 596 np[proto].np_policy = 0; 597 for (i = 0; i <= mp_maxid; i++) { 598 if (CPU_ABSENT(i)) 599 continue; 600 npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto]; 601 netisr_drain_proto(npwp); 602 bzero(npwp, sizeof(*npwp)); 603 } 604 NETISR_WUNLOCK(); 605 } 606 607 /* 608 * Look up the workstream given a packet and source identifier. Do this by 609 * checking the protocol's policy, and optionally call out to the protocol 610 * for assistance if required. 611 */ 612 static struct mbuf * 613 netisr_select_cpuid(struct netisr_proto *npp, uintptr_t source, 614 struct mbuf *m, u_int *cpuidp) 615 { 616 struct ifnet *ifp; 617 618 NETISR_LOCK_ASSERT(); 619 620 /* 621 * In the event we have only one worker, shortcut and deliver to it 622 * without further ado. 623 */ 624 if (nws_count == 1) { 625 *cpuidp = nws_array[0]; 626 return (m); 627 } 628 629 /* 630 * What happens next depends on the policy selected by the protocol. 631 * If we want to support per-interface policies, we should do that 632 * here first. 
633 */ 634 switch (npp->np_policy) { 635 case NETISR_POLICY_CPU: 636 return (npp->np_m2cpuid(m, source, cpuidp)); 637 638 case NETISR_POLICY_FLOW: 639 if (!(m->m_flags & M_FLOWID) && npp->np_m2flow != NULL) { 640 m = npp->np_m2flow(m, source); 641 if (m == NULL) 642 return (NULL); 643 } 644 if (m->m_flags & M_FLOWID) { 645 *cpuidp = 646 netisr_default_flow2cpu(m->m_pkthdr.flowid); 647 return (m); 648 } 649 /* FALLTHROUGH */ 650 651 case NETISR_POLICY_SOURCE: 652 ifp = m->m_pkthdr.rcvif; 653 if (ifp != NULL) 654 *cpuidp = nws_array[(ifp->if_index + source) % 655 nws_count]; 656 else 657 *cpuidp = nws_array[source % nws_count]; 658 return (m); 659 660 default: 661 panic("%s: invalid policy %u for %s", __func__, 662 npp->np_policy, npp->np_name); 663 } 664 } 665 666 /* 667 * Process packets associated with a workstream and protocol. For reasons of 668 * fairness, we process up to one complete netisr queue at a time, moving the 669 * queue to a stack-local queue for processing, but do not loop refreshing 670 * from the global queue. The caller is responsible for deciding whether to 671 * loop, and for setting the NWS_RUNNING flag. The passed workstream will be 672 * locked on entry and relocked before return, but will be released while 673 * processing. The number of packets processed is returned. 674 */ 675 static u_int 676 netisr_process_workstream_proto(struct netisr_workstream *nwsp, u_int proto) 677 { 678 struct netisr_work local_npw, *npwp; 679 u_int handled; 680 struct mbuf *m; 681 682 NETISR_LOCK_ASSERT(); 683 NWS_LOCK_ASSERT(nwsp); 684 685 KASSERT(nwsp->nws_flags & NWS_RUNNING, 686 ("%s(%u): not running", __func__, proto)); 687 KASSERT(proto >= 0 && proto < NETISR_MAXPROT, 688 ("%s(%u): invalid proto\n", __func__, proto)); 689 690 npwp = &nwsp->nws_work[proto]; 691 if (npwp->nw_len == 0) 692 return (0); 693 694 /* 695 * Move the global work queue to a thread-local work queue. 
696 * 697 * Notice that this means the effective maximum length of the queue 698 * is actually twice that of the maximum queue length specified in 699 * the protocol registration call. 700 */ 701 handled = npwp->nw_len; 702 local_npw = *npwp; 703 npwp->nw_head = NULL; 704 npwp->nw_tail = NULL; 705 npwp->nw_len = 0; 706 nwsp->nws_pendingbits &= ~(1 << proto); 707 NWS_UNLOCK(nwsp); 708 while ((m = local_npw.nw_head) != NULL) { 709 local_npw.nw_head = m->m_nextpkt; 710 m->m_nextpkt = NULL; 711 if (local_npw.nw_head == NULL) 712 local_npw.nw_tail = NULL; 713 local_npw.nw_len--; 714 VNET_ASSERT(m->m_pkthdr.rcvif != NULL); 715 CURVNET_SET(m->m_pkthdr.rcvif->if_vnet); 716 np[proto].np_handler(m); 717 CURVNET_RESTORE(); 718 } 719 KASSERT(local_npw.nw_len == 0, 720 ("%s(%u): len %u", __func__, proto, local_npw.nw_len)); 721 if (np[proto].np_drainedcpu) 722 np[proto].np_drainedcpu(nwsp->nws_cpu); 723 NWS_LOCK(nwsp); 724 npwp->nw_handled += handled; 725 return (handled); 726 } 727 728 /* 729 * SWI handler for netisr -- processes prackets in a set of workstreams that 730 * it owns, woken up by calls to NWS_SIGNAL(). If this workstream is already 731 * being direct dispatched, go back to sleep and wait for the dispatching 732 * thread to wake us up again. 
733 */ 734 static void 735 swi_net(void *arg) 736 { 737 #ifdef NETISR_LOCKING 738 struct rm_priotracker tracker; 739 #endif 740 struct netisr_workstream *nwsp; 741 u_int bits, prot; 742 743 nwsp = arg; 744 745 #ifdef DEVICE_POLLING 746 KASSERT(nws_count == 1, 747 ("%s: device_polling but nws_count != 1", __func__)); 748 netisr_poll(); 749 #endif 750 #ifdef NETISR_LOCKING 751 NETISR_RLOCK(&tracker); 752 #endif 753 NWS_LOCK(nwsp); 754 KASSERT(!(nwsp->nws_flags & NWS_RUNNING), ("swi_net: running")); 755 if (nwsp->nws_flags & NWS_DISPATCHING) 756 goto out; 757 nwsp->nws_flags |= NWS_RUNNING; 758 nwsp->nws_flags &= ~NWS_SCHEDULED; 759 while ((bits = nwsp->nws_pendingbits) != 0) { 760 while ((prot = ffs(bits)) != 0) { 761 prot--; 762 bits &= ~(1 << prot); 763 (void)netisr_process_workstream_proto(nwsp, prot); 764 } 765 } 766 nwsp->nws_flags &= ~NWS_RUNNING; 767 out: 768 NWS_UNLOCK(nwsp); 769 #ifdef NETISR_LOCKING 770 NETISR_RUNLOCK(&tracker); 771 #endif 772 #ifdef DEVICE_POLLING 773 netisr_pollmore(); 774 #endif 775 } 776 777 static int 778 netisr_queue_workstream(struct netisr_workstream *nwsp, u_int proto, 779 struct netisr_work *npwp, struct mbuf *m, int *dosignalp) 780 { 781 782 NWS_LOCK_ASSERT(nwsp); 783 784 *dosignalp = 0; 785 if (npwp->nw_len < npwp->nw_qlimit) { 786 m->m_nextpkt = NULL; 787 if (npwp->nw_head == NULL) { 788 npwp->nw_head = m; 789 npwp->nw_tail = m; 790 } else { 791 npwp->nw_tail->m_nextpkt = m; 792 npwp->nw_tail = m; 793 } 794 npwp->nw_len++; 795 if (npwp->nw_len > npwp->nw_watermark) 796 npwp->nw_watermark = npwp->nw_len; 797 nwsp->nws_pendingbits |= (1 << proto); 798 if (!(nwsp->nws_flags & 799 (NWS_RUNNING | NWS_DISPATCHING | NWS_SCHEDULED))) { 800 nwsp->nws_flags |= NWS_SCHEDULED; 801 *dosignalp = 1; /* Defer until unlocked. 
*/ 802 } 803 npwp->nw_queued++; 804 return (0); 805 } else { 806 npwp->nw_qdrops++; 807 return (ENOBUFS); 808 } 809 } 810 811 static int 812 netisr_queue_internal(u_int proto, struct mbuf *m, u_int cpuid) 813 { 814 struct netisr_workstream *nwsp; 815 struct netisr_work *npwp; 816 int dosignal, error; 817 818 #ifdef NETISR_LOCKING 819 NETISR_LOCK_ASSERT(); 820 #endif 821 KASSERT(cpuid <= mp_maxid, ("%s: cpuid too big (%u, %u)", __func__, 822 cpuid, mp_maxid)); 823 KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__, cpuid)); 824 825 dosignal = 0; 826 error = 0; 827 nwsp = DPCPU_ID_PTR(cpuid, nws); 828 npwp = &nwsp->nws_work[proto]; 829 NWS_LOCK(nwsp); 830 error = netisr_queue_workstream(nwsp, proto, npwp, m, &dosignal); 831 NWS_UNLOCK(nwsp); 832 if (dosignal) 833 NWS_SIGNAL(nwsp); 834 return (error); 835 } 836 837 int 838 netisr_queue_src(u_int proto, uintptr_t source, struct mbuf *m) 839 { 840 #ifdef NETISR_LOCKING 841 struct rm_priotracker tracker; 842 #endif 843 u_int cpuid; 844 int error; 845 846 KASSERT(proto < NETISR_MAXPROT, 847 ("%s: invalid proto %u", __func__, proto)); 848 849 #ifdef NETISR_LOCKING 850 NETISR_RLOCK(&tracker); 851 #endif 852 KASSERT(np[proto].np_handler != NULL, 853 ("%s: invalid proto %u", __func__, proto)); 854 855 m = netisr_select_cpuid(&np[proto], source, m, &cpuid); 856 if (m != NULL) { 857 KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__, 858 cpuid)); 859 error = netisr_queue_internal(proto, m, cpuid); 860 } else 861 error = ENOBUFS; 862 #ifdef NETISR_LOCKING 863 NETISR_RUNLOCK(&tracker); 864 #endif 865 return (error); 866 } 867 868 int 869 netisr_queue(u_int proto, struct mbuf *m) 870 { 871 872 return (netisr_queue_src(proto, 0, m)); 873 } 874 875 /* 876 * Dispatch a packet for netisr processing, direct dispatch permitted by 877 * calling context. 
878 */ 879 int 880 netisr_dispatch_src(u_int proto, uintptr_t source, struct mbuf *m) 881 { 882 #ifdef NETISR_LOCKING 883 struct rm_priotracker tracker; 884 #endif 885 struct netisr_workstream *nwsp; 886 struct netisr_work *npwp; 887 int dosignal, error; 888 u_int cpuid; 889 890 /* 891 * If direct dispatch is entirely disabled, fall back on queueing. 892 */ 893 if (!netisr_direct) 894 return (netisr_queue_src(proto, source, m)); 895 896 KASSERT(proto < NETISR_MAXPROT, 897 ("%s: invalid proto %u", __func__, proto)); 898 #ifdef NETISR_LOCKING 899 NETISR_RLOCK(&tracker); 900 #endif 901 KASSERT(np[proto].np_handler != NULL, 902 ("%s: invalid proto %u", __func__, proto)); 903 904 /* 905 * If direct dispatch is forced, then unconditionally dispatch 906 * without a formal CPU selection. Borrow the current CPU's stats, 907 * even if there's no worker on it. In this case we don't update 908 * nws_flags because all netisr processing will be source ordered due 909 * to always being forced to directly dispatch. 910 */ 911 if (netisr_direct_force) { 912 nwsp = DPCPU_PTR(nws); 913 npwp = &nwsp->nws_work[proto]; 914 npwp->nw_dispatched++; 915 npwp->nw_handled++; 916 np[proto].np_handler(m); 917 error = 0; 918 goto out_unlock; 919 } 920 921 /* 922 * Otherwise, we execute in a hybrid mode where we will try to direct 923 * dispatch if we're on the right CPU and the netisr worker isn't 924 * already running. 
925 */ 926 m = netisr_select_cpuid(&np[proto], source, m, &cpuid); 927 if (m == NULL) { 928 error = ENOBUFS; 929 goto out_unlock; 930 } 931 KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__, cpuid)); 932 sched_pin(); 933 if (cpuid != curcpu) 934 goto queue_fallback; 935 nwsp = DPCPU_PTR(nws); 936 npwp = &nwsp->nws_work[proto]; 937 938 /*- 939 * We are willing to direct dispatch only if three conditions hold: 940 * 941 * (1) The netisr worker isn't already running, 942 * (2) Another thread isn't already directly dispatching, and 943 * (3) The netisr hasn't already been woken up. 944 */ 945 NWS_LOCK(nwsp); 946 if (nwsp->nws_flags & (NWS_RUNNING | NWS_DISPATCHING | NWS_SCHEDULED)) { 947 error = netisr_queue_workstream(nwsp, proto, npwp, m, 948 &dosignal); 949 NWS_UNLOCK(nwsp); 950 if (dosignal) 951 NWS_SIGNAL(nwsp); 952 goto out_unpin; 953 } 954 955 /* 956 * The current thread is now effectively the netisr worker, so set 957 * the dispatching flag to prevent concurrent processing of the 958 * stream from another thread (even the netisr worker), which could 959 * otherwise lead to effective misordering of the stream. 960 */ 961 nwsp->nws_flags |= NWS_DISPATCHING; 962 NWS_UNLOCK(nwsp); 963 np[proto].np_handler(m); 964 NWS_LOCK(nwsp); 965 nwsp->nws_flags &= ~NWS_DISPATCHING; 966 npwp->nw_handled++; 967 npwp->nw_hybrid_dispatched++; 968 969 /* 970 * If other work was enqueued by another thread while we were direct 971 * dispatching, we need to signal the netisr worker to do that work. 972 * In the future, we might want to do some of that work in the 973 * current thread, rather than trigger further context switches. If 974 * so, we'll want to establish a reasonable bound on the work done in 975 * the "borrowed" context. 
976 */ 977 if (nwsp->nws_pendingbits != 0) { 978 nwsp->nws_flags |= NWS_SCHEDULED; 979 dosignal = 1; 980 } else 981 dosignal = 0; 982 NWS_UNLOCK(nwsp); 983 if (dosignal) 984 NWS_SIGNAL(nwsp); 985 error = 0; 986 goto out_unpin; 987 988 queue_fallback: 989 error = netisr_queue_internal(proto, m, cpuid); 990 out_unpin: 991 sched_unpin(); 992 out_unlock: 993 #ifdef NETISR_LOCKING 994 NETISR_RUNLOCK(&tracker); 995 #endif 996 return (error); 997 } 998 999 int 1000 netisr_dispatch(u_int proto, struct mbuf *m) 1001 { 1002 1003 return (netisr_dispatch_src(proto, 0, m)); 1004 } 1005 1006 #ifdef DEVICE_POLLING 1007 /* 1008 * Kernel polling borrows a netisr thread to run interface polling in; this 1009 * function allows kernel polling to request that the netisr thread be 1010 * scheduled even if no packets are pending for protocols. 1011 */ 1012 void 1013 netisr_sched_poll(void) 1014 { 1015 struct netisr_workstream *nwsp; 1016 1017 nwsp = DPCPU_ID_PTR(nws_array[0], nws); 1018 NWS_SIGNAL(nwsp); 1019 } 1020 #endif 1021 1022 static void 1023 netisr_start_swi(u_int cpuid, struct pcpu *pc) 1024 { 1025 char swiname[12]; 1026 struct netisr_workstream *nwsp; 1027 int error; 1028 1029 KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__, cpuid)); 1030 1031 nwsp = DPCPU_ID_PTR(cpuid, nws); 1032 mtx_init(&nwsp->nws_mtx, "netisr_mtx", NULL, MTX_DEF); 1033 nwsp->nws_cpu = cpuid; 1034 snprintf(swiname, sizeof(swiname), "netisr %u", cpuid); 1035 error = swi_add(&nwsp->nws_intr_event, swiname, swi_net, nwsp, 1036 SWI_NET, INTR_MPSAFE, &nwsp->nws_swi_cookie); 1037 if (error) 1038 panic("%s: swi_add %d", __func__, error); 1039 pc->pc_netisr = nwsp->nws_intr_event; 1040 if (netisr_bindthreads) { 1041 error = intr_event_bind(nwsp->nws_intr_event, cpuid); 1042 if (error != 0) 1043 printf("%s: cpu %u: intr_event_bind: %d", __func__, 1044 cpuid, error); 1045 } 1046 NETISR_WLOCK(); 1047 nws_array[nws_count] = nwsp->nws_cpu; 1048 nws_count++; 1049 NETISR_WUNLOCK(); 1050 } 1051 1052 /* 1053 * 
Initialize the netisr subsystem. We rely on BSS and static initialization 1054 * of most fields in global data structures. 1055 * 1056 * Start a worker thread for the boot CPU so that we can support network 1057 * traffic immediately in case the network stack is used before additional 1058 * CPUs are started (for example, diskless boot). 1059 */ 1060 static void 1061 netisr_init(void *arg) 1062 { 1063 1064 KASSERT(curcpu == 0, ("%s: not on CPU 0", __func__)); 1065 1066 NETISR_LOCK_INIT(); 1067 if (netisr_maxthreads < 1) 1068 netisr_maxthreads = 1; 1069 if (netisr_maxthreads > mp_ncpus) { 1070 printf("netisr2: forcing maxthreads from %d to %d\n", 1071 netisr_maxthreads, mp_ncpus); 1072 netisr_maxthreads = mp_ncpus; 1073 } 1074 if (netisr_defaultqlimit > netisr_maxqlimit) { 1075 printf("netisr2: forcing defaultqlimit from %d to %d\n", 1076 netisr_defaultqlimit, netisr_maxqlimit); 1077 netisr_defaultqlimit = netisr_maxqlimit; 1078 } 1079 #ifdef DEVICE_POLLING 1080 /* 1081 * The device polling code is not yet aware of how to deal with 1082 * multiple netisr threads, so for the time being compiling in device 1083 * polling disables parallel netisr workers. 1084 */ 1085 if (netisr_maxthreads != 1 || netisr_bindthreads != 0) { 1086 printf("netisr2: forcing maxthreads to 1 and bindthreads to " 1087 "0 for device polling\n"); 1088 netisr_maxthreads = 1; 1089 netisr_bindthreads = 0; 1090 } 1091 #endif 1092 1093 netisr_start_swi(curcpu, pcpu_find(curcpu)); 1094 } 1095 SYSINIT(netisr_init, SI_SUB_SOFTINTR, SI_ORDER_FIRST, netisr_init, NULL); 1096 1097 /* 1098 * Start worker threads for additional CPUs. No attempt to gracefully handle 1099 * work reassignment, we don't yet support dynamic reconfiguration. 1100 */ 1101 static void 1102 netisr_start(void *arg) 1103 { 1104 struct pcpu *pc; 1105 1106 SLIST_FOREACH(pc, &cpuhead, pc_allcpu) { 1107 if (nws_count >= netisr_maxthreads) 1108 break; 1109 /* XXXRW: Is skipping absent CPUs still required here? 
*/ 1110 if (CPU_ABSENT(pc->pc_cpuid)) 1111 continue; 1112 /* Worker will already be present for boot CPU. */ 1113 if (pc->pc_netisr != NULL) 1114 continue; 1115 netisr_start_swi(pc->pc_cpuid, pc); 1116 } 1117 } 1118 SYSINIT(netisr_start, SI_SUB_SMP, SI_ORDER_MIDDLE, netisr_start, NULL); 1119 1120 #ifdef DDB 1121 DB_SHOW_COMMAND(netisr, db_show_netisr) 1122 { 1123 struct netisr_workstream *nwsp; 1124 struct netisr_work *nwp; 1125 int first, proto; 1126 u_int cpuid; 1127 1128 db_printf("%3s %6s %5s %5s %5s %8s %8s %8s %8s\n", "CPU", "Proto", 1129 "Len", "WMark", "Max", "Disp", "HDisp", "Drop", "Queue"); 1130 for (cpuid = 0; cpuid <= mp_maxid; cpuid++) { 1131 if (CPU_ABSENT(cpuid)) 1132 continue; 1133 nwsp = DPCPU_ID_PTR(cpuid, nws); 1134 if (nwsp->nws_intr_event == NULL) 1135 continue; 1136 first = 1; 1137 for (proto = 0; proto < NETISR_MAXPROT; proto++) { 1138 if (np[proto].np_handler == NULL) 1139 continue; 1140 nwp = &nwsp->nws_work[proto]; 1141 if (first) { 1142 db_printf("%3d ", cpuid); 1143 first = 0; 1144 } else 1145 db_printf("%3s ", ""); 1146 db_printf( 1147 "%6s %5d %5d %5d %8ju %8ju %8ju %8ju\n", 1148 np[proto].np_name, nwp->nw_len, 1149 nwp->nw_watermark, nwp->nw_qlimit, 1150 nwp->nw_dispatched, nwp->nw_hybrid_dispatched, 1151 nwp->nw_qdrops, nwp->nw_queued); 1152 } 1153 } 1154 } 1155 #endif 1156