xref: /freebsd/sys/net/netisr.c (revision aa64588d28258aef88cc33b8043112e8856948d0)
1 /*-
2  * Copyright (c) 2007-2009 Robert N. M. Watson
3  * Copyright (c) 2010 Juniper Networks, Inc.
4  * All rights reserved.
5  *
6  * This software was developed by Robert N. M. Watson under contract
7  * to Juniper Networks, Inc.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  */
30 
31 #include <sys/cdefs.h>
32 __FBSDID("$FreeBSD$");
33 
34 /*
35  * netisr is a packet dispatch service, allowing synchronous (directly
36  * dispatched) and asynchronous (deferred dispatch) processing of packets by
37  * registered protocol handlers.  Callers pass a protocol identifier and
38  * packet to netisr, along with a direct dispatch hint, and work will either
39  * be immediately processed by the registered handler, or passed to a
40  * software interrupt (SWI) thread for deferred dispatch.  Callers will
41  * generally select one or the other based on:
42  *
43  * - Whether directly dispatching a netisr handler leads to code reentrance or
44  *   lock recursion, such as entering the socket code from the socket code.
45  * - Whether directly dispatching a netisr handler leads to recursive
46  *   processing, such as when decapsulating several wrapped layers of tunnel
47  *   information (IPSEC within IPSEC within ...).
48  *
49  * Maintaining ordering for protocol streams is a critical design concern.
50  * Enforcing ordering limits the opportunity for concurrency, but maintains
51  * the strong ordering requirements found in some protocols, such as TCP.  Of
52  * related concern is CPU affinity--it is desirable to process all data
53  * associated with a particular stream on the same CPU over time in order to
54  * avoid acquiring locks associated with the connection on different CPUs,
55  * keep connection data in one cache, and to generally encourage associated
56  * user threads to live on the same CPU as the stream.  It's also desirable
57  * to avoid lock migration and contention where locks are associated with
58  * more than one flow.
59  *
60  * netisr supports several policy variations, represented by the
61  * NETISR_POLICY_* constants, allowing protocols to play various roles in
62  * identifying flows, assigning work to CPUs, etc.  These are described in
63  * netisr.h.
64  */
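
/*
 * Illustrative sketch (not part of this file): how a protocol might consume
 * the KPI described above.  NETISR_FOO, foo_input() and the qlimit value are
 * hypothetical; real consumers such as the IP input path follow the same
 * pattern with their own constants.  The handler owns the mbuf and must
 * consume or free it.
 *
 *	static void
 *	foo_input(struct mbuf *m)
 *	{
 *
 *		m_freem(m);
 *	}
 *
 *	static const struct netisr_handler foo_nh = {
 *		.nh_name = "foo",
 *		.nh_handler = foo_input,
 *		.nh_proto = NETISR_FOO,
 *		.nh_policy = NETISR_POLICY_FLOW,
 *		.nh_qlimit = 512,
 *	};
 *
 *	netisr_register(&foo_nh);
 *	...
 *	error = netisr_dispatch(NETISR_FOO, m);
 *
 * netisr_dispatch() may run foo_input() synchronously or enqueue the mbuf
 * for a netisr thread, depending on the dispatch policy selected below.
 */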
65 
66 #include "opt_ddb.h"
67 #include "opt_device_polling.h"
68 
69 #include <sys/param.h>
70 #include <sys/bus.h>
71 #include <sys/kernel.h>
72 #include <sys/kthread.h>
73 #include <sys/interrupt.h>
74 #include <sys/lock.h>
75 #include <sys/mbuf.h>
76 #include <sys/mutex.h>
77 #include <sys/pcpu.h>
78 #include <sys/proc.h>
79 #include <sys/rmlock.h>
80 #include <sys/sched.h>
81 #include <sys/smp.h>
82 #include <sys/socket.h>
83 #include <sys/sysctl.h>
84 #include <sys/systm.h>
85 
86 #ifdef DDB
87 #include <ddb/ddb.h>
88 #endif
89 
90 #define	_WANT_NETISR_INTERNAL	/* Enable definitions from netisr_internal.h */
91 #include <net/if.h>
92 #include <net/if_var.h>
93 #include <net/netisr.h>
94 #include <net/netisr_internal.h>
95 #include <net/vnet.h>
96 
97 /*-
98  * Synchronize use and modification of the registered netisr data structures;
99  * acquire a write lock while modifying the set of registered protocols to
100  * prevent partially registered or unregistered protocols from being run.
101  *
102  * The following data structures and fields are protected by this lock:
103  *
104  * - The netisr_proto array, including all fields of struct netisr_proto.
105  * - The nws array, including all fields of struct netisr_worker.
106  * - The nws_array array.
107  *
108  * Note: the NETISR_LOCKING define controls whether read locks are acquired
109  * in packet processing paths requiring netisr registration stability.  This
110  * is disabled by default as it can lead to measurable performance
111  * degradation even with rmlocks (3%-6% for loopback ping-pong traffic), and
112  * because netisr registration and unregistration are extremely rare at
113  * runtime.  If they become more common, this decision should be revisited.
114  *
115  * XXXRW: rmlocks don't support assertions.
116  */
117 static struct rmlock	netisr_rmlock;
118 #define	NETISR_LOCK_INIT()	rm_init_flags(&netisr_rmlock, "netisr", \
119 				    RM_NOWITNESS)
120 #define	NETISR_LOCK_ASSERT()
121 #define	NETISR_RLOCK(tracker)	rm_rlock(&netisr_rmlock, (tracker))
122 #define	NETISR_RUNLOCK(tracker)	rm_runlock(&netisr_rmlock, (tracker))
123 #define	NETISR_WLOCK()		rm_wlock(&netisr_rmlock)
124 #define	NETISR_WUNLOCK()	rm_wunlock(&netisr_rmlock)
125 /* #define	NETISR_LOCKING */
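
/*
 * Read-side usage sketch (effective only when NETISR_LOCKING is defined),
 * mirroring the #ifdef NETISR_LOCKING blocks later in this file; the rmlock
 * tracker lives on the caller's stack:
 *
 *	struct rm_priotracker tracker;
 *
 *	NETISR_RLOCK(&tracker);
 *	(look up netisr_proto[] and deliver packets)
 *	NETISR_RUNLOCK(&tracker);
 *
 * Registration and unregistration instead take the exclusive lock via
 * NETISR_WLOCK()/NETISR_WUNLOCK().
 */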
126 
127 SYSCTL_NODE(_net, OID_AUTO, isr, CTLFLAG_RW, 0, "netisr");
128 
129 /*-
130  * Three direct dispatch policies are supported:
131  *
132  * - Always defer: all work is scheduled for a netisr, regardless of context.
133  *   (!direct)
134  *
135  * - Hybrid: if the executing context allows direct dispatch, and we're
136  *   running on the CPU the work would be done on, then direct dispatch if it
137  *   wouldn't violate ordering constraints on the workstream.
138  *   (direct && !direct_force)
139  *
140  * - Always direct: if the executing context allows direct dispatch, always
141  *   direct dispatch.  (direct && direct_force)
142  *
143  * Notice that changing the global policy could lead to short periods of
144  * misordered processing, but this is considered acceptable as compared to
145  * the complexity of enforcing ordering during policy changes.
146  */
147 static int	netisr_direct_force = 1;	/* Always direct dispatch. */
148 TUNABLE_INT("net.isr.direct_force", &netisr_direct_force);
149 SYSCTL_INT(_net_isr, OID_AUTO, direct_force, CTLFLAG_RW,
150     &netisr_direct_force, 0, "Force direct dispatch");
151 
152 static int	netisr_direct = 1;	/* Enable direct dispatch. */
153 TUNABLE_INT("net.isr.direct", &netisr_direct);
154 SYSCTL_INT(_net_isr, OID_AUTO, direct, CTLFLAG_RW,
155     &netisr_direct, 0, "Enable direct dispatch");
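
/*
 * Administrator-facing sketch (values illustrative): both knobs are loader
 * tunables and read-write sysctls, so the dispatch policy may be set in
 * loader.conf(5) or changed at runtime, for example to force pure deferred
 * dispatch:
 *
 *	net.isr.direct_force="0"	(loader.conf)
 *	net.isr.direct="0"
 *
 *	# sysctl net.isr.direct_force=0 net.isr.direct=0
 */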
156 
157 /*
158  * Allow the administrator to limit the number of threads (CPUs) to use for
159  * netisr.  We don't check netisr_maxthreads before creating the thread for
160  * CPU 0, so in practice we ignore values <= 1.  This must be set at boot.
161  * We will create at most one thread per CPU.
162  */
163 static int	netisr_maxthreads = -1;		/* Max number of threads. */
164 TUNABLE_INT("net.isr.maxthreads", &netisr_maxthreads);
165 SYSCTL_INT(_net_isr, OID_AUTO, maxthreads, CTLFLAG_RDTUN,
166     &netisr_maxthreads, 0,
167     "Use at most this many CPUs for netisr processing");
168 
169 static int	netisr_bindthreads = 0;		/* Bind threads to CPUs. */
170 TUNABLE_INT("net.isr.bindthreads", &netisr_bindthreads);
171 SYSCTL_INT(_net_isr, OID_AUTO, bindthreads, CTLFLAG_RDTUN,
172     &netisr_bindthreads, 0, "Bind netisr threads to CPUs.");
173 
174 /*
175  * Limit per-workstream mbuf queue limits to at most net.isr.maxqlimit,
176  * both for initial configuration and later modification using
177  * netisr_setqlimit().
178  */
179 #define	NETISR_DEFAULT_MAXQLIMIT	10240
180 static u_int	netisr_maxqlimit = NETISR_DEFAULT_MAXQLIMIT;
181 TUNABLE_INT("net.isr.maxqlimit", &netisr_maxqlimit);
182 SYSCTL_INT(_net_isr, OID_AUTO, maxqlimit, CTLFLAG_RDTUN,
183     &netisr_maxqlimit, 0,
184     "Maximum netisr per-protocol, per-CPU queue depth.");
185 
186 /*
187  * The default per-workstream mbuf queue limit for protocols that don't
188  * initialize the nh_qlimit field of their struct netisr_handler.  If this is
189  * set above netisr_maxqlimit, we truncate it to the maximum during boot.
190  */
191 #define	NETISR_DEFAULT_DEFAULTQLIMIT	256
192 static u_int	netisr_defaultqlimit = NETISR_DEFAULT_DEFAULTQLIMIT;
193 TUNABLE_INT("net.isr.defaultqlimit", &netisr_defaultqlimit);
194 SYSCTL_INT(_net_isr, OID_AUTO, defaultqlimit, CTLFLAG_RDTUN,
195     &netisr_defaultqlimit, 0,
196     "Default netisr per-protocol, per-CPU queue limit if not set by protocol");
197 
198 /*
199  * Store and export the compile-time constant NETISR_MAXPROT limit on the
200  * number of protocols that can register with netisr at a time.  This is
201  * required for crashdump analysis, as it sizes netisr_proto[].
202  */
203 static u_int	netisr_maxprot = NETISR_MAXPROT;
204 SYSCTL_INT(_net_isr, OID_AUTO, maxprot, CTLFLAG_RD,
205     &netisr_maxprot, 0,
206     "Compile-time limit on the number of protocols supported by netisr.");
207 
208 /*
209  * The netisr_proto array describes all registered protocols, indexed by
210  * protocol number.  See netisr_internal.h for more details.
211  */
212 static struct netisr_proto	netisr_proto[NETISR_MAXPROT];
213 
214 /*
215  * Per-CPU workstream data.  See netisr_internal.h for more details.
216  */
217 DPCPU_DEFINE(struct netisr_workstream, nws);
218 
219 /*
220  * Map contiguous values between 0 and nws_count into CPU IDs appropriate for
221  * accessing workstreams.  This allows constructions of the form
222  * DPCPU_ID_GET(nws_array[arbitraryvalue % nws_count], nws).
223  */
224 static u_int				 nws_array[MAXCPU];
225 
226 /*
227  * Number of registered workstreams.  Will be at most the number of running
228  * CPUs once fully started.
229  */
230 static u_int				 nws_count;
231 SYSCTL_INT(_net_isr, OID_AUTO, numthreads, CTLFLAG_RD,
232     &nws_count, 0, "Number of extant netisr threads.");
233 
234 /*
235  * Synchronization for each workstream: a mutex protects all mutable fields
236  * in each stream, including per-protocol state (mbuf queues).  The SWI is
237  * woken up if asynchronous dispatch is required.
238  */
239 #define	NWS_LOCK(s)		mtx_lock(&(s)->nws_mtx)
240 #define	NWS_LOCK_ASSERT(s)	mtx_assert(&(s)->nws_mtx, MA_OWNED)
241 #define	NWS_UNLOCK(s)		mtx_unlock(&(s)->nws_mtx)
242 #define	NWS_SIGNAL(s)		swi_sched((s)->nws_swi_cookie, 0)
243 
244 /*
245  * Utility routines for protocols that implement their own mapping of flows
246  * to CPUs.
247  */
248 u_int
249 netisr_get_cpucount(void)
250 {
251 
252 	return (nws_count);
253 }
254 
255 u_int
256 netisr_get_cpuid(u_int cpunumber)
257 {
258 
259 	KASSERT(cpunumber < nws_count, ("%s: %u >= %u", __func__, cpunumber,
260 	    nws_count));
261 
262 	return (nws_array[cpunumber]);
263 }
264 
265 /*
266  * The default implementation of flow -> CPU ID mapping.
267  *
268  * Non-static so that protocols can use it to map their own work to specific
269  * CPUs in a manner consistent with netisr for affinity purposes.
270  */
271 u_int
272 netisr_default_flow2cpu(u_int flowid)
273 {
274 
275 	return (nws_array[flowid % nws_count]);
276 }
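
/*
 * Usage sketch for the routines above (hypothetical caller): a protocol that
 * schedules its own per-flow work consistently with netisr might map a flow
 * ID to a CPU, or iterate over all netisr CPUs.  foo_percpu_init() is a
 * placeholder, not an existing function:
 *
 *	u_int cpuid, i;
 *
 *	cpuid = netisr_default_flow2cpu(m->m_pkthdr.flowid);
 *
 *	for (i = 0; i < netisr_get_cpucount(); i++)
 *		foo_percpu_init(netisr_get_cpuid(i));
 */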
277 
278 /*
279  * Register a new netisr handler, which requires initializing per-protocol
280  * fields for each workstream.  All netisr work is briefly suspended while
281  * the protocol is installed.
282  */
283 void
284 netisr_register(const struct netisr_handler *nhp)
285 {
286 	struct netisr_work *npwp;
287 	const char *name;
288 	u_int i, proto;
289 
290 	proto = nhp->nh_proto;
291 	name = nhp->nh_name;
292 
293 	/*
294 	 * Test that the requested registration is valid.
295 	 */
296 	KASSERT(nhp->nh_name != NULL,
297 	    ("%s: nh_name NULL for %u", __func__, proto));
298 	KASSERT(nhp->nh_handler != NULL,
299 	    ("%s: nh_handler NULL for %s", __func__, name));
300 	KASSERT(nhp->nh_policy == NETISR_POLICY_SOURCE ||
301 	    nhp->nh_policy == NETISR_POLICY_FLOW ||
302 	    nhp->nh_policy == NETISR_POLICY_CPU,
303 	    ("%s: unsupported nh_policy %u for %s", __func__,
304 	    nhp->nh_policy, name));
305 	KASSERT(nhp->nh_policy == NETISR_POLICY_FLOW ||
306 	    nhp->nh_m2flow == NULL,
307 	    ("%s: nh_policy != FLOW but m2flow defined for %s", __func__,
308 	    name));
309 	KASSERT(nhp->nh_policy == NETISR_POLICY_CPU || nhp->nh_m2cpuid == NULL,
310 	    ("%s: nh_policy != CPU but m2cpuid defined for %s", __func__,
311 	    name));
312 	KASSERT(nhp->nh_policy != NETISR_POLICY_CPU || nhp->nh_m2cpuid != NULL,
313 	    ("%s: nh_policy == CPU but m2cpuid not defined for %s", __func__,
314 	    name));
315 	KASSERT(proto < NETISR_MAXPROT,
316 	    ("%s(%u, %s): protocol too big", __func__, proto, name));
317 
318 	/*
319 	 * Check that no registration already exists for this protocol.
320 	 */
321 	NETISR_WLOCK();
322 	KASSERT(netisr_proto[proto].np_name == NULL,
323 	    ("%s(%u, %s): name present", __func__, proto, name));
324 	KASSERT(netisr_proto[proto].np_handler == NULL,
325 	    ("%s(%u, %s): handler present", __func__, proto, name));
326 
327 	netisr_proto[proto].np_name = name;
328 	netisr_proto[proto].np_handler = nhp->nh_handler;
329 	netisr_proto[proto].np_m2flow = nhp->nh_m2flow;
330 	netisr_proto[proto].np_m2cpuid = nhp->nh_m2cpuid;
331 	netisr_proto[proto].np_drainedcpu = nhp->nh_drainedcpu;
332 	if (nhp->nh_qlimit == 0)
333 		netisr_proto[proto].np_qlimit = netisr_defaultqlimit;
334 	else if (nhp->nh_qlimit > netisr_maxqlimit) {
335 		printf("%s: %s requested queue limit %u capped to "
336 		    "net.isr.maxqlimit %u\n", __func__, name, nhp->nh_qlimit,
337 		    netisr_maxqlimit);
338 		netisr_proto[proto].np_qlimit = netisr_maxqlimit;
339 	} else
340 		netisr_proto[proto].np_qlimit = nhp->nh_qlimit;
341 	netisr_proto[proto].np_policy = nhp->nh_policy;
342 	for (i = 0; i <= mp_maxid; i++) {
343 		if (CPU_ABSENT(i))
344 			continue;
345 		npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto];
346 		bzero(npwp, sizeof(*npwp));
347 		npwp->nw_qlimit = netisr_proto[proto].np_qlimit;
348 	}
349 	NETISR_WUNLOCK();
350 }
351 
352 /*
353  * Clear drop counters across all workstreams for a protocol.
354  */
355 void
356 netisr_clearqdrops(const struct netisr_handler *nhp)
357 {
358 	struct netisr_work *npwp;
359 #ifdef INVARIANTS
360 	const char *name;
361 #endif
362 	u_int i, proto;
363 
364 	proto = nhp->nh_proto;
365 #ifdef INVARIANTS
366 	name = nhp->nh_name;
367 #endif
368 	KASSERT(proto < NETISR_MAXPROT,
369 	    ("%s(%u): protocol too big for %s", __func__, proto, name));
370 
371 	NETISR_WLOCK();
372 	KASSERT(netisr_proto[proto].np_handler != NULL,
373 	    ("%s(%u): protocol not registered for %s", __func__, proto,
374 	    name));
375 
376 	for (i = 0; i <= mp_maxid; i++) {
377 		if (CPU_ABSENT(i))
378 			continue;
379 		npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto];
380 		npwp->nw_qdrops = 0;
381 	}
382 	NETISR_WUNLOCK();
383 }
384 
385 /*
386  * Query current drop counters across all workstreams for a protocol.
387  */
388 void
389 netisr_getqdrops(const struct netisr_handler *nhp, u_int64_t *qdropp)
390 {
391 	struct netisr_work *npwp;
392 	struct rm_priotracker tracker;
393 #ifdef INVARIANTS
394 	const char *name;
395 #endif
396 	u_int i, proto;
397 
398 	*qdropp = 0;
399 	proto = nhp->nh_proto;
400 #ifdef INVARIANTS
401 	name = nhp->nh_name;
402 #endif
403 	KASSERT(proto < NETISR_MAXPROT,
404 	    ("%s(%u): protocol too big for %s", __func__, proto, name));
405 
406 	NETISR_RLOCK(&tracker);
407 	KASSERT(netisr_proto[proto].np_handler != NULL,
408 	    ("%s(%u): protocol not registered for %s", __func__, proto,
409 	    name));
410 
411 	for (i = 0; i <= mp_maxid; i++) {
412 		if (CPU_ABSENT(i))
413 			continue;
414 		npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto];
415 		*qdropp += npwp->nw_qdrops;
416 	}
417 	NETISR_RUNLOCK(&tracker);
418 }
419 
420 /*
421  * Query current per-workstream queue limit for a protocol.
422  */
423 void
424 netisr_getqlimit(const struct netisr_handler *nhp, u_int *qlimitp)
425 {
426 	struct rm_priotracker tracker;
427 #ifdef INVARIANTS
428 	const char *name;
429 #endif
430 	u_int proto;
431 
432 	proto = nhp->nh_proto;
433 #ifdef INVARIANTS
434 	name = nhp->nh_name;
435 #endif
436 	KASSERT(proto < NETISR_MAXPROT,
437 	    ("%s(%u): protocol too big for %s", __func__, proto, name));
438 
439 	NETISR_RLOCK(&tracker);
440 	KASSERT(netisr_proto[proto].np_handler != NULL,
441 	    ("%s(%u): protocol not registered for %s", __func__, proto,
442 	    name));
443 	*qlimitp = netisr_proto[proto].np_qlimit;
444 	NETISR_RUNLOCK(&tracker);
445 }
446 
447 /*
448  * Update the queue limit across per-workstream queues for a protocol.  We
449  * simply change the limits, and don't drain overflowed packets as they will
450  * (hopefully) take care of themselves shortly.
451  */
452 int
453 netisr_setqlimit(const struct netisr_handler *nhp, u_int qlimit)
454 {
455 	struct netisr_work *npwp;
456 #ifdef INVARIANTS
457 	const char *name;
458 #endif
459 	u_int i, proto;
460 
461 	if (qlimit > netisr_maxqlimit)
462 		return (EINVAL);
463 
464 	proto = nhp->nh_proto;
465 #ifdef INVARIANTS
466 	name = nhp->nh_name;
467 #endif
468 	KASSERT(proto < NETISR_MAXPROT,
469 	    ("%s(%u): protocol too big for %s", __func__, proto, name));
470 
471 	NETISR_WLOCK();
472 	KASSERT(netisr_proto[proto].np_handler != NULL,
473 	    ("%s(%u): protocol not registered for %s", __func__, proto,
474 	    name));
475 
476 	netisr_proto[proto].np_qlimit = qlimit;
477 	for (i = 0; i <= mp_maxid; i++) {
478 		if (CPU_ABSENT(i))
479 			continue;
480 		npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto];
481 		npwp->nw_qlimit = qlimit;
482 	}
483 	NETISR_WUNLOCK();
484 	return (0);
485 }
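
/*
 * Usage sketch (foo_nh as in the hypothetical registration example above):
 * query and, if needed, raise a protocol's per-workstream queue limit;
 * netisr_setqlimit() returns EINVAL if the new limit would exceed
 * net.isr.maxqlimit:
 *
 *	u_int qlimit;
 *
 *	netisr_getqlimit(&foo_nh, &qlimit);
 *	if (qlimit < 1024)
 *		error = netisr_setqlimit(&foo_nh, 1024);
 */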
486 
487 /*
488  * Drain all packets currently held in a particular protocol work queue.
489  */
490 static void
491 netisr_drain_proto(struct netisr_work *npwp)
492 {
493 	struct mbuf *m;
494 
495 	/*
496 	 * We would assert the lock on the workstream but it's not passed in.
497 	 */
498 	while ((m = npwp->nw_head) != NULL) {
499 		npwp->nw_head = m->m_nextpkt;
500 		m->m_nextpkt = NULL;
501 		if (npwp->nw_head == NULL)
502 			npwp->nw_tail = NULL;
503 		npwp->nw_len--;
504 		m_freem(m);
505 	}
506 	KASSERT(npwp->nw_tail == NULL, ("%s: tail", __func__));
507 	KASSERT(npwp->nw_len == 0, ("%s: len", __func__));
508 }
509 
510 /*
511  * Remove the registration of a network protocol, which requires clearing
512  * per-protocol fields across all workstreams, including freeing all mbufs in
513  * the queues at time of unregister.  All work in netisr is briefly suspended
514  * while this takes place.
515  */
516 void
517 netisr_unregister(const struct netisr_handler *nhp)
518 {
519 	struct netisr_work *npwp;
520 #ifdef INVARIANTS
521 	const char *name;
522 #endif
523 	u_int i, proto;
524 
525 	proto = nhp->nh_proto;
526 #ifdef INVARIANTS
527 	name = nhp->nh_name;
528 #endif
529 	KASSERT(proto < NETISR_MAXPROT,
530 	    ("%s(%u): protocol too big for %s", __func__, proto, name));
531 
532 	NETISR_WLOCK();
533 	KASSERT(netisr_proto[proto].np_handler != NULL,
534 	    ("%s(%u): protocol not registered for %s", __func__, proto,
535 	    name));
536 
537 	netisr_proto[proto].np_name = NULL;
538 	netisr_proto[proto].np_handler = NULL;
539 	netisr_proto[proto].np_m2flow = NULL;
540 	netisr_proto[proto].np_m2cpuid = NULL;
541 	netisr_proto[proto].np_qlimit = 0;
542 	netisr_proto[proto].np_policy = 0;
543 	for (i = 0; i <= mp_maxid; i++) {
544 		if (CPU_ABSENT(i))
545 			continue;
546 		npwp = &(DPCPU_ID_PTR(i, nws))->nws_work[proto];
547 		netisr_drain_proto(npwp);
548 		bzero(npwp, sizeof(*npwp));
549 	}
550 	NETISR_WUNLOCK();
551 }
552 
553 /*
554  * Look up the workstream given a packet and source identifier.  Do this by
555  * checking the protocol's policy, and optionally call out to the protocol
556  * for assistance if required.
557  */
558 static struct mbuf *
559 netisr_select_cpuid(struct netisr_proto *npp, uintptr_t source,
560     struct mbuf *m, u_int *cpuidp)
561 {
562 	struct ifnet *ifp;
563 
564 	NETISR_LOCK_ASSERT();
565 
566 	/*
567 	 * In the event we have only one worker, shortcut and deliver to it
568 	 * without further ado.
569 	 */
570 	if (nws_count == 1) {
571 		*cpuidp = nws_array[0];
572 		return (m);
573 	}
574 
575 	/*
576 	 * What happens next depends on the policy selected by the protocol.
577 	 * If we want to support per-interface policies, we should do that
578 	 * here first.
579 	 */
580 	switch (npp->np_policy) {
581 	case NETISR_POLICY_CPU:
582 		return (npp->np_m2cpuid(m, source, cpuidp));
583 
584 	case NETISR_POLICY_FLOW:
585 		if (!(m->m_flags & M_FLOWID) && npp->np_m2flow != NULL) {
586 			m = npp->np_m2flow(m, source);
587 			if (m == NULL)
588 				return (NULL);
589 		}
590 		if (m->m_flags & M_FLOWID) {
591 			*cpuidp =
592 			    netisr_default_flow2cpu(m->m_pkthdr.flowid);
593 			return (m);
594 		}
595 		/* FALLTHROUGH */
596 
597 	case NETISR_POLICY_SOURCE:
598 		ifp = m->m_pkthdr.rcvif;
599 		if (ifp != NULL)
600 			*cpuidp = nws_array[(ifp->if_index + source) %
601 			    nws_count];
602 		else
603 			*cpuidp = nws_array[source % nws_count];
604 		return (m);
605 
606 	default:
607 		panic("%s: invalid policy %u for %s", __func__,
608 		    npp->np_policy, npp->np_name);
609 	}
610 }
611 
612 /*
613  * Process packets associated with a workstream and protocol.  For reasons of
614  * fairness, we process up to one complete netisr queue at a time, moving the
615  * queue to a stack-local queue for processing, but do not loop refreshing
616  * from the global queue.  The caller is responsible for deciding whether to
617  * loop, and for setting the NWS_RUNNING flag.  The passed workstream will be
618  * locked on entry and relocked before return, but will be released while
619  * processing.  The number of packets processed is returned.
620  */
621 static u_int
622 netisr_process_workstream_proto(struct netisr_workstream *nwsp, u_int proto)
623 {
624 	struct netisr_work local_npw, *npwp;
625 	u_int handled;
626 	struct mbuf *m;
627 
628 	NETISR_LOCK_ASSERT();
629 	NWS_LOCK_ASSERT(nwsp);
630 
631 	KASSERT(nwsp->nws_flags & NWS_RUNNING,
632 	    ("%s(%u): not running", __func__, proto));
633 	KASSERT(proto >= 0 && proto < NETISR_MAXPROT,
634 	    ("%s(%u): invalid proto\n", __func__, proto));
635 
636 	npwp = &nwsp->nws_work[proto];
637 	if (npwp->nw_len == 0)
638 		return (0);
639 
640 	/*
641 	 * Move the global work queue to a thread-local work queue.
642 	 *
643 	 * Notice that this means the effective maximum length of the queue
644 	 * is actually twice that of the maximum queue length specified in
645 	 * the protocol registration call.
646 	 */
647 	handled = npwp->nw_len;
648 	local_npw = *npwp;
649 	npwp->nw_head = NULL;
650 	npwp->nw_tail = NULL;
651 	npwp->nw_len = 0;
652 	nwsp->nws_pendingbits &= ~(1 << proto);
653 	NWS_UNLOCK(nwsp);
654 	while ((m = local_npw.nw_head) != NULL) {
655 		local_npw.nw_head = m->m_nextpkt;
656 		m->m_nextpkt = NULL;
657 		if (local_npw.nw_head == NULL)
658 			local_npw.nw_tail = NULL;
659 		local_npw.nw_len--;
660 		VNET_ASSERT(m->m_pkthdr.rcvif != NULL);
661 		CURVNET_SET(m->m_pkthdr.rcvif->if_vnet);
662 		netisr_proto[proto].np_handler(m);
663 		CURVNET_RESTORE();
664 	}
665 	KASSERT(local_npw.nw_len == 0,
666 	    ("%s(%u): len %u", __func__, proto, local_npw.nw_len));
667 	if (netisr_proto[proto].np_drainedcpu)
668 		netisr_proto[proto].np_drainedcpu(nwsp->nws_cpu);
669 	NWS_LOCK(nwsp);
670 	npwp->nw_handled += handled;
671 	return (handled);
672 }
673 
674 /*
675  * SWI handler for netisr -- processes packets in a set of workstreams that
676  * it owns, woken up by calls to NWS_SIGNAL().  If this workstream is already
677  * being direct dispatched, go back to sleep and wait for the dispatching
678  * thread to wake us up again.
679  */
680 static void
681 swi_net(void *arg)
682 {
683 #ifdef NETISR_LOCKING
684 	struct rm_priotracker tracker;
685 #endif
686 	struct netisr_workstream *nwsp;
687 	u_int bits, prot;
688 
689 	nwsp = arg;
690 
691 #ifdef DEVICE_POLLING
692 	KASSERT(nws_count == 1,
693 	    ("%s: device_polling but nws_count != 1", __func__));
694 	netisr_poll();
695 #endif
696 #ifdef NETISR_LOCKING
697 	NETISR_RLOCK(&tracker);
698 #endif
699 	NWS_LOCK(nwsp);
700 	KASSERT(!(nwsp->nws_flags & NWS_RUNNING), ("swi_net: running"));
701 	if (nwsp->nws_flags & NWS_DISPATCHING)
702 		goto out;
703 	nwsp->nws_flags |= NWS_RUNNING;
704 	nwsp->nws_flags &= ~NWS_SCHEDULED;
705 	while ((bits = nwsp->nws_pendingbits) != 0) {
706 		while ((prot = ffs(bits)) != 0) {
707 			prot--;
708 			bits &= ~(1 << prot);
709 			(void)netisr_process_workstream_proto(nwsp, prot);
710 		}
711 	}
712 	nwsp->nws_flags &= ~NWS_RUNNING;
713 out:
714 	NWS_UNLOCK(nwsp);
715 #ifdef NETISR_LOCKING
716 	NETISR_RUNLOCK(&tracker);
717 #endif
718 #ifdef DEVICE_POLLING
719 	netisr_pollmore();
720 #endif
721 }
722 
723 static int
724 netisr_queue_workstream(struct netisr_workstream *nwsp, u_int proto,
725     struct netisr_work *npwp, struct mbuf *m, int *dosignalp)
726 {
727 
728 	NWS_LOCK_ASSERT(nwsp);
729 
730 	*dosignalp = 0;
731 	if (npwp->nw_len < npwp->nw_qlimit) {
732 		m->m_nextpkt = NULL;
733 		if (npwp->nw_head == NULL) {
734 			npwp->nw_head = m;
735 			npwp->nw_tail = m;
736 		} else {
737 			npwp->nw_tail->m_nextpkt = m;
738 			npwp->nw_tail = m;
739 		}
740 		npwp->nw_len++;
741 		if (npwp->nw_len > npwp->nw_watermark)
742 			npwp->nw_watermark = npwp->nw_len;
743 
744 		/*
745 		 * We must set the bit regardless of NWS_RUNNING, so that
746 		 * swi_net() keeps calling netisr_process_workstream_proto().
747 		 */
748 		nwsp->nws_pendingbits |= (1 << proto);
749 		if (!(nwsp->nws_flags &
750 		    (NWS_RUNNING | NWS_DISPATCHING | NWS_SCHEDULED))) {
751 			nwsp->nws_flags |= NWS_SCHEDULED;
752 			*dosignalp = 1;	/* Defer until unlocked. */
753 		}
754 		npwp->nw_queued++;
755 		return (0);
756 	} else {
757 		m_freem(m);
758 		npwp->nw_qdrops++;
759 		return (ENOBUFS);
760 	}
761 }
762 
763 static int
764 netisr_queue_internal(u_int proto, struct mbuf *m, u_int cpuid)
765 {
766 	struct netisr_workstream *nwsp;
767 	struct netisr_work *npwp;
768 	int dosignal, error;
769 
770 #ifdef NETISR_LOCKING
771 	NETISR_LOCK_ASSERT();
772 #endif
773 	KASSERT(cpuid <= mp_maxid, ("%s: cpuid too big (%u, %u)", __func__,
774 	    cpuid, mp_maxid));
775 	KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__, cpuid));
776 
777 	dosignal = 0;
778 	error = 0;
779 	nwsp = DPCPU_ID_PTR(cpuid, nws);
780 	npwp = &nwsp->nws_work[proto];
781 	NWS_LOCK(nwsp);
782 	error = netisr_queue_workstream(nwsp, proto, npwp, m, &dosignal);
783 	NWS_UNLOCK(nwsp);
784 	if (dosignal)
785 		NWS_SIGNAL(nwsp);
786 	return (error);
787 }
788 
789 int
790 netisr_queue_src(u_int proto, uintptr_t source, struct mbuf *m)
791 {
792 #ifdef NETISR_LOCKING
793 	struct rm_priotracker tracker;
794 #endif
795 	u_int cpuid;
796 	int error;
797 
798 	KASSERT(proto < NETISR_MAXPROT,
799 	    ("%s: invalid proto %u", __func__, proto));
800 
801 #ifdef NETISR_LOCKING
802 	NETISR_RLOCK(&tracker);
803 #endif
804 	KASSERT(netisr_proto[proto].np_handler != NULL,
805 	    ("%s: invalid proto %u", __func__, proto));
806 
807 	m = netisr_select_cpuid(&netisr_proto[proto], source, m, &cpuid);
808 	if (m != NULL) {
809 		KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__,
810 		    cpuid));
811 		error = netisr_queue_internal(proto, m, cpuid);
812 	} else
813 		error = ENOBUFS;
814 #ifdef NETISR_LOCKING
815 	NETISR_RUNLOCK(&tracker);
816 #endif
817 	return (error);
818 }
819 
820 int
821 netisr_queue(u_int proto, struct mbuf *m)
822 {
823 
824 	return (netisr_queue_src(proto, 0, m));
825 }
826 
827 /*
828  * Dispatch a packet for netisr processing; direct dispatch is permitted by
829  * calling context.
830  */
831 int
832 netisr_dispatch_src(u_int proto, uintptr_t source, struct mbuf *m)
833 {
834 #ifdef NETISR_LOCKING
835 	struct rm_priotracker tracker;
836 #endif
837 	struct netisr_workstream *nwsp;
838 	struct netisr_work *npwp;
839 	int dosignal, error;
840 	u_int cpuid;
841 
842 	/*
843 	 * If direct dispatch is entirely disabled, fall back on queueing.
844 	 */
845 	if (!netisr_direct)
846 		return (netisr_queue_src(proto, source, m));
847 
848 	KASSERT(proto < NETISR_MAXPROT,
849 	    ("%s: invalid proto %u", __func__, proto));
850 #ifdef NETISR_LOCKING
851 	NETISR_RLOCK(&tracker);
852 #endif
853 	KASSERT(netisr_proto[proto].np_handler != NULL,
854 	    ("%s: invalid proto %u", __func__, proto));
855 
856 	/*
857 	 * If direct dispatch is forced, then unconditionally dispatch
858 	 * without a formal CPU selection.  Borrow the current CPU's stats,
859 	 * even if there's no worker on it.  In this case we don't update
860 	 * nws_flags because all netisr processing will be source ordered due
861 	 * to always being forced to directly dispatch.
862 	 */
863 	if (netisr_direct_force) {
864 		nwsp = DPCPU_PTR(nws);
865 		npwp = &nwsp->nws_work[proto];
866 		npwp->nw_dispatched++;
867 		npwp->nw_handled++;
868 		netisr_proto[proto].np_handler(m);
869 		error = 0;
870 		goto out_unlock;
871 	}
872 
873 	/*
874 	 * Otherwise, we execute in a hybrid mode where we will try to direct
875 	 * dispatch if we're on the right CPU and the netisr worker isn't
876 	 * already running.
877 	 */
878 	m = netisr_select_cpuid(&netisr_proto[proto], source, m, &cpuid);
879 	if (m == NULL) {
880 		error = ENOBUFS;
881 		goto out_unlock;
882 	}
883 	KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__, cpuid));
884 	sched_pin();
885 	if (cpuid != curcpu)
886 		goto queue_fallback;
887 	nwsp = DPCPU_PTR(nws);
888 	npwp = &nwsp->nws_work[proto];
889 
890 	/*-
891 	 * We are willing to direct dispatch only if three conditions hold:
892 	 *
893 	 * (1) The netisr worker isn't already running,
894 	 * (2) Another thread isn't already directly dispatching, and
895 	 * (3) The netisr hasn't already been woken up.
896 	 */
897 	NWS_LOCK(nwsp);
898 	if (nwsp->nws_flags & (NWS_RUNNING | NWS_DISPATCHING | NWS_SCHEDULED)) {
899 		error = netisr_queue_workstream(nwsp, proto, npwp, m,
900 		    &dosignal);
901 		NWS_UNLOCK(nwsp);
902 		if (dosignal)
903 			NWS_SIGNAL(nwsp);
904 		goto out_unpin;
905 	}
906 
907 	/*
908 	 * The current thread is now effectively the netisr worker, so set
909 	 * the dispatching flag to prevent concurrent processing of the
910 	 * stream from another thread (even the netisr worker), which could
911 	 * otherwise lead to effective misordering of the stream.
912 	 */
913 	nwsp->nws_flags |= NWS_DISPATCHING;
914 	NWS_UNLOCK(nwsp);
915 	netisr_proto[proto].np_handler(m);
916 	NWS_LOCK(nwsp);
917 	nwsp->nws_flags &= ~NWS_DISPATCHING;
918 	npwp->nw_handled++;
919 	npwp->nw_hybrid_dispatched++;
920 
921 	/*
922 	 * If other work was enqueued by another thread while we were direct
923 	 * dispatching, we need to signal the netisr worker to do that work.
924 	 * In the future, we might want to do some of that work in the
925 	 * current thread, rather than trigger further context switches.  If
926 	 * so, we'll want to establish a reasonable bound on the work done in
927 	 * the "borrowed" context.
928 	 */
929 	if (nwsp->nws_pendingbits != 0) {
930 		nwsp->nws_flags |= NWS_SCHEDULED;
931 		dosignal = 1;
932 	} else
933 		dosignal = 0;
934 	NWS_UNLOCK(nwsp);
935 	if (dosignal)
936 		NWS_SIGNAL(nwsp);
937 	error = 0;
938 	goto out_unpin;
939 
940 queue_fallback:
941 	error = netisr_queue_internal(proto, m, cpuid);
942 out_unpin:
943 	sched_unpin();
944 out_unlock:
945 #ifdef NETISR_LOCKING
946 	NETISR_RUNLOCK(&tracker);
947 #endif
948 	return (error);
949 }
950 
951 int
952 netisr_dispatch(u_int proto, struct mbuf *m)
953 {
954 
955 	return (netisr_dispatch_src(proto, 0, m));
956 }
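
/*
 * Usage sketch (hypothetical driver receive path): an interface driver hands
 * a received packet to netisr and must not touch the mbuf afterwards; on a
 * non-zero return the mbuf has already been freed by the queueing code, so
 * only drop accounting (shown illustratively) remains for the caller:
 *
 *	m->m_pkthdr.rcvif = ifp;
 *	if (netisr_dispatch(NETISR_FOO, m) != 0)
 *		ifp->if_iqdrops++;
 */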
957 
958 #ifdef DEVICE_POLLING
959 /*
960  * Kernel polling borrows a netisr thread to run interface polling in; this
961  * function allows kernel polling to request that the netisr thread be
962  * scheduled even if no packets are pending for protocols.
963  */
964 void
965 netisr_sched_poll(void)
966 {
967 	struct netisr_workstream *nwsp;
968 
969 	nwsp = DPCPU_ID_PTR(nws_array[0], nws);
970 	NWS_SIGNAL(nwsp);
971 }
972 #endif
973 
974 static void
975 netisr_start_swi(u_int cpuid, struct pcpu *pc)
976 {
977 	char swiname[12];
978 	struct netisr_workstream *nwsp;
979 	int error;
980 
981 	KASSERT(!CPU_ABSENT(cpuid), ("%s: CPU %u absent", __func__, cpuid));
982 
983 	nwsp = DPCPU_ID_PTR(cpuid, nws);
984 	mtx_init(&nwsp->nws_mtx, "netisr_mtx", NULL, MTX_DEF);
985 	nwsp->nws_cpu = cpuid;
986 	snprintf(swiname, sizeof(swiname), "netisr %u", cpuid);
987 	error = swi_add(&nwsp->nws_intr_event, swiname, swi_net, nwsp,
988 	    SWI_NET, INTR_MPSAFE, &nwsp->nws_swi_cookie);
989 	if (error)
990 		panic("%s: swi_add %d", __func__, error);
991 	pc->pc_netisr = nwsp->nws_intr_event;
992 	if (netisr_bindthreads) {
993 		error = intr_event_bind(nwsp->nws_intr_event, cpuid);
994 		if (error != 0)
995 			printf("%s: cpu %u: intr_event_bind: %d", __func__,
996 			    cpuid, error);
997 	}
998 	NETISR_WLOCK();
999 	nws_array[nws_count] = nwsp->nws_cpu;
1000 	nws_count++;
1001 	NETISR_WUNLOCK();
1002 }
1003 
1004 /*
1005  * Initialize the netisr subsystem.  We rely on BSS and static initialization
1006  * of most fields in global data structures.
1007  *
1008  * Start a worker thread for the boot CPU so that we can support network
1009  * traffic immediately in case the network stack is used before additional
1010  * CPUs are started (for example, diskless boot).
1011  */
1012 static void
1013 netisr_init(void *arg)
1014 {
1015 
1016 	KASSERT(curcpu == 0, ("%s: not on CPU 0", __func__));
1017 
1018 	NETISR_LOCK_INIT();
1019 	if (netisr_maxthreads < 1)
1020 		netisr_maxthreads = 1;
1021 	if (netisr_maxthreads > mp_ncpus) {
1022 		printf("netisr_init: forcing maxthreads from %d to %d\n",
1023 		    netisr_maxthreads, mp_ncpus);
1024 		netisr_maxthreads = mp_ncpus;
1025 	}
1026 	if (netisr_defaultqlimit > netisr_maxqlimit) {
1027 		printf("netisr_init: forcing defaultqlimit from %d to %d\n",
1028 		    netisr_defaultqlimit, netisr_maxqlimit);
1029 		netisr_defaultqlimit = netisr_maxqlimit;
1030 	}
1031 #ifdef DEVICE_POLLING
1032 	/*
1033 	 * The device polling code is not yet aware of how to deal with
1034 	 * multiple netisr threads, so for the time being compiling in device
1035 	 * polling disables parallel netisr workers.
1036 	 */
1037 	if (netisr_maxthreads != 1 || netisr_bindthreads != 0) {
1038 		printf("netisr_init: forcing maxthreads to 1 and "
1039 		    "bindthreads to 0 for device polling\n");
1040 		netisr_maxthreads = 1;
1041 		netisr_bindthreads = 0;
1042 	}
1043 #endif
1044 
1045 	netisr_start_swi(curcpu, pcpu_find(curcpu));
1046 }
1047 SYSINIT(netisr_init, SI_SUB_SOFTINTR, SI_ORDER_FIRST, netisr_init, NULL);
1048 
1049 /*
1050  * Start worker threads for additional CPUs.  No attempt is made to gracefully
1051  * handle work reassignment, as we don't yet support dynamic reconfiguration.
1052  */
1053 static void
1054 netisr_start(void *arg)
1055 {
1056 	struct pcpu *pc;
1057 
1058 	SLIST_FOREACH(pc, &cpuhead, pc_allcpu) {
1059 		if (nws_count >= netisr_maxthreads)
1060 			break;
1061 		/* XXXRW: Is skipping absent CPUs still required here? */
1062 		if (CPU_ABSENT(pc->pc_cpuid))
1063 			continue;
1064 		/* Worker will already be present for boot CPU. */
1065 		if (pc->pc_netisr != NULL)
1066 			continue;
1067 		netisr_start_swi(pc->pc_cpuid, pc);
1068 	}
1069 }
1070 SYSINIT(netisr_start, SI_SUB_SMP, SI_ORDER_MIDDLE, netisr_start, NULL);
1071 
1072 /*
1073  * Sysctl monitoring for netisr: query a list of registered protocols.
1074  */
1075 static int
1076 sysctl_netisr_proto(SYSCTL_HANDLER_ARGS)
1077 {
1078 	struct rm_priotracker tracker;
1079 	struct sysctl_netisr_proto *snpp, *snp_array;
1080 	struct netisr_proto *npp;
1081 	u_int counter, proto;
1082 	int error;
1083 
1084 	if (req->newptr != NULL)
1085 		return (EINVAL);
1086 	snp_array = malloc(sizeof(*snp_array) * NETISR_MAXPROT, M_TEMP,
1087 	    M_ZERO | M_WAITOK);
1088 	counter = 0;
1089 	NETISR_RLOCK(&tracker);
1090 	for (proto = 0; proto < NETISR_MAXPROT; proto++) {
1091 		npp = &netisr_proto[proto];
1092 		if (npp->np_name == NULL)
1093 			continue;
1094 		snpp = &snp_array[counter];
1095 		snpp->snp_version = sizeof(*snpp);
1096 		strlcpy(snpp->snp_name, npp->np_name, NETISR_NAMEMAXLEN);
1097 		snpp->snp_proto = proto;
1098 		snpp->snp_qlimit = npp->np_qlimit;
1099 		snpp->snp_policy = npp->np_policy;
1100 		if (npp->np_m2flow != NULL)
1101 			snpp->snp_flags |= NETISR_SNP_FLAGS_M2FLOW;
1102 		if (npp->np_m2cpuid != NULL)
1103 			snpp->snp_flags |= NETISR_SNP_FLAGS_M2CPUID;
1104 		if (npp->np_drainedcpu != NULL)
1105 			snpp->snp_flags |= NETISR_SNP_FLAGS_DRAINEDCPU;
1106 		counter++;
1107 	}
1108 	NETISR_RUNLOCK(&tracker);
1109 	KASSERT(counter <= NETISR_MAXPROT,
1110 	    ("sysctl_netisr_proto: counter too big (%d)", counter));
1111 	error = SYSCTL_OUT(req, snp_array, sizeof(*snp_array) * counter);
1112 	free(snp_array, M_TEMP);
1113 	return (error);
1114 }
1115 
1116 SYSCTL_PROC(_net_isr, OID_AUTO, proto,
1117     CTLFLAG_RD|CTLTYPE_STRUCT|CTLFLAG_MPSAFE, 0, 0, sysctl_netisr_proto,
1118     "S,sysctl_netisr_proto",
1119     "Return list of protocols registered with netisr");
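
/*
 * Userspace consumption sketch (illustrative, error handling omitted): read
 * the exported array with sysctlbyname(3) and walk the fixed-size records:
 *
 *	struct sysctl_netisr_proto *snpp;
 *	size_t i, len;
 *
 *	sysctlbyname("net.isr.proto", NULL, &len, NULL, 0);
 *	snpp = malloc(len);
 *	sysctlbyname("net.isr.proto", snpp, &len, NULL, 0);
 *	for (i = 0; i < len / sizeof(*snpp); i++)
 *		printf("%u %s\n", snpp[i].snp_proto, snpp[i].snp_name);
 */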
1120 
1121 /*
1122  * Sysctl monitoring for netisr: query a list of workstreams.
1123  */
1124 static int
1125 sysctl_netisr_workstream(SYSCTL_HANDLER_ARGS)
1126 {
1127 	struct rm_priotracker tracker;
1128 	struct sysctl_netisr_workstream *snwsp, *snws_array;
1129 	struct netisr_workstream *nwsp;
1130 	u_int counter, cpuid;
1131 	int error;
1132 
1133 	if (req->newptr != NULL)
1134 		return (EINVAL);
1135 	snws_array = malloc(sizeof(*snws_array) * MAXCPU, M_TEMP,
1136 	    M_ZERO | M_WAITOK);
1137 	counter = 0;
1138 	NETISR_RLOCK(&tracker);
1139 	for (cpuid = 0; cpuid < MAXCPU; cpuid++) {
1140 		if (CPU_ABSENT(cpuid))
1141 			continue;
1142 		nwsp = DPCPU_ID_PTR(cpuid, nws);
1143 		if (nwsp->nws_intr_event == NULL)
1144 			continue;
1145 		NWS_LOCK(nwsp);
1146 		snwsp = &snws_array[counter];
1147 		snwsp->snws_version = sizeof(*snwsp);
1148 
1149 		/*
1150 		 * For now, we equate workstream IDs and CPU IDs in the
1151 		 * kernel, but expose them independently to userspace in case
1152 		 * that assumption changes in the future.
1153 		 */
1154 		snwsp->snws_wsid = cpuid;
1155 		snwsp->snws_cpu = cpuid;
1156 		if (nwsp->nws_intr_event != NULL)
1157 			snwsp->snws_flags |= NETISR_SNWS_FLAGS_INTR;
1158 		NWS_UNLOCK(nwsp);
1159 		counter++;
1160 	}
1161 	NETISR_RUNLOCK(&tracker);
1162 	KASSERT(counter <= MAXCPU,
1163 	    ("sysctl_netisr_workstream: counter too big (%d)", counter));
1164 	error = SYSCTL_OUT(req, snws_array, sizeof(*snws_array) * counter);
1165 	free(snws_array, M_TEMP);
1166 	return (error);
1167 }
1168 
1169 SYSCTL_PROC(_net_isr, OID_AUTO, workstream,
1170     CTLFLAG_RD|CTLTYPE_STRUCT|CTLFLAG_MPSAFE, 0, 0, sysctl_netisr_workstream,
1171     "S,sysctl_netisr_workstream",
1172     "Return list of workstreams implemented by netisr");
1173 
1174 /*
1175  * Sysctl monitoring for netisr: query per-protocol data across all
1176  * workstreams.
1177  */
1178 static int
1179 sysctl_netisr_work(SYSCTL_HANDLER_ARGS)
1180 {
1181 	struct rm_priotracker tracker;
1182 	struct sysctl_netisr_work *snwp, *snw_array;
1183 	struct netisr_workstream *nwsp;
1184 	struct netisr_proto *npp;
1185 	struct netisr_work *nwp;
1186 	u_int counter, cpuid, proto;
1187 	int error;
1188 
1189 	if (req->newptr != NULL)
1190 		return (EINVAL);
1191 	snw_array = malloc(sizeof(*snw_array) * MAXCPU * NETISR_MAXPROT,
1192 	    M_TEMP, M_ZERO | M_WAITOK);
1193 	counter = 0;
1194 	NETISR_RLOCK(&tracker);
1195 	for (cpuid = 0; cpuid < MAXCPU; cpuid++) {
1196 		if (CPU_ABSENT(cpuid))
1197 			continue;
1198 		nwsp = DPCPU_ID_PTR(cpuid, nws);
1199 		if (nwsp->nws_intr_event == NULL)
1200 			continue;
1201 		NWS_LOCK(nwsp);
1202 		for (proto = 0; proto < NETISR_MAXPROT; proto++) {
1203 			npp = &netisr_proto[proto];
1204 			if (npp->np_name == NULL)
1205 				continue;
1206 			nwp = &nwsp->nws_work[proto];
1207 			snwp = &snw_array[counter];
1208 			snwp->snw_version = sizeof(*snwp);
1209 			snwp->snw_wsid = cpuid;		/* See comment above. */
1210 			snwp->snw_proto = proto;
1211 			snwp->snw_len = nwp->nw_len;
1212 			snwp->snw_watermark = nwp->nw_watermark;
1213 			snwp->snw_dispatched = nwp->nw_dispatched;
1214 			snwp->snw_hybrid_dispatched =
1215 			    nwp->nw_hybrid_dispatched;
1216 			snwp->snw_qdrops = nwp->nw_qdrops;
1217 			snwp->snw_queued = nwp->nw_queued;
1218 			snwp->snw_handled = nwp->nw_handled;
1219 			counter++;
1220 		}
1221 		NWS_UNLOCK(nwsp);
1222 	}
1223 	KASSERT(counter <= MAXCPU * NETISR_MAXPROT,
1224 	    ("sysctl_netisr_work: counter too big (%d)", counter));
1225 	NETISR_RUNLOCK(&tracker);
1226 	error = SYSCTL_OUT(req, snw_array, sizeof(*snw_array) * counter);
1227 	free(snw_array, M_TEMP);
1228 	return (error);
1229 }
1230 
1231 SYSCTL_PROC(_net_isr, OID_AUTO, work,
1232     CTLFLAG_RD|CTLTYPE_STRUCT|CTLFLAG_MPSAFE, 0, 0, sysctl_netisr_work,
1233     "S,sysctl_netisr_work",
1234     "Return list of per-workstream, per-protocol work in netisr");
1235 
1236 #ifdef DDB
1237 DB_SHOW_COMMAND(netisr, db_show_netisr)
1238 {
1239 	struct netisr_workstream *nwsp;
1240 	struct netisr_work *nwp;
1241 	int first, proto;
1242 	u_int cpuid;
1243 
1244 	db_printf("%3s %6s %5s %5s %5s %8s %8s %8s %8s\n", "CPU", "Proto",
1245 	    "Len", "WMark", "Max", "Disp", "HDisp", "Drop", "Queue");
1246 	for (cpuid = 0; cpuid <= mp_maxid; cpuid++) {
1247 		if (CPU_ABSENT(cpuid))
1248 			continue;
1249 		nwsp = DPCPU_ID_PTR(cpuid, nws);
1250 		if (nwsp->nws_intr_event == NULL)
1251 			continue;
1252 		first = 1;
1253 		for (proto = 0; proto < NETISR_MAXPROT; proto++) {
1254 			if (netisr_proto[proto].np_handler == NULL)
1255 				continue;
1256 			nwp = &nwsp->nws_work[proto];
1257 			if (first) {
1258 				db_printf("%3d ", cpuid);
1259 				first = 0;
1260 			} else
1261 				db_printf("%3s ", "");
1262 			db_printf(
1263 			    "%6s %5d %5d %5d %8ju %8ju %8ju %8ju\n",
1264 			    netisr_proto[proto].np_name, nwp->nw_len,
1265 			    nwp->nw_watermark, nwp->nw_qlimit,
1266 			    nwp->nw_dispatched, nwp->nw_hybrid_dispatched,
1267 			    nwp->nw_qdrops, nwp->nw_queued);
1268 		}
1269 	}
1270 }
1271 #endif
1272