xref: /illumos-gate/usr/src/uts/common/os/netstack.c (revision 8459c777fc1aaabb2f7dad05de1313aa169417cd)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  * Copyright (c) 2017, Joyent, Inc.  All rights reserved.
26  */
27 
28 #include <sys/param.h>
29 #include <sys/sysmacros.h>
30 #include <sys/vm.h>
31 #include <sys/proc.h>
32 #include <sys/tuneable.h>
33 #include <sys/systm.h>
34 #include <sys/cmn_err.h>
35 #include <sys/debug.h>
36 #include <sys/sdt.h>
37 #include <sys/mutex.h>
38 #include <sys/bitmap.h>
39 #include <sys/atomic.h>
40 #include <sys/sunddi.h>
41 #include <sys/kobj.h>
42 #include <sys/disp.h>
43 #include <vm/seg_kmem.h>
44 #include <sys/zone.h>
45 #include <sys/netstack.h>
46 
47 /*
48  * What we use so that the zones framework can tell us about new zones,
49  * which we use to create new stacks.
50  */
51 static zone_key_t netstack_zone_key;
52 
53 static int	netstack_initialized = 0;
54 
55 /*
56  * Track the registered netstacks.
57  * The global lock protects
58  * - ns_reg
59  * - the list starting at netstack_head and following the netstack_next
60  *   pointers.
61  */
62 static kmutex_t netstack_g_lock;
63 
64 /*
65  * Registry of netstacks with their create/shutdown/destory functions.
66  */
67 static struct netstack_registry	ns_reg[NS_MAX];
68 
69 /*
70  * Global list of existing stacks.  We use this when a new zone with
71  * an exclusive IP instance is created.
72  *
73  * Note that in some cases a netstack_t needs to stay around after the zone
74  * has gone away. This is because there might be outstanding references
75  * (from TCP TIME_WAIT connections, IPsec state, etc). The netstack_t data
76  * structure and all the foo_stack_t's hanging off of it will be cleaned up
77  * when the last reference to it is dropped.
78  * However, the same zone might be rebooted. That is handled using the
79  * assumption that the zones framework picks a new zoneid each time a zone
80  * is (re)booted. We assert for that condition in netstack_zone_create().
81  * Thus the old netstack_t can take its time for things to time out.
82  */
83 static netstack_t *netstack_head;
84 
85 /*
86  * To support kstat_create_netstack() using kstat_zone_add we need
87  * to track both
88  *  - all zoneids that use the global/shared stack
89  *  - all kstats that have been added for the shared stack
90  */
91 struct shared_zone_list {
92 	struct shared_zone_list *sz_next;
93 	zoneid_t		sz_zoneid;
94 };
95 
96 struct shared_kstat_list {
97 	struct shared_kstat_list *sk_next;
98 	kstat_t			 *sk_kstat;
99 };
100 
101 static kmutex_t netstack_shared_lock;	/* protects the following two */
102 static struct shared_zone_list	*netstack_shared_zones;
103 static struct shared_kstat_list	*netstack_shared_kstats;
104 
105 static void	*netstack_zone_create(zoneid_t zoneid);
106 static void	netstack_zone_shutdown(zoneid_t zoneid, void *arg);
107 static void	netstack_zone_destroy(zoneid_t zoneid, void *arg);
108 
109 static void	netstack_shared_zone_add(zoneid_t zoneid);
110 static void	netstack_shared_zone_remove(zoneid_t zoneid);
111 static void	netstack_shared_kstat_add(kstat_t *ks);
112 static void	netstack_shared_kstat_remove(kstat_t *ks);
113 
114 typedef boolean_t applyfn_t(kmutex_t *, netstack_t *, int);
115 
116 static void	apply_all_netstacks(int, applyfn_t *);
117 static void	apply_all_modules(netstack_t *, applyfn_t *);
118 static void	apply_all_modules_reverse(netstack_t *, applyfn_t *);
119 static boolean_t netstack_apply_create(kmutex_t *, netstack_t *, int);
120 static boolean_t netstack_apply_shutdown(kmutex_t *, netstack_t *, int);
121 static boolean_t netstack_apply_destroy(kmutex_t *, netstack_t *, int);
122 static boolean_t wait_for_zone_creator(netstack_t *, kmutex_t *);
123 static boolean_t wait_for_nms_inprogress(netstack_t *, nm_state_t *,
124     kmutex_t *);
125 
126 static void netstack_hold_locked(netstack_t *);
127 
128 static ksema_t netstack_reap_limiter;
129 /*
130  * Hard-coded constant, but since this is not tunable in real-time, it seems
131  * making it an /etc/system tunable is better than nothing.
132  */
133 uint_t netstack_outstanding_reaps = 1024;
134 
135 void
136 netstack_init(void)
137 {
138 	mutex_init(&netstack_g_lock, NULL, MUTEX_DEFAULT, NULL);
139 	mutex_init(&netstack_shared_lock, NULL, MUTEX_DEFAULT, NULL);
140 
141 	sema_init(&netstack_reap_limiter, netstack_outstanding_reaps, NULL,
142 	    SEMA_DRIVER, NULL);
143 
144 	netstack_initialized = 1;
145 
146 	/*
147 	 * We want to be informed each time a zone is created or
148 	 * destroyed in the kernel, so we can maintain the
149 	 * stack instance information.
150 	 */
151 	zone_key_create(&netstack_zone_key, netstack_zone_create,
152 	    netstack_zone_shutdown, netstack_zone_destroy);
153 }
154 
155 /*
156  * Register a new module with the framework.
157  * This registers interest in changes to the set of netstacks.
158  * The createfn and destroyfn are required, but the shutdownfn can be
159  * NULL.
160  * Note that due to the current zsd implementation, when the create
161  * function is called the zone isn't fully present, thus functions
162  * like zone_find_by_* will fail, hence the create function can not
163  * use many zones kernel functions including zcmn_err().
164  */
165 void
166 netstack_register(int moduleid,
167     void *(*module_create)(netstackid_t, netstack_t *),
168     void (*module_shutdown)(netstackid_t, void *),
169     void (*module_destroy)(netstackid_t, void *))
170 {
171 	netstack_t *ns;
172 
173 	ASSERT(netstack_initialized);
174 	ASSERT(moduleid >= 0 && moduleid < NS_MAX);
175 	ASSERT(module_create != NULL);
176 
177 	/*
178 	 * Make instances created after this point in time run the create
179 	 * callback.
180 	 */
181 	mutex_enter(&netstack_g_lock);
182 	ASSERT(ns_reg[moduleid].nr_create == NULL);
183 	ASSERT(ns_reg[moduleid].nr_flags == 0);
184 	ns_reg[moduleid].nr_create = module_create;
185 	ns_reg[moduleid].nr_shutdown = module_shutdown;
186 	ns_reg[moduleid].nr_destroy = module_destroy;
187 	ns_reg[moduleid].nr_flags = NRF_REGISTERED;
188 
189 	/*
190 	 * Determine the set of stacks that exist before we drop the lock.
191 	 * Set NSS_CREATE_NEEDED for each of those.
192 	 * netstacks which have been deleted will have NSS_CREATE_COMPLETED
193 	 * set, but check NSF_CLOSING to be sure.
194 	 */
195 	for (ns = netstack_head; ns != NULL; ns = ns->netstack_next) {
196 		nm_state_t *nms = &ns->netstack_m_state[moduleid];
197 
198 		mutex_enter(&ns->netstack_lock);
199 		if (!(ns->netstack_flags & NSF_CLOSING) &&
200 		    (nms->nms_flags & NSS_CREATE_ALL) == 0) {
201 			nms->nms_flags |= NSS_CREATE_NEEDED;
202 			DTRACE_PROBE2(netstack__create__needed,
203 			    netstack_t *, ns, int, moduleid);
204 		}
205 		mutex_exit(&ns->netstack_lock);
206 	}
207 	mutex_exit(&netstack_g_lock);
208 
209 	/*
210 	 * At this point in time a new instance can be created or an instance
211 	 * can be destroyed, or some other module can register or unregister.
212 	 * Make sure we either run all the create functions for this moduleid
213 	 * or we wait for any other creators for this moduleid.
214 	 */
215 	apply_all_netstacks(moduleid, netstack_apply_create);
216 }
217 
218 void
219 netstack_unregister(int moduleid)
220 {
221 	netstack_t *ns;
222 
223 	ASSERT(moduleid >= 0 && moduleid < NS_MAX);
224 
225 	ASSERT(ns_reg[moduleid].nr_create != NULL);
226 	ASSERT(ns_reg[moduleid].nr_flags & NRF_REGISTERED);
227 
228 	mutex_enter(&netstack_g_lock);
229 	/*
230 	 * Determine the set of stacks that exist before we drop the lock.
231 	 * Set NSS_SHUTDOWN_NEEDED and NSS_DESTROY_NEEDED for each of those.
232 	 * That ensures that when we return all the callbacks for existing
233 	 * instances have completed. And since we set NRF_DYING no new
234 	 * instances can use this module.
235 	 */
236 	for (ns = netstack_head; ns != NULL; ns = ns->netstack_next) {
237 		boolean_t created = B_FALSE;
238 		nm_state_t *nms = &ns->netstack_m_state[moduleid];
239 
240 		mutex_enter(&ns->netstack_lock);
241 
242 		/*
243 		 * We need to be careful here. We could actually have a netstack
244 		 * being created as we speak waiting for us to let go of this
245 		 * lock to proceed. It may have set NSS_CREATE_NEEDED, but not
246 		 * have gotten to the point of completing it yet. If
247 		 * NSS_CREATE_NEEDED, we can safely just remove it here and
248 		 * never create the module. However, if NSS_CREATE_INPROGRESS is
249 		 * set, we need to still flag this module for shutdown and
250 		 * deletion, just as though it had reached NSS_CREATE_COMPLETED.
251 		 *
252 		 * It is safe to do that because of two different guarantees
253 		 * that exist in the system. The first is that before we do a
254 		 * create, shutdown, or destroy, we ensure that nothing else is
255 		 * in progress in the system for this netstack and wait for it
256 		 * to complete. Secondly, because the zone is being created, we
257 		 * know that the following call to apply_all_netstack will block
258 		 * on the zone finishing its initialization.
259 		 */
260 		if (nms->nms_flags & NSS_CREATE_NEEDED)
261 			nms->nms_flags &= ~NSS_CREATE_NEEDED;
262 
263 		if (nms->nms_flags & NSS_CREATE_INPROGRESS ||
264 		    nms->nms_flags & NSS_CREATE_COMPLETED)
265 			created = B_TRUE;
266 
267 		if (ns_reg[moduleid].nr_shutdown != NULL && created &&
268 		    (nms->nms_flags & NSS_CREATE_COMPLETED) &&
269 		    (nms->nms_flags & NSS_SHUTDOWN_ALL) == 0) {
270 			nms->nms_flags |= NSS_SHUTDOWN_NEEDED;
271 			DTRACE_PROBE2(netstack__shutdown__needed,
272 			    netstack_t *, ns, int, moduleid);
273 		}
274 		if ((ns_reg[moduleid].nr_flags & NRF_REGISTERED) &&
275 		    ns_reg[moduleid].nr_destroy != NULL && created &&
276 		    (nms->nms_flags & NSS_DESTROY_ALL) == 0) {
277 			nms->nms_flags |= NSS_DESTROY_NEEDED;
278 			DTRACE_PROBE2(netstack__destroy__needed,
279 			    netstack_t *, ns, int, moduleid);
280 		}
281 		mutex_exit(&ns->netstack_lock);
282 	}
283 	/*
284 	 * Prevent any new netstack from calling the registered create
285 	 * function, while keeping the function pointers in place until the
286 	 * shutdown and destroy callbacks are complete.
287 	 */
288 	ns_reg[moduleid].nr_flags |= NRF_DYING;
289 	mutex_exit(&netstack_g_lock);
290 
291 	apply_all_netstacks(moduleid, netstack_apply_shutdown);
292 	apply_all_netstacks(moduleid, netstack_apply_destroy);
293 
294 	/*
295 	 * Clear the nms_flags so that we can handle this module
296 	 * being loaded again.
297 	 * Also remove the registered functions.
298 	 */
299 	mutex_enter(&netstack_g_lock);
300 	ASSERT(ns_reg[moduleid].nr_flags & NRF_REGISTERED);
301 	ASSERT(ns_reg[moduleid].nr_flags & NRF_DYING);
302 	for (ns = netstack_head; ns != NULL; ns = ns->netstack_next) {
303 		nm_state_t *nms = &ns->netstack_m_state[moduleid];
304 
305 		mutex_enter(&ns->netstack_lock);
306 		if (nms->nms_flags & NSS_DESTROY_COMPLETED) {
307 			nms->nms_flags = 0;
308 			DTRACE_PROBE2(netstack__destroy__done,
309 			    netstack_t *, ns, int, moduleid);
310 		}
311 		mutex_exit(&ns->netstack_lock);
312 	}
313 
314 	ns_reg[moduleid].nr_create = NULL;
315 	ns_reg[moduleid].nr_shutdown = NULL;
316 	ns_reg[moduleid].nr_destroy = NULL;
317 	ns_reg[moduleid].nr_flags = 0;
318 	mutex_exit(&netstack_g_lock);
319 }
320 
321 /*
322  * Lookup and/or allocate a netstack for this zone.
323  */
324 static void *
325 netstack_zone_create(zoneid_t zoneid)
326 {
327 	netstackid_t stackid;
328 	netstack_t *ns;
329 	netstack_t **nsp;
330 	zone_t	*zone;
331 	int i;
332 
333 	ASSERT(netstack_initialized);
334 
335 	zone = zone_find_by_id_nolock(zoneid);
336 	ASSERT(zone != NULL);
337 
338 	if (zone->zone_flags & ZF_NET_EXCL) {
339 		stackid = zoneid;
340 	} else {
341 		/* Look for the stack instance for the global */
342 		stackid = GLOBAL_NETSTACKID;
343 	}
344 
345 	/* Allocate even if it isn't needed; simplifies locking */
346 	ns = (netstack_t *)kmem_zalloc(sizeof (netstack_t), KM_SLEEP);
347 
348 	/* Look if there is a matching stack instance */
349 	mutex_enter(&netstack_g_lock);
350 	for (nsp = &netstack_head; *nsp != NULL;
351 	    nsp = &((*nsp)->netstack_next)) {
352 		if ((*nsp)->netstack_stackid == stackid) {
353 			/*
354 			 * Should never find a pre-existing exclusive stack
355 			 */
356 			VERIFY(stackid == GLOBAL_NETSTACKID);
357 			kmem_free(ns, sizeof (netstack_t));
358 			ns = *nsp;
359 			mutex_enter(&ns->netstack_lock);
360 			ns->netstack_numzones++;
361 			mutex_exit(&ns->netstack_lock);
362 			mutex_exit(&netstack_g_lock);
363 			DTRACE_PROBE1(netstack__inc__numzones,
364 			    netstack_t *, ns);
365 			/* Record that we have a new shared stack zone */
366 			netstack_shared_zone_add(zoneid);
367 			zone->zone_netstack = ns;
368 			return (ns);
369 		}
370 	}
371 	/* Not found */
372 	mutex_init(&ns->netstack_lock, NULL, MUTEX_DEFAULT, NULL);
373 	cv_init(&ns->netstack_cv, NULL, CV_DEFAULT, NULL);
374 	ns->netstack_stackid = zoneid;
375 	ns->netstack_numzones = 1;
376 	ns->netstack_refcnt = 1; /* Decremented by netstack_zone_destroy */
377 	ns->netstack_flags = NSF_UNINIT;
378 	*nsp = ns;
379 	zone->zone_netstack = ns;
380 
381 	mutex_enter(&ns->netstack_lock);
382 	/*
383 	 * Mark this netstack as having a CREATE running so
384 	 * any netstack_register/netstack_unregister waits for
385 	 * the existing create callbacks to complete in moduleid order
386 	 */
387 	ns->netstack_flags |= NSF_ZONE_CREATE;
388 
389 	/*
390 	 * Determine the set of module create functions that need to be
391 	 * called before we drop the lock.
392 	 * Set NSS_CREATE_NEEDED for each of those.
393 	 * Skip any with NRF_DYING set, since those are in the process of
394 	 * going away, by checking for flags being exactly NRF_REGISTERED.
395 	 */
396 	for (i = 0; i < NS_MAX; i++) {
397 		nm_state_t *nms = &ns->netstack_m_state[i];
398 
399 		cv_init(&nms->nms_cv, NULL, CV_DEFAULT, NULL);
400 
401 		if ((ns_reg[i].nr_flags == NRF_REGISTERED) &&
402 		    (nms->nms_flags & NSS_CREATE_ALL) == 0) {
403 			nms->nms_flags |= NSS_CREATE_NEEDED;
404 			DTRACE_PROBE2(netstack__create__needed,
405 			    netstack_t *, ns, int, i);
406 		}
407 	}
408 	mutex_exit(&ns->netstack_lock);
409 	mutex_exit(&netstack_g_lock);
410 
411 	apply_all_modules(ns, netstack_apply_create);
412 
413 	/* Tell any waiting netstack_register/netstack_unregister to proceed */
414 	mutex_enter(&ns->netstack_lock);
415 	ns->netstack_flags &= ~NSF_UNINIT;
416 	ASSERT(ns->netstack_flags & NSF_ZONE_CREATE);
417 	ns->netstack_flags &= ~NSF_ZONE_CREATE;
418 	cv_broadcast(&ns->netstack_cv);
419 	mutex_exit(&ns->netstack_lock);
420 
421 	return (ns);
422 }
423 
424 /* ARGSUSED */
425 static void
426 netstack_zone_shutdown(zoneid_t zoneid, void *arg)
427 {
428 	netstack_t *ns = (netstack_t *)arg;
429 	int i;
430 
431 	ASSERT(arg != NULL);
432 
433 	mutex_enter(&ns->netstack_lock);
434 	ASSERT(ns->netstack_numzones > 0);
435 	if (ns->netstack_numzones != 1) {
436 		/* Stack instance being used by other zone */
437 		mutex_exit(&ns->netstack_lock);
438 		ASSERT(ns->netstack_stackid == GLOBAL_NETSTACKID);
439 		return;
440 	}
441 	mutex_exit(&ns->netstack_lock);
442 
443 	mutex_enter(&netstack_g_lock);
444 	mutex_enter(&ns->netstack_lock);
445 	/*
446 	 * Mark this netstack as having a SHUTDOWN running so
447 	 * any netstack_register/netstack_unregister waits for
448 	 * the existing create callbacks to complete in moduleid order
449 	 */
450 	ASSERT(!(ns->netstack_flags & NSF_ZONE_INPROGRESS));
451 	ns->netstack_flags |= NSF_ZONE_SHUTDOWN;
452 
453 	/*
454 	 * Determine the set of stacks that exist before we drop the lock.
455 	 * Set NSS_SHUTDOWN_NEEDED for each of those.
456 	 */
457 	for (i = 0; i < NS_MAX; i++) {
458 		nm_state_t *nms = &ns->netstack_m_state[i];
459 
460 		if ((ns_reg[i].nr_flags & NRF_REGISTERED) &&
461 		    ns_reg[i].nr_shutdown != NULL &&
462 		    (nms->nms_flags & NSS_CREATE_COMPLETED) &&
463 		    (nms->nms_flags & NSS_SHUTDOWN_ALL) == 0) {
464 			nms->nms_flags |= NSS_SHUTDOWN_NEEDED;
465 			DTRACE_PROBE2(netstack__shutdown__needed,
466 			    netstack_t *, ns, int, i);
467 		}
468 	}
469 	mutex_exit(&ns->netstack_lock);
470 	mutex_exit(&netstack_g_lock);
471 
472 	/*
473 	 * Call the shutdown function for all registered modules for this
474 	 * netstack.
475 	 */
476 	apply_all_modules_reverse(ns, netstack_apply_shutdown);
477 
478 	/* Tell any waiting netstack_register/netstack_unregister to proceed */
479 	mutex_enter(&ns->netstack_lock);
480 	ASSERT(ns->netstack_flags & NSF_ZONE_SHUTDOWN);
481 	ns->netstack_flags &= ~NSF_ZONE_SHUTDOWN;
482 	cv_broadcast(&ns->netstack_cv);
483 	mutex_exit(&ns->netstack_lock);
484 }
485 
486 /*
487  * Common routine to release a zone.
488  * If this was the last zone using the stack instance then prepare to
489  * have the refcnt dropping to zero free the zone.
490  */
491 /* ARGSUSED */
492 static void
493 netstack_zone_destroy(zoneid_t zoneid, void *arg)
494 {
495 	netstack_t *ns = (netstack_t *)arg;
496 
497 	ASSERT(arg != NULL);
498 
499 	mutex_enter(&ns->netstack_lock);
500 	ASSERT(ns->netstack_numzones > 0);
501 	ns->netstack_numzones--;
502 	if (ns->netstack_numzones != 0) {
503 		/* Stack instance being used by other zone */
504 		mutex_exit(&ns->netstack_lock);
505 		ASSERT(ns->netstack_stackid == GLOBAL_NETSTACKID);
506 		/* Record that we a shared stack zone has gone away */
507 		netstack_shared_zone_remove(zoneid);
508 		return;
509 	}
510 	/*
511 	 * Set CLOSING so that netstack_find_by will not find it.
512 	 */
513 	ns->netstack_flags |= NSF_CLOSING;
514 	mutex_exit(&ns->netstack_lock);
515 	DTRACE_PROBE1(netstack__dec__numzones, netstack_t *, ns);
516 	/* No other thread can call zone_destroy for this stack */
517 
518 	/*
519 	 * Decrease refcnt to account for the one in netstack_zone_init()
520 	 */
521 	netstack_rele(ns);
522 }
523 
524 /*
525  * Called when the reference count drops to zero.
526  * Call the destroy functions for each registered module.
527  */
528 static void
529 netstack_stack_inactive(netstack_t *ns)
530 {
531 	int i;
532 
533 	mutex_enter(&netstack_g_lock);
534 	mutex_enter(&ns->netstack_lock);
535 	/*
536 	 * Mark this netstack as having a DESTROY running so
537 	 * any netstack_register/netstack_unregister waits for
538 	 * the existing destroy callbacks to complete in reverse moduleid order
539 	 */
540 	ASSERT(!(ns->netstack_flags & NSF_ZONE_INPROGRESS));
541 	ns->netstack_flags |= NSF_ZONE_DESTROY;
542 	/*
543 	 * If the shutdown callback wasn't called earlier (e.g., if this is
544 	 * a netstack shared between multiple zones), then we schedule it now.
545 	 *
546 	 * Determine the set of stacks that exist before we drop the lock.
547 	 * Set NSS_DESTROY_NEEDED for each of those. That
548 	 * ensures that when we return all the callbacks for existing
549 	 * instances have completed.
550 	 */
551 	for (i = 0; i < NS_MAX; i++) {
552 		nm_state_t *nms = &ns->netstack_m_state[i];
553 
554 		if ((ns_reg[i].nr_flags & NRF_REGISTERED) &&
555 		    ns_reg[i].nr_shutdown != NULL &&
556 		    (nms->nms_flags & NSS_CREATE_COMPLETED) &&
557 		    (nms->nms_flags & NSS_SHUTDOWN_ALL) == 0) {
558 			nms->nms_flags |= NSS_SHUTDOWN_NEEDED;
559 			DTRACE_PROBE2(netstack__shutdown__needed,
560 			    netstack_t *, ns, int, i);
561 		}
562 
563 		if ((ns_reg[i].nr_flags & NRF_REGISTERED) &&
564 		    ns_reg[i].nr_destroy != NULL &&
565 		    (nms->nms_flags & NSS_CREATE_COMPLETED) &&
566 		    (nms->nms_flags & NSS_DESTROY_ALL) == 0) {
567 			nms->nms_flags |= NSS_DESTROY_NEEDED;
568 			DTRACE_PROBE2(netstack__destroy__needed,
569 			    netstack_t *, ns, int, i);
570 		}
571 	}
572 	mutex_exit(&ns->netstack_lock);
573 	mutex_exit(&netstack_g_lock);
574 
575 	/*
576 	 * Call the shutdown and destroy functions for all registered modules
577 	 * for this netstack.
578 	 *
579 	 * Since there are some ordering dependencies between the modules we
580 	 * tear them down in the reverse order of what was used to create them.
581 	 *
582 	 * Since a netstack_t is never reused (when a zone is rebooted it gets
583 	 * a new zoneid == netstackid i.e. a new netstack_t is allocated) we
584 	 * leave nms_flags the way it is i.e. with NSS_DESTROY_COMPLETED set.
585 	 * That is different than in the netstack_unregister() case.
586 	 */
587 	apply_all_modules_reverse(ns, netstack_apply_shutdown);
588 	apply_all_modules_reverse(ns, netstack_apply_destroy);
589 
590 	/* Tell any waiting netstack_register/netstack_unregister to proceed */
591 	mutex_enter(&ns->netstack_lock);
592 	ASSERT(ns->netstack_flags & NSF_ZONE_DESTROY);
593 	ns->netstack_flags &= ~NSF_ZONE_DESTROY;
594 	cv_broadcast(&ns->netstack_cv);
595 	mutex_exit(&ns->netstack_lock);
596 }
597 
598 /*
599  * Apply a function to all netstacks for a particular moduleid.
600  *
601  * If there is any zone activity (due to a zone being created, shutdown,
602  * or destroyed) we wait for that to complete before we proceed. This ensures
603  * that the moduleids are processed in order when a zone is created or
604  * destroyed.
605  *
606  * The applyfn has to drop netstack_g_lock if it does some work.
607  * In that case we don't follow netstack_next,
608  * even if it is possible to do so without any hazards. This is
609  * because we want the design to allow for the list of netstacks threaded
610  * by netstack_next to change in any arbitrary way during the time the
611  * lock was dropped.
612  *
613  * It is safe to restart the loop at netstack_head since the applyfn
614  * changes netstack_m_state as it processes things, so a subsequent
615  * pass through will have no effect in applyfn, hence the loop will terminate
616  * in at worst O(N^2).
617  */
618 static void
619 apply_all_netstacks(int moduleid, applyfn_t *applyfn)
620 {
621 	netstack_t *ns;
622 
623 	mutex_enter(&netstack_g_lock);
624 	ns = netstack_head;
625 	while (ns != NULL) {
626 		if (wait_for_zone_creator(ns, &netstack_g_lock)) {
627 			/* Lock dropped - restart at head */
628 			ns = netstack_head;
629 		} else if ((applyfn)(&netstack_g_lock, ns, moduleid)) {
630 			/* Lock dropped - restart at head */
631 			ns = netstack_head;
632 		} else {
633 			ns = ns->netstack_next;
634 		}
635 	}
636 	mutex_exit(&netstack_g_lock);
637 }
638 
639 /*
640  * Apply a function to all moduleids for a particular netstack.
641  *
642  * Since the netstack linkage doesn't matter in this case we can
643  * ignore whether the function drops the lock.
644  */
645 static void
646 apply_all_modules(netstack_t *ns, applyfn_t *applyfn)
647 {
648 	int i;
649 
650 	mutex_enter(&netstack_g_lock);
651 	for (i = 0; i < NS_MAX; i++) {
652 		/*
653 		 * We don't care whether the lock was dropped
654 		 * since we are not iterating over netstack_head.
655 		 */
656 		(void) (applyfn)(&netstack_g_lock, ns, i);
657 	}
658 	mutex_exit(&netstack_g_lock);
659 }
660 
661 /* Like the above but in reverse moduleid order */
662 static void
663 apply_all_modules_reverse(netstack_t *ns, applyfn_t *applyfn)
664 {
665 	int i;
666 
667 	mutex_enter(&netstack_g_lock);
668 	for (i = NS_MAX-1; i >= 0; i--) {
669 		/*
670 		 * We don't care whether the lock was dropped
671 		 * since we are not iterating over netstack_head.
672 		 */
673 		(void) (applyfn)(&netstack_g_lock, ns, i);
674 	}
675 	mutex_exit(&netstack_g_lock);
676 }
677 
678 /*
679  * Call the create function for the ns and moduleid if CREATE_NEEDED
680  * is set.
681  * If some other thread gets here first and sets *_INPROGRESS, then
682  * we wait for that thread to complete so that we can ensure that
683  * all the callbacks are done when we've looped over all netstacks/moduleids.
684  *
685  * When we call the create function, we temporarily drop the netstack_lock
686  * held by the caller, and return true to tell the caller it needs to
687  * re-evalute the state.
688  */
689 static boolean_t
690 netstack_apply_create(kmutex_t *lockp, netstack_t *ns, int moduleid)
691 {
692 	void *result;
693 	netstackid_t stackid;
694 	nm_state_t *nms = &ns->netstack_m_state[moduleid];
695 	boolean_t dropped = B_FALSE;
696 
697 	ASSERT(MUTEX_HELD(lockp));
698 	mutex_enter(&ns->netstack_lock);
699 
700 	if (wait_for_nms_inprogress(ns, nms, lockp))
701 		dropped = B_TRUE;
702 
703 	if (nms->nms_flags & NSS_CREATE_NEEDED) {
704 		nms->nms_flags &= ~NSS_CREATE_NEEDED;
705 		nms->nms_flags |= NSS_CREATE_INPROGRESS;
706 		DTRACE_PROBE2(netstack__create__inprogress,
707 		    netstack_t *, ns, int, moduleid);
708 		mutex_exit(&ns->netstack_lock);
709 		mutex_exit(lockp);
710 		dropped = B_TRUE;
711 
712 		ASSERT(ns_reg[moduleid].nr_create != NULL);
713 		stackid = ns->netstack_stackid;
714 		DTRACE_PROBE2(netstack__create__start,
715 		    netstackid_t, stackid,
716 		    netstack_t *, ns);
717 		result = (ns_reg[moduleid].nr_create)(stackid, ns);
718 		DTRACE_PROBE2(netstack__create__end,
719 		    void *, result, netstack_t *, ns);
720 
721 		ASSERT(result != NULL);
722 		mutex_enter(lockp);
723 		mutex_enter(&ns->netstack_lock);
724 		ns->netstack_modules[moduleid] = result;
725 		nms->nms_flags &= ~NSS_CREATE_INPROGRESS;
726 		nms->nms_flags |= NSS_CREATE_COMPLETED;
727 		cv_broadcast(&nms->nms_cv);
728 		DTRACE_PROBE2(netstack__create__completed,
729 		    netstack_t *, ns, int, moduleid);
730 		mutex_exit(&ns->netstack_lock);
731 		return (dropped);
732 	} else {
733 		mutex_exit(&ns->netstack_lock);
734 		return (dropped);
735 	}
736 }
737 
738 /*
739  * Call the shutdown function for the ns and moduleid if SHUTDOWN_NEEDED
740  * is set.
741  * If some other thread gets here first and sets *_INPROGRESS, then
742  * we wait for that thread to complete so that we can ensure that
743  * all the callbacks are done when we've looped over all netstacks/moduleids.
744  *
745  * When we call the shutdown function, we temporarily drop the netstack_lock
746  * held by the caller, and return true to tell the caller it needs to
747  * re-evalute the state.
748  */
749 static boolean_t
750 netstack_apply_shutdown(kmutex_t *lockp, netstack_t *ns, int moduleid)
751 {
752 	netstackid_t stackid;
753 	void * netstack_module;
754 	nm_state_t *nms = &ns->netstack_m_state[moduleid];
755 	boolean_t dropped = B_FALSE;
756 
757 	ASSERT(MUTEX_HELD(lockp));
758 	mutex_enter(&ns->netstack_lock);
759 
760 	if (wait_for_nms_inprogress(ns, nms, lockp))
761 		dropped = B_TRUE;
762 
763 	if (nms->nms_flags & NSS_SHUTDOWN_NEEDED) {
764 		nms->nms_flags &= ~NSS_SHUTDOWN_NEEDED;
765 		nms->nms_flags |= NSS_SHUTDOWN_INPROGRESS;
766 		DTRACE_PROBE2(netstack__shutdown__inprogress,
767 		    netstack_t *, ns, int, moduleid);
768 		mutex_exit(&ns->netstack_lock);
769 		mutex_exit(lockp);
770 		dropped = B_TRUE;
771 
772 		ASSERT(ns_reg[moduleid].nr_shutdown != NULL);
773 		stackid = ns->netstack_stackid;
774 		netstack_module = ns->netstack_modules[moduleid];
775 		DTRACE_PROBE2(netstack__shutdown__start,
776 		    netstackid_t, stackid,
777 		    void *, netstack_module);
778 		(ns_reg[moduleid].nr_shutdown)(stackid, netstack_module);
779 		DTRACE_PROBE1(netstack__shutdown__end,
780 		    netstack_t *, ns);
781 
782 		mutex_enter(lockp);
783 		mutex_enter(&ns->netstack_lock);
784 		nms->nms_flags &= ~NSS_SHUTDOWN_INPROGRESS;
785 		nms->nms_flags |= NSS_SHUTDOWN_COMPLETED;
786 		cv_broadcast(&nms->nms_cv);
787 		DTRACE_PROBE2(netstack__shutdown__completed,
788 		    netstack_t *, ns, int, moduleid);
789 		mutex_exit(&ns->netstack_lock);
790 		return (dropped);
791 	} else {
792 		mutex_exit(&ns->netstack_lock);
793 		return (dropped);
794 	}
795 }
796 
797 /*
798  * Call the destroy function for the ns and moduleid if DESTROY_NEEDED
799  * is set.
800  * If some other thread gets here first and sets *_INPROGRESS, then
801  * we wait for that thread to complete so that we can ensure that
802  * all the callbacks are done when we've looped over all netstacks/moduleids.
803  *
804  * When we call the destroy function, we temporarily drop the netstack_lock
805  * held by the caller, and return true to tell the caller it needs to
806  * re-evalute the state.
807  */
808 static boolean_t
809 netstack_apply_destroy(kmutex_t *lockp, netstack_t *ns, int moduleid)
810 {
811 	netstackid_t stackid;
812 	void * netstack_module;
813 	nm_state_t *nms = &ns->netstack_m_state[moduleid];
814 	boolean_t dropped = B_FALSE;
815 
816 	ASSERT(MUTEX_HELD(lockp));
817 	mutex_enter(&ns->netstack_lock);
818 
819 	if (wait_for_nms_inprogress(ns, nms, lockp))
820 		dropped = B_TRUE;
821 
822 	if (nms->nms_flags & NSS_DESTROY_NEEDED) {
823 		nms->nms_flags &= ~NSS_DESTROY_NEEDED;
824 		nms->nms_flags |= NSS_DESTROY_INPROGRESS;
825 		DTRACE_PROBE2(netstack__destroy__inprogress,
826 		    netstack_t *, ns, int, moduleid);
827 		mutex_exit(&ns->netstack_lock);
828 		mutex_exit(lockp);
829 		dropped = B_TRUE;
830 
831 		ASSERT(ns_reg[moduleid].nr_destroy != NULL);
832 		stackid = ns->netstack_stackid;
833 		netstack_module = ns->netstack_modules[moduleid];
834 		DTRACE_PROBE2(netstack__destroy__start,
835 		    netstackid_t, stackid,
836 		    void *, netstack_module);
837 		(ns_reg[moduleid].nr_destroy)(stackid, netstack_module);
838 		DTRACE_PROBE1(netstack__destroy__end,
839 		    netstack_t *, ns);
840 
841 		mutex_enter(lockp);
842 		mutex_enter(&ns->netstack_lock);
843 		ns->netstack_modules[moduleid] = NULL;
844 		nms->nms_flags &= ~NSS_DESTROY_INPROGRESS;
845 		nms->nms_flags |= NSS_DESTROY_COMPLETED;
846 		cv_broadcast(&nms->nms_cv);
847 		DTRACE_PROBE2(netstack__destroy__completed,
848 		    netstack_t *, ns, int, moduleid);
849 		mutex_exit(&ns->netstack_lock);
850 		return (dropped);
851 	} else {
852 		mutex_exit(&ns->netstack_lock);
853 		return (dropped);
854 	}
855 }
856 
857 /*
858  * If somebody  is creating the netstack (due to a new zone being created)
859  * then we wait for them to complete. This ensures that any additional
860  * netstack_register() doesn't cause the create functions to run out of
861  * order.
862  * Note that we do not need such a global wait in the case of the shutdown
863  * and destroy callbacks, since in that case it is sufficient for both
864  * threads to set NEEDED and wait for INPROGRESS to ensure ordering.
865  * Returns true if lockp was temporarily dropped while waiting.
866  */
867 static boolean_t
868 wait_for_zone_creator(netstack_t *ns, kmutex_t *lockp)
869 {
870 	boolean_t dropped = B_FALSE;
871 
872 	mutex_enter(&ns->netstack_lock);
873 	while (ns->netstack_flags & NSF_ZONE_CREATE) {
874 		DTRACE_PROBE1(netstack__wait__zone__inprogress,
875 		    netstack_t *, ns);
876 		if (lockp != NULL) {
877 			dropped = B_TRUE;
878 			mutex_exit(lockp);
879 		}
880 		cv_wait(&ns->netstack_cv, &ns->netstack_lock);
881 		if (lockp != NULL) {
882 			/* First drop netstack_lock to preserve order */
883 			mutex_exit(&ns->netstack_lock);
884 			mutex_enter(lockp);
885 			mutex_enter(&ns->netstack_lock);
886 		}
887 	}
888 	mutex_exit(&ns->netstack_lock);
889 	return (dropped);
890 }
891 
892 /*
893  * Wait for any INPROGRESS flag to be cleared for the netstack/moduleid
894  * combination.
895  * Returns true if lockp was temporarily dropped while waiting.
896  */
897 static boolean_t
898 wait_for_nms_inprogress(netstack_t *ns, nm_state_t *nms, kmutex_t *lockp)
899 {
900 	boolean_t dropped = B_FALSE;
901 
902 	while (nms->nms_flags & NSS_ALL_INPROGRESS) {
903 		DTRACE_PROBE2(netstack__wait__nms__inprogress,
904 		    netstack_t *, ns, nm_state_t *, nms);
905 		if (lockp != NULL) {
906 			dropped = B_TRUE;
907 			mutex_exit(lockp);
908 		}
909 		cv_wait(&nms->nms_cv, &ns->netstack_lock);
910 		if (lockp != NULL) {
911 			/* First drop netstack_lock to preserve order */
912 			mutex_exit(&ns->netstack_lock);
913 			mutex_enter(lockp);
914 			mutex_enter(&ns->netstack_lock);
915 		}
916 	}
917 	return (dropped);
918 }
919 
920 /*
921  * Get the stack instance used in caller's zone.
922  * Increases the reference count, caller must do a netstack_rele.
923  * It can't be called after zone_destroy() has started.
924  */
925 netstack_t *
926 netstack_get_current(void)
927 {
928 	netstack_t *ns;
929 
930 	ns = curproc->p_zone->zone_netstack;
931 	ASSERT(ns != NULL);
932 	return (netstack_hold_if_active(ns));
933 }
934 
935 /*
936  * Find a stack instance given the cred.
937  * This is used by the modules to potentially allow for a future when
938  * something other than the zoneid is used to determine the stack.
939  */
940 netstack_t *
941 netstack_find_by_cred(const cred_t *cr)
942 {
943 	zoneid_t zoneid = crgetzoneid(cr);
944 
945 	/* Handle the case when cr_zone is NULL */
946 	if (zoneid == (zoneid_t)-1)
947 		zoneid = GLOBAL_ZONEID;
948 
949 	/* For performance ... */
950 	if (curproc->p_zone->zone_id == zoneid)
951 		return (netstack_get_current());
952 	else
953 		return (netstack_find_by_zoneid(zoneid));
954 }
955 
956 /*
957  * Find a stack instance given the zoneid.
958  * Increases the reference count if found; caller must do a
959  * netstack_rele().
960  *
961  * If there is no exact match then assume the shared stack instance
962  * matches.
963  *
964  * Skip the uninitialized and closing ones.
965  */
966 netstack_t *
967 netstack_find_by_zoneid(zoneid_t zoneid)
968 {
969 	netstack_t *ns;
970 	zone_t *zone;
971 
972 	zone = zone_find_by_id(zoneid);
973 
974 	if (zone == NULL)
975 		return (NULL);
976 
977 	ASSERT(zone->zone_netstack != NULL);
978 	ns = netstack_hold_if_active(zone->zone_netstack);
979 
980 	zone_rele(zone);
981 	return (ns);
982 }
983 
984 /*
985  * Find a stack instance given the zoneid. Can only be called from
986  * the create callback. See the comments in zone_find_by_id_nolock why
987  * that limitation exists.
988  *
989  * Increases the reference count if found; caller must do a
990  * netstack_rele().
991  *
992  * If there is no exact match then assume the shared stack instance
993  * matches.
994  *
995  * Skip the unitialized ones.
996  */
997 netstack_t *
998 netstack_find_by_zoneid_nolock(zoneid_t zoneid)
999 {
1000 	zone_t *zone;
1001 
1002 	zone = zone_find_by_id_nolock(zoneid);
1003 
1004 	if (zone == NULL)
1005 		return (NULL);
1006 
1007 	ASSERT(zone->zone_netstack != NULL);
1008 	/* zone_find_by_id_nolock does not have a hold on the zone */
1009 	return (netstack_hold_if_active(zone->zone_netstack));
1010 }
1011 
1012 /*
1013  * Find a stack instance given the stackid with exact match?
1014  * Increases the reference count if found; caller must do a
1015  * netstack_rele().
1016  *
1017  * Skip the unitialized ones.
1018  */
1019 netstack_t *
1020 netstack_find_by_stackid(netstackid_t stackid)
1021 {
1022 	netstack_t *ns;
1023 
1024 	mutex_enter(&netstack_g_lock);
1025 	for (ns = netstack_head; ns != NULL; ns = ns->netstack_next) {
1026 		/* Can't use hold_if_active because of stackid check. */
1027 		mutex_enter(&ns->netstack_lock);
1028 		if (ns->netstack_stackid == stackid &&
1029 		    !(ns->netstack_flags & (NSF_UNINIT|NSF_CLOSING))) {
1030 			netstack_hold_locked(ns);
1031 			mutex_exit(&ns->netstack_lock);
1032 			mutex_exit(&netstack_g_lock);
1033 			return (ns);
1034 		}
1035 		mutex_exit(&ns->netstack_lock);
1036 	}
1037 	mutex_exit(&netstack_g_lock);
1038 	return (NULL);
1039 }
1040 
1041 boolean_t
1042 netstack_inuse_by_stackid(netstackid_t stackid)
1043 {
1044 	netstack_t *ns;
1045 	boolean_t rval = B_FALSE;
1046 
1047 	mutex_enter(&netstack_g_lock);
1048 
1049 	for (ns = netstack_head; ns != NULL; ns = ns->netstack_next) {
1050 		if (ns->netstack_stackid == stackid) {
1051 			rval = B_TRUE;
1052 			break;
1053 		}
1054 	}
1055 
1056 	mutex_exit(&netstack_g_lock);
1057 
1058 	return (rval);
1059 }
1060 
1061 
1062 static void
1063 netstack_reap(void *arg)
1064 {
1065 	netstack_t **nsp, *ns = (netstack_t *)arg;
1066 	boolean_t found;
1067 	int i;
1068 
1069 	/*
1070 	 * Time to call the destroy functions and free up
1071 	 * the structure
1072 	 */
1073 	netstack_stack_inactive(ns);
1074 
1075 	/* Make sure nothing increased the references */
1076 	ASSERT(ns->netstack_refcnt == 0);
1077 	ASSERT(ns->netstack_numzones == 0);
1078 
1079 	/* Finally remove from list of netstacks */
1080 	mutex_enter(&netstack_g_lock);
1081 	found = B_FALSE;
1082 	for (nsp = &netstack_head; *nsp != NULL;
1083 	    nsp = &(*nsp)->netstack_next) {
1084 		if (*nsp == ns) {
1085 			*nsp = ns->netstack_next;
1086 			ns->netstack_next = NULL;
1087 			found = B_TRUE;
1088 			break;
1089 		}
1090 	}
1091 	ASSERT(found);
1092 	mutex_exit(&netstack_g_lock);
1093 
1094 	/* Make sure nothing increased the references */
1095 	ASSERT(ns->netstack_refcnt == 0);
1096 	ASSERT(ns->netstack_numzones == 0);
1097 
1098 	ASSERT(ns->netstack_flags & NSF_CLOSING);
1099 
1100 	for (i = 0; i < NS_MAX; i++) {
1101 		nm_state_t *nms = &ns->netstack_m_state[i];
1102 
1103 		cv_destroy(&nms->nms_cv);
1104 	}
1105 	mutex_destroy(&ns->netstack_lock);
1106 	cv_destroy(&ns->netstack_cv);
1107 	kmem_free(ns, sizeof (*ns));
1108 	/* Allow another reap to be scheduled. */
1109 	sema_v(&netstack_reap_limiter);
1110 }
1111 
1112 void
1113 netstack_rele(netstack_t *ns)
1114 {
1115 	int refcnt, numzones;
1116 
1117 	mutex_enter(&ns->netstack_lock);
1118 	ASSERT(ns->netstack_refcnt > 0);
1119 	ns->netstack_refcnt--;
1120 	/*
1121 	 * As we drop the lock additional netstack_rele()s can come in
1122 	 * and decrement the refcnt to zero and free the netstack_t.
1123 	 * Store pointers in local variables and if we were not the last
1124 	 * then don't reference the netstack_t after that.
1125 	 */
1126 	refcnt = ns->netstack_refcnt;
1127 	numzones = ns->netstack_numzones;
1128 	DTRACE_PROBE1(netstack__dec__ref, netstack_t *, ns);
1129 	mutex_exit(&ns->netstack_lock);
1130 
1131 	if (refcnt == 0 && numzones == 0) {
1132 		/*
1133 		 * Because there are possibilities of re-entrancy in various
1134 		 * netstack structures by callers, which might cause a lock up
1135 		 * due to odd reference models, or other factors, we choose to
1136 		 * schedule the actual deletion of this netstack as a deferred
1137 		 * task on the system taskq.  This way, any such reference
1138 		 * models won't trip over themselves.
1139 		 *
1140 		 * Assume we aren't in a high-priority interrupt context, so
1141 		 * we can use KM_SLEEP and semaphores.
1142 		 */
1143 		if (sema_tryp(&netstack_reap_limiter) == 0) {
1144 			/*
1145 			 * Indicate we're slamming against a limit.
1146 			 */
1147 			hrtime_t measurement = gethrtime();
1148 
1149 			sema_p(&netstack_reap_limiter);
1150 			/* Capture delay in ns. */
1151 			DTRACE_PROBE1(netstack__reap__rate__limited,
1152 			    hrtime_t, gethrtime() - measurement);
1153 		}
1154 
1155 		/* TQ_SLEEP should prevent taskq_dispatch() from failing. */
1156 		(void) taskq_dispatch(system_taskq, netstack_reap, ns,
1157 		    TQ_SLEEP);
1158 	}
1159 }
1160 
1161 static void
1162 netstack_hold_locked(netstack_t *ns)
1163 {
1164 	ASSERT(MUTEX_HELD(&ns->netstack_lock));
1165 	ns->netstack_refcnt++;
1166 	ASSERT(ns->netstack_refcnt > 0);
1167 	DTRACE_PROBE1(netstack__inc__ref, netstack_t *, ns);
1168 }
1169 
1170 /*
1171  * If the passed-in netstack isn't active (i.e. it's uninitialized or closing),
1172  * return NULL, otherwise return it with its reference held.  Common code
1173  * for many netstack_find*() functions.
1174  */
1175 netstack_t *
1176 netstack_hold_if_active(netstack_t *ns)
1177 {
1178 	netstack_t *retval;
1179 
1180 	mutex_enter(&ns->netstack_lock);
1181 	if (ns->netstack_flags & (NSF_UNINIT | NSF_CLOSING)) {
1182 		retval = NULL;
1183 	} else {
1184 		netstack_hold_locked(ns);
1185 		retval = ns;
1186 	}
1187 	mutex_exit(&ns->netstack_lock);
1188 
1189 	return (retval);
1190 }
1191 
1192 void
1193 netstack_hold(netstack_t *ns)
1194 {
1195 	mutex_enter(&ns->netstack_lock);
1196 	netstack_hold_locked(ns);
1197 	mutex_exit(&ns->netstack_lock);
1198 }
1199 
1200 /*
1201  * To support kstat_create_netstack() using kstat_zone_add we need
1202  * to track both
1203  *  - all zoneids that use the global/shared stack
1204  *  - all kstats that have been added for the shared stack
1205  */
1206 kstat_t *
1207 kstat_create_netstack(char *ks_module, int ks_instance, char *ks_name,
1208     char *ks_class, uchar_t ks_type, uint_t ks_ndata, uchar_t ks_flags,
1209     netstackid_t ks_netstackid)
1210 {
1211 	kstat_t *ks;
1212 
1213 	if (ks_netstackid == GLOBAL_NETSTACKID) {
1214 		ks = kstat_create_zone(ks_module, ks_instance, ks_name,
1215 		    ks_class, ks_type, ks_ndata, ks_flags, GLOBAL_ZONEID);
1216 		if (ks != NULL)
1217 			netstack_shared_kstat_add(ks);
1218 		return (ks);
1219 	} else {
1220 		zoneid_t zoneid = ks_netstackid;
1221 
1222 		return (kstat_create_zone(ks_module, ks_instance, ks_name,
1223 		    ks_class, ks_type, ks_ndata, ks_flags, zoneid));
1224 	}
1225 }
1226 
1227 void
1228 kstat_delete_netstack(kstat_t *ks, netstackid_t ks_netstackid)
1229 {
1230 	if (ks_netstackid == GLOBAL_NETSTACKID) {
1231 		netstack_shared_kstat_remove(ks);
1232 	}
1233 	kstat_delete(ks);
1234 }
1235 
1236 static void
1237 netstack_shared_zone_add(zoneid_t zoneid)
1238 {
1239 	struct shared_zone_list *sz;
1240 	struct shared_kstat_list *sk;
1241 
1242 	sz = (struct shared_zone_list *)kmem_zalloc(sizeof (*sz), KM_SLEEP);
1243 	sz->sz_zoneid = zoneid;
1244 
1245 	/* Insert in list */
1246 	mutex_enter(&netstack_shared_lock);
1247 	sz->sz_next = netstack_shared_zones;
1248 	netstack_shared_zones = sz;
1249 
1250 	/*
1251 	 * Perform kstat_zone_add for each existing shared stack kstat.
1252 	 * Note: Holds netstack_shared_lock lock across kstat_zone_add.
1253 	 */
1254 	for (sk = netstack_shared_kstats; sk != NULL; sk = sk->sk_next) {
1255 		kstat_zone_add(sk->sk_kstat, zoneid);
1256 	}
1257 	mutex_exit(&netstack_shared_lock);
1258 }
1259 
1260 static void
1261 netstack_shared_zone_remove(zoneid_t zoneid)
1262 {
1263 	struct shared_zone_list **szp, *sz;
1264 	struct shared_kstat_list *sk;
1265 
1266 	/* Find in list */
1267 	mutex_enter(&netstack_shared_lock);
1268 	sz = NULL;
1269 	for (szp = &netstack_shared_zones; *szp != NULL;
1270 	    szp = &((*szp)->sz_next)) {
1271 		if ((*szp)->sz_zoneid == zoneid) {
1272 			sz = *szp;
1273 			break;
1274 		}
1275 	}
1276 	/* We must find it */
1277 	ASSERT(sz != NULL);
1278 	*szp = sz->sz_next;
1279 	sz->sz_next = NULL;
1280 
1281 	/*
1282 	 * Perform kstat_zone_remove for each existing shared stack kstat.
1283 	 * Note: Holds netstack_shared_lock lock across kstat_zone_remove.
1284 	 */
1285 	for (sk = netstack_shared_kstats; sk != NULL; sk = sk->sk_next) {
1286 		kstat_zone_remove(sk->sk_kstat, zoneid);
1287 	}
1288 	mutex_exit(&netstack_shared_lock);
1289 
1290 	kmem_free(sz, sizeof (*sz));
1291 }
1292 
1293 static void
1294 netstack_shared_kstat_add(kstat_t *ks)
1295 {
1296 	struct shared_zone_list *sz;
1297 	struct shared_kstat_list *sk;
1298 
1299 	sk = (struct shared_kstat_list *)kmem_zalloc(sizeof (*sk), KM_SLEEP);
1300 	sk->sk_kstat = ks;
1301 
1302 	/* Insert in list */
1303 	mutex_enter(&netstack_shared_lock);
1304 	sk->sk_next = netstack_shared_kstats;
1305 	netstack_shared_kstats = sk;
1306 
1307 	/*
1308 	 * Perform kstat_zone_add for each existing shared stack zone.
1309 	 * Note: Holds netstack_shared_lock lock across kstat_zone_add.
1310 	 */
1311 	for (sz = netstack_shared_zones; sz != NULL; sz = sz->sz_next) {
1312 		kstat_zone_add(ks, sz->sz_zoneid);
1313 	}
1314 	mutex_exit(&netstack_shared_lock);
1315 }
1316 
1317 static void
1318 netstack_shared_kstat_remove(kstat_t *ks)
1319 {
1320 	struct shared_zone_list *sz;
1321 	struct shared_kstat_list **skp, *sk;
1322 
1323 	/* Find in list */
1324 	mutex_enter(&netstack_shared_lock);
1325 	sk = NULL;
1326 	for (skp = &netstack_shared_kstats; *skp != NULL;
1327 	    skp = &((*skp)->sk_next)) {
1328 		if ((*skp)->sk_kstat == ks) {
1329 			sk = *skp;
1330 			break;
1331 		}
1332 	}
1333 	/* Must find it */
1334 	ASSERT(sk != NULL);
1335 	*skp = sk->sk_next;
1336 	sk->sk_next = NULL;
1337 
1338 	/*
1339 	 * Perform kstat_zone_remove for each existing shared stack kstat.
1340 	 * Note: Holds netstack_shared_lock lock across kstat_zone_remove.
1341 	 */
1342 	for (sz = netstack_shared_zones; sz != NULL; sz = sz->sz_next) {
1343 		kstat_zone_remove(ks, sz->sz_zoneid);
1344 	}
1345 	mutex_exit(&netstack_shared_lock);
1346 	kmem_free(sk, sizeof (*sk));
1347 }
1348 
1349 /*
1350  * If a zoneid is part of the shared zone, return true
1351  */
1352 static boolean_t
1353 netstack_find_shared_zoneid(zoneid_t zoneid)
1354 {
1355 	struct shared_zone_list *sz;
1356 
1357 	mutex_enter(&netstack_shared_lock);
1358 	for (sz = netstack_shared_zones; sz != NULL; sz = sz->sz_next) {
1359 		if (sz->sz_zoneid == zoneid) {
1360 			mutex_exit(&netstack_shared_lock);
1361 			return (B_TRUE);
1362 		}
1363 	}
1364 	mutex_exit(&netstack_shared_lock);
1365 	return (B_FALSE);
1366 }
1367 
1368 /*
1369  * Hide the fact that zoneids and netstackids are allocated from
1370  * the same space in the current implementation.
1371  * We currently do not check that the stackid/zoneids are valid, since there
1372  * is no need for that. But this should only be done for ids that are
1373  * valid.
1374  */
1375 zoneid_t
1376 netstackid_to_zoneid(netstackid_t stackid)
1377 {
1378 	return (stackid);
1379 }
1380 
1381 netstackid_t
1382 zoneid_to_netstackid(zoneid_t zoneid)
1383 {
1384 	if (netstack_find_shared_zoneid(zoneid))
1385 		return (GLOBAL_ZONEID);
1386 	else
1387 		return (zoneid);
1388 }
1389 
1390 zoneid_t
1391 netstack_get_zoneid(netstack_t *ns)
1392 {
1393 	return (netstackid_to_zoneid(ns->netstack_stackid));
1394 }
1395 
1396 /*
1397  * Simplistic support for walking all the handles.
1398  * Example usage:
1399  *	netstack_handle_t nh;
1400  *	netstack_t *ns;
1401  *
1402  *	netstack_next_init(&nh);
1403  *	while ((ns = netstack_next(&nh)) != NULL) {
1404  *		do something;
1405  *		netstack_rele(ns);
1406  *	}
1407  *	netstack_next_fini(&nh);
1408  */
1409 void
1410 netstack_next_init(netstack_handle_t *handle)
1411 {
1412 	*handle = 0;
1413 }
1414 
1415 /* ARGSUSED */
1416 void
1417 netstack_next_fini(netstack_handle_t *handle)
1418 {
1419 }
1420 
1421 netstack_t *
1422 netstack_next(netstack_handle_t *handle)
1423 {
1424 	netstack_t *ns;
1425 	int i, end;
1426 
1427 	end = *handle;
1428 	/* Walk skipping *handle number of instances */
1429 
1430 	/* Look if there is a matching stack instance */
1431 	mutex_enter(&netstack_g_lock);
1432 	ns = netstack_head;
1433 	for (i = 0; i < end; i++) {
1434 		if (ns == NULL)
1435 			break;
1436 		ns = ns->netstack_next;
1437 	}
1438 	/*
1439 	 * Skip those that aren't really here (uninitialized or closing).
1440 	 * Can't use hold_if_active because of "end" tracking.
1441 	 */
1442 	while (ns != NULL) {
1443 		mutex_enter(&ns->netstack_lock);
1444 		if ((ns->netstack_flags & (NSF_UNINIT|NSF_CLOSING)) == 0) {
1445 			*handle = end + 1;
1446 			netstack_hold_locked(ns);
1447 			mutex_exit(&ns->netstack_lock);
1448 			break;
1449 		}
1450 		mutex_exit(&ns->netstack_lock);
1451 		end++;
1452 		ns = ns->netstack_next;
1453 	}
1454 	mutex_exit(&netstack_g_lock);
1455 	return (ns);
1456 }
1457