/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/param.h>
#include <sys/sysmacros.h>
#include <sys/vm.h>
#include <sys/proc.h>
#include <sys/tuneable.h>
#include <sys/systm.h>
#include <sys/cmn_err.h>
#include <sys/debug.h>
#include <sys/sdt.h>
#include <sys/mutex.h>
#include <sys/bitmap.h>
#include <sys/atomic.h>
#include <sys/kobj.h>
#include <sys/disp.h>
#include <vm/seg_kmem.h>
#include <sys/zone.h>
#include <sys/netstack.h>

/*
 * The key used by the zones framework to tell us about new zones,
 * which we use to create new stacks.
 */
static zone_key_t netstack_zone_key;

static int	netstack_initialized = 0;

/*
 * Track the registered netstacks.
 * The global lock protects
 * - ns_reg
 * - the list starting at netstack_head and following the netstack_next
 *   pointers.
 */
static kmutex_t netstack_g_lock;

/*
 * Registry of netstacks with their create/shutdown/destroy functions.
 */
static struct netstack_registry	ns_reg[NS_MAX];

/*
 * Global list of existing stacks.  We use this when a new zone with
 * an exclusive IP instance is created.
 *
 * Note that in some cases a netstack_t needs to stay around after the zone
 * has gone away. This is because there might be outstanding references
 * (from TCP TIME_WAIT connections, IPsec state, etc). The netstack_t data
 * structure and all the foo_stack_t's hanging off of it will be cleaned up
 * when the last reference to it is dropped.
 * However, the same zone might be rebooted. That is handled using the
 * assumption that the zones framework picks a new zoneid each time a zone
 * is (re)booted. We assert for that condition in netstack_zone_create().
 * Thus the old netstack_t can take its time for things to time out.
 */
static netstack_t *netstack_head;

/*
 * To support kstat_create_netstack() using kstat_zone_add we need
 * to track both
 *  - all zoneids that use the global/shared stack
 *  - all kstats that have been added for the shared stack
 */
struct shared_zone_list {
	struct shared_zone_list *sz_next;
	zoneid_t		sz_zoneid;
};

struct shared_kstat_list {
	struct shared_kstat_list *sk_next;
	kstat_t			 *sk_kstat;
};

static kmutex_t netstack_shared_lock;	/* protects the following two */
static struct shared_zone_list	*netstack_shared_zones;
static struct shared_kstat_list	*netstack_shared_kstats;

static void	*netstack_zone_create(zoneid_t zoneid);
static void	netstack_zone_shutdown(zoneid_t zoneid, void *arg);
static void	netstack_zone_destroy(zoneid_t zoneid, void *arg);

static void	netstack_do_create(void);
static void	netstack_do_shutdown(void);
static void	netstack_do_destroy(void);

static void	netstack_shared_zone_add(zoneid_t zoneid);
static void	netstack_shared_zone_remove(zoneid_t zoneid);
static void	netstack_shared_kstat_add(kstat_t *ks);
static void	netstack_shared_kstat_remove(kstat_t *ks);


void
netstack_init(void)
{
	mutex_init(&netstack_g_lock, NULL, MUTEX_DEFAULT, NULL);
	mutex_init(&netstack_shared_lock, NULL, MUTEX_DEFAULT, NULL);

	netstack_initialized = 1;

	/*
	 * We want to be informed each time a zone is created or
	 * destroyed in the kernel, so we can maintain the
	 * stack instance information.
	 */
	zone_key_create(&netstack_zone_key, netstack_zone_create,
	    netstack_zone_shutdown, netstack_zone_destroy);
}

/*
 * Register a new module with the framework.
 * This registers interest in changes to the set of netstacks.
 * The createfn and destroyfn are required, but the shutdownfn can be
 * NULL.
 * Note that due to the current zsd implementation, when the create
 * function is called the zone isn't fully present; functions
 * like zone_find_by_* will fail, hence the create function cannot
 * use many zone kernel functions, including zcmn_err().
 */
void
netstack_register(int moduleid,
    void *(*module_create)(netstackid_t, netstack_t *),
    void (*module_shutdown)(netstackid_t, void *),
    void (*module_destroy)(netstackid_t, void *))
{
	netstack_t *ns;

	ASSERT(netstack_initialized);
	ASSERT(moduleid >= 0 && moduleid < NS_MAX);
	ASSERT(module_create != NULL);

	mutex_enter(&netstack_g_lock);
	ASSERT(ns_reg[moduleid].nr_create == NULL);
	ASSERT(ns_reg[moduleid].nr_flags == 0);
	ns_reg[moduleid].nr_create = module_create;
	ns_reg[moduleid].nr_shutdown = module_shutdown;
	ns_reg[moduleid].nr_destroy = module_destroy;
	ns_reg[moduleid].nr_flags = NRF_REGISTERED;

	/*
	 * Determine the set of stacks that exist before we drop the lock.
	 * Set CREATE_NEEDED for each of those.
	 * netstacks which have been deleted will have NSS_CREATE_COMPLETED
	 * set, but check NSF_CLOSING to be sure.
	 */
	for (ns = netstack_head; ns != NULL; ns = ns->netstack_next) {
		mutex_enter(&ns->netstack_lock);
		if (!(ns->netstack_flags & NSF_CLOSING) &&
		    (ns->netstack_m_state[moduleid] & NSS_CREATE_ALL) == 0) {
			ns->netstack_m_state[moduleid] |= NSS_CREATE_NEEDED;
			DTRACE_PROBE2(netstack__create__needed,
			    netstack_t *, ns, int, moduleid);
		}
		mutex_exit(&ns->netstack_lock);
	}
	mutex_exit(&netstack_g_lock);

	/*
	 * Call the create function for each stack that has CREATE_NEEDED.
	 * Set CREATE_INPROGRESS, drop the lock, and when done set
	 * CREATE_COMPLETED.
	 */
	netstack_do_create();
}
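
/*
 * Example (illustrative sketch, not code from this file): a hypothetical
 * module "foo" registering itself, e.g. from its _init() routine. NS_FOO,
 * foo_stack_t and the foo_stack_* functions are assumed names; the create
 * function's return value is what ends up in ns->netstack_modules[NS_FOO].
 *
 *	static void *
 *	foo_stack_create(netstackid_t stackid, netstack_t *ns)
 *	{
 *		foo_stack_t *fs;
 *
 *		fs = kmem_zalloc(sizeof (*fs), KM_SLEEP);
 *		fs->fs_stackid = stackid;
 *		return (fs);
 *	}
 *
 *	static void
 *	foo_stack_destroy(netstackid_t stackid, void *arg)
 *	{
 *		kmem_free(arg, sizeof (foo_stack_t));
 *	}
 *
 *	netstack_register(NS_FOO, foo_stack_create, NULL, foo_stack_destroy);
 */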

void
netstack_unregister(int moduleid)
{
	netstack_t *ns;

	ASSERT(moduleid >= 0 && moduleid < NS_MAX);

	ASSERT(ns_reg[moduleid].nr_create != NULL);
	ASSERT(ns_reg[moduleid].nr_flags & NRF_REGISTERED);

	mutex_enter(&netstack_g_lock);
	/*
	 * Determine the set of stacks that exist before we drop the lock.
	 * Set SHUTDOWN_NEEDED and DESTROY_NEEDED for each of those.
	 */
	for (ns = netstack_head; ns != NULL; ns = ns->netstack_next) {
		mutex_enter(&ns->netstack_lock);
		if (ns_reg[moduleid].nr_shutdown != NULL &&
		    (ns->netstack_m_state[moduleid] & NSS_CREATE_COMPLETED) &&
		    (ns->netstack_m_state[moduleid] & NSS_SHUTDOWN_ALL) == 0) {
			ns->netstack_m_state[moduleid] |= NSS_SHUTDOWN_NEEDED;
			DTRACE_PROBE2(netstack__shutdown__needed,
			    netstack_t *, ns, int, moduleid);
		}
		if ((ns_reg[moduleid].nr_flags & NRF_REGISTERED) &&
		    ns_reg[moduleid].nr_destroy != NULL &&
		    (ns->netstack_m_state[moduleid] & NSS_CREATE_COMPLETED) &&
		    (ns->netstack_m_state[moduleid] & NSS_DESTROY_ALL) == 0) {
			ns->netstack_m_state[moduleid] |= NSS_DESTROY_NEEDED;
			DTRACE_PROBE2(netstack__destroy__needed,
			    netstack_t *, ns, int, moduleid);
		}
		mutex_exit(&ns->netstack_lock);
	}
	mutex_exit(&netstack_g_lock);

	netstack_do_shutdown();
	netstack_do_destroy();

	/*
	 * Clear the netstack_m_state so that we can handle this module
	 * being loaded again.
	 */
	mutex_enter(&netstack_g_lock);
	for (ns = netstack_head; ns != NULL; ns = ns->netstack_next) {
		mutex_enter(&ns->netstack_lock);
		if (ns->netstack_m_state[moduleid] & NSS_DESTROY_COMPLETED) {
			ns->netstack_m_state[moduleid] = 0;
			DTRACE_PROBE2(netstack__destroy__done,
			    netstack_t *, ns, int, moduleid);
		}
		mutex_exit(&ns->netstack_lock);
	}

	ns_reg[moduleid].nr_create = NULL;
	ns_reg[moduleid].nr_shutdown = NULL;
	ns_reg[moduleid].nr_destroy = NULL;
	ns_reg[moduleid].nr_flags = 0;
	mutex_exit(&netstack_g_lock);
}

/*
 * Lookup and/or allocate a netstack for this zone.
 */
static void *
netstack_zone_create(zoneid_t zoneid)
{
	netstackid_t stackid;
	netstack_t *ns;
	netstack_t **nsp;
	zone_t	*zone;
	int i;

	ASSERT(netstack_initialized);

	zone = zone_find_by_id_nolock(zoneid);
	ASSERT(zone != NULL);

	if (zone->zone_flags & ZF_NET_EXCL) {
		stackid = zoneid;
	} else {
		/* Use the stack instance shared with the global zone */
		stackid = GLOBAL_NETSTACKID;
	}

	/* Allocate even if it isn't needed; simplifies locking */
	ns = (netstack_t *)kmem_zalloc(sizeof (netstack_t), KM_SLEEP);

	/* Check whether a matching stack instance already exists */
	mutex_enter(&netstack_g_lock);
	for (nsp = &netstack_head; *nsp != NULL;
	    nsp = &((*nsp)->netstack_next)) {
		if ((*nsp)->netstack_stackid == stackid) {
			/*
			 * Should never find a pre-existing exclusive stack
			 */
			ASSERT(stackid == GLOBAL_NETSTACKID);
			kmem_free(ns, sizeof (netstack_t));
			ns = *nsp;
			mutex_enter(&ns->netstack_lock);
			ns->netstack_numzones++;
			mutex_exit(&ns->netstack_lock);
			mutex_exit(&netstack_g_lock);
			DTRACE_PROBE1(netstack__inc__numzones,
			    netstack_t *, ns);
			/* Record that we have a new shared stack zone */
			netstack_shared_zone_add(zoneid);
			zone->zone_netstack = ns;
			return (ns);
		}
	}
	/* Not found */
	mutex_init(&ns->netstack_lock, NULL, MUTEX_DEFAULT, NULL);
	ns->netstack_stackid = zoneid;
	ns->netstack_numzones = 1;
	ns->netstack_refcnt = 1; /* Decremented by netstack_zone_destroy */
	ns->netstack_flags = NSF_UNINIT;
	*nsp = ns;
	zone->zone_netstack = ns;

	/*
	 * Determine the set of module create functions that need to be
	 * called before we drop the lock.
	 */
	for (i = 0; i < NS_MAX; i++) {
		mutex_enter(&ns->netstack_lock);
		if ((ns_reg[i].nr_flags & NRF_REGISTERED) &&
		    (ns->netstack_m_state[i] & NSS_CREATE_ALL) == 0) {
			ns->netstack_m_state[i] |= NSS_CREATE_NEEDED;
			DTRACE_PROBE2(netstack__create__needed,
			    netstack_t *, ns, int, i);
		}
		mutex_exit(&ns->netstack_lock);
	}
	mutex_exit(&netstack_g_lock);

	netstack_do_create();

	mutex_enter(&ns->netstack_lock);
	ns->netstack_flags &= ~NSF_UNINIT;
	mutex_exit(&ns->netstack_lock);

	return (ns);
}

/* ARGSUSED */
static void
netstack_zone_shutdown(zoneid_t zoneid, void *arg)
{
	netstack_t *ns = (netstack_t *)arg;
	int i;

	ASSERT(arg != NULL);

	mutex_enter(&ns->netstack_lock);
	ASSERT(ns->netstack_numzones > 0);
	if (ns->netstack_numzones != 1) {
		/* Stack instance being used by other zone */
		mutex_exit(&ns->netstack_lock);
		ASSERT(ns->netstack_stackid == GLOBAL_NETSTACKID);
		return;
	}
	mutex_exit(&ns->netstack_lock);

	mutex_enter(&netstack_g_lock);
	/*
	 * Determine the set of modules registered for this stack before
	 * we drop the lock. Set SHUTDOWN_NEEDED for each of those.
	 */
	for (i = 0; i < NS_MAX; i++) {
		mutex_enter(&ns->netstack_lock);
		if ((ns_reg[i].nr_flags & NRF_REGISTERED) &&
		    ns_reg[i].nr_shutdown != NULL &&
		    (ns->netstack_m_state[i] & NSS_CREATE_COMPLETED) &&
		    (ns->netstack_m_state[i] & NSS_SHUTDOWN_ALL) == 0) {
			ns->netstack_m_state[i] |= NSS_SHUTDOWN_NEEDED;
			DTRACE_PROBE2(netstack__shutdown__needed,
			    netstack_t *, ns, int, i);
		}
		mutex_exit(&ns->netstack_lock);
	}
	mutex_exit(&netstack_g_lock);

	/* Call the shutdown function for all registered modules */
	netstack_do_shutdown();
}

/*
 * Common routine to release a zone.
 * If this was the last zone using the stack instance then prepare to
 * have the refcnt drop to zero and free the stack instance.
 */
/* ARGSUSED */
static void
netstack_zone_destroy(zoneid_t zoneid, void *arg)
{
	netstack_t *ns = (netstack_t *)arg;

	ASSERT(arg != NULL);

	mutex_enter(&ns->netstack_lock);
	ASSERT(ns->netstack_numzones > 0);
	ns->netstack_numzones--;
	if (ns->netstack_numzones != 0) {
		/* Stack instance being used by other zone */
		mutex_exit(&ns->netstack_lock);
		ASSERT(ns->netstack_stackid == GLOBAL_NETSTACKID);
		/* Record that a shared stack zone has gone away */
		netstack_shared_zone_remove(zoneid);
		return;
	}
	/*
	 * Set CLOSING so that netstack_find_by_* will not find it,
	 * then drop the reference count.
	 */
	ns->netstack_flags |= NSF_CLOSING;
	mutex_exit(&ns->netstack_lock);
	DTRACE_PROBE1(netstack__dec__numzones, netstack_t *, ns);
	/* No other thread can call zone_destroy for this stack */

	/*
	 * Decrease refcnt to account for the one taken in
	 * netstack_zone_create().
	 */
	netstack_rele(ns);
}

/*
 * Called when the reference count drops to zero.
 * Call the destroy functions for each registered module.
 */
static void
netstack_stack_inactive(netstack_t *ns)
{
	int i;

	mutex_enter(&netstack_g_lock);
	/*
	 * If the shutdown callback wasn't called earlier (e.g., if this is
	 * a netstack shared between multiple zones), then we call it now.
	 */
	for (i = 0; i < NS_MAX; i++) {
		mutex_enter(&ns->netstack_lock);
		if ((ns_reg[i].nr_flags & NRF_REGISTERED) &&
		    ns_reg[i].nr_shutdown != NULL &&
		    (ns->netstack_m_state[i] & NSS_CREATE_COMPLETED) &&
		    (ns->netstack_m_state[i] & NSS_SHUTDOWN_ALL) == 0) {
			ns->netstack_m_state[i] |= NSS_SHUTDOWN_NEEDED;
			DTRACE_PROBE2(netstack__shutdown__needed,
			    netstack_t *, ns, int, i);
		}
		mutex_exit(&ns->netstack_lock);
	}
	/*
	 * Determine the set of modules registered for this stack before
	 * we drop the lock. Set DESTROY_NEEDED for each of those.
	 */
	for (i = 0; i < NS_MAX; i++) {
		mutex_enter(&ns->netstack_lock);
		if ((ns_reg[i].nr_flags & NRF_REGISTERED) &&
		    ns_reg[i].nr_destroy != NULL &&
		    (ns->netstack_m_state[i] & NSS_CREATE_COMPLETED) &&
		    (ns->netstack_m_state[i] & NSS_DESTROY_ALL) == 0) {
			ns->netstack_m_state[i] |= NSS_DESTROY_NEEDED;
			DTRACE_PROBE2(netstack__destroy__needed,
			    netstack_t *, ns, int, i);
		}
		mutex_exit(&ns->netstack_lock);
	}
	mutex_exit(&netstack_g_lock);

	netstack_do_shutdown();
	netstack_do_destroy();
}

/*
 * Call the create function for the ns and moduleid if CREATE_NEEDED
 * is set.
 * If it calls the create function, it drops the netstack_lock held by
 * the caller, and returns true to tell the caller it needs to
 * re-evaluate the state.
 */
static boolean_t
netstack_apply_create(kmutex_t *lockp, netstack_t *ns, int moduleid)
{
	void *result;
	netstackid_t stackid;

	ASSERT(MUTEX_HELD(lockp));
	mutex_enter(&ns->netstack_lock);
	if (ns->netstack_m_state[moduleid] & NSS_CREATE_NEEDED) {
		ns->netstack_m_state[moduleid] &= ~NSS_CREATE_NEEDED;
		ns->netstack_m_state[moduleid] |= NSS_CREATE_INPROGRESS;
		DTRACE_PROBE2(netstack__create__inprogress,
		    netstack_t *, ns, int, moduleid);
		mutex_exit(&ns->netstack_lock);
		mutex_exit(lockp);

		ASSERT(ns_reg[moduleid].nr_create != NULL);
		stackid = ns->netstack_stackid;
		DTRACE_PROBE2(netstack__create__start,
		    netstackid_t, stackid,
		    netstack_t *, ns);
		result = (ns_reg[moduleid].nr_create)(stackid, ns);
		DTRACE_PROBE2(netstack__create__end,
		    void *, result, netstack_t *, ns);

		ASSERT(result != NULL);
		mutex_enter(&ns->netstack_lock);
		ns->netstack_modules[moduleid] = result;
		ns->netstack_m_state[moduleid] &= ~NSS_CREATE_INPROGRESS;
		ns->netstack_m_state[moduleid] |= NSS_CREATE_COMPLETED;
		DTRACE_PROBE2(netstack__create__completed,
		    netstack_t *, ns, int, moduleid);
		mutex_exit(&ns->netstack_lock);
		return (B_TRUE);
	} else {
		mutex_exit(&ns->netstack_lock);
		return (B_FALSE);
	}
}

/*
 * Call the shutdown function for the ns and moduleid if SHUTDOWN_NEEDED
 * is set.
 * If it calls the shutdown function, it drops the netstack_lock held by
 * the caller, and returns true to tell the caller it needs to
 * re-evaluate the state.
 */
static boolean_t
netstack_apply_shutdown(kmutex_t *lockp, netstack_t *ns, int moduleid)
{
	netstackid_t stackid;
	void *netstack_module;

	ASSERT(MUTEX_HELD(lockp));
	mutex_enter(&ns->netstack_lock);
	if (ns->netstack_m_state[moduleid] & NSS_SHUTDOWN_NEEDED) {
		ns->netstack_m_state[moduleid] &= ~NSS_SHUTDOWN_NEEDED;
		ns->netstack_m_state[moduleid] |= NSS_SHUTDOWN_INPROGRESS;
		DTRACE_PROBE2(netstack__shutdown__inprogress,
		    netstack_t *, ns, int, moduleid);
		mutex_exit(&ns->netstack_lock);
		mutex_exit(lockp);

		ASSERT(ns_reg[moduleid].nr_shutdown != NULL);
		stackid = ns->netstack_stackid;
		netstack_module = ns->netstack_modules[moduleid];
		DTRACE_PROBE2(netstack__shutdown__start,
		    netstackid_t, stackid,
		    void *, netstack_module);
		(ns_reg[moduleid].nr_shutdown)(stackid, netstack_module);
		DTRACE_PROBE1(netstack__shutdown__end,
		    netstack_t *, ns);

		mutex_enter(&ns->netstack_lock);
		ns->netstack_m_state[moduleid] &= ~NSS_SHUTDOWN_INPROGRESS;
		ns->netstack_m_state[moduleid] |= NSS_SHUTDOWN_COMPLETED;
		DTRACE_PROBE2(netstack__shutdown__completed,
		    netstack_t *, ns, int, moduleid);
		mutex_exit(&ns->netstack_lock);
		return (B_TRUE);
	} else {
		mutex_exit(&ns->netstack_lock);
		return (B_FALSE);
	}
}

/*
 * Call the destroy function for the ns and moduleid if DESTROY_NEEDED
 * is set.
 * If it calls the destroy function, it drops the netstack_lock held by
 * the caller, and returns true to tell the caller it needs to
 * re-evaluate the state.
 */
static boolean_t
netstack_apply_destroy(kmutex_t *lockp, netstack_t *ns, int moduleid)
{
	netstackid_t stackid;
	void *netstack_module;

	ASSERT(MUTEX_HELD(lockp));
	mutex_enter(&ns->netstack_lock);
	if (ns->netstack_m_state[moduleid] & NSS_DESTROY_NEEDED) {
		ns->netstack_m_state[moduleid] &= ~NSS_DESTROY_NEEDED;
		ns->netstack_m_state[moduleid] |= NSS_DESTROY_INPROGRESS;
		DTRACE_PROBE2(netstack__destroy__inprogress,
		    netstack_t *, ns, int, moduleid);
		mutex_exit(&ns->netstack_lock);
		mutex_exit(lockp);

		/* XXX race against unregister? */
		ASSERT(ns_reg[moduleid].nr_destroy != NULL);
		stackid = ns->netstack_stackid;
		netstack_module = ns->netstack_modules[moduleid];
		DTRACE_PROBE2(netstack__destroy__start,
		    netstackid_t, stackid,
		    void *, netstack_module);
		(ns_reg[moduleid].nr_destroy)(stackid, netstack_module);
		DTRACE_PROBE1(netstack__destroy__end,
		    netstack_t *, ns);

		mutex_enter(&ns->netstack_lock);
		ns->netstack_modules[moduleid] = NULL;
		ns->netstack_m_state[moduleid] &= ~NSS_DESTROY_INPROGRESS;
		ns->netstack_m_state[moduleid] |= NSS_DESTROY_COMPLETED;
		DTRACE_PROBE2(netstack__destroy__completed,
		    netstack_t *, ns, int, moduleid);
		mutex_exit(&ns->netstack_lock);
		return (B_TRUE);
	} else {
		mutex_exit(&ns->netstack_lock);
		return (B_FALSE);
	}
}

static void
apply_loop(netstack_t **headp, kmutex_t *lockp,
    boolean_t (*applyfn)(kmutex_t *, netstack_t *, int moduleid))
{
	netstack_t *ns;
	int i;
	boolean_t lock_dropped, result;

	lock_dropped = B_FALSE;
	ns = *headp;
	while (ns != NULL) {
		for (i = 0; i < NS_MAX; i++) {
			result = (applyfn)(lockp, ns, i);
			if (result) {
#ifdef NS_DEBUG
				(void) printf("netstack_do_apply: "
				    "LD for %p/%d, %d\n",
				    (void *)ns, ns->netstack_stackid, i);
#endif
				lock_dropped = B_TRUE;
				mutex_enter(lockp);
			}
		}
		/*
		 * If at least one applyfn call caused lockp to be dropped,
		 * then we don't follow netstack_next after reacquiring the
		 * lock, even if it is possible to do so without any hazards.
		 * This is because we want the design to allow for the list of
		 * netstacks threaded by netstack_next to change in any
		 * arbitrary way during the time the 'lockp' was dropped.
		 *
		 * It is safe to restart the loop at *headp since
		 * the applyfn changes netstack_m_state as it processes
		 * things, so a subsequent pass through will have no
		 * effect in applyfn, hence the loop will terminate
		 * in at worst O(N^2).
		 */
		if (lock_dropped) {
#ifdef NS_DEBUG
			(void) printf("netstack_do_apply: "
			    "Lock Dropped for %p/%d, %d\n",
			    (void *)ns, ns->netstack_stackid, i);
#endif
			lock_dropped = B_FALSE;
			ns = *headp;
		} else {
			ns = ns->netstack_next;
		}
	}
}

/* Like above, but in the reverse order of moduleids */
static void
apply_loop_reverse(netstack_t **headp, kmutex_t *lockp,
    boolean_t (*applyfn)(kmutex_t *, netstack_t *, int moduleid))
{
	netstack_t *ns;
	int i;
	boolean_t lock_dropped, result;

	lock_dropped = B_FALSE;
	ns = *headp;
	while (ns != NULL) {
		for (i = NS_MAX - 1; i >= 0; i--) {
			result = (applyfn)(lockp, ns, i);
			if (result) {
#ifdef NS_DEBUG
				(void) printf("netstack_do_apply: "
				    "LD for %p/%d, %d\n",
				    (void *)ns, ns->netstack_stackid, i);
#endif
				lock_dropped = B_TRUE;
				mutex_enter(lockp);
			}
		}
		/*
		 * If at least one applyfn call caused lockp to be dropped,
		 * then we don't follow netstack_next after reacquiring the
		 * lock, even if it is possible to do so without any hazards.
		 * This is because we want the design to allow for the list of
		 * netstacks threaded by netstack_next to change in any
		 * arbitrary way during the time the 'lockp' was dropped.
		 *
		 * It is safe to restart the loop at *headp since
		 * the applyfn changes netstack_m_state as it processes
		 * things, so a subsequent pass through will have no
		 * effect in applyfn, hence the loop will terminate
		 * in at worst O(N^2).
		 */
		if (lock_dropped) {
#ifdef NS_DEBUG
			(void) printf("netstack_do_apply: "
			    "Lock Dropped for %p/%d, %d\n",
			    (void *)ns, ns->netstack_stackid, i);
#endif
			lock_dropped = B_FALSE;
			ns = *headp;
		} else {
			ns = ns->netstack_next;
		}
	}
}

/*
 * Apply a function to all module/netstack combinations.
 * The applyfn returns true if it dropped the lock.
 */
static void
netstack_do_apply(int reverse,
    boolean_t (*applyfn)(kmutex_t *, netstack_t *, int moduleid))
{
	mutex_enter(&netstack_g_lock);
	if (reverse)
		apply_loop_reverse(&netstack_head, &netstack_g_lock, applyfn);
	else
		apply_loop(&netstack_head, &netstack_g_lock, applyfn);
	mutex_exit(&netstack_g_lock);
}

/*
 * Run the create function for all modules x stack combinations
 * that have NSS_CREATE_NEEDED set.
 *
 * Call the create function for each stack that has CREATE_NEEDED.
 * Set CREATE_INPROGRESS, drop the lock, and when done set
 * CREATE_COMPLETED.
 */
static void
netstack_do_create(void)
{
	netstack_do_apply(B_FALSE, netstack_apply_create);
}

/*
 * Run the shutdown function for all modules x stack combinations
 * that have NSS_SHUTDOWN_NEEDED set.
 *
 * Call the shutdown function for each stack that has SHUTDOWN_NEEDED.
 * Set SHUTDOWN_INPROGRESS, drop the lock, and when done set
 * SHUTDOWN_COMPLETED.
 */
static void
netstack_do_shutdown(void)
{
	netstack_do_apply(B_FALSE, netstack_apply_shutdown);
}

/*
 * Run the destroy function for all modules x stack combinations
 * that have NSS_DESTROY_NEEDED set.
 *
 * Call the destroy function for each stack that has DESTROY_NEEDED.
 * Set DESTROY_INPROGRESS, drop the lock, and when done set
 * DESTROY_COMPLETED.
 *
 * Since a netstack_t is never reused (when a zone is rebooted it gets
 * a new zoneid == netstackid i.e. a new netstack_t is allocated) we leave
 * netstack_m_state the way it is i.e. with NSS_DESTROY_COMPLETED set.
 */
static void
netstack_do_destroy(void)
{
	/*
	 * Have to walk the moduleids in reverse order since some
	 * modules make implicit assumptions about the order
	 */
	netstack_do_apply(B_TRUE, netstack_apply_destroy);
}

/*
 * Get the stack instance used in the caller's zone.
 * Increases the reference count; the caller must do a netstack_rele().
 * It can't be called after zone_destroy() has started.
 */
static netstack_t *
netstack_get_current(void)
{
	netstack_t *ns;

	ns = curproc->p_zone->zone_netstack;
	ASSERT(ns != NULL);
	if (ns->netstack_flags & (NSF_UNINIT|NSF_CLOSING))
		return (NULL);

	netstack_hold(ns);

	return (ns);
}

/*
 * Find a stack instance given the cred.
 * This is used by the modules to potentially allow for a future when
 * something other than the zoneid is used to determine the stack.
 */
netstack_t *
netstack_find_by_cred(const cred_t *cr)
{
	zoneid_t zoneid = crgetzoneid(cr);

	/* Handle the case when cr_zone is NULL */
	if (zoneid == (zoneid_t)-1)
		zoneid = GLOBAL_ZONEID;

	/* For performance ... */
	if (curproc->p_zone->zone_id == zoneid)
		return (netstack_get_current());
	else
		return (netstack_find_by_zoneid(zoneid));
}
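
/*
 * Example (illustrative sketch, not code from this file): how a module
 * might map a cred to its per-stack state. NS_FOO and foo_stack_t are
 * hypothetical names.
 *
 *	foo_stack_t *fs;
 *	netstack_t *ns;
 *
 *	ns = netstack_find_by_cred(cr);
 *	if (ns == NULL)
 *		return (ENXIO);
 *	fs = ns->netstack_modules[NS_FOO];
 *	... use fs ...
 *	netstack_rele(ns);
 */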

/*
 * Find a stack instance given the zoneid.
 * Increases the reference count if found; caller must do a
 * netstack_rele().
 *
 * If there is no exact match then assume the shared stack instance
 * matches.
 *
 * Skip the uninitialized ones.
 */
netstack_t *
netstack_find_by_zoneid(zoneid_t zoneid)
{
	netstack_t *ns;
	zone_t *zone;

	zone = zone_find_by_id(zoneid);

	if (zone == NULL)
		return (NULL);

	ns = zone->zone_netstack;
	ASSERT(ns != NULL);
	if (ns->netstack_flags & (NSF_UNINIT|NSF_CLOSING))
		ns = NULL;
	else
		netstack_hold(ns);

	zone_rele(zone);
	return (ns);
}

/*
 * Find a stack instance given the zoneid.
 * Increases the reference count if found; caller must do a
 * netstack_rele().
 *
 * If there is no exact match then assume the shared stack instance
 * matches.
 *
 * Skip the uninitialized ones.
 *
 * NOTE: The caller must hold zonehash_lock.
 */
netstack_t *
netstack_find_by_zoneid_nolock(zoneid_t zoneid)
{
	netstack_t *ns;
	zone_t *zone;

	zone = zone_find_by_id_nolock(zoneid);

	if (zone == NULL)
		return (NULL);

	ns = zone->zone_netstack;
	ASSERT(ns != NULL);

	if (ns->netstack_flags & (NSF_UNINIT|NSF_CLOSING))
		ns = NULL;
	else
		netstack_hold(ns);

	zone_rele(zone);
	return (ns);
}

/*
 * Find a stack instance given the stackid; only an exact match is
 * returned.
 * Increases the reference count if found; caller must do a
 * netstack_rele().
 *
 * Skip the uninitialized ones.
 */
netstack_t *
netstack_find_by_stackid(netstackid_t stackid)
{
	netstack_t *ns;

	mutex_enter(&netstack_g_lock);
	for (ns = netstack_head; ns != NULL; ns = ns->netstack_next) {
		mutex_enter(&ns->netstack_lock);
		if (ns->netstack_stackid == stackid &&
		    !(ns->netstack_flags & (NSF_UNINIT|NSF_CLOSING))) {
			mutex_exit(&ns->netstack_lock);
			netstack_hold(ns);
			mutex_exit(&netstack_g_lock);
			return (ns);
		}
		mutex_exit(&ns->netstack_lock);
	}
	mutex_exit(&netstack_g_lock);
	return (NULL);
}

void
netstack_rele(netstack_t *ns)
{
	netstack_t **nsp;
	boolean_t found;
	int refcnt, numzones;

	mutex_enter(&ns->netstack_lock);
	ASSERT(ns->netstack_refcnt > 0);
	ns->netstack_refcnt--;
	/*
	 * As we drop the lock additional netstack_rele()s can come in
	 * and decrement the refcnt to zero and free the netstack_t.
	 * Store the counts in local variables and if we were not the last
	 * then don't reference the netstack_t after that.
	 */
	refcnt = ns->netstack_refcnt;
	numzones = ns->netstack_numzones;
	DTRACE_PROBE1(netstack__dec__ref, netstack_t *, ns);
	mutex_exit(&ns->netstack_lock);

	if (refcnt == 0 && numzones == 0) {
		/*
		 * Time to call the destroy functions and free up
		 * the structure
		 */
		netstack_stack_inactive(ns);

		/* Finally remove from list of netstacks */
		mutex_enter(&netstack_g_lock);
		found = B_FALSE;
		for (nsp = &netstack_head; *nsp != NULL;
		    nsp = &(*nsp)->netstack_next) {
			if (*nsp == ns) {
				*nsp = ns->netstack_next;
				ns->netstack_next = NULL;
				found = B_TRUE;
				break;
			}
		}
		ASSERT(found);
		mutex_exit(&netstack_g_lock);

		ASSERT(ns->netstack_flags & NSF_CLOSING);
		kmem_free(ns, sizeof (*ns));
	}
}

void
netstack_hold(netstack_t *ns)
{
	mutex_enter(&ns->netstack_lock);
	ns->netstack_refcnt++;
	ASSERT(ns->netstack_refcnt > 0);
	mutex_exit(&ns->netstack_lock);
	DTRACE_PROBE1(netstack__inc__ref, netstack_t *, ns);
}

/*
 * To support kstat_create_netstack() using kstat_zone_add we need
 * to track both
 *  - all zoneids that use the global/shared stack
 *  - all kstats that have been added for the shared stack
 */
kstat_t *
kstat_create_netstack(char *ks_module, int ks_instance, char *ks_name,
    char *ks_class, uchar_t ks_type, uint_t ks_ndata, uchar_t ks_flags,
    netstackid_t ks_netstackid)
{
	kstat_t *ks;

	if (ks_netstackid == GLOBAL_NETSTACKID) {
		ks = kstat_create_zone(ks_module, ks_instance, ks_name,
		    ks_class, ks_type, ks_ndata, ks_flags, GLOBAL_ZONEID);
		if (ks != NULL)
			netstack_shared_kstat_add(ks);
		return (ks);
	} else {
		zoneid_t zoneid = ks_netstackid;

		return (kstat_create_zone(ks_module, ks_instance, ks_name,
		    ks_class, ks_type, ks_ndata, ks_flags, zoneid));
	}
}
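
/*
 * Example (illustrative sketch, not code from this file): a module
 * creating a named kstat for its stack instance, e.g. from its create
 * callback. The "foo" names, fs_stats, and the KSTAT_FLAG_VIRTUAL usage
 * are assumptions for illustration.
 *
 *	kstat_t *ksp;
 *
 *	ksp = kstat_create_netstack("foo", 0, "foostat", "net",
 *	    KSTAT_TYPE_NAMED,
 *	    sizeof (foo_stat_t) / sizeof (kstat_named_t),
 *	    KSTAT_FLAG_VIRTUAL, stackid);
 *	if (ksp != NULL) {
 *		ksp->ks_data = &fs->fs_stats;
 *		kstat_install(ksp);
 *	}
 *
 *	... and from the destroy callback:
 *	kstat_delete_netstack(ksp, stackid);
 */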

void
kstat_delete_netstack(kstat_t *ks, netstackid_t ks_netstackid)
{
	if (ks_netstackid == GLOBAL_NETSTACKID) {
		netstack_shared_kstat_remove(ks);
	}
	kstat_delete(ks);
}

static void
netstack_shared_zone_add(zoneid_t zoneid)
{
	struct shared_zone_list *sz;
	struct shared_kstat_list *sk;

	sz = (struct shared_zone_list *)kmem_zalloc(sizeof (*sz), KM_SLEEP);
	sz->sz_zoneid = zoneid;

	/* Insert in list */
	mutex_enter(&netstack_shared_lock);
	sz->sz_next = netstack_shared_zones;
	netstack_shared_zones = sz;

	/*
	 * Perform kstat_zone_add for each existing shared stack kstat.
	 * Note: Holds netstack_shared_lock across kstat_zone_add.
	 */
	for (sk = netstack_shared_kstats; sk != NULL; sk = sk->sk_next) {
		kstat_zone_add(sk->sk_kstat, zoneid);
	}
	mutex_exit(&netstack_shared_lock);
}

static void
netstack_shared_zone_remove(zoneid_t zoneid)
{
	struct shared_zone_list **szp, *sz;
	struct shared_kstat_list *sk;

	/* Find in list */
	mutex_enter(&netstack_shared_lock);
	sz = NULL;
	for (szp = &netstack_shared_zones; *szp != NULL;
	    szp = &((*szp)->sz_next)) {
		if ((*szp)->sz_zoneid == zoneid) {
			sz = *szp;
			break;
		}
	}
	/* We must find it */
	ASSERT(sz != NULL);
	*szp = sz->sz_next;
	sz->sz_next = NULL;

	/*
	 * Perform kstat_zone_remove for each existing shared stack kstat.
	 * Note: Holds netstack_shared_lock across kstat_zone_remove.
	 */
	for (sk = netstack_shared_kstats; sk != NULL; sk = sk->sk_next) {
		kstat_zone_remove(sk->sk_kstat, zoneid);
	}
	mutex_exit(&netstack_shared_lock);

	kmem_free(sz, sizeof (*sz));
}

static void
netstack_shared_kstat_add(kstat_t *ks)
{
	struct shared_zone_list *sz;
	struct shared_kstat_list *sk;

	sk = (struct shared_kstat_list *)kmem_zalloc(sizeof (*sk), KM_SLEEP);
	sk->sk_kstat = ks;

	/* Insert in list */
	mutex_enter(&netstack_shared_lock);
	sk->sk_next = netstack_shared_kstats;
	netstack_shared_kstats = sk;

	/*
	 * Perform kstat_zone_add for each existing shared stack zone.
	 * Note: Holds netstack_shared_lock across kstat_zone_add.
	 */
	for (sz = netstack_shared_zones; sz != NULL; sz = sz->sz_next) {
		kstat_zone_add(ks, sz->sz_zoneid);
	}
	mutex_exit(&netstack_shared_lock);
}

static void
netstack_shared_kstat_remove(kstat_t *ks)
{
	struct shared_zone_list *sz;
	struct shared_kstat_list **skp, *sk;

	/* Find in list */
	mutex_enter(&netstack_shared_lock);
	sk = NULL;
	for (skp = &netstack_shared_kstats; *skp != NULL;
	    skp = &((*skp)->sk_next)) {
		if ((*skp)->sk_kstat == ks) {
			sk = *skp;
			break;
		}
	}
	/* Must find it */
	ASSERT(sk != NULL);
	*skp = sk->sk_next;
	sk->sk_next = NULL;

	/*
	 * Perform kstat_zone_remove for each existing shared stack zone.
	 * Note: Holds netstack_shared_lock across kstat_zone_remove.
	 */
	for (sz = netstack_shared_zones; sz != NULL; sz = sz->sz_next) {
		kstat_zone_remove(ks, sz->sz_zoneid);
	}
	mutex_exit(&netstack_shared_lock);
	kmem_free(sk, sizeof (*sk));
}

/*
 * Return true if the zoneid is one that uses the global/shared stack.
 */
static boolean_t
netstack_find_shared_zoneid(zoneid_t zoneid)
{
	struct shared_zone_list *sz;

	mutex_enter(&netstack_shared_lock);
	for (sz = netstack_shared_zones; sz != NULL; sz = sz->sz_next) {
		if (sz->sz_zoneid == zoneid) {
			mutex_exit(&netstack_shared_lock);
			return (B_TRUE);
		}
	}
	mutex_exit(&netstack_shared_lock);
	return (B_FALSE);
}

/*
 * Hide the fact that zoneids and netstackids are allocated from
 * the same space in the current implementation.
 * XXX could add checks that the stackid/zoneids are valid...
 */
zoneid_t
netstackid_to_zoneid(netstackid_t stackid)
{
	return (stackid);
}

netstackid_t
zoneid_to_netstackid(zoneid_t zoneid)
{
	if (netstack_find_shared_zoneid(zoneid))
		return (GLOBAL_ZONEID);
	else
		return (zoneid);
}

/*
 * Simplistic support for walking all the handles.
 * Example usage:
 *	netstack_handle_t nh;
 *	netstack_t *ns;
 *
 *	netstack_next_init(&nh);
 *	while ((ns = netstack_next(&nh)) != NULL) {
 *		do something;
 *		netstack_rele(ns);
 *	}
 *	netstack_next_fini(&nh);
 */
void
netstack_next_init(netstack_handle_t *handle)
{
	*handle = 0;
}

/* ARGSUSED */
void
netstack_next_fini(netstack_handle_t *handle)
{
}

netstack_t *
netstack_next(netstack_handle_t *handle)
{
	netstack_t *ns;
	int i, end;

	end = *handle;
	/* Walk the list, skipping the first *handle instances */
	mutex_enter(&netstack_g_lock);
	ns = netstack_head;
	for (i = 0; i < end; i++) {
		if (ns == NULL)
			break;
		ns = ns->netstack_next;
	}
	/* Skip those that aren't really here */
	while (ns != NULL) {
		mutex_enter(&ns->netstack_lock);
		if ((ns->netstack_flags & (NSF_UNINIT|NSF_CLOSING)) == 0) {
			mutex_exit(&ns->netstack_lock);
			break;
		}
		mutex_exit(&ns->netstack_lock);
		end++;
		ns = ns->netstack_next;
	}
	if (ns != NULL) {
		*handle = end + 1;
		netstack_hold(ns);
	}
	mutex_exit(&netstack_g_lock);
	return (ns);
}