xref: /titanic_51/usr/src/uts/common/inet/ipnet/ipnet.c (revision 808f26a819b6259a3340d8d53074a2f1635315cb)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * The ipnet device defined here provides access to packets at the IP layer. To
29  * provide access to packets at this layer it registers a callback function in
30  * the ip module and when there are open instances of the device ip will pass
31  * packets into the device. Packets from ip are passed on the input, output and
32  * loopback paths. Internally the module returns to ip as soon as possible by
33  * deferring processing using a taskq.
34  *
35  * Management of the devices in /dev/ipnet/ is handled by the devname
36  * filesystem and use of the neti interfaces.  This module registers for NIC
37  * events using the neti framework so that when IP interfaces are brought up,
38  * taken down etc. the ipnet module is notified and its view of the interfaces
39  * configured on the system adjusted.  On attach, the module gets an initial
40  * view of the system again using the neti framework but as it has already
41  * registered for IP interface events, it is still up-to-date with any changes.
42  */
43 
44 #include <sys/types.h>
45 #include <sys/conf.h>
46 #include <sys/cred.h>
47 #include <sys/stat.h>
48 #include <sys/ddi.h>
49 #include <sys/sunddi.h>
50 #include <sys/modctl.h>
51 #include <sys/dlpi.h>
52 #include <sys/strsun.h>
53 #include <sys/id_space.h>
54 #include <sys/kmem.h>
55 #include <sys/mkdev.h>
56 #include <sys/neti.h>
57 #include <net/if.h>
58 #include <sys/errno.h>
59 #include <sys/list.h>
60 #include <sys/ksynch.h>
61 #include <sys/hook_event.h>
62 #include <sys/stropts.h>
63 #include <sys/sysmacros.h>
64 #include <inet/ip.h>
65 #include <inet/ip_multi.h>
66 #include <inet/ip6.h>
67 #include <inet/ipnet.h>
68 
/*
 * STREAMS module information for the ipnet driver.  Both the read-side
 * (ipnet_rinit) and write-side (ipnet_winit) qinit structures point at it.
 */
static struct module_info ipnet_minfo = {
	1,		/* mi_idnum */
	"ipnet",	/* mi_idname */
	0,		/* mi_minpsz */
	INFPSZ,		/* mi_maxpsz */
	2048,		/* mi_hiwat */
	0		/* mi_lowat */
};
77 
78 /*
79  * List to hold static view of ipnetif_t's on the system. This is needed to
80  * avoid holding the lock protecting the avl tree of ipnetif's over the
81  * callback into the dev filesystem.
82  */
typedef struct ipnetif_cbdata {
	char		ic_ifname[LIFNAMSIZ];	/* name buffer, LIFNAMSIZ bytes */
	dev_t		ic_dev;			/* device number for the entry */
	list_node_t	ic_next;		/* linkage for the static list */
} ipnetif_cbdata_t;
88 
89 /*
90  * Convenience enumerated type for ipnet_accept().  It describes the
91  * properties of a given ipnet_addrp_t relative to a single ipnet_t
92  * client stream.  The values represent whether the address is ...
93  */
/*
 * Produced by ipnet_get_addrtype() and consumed by ipnet_accept().  Note
 * that IPNETADDR_MBCAST is also returned for an IPv4 address that matches
 * the broadcast address recorded for one of the interface's addresses.
 */
typedef enum {
	IPNETADDR_MYADDR,	/* an address on my ipnetif_t. */
	IPNETADDR_MBCAST,	/* a multicast or broadcast address. */
	IPNETADDR_UNKNOWN	/* none of the above. */
} ipnet_addrtype_t;
99 
/*
 * Argument used for the ipnet_nicevent_taskq callback.  It carries the
 * details of one NIC event (event type, protocol handle, stack id,
 * interface and logical interface indices, and an interface name buffer).
 * NOTE(review): presumably filled in by ipnet_nicevent_cb() and consumed by
 * ipnet_nicevent_task(); neither is visible here -- confirm.
 */
typedef struct ipnet_nicevent_s {
	nic_event_t		ipne_event;
	net_handle_t		ipne_protocol;
	netstackid_t		ipne_stackid;
	uint64_t		ipne_ifindex;
	uint64_t		ipne_lifindex;
	char			ipne_ifname[LIFNAMSIZ];
} ipnet_nicevent_t;
109 
/* Module-wide state. */
/* devinfo node: set by ipnet_attach(), cleared by ipnet_detach() */
static dev_info_t	*ipnet_dip;
/* major number looked up by name in _init() */
static major_t		ipnet_major;
static ddi_taskq_t	*ipnet_taskq;		/* taskq for packets */
static ddi_taskq_t	*ipnet_nicevent_taskq;	/* taskq for NIC events */
/* id space from which ipnet_open() allocates a minor for each open stream */
static id_space_t	*ipnet_minor_space;
static const int	IPNET_MINOR_LO = 1; 	/* minor number for /dev/lo0 */
static const int 	IPNET_MINOR_MIN = 2; 	/* start of dynamic minors */
/* template copied into every DL_INFO_ACK reply by ipnet_inforeq() */
static dl_info_ack_t	ipnet_infoack = IPNET_INFO_ACK_INIT;
/* packet filters: ipnet_loaccept for lo0 streams, ipnet_accept otherwise */
static ipnet_acceptfn_t	ipnet_accept, ipnet_loaccept;
119 
/*
 * Forward declarations of the file-local functions: STREAMS and DDI entry
 * points, DLPI request handlers, NIC event handling, ipnetif_t management
 * and per-stack init/fini.
 */
static void	ipnet_input(mblk_t *);
static int	ipnet_wput(queue_t *, mblk_t *);
static int	ipnet_rsrv(queue_t *);
static int	ipnet_open(queue_t *, dev_t *, int, int, cred_t *);
static int	ipnet_close(queue_t *);
static void	ipnet_ioctl(queue_t *, mblk_t *);
static void	ipnet_iocdata(queue_t *, mblk_t *);
static void 	ipnet_wputnondata(queue_t *, mblk_t *);
static int	ipnet_attach(dev_info_t *, ddi_attach_cmd_t);
static int	ipnet_detach(dev_info_t *, ddi_detach_cmd_t);
static int	ipnet_devinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
static void	ipnet_inforeq(queue_t *q, mblk_t *mp);
static void	ipnet_bindreq(queue_t *q, mblk_t *mp);
static void	ipnet_unbindreq(queue_t *q, mblk_t *mp);
static void	ipnet_dlpromisconreq(queue_t *q, mblk_t *mp);
static void	ipnet_dlpromiscoffreq(queue_t *q, mblk_t *mp);
static int	ipnet_join_allmulti(ipnetif_t *, ipnet_stack_t *);
static void	ipnet_leave_allmulti(ipnetif_t *, ipnet_stack_t *);
static int	ipnet_nicevent_cb(hook_event_token_t, hook_data_t, void *);
static void	ipnet_nicevent_task(void *);
static ipnetif_t *ipnet_create_if(const char *, uint64_t, ipnet_stack_t *);
static void	ipnet_remove_if(ipnetif_t *, ipnet_stack_t *);
static ipnetif_addr_t *ipnet_match_lif(ipnetif_t *, lif_if_t, boolean_t);
static ipnetif_t *ipnet_if_getby_index(uint64_t, ipnet_stack_t *);
static ipnetif_t *ipnet_if_getby_dev(dev_t, ipnet_stack_t *);
static boolean_t ipnet_if_in_zone(ipnetif_t *, zoneid_t, ipnet_stack_t *);
static void	ipnet_if_zonecheck(ipnetif_t *, ipnet_stack_t *);
static int	ipnet_populate_if(net_handle_t, ipnet_stack_t *, boolean_t);
static int 	ipnet_if_compare_name(const void *, const void *);
static int 	ipnet_if_compare_index(const void *, const void *);
static void	ipnet_add_ifaddr(uint64_t, ipnetif_t *, net_handle_t);
static void	ipnet_delete_ifaddr(ipnetif_addr_t *, ipnetif_t *, boolean_t);
static void	ipnetif_refhold(ipnetif_t *);
static void	ipnetif_refrele(ipnetif_t *);
static void	ipnet_walkers_inc(ipnet_stack_t *);
static void	ipnet_walkers_dec(ipnet_stack_t *);
static void	ipnet_register_netihook(ipnet_stack_t *);
static void	*ipnet_stack_init(netstackid_t, netstack_t *);
static void	ipnet_stack_fini(netstackid_t, void *);
159 
/*
 * Read-side queue initialization.  There is no read put procedure; the
 * read queue is drained by the ipnet_rsrv() service procedure.  The
 * open and close entry points for the stream are also supplied here.
 */
static struct qinit ipnet_rinit = {
	NULL,		/* qi_putp */
	ipnet_rsrv,	/* qi_srvp */
	ipnet_open,	/* qi_qopen */
	ipnet_close,	/* qi_qclose */
	NULL,		/* qi_qadmin */
	&ipnet_minfo,	/* qi_minfo */
};
168 
/*
 * Write-side queue initialization.  Every downstream message is handled
 * directly by ipnet_wput(); there is no write service procedure.
 */
static struct qinit ipnet_winit = {
	ipnet_wput,	/* qi_putp */
	NULL,		/* qi_srvp */
	NULL,		/* qi_qopen */
	NULL,		/* qi_qclose */
	NULL,		/* qi_qadmin */
	&ipnet_minfo,	/* qi_minfo */
};
177 
/* Stream table tying together the read- and write-side qinit structures. */
static struct streamtab ipnet_info = {
	&ipnet_rinit, &ipnet_winit
};
181 
/*
 * Device operations for the driver: ipnet_attach/ipnet_detach/ipnet_devinfo
 * are the autoconfiguration entry points, ipnet_info is the stream table,
 * and quiesce is declared as not supported.
 */
DDI_DEFINE_STREAM_OPS(ipnet_ops, nulldev, nulldev, ipnet_attach,
    ipnet_detach, nodev, ipnet_devinfo, D_MP | D_MTPERMOD, &ipnet_info,
    ddi_quiesce_not_supported);
185 
/* Loadable-driver linkage: driver module ops, description, and dev_ops. */
static struct modldrv modldrv = {
	&mod_driverops,
	"STREAMS ipnet driver",
	&ipnet_ops
};
191 
/* Module linkage handed to mod_install(), mod_remove() and mod_info(). */
static struct modlinkage modlinkage = {
	MODREV_1, &modldrv, NULL
};
195 
196 /*
197  * Walk the list of physical interfaces on the machine, for each
198  * interface create a new ipnetif_t and add any addresses to it. We
199  * need to do the walk twice, once for IPv4 and once for IPv6.
200  *
201  * The interfaces are destroyed as part of ipnet_stack_fini() for each
202  * stack.  Note that we cannot do this initialization in
203  * ipnet_stack_init(), since ipnet_stack_init() cannot fail.
204  */
205 static int
206 ipnet_if_init(void)
207 {
208 	netstack_handle_t	nh;
209 	netstack_t		*ns;
210 	ipnet_stack_t		*ips;
211 	int			ret = 0;
212 
213 	netstack_next_init(&nh);
214 	while ((ns = netstack_next(&nh)) != NULL) {
215 		ips = ns->netstack_ipnet;
216 		if ((ret = ipnet_populate_if(ips->ips_ndv4, ips, B_FALSE)) == 0)
217 			ret = ipnet_populate_if(ips->ips_ndv6, ips, B_TRUE);
218 		netstack_rele(ns);
219 		if (ret != 0)
220 			break;
221 	}
222 	netstack_next_fini(&nh);
223 	return (ret);
224 }
225 
226 /*
227  * Standard module entry points.
228  */
229 int
230 _init(void)
231 {
232 	int	ret;
233 
234 	if ((ipnet_major = ddi_name_to_major("ipnet")) == (major_t)-1)
235 		return (ENODEV);
236 	ipnet_minor_space = id_space_create("ipnet_minor_space",
237 	    IPNET_MINOR_MIN, MAXMIN32);
238 	netstack_register(NS_IPNET, ipnet_stack_init, NULL, ipnet_stack_fini);
239 	/*
240 	 * We call ddi_taskq_create() with nthread == 1 to ensure in-order
241 	 * delivery of packets to clients.
242 	 */
243 	ipnet_taskq = ddi_taskq_create(NULL, "ipnet", 1, TASKQ_DEFAULTPRI, 0);
244 	ipnet_nicevent_taskq = ddi_taskq_create(NULL, "ipnet_nic_event_queue",
245 	    1, TASKQ_DEFAULTPRI, 0);
246 	if (ipnet_taskq == NULL || ipnet_nicevent_taskq == NULL) {
247 		ret = ENOMEM;
248 		goto done;
249 	}
250 	if ((ret = ipnet_if_init()) == 0)
251 		ret = mod_install(&modlinkage);
252 done:
253 	if (ret != 0) {
254 		if (ipnet_taskq != NULL)
255 			ddi_taskq_destroy(ipnet_taskq);
256 		if (ipnet_nicevent_taskq != NULL)
257 			ddi_taskq_destroy(ipnet_nicevent_taskq);
258 		netstack_unregister(NS_IPNET);
259 		id_space_destroy(ipnet_minor_space);
260 	}
261 	return (ret);
262 }
263 
264 int
265 _fini(void)
266 {
267 	int err;
268 
269 	if ((err = mod_remove(&modlinkage)) != 0)
270 		return (err);
271 	ddi_taskq_destroy(ipnet_nicevent_taskq);
272 	ddi_taskq_destroy(ipnet_taskq);
273 	netstack_unregister(NS_IPNET);
274 	id_space_destroy(ipnet_minor_space);
275 	return (0);
276 }
277 
278 int
279 _info(struct modinfo *modinfop)
280 {
281 	return (mod_info(&modlinkage, modinfop));
282 }
283 
284 static void
285 ipnet_register_netihook(ipnet_stack_t *ips)
286 {
287 	int		ret;
288 	zoneid_t	zoneid;
289 	netid_t		netid;
290 
291 	HOOK_INIT(ips->ips_nicevents, ipnet_nicevent_cb, "ipnet_nicevents",
292 	    ips);
293 
294 	/*
295 	 * It is possible for an exclusive stack to be in the process of
296 	 * shutting down here, and the netid and protocol lookups could fail
297 	 * in that case.
298 	 */
299 	zoneid = netstackid_to_zoneid(ips->ips_netstack->netstack_stackid);
300 	if ((netid = net_zoneidtonetid(zoneid)) == -1)
301 		return;
302 
303 	if ((ips->ips_ndv4 = net_protocol_lookup(netid, NHF_INET)) != NULL) {
304 		if ((ret = net_hook_register(ips->ips_ndv4, NH_NIC_EVENTS,
305 		    ips->ips_nicevents)) != 0) {
306 			VERIFY(net_protocol_release(ips->ips_ndv4) == 0);
307 			ips->ips_ndv4 = NULL;
308 			cmn_err(CE_WARN, "unable to register IPv4 netinfo hooks"
309 			    " in zone %d: %d", zoneid, ret);
310 		}
311 	}
312 	if ((ips->ips_ndv6 = net_protocol_lookup(netid, NHF_INET6)) != NULL) {
313 		if ((ret = net_hook_register(ips->ips_ndv6, NH_NIC_EVENTS,
314 		    ips->ips_nicevents)) != 0) {
315 			VERIFY(net_protocol_release(ips->ips_ndv6) == 0);
316 			ips->ips_ndv6 = NULL;
317 			cmn_err(CE_WARN, "unable to register IPv6 netinfo hooks"
318 			    " in zone %d: %d", zoneid, ret);
319 		}
320 	}
321 }
322 
323 /*
324  * This function is called on attach to build an initial view of the
325  * interfaces on the system. It will be called once for IPv4 and once
326  * for IPv6, although there is only one ipnet interface for both IPv4
327  * and IPv6 there are separate address lists.
328  */
329 static int
330 ipnet_populate_if(net_handle_t nd, ipnet_stack_t *ips, boolean_t isv6)
331 {
332 	phy_if_t		phyif;
333 	lif_if_t		lif;
334 	ipnetif_t		*ipnetif;
335 	char			name[LIFNAMSIZ];
336 	boolean_t		new_if = B_FALSE;
337 	uint64_t		ifflags;
338 	int			ret = 0;
339 
340 	/*
341 	 * If ipnet_register_netihook() was unable to initialize this
342 	 * stack's net_handle_t, then we cannot populate any interface
343 	 * information.  This usually happens when we attempted to
344 	 * grab a net_handle_t as a stack was shutting down.  We don't
345 	 * want to fail the entire _init() operation because of a
346 	 * stack shutdown (other stacks will continue to work just
347 	 * fine), so we silently return success here.
348 	 */
349 	if (nd == NULL)
350 		return (0);
351 
352 	/*
353 	 * Make sure we're not processing NIC events during the
354 	 * population of our interfaces and address lists.
355 	 */
356 	mutex_enter(&ips->ips_event_lock);
357 
358 	for (phyif = net_phygetnext(nd, 0); phyif != 0;
359 	    phyif = net_phygetnext(nd, phyif)) {
360 		if (net_getifname(nd, phyif, name, LIFNAMSIZ) != 0)
361 			continue;
362 		if ((ipnetif = ipnet_if_getby_index(phyif, ips)) == NULL) {
363 			ipnetif = ipnet_create_if(name, phyif, ips);
364 			if (ipnetif == NULL) {
365 				ret = ENOMEM;
366 				goto done;
367 			}
368 			new_if = B_TRUE;
369 		}
370 		ipnetif->if_flags |=
371 		    isv6 ? IPNETIF_IPV6PLUMBED : IPNETIF_IPV4PLUMBED;
372 
373 		for (lif = net_lifgetnext(nd, phyif, 0); lif != 0;
374 		    lif = net_lifgetnext(nd, phyif, lif)) {
375 			/*
376 			 * Skip addresses that aren't up.  We'll add
377 			 * them when we receive an NE_LIF_UP event.
378 			 */
379 			if (net_getlifflags(nd, phyif, lif, &ifflags) != 0 ||
380 			    !(ifflags & IFF_UP))
381 				continue;
382 			/* Don't add it if we already have it. */
383 			if (ipnet_match_lif(ipnetif, lif, isv6) != NULL)
384 				continue;
385 			ipnet_add_ifaddr(lif, ipnetif, nd);
386 		}
387 		if (!new_if)
388 			ipnetif_refrele(ipnetif);
389 	}
390 
391 done:
392 	mutex_exit(&ips->ips_event_lock);
393 	return (ret);
394 }
395 
396 static int
397 ipnet_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
398 {
399 	if (cmd != DDI_ATTACH)
400 		return (DDI_FAILURE);
401 
402 	if (ddi_create_minor_node(dip, "lo0", S_IFCHR, IPNET_MINOR_LO,
403 	    DDI_PSEUDO, 0) == DDI_FAILURE)
404 		return (DDI_FAILURE);
405 
406 	ipnet_dip = dip;
407 	return (DDI_SUCCESS);
408 }
409 
410 static int
411 ipnet_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
412 {
413 	if (cmd != DDI_DETACH)
414 		return (DDI_FAILURE);
415 
416 	ASSERT(dip == ipnet_dip);
417 	ddi_remove_minor_node(ipnet_dip, NULL);
418 	ipnet_dip = NULL;
419 	return (DDI_SUCCESS);
420 }
421 
422 /* ARGSUSED */
423 static int
424 ipnet_devinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
425 {
426 	int error = DDI_FAILURE;
427 
428 	switch (infocmd) {
429 	case DDI_INFO_DEVT2INSTANCE:
430 		*result = (void *)0;
431 		error = DDI_SUCCESS;
432 		break;
433 	case DDI_INFO_DEVT2DEVINFO:
434 		if (ipnet_dip != NULL) {
435 			*result = ipnet_dip;
436 			error = DDI_SUCCESS;
437 		}
438 		break;
439 	}
440 	return (error);
441 }
442 
443 /* ARGSUSED */
444 static int
445 ipnet_open(queue_t *rq, dev_t *dev, int oflag, int sflag, cred_t *crp)
446 {
447 	ipnet_t		*ipnet;
448 	netstack_t	*ns = NULL;
449 	ipnet_stack_t	*ips;
450 	int		err = 0;
451 	zoneid_t	zoneid = crgetzoneid(crp);
452 
453 	/*
454 	 * If the system is labeled, only the global zone is allowed to open
455 	 * IP observability nodes.
456 	 */
457 	if (is_system_labeled() && zoneid != GLOBAL_ZONEID)
458 		return (EACCES);
459 
460 	/* We don't support open as a module */
461 	if (sflag & MODOPEN)
462 		return (ENOTSUP);
463 
464 	/* This driver is self-cloning, we don't support re-open. */
465 	if (rq->q_ptr != NULL)
466 		return (EBUSY);
467 
468 	if ((ipnet = kmem_zalloc(sizeof (*ipnet), KM_NOSLEEP)) == NULL)
469 		return (ENOMEM);
470 
471 	VERIFY((ns = netstack_find_by_cred(crp)) != NULL);
472 	ips = ns->netstack_ipnet;
473 
474 	rq->q_ptr = WR(rq)->q_ptr = ipnet;
475 	ipnet->ipnet_rq = rq;
476 	ipnet->ipnet_minor = (minor_t)id_alloc(ipnet_minor_space);
477 	ipnet->ipnet_zoneid = zoneid;
478 	ipnet->ipnet_dlstate = DL_UNBOUND;
479 	ipnet->ipnet_sap = 0;
480 	ipnet->ipnet_ns = ns;
481 
482 	/*
483 	 * We need to hold ips_event_lock here as any NE_LIF_DOWN events need
484 	 * to be processed after ipnet_if is set and the ipnet_t has been
485 	 * inserted in the ips_str_list.
486 	 */
487 	mutex_enter(&ips->ips_event_lock);
488 	if (getminor(*dev) == IPNET_MINOR_LO) {
489 		ipnet->ipnet_flags |= IPNET_LOMODE;
490 		ipnet->ipnet_acceptfn = ipnet_loaccept;
491 	} else {
492 		ipnet->ipnet_acceptfn = ipnet_accept;
493 		ipnet->ipnet_if = ipnet_if_getby_dev(*dev, ips);
494 		if (ipnet->ipnet_if == NULL ||
495 		    !ipnet_if_in_zone(ipnet->ipnet_if, zoneid, ips)) {
496 			err = ENODEV;
497 			goto done;
498 		}
499 	}
500 
501 	mutex_enter(&ips->ips_walkers_lock);
502 	while (ips->ips_walkers_cnt != 0)
503 		cv_wait(&ips->ips_walkers_cv, &ips->ips_walkers_lock);
504 	list_insert_head(&ips->ips_str_list, ipnet);
505 	*dev = makedevice(getmajor(*dev), ipnet->ipnet_minor);
506 	qprocson(rq);
507 
508 	/*
509 	 * Only register our callback if we're the first open client; we call
510 	 * unregister in close() for the last open client.
511 	 */
512 	if (list_head(&ips->ips_str_list) == list_tail(&ips->ips_str_list))
513 		ipobs_register_hook(ns, ipnet_input);
514 	mutex_exit(&ips->ips_walkers_lock);
515 
516 done:
517 	mutex_exit(&ips->ips_event_lock);
518 	if (err != 0) {
519 		netstack_rele(ns);
520 		id_free(ipnet_minor_space, ipnet->ipnet_minor);
521 		if (ipnet->ipnet_if != NULL)
522 			ipnetif_refrele(ipnet->ipnet_if);
523 		kmem_free(ipnet, sizeof (*ipnet));
524 	}
525 	return (err);
526 }
527 
528 static int
529 ipnet_close(queue_t *rq)
530 {
531 	ipnet_t		*ipnet = rq->q_ptr;
532 	ipnet_stack_t	*ips = ipnet->ipnet_ns->netstack_ipnet;
533 
534 	if (ipnet->ipnet_flags & IPNET_PROMISC_PHYS)
535 		ipnet_leave_allmulti(ipnet->ipnet_if, ips);
536 	if (ipnet->ipnet_flags & IPNET_PROMISC_MULTI)
537 		ipnet_leave_allmulti(ipnet->ipnet_if, ips);
538 
539 	mutex_enter(&ips->ips_walkers_lock);
540 	while (ips->ips_walkers_cnt != 0)
541 		cv_wait(&ips->ips_walkers_cv, &ips->ips_walkers_lock);
542 
543 	qprocsoff(rq);
544 
545 	list_remove(&ips->ips_str_list, ipnet);
546 	if (ipnet->ipnet_if != NULL)
547 		ipnetif_refrele(ipnet->ipnet_if);
548 	id_free(ipnet_minor_space, ipnet->ipnet_minor);
549 	kmem_free(ipnet, sizeof (*ipnet));
550 
551 	if (list_is_empty(&ips->ips_str_list))
552 		ipobs_unregister_hook(ips->ips_netstack, ipnet_input);
553 
554 	mutex_exit(&ips->ips_walkers_lock);
555 	netstack_rele(ips->ips_netstack);
556 	return (0);
557 }
558 
559 static int
560 ipnet_wput(queue_t *q, mblk_t *mp)
561 {
562 	switch (mp->b_datap->db_type) {
563 	case M_FLUSH:
564 		if (*mp->b_rptr & FLUSHW) {
565 			flushq(q, FLUSHDATA);
566 			*mp->b_rptr &= ~FLUSHW;
567 		}
568 		if (*mp->b_rptr & FLUSHR)
569 			qreply(q, mp);
570 		else
571 			freemsg(mp);
572 		break;
573 	case M_PROTO:
574 	case M_PCPROTO:
575 		ipnet_wputnondata(q, mp);
576 		break;
577 	case M_IOCTL:
578 		ipnet_ioctl(q, mp);
579 		break;
580 	case M_IOCDATA:
581 		ipnet_iocdata(q, mp);
582 		break;
583 	default:
584 		freemsg(mp);
585 		break;
586 	}
587 	return (0);
588 }
589 
590 static int
591 ipnet_rsrv(queue_t *q)
592 {
593 	mblk_t *mp;
594 
595 	while ((mp = getq(q)) != NULL) {
596 		ASSERT(DB_TYPE(mp) == M_DATA);
597 		if (canputnext(q)) {
598 			putnext(q, mp);
599 		} else {
600 			(void) putbq(q, mp);
601 			break;
602 		}
603 	}
604 	return (0);
605 }
606 
607 static void
608 ipnet_ioctl(queue_t *q, mblk_t *mp)
609 {
610 	struct iocblk *iocp = (struct iocblk *)mp->b_rptr;
611 
612 	switch (iocp->ioc_cmd) {
613 	case DLIOCRAW:
614 		miocack(q, mp, 0, 0);
615 		break;
616 	case DLIOCIPNETINFO:
617 		if (iocp->ioc_count == TRANSPARENT) {
618 			mcopyin(mp, NULL, sizeof (uint_t), NULL);
619 			qreply(q, mp);
620 			break;
621 		}
622 		/* Fallthrough, we don't support I_STR with DLIOCIPNETINFO. */
623 	default:
624 		miocnak(q, mp, 0, EINVAL);
625 		break;
626 	}
627 }
628 
629 static void
630 ipnet_iocdata(queue_t *q, mblk_t *mp)
631 {
632 	struct iocblk	*iocp = (struct iocblk *)mp->b_rptr;
633 	ipnet_t		*ipnet = q->q_ptr;
634 
635 	switch (iocp->ioc_cmd) {
636 	case DLIOCIPNETINFO:
637 		if (*(int *)mp->b_cont->b_rptr == 1)
638 			ipnet->ipnet_flags |= IPNET_INFO;
639 		else if (*(int *)mp->b_cont->b_rptr == 0)
640 			ipnet->ipnet_flags &= ~IPNET_INFO;
641 		else
642 			goto iocnak;
643 		miocack(q, mp, 0, DL_IPNETINFO_VERSION);
644 		break;
645 	default:
646 	iocnak:
647 		miocnak(q, mp, 0, EINVAL);
648 		break;
649 	}
650 }
651 
652 static void
653 ipnet_wputnondata(queue_t *q, mblk_t *mp)
654 {
655 	union DL_primitives	*dlp = (union DL_primitives *)mp->b_rptr;
656 	t_uscalar_t		prim = dlp->dl_primitive;
657 
658 	switch (prim) {
659 	case DL_INFO_REQ:
660 		ipnet_inforeq(q, mp);
661 		break;
662 	case DL_UNBIND_REQ:
663 		ipnet_unbindreq(q, mp);
664 		break;
665 	case DL_BIND_REQ:
666 		ipnet_bindreq(q, mp);
667 		break;
668 	case DL_PROMISCON_REQ:
669 		ipnet_dlpromisconreq(q, mp);
670 		break;
671 	case DL_PROMISCOFF_REQ:
672 		ipnet_dlpromiscoffreq(q, mp);
673 		break;
674 	case DL_UNITDATA_REQ:
675 	case DL_DETACH_REQ:
676 	case DL_PHYS_ADDR_REQ:
677 	case DL_SET_PHYS_ADDR_REQ:
678 	case DL_ENABMULTI_REQ:
679 	case DL_DISABMULTI_REQ:
680 	case DL_ATTACH_REQ:
681 		dlerrorack(q, mp, prim, DL_UNSUPPORTED, 0);
682 		break;
683 	default:
684 		dlerrorack(q, mp, prim, DL_BADPRIM, 0);
685 		break;
686 	}
687 }
688 
689 static void
690 ipnet_inforeq(queue_t *q, mblk_t *mp)
691 {
692 	dl_info_ack_t	*dlip;
693 	size_t		size = sizeof (dl_info_ack_t) + sizeof (ushort_t);
694 
695 	if (MBLKL(mp) < DL_INFO_REQ_SIZE) {
696 		dlerrorack(q, mp, DL_INFO_REQ, DL_BADPRIM, 0);
697 		return;
698 	}
699 
700 	if ((mp = mexchange(q, mp, size, M_PCPROTO, DL_INFO_ACK)) == NULL)
701 		return;
702 
703 	dlip = (dl_info_ack_t *)mp->b_rptr;
704 	*dlip = ipnet_infoack;
705 	qreply(q, mp);
706 }
707 
708 static void
709 ipnet_bindreq(queue_t *q, mblk_t *mp)
710 {
711 	union   DL_primitives *dlp = (union DL_primitives *)mp->b_rptr;
712 	int32_t sap;
713 	ipnet_t	*ipnet = q->q_ptr;
714 
715 	if (MBLKL(mp) < DL_BIND_REQ_SIZE) {
716 		dlerrorack(q, mp, DL_BIND_REQ, DL_BADPRIM, 0);
717 		return;
718 	}
719 
720 	sap = dlp->bind_req.dl_sap;
721 	if (sap != IPV4_VERSION && sap != IPV6_VERSION && sap != 0) {
722 		dlerrorack(q, mp, DL_BIND_REQ, DL_BADSAP, 0);
723 	} else {
724 		ipnet->ipnet_sap = sap;
725 		ipnet->ipnet_dlstate = DL_IDLE;
726 		dlbindack(q, mp, sap, 0, 0, 0, 0);
727 	}
728 }
729 
730 static void
731 ipnet_unbindreq(queue_t *q, mblk_t *mp)
732 {
733 	ipnet_t	*ipnet = q->q_ptr;
734 
735 	if (MBLKL(mp) < DL_UNBIND_REQ_SIZE) {
736 		dlerrorack(q, mp, DL_UNBIND_REQ, DL_BADPRIM, 0);
737 		return;
738 	}
739 
740 	if (ipnet->ipnet_dlstate != DL_IDLE) {
741 		dlerrorack(q, mp, DL_UNBIND_REQ, DL_OUTSTATE, 0);
742 	} else {
743 		ipnet->ipnet_dlstate = DL_UNBOUND;
744 		ipnet->ipnet_sap = 0;
745 		dlokack(q, mp, DL_UNBIND_REQ);
746 	}
747 }
748 
749 static void
750 ipnet_dlpromisconreq(queue_t *q, mblk_t *mp)
751 {
752 	ipnet_t		*ipnet = q->q_ptr;
753 	t_uscalar_t	level;
754 	int		err;
755 
756 	if (MBLKL(mp) < DL_PROMISCON_REQ_SIZE) {
757 		dlerrorack(q, mp, DL_PROMISCON_REQ, DL_BADPRIM, 0);
758 		return;
759 	}
760 
761 	if (ipnet->ipnet_flags & IPNET_LOMODE) {
762 		dlokack(q, mp, DL_PROMISCON_REQ);
763 		return;
764 	}
765 
766 	level = ((dl_promiscon_req_t *)mp->b_rptr)->dl_level;
767 	if (level == DL_PROMISC_PHYS || level == DL_PROMISC_MULTI) {
768 		if ((err = ipnet_join_allmulti(ipnet->ipnet_if,
769 		    ipnet->ipnet_ns->netstack_ipnet)) != 0) {
770 			dlerrorack(q, mp, DL_PROMISCON_REQ, DL_SYSERR, err);
771 			return;
772 		}
773 	}
774 
775 	switch (level) {
776 	case DL_PROMISC_PHYS:
777 		ipnet->ipnet_flags |= IPNET_PROMISC_PHYS;
778 		break;
779 	case DL_PROMISC_SAP:
780 		ipnet->ipnet_flags |= IPNET_PROMISC_SAP;
781 		break;
782 	case DL_PROMISC_MULTI:
783 		ipnet->ipnet_flags |= IPNET_PROMISC_MULTI;
784 		break;
785 	default:
786 		dlerrorack(q, mp, DL_PROMISCON_REQ, DL_BADPRIM, 0);
787 		return;
788 	}
789 
790 	dlokack(q, mp, DL_PROMISCON_REQ);
791 }
792 
793 static void
794 ipnet_dlpromiscoffreq(queue_t *q, mblk_t *mp)
795 {
796 	ipnet_t		*ipnet = q->q_ptr;
797 	t_uscalar_t	level;
798 	uint16_t	orig_ipnet_flags = ipnet->ipnet_flags;
799 
800 	if (MBLKL(mp) < DL_PROMISCOFF_REQ_SIZE) {
801 		dlerrorack(q, mp, DL_PROMISCOFF_REQ, DL_BADPRIM, 0);
802 		return;
803 	}
804 
805 	if (ipnet->ipnet_flags & IPNET_LOMODE) {
806 		dlokack(q, mp, DL_PROMISCOFF_REQ);
807 		return;
808 	}
809 
810 	level = ((dl_promiscon_req_t *)mp->b_rptr)->dl_level;
811 	switch (level) {
812 	case DL_PROMISC_PHYS:
813 		if (ipnet->ipnet_flags & IPNET_PROMISC_PHYS)
814 			ipnet->ipnet_flags &= ~IPNET_PROMISC_PHYS;
815 		break;
816 	case DL_PROMISC_SAP:
817 		if (ipnet->ipnet_flags & IPNET_PROMISC_SAP)
818 			ipnet->ipnet_flags &= ~IPNET_PROMISC_SAP;
819 		break;
820 	case DL_PROMISC_MULTI:
821 		if (ipnet->ipnet_flags & IPNET_PROMISC_MULTI)
822 			ipnet->ipnet_flags &= ~IPNET_PROMISC_MULTI;
823 		break;
824 	default:
825 		dlerrorack(q, mp, DL_PROMISCOFF_REQ, DL_BADPRIM, 0);
826 		return;
827 	}
828 
829 	if (orig_ipnet_flags == ipnet->ipnet_flags) {
830 		dlerrorack(q, mp, DL_PROMISCOFF_REQ, DL_NOTENAB, 0);
831 		return;
832 	}
833 
834 	if (level == DL_PROMISC_PHYS || level == DL_PROMISC_MULTI) {
835 		ipnet_leave_allmulti(ipnet->ipnet_if,
836 		    ipnet->ipnet_ns->netstack_ipnet);
837 	}
838 
839 	dlokack(q, mp, DL_PROMISCOFF_REQ);
840 }
841 
842 static int
843 ipnet_join_allmulti(ipnetif_t *ipnetif, ipnet_stack_t *ips)
844 {
845 	int		err = 0;
846 	ip_stack_t	*ipst = ips->ips_netstack->netstack_ip;
847 	uint64_t	index = ipnetif->if_index;
848 
849 	mutex_enter(&ips->ips_event_lock);
850 	if (ipnetif->if_multicnt == 0) {
851 		ASSERT((ipnetif->if_flags &
852 		    (IPNETIF_IPV4ALLMULTI | IPNETIF_IPV6ALLMULTI)) == 0);
853 		if (ipnetif->if_flags & IPNETIF_IPV4PLUMBED) {
854 			err = ip_join_allmulti(index, B_FALSE, ipst);
855 			if (err != 0)
856 				goto done;
857 			ipnetif->if_flags |= IPNETIF_IPV4ALLMULTI;
858 		}
859 		if (ipnetif->if_flags & IPNETIF_IPV6PLUMBED) {
860 			err = ip_join_allmulti(index, B_TRUE, ipst);
861 			if (err != 0 &&
862 			    (ipnetif->if_flags & IPNETIF_IPV4ALLMULTI)) {
863 				(void) ip_leave_allmulti(index, B_FALSE, ipst);
864 				ipnetif->if_flags &= ~IPNETIF_IPV4ALLMULTI;
865 				goto done;
866 			}
867 			ipnetif->if_flags |= IPNETIF_IPV6ALLMULTI;
868 		}
869 	}
870 	ipnetif->if_multicnt++;
871 
872 done:
873 	mutex_exit(&ips->ips_event_lock);
874 	return (err);
875 }
876 
877 static void
878 ipnet_leave_allmulti(ipnetif_t *ipnetif, ipnet_stack_t *ips)
879 {
880 	int		err;
881 	ip_stack_t	*ipst = ips->ips_netstack->netstack_ip;
882 	uint64_t	index = ipnetif->if_index;
883 
884 	mutex_enter(&ips->ips_event_lock);
885 	ASSERT(ipnetif->if_multicnt != 0);
886 	if (--ipnetif->if_multicnt == 0) {
887 		if (ipnetif->if_flags & IPNETIF_IPV4ALLMULTI) {
888 			err = ip_leave_allmulti(index, B_FALSE, ipst);
889 			ASSERT(err == 0 || err == ENODEV);
890 			ipnetif->if_flags &= ~IPNETIF_IPV4ALLMULTI;
891 		}
892 		if (ipnetif->if_flags & IPNETIF_IPV6ALLMULTI) {
893 			err = ip_leave_allmulti(index, B_TRUE, ipst);
894 			ASSERT(err == 0 || err == ENODEV);
895 			ipnetif->if_flags &= ~IPNETIF_IPV6ALLMULTI;
896 		}
897 	}
898 	mutex_exit(&ips->ips_event_lock);
899 }
900 
901 static mblk_t *
902 ipnet_addheader(ipobs_hook_data_t *ihd, mblk_t *mp)
903 {
904 	mblk_t		*dlhdr;
905 	dl_ipnetinfo_t	*dl;
906 
907 	if ((dlhdr = allocb(sizeof (dl_ipnetinfo_t), BPRI_HI)) == NULL) {
908 		freemsg(mp);
909 		return (NULL);
910 	}
911 	dl = (dl_ipnetinfo_t *)dlhdr->b_rptr;
912 	dl->dli_version = DL_IPNETINFO_VERSION;
913 	dl->dli_len = htons(sizeof (*dl));
914 	dl->dli_ipver = ihd->ihd_ipver;
915 	dl->dli_srczone = BE_64((uint64_t)ihd->ihd_zsrc);
916 	dl->dli_dstzone = BE_64((uint64_t)ihd->ihd_zdst);
917 	dlhdr->b_wptr += sizeof (*dl);
918 	dlhdr->b_cont = mp;
919 
920 	return (dlhdr);
921 }
922 
923 static ipnet_addrtype_t
924 ipnet_get_addrtype(ipnet_t *ipnet, ipnet_addrp_t *addr)
925 {
926 	list_t			*list;
927 	ipnetif_t		*ipnetif = ipnet->ipnet_if;
928 	ipnetif_addr_t		*ifaddr;
929 	ipnet_addrtype_t	addrtype = IPNETADDR_UNKNOWN;
930 
931 	/* First check if the address is multicast or limited broadcast. */
932 	switch (addr->iap_family) {
933 	case AF_INET:
934 		if (CLASSD(*(addr->iap_addr4)) ||
935 		    *(addr->iap_addr4) == INADDR_BROADCAST)
936 			return (IPNETADDR_MBCAST);
937 		break;
938 	case AF_INET6:
939 		if (IN6_IS_ADDR_MULTICAST(addr->iap_addr6))
940 			return (IPNETADDR_MBCAST);
941 		break;
942 	}
943 
944 	/*
945 	 * Walk the address list to see if the address belongs to our
946 	 * interface or is one of our subnet broadcast addresses.
947 	 */
948 	mutex_enter(&ipnetif->if_addr_lock);
949 	list = (addr->iap_family == AF_INET) ?
950 	    &ipnetif->if_ip4addr_list : &ipnetif->if_ip6addr_list;
951 	for (ifaddr = list_head(list);
952 	    ifaddr != NULL && addrtype == IPNETADDR_UNKNOWN;
953 	    ifaddr = list_next(list, ifaddr)) {
954 		/*
955 		 * If we're not in the global zone, then only look at
956 		 * addresses in our zone.
957 		 */
958 		if (ipnet->ipnet_zoneid != GLOBAL_ZONEID &&
959 		    ipnet->ipnet_zoneid != ifaddr->ifa_zone)
960 			continue;
961 		switch (addr->iap_family) {
962 		case AF_INET:
963 			if (ifaddr->ifa_ip4addr != INADDR_ANY &&
964 			    *(addr->iap_addr4) == ifaddr->ifa_ip4addr)
965 				addrtype = IPNETADDR_MYADDR;
966 			else if (ifaddr->ifa_brdaddr != INADDR_ANY &&
967 			    *(addr->iap_addr4) == ifaddr->ifa_brdaddr)
968 				addrtype = IPNETADDR_MBCAST;
969 			break;
970 		case AF_INET6:
971 			if (IN6_ARE_ADDR_EQUAL(addr->iap_addr6,
972 			    &ifaddr->ifa_ip6addr))
973 				addrtype = IPNETADDR_MYADDR;
974 			break;
975 		}
976 	}
977 	mutex_exit(&ipnetif->if_addr_lock);
978 
979 	return (addrtype);
980 }
981 
982 /*
983  * Verify if the packet contained in ihd should be passed up to the
984  * ipnet client stream.
985  */
986 static boolean_t
987 ipnet_accept(ipnet_t *ipnet, ipobs_hook_data_t *ihd, ipnet_addrp_t *src,
988     ipnet_addrp_t *dst)
989 {
990 	uint64_t		ifindex = ipnet->ipnet_if->if_index;
991 	ipnet_addrtype_t	srctype, dsttype;
992 
993 	srctype = ipnet_get_addrtype(ipnet, src);
994 	dsttype = ipnet_get_addrtype(ipnet, dst);
995 
996 	/*
997 	 * Do not allow an ipnet stream to see packets that are not from or to
998 	 * its zone.  The exception is when zones are using the shared stack
999 	 * model.  In this case, streams in the global zone have visibility
1000 	 * into other shared-stack zones, and broadcast and multicast traffic
1001 	 * is visible by all zones in the stack.
1002 	 */
1003 	if (ipnet->ipnet_zoneid != GLOBAL_ZONEID &&
1004 	    dsttype != IPNETADDR_MBCAST) {
1005 		if (ipnet->ipnet_zoneid != ihd->ihd_zsrc &&
1006 		    ipnet->ipnet_zoneid != ihd->ihd_zdst)
1007 			return (B_FALSE);
1008 	}
1009 
1010 	/*
1011 	 * If DL_PROMISC_SAP isn't enabled, then the bound SAP must match the
1012 	 * packet's IP version.
1013 	 */
1014 	if (!(ipnet->ipnet_flags & IPNET_PROMISC_SAP) &&
1015 	    ipnet->ipnet_sap != ihd->ihd_ipver)
1016 		return (B_FALSE);
1017 
1018 	/* If the destination address is ours, then accept the packet. */
1019 	if (dsttype == IPNETADDR_MYADDR)
1020 		return (B_TRUE);
1021 
1022 	/*
1023 	 * If DL_PROMISC_PHYS is enabled, then we can see all packets that are
1024 	 * sent or received on the interface we're observing, or packets that
1025 	 * have our source address (this allows us to see packets we send).
1026 	 */
1027 	if (ipnet->ipnet_flags & IPNET_PROMISC_PHYS) {
1028 		if (ihd->ihd_ifindex == ifindex || srctype == IPNETADDR_MYADDR)
1029 			return (B_TRUE);
1030 	}
1031 
1032 	/*
1033 	 * We accept multicast and broadcast packets transmitted or received
1034 	 * on the interface we're observing.
1035 	 */
1036 	if (dsttype == IPNETADDR_MBCAST && ihd->ihd_ifindex == ifindex)
1037 		return (B_TRUE);
1038 
1039 	return (B_FALSE);
1040 }
1041 
1042 /*
1043  * Verify if the packet contained in ihd should be passed up to the ipnet
1044  * client stream that's in IPNET_LOMODE.
1045  */
1046 /* ARGSUSED */
1047 static boolean_t
1048 ipnet_loaccept(ipnet_t *ipnet, ipobs_hook_data_t *ihd, ipnet_addrp_t *src,
1049     ipnet_addrp_t *dst)
1050 {
1051 	if (ihd->ihd_htype != IPOBS_HOOK_LOCAL)
1052 		return (B_FALSE);
1053 
1054 	/*
1055 	 * An ipnet stream must not see packets that are not from/to its zone.
1056 	 */
1057 	if (ipnet->ipnet_zoneid != GLOBAL_ZONEID) {
1058 		if (ipnet->ipnet_zoneid != ihd->ihd_zsrc &&
1059 		    ipnet->ipnet_zoneid != ihd->ihd_zdst)
1060 			return (B_FALSE);
1061 	}
1062 
1063 	return (ipnet->ipnet_sap == 0 || ipnet->ipnet_sap == ihd->ihd_ipver);
1064 }
1065 
1066 static void
1067 ipnet_dispatch(void *arg)
1068 {
1069 	mblk_t			*mp = arg;
1070 	ipobs_hook_data_t	*ihd = (ipobs_hook_data_t *)mp->b_rptr;
1071 	ipnet_t			*ipnet;
1072 	mblk_t			*netmp;
1073 	list_t			*list;
1074 	ipnet_stack_t		*ips = ihd->ihd_stack->netstack_ipnet;
1075 	ipnet_addrp_t		src, dst;
1076 
1077 	if (ihd->ihd_ipver == IPV4_VERSION) {
1078 		src.iap_family = dst.iap_family = AF_INET;
1079 		src.iap_addr4 = &((ipha_t *)(ihd->ihd_mp->b_rptr))->ipha_src;
1080 		dst.iap_addr4 = &((ipha_t *)(ihd->ihd_mp->b_rptr))->ipha_dst;
1081 	} else {
1082 		src.iap_family = dst.iap_family = AF_INET6;
1083 		src.iap_addr6 = &((ip6_t *)(ihd->ihd_mp->b_rptr))->ip6_src;
1084 		dst.iap_addr6 = &((ip6_t *)(ihd->ihd_mp->b_rptr))->ip6_dst;
1085 	}
1086 
1087 	ipnet_walkers_inc(ips);
1088 
1089 	list = &ips->ips_str_list;
1090 	for (ipnet = list_head(list); ipnet != NULL;
1091 	    ipnet = list_next(list, ipnet)) {
1092 		if (!(*ipnet->ipnet_acceptfn)(ipnet, ihd, &src, &dst))
1093 			continue;
1094 
1095 		if (list_next(list, ipnet) == NULL) {
1096 			netmp = ihd->ihd_mp;
1097 			ihd->ihd_mp = NULL;
1098 		} else {
1099 			if ((netmp = dupmsg(ihd->ihd_mp)) == NULL &&
1100 			    (netmp = copymsg(ihd->ihd_mp)) == NULL) {
1101 				atomic_inc_64(&ips->ips_drops);
1102 				continue;
1103 			}
1104 		}
1105 
1106 		if (ipnet->ipnet_flags & IPNET_INFO) {
1107 			if ((netmp = ipnet_addheader(ihd, netmp)) == NULL) {
1108 				atomic_inc_64(&ips->ips_drops);
1109 				continue;
1110 			}
1111 		}
1112 
1113 		if (ipnet->ipnet_rq->q_first == NULL &&
1114 		    canputnext(ipnet->ipnet_rq)) {
1115 			putnext(ipnet->ipnet_rq, netmp);
1116 		} else if (canput(ipnet->ipnet_rq)) {
1117 			(void) putq(ipnet->ipnet_rq, netmp);
1118 		} else {
1119 			freemsg(netmp);
1120 			atomic_inc_64(&ips->ips_drops);
1121 		}
1122 	}
1123 
1124 	ipnet_walkers_dec(ips);
1125 
1126 	freemsg(ihd->ihd_mp);
1127 	freemsg(mp);
1128 }
1129 
1130 static void
1131 ipnet_input(mblk_t *mp)
1132 {
1133 	ipobs_hook_data_t  *ihd = (ipobs_hook_data_t *)mp->b_rptr;
1134 
1135 	if (ddi_taskq_dispatch(ipnet_taskq, ipnet_dispatch, mp, DDI_NOSLEEP) !=
1136 	    DDI_SUCCESS) {
1137 		atomic_inc_64(&ihd->ihd_stack->netstack_ipnet->ips_drops);
1138 		freemsg(ihd->ihd_mp);
1139 		freemsg(mp);
1140 	}
1141 }
1142 
1143 /*
1144  * Create a new ipnetif_t and new minor node for it.  If creation is
1145  * successful the new ipnetif_t is inserted into an avl_tree
1146  * containing ipnetif's for this stack instance.
1147  */
1148 static ipnetif_t *
1149 ipnet_create_if(const char *name, uint64_t index, ipnet_stack_t *ips)
1150 {
1151 	ipnetif_t	*ipnetif;
1152 	avl_index_t	where = 0;
1153 	minor_t		ifminor;
1154 
1155 	/*
1156 	 * Because ipnet_create_if() can be called from a NIC event
1157 	 * callback, it should not block.
1158 	 */
1159 	ifminor = (minor_t)id_alloc_nosleep(ipnet_minor_space);
1160 	if (ifminor == (minor_t)-1)
1161 		return (NULL);
1162 	if ((ipnetif = kmem_zalloc(sizeof (*ipnetif), KM_NOSLEEP)) == NULL) {
1163 		id_free(ipnet_minor_space, ifminor);
1164 		return (NULL);
1165 	}
1166 
1167 	(void) strlcpy(ipnetif->if_name, name, LIFNAMSIZ);
1168 	ipnetif->if_index = index;
1169 
1170 	mutex_init(&ipnetif->if_addr_lock, NULL, MUTEX_DEFAULT, 0);
1171 	list_create(&ipnetif->if_ip4addr_list, sizeof (ipnetif_addr_t),
1172 	    offsetof(ipnetif_addr_t, ifa_link));
1173 	list_create(&ipnetif->if_ip6addr_list, sizeof (ipnetif_addr_t),
1174 	    offsetof(ipnetif_addr_t, ifa_link));
1175 	ipnetif->if_dev = makedevice(ipnet_major, ifminor);
1176 	mutex_init(&ipnetif->if_reflock, NULL, MUTEX_DEFAULT, 0);
1177 	ipnetif->if_refcnt = 1;
1178 
1179 	mutex_enter(&ips->ips_avl_lock);
1180 	VERIFY(avl_find(&ips->ips_avl_by_index, &index, &where) == NULL);
1181 	avl_insert(&ips->ips_avl_by_index, ipnetif, where);
1182 	VERIFY(avl_find(&ips->ips_avl_by_name, (void *)name, &where) == NULL);
1183 	avl_insert(&ips->ips_avl_by_name, ipnetif, where);
1184 	mutex_exit(&ips->ips_avl_lock);
1185 
1186 	return (ipnetif);
1187 }
1188 
1189 static void
1190 ipnet_remove_if(ipnetif_t *ipnetif, ipnet_stack_t *ips)
1191 {
1192 	ipnet_t	*ipnet;
1193 
1194 	ipnet_walkers_inc(ips);
1195 	/* Send a SIGHUP to all open streams associated with this ipnetif. */
1196 	for (ipnet = list_head(&ips->ips_str_list); ipnet != NULL;
1197 	    ipnet = list_next(&ips->ips_str_list, ipnet)) {
1198 		if (ipnet->ipnet_if == ipnetif)
1199 			(void) putnextctl(ipnet->ipnet_rq, M_HANGUP);
1200 	}
1201 	ipnet_walkers_dec(ips);
1202 	mutex_enter(&ips->ips_avl_lock);
1203 	avl_remove(&ips->ips_avl_by_index, ipnetif);
1204 	avl_remove(&ips->ips_avl_by_name, ipnetif);
1205 	mutex_exit(&ips->ips_avl_lock);
1206 	/* Release the reference we implicitly held in ipnet_create_if(). */
1207 	ipnetif_refrele(ipnetif);
1208 }
1209 
1210 static void
1211 ipnet_purge_addrlist(list_t *addrlist)
1212 {
1213 	ipnetif_addr_t *ifa;
1214 
1215 	while ((ifa = list_head(addrlist)) != NULL) {
1216 		list_remove(addrlist, ifa);
1217 		kmem_free(ifa, sizeof (*ifa));
1218 	}
1219 }
1220 
1221 static void
1222 ipnet_free_if(ipnetif_t *ipnetif)
1223 {
1224 	ASSERT(ipnetif->if_refcnt == 0);
1225 
1226 	/* Remove IPv4/v6 address lists from the ipnetif */
1227 	ipnet_purge_addrlist(&ipnetif->if_ip4addr_list);
1228 	list_destroy(&ipnetif->if_ip4addr_list);
1229 	ipnet_purge_addrlist(&ipnetif->if_ip6addr_list);
1230 	list_destroy(&ipnetif->if_ip6addr_list);
1231 	mutex_destroy(&ipnetif->if_addr_lock);
1232 	mutex_destroy(&ipnetif->if_reflock);
1233 	id_free(ipnet_minor_space, getminor(ipnetif->if_dev));
1234 	kmem_free(ipnetif, sizeof (*ipnetif));
1235 }
1236 
1237 /*
1238  * Create an ipnetif_addr_t with the given logical interface id (lif)
1239  * and add it to the supplied ipnetif.  The lif is the netinfo
1240  * representation of logical interface id, and we use this id to match
1241  * incoming netinfo events against our lists of addresses.
1242  */
1243 static void
1244 ipnet_add_ifaddr(uint64_t lif, ipnetif_t *ipnetif, net_handle_t nd)
1245 {
1246 	ipnetif_addr_t		*ifaddr;
1247 	zoneid_t		zoneid;
1248 	struct sockaddr_in	bcast;
1249 	struct sockaddr_storage	addr;
1250 	net_ifaddr_t		type = NA_ADDRESS;
1251 	uint64_t		phyif = ipnetif->if_index;
1252 
1253 	if (net_getlifaddr(nd, phyif, lif, 1, &type, &addr) != 0 ||
1254 	    net_getlifzone(nd, phyif, lif, &zoneid) != 0)
1255 		return;
1256 	if ((ifaddr = kmem_alloc(sizeof (*ifaddr), KM_NOSLEEP)) == NULL)
1257 		return;
1258 
1259 	ifaddr->ifa_zone = zoneid;
1260 	ifaddr->ifa_id = lif;
1261 
1262 	switch (addr.ss_family) {
1263 	case AF_INET:
1264 		ifaddr->ifa_ip4addr =
1265 		    ((struct sockaddr_in *)&addr)->sin_addr.s_addr;
1266 		/*
1267 		 * Try and get the broadcast address.  Note that it's okay for
1268 		 * an interface to not have a broadcast address, so we don't
1269 		 * fail the entire operation if net_getlifaddr() fails here.
1270 		 */
1271 		type = NA_BROADCAST;
1272 		if (net_getlifaddr(nd, phyif, lif, 1, &type, &bcast) == 0)
1273 			ifaddr->ifa_brdaddr = bcast.sin_addr.s_addr;
1274 		break;
1275 	case AF_INET6:
1276 		ifaddr->ifa_ip6addr = ((struct sockaddr_in6 *)&addr)->sin6_addr;
1277 		break;
1278 	}
1279 
1280 	mutex_enter(&ipnetif->if_addr_lock);
1281 	list_insert_tail(addr.ss_family == AF_INET ?
1282 	    &ipnetif->if_ip4addr_list : &ipnetif->if_ip6addr_list, ifaddr);
1283 	mutex_exit(&ipnetif->if_addr_lock);
1284 }
1285 
1286 static void
1287 ipnet_delete_ifaddr(ipnetif_addr_t *ifaddr, ipnetif_t *ipnetif, boolean_t isv6)
1288 {
1289 	mutex_enter(&ipnetif->if_addr_lock);
1290 	list_remove(isv6 ?
1291 	    &ipnetif->if_ip6addr_list : &ipnetif->if_ip4addr_list, ifaddr);
1292 	mutex_exit(&ipnetif->if_addr_lock);
1293 	kmem_free(ifaddr, sizeof (*ifaddr));
1294 }
1295 
1296 static void
1297 ipnet_plumb_ev(uint64_t ifindex, const char *ifname, ipnet_stack_t *ips,
1298     boolean_t isv6)
1299 {
1300 	ipnetif_t	*ipnetif;
1301 	boolean_t	refrele_needed = B_TRUE;
1302 
1303 	if ((ipnetif = ipnet_if_getby_index(ifindex, ips)) == NULL) {
1304 		ipnetif = ipnet_create_if(ifname, ifindex, ips);
1305 		refrele_needed = B_FALSE;
1306 	}
1307 	if (ipnetif != NULL) {
1308 		ipnetif->if_flags |=
1309 		    isv6 ? IPNETIF_IPV6PLUMBED : IPNETIF_IPV4PLUMBED;
1310 	}
1311 
1312 	if (ipnetif->if_multicnt != 0) {
1313 		if (ip_join_allmulti(ifindex, isv6,
1314 		    ips->ips_netstack->netstack_ip) == 0) {
1315 			ipnetif->if_flags |=
1316 			    isv6 ? IPNETIF_IPV6ALLMULTI : IPNETIF_IPV4ALLMULTI;
1317 		}
1318 	}
1319 
1320 	if (refrele_needed)
1321 		ipnetif_refrele(ipnetif);
1322 }
1323 
1324 static void
1325 ipnet_unplumb_ev(uint64_t ifindex, ipnet_stack_t *ips, boolean_t isv6)
1326 {
1327 	ipnetif_t	*ipnetif;
1328 
1329 	if ((ipnetif = ipnet_if_getby_index(ifindex, ips)) == NULL)
1330 		return;
1331 
1332 	mutex_enter(&ipnetif->if_addr_lock);
1333 	ipnet_purge_addrlist(isv6 ?
1334 	    &ipnetif->if_ip6addr_list : &ipnetif->if_ip4addr_list);
1335 	mutex_exit(&ipnetif->if_addr_lock);
1336 
1337 	/*
1338 	 * Note that we have one ipnetif for both IPv4 and IPv6, but we receive
1339 	 * separate NE_UNPLUMB events for IPv4 and IPv6.  We remove the ipnetif
1340 	 * if both IPv4 and IPv6 interfaces have been unplumbed.
1341 	 */
1342 	ipnetif->if_flags &= isv6 ? ~IPNETIF_IPV6PLUMBED : ~IPNETIF_IPV4PLUMBED;
1343 	if (!(ipnetif->if_flags & (IPNETIF_IPV4PLUMBED | IPNETIF_IPV6PLUMBED)))
1344 		ipnet_remove_if(ipnetif, ips);
1345 	ipnetif_refrele(ipnetif);
1346 }
1347 
1348 static void
1349 ipnet_lifup_ev(uint64_t ifindex, uint64_t lifindex, net_handle_t nd,
1350     ipnet_stack_t *ips, boolean_t isv6)
1351 {
1352 	ipnetif_t	*ipnetif;
1353 	ipnetif_addr_t	*ifaddr;
1354 
1355 	if ((ipnetif = ipnet_if_getby_index(ifindex, ips)) == NULL)
1356 		return;
1357 	if ((ifaddr = ipnet_match_lif(ipnetif, lifindex, isv6)) != NULL) {
1358 		/*
1359 		 * We must have missed a NE_LIF_DOWN event.  Delete this
1360 		 * ifaddr and re-create it.
1361 		 */
1362 		ipnet_delete_ifaddr(ifaddr, ipnetif, isv6);
1363 	}
1364 
1365 	ipnet_add_ifaddr(lifindex, ipnetif, nd);
1366 	ipnetif_refrele(ipnetif);
1367 }
1368 
1369 static void
1370 ipnet_lifdown_ev(uint64_t ifindex, uint64_t lifindex, ipnet_stack_t *ips,
1371     boolean_t isv6)
1372 {
1373 	ipnetif_t	*ipnetif;
1374 	ipnetif_addr_t	*ifaddr;
1375 
1376 	if ((ipnetif = ipnet_if_getby_index(ifindex, ips)) == NULL)
1377 		return;
1378 	if ((ifaddr = ipnet_match_lif(ipnetif, lifindex, isv6)) != NULL)
1379 		ipnet_delete_ifaddr(ifaddr, ipnetif, isv6);
1380 	ipnetif_refrele(ipnetif);
1381 	/*
1382 	 * Make sure that open streams on this ipnetif are still allowed to
1383 	 * have it open.
1384 	 */
1385 	ipnet_if_zonecheck(ipnetif, ips);
1386 }
1387 
1388 /*
1389  * This callback from the NIC event framework dispatches a taskq as the event
1390  * handlers may block.
1391  */
1392 /* ARGSUSED */
1393 static int
1394 ipnet_nicevent_cb(hook_event_token_t token, hook_data_t info, void *arg)
1395 {
1396 	ipnet_stack_t		*ips = arg;
1397 	hook_nic_event_t	*hn = (hook_nic_event_t *)info;
1398 	ipnet_nicevent_t	*ipne;
1399 
1400 	if ((ipne = kmem_alloc(sizeof (ipnet_nicevent_t), KM_NOSLEEP)) == NULL)
1401 		return (0);
1402 	ipne->ipne_event = hn->hne_event;
1403 	ipne->ipne_protocol = hn->hne_protocol;
1404 	ipne->ipne_stackid = ips->ips_netstack->netstack_stackid;
1405 	ipne->ipne_ifindex = hn->hne_nic;
1406 	ipne->ipne_lifindex = hn->hne_lif;
1407 	if (hn->hne_datalen != 0) {
1408 		(void) strlcpy(ipne->ipne_ifname, hn->hne_data,
1409 		    sizeof (ipne->ipne_ifname));
1410 	}
1411 	(void) ddi_taskq_dispatch(ipnet_nicevent_taskq, ipnet_nicevent_task,
1412 	    ipne, DDI_NOSLEEP);
1413 	return (0);
1414 }
1415 
1416 static void
1417 ipnet_nicevent_task(void *arg)
1418 {
1419 	ipnet_nicevent_t	*ipne = arg;
1420 	netstack_t		*ns;
1421 	ipnet_stack_t		*ips;
1422 	boolean_t		isv6;
1423 
1424 	if ((ns = netstack_find_by_stackid(ipne->ipne_stackid)) == NULL)
1425 		goto done;
1426 	ips = ns->netstack_ipnet;
1427 	isv6 = (ipne->ipne_protocol == ips->ips_ndv6);
1428 
1429 	mutex_enter(&ips->ips_event_lock);
1430 	switch (ipne->ipne_event) {
1431 	case NE_PLUMB:
1432 		ipnet_plumb_ev(ipne->ipne_ifindex, ipne->ipne_ifname, ips,
1433 		    isv6);
1434 		break;
1435 	case NE_UNPLUMB:
1436 		ipnet_unplumb_ev(ipne->ipne_ifindex, ips, isv6);
1437 		break;
1438 	case NE_LIF_UP:
1439 		ipnet_lifup_ev(ipne->ipne_ifindex, ipne->ipne_lifindex,
1440 		    ipne->ipne_protocol, ips, isv6);
1441 		break;
1442 	case NE_LIF_DOWN:
1443 		ipnet_lifdown_ev(ipne->ipne_ifindex, ipne->ipne_lifindex, ips,
1444 		    isv6);
1445 		break;
1446 	default:
1447 		break;
1448 	}
1449 	mutex_exit(&ips->ips_event_lock);
1450 done:
1451 	if (ns != NULL)
1452 		netstack_rele(ns);
1453 	kmem_free(ipne, sizeof (ipnet_nicevent_t));
1454 }
1455 
1456 dev_t
1457 ipnet_if_getdev(char *name, zoneid_t zoneid)
1458 {
1459 	netstack_t	*ns;
1460 	ipnet_stack_t	*ips;
1461 	ipnetif_t	*ipnetif;
1462 	dev_t		dev = (dev_t)-1;
1463 
1464 	if (is_system_labeled() && zoneid != GLOBAL_ZONEID)
1465 		return (dev);
1466 	if ((ns = netstack_find_by_zoneid(zoneid)) == NULL)
1467 		return (dev);
1468 
1469 	ips = ns->netstack_ipnet;
1470 	mutex_enter(&ips->ips_avl_lock);
1471 	if ((ipnetif = avl_find(&ips->ips_avl_by_name, name, NULL)) != NULL) {
1472 		if (ipnet_if_in_zone(ipnetif, zoneid, ips))
1473 			dev = ipnetif->if_dev;
1474 	}
1475 	mutex_exit(&ips->ips_avl_lock);
1476 	netstack_rele(ns);
1477 
1478 	return (dev);
1479 }
1480 
1481 static ipnetif_t *
1482 ipnet_if_getby_index(uint64_t id, ipnet_stack_t *ips)
1483 {
1484 	ipnetif_t	*ipnetif;
1485 
1486 	mutex_enter(&ips->ips_avl_lock);
1487 	if ((ipnetif = avl_find(&ips->ips_avl_by_index, &id, NULL)) != NULL)
1488 		ipnetif_refhold(ipnetif);
1489 	mutex_exit(&ips->ips_avl_lock);
1490 	return (ipnetif);
1491 }
1492 
1493 static ipnetif_t *
1494 ipnet_if_getby_dev(dev_t dev, ipnet_stack_t *ips)
1495 {
1496 	ipnetif_t	*ipnetif;
1497 	avl_tree_t	*tree;
1498 
1499 	mutex_enter(&ips->ips_avl_lock);
1500 	tree = &ips->ips_avl_by_index;
1501 	for (ipnetif = avl_first(tree); ipnetif != NULL;
1502 	    ipnetif = avl_walk(tree, ipnetif, AVL_AFTER)) {
1503 		if (ipnetif->if_dev == dev) {
1504 			ipnetif_refhold(ipnetif);
1505 			break;
1506 		}
1507 	}
1508 	mutex_exit(&ips->ips_avl_lock);
1509 	return (ipnetif);
1510 }
1511 
1512 static ipnetif_addr_t *
1513 ipnet_match_lif(ipnetif_t *ipnetif, lif_if_t lid, boolean_t isv6)
1514 {
1515 	ipnetif_addr_t	*ifaddr;
1516 	list_t		*list;
1517 
1518 	mutex_enter(&ipnetif->if_addr_lock);
1519 	list = isv6 ? &ipnetif->if_ip6addr_list : &ipnetif->if_ip4addr_list;
1520 	for (ifaddr = list_head(list); ifaddr != NULL;
1521 	    ifaddr = list_next(list, ifaddr)) {
1522 		if (lid == ifaddr->ifa_id)
1523 			break;
1524 	}
1525 	mutex_exit(&ipnetif->if_addr_lock);
1526 	return (ifaddr);
1527 }
1528 
1529 /* ARGSUSED */
1530 static void *
1531 ipnet_stack_init(netstackid_t stackid, netstack_t *ns)
1532 {
1533 	ipnet_stack_t	*ips;
1534 
1535 	ips = kmem_zalloc(sizeof (*ips), KM_SLEEP);
1536 	ips->ips_netstack = ns;
1537 	mutex_init(&ips->ips_avl_lock, NULL, MUTEX_DEFAULT, 0);
1538 	avl_create(&ips->ips_avl_by_index, ipnet_if_compare_index,
1539 	    sizeof (ipnetif_t), offsetof(ipnetif_t, if_avl_by_index));
1540 	avl_create(&ips->ips_avl_by_name, ipnet_if_compare_name,
1541 	    sizeof (ipnetif_t), offsetof(ipnetif_t, if_avl_by_name));
1542 	mutex_init(&ips->ips_walkers_lock, NULL, MUTEX_DEFAULT, NULL);
1543 	cv_init(&ips->ips_walkers_cv, NULL, CV_DRIVER, NULL);
1544 	list_create(&ips->ips_str_list, sizeof (ipnet_t),
1545 	    offsetof(ipnet_t, ipnet_next));
1546 	ipnet_register_netihook(ips);
1547 	return (ips);
1548 }
1549 
1550 /* ARGSUSED */
1551 static void
1552 ipnet_stack_fini(netstackid_t stackid, void *arg)
1553 {
1554 	ipnet_stack_t	*ips = arg;
1555 	ipnetif_t	*ipnetif, *nipnetif;
1556 
1557 	if (ips->ips_ndv4 != NULL) {
1558 		VERIFY(net_hook_unregister(ips->ips_ndv4, NH_NIC_EVENTS,
1559 		    ips->ips_nicevents) == 0);
1560 		VERIFY(net_protocol_release(ips->ips_ndv4) == 0);
1561 	}
1562 	if (ips->ips_ndv6 != NULL) {
1563 		VERIFY(net_hook_unregister(ips->ips_ndv6, NH_NIC_EVENTS,
1564 		    ips->ips_nicevents) == 0);
1565 		VERIFY(net_protocol_release(ips->ips_ndv6) == 0);
1566 	}
1567 	hook_free(ips->ips_nicevents);
1568 
1569 	for (ipnetif = avl_first(&ips->ips_avl_by_index); ipnetif != NULL;
1570 	    ipnetif = nipnetif) {
1571 		nipnetif = AVL_NEXT(&ips->ips_avl_by_index, ipnetif);
1572 		ipnet_remove_if(ipnetif, ips);
1573 	}
1574 	avl_destroy(&ips->ips_avl_by_index);
1575 	avl_destroy(&ips->ips_avl_by_name);
1576 	mutex_destroy(&ips->ips_avl_lock);
1577 	mutex_destroy(&ips->ips_walkers_lock);
1578 	cv_destroy(&ips->ips_walkers_cv);
1579 	list_destroy(&ips->ips_str_list);
1580 	kmem_free(ips, sizeof (*ips));
1581 }
1582 
1583 /* Do any of the addresses in addrlist belong the supplied zoneid? */
1584 static boolean_t
1585 ipnet_addrs_in_zone(list_t *addrlist, zoneid_t zoneid)
1586 {
1587 	ipnetif_addr_t *ifa;
1588 
1589 	for (ifa = list_head(addrlist); ifa != NULL;
1590 	    ifa = list_next(addrlist, ifa)) {
1591 		if (ifa->ifa_zone == zoneid)
1592 			return (B_TRUE);
1593 	}
1594 	return (B_FALSE);
1595 }
1596 
1597 /* Should the supplied ipnetif be visible from the supplied zoneid? */
1598 static boolean_t
1599 ipnet_if_in_zone(ipnetif_t *ipnetif, zoneid_t zoneid, ipnet_stack_t *ips)
1600 {
1601 	int ret;
1602 
1603 	/*
1604 	 * The global zone has visibility into all interfaces in the global
1605 	 * stack, and exclusive stack zones have visibility into all
1606 	 * interfaces in their stack.
1607 	 */
1608 	if (zoneid == GLOBAL_ZONEID ||
1609 	    ips->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID)
1610 		return (B_TRUE);
1611 
1612 	/*
1613 	 * Shared-stack zones only have visibility for interfaces that have
1614 	 * addresses in their zone.
1615 	 */
1616 	mutex_enter(&ipnetif->if_addr_lock);
1617 	ret = ipnet_addrs_in_zone(&ipnetif->if_ip4addr_list, zoneid) ||
1618 	    ipnet_addrs_in_zone(&ipnetif->if_ip6addr_list, zoneid);
1619 	mutex_exit(&ipnetif->if_addr_lock);
1620 	return (ret);
1621 }
1622 
1623 /*
1624  * Verify that any ipnet_t that has a reference to the supplied ipnetif should
1625  * still be allowed to have it open.  A given ipnet_t may no longer be allowed
1626  * to have an ipnetif open if there are no longer any addresses that belong to
1627  * the ipnetif in the ipnet_t's non-global shared-stack zoneid.  If that's the
1628  * case, send the ipnet_t an M_HANGUP.
1629  */
1630 static void
1631 ipnet_if_zonecheck(ipnetif_t *ipnetif, ipnet_stack_t *ips)
1632 {
1633 	list_t	*strlist = &ips->ips_str_list;
1634 	ipnet_t	*ipnet;
1635 
1636 	ipnet_walkers_inc(ips);
1637 	for (ipnet = list_head(strlist); ipnet != NULL;
1638 	    ipnet = list_next(strlist, ipnet)) {
1639 		if (ipnet->ipnet_if != ipnetif)
1640 			continue;
1641 		if (!ipnet_if_in_zone(ipnetif, ipnet->ipnet_zoneid, ips))
1642 			(void) putnextctl(ipnet->ipnet_rq, M_HANGUP);
1643 	}
1644 	ipnet_walkers_dec(ips);
1645 }
1646 
1647 void
1648 ipnet_walk_if(ipnet_walkfunc_t *cb, void *arg, zoneid_t zoneid)
1649 {
1650 	ipnetif_t 		*ipnetif;
1651 	list_t			cbdata;
1652 	ipnetif_cbdata_t	*cbnode;
1653 	netstack_t		*ns;
1654 	ipnet_stack_t		*ips;
1655 
1656 	/*
1657 	 * On labeled systems, non-global zones shouldn't see anything
1658 	 * in /dev/ipnet.
1659 	 */
1660 	if (is_system_labeled() && zoneid != GLOBAL_ZONEID)
1661 		return;
1662 
1663 	if ((ns = netstack_find_by_zoneid(zoneid)) == NULL)
1664 		return;
1665 
1666 	ips = ns->netstack_ipnet;
1667 	list_create(&cbdata, sizeof (ipnetif_cbdata_t),
1668 	    offsetof(ipnetif_cbdata_t, ic_next));
1669 
1670 	mutex_enter(&ips->ips_avl_lock);
1671 	for (ipnetif = avl_first(&ips->ips_avl_by_index); ipnetif != NULL;
1672 	    ipnetif = avl_walk(&ips->ips_avl_by_index, ipnetif, AVL_AFTER)) {
1673 		if (!ipnet_if_in_zone(ipnetif, zoneid, ips))
1674 			continue;
1675 		cbnode = kmem_zalloc(sizeof (ipnetif_cbdata_t), KM_SLEEP);
1676 		(void) strlcpy(cbnode->ic_ifname, ipnetif->if_name, LIFNAMSIZ);
1677 		cbnode->ic_dev = ipnetif->if_dev;
1678 		list_insert_head(&cbdata, cbnode);
1679 	}
1680 	mutex_exit(&ips->ips_avl_lock);
1681 
1682 	while ((cbnode = list_head(&cbdata)) != NULL) {
1683 		cb(cbnode->ic_ifname, arg, cbnode->ic_dev);
1684 		list_remove(&cbdata, cbnode);
1685 		kmem_free(cbnode, sizeof (ipnetif_cbdata_t));
1686 	}
1687 	list_destroy(&cbdata);
1688 	netstack_rele(ns);
1689 }
1690 
1691 static int
1692 ipnet_if_compare_index(const void *index_ptr, const void *ipnetifp)
1693 {
1694 	int64_t index1 = *((int64_t *)index_ptr);
1695 	int64_t index2 = (int64_t)((ipnetif_t *)ipnetifp)->if_index;
1696 
1697 	return (SIGNOF(index2 - index1));
1698 }
1699 
1700 static int
1701 ipnet_if_compare_name(const void *name_ptr, const void *ipnetifp)
1702 {
1703 	int res;
1704 
1705 	res = strcmp(((ipnetif_t *)ipnetifp)->if_name, name_ptr);
1706 	return (SIGNOF(res));
1707 }
1708 
1709 static void
1710 ipnetif_refhold(ipnetif_t *ipnetif)
1711 {
1712 	mutex_enter(&ipnetif->if_reflock);
1713 	ipnetif->if_refcnt++;
1714 	mutex_exit(&ipnetif->if_reflock);
1715 }
1716 
1717 static void
1718 ipnetif_refrele(ipnetif_t *ipnetif)
1719 {
1720 	mutex_enter(&ipnetif->if_reflock);
1721 	ASSERT(ipnetif->if_refcnt != 0);
1722 	if (--ipnetif->if_refcnt == 0)
1723 		ipnet_free_if(ipnetif);
1724 	else
1725 		mutex_exit(&ipnetif->if_reflock);
1726 }
1727 
1728 static void
1729 ipnet_walkers_inc(ipnet_stack_t *ips)
1730 {
1731 	mutex_enter(&ips->ips_walkers_lock);
1732 	ips->ips_walkers_cnt++;
1733 	mutex_exit(&ips->ips_walkers_lock);
1734 }
1735 
1736 static void
1737 ipnet_walkers_dec(ipnet_stack_t *ips)
1738 {
1739 	mutex_enter(&ips->ips_walkers_lock);
1740 	ASSERT(ips->ips_walkers_cnt != 0);
1741 	if (--ips->ips_walkers_cnt == 0)
1742 		cv_broadcast(&ips->ips_walkers_cv);
1743 	mutex_exit(&ips->ips_walkers_lock);
1744 }
1745