xref: /illumos-gate/usr/src/uts/common/inet/ipnet/ipnet.c (revision 8c69cc8fbe729fa7b091e901c4b50508ccc6bb33)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  * Copyright (c) 2016 by Delphix. All rights reserved.
26  */
27 
28 /*
29  * Copyright (c) 2016, Joyent, Inc. All rights reserved.
30  */
31 
32 /*
33  * The ipnet device defined here provides access to packets at the IP layer. To
34  * provide access to packets at this layer it registers a callback function in
35  * the ip module and when there are open instances of the device ip will pass
36  * packets into the device. Packets from ip are passed on the input, output and
37  * loopback paths. Internally the module returns to ip as soon as possible by
38  * deferring processing using a taskq.
39  *
40  * Management of the devices in /dev/ipnet/ is handled by the devname
41  * filesystem and use of the neti interfaces.  This module registers for NIC
42  * events using the neti framework so that when IP interfaces are bought up,
43  * taken down etc. the ipnet module is notified and its view of the interfaces
44  * configured on the system adjusted.  On attach, the module gets an initial
45  * view of the system again using the neti framework but as it has already
46  * registered for IP interface events, it is still up-to-date with any changes.
47  */
48 
49 #include <sys/types.h>
50 #include <sys/conf.h>
51 #include <sys/cred.h>
52 #include <sys/stat.h>
53 #include <sys/ddi.h>
54 #include <sys/sunddi.h>
55 #include <sys/modctl.h>
56 #include <sys/dlpi.h>
57 #include <sys/strsun.h>
58 #include <sys/id_space.h>
59 #include <sys/kmem.h>
60 #include <sys/mkdev.h>
61 #include <sys/neti.h>
62 #include <net/if.h>
63 #include <sys/errno.h>
64 #include <sys/list.h>
65 #include <sys/ksynch.h>
66 #include <sys/hook_event.h>
67 #include <sys/sdt.h>
68 #include <sys/stropts.h>
69 #include <sys/sysmacros.h>
70 #include <inet/ip.h>
71 #include <inet/ip_if.h>
72 #include <inet/ip_multi.h>
73 #include <inet/ip6.h>
74 #include <inet/ipnet.h>
75 #include <net/bpf.h>
76 #include <net/bpfdesc.h>
77 #include <net/dlt.h>
78 
79 static struct module_info ipnet_minfo = {
80 	1,		/* mi_idnum */
81 	"ipnet",	/* mi_idname */
82 	0,		/* mi_minpsz */
83 	INFPSZ,		/* mi_maxpsz */
84 	2048,		/* mi_hiwat */
85 	0		/* mi_lowat */
86 };
87 
88 /*
89  * List to hold static view of ipnetif_t's on the system. This is needed to
90  * avoid holding the lock protecting the avl tree of ipnetif's over the
91  * callback into the dev filesystem.
92  */
93 typedef struct ipnetif_cbdata {
94 	char		ic_ifname[LIFNAMSIZ];
95 	dev_t		ic_dev;
96 	list_node_t	ic_next;
97 } ipnetif_cbdata_t;
98 
/*
 * Classification used by ipnet_accept(): how a given ipnet_addrp_t relates
 * to a single ipnet_t client stream.
 */
typedef enum {
	IPNETADDR_MYADDR = 0,	/* an address on my ipnetif_t. */
	IPNETADDR_MBCAST = 1,	/* a multicast or broadcast address. */
	IPNETADDR_UNKNOWN = 2	/* none of the above. */
} ipnet_addrtype_t;
109 
110 /* Argument used for the ipnet_nicevent_taskq callback. */
111 typedef struct ipnet_nicevent_s {
112 	nic_event_t		ipne_event;
113 	net_handle_t		ipne_protocol;
114 	netstackid_t		ipne_stackid;
115 	uint64_t		ipne_ifindex;
116 	uint64_t		ipne_lifindex;
117 	char			ipne_ifname[LIFNAMSIZ];
118 } ipnet_nicevent_t;
119 
120 static dev_info_t	*ipnet_dip;
121 static major_t		ipnet_major;
122 static ddi_taskq_t	*ipnet_taskq;		/* taskq for packets */
123 static ddi_taskq_t	*ipnet_nicevent_taskq;	/* taskq for NIC events */
124 static id_space_t	*ipnet_minor_space;
125 static const int	IPNET_MINOR_LO = 1; 	/* minor number for /dev/lo0 */
126 static const int 	IPNET_MINOR_MIN = 2; 	/* start of dynamic minors */
127 static dl_info_ack_t	ipnet_infoack = IPNET_INFO_ACK_INIT;
128 static ipnet_acceptfn_t	ipnet_accept, ipnet_loaccept;
129 static bpf_itap_fn_t	ipnet_itap;
130 
131 static void	ipnet_input(mblk_t *);
132 static int	ipnet_wput(queue_t *, mblk_t *);
133 static int	ipnet_rsrv(queue_t *);
134 static int	ipnet_open(queue_t *, dev_t *, int, int, cred_t *);
135 static int	ipnet_close(queue_t *);
136 static void	ipnet_ioctl(queue_t *, mblk_t *);
137 static void	ipnet_iocdata(queue_t *, mblk_t *);
138 static void 	ipnet_wputnondata(queue_t *, mblk_t *);
139 static int	ipnet_attach(dev_info_t *, ddi_attach_cmd_t);
140 static int	ipnet_detach(dev_info_t *, ddi_detach_cmd_t);
141 static int	ipnet_devinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
142 static void	ipnet_inforeq(queue_t *q, mblk_t *mp);
143 static void	ipnet_bindreq(queue_t *q, mblk_t *mp);
144 static void	ipnet_unbindreq(queue_t *q, mblk_t *mp);
145 static void	ipnet_dlpromisconreq(queue_t *q, mblk_t *mp);
146 static void	ipnet_dlpromiscoffreq(queue_t *q, mblk_t *mp);
147 static int	ipnet_join_allmulti(ipnetif_t *, ipnet_stack_t *);
148 static void	ipnet_leave_allmulti(ipnetif_t *, ipnet_stack_t *);
149 static int	ipnet_nicevent_cb(hook_event_token_t, hook_data_t, void *);
150 static void	ipnet_nicevent_task(void *);
151 static ipnetif_t *ipnetif_create(const char *, uint64_t, ipnet_stack_t *,
152     uint64_t);
153 static void	ipnetif_remove(ipnetif_t *, ipnet_stack_t *);
154 static ipnetif_addr_t *ipnet_match_lif(ipnetif_t *, lif_if_t, boolean_t);
155 static ipnetif_t *ipnetif_getby_index(uint64_t, ipnet_stack_t *);
156 static ipnetif_t *ipnetif_getby_dev(dev_t, ipnet_stack_t *);
157 static boolean_t ipnetif_in_zone(ipnetif_t *, zoneid_t, ipnet_stack_t *);
158 static void	ipnetif_zonecheck(ipnetif_t *, ipnet_stack_t *);
159 static int	ipnet_populate_if(net_handle_t, ipnet_stack_t *, boolean_t);
160 static int 	ipnetif_compare_name(const void *, const void *);
161 static int 	ipnetif_compare_name_zone(const void *, const void *);
162 static int 	ipnetif_compare_index(const void *, const void *);
163 static void	ipnet_add_ifaddr(uint64_t, ipnetif_t *, net_handle_t);
164 static void	ipnet_delete_ifaddr(ipnetif_addr_t *, ipnetif_t *, boolean_t);
165 static void	ipnetif_refhold(ipnetif_t *);
166 static void	ipnetif_refrele(ipnetif_t *);
167 static void	ipnet_walkers_inc(ipnet_stack_t *);
168 static void	ipnet_walkers_dec(ipnet_stack_t *);
169 static void	ipnet_register_netihook(ipnet_stack_t *);
170 static void	*ipnet_stack_init(netstackid_t, netstack_t *);
171 static void	ipnet_stack_fini(netstackid_t, void *);
172 static void	ipnet_dispatch(void *);
173 static int	ipobs_bounce_func(hook_event_token_t, hook_data_t, void *);
174 static int	ipnet_bpf_bounce(hook_event_token_t, hook_data_t, void *);
175 static ipnetif_t *ipnetif_clone_create(ipnetif_t *, zoneid_t);
176 static void	ipnetif_clone_release(ipnetif_t *);
177 
178 static struct qinit ipnet_rinit = {
179 	NULL,		/* qi_putp */
180 	ipnet_rsrv,	/* qi_srvp */
181 	ipnet_open,	/* qi_qopen */
182 	ipnet_close,	/* qi_qclose */
183 	NULL,		/* qi_qadmin */
184 	&ipnet_minfo,	/* qi_minfo */
185 };
186 
187 static struct qinit ipnet_winit = {
188 	ipnet_wput,	/* qi_putp */
189 	NULL,		/* qi_srvp */
190 	NULL,		/* qi_qopen */
191 	NULL,		/* qi_qclose */
192 	NULL,		/* qi_qadmin */
193 	&ipnet_minfo,	/* qi_minfo */
194 };
195 
196 static struct streamtab ipnet_info = {
197 	&ipnet_rinit, &ipnet_winit
198 };
199 
200 DDI_DEFINE_STREAM_OPS(ipnet_ops, nulldev, nulldev, ipnet_attach,
201     ipnet_detach, nodev, ipnet_devinfo, D_MP | D_MTPERMOD, &ipnet_info,
202     ddi_quiesce_not_supported);
203 
204 static struct modldrv modldrv = {
205 	&mod_driverops,
206 	"STREAMS ipnet driver",
207 	&ipnet_ops
208 };
209 
210 static struct modlinkage modlinkage = {
211 	MODREV_1, &modldrv, NULL
212 };
213 
214 /*
215  * This structure contains the template data (names and type) that is
216  * copied, in bulk, into the new kstats structure created by net_kstat_create.
217  * No actual statistical information is stored in this instance of the
218  * ipnet_kstats_t structure.
219  */
220 static ipnet_kstats_t stats_template = {
221 	{ "duplicationFail",	KSTAT_DATA_UINT64 },
222 	{ "dispatchOk",		KSTAT_DATA_UINT64 },
223 	{ "dispatchFail",	KSTAT_DATA_UINT64 },
224 	{ "dispatchHeaderDrop",	KSTAT_DATA_UINT64 },
225 	{ "dispatchDupDrop",	KSTAT_DATA_UINT64 },
226 	{ "dispatchDeliver",	KSTAT_DATA_UINT64 },
227 	{ "acceptOk",		KSTAT_DATA_UINT64 },
228 	{ "acceptFail",		KSTAT_DATA_UINT64 }
229 };
230 
231 /*
232  * Walk the list of physical interfaces on the machine, for each
233  * interface create a new ipnetif_t and add any addresses to it. We
234  * need to do the walk twice, once for IPv4 and once for IPv6.
235  *
236  * The interfaces are destroyed as part of ipnet_stack_fini() for each
237  * stack.  Note that we cannot do this initialization in
238  * ipnet_stack_init(), since ipnet_stack_init() cannot fail.
239  */
240 static int
241 ipnetif_init(void)
242 {
243 	netstack_handle_t	nh;
244 	netstack_t		*ns;
245 	ipnet_stack_t		*ips;
246 	int			ret = 0;
247 
248 	netstack_next_init(&nh);
249 	while ((ns = netstack_next(&nh)) != NULL) {
250 		ips = ns->netstack_ipnet;
251 		if ((ret = ipnet_populate_if(ips->ips_ndv4, ips, B_FALSE)) == 0)
252 			ret = ipnet_populate_if(ips->ips_ndv6, ips, B_TRUE);
253 		netstack_rele(ns);
254 		if (ret != 0)
255 			break;
256 	}
257 	netstack_next_fini(&nh);
258 	return (ret);
259 }
260 
261 /*
262  * Standard module entry points.
263  */
264 int
265 _init(void)
266 {
267 	int		ret;
268 	boolean_t	netstack_registered = B_FALSE;
269 
270 	if ((ipnet_major = ddi_name_to_major("ipnet")) == (major_t)-1)
271 		return (ENODEV);
272 	ipnet_minor_space = id_space_create("ipnet_minor_space",
273 	    IPNET_MINOR_MIN, MAXMIN32);
274 
275 	/*
276 	 * We call ddi_taskq_create() with nthread == 1 to ensure in-order
277 	 * delivery of packets to clients.  Note that we need to create the
278 	 * taskqs before calling netstack_register() since ipnet_stack_init()
279 	 * registers callbacks that use 'em.
280 	 */
281 	ipnet_taskq = ddi_taskq_create(NULL, "ipnet", 1, TASKQ_DEFAULTPRI, 0);
282 	ipnet_nicevent_taskq = ddi_taskq_create(NULL, "ipnet_nic_event_queue",
283 	    1, TASKQ_DEFAULTPRI, 0);
284 	if (ipnet_taskq == NULL || ipnet_nicevent_taskq == NULL) {
285 		ret = ENOMEM;
286 		goto done;
287 	}
288 
289 	netstack_register(NS_IPNET, ipnet_stack_init, NULL, ipnet_stack_fini);
290 	netstack_registered = B_TRUE;
291 
292 	if ((ret = ipnetif_init()) == 0)
293 		ret = mod_install(&modlinkage);
294 done:
295 	if (ret != 0) {
296 		if (ipnet_taskq != NULL)
297 			ddi_taskq_destroy(ipnet_taskq);
298 		if (ipnet_nicevent_taskq != NULL)
299 			ddi_taskq_destroy(ipnet_nicevent_taskq);
300 		if (netstack_registered)
301 			netstack_unregister(NS_IPNET);
302 		id_space_destroy(ipnet_minor_space);
303 	}
304 	return (ret);
305 }
306 
307 int
308 _fini(void)
309 {
310 	int	err;
311 
312 	if ((err = mod_remove(&modlinkage)) != 0)
313 		return (err);
314 
315 	netstack_unregister(NS_IPNET);
316 	ddi_taskq_destroy(ipnet_nicevent_taskq);
317 	ddi_taskq_destroy(ipnet_taskq);
318 	id_space_destroy(ipnet_minor_space);
319 	return (0);
320 }
321 
322 int
323 _info(struct modinfo *modinfop)
324 {
325 	return (mod_info(&modlinkage, modinfop));
326 }
327 
328 static void
329 ipnet_register_netihook(ipnet_stack_t *ips)
330 {
331 	int		ret;
332 	zoneid_t	zoneid;
333 	netid_t		netid;
334 
335 	HOOK_INIT(ips->ips_nicevents, ipnet_nicevent_cb, "ipnet_nicevents",
336 	    ips);
337 
338 	/*
339 	 * It is possible for an exclusive stack to be in the process of
340 	 * shutting down here, and the netid and protocol lookups could fail
341 	 * in that case.
342 	 */
343 	zoneid = netstackid_to_zoneid(ips->ips_netstack->netstack_stackid);
344 	if ((netid = net_zoneidtonetid(zoneid)) == -1)
345 		return;
346 
347 	if ((ips->ips_ndv4 = net_protocol_lookup(netid, NHF_INET)) != NULL) {
348 		if ((ret = net_hook_register(ips->ips_ndv4, NH_NIC_EVENTS,
349 		    ips->ips_nicevents)) != 0) {
350 			VERIFY(net_protocol_release(ips->ips_ndv4) == 0);
351 			ips->ips_ndv4 = NULL;
352 			cmn_err(CE_WARN, "unable to register IPv4 netinfo hooks"
353 			    " in zone %d: %d", zoneid, ret);
354 		}
355 	}
356 	if ((ips->ips_ndv6 = net_protocol_lookup(netid, NHF_INET6)) != NULL) {
357 		if ((ret = net_hook_register(ips->ips_ndv6, NH_NIC_EVENTS,
358 		    ips->ips_nicevents)) != 0) {
359 			VERIFY(net_protocol_release(ips->ips_ndv6) == 0);
360 			ips->ips_ndv6 = NULL;
361 			cmn_err(CE_WARN, "unable to register IPv6 netinfo hooks"
362 			    " in zone %d: %d", zoneid, ret);
363 		}
364 	}
365 
366 	/*
367 	 * Create a local set of kstats for each zone.
368 	 */
369 	ips->ips_kstatp = net_kstat_create(netid, "ipnet", 0, "ipnet_stats",
370 	    "misc", KSTAT_TYPE_NAMED,
371 	    sizeof (ipnet_kstats_t) / sizeof (kstat_named_t), 0);
372 	if (ips->ips_kstatp != NULL) {
373 		bcopy(&stats_template, &ips->ips_stats,
374 		    sizeof (ips->ips_stats));
375 		ips->ips_kstatp->ks_data = &ips->ips_stats;
376 		ips->ips_kstatp->ks_private =
377 		    (void *)(uintptr_t)ips->ips_netstack->netstack_stackid;
378 		kstat_install(ips->ips_kstatp);
379 	} else {
380 		cmn_err(CE_WARN, "net_kstat_create(%s,%s,%s) failed",
381 		    "ipnet", "ipnet_stats", "misc");
382 	}
383 }
384 
385 /*
386  * This function is called on attach to build an initial view of the
387  * interfaces on the system. It will be called once for IPv4 and once
388  * for IPv6, although there is only one ipnet interface for both IPv4
389  * and IPv6 there are separate address lists.
390  */
391 static int
392 ipnet_populate_if(net_handle_t nd, ipnet_stack_t *ips, boolean_t isv6)
393 {
394 	phy_if_t	phyif;
395 	lif_if_t	lif;
396 	ipnetif_t	*ipnetif;
397 	char		name[LIFNAMSIZ];
398 	boolean_t	new_if = B_FALSE;
399 	uint64_t	ifflags;
400 	int		ret = 0;
401 
402 	/*
403 	 * If ipnet_register_netihook() was unable to initialize this
404 	 * stack's net_handle_t, then we cannot populate any interface
405 	 * information.  This usually happens when we attempted to
406 	 * grab a net_handle_t as a stack was shutting down.  We don't
407 	 * want to fail the entire _init() operation because of a
408 	 * stack shutdown (other stacks will continue to work just
409 	 * fine), so we silently return success here.
410 	 */
411 	if (nd == NULL)
412 		return (0);
413 
414 	/*
415 	 * Make sure we're not processing NIC events during the
416 	 * population of our interfaces and address lists.
417 	 */
418 	mutex_enter(&ips->ips_event_lock);
419 
420 	for (phyif = net_phygetnext(nd, 0); phyif != 0;
421 	    phyif = net_phygetnext(nd, phyif)) {
422 		if (net_getifname(nd, phyif, name, LIFNAMSIZ) != 0)
423 			continue;
424 		ifflags =  0;
425 		(void) net_getlifflags(nd, phyif, 0, &ifflags);
426 		if ((ipnetif = ipnetif_getby_index(phyif, ips)) == NULL) {
427 			ipnetif = ipnetif_create(name, phyif, ips, ifflags);
428 			if (ipnetif == NULL) {
429 				ret = ENOMEM;
430 				goto done;
431 			}
432 			new_if = B_TRUE;
433 		}
434 		ipnetif->if_flags |=
435 		    isv6 ? IPNETIF_IPV6PLUMBED : IPNETIF_IPV4PLUMBED;
436 
437 		for (lif = net_lifgetnext(nd, phyif, 0); lif != 0;
438 		    lif = net_lifgetnext(nd, phyif, lif)) {
439 			/*
440 			 * Skip addresses that aren't up.  We'll add
441 			 * them when we receive an NE_LIF_UP event.
442 			 */
443 			if (net_getlifflags(nd, phyif, lif, &ifflags) != 0 ||
444 			    !(ifflags & IFF_UP))
445 				continue;
446 			/* Don't add it if we already have it. */
447 			if (ipnet_match_lif(ipnetif, lif, isv6) != NULL)
448 				continue;
449 			ipnet_add_ifaddr(lif, ipnetif, nd);
450 		}
451 		if (!new_if)
452 			ipnetif_refrele(ipnetif);
453 	}
454 
455 done:
456 	mutex_exit(&ips->ips_event_lock);
457 	return (ret);
458 }
459 
460 static int
461 ipnet_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
462 {
463 	if (cmd != DDI_ATTACH)
464 		return (DDI_FAILURE);
465 
466 	if (ddi_create_minor_node(dip, "lo0", S_IFCHR, IPNET_MINOR_LO,
467 	    DDI_PSEUDO, 0) == DDI_FAILURE)
468 		return (DDI_FAILURE);
469 
470 	ipnet_dip = dip;
471 	return (DDI_SUCCESS);
472 }
473 
474 static int
475 ipnet_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
476 {
477 	if (cmd != DDI_DETACH)
478 		return (DDI_FAILURE);
479 
480 	ASSERT(dip == ipnet_dip);
481 	ddi_remove_minor_node(ipnet_dip, NULL);
482 	ipnet_dip = NULL;
483 	return (DDI_SUCCESS);
484 }
485 
486 /* ARGSUSED */
487 static int
488 ipnet_devinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
489 {
490 	int	error = DDI_FAILURE;
491 
492 	switch (infocmd) {
493 	case DDI_INFO_DEVT2INSTANCE:
494 		*result = (void *)0;
495 		error = DDI_SUCCESS;
496 		break;
497 	case DDI_INFO_DEVT2DEVINFO:
498 		if (ipnet_dip != NULL) {
499 			*result = ipnet_dip;
500 			error = DDI_SUCCESS;
501 		}
502 		break;
503 	}
504 	return (error);
505 }
506 
507 /* ARGSUSED */
508 static int
509 ipnet_open(queue_t *rq, dev_t *dev, int oflag, int sflag, cred_t *crp)
510 {
511 	ipnet_t		*ipnet;
512 	netstack_t	*ns = NULL;
513 	ipnet_stack_t	*ips;
514 	int		err = 0;
515 	zoneid_t	zoneid = crgetzoneid(crp);
516 
517 	/*
518 	 * If the system is labeled, only the global zone is allowed to open
519 	 * IP observability nodes.
520 	 */
521 	if (is_system_labeled() && zoneid != GLOBAL_ZONEID)
522 		return (EACCES);
523 
524 	/* We don't support open as a module */
525 	if (sflag & MODOPEN)
526 		return (ENOTSUP);
527 
528 	/* This driver is self-cloning, we don't support re-open. */
529 	if (rq->q_ptr != NULL)
530 		return (EBUSY);
531 
532 	if ((ipnet = kmem_zalloc(sizeof (*ipnet), KM_NOSLEEP)) == NULL)
533 		return (ENOMEM);
534 
535 	VERIFY((ns = netstack_find_by_cred(crp)) != NULL);
536 	ips = ns->netstack_ipnet;
537 
538 	rq->q_ptr = WR(rq)->q_ptr = ipnet;
539 	ipnet->ipnet_rq = rq;
540 	ipnet->ipnet_minor = (minor_t)id_alloc(ipnet_minor_space);
541 	ipnet->ipnet_zoneid = zoneid;
542 	ipnet->ipnet_dlstate = DL_UNBOUND;
543 	ipnet->ipnet_ns = ns;
544 
545 	/*
546 	 * We need to hold ips_event_lock here as any NE_LIF_DOWN events need
547 	 * to be processed after ipnet_if is set and the ipnet_t has been
548 	 * inserted in the ips_str_list.
549 	 */
550 	mutex_enter(&ips->ips_event_lock);
551 	if (getminor(*dev) == IPNET_MINOR_LO) {
552 		ipnet->ipnet_flags |= IPNET_LOMODE;
553 		ipnet->ipnet_acceptfn = ipnet_loaccept;
554 	} else {
555 		ipnet->ipnet_acceptfn = ipnet_accept;
556 		ipnet->ipnet_if = ipnetif_getby_dev(*dev, ips);
557 		if (ipnet->ipnet_if == NULL ||
558 		    !ipnetif_in_zone(ipnet->ipnet_if, zoneid, ips)) {
559 			err = ENODEV;
560 			goto done;
561 		}
562 	}
563 
564 	mutex_enter(&ips->ips_walkers_lock);
565 	while (ips->ips_walkers_cnt != 0)
566 		cv_wait(&ips->ips_walkers_cv, &ips->ips_walkers_lock);
567 	list_insert_head(&ips->ips_str_list, ipnet);
568 	*dev = makedevice(getmajor(*dev), ipnet->ipnet_minor);
569 	qprocson(rq);
570 
571 	/*
572 	 * Only register our callback if we're the first open client; we call
573 	 * unregister in close() for the last open client.
574 	 */
575 	if (list_head(&ips->ips_str_list) == list_tail(&ips->ips_str_list))
576 		ips->ips_hook = ipobs_register_hook(ns, ipnet_input);
577 	mutex_exit(&ips->ips_walkers_lock);
578 
579 done:
580 	mutex_exit(&ips->ips_event_lock);
581 	if (err != 0) {
582 		netstack_rele(ns);
583 		id_free(ipnet_minor_space, ipnet->ipnet_minor);
584 		if (ipnet->ipnet_if != NULL)
585 			ipnetif_refrele(ipnet->ipnet_if);
586 		kmem_free(ipnet, sizeof (*ipnet));
587 	}
588 	return (err);
589 }
590 
591 static int
592 ipnet_close(queue_t *rq)
593 {
594 	ipnet_t		*ipnet = rq->q_ptr;
595 	ipnet_stack_t	*ips = ipnet->ipnet_ns->netstack_ipnet;
596 
597 	if (ipnet->ipnet_flags & IPNET_PROMISC_PHYS)
598 		ipnet_leave_allmulti(ipnet->ipnet_if, ips);
599 	if (ipnet->ipnet_flags & IPNET_PROMISC_MULTI)
600 		ipnet_leave_allmulti(ipnet->ipnet_if, ips);
601 
602 	mutex_enter(&ips->ips_walkers_lock);
603 	while (ips->ips_walkers_cnt != 0)
604 		cv_wait(&ips->ips_walkers_cv, &ips->ips_walkers_lock);
605 
606 	qprocsoff(rq);
607 
608 	list_remove(&ips->ips_str_list, ipnet);
609 	if (ipnet->ipnet_if != NULL)
610 		ipnetif_refrele(ipnet->ipnet_if);
611 	id_free(ipnet_minor_space, ipnet->ipnet_minor);
612 
613 	if (list_is_empty(&ips->ips_str_list)) {
614 		ipobs_unregister_hook(ips->ips_netstack, ips->ips_hook);
615 		ips->ips_hook = NULL;
616 	}
617 
618 	kmem_free(ipnet, sizeof (*ipnet));
619 
620 	mutex_exit(&ips->ips_walkers_lock);
621 	netstack_rele(ips->ips_netstack);
622 	return (0);
623 }
624 
625 static int
626 ipnet_wput(queue_t *q, mblk_t *mp)
627 {
628 	switch (mp->b_datap->db_type) {
629 	case M_FLUSH:
630 		if (*mp->b_rptr & FLUSHW) {
631 			flushq(q, FLUSHDATA);
632 			*mp->b_rptr &= ~FLUSHW;
633 		}
634 		if (*mp->b_rptr & FLUSHR)
635 			qreply(q, mp);
636 		else
637 			freemsg(mp);
638 		break;
639 	case M_PROTO:
640 	case M_PCPROTO:
641 		ipnet_wputnondata(q, mp);
642 		break;
643 	case M_IOCTL:
644 		ipnet_ioctl(q, mp);
645 		break;
646 	case M_IOCDATA:
647 		ipnet_iocdata(q, mp);
648 		break;
649 	default:
650 		freemsg(mp);
651 		break;
652 	}
653 	return (0);
654 }
655 
656 static int
657 ipnet_rsrv(queue_t *q)
658 {
659 	mblk_t	*mp;
660 
661 	while ((mp = getq(q)) != NULL) {
662 		ASSERT(DB_TYPE(mp) == M_DATA);
663 		if (canputnext(q)) {
664 			putnext(q, mp);
665 		} else {
666 			(void) putbq(q, mp);
667 			break;
668 		}
669 	}
670 	return (0);
671 }
672 
673 static void
674 ipnet_ioctl(queue_t *q, mblk_t *mp)
675 {
676 	struct iocblk	*iocp = (struct iocblk *)mp->b_rptr;
677 
678 	switch (iocp->ioc_cmd) {
679 	case DLIOCRAW:
680 		miocack(q, mp, 0, 0);
681 		break;
682 	case DLIOCIPNETINFO:
683 		if (iocp->ioc_count == TRANSPARENT) {
684 			mcopyin(mp, NULL, sizeof (uint_t), NULL);
685 			qreply(q, mp);
686 			break;
687 		}
688 		/* Fallthrough, we don't support I_STR with DLIOCIPNETINFO. */
689 	default:
690 		miocnak(q, mp, 0, EINVAL);
691 		break;
692 	}
693 }
694 
695 static void
696 ipnet_iocdata(queue_t *q, mblk_t *mp)
697 {
698 	struct iocblk	*iocp = (struct iocblk *)mp->b_rptr;
699 	ipnet_t	*ipnet = q->q_ptr;
700 
701 	switch (iocp->ioc_cmd) {
702 	case DLIOCIPNETINFO:
703 		if (*(int *)mp->b_cont->b_rptr == 1)
704 			ipnet->ipnet_flags |= IPNET_INFO;
705 		else if (*(int *)mp->b_cont->b_rptr == 0)
706 			ipnet->ipnet_flags &= ~IPNET_INFO;
707 		else
708 			goto iocnak;
709 		miocack(q, mp, 0, DL_IPNETINFO_VERSION);
710 		break;
711 	default:
712 iocnak:
713 		miocnak(q, mp, 0, EINVAL);
714 		break;
715 	}
716 }
717 
718 static void
719 ipnet_wputnondata(queue_t *q, mblk_t *mp)
720 {
721 	union DL_primitives	*dlp = (union DL_primitives *)mp->b_rptr;
722 	t_uscalar_t		prim = dlp->dl_primitive;
723 
724 	switch (prim) {
725 	case DL_INFO_REQ:
726 		ipnet_inforeq(q, mp);
727 		break;
728 	case DL_UNBIND_REQ:
729 		ipnet_unbindreq(q, mp);
730 		break;
731 	case DL_BIND_REQ:
732 		ipnet_bindreq(q, mp);
733 		break;
734 	case DL_PROMISCON_REQ:
735 		ipnet_dlpromisconreq(q, mp);
736 		break;
737 	case DL_PROMISCOFF_REQ:
738 		ipnet_dlpromiscoffreq(q, mp);
739 		break;
740 	case DL_UNITDATA_REQ:
741 	case DL_DETACH_REQ:
742 	case DL_PHYS_ADDR_REQ:
743 	case DL_SET_PHYS_ADDR_REQ:
744 	case DL_ENABMULTI_REQ:
745 	case DL_DISABMULTI_REQ:
746 	case DL_ATTACH_REQ:
747 		dlerrorack(q, mp, prim, DL_UNSUPPORTED, 0);
748 		break;
749 	default:
750 		dlerrorack(q, mp, prim, DL_BADPRIM, 0);
751 		break;
752 	}
753 }
754 
755 static void
756 ipnet_inforeq(queue_t *q, mblk_t *mp)
757 {
758 	dl_info_ack_t	*dlip;
759 	size_t		size = sizeof (dl_info_ack_t) + sizeof (ushort_t);
760 
761 	if (MBLKL(mp) < DL_INFO_REQ_SIZE) {
762 		dlerrorack(q, mp, DL_INFO_REQ, DL_BADPRIM, 0);
763 		return;
764 	}
765 
766 	if ((mp = mexchange(q, mp, size, M_PCPROTO, DL_INFO_ACK)) == NULL)
767 		return;
768 
769 	dlip = (dl_info_ack_t *)mp->b_rptr;
770 	*dlip = ipnet_infoack;
771 	qreply(q, mp);
772 }
773 
774 static void
775 ipnet_bindreq(queue_t *q, mblk_t *mp)
776 {
777 	union DL_primitives	*dlp = (union DL_primitives *)mp->b_rptr;
778 	ipnet_t			*ipnet = q->q_ptr;
779 
780 	if (MBLKL(mp) < DL_BIND_REQ_SIZE) {
781 		dlerrorack(q, mp, DL_BIND_REQ, DL_BADPRIM, 0);
782 		return;
783 	}
784 
785 	switch (dlp->bind_req.dl_sap) {
786 	case 0 :
787 		ipnet->ipnet_family = AF_UNSPEC;
788 		break;
789 	case IPV4_VERSION :
790 		ipnet->ipnet_family = AF_INET;
791 		break;
792 	case IPV6_VERSION :
793 		ipnet->ipnet_family = AF_INET6;
794 		break;
795 	default :
796 		dlerrorack(q, mp, DL_BIND_REQ, DL_BADSAP, 0);
797 		return;
798 		/*NOTREACHED*/
799 	}
800 
801 	ipnet->ipnet_dlstate = DL_IDLE;
802 	dlbindack(q, mp, dlp->bind_req.dl_sap, 0, 0, 0, 0);
803 }
804 
805 static void
806 ipnet_unbindreq(queue_t *q, mblk_t *mp)
807 {
808 	ipnet_t	*ipnet = q->q_ptr;
809 
810 	if (MBLKL(mp) < DL_UNBIND_REQ_SIZE) {
811 		dlerrorack(q, mp, DL_UNBIND_REQ, DL_BADPRIM, 0);
812 		return;
813 	}
814 
815 	if (ipnet->ipnet_dlstate != DL_IDLE) {
816 		dlerrorack(q, mp, DL_UNBIND_REQ, DL_OUTSTATE, 0);
817 	} else {
818 		ipnet->ipnet_dlstate = DL_UNBOUND;
819 		ipnet->ipnet_family = AF_UNSPEC;
820 		dlokack(q, mp, DL_UNBIND_REQ);
821 	}
822 }
823 
824 static void
825 ipnet_dlpromisconreq(queue_t *q, mblk_t *mp)
826 {
827 	ipnet_t		*ipnet = q->q_ptr;
828 	t_uscalar_t	level;
829 	int		err;
830 
831 	if (MBLKL(mp) < DL_PROMISCON_REQ_SIZE) {
832 		dlerrorack(q, mp, DL_PROMISCON_REQ, DL_BADPRIM, 0);
833 		return;
834 	}
835 
836 	if (ipnet->ipnet_flags & IPNET_LOMODE) {
837 		dlokack(q, mp, DL_PROMISCON_REQ);
838 		return;
839 	}
840 
841 	level = ((dl_promiscon_req_t *)mp->b_rptr)->dl_level;
842 	if (level == DL_PROMISC_PHYS || level == DL_PROMISC_MULTI) {
843 		if ((err = ipnet_join_allmulti(ipnet->ipnet_if,
844 		    ipnet->ipnet_ns->netstack_ipnet)) != 0) {
845 			dlerrorack(q, mp, DL_PROMISCON_REQ, DL_SYSERR, err);
846 			return;
847 		}
848 	}
849 
850 	switch (level) {
851 	case DL_PROMISC_PHYS:
852 		ipnet->ipnet_flags |= IPNET_PROMISC_PHYS;
853 		break;
854 	case DL_PROMISC_SAP:
855 		ipnet->ipnet_flags |= IPNET_PROMISC_SAP;
856 		break;
857 	case DL_PROMISC_MULTI:
858 		ipnet->ipnet_flags |= IPNET_PROMISC_MULTI;
859 		break;
860 	default:
861 		dlerrorack(q, mp, DL_PROMISCON_REQ, DL_BADPRIM, 0);
862 		return;
863 	}
864 
865 	dlokack(q, mp, DL_PROMISCON_REQ);
866 }
867 
868 static void
869 ipnet_dlpromiscoffreq(queue_t *q, mblk_t *mp)
870 {
871 	ipnet_t		*ipnet = q->q_ptr;
872 	t_uscalar_t	level;
873 	uint16_t	orig_ipnet_flags = ipnet->ipnet_flags;
874 
875 	if (MBLKL(mp) < DL_PROMISCOFF_REQ_SIZE) {
876 		dlerrorack(q, mp, DL_PROMISCOFF_REQ, DL_BADPRIM, 0);
877 		return;
878 	}
879 
880 	if (ipnet->ipnet_flags & IPNET_LOMODE) {
881 		dlokack(q, mp, DL_PROMISCOFF_REQ);
882 		return;
883 	}
884 
885 	level = ((dl_promiscon_req_t *)mp->b_rptr)->dl_level;
886 	switch (level) {
887 	case DL_PROMISC_PHYS:
888 		if (ipnet->ipnet_flags & IPNET_PROMISC_PHYS)
889 			ipnet->ipnet_flags &= ~IPNET_PROMISC_PHYS;
890 		break;
891 	case DL_PROMISC_SAP:
892 		if (ipnet->ipnet_flags & IPNET_PROMISC_SAP)
893 			ipnet->ipnet_flags &= ~IPNET_PROMISC_SAP;
894 		break;
895 	case DL_PROMISC_MULTI:
896 		if (ipnet->ipnet_flags & IPNET_PROMISC_MULTI)
897 			ipnet->ipnet_flags &= ~IPNET_PROMISC_MULTI;
898 		break;
899 	default:
900 		dlerrorack(q, mp, DL_PROMISCOFF_REQ, DL_BADPRIM, 0);
901 		return;
902 	}
903 
904 	if (orig_ipnet_flags == ipnet->ipnet_flags) {
905 		dlerrorack(q, mp, DL_PROMISCOFF_REQ, DL_NOTENAB, 0);
906 		return;
907 	}
908 
909 	if (level == DL_PROMISC_PHYS || level == DL_PROMISC_MULTI) {
910 		ipnet_leave_allmulti(ipnet->ipnet_if,
911 		    ipnet->ipnet_ns->netstack_ipnet);
912 	}
913 
914 	dlokack(q, mp, DL_PROMISCOFF_REQ);
915 }
916 
917 static int
918 ipnet_join_allmulti(ipnetif_t *ipnetif, ipnet_stack_t *ips)
919 {
920 	int		err = 0;
921 	ip_stack_t	*ipst = ips->ips_netstack->netstack_ip;
922 	uint64_t	index = ipnetif->if_index;
923 
924 	mutex_enter(&ips->ips_event_lock);
925 	if (ipnetif->if_multicnt == 0) {
926 		ASSERT((ipnetif->if_flags &
927 		    (IPNETIF_IPV4ALLMULTI | IPNETIF_IPV6ALLMULTI)) == 0);
928 		if (ipnetif->if_flags & IPNETIF_IPV4PLUMBED) {
929 			err = ip_join_allmulti(index, B_FALSE, ipst);
930 			if (err != 0)
931 				goto done;
932 			ipnetif->if_flags |= IPNETIF_IPV4ALLMULTI;
933 		}
934 		if (ipnetif->if_flags & IPNETIF_IPV6PLUMBED) {
935 			err = ip_join_allmulti(index, B_TRUE, ipst);
936 			if (err != 0 &&
937 			    (ipnetif->if_flags & IPNETIF_IPV4ALLMULTI)) {
938 				(void) ip_leave_allmulti(index, B_FALSE, ipst);
939 				ipnetif->if_flags &= ~IPNETIF_IPV4ALLMULTI;
940 				goto done;
941 			}
942 			ipnetif->if_flags |= IPNETIF_IPV6ALLMULTI;
943 		}
944 	}
945 	ipnetif->if_multicnt++;
946 
947 done:
948 	mutex_exit(&ips->ips_event_lock);
949 	return (err);
950 }
951 
952 static void
953 ipnet_leave_allmulti(ipnetif_t *ipnetif, ipnet_stack_t *ips)
954 {
955 	int		err;
956 	ip_stack_t	*ipst = ips->ips_netstack->netstack_ip;
957 	uint64_t	index = ipnetif->if_index;
958 
959 	mutex_enter(&ips->ips_event_lock);
960 	ASSERT(ipnetif->if_multicnt != 0);
961 	if (--ipnetif->if_multicnt == 0) {
962 		if (ipnetif->if_flags & IPNETIF_IPV4ALLMULTI) {
963 			err = ip_leave_allmulti(index, B_FALSE, ipst);
964 			ASSERT(err == 0 || err == ENODEV);
965 			ipnetif->if_flags &= ~IPNETIF_IPV4ALLMULTI;
966 		}
967 		if (ipnetif->if_flags & IPNETIF_IPV6ALLMULTI) {
968 			err = ip_leave_allmulti(index, B_TRUE, ipst);
969 			ASSERT(err == 0 || err == ENODEV);
970 			ipnetif->if_flags &= ~IPNETIF_IPV6ALLMULTI;
971 		}
972 	}
973 	mutex_exit(&ips->ips_event_lock);
974 }
975 
976 /*
977  * Allocate a new mblk_t and put a dl_ipnetinfo_t in it.
978  * The structure it copies the header information from,
979  * hook_pkt_observe_t, is constructed using network byte
980  * order in ipobs_hook(), so there is no conversion here.
981  */
982 static mblk_t *
983 ipnet_addheader(hook_pkt_observe_t *hdr, mblk_t *mp)
984 {
985 	mblk_t		*dlhdr;
986 	dl_ipnetinfo_t	*dl;
987 
988 	if ((dlhdr = allocb(sizeof (dl_ipnetinfo_t), BPRI_HI)) == NULL) {
989 		freemsg(mp);
990 		return (NULL);
991 	}
992 	dl = (dl_ipnetinfo_t *)dlhdr->b_rptr;
993 	dl->dli_version = DL_IPNETINFO_VERSION;
994 	dl->dli_family = hdr->hpo_family;
995 	dl->dli_htype = hdr->hpo_htype;
996 	dl->dli_pktlen = hdr->hpo_pktlen;
997 	dl->dli_ifindex = hdr->hpo_ifindex;
998 	dl->dli_grifindex = hdr->hpo_grifindex;
999 	dl->dli_zsrc = hdr->hpo_zsrc;
1000 	dl->dli_zdst = hdr->hpo_zdst;
1001 	dlhdr->b_wptr += sizeof (*dl);
1002 	dlhdr->b_cont = mp;
1003 
1004 	return (dlhdr);
1005 }
1006 
1007 static ipnet_addrtype_t
1008 ipnet_get_addrtype(ipnet_t *ipnet, ipnet_addrp_t *addr)
1009 {
1010 	list_t			*list;
1011 	ipnetif_t		*ipnetif = ipnet->ipnet_if;
1012 	ipnetif_addr_t		*ifaddr;
1013 	ipnet_addrtype_t	addrtype = IPNETADDR_UNKNOWN;
1014 
1015 	/* First check if the address is multicast or limited broadcast. */
1016 	switch (addr->iap_family) {
1017 	case AF_INET:
1018 		if (CLASSD(*(addr->iap_addr4)) ||
1019 		    *(addr->iap_addr4) == INADDR_BROADCAST)
1020 			return (IPNETADDR_MBCAST);
1021 		break;
1022 	case AF_INET6:
1023 		if (IN6_IS_ADDR_MULTICAST(addr->iap_addr6))
1024 			return (IPNETADDR_MBCAST);
1025 		break;
1026 	}
1027 
1028 	/*
1029 	 * Walk the address list to see if the address belongs to our
1030 	 * interface or is one of our subnet broadcast addresses.
1031 	 */
1032 	mutex_enter(&ipnetif->if_addr_lock);
1033 	list = (addr->iap_family == AF_INET) ?
1034 	    &ipnetif->if_ip4addr_list : &ipnetif->if_ip6addr_list;
1035 	for (ifaddr = list_head(list);
1036 	    ifaddr != NULL && addrtype == IPNETADDR_UNKNOWN;
1037 	    ifaddr = list_next(list, ifaddr)) {
1038 		/*
1039 		 * If we're not in the global zone, then only look at
1040 		 * addresses in our zone.
1041 		 */
1042 		if (ipnet->ipnet_zoneid != GLOBAL_ZONEID &&
1043 		    ipnet->ipnet_zoneid != ifaddr->ifa_zone)
1044 			continue;
1045 		switch (addr->iap_family) {
1046 		case AF_INET:
1047 			if (ifaddr->ifa_ip4addr != INADDR_ANY &&
1048 			    *(addr->iap_addr4) == ifaddr->ifa_ip4addr)
1049 				addrtype = IPNETADDR_MYADDR;
1050 			else if (ifaddr->ifa_brdaddr != INADDR_ANY &&
1051 			    *(addr->iap_addr4) == ifaddr->ifa_brdaddr)
1052 				addrtype = IPNETADDR_MBCAST;
1053 			break;
1054 		case AF_INET6:
1055 			if (IN6_ARE_ADDR_EQUAL(addr->iap_addr6,
1056 			    &ifaddr->ifa_ip6addr))
1057 				addrtype = IPNETADDR_MYADDR;
1058 			break;
1059 		}
1060 	}
1061 	mutex_exit(&ipnetif->if_addr_lock);
1062 
1063 	return (addrtype);
1064 }
1065 
1066 /*
1067  * Verify if the packet contained in hdr should be passed up to the
1068  * ipnet client stream.
1069  */
1070 static boolean_t
1071 ipnet_accept(ipnet_t *ipnet, hook_pkt_observe_t *hdr, ipnet_addrp_t *src,
1072     ipnet_addrp_t *dst)
1073 {
1074 	boolean_t		obsif;
1075 	uint64_t		ifindex = ipnet->ipnet_if->if_index;
1076 	ipnet_addrtype_t	srctype;
1077 	ipnet_addrtype_t	dsttype;
1078 
1079 	srctype = ipnet_get_addrtype(ipnet, src);
1080 	dsttype = ipnet_get_addrtype(ipnet, dst);
1081 
1082 	/*
1083 	 * If the packet's ifindex matches ours, or the packet's group ifindex
1084 	 * matches ours, it's on the interface we're observing.  (Thus,
1085 	 * observing on the group ifindex matches all ifindexes in the group.)
1086 	 */
1087 	obsif = (ntohl(hdr->hpo_ifindex) == ifindex ||
1088 	    ntohl(hdr->hpo_grifindex) == ifindex);
1089 
1090 	DTRACE_PROBE5(ipnet_accept__addr,
1091 	    ipnet_addrtype_t, srctype, ipnet_addrp_t *, src,
1092 	    ipnet_addrtype_t, dsttype, ipnet_addrp_t *, dst,
1093 	    boolean_t, obsif);
1094 
1095 	/*
1096 	 * Do not allow an ipnet stream to see packets that are not from or to
1097 	 * its zone.  The exception is when zones are using the shared stack
1098 	 * model.  In this case, streams in the global zone have visibility
1099 	 * into other shared-stack zones, and broadcast and multicast traffic
1100 	 * is visible by all zones in the stack.
1101 	 */
1102 	if (ipnet->ipnet_zoneid != GLOBAL_ZONEID &&
1103 	    dsttype != IPNETADDR_MBCAST) {
1104 		if (ipnet->ipnet_zoneid != ntohl(hdr->hpo_zsrc) &&
1105 		    ipnet->ipnet_zoneid != ntohl(hdr->hpo_zdst))
1106 			return (B_FALSE);
1107 	}
1108 
1109 	/*
1110 	 * If DL_PROMISC_SAP isn't enabled, then the bound SAP must match the
1111 	 * packet's IP version.
1112 	 */
1113 	if (!(ipnet->ipnet_flags & IPNET_PROMISC_SAP) &&
1114 	    ipnet->ipnet_family != hdr->hpo_family)
1115 		return (B_FALSE);
1116 
1117 	/* If the destination address is ours, then accept the packet. */
1118 	if (dsttype == IPNETADDR_MYADDR)
1119 		return (B_TRUE);
1120 
1121 	/*
1122 	 * If DL_PROMISC_PHYS is enabled, then we can see all packets that are
1123 	 * sent or received on the interface we're observing, or packets that
1124 	 * have our source address (this allows us to see packets we send).
1125 	 */
1126 	if (ipnet->ipnet_flags & IPNET_PROMISC_PHYS) {
1127 		if (srctype == IPNETADDR_MYADDR || obsif)
1128 			return (B_TRUE);
1129 	}
1130 
1131 	/*
1132 	 * We accept multicast and broadcast packets transmitted or received
1133 	 * on the interface we're observing.
1134 	 */
1135 	if (dsttype == IPNETADDR_MBCAST && obsif)
1136 		return (B_TRUE);
1137 
1138 	return (B_FALSE);
1139 }
1140 
1141 /*
1142  * Verify if the packet contained in hdr should be passed up to the ipnet
1143  * client stream that's in IPNET_LOMODE.
1144  */
1145 /* ARGSUSED */
1146 static boolean_t
1147 ipnet_loaccept(ipnet_t *ipnet, hook_pkt_observe_t *hdr, ipnet_addrp_t *src,
1148     ipnet_addrp_t *dst)
1149 {
1150 	if (hdr->hpo_htype != htons(IPOBS_HOOK_LOCAL)) {
1151 		/*
1152 		 * ipnet_if is only NULL for IPNET_MINOR_LO devices.
1153 		 */
1154 		if (ipnet->ipnet_if == NULL)
1155 			return (B_FALSE);
1156 	}
1157 
1158 	/*
1159 	 * An ipnet stream must not see packets that are not from/to its zone.
1160 	 */
1161 	if (ipnet->ipnet_zoneid != GLOBAL_ZONEID) {
1162 		if (ipnet->ipnet_zoneid != ntohl(hdr->hpo_zsrc) &&
1163 		    ipnet->ipnet_zoneid != ntohl(hdr->hpo_zdst))
1164 			return (B_FALSE);
1165 	}
1166 
1167 	return (ipnet->ipnet_family == AF_UNSPEC ||
1168 	    ipnet->ipnet_family == hdr->hpo_family);
1169 }
1170 
1171 static void
1172 ipnet_dispatch(void *arg)
1173 {
1174 	mblk_t			*mp = arg;
1175 	hook_pkt_observe_t	*hdr = (hook_pkt_observe_t *)mp->b_rptr;
1176 	ipnet_t			*ipnet;
1177 	mblk_t			*netmp;
1178 	list_t			*list;
1179 	ipnet_stack_t		*ips;
1180 	ipnet_addrp_t		src;
1181 	ipnet_addrp_t		dst;
1182 
1183 	ips = ((netstack_t *)hdr->hpo_ctx)->netstack_ipnet;
1184 
1185 	netmp = hdr->hpo_pkt->b_cont;
1186 	src.iap_family = hdr->hpo_family;
1187 	dst.iap_family = hdr->hpo_family;
1188 
1189 	if (hdr->hpo_family == AF_INET) {
1190 		src.iap_addr4 = &((ipha_t *)(netmp->b_rptr))->ipha_src;
1191 		dst.iap_addr4 = &((ipha_t *)(netmp->b_rptr))->ipha_dst;
1192 	} else {
1193 		src.iap_addr6 = &((ip6_t *)(netmp->b_rptr))->ip6_src;
1194 		dst.iap_addr6 = &((ip6_t *)(netmp->b_rptr))->ip6_dst;
1195 	}
1196 
1197 	ipnet_walkers_inc(ips);
1198 
1199 	list = &ips->ips_str_list;
1200 	for (ipnet = list_head(list); ipnet != NULL;
1201 	    ipnet = list_next(list, ipnet)) {
1202 		if (!(*ipnet->ipnet_acceptfn)(ipnet, hdr, &src, &dst)) {
1203 			IPSK_BUMP(ips, ik_acceptFail);
1204 			continue;
1205 		}
1206 		IPSK_BUMP(ips, ik_acceptOk);
1207 
1208 		if (list_next(list, ipnet) == NULL) {
1209 			netmp = hdr->hpo_pkt->b_cont;
1210 			hdr->hpo_pkt->b_cont = NULL;
1211 		} else {
1212 			if ((netmp = dupmsg(hdr->hpo_pkt->b_cont)) == NULL &&
1213 			    (netmp = copymsg(hdr->hpo_pkt->b_cont)) == NULL) {
1214 				IPSK_BUMP(ips, ik_duplicationFail);
1215 				continue;
1216 			}
1217 		}
1218 
1219 		if (ipnet->ipnet_flags & IPNET_INFO) {
1220 			if ((netmp = ipnet_addheader(hdr, netmp)) == NULL) {
1221 				IPSK_BUMP(ips, ik_dispatchHeaderDrop);
1222 				continue;
1223 			}
1224 		}
1225 
1226 		if (ipnet->ipnet_rq->q_first == NULL &&
1227 		    canputnext(ipnet->ipnet_rq)) {
1228 			putnext(ipnet->ipnet_rq, netmp);
1229 			IPSK_BUMP(ips, ik_dispatchDeliver);
1230 		} else if (canput(ipnet->ipnet_rq)) {
1231 			(void) putq(ipnet->ipnet_rq, netmp);
1232 			IPSK_BUMP(ips, ik_dispatchDeliver);
1233 		} else {
1234 			freemsg(netmp);
1235 			IPSK_BUMP(ips, ik_dispatchPutDrop);
1236 		}
1237 	}
1238 
1239 	ipnet_walkers_dec(ips);
1240 
1241 	freemsg(mp);
1242 }
1243 
1244 static void
1245 ipnet_input(mblk_t *mp)
1246 {
1247 	hook_pkt_observe_t	*hdr = (hook_pkt_observe_t *)mp->b_rptr;
1248 	ipnet_stack_t		*ips;
1249 
1250 	ips = ((netstack_t *)hdr->hpo_ctx)->netstack_ipnet;
1251 
1252 	if (ddi_taskq_dispatch(ipnet_taskq, ipnet_dispatch, mp, DDI_NOSLEEP) !=
1253 	    DDI_SUCCESS) {
1254 		IPSK_BUMP(ips, ik_dispatchFail);
1255 		freemsg(mp);
1256 	} else {
1257 		IPSK_BUMP(ips, ik_dispatchOk);
1258 	}
1259 }
1260 
1261 static ipnetif_t *
1262 ipnet_alloc_if(ipnet_stack_t *ips)
1263 {
1264 	ipnetif_t	*ipnetif;
1265 
1266 	if ((ipnetif = kmem_zalloc(sizeof (*ipnetif), KM_NOSLEEP)) == NULL)
1267 		return (NULL);
1268 
1269 	mutex_init(&ipnetif->if_addr_lock, NULL, MUTEX_DEFAULT, 0);
1270 	list_create(&ipnetif->if_ip4addr_list, sizeof (ipnetif_addr_t),
1271 	    offsetof(ipnetif_addr_t, ifa_link));
1272 	list_create(&ipnetif->if_ip6addr_list, sizeof (ipnetif_addr_t),
1273 	    offsetof(ipnetif_addr_t, ifa_link));
1274 	mutex_init(&ipnetif->if_reflock, NULL, MUTEX_DEFAULT, 0);
1275 
1276 	ipnetif->if_stackp = ips;
1277 
1278 	return (ipnetif);
1279 }
1280 
1281 /*
1282  * Create a new ipnetif_t and new minor node for it.  If creation is
1283  * successful the new ipnetif_t is inserted into an avl_tree
1284  * containing ipnetif's for this stack instance.
1285  */
1286 static ipnetif_t *
1287 ipnetif_create(const char *name, uint64_t index, ipnet_stack_t *ips,
1288     uint64_t ifflags)
1289 {
1290 	ipnetif_t	*ipnetif;
1291 	avl_index_t	where = 0;
1292 	minor_t		ifminor;
1293 
1294 	/*
1295 	 * Because ipnetif_create() can be called from a NIC event
1296 	 * callback, it should not block.
1297 	 */
1298 	ifminor = (minor_t)id_alloc_nosleep(ipnet_minor_space);
1299 	if (ifminor == (minor_t)-1)
1300 		return (NULL);
1301 	if ((ipnetif = ipnet_alloc_if(ips)) == NULL) {
1302 		id_free(ipnet_minor_space, ifminor);
1303 		return (NULL);
1304 	}
1305 
1306 	(void) strlcpy(ipnetif->if_name, name, LIFNAMSIZ);
1307 	ipnetif->if_index = (uint_t)index;
1308 	ipnetif->if_zoneid = netstack_get_zoneid(ips->ips_netstack);
1309 	ipnetif->if_dev = makedevice(ipnet_major, ifminor);
1310 
1311 	ipnetif->if_refcnt = 1;
1312 	if ((ifflags & IFF_LOOPBACK) != 0)
1313 		ipnetif->if_flags = IPNETIF_LOOPBACK;
1314 
1315 	mutex_enter(&ips->ips_avl_lock);
1316 	VERIFY(avl_find(&ips->ips_avl_by_index, &index, &where) == NULL);
1317 	avl_insert(&ips->ips_avl_by_index, ipnetif, where);
1318 	VERIFY(avl_find(&ips->ips_avl_by_name, (void *)name, &where) == NULL);
1319 	avl_insert(&ips->ips_avl_by_name, ipnetif, where);
1320 	mutex_exit(&ips->ips_avl_lock);
1321 
1322 	return (ipnetif);
1323 }
1324 
1325 static void
1326 ipnetif_remove(ipnetif_t *ipnetif, ipnet_stack_t *ips)
1327 {
1328 	ipnet_t	*ipnet;
1329 
1330 	ipnet_walkers_inc(ips);
1331 	/* Send a SIGHUP to all open streams associated with this ipnetif. */
1332 	for (ipnet = list_head(&ips->ips_str_list); ipnet != NULL;
1333 	    ipnet = list_next(&ips->ips_str_list, ipnet)) {
1334 		if (ipnet->ipnet_if == ipnetif)
1335 			(void) putnextctl(ipnet->ipnet_rq, M_HANGUP);
1336 	}
1337 	ipnet_walkers_dec(ips);
1338 	mutex_enter(&ips->ips_avl_lock);
1339 	avl_remove(&ips->ips_avl_by_index, ipnetif);
1340 	avl_remove(&ips->ips_avl_by_name, ipnetif);
1341 	mutex_exit(&ips->ips_avl_lock);
1342 	/*
1343 	 * Release the reference we implicitly held in ipnetif_create().
1344 	 */
1345 	ipnetif_refrele(ipnetif);
1346 }
1347 
1348 static void
1349 ipnet_purge_addrlist(list_t *addrlist)
1350 {
1351 	ipnetif_addr_t	*ifa;
1352 
1353 	while ((ifa = list_head(addrlist)) != NULL) {
1354 		list_remove(addrlist, ifa);
1355 		if (ifa->ifa_shared != NULL)
1356 			ipnetif_clone_release(ifa->ifa_shared);
1357 		kmem_free(ifa, sizeof (*ifa));
1358 	}
1359 }
1360 
1361 static void
1362 ipnetif_free(ipnetif_t *ipnetif)
1363 {
1364 	ASSERT(ipnetif->if_refcnt == 0);
1365 	ASSERT(ipnetif->if_sharecnt == 0);
1366 
1367 	/* Remove IPv4/v6 address lists from the ipnetif */
1368 	ipnet_purge_addrlist(&ipnetif->if_ip4addr_list);
1369 	list_destroy(&ipnetif->if_ip4addr_list);
1370 	ipnet_purge_addrlist(&ipnetif->if_ip6addr_list);
1371 	list_destroy(&ipnetif->if_ip6addr_list);
1372 	mutex_destroy(&ipnetif->if_addr_lock);
1373 	mutex_destroy(&ipnetif->if_reflock);
1374 	if (ipnetif->if_dev != 0)
1375 		id_free(ipnet_minor_space, getminor(ipnetif->if_dev));
1376 	kmem_free(ipnetif, sizeof (*ipnetif));
1377 }
1378 
1379 /*
1380  * Create an ipnetif_addr_t with the given logical interface id (lif)
1381  * and add it to the supplied ipnetif.  The lif is the netinfo
1382  * representation of logical interface id, and we use this id to match
1383  * incoming netinfo events against our lists of addresses.
1384  */
1385 static void
1386 ipnet_add_ifaddr(uint64_t lif, ipnetif_t *ipnetif, net_handle_t nd)
1387 {
1388 	ipnetif_addr_t		*ifaddr;
1389 	zoneid_t		zoneid;
1390 	struct sockaddr_in	bcast;
1391 	struct sockaddr_storage	addr;
1392 	net_ifaddr_t		type = NA_ADDRESS;
1393 	uint64_t		phyif = ipnetif->if_index;
1394 
1395 	if (net_getlifaddr(nd, phyif, lif, 1, &type, &addr) != 0 ||
1396 	    net_getlifzone(nd, phyif, lif, &zoneid) != 0)
1397 		return;
1398 
1399 	if ((ifaddr = kmem_alloc(sizeof (*ifaddr), KM_NOSLEEP)) == NULL)
1400 		return;
1401 	ifaddr->ifa_zone = zoneid;
1402 	ifaddr->ifa_id = lif;
1403 	ifaddr->ifa_shared = NULL;
1404 
1405 	switch (addr.ss_family) {
1406 	case AF_INET:
1407 		ifaddr->ifa_ip4addr =
1408 		    ((struct sockaddr_in *)&addr)->sin_addr.s_addr;
1409 		/*
1410 		 * Try and get the broadcast address.  Note that it's okay for
1411 		 * an interface to not have a broadcast address, so we don't
1412 		 * fail the entire operation if net_getlifaddr() fails here.
1413 		 */
1414 		type = NA_BROADCAST;
1415 		if (net_getlifaddr(nd, phyif, lif, 1, &type, &bcast) == 0)
1416 			ifaddr->ifa_brdaddr = bcast.sin_addr.s_addr;
1417 		break;
1418 	case AF_INET6:
1419 		ifaddr->ifa_ip6addr = ((struct sockaddr_in6 *)&addr)->sin6_addr;
1420 		break;
1421 	}
1422 
1423 	/*
1424 	 * The zoneid stored in ipnetif_t needs to correspond to the actual
1425 	 * zone the address is being used in. This facilitates finding the
1426 	 * correct netstack_t pointer, amongst other things, later.
1427 	 */
1428 	if (zoneid == ALL_ZONES)
1429 		zoneid = GLOBAL_ZONEID;
1430 
1431 	mutex_enter(&ipnetif->if_addr_lock);
1432 	if (zoneid != ipnetif->if_zoneid) {
1433 		ipnetif_t *ifp2;
1434 
1435 		ifp2 = ipnetif_clone_create(ipnetif, zoneid);
1436 		ifaddr->ifa_shared = ifp2;
1437 	}
1438 	list_insert_tail(addr.ss_family == AF_INET ?
1439 	    &ipnetif->if_ip4addr_list : &ipnetif->if_ip6addr_list, ifaddr);
1440 	mutex_exit(&ipnetif->if_addr_lock);
1441 }
1442 
1443 static void
1444 ipnet_delete_ifaddr(ipnetif_addr_t *ifaddr, ipnetif_t *ipnetif, boolean_t isv6)
1445 {
1446 	mutex_enter(&ipnetif->if_addr_lock);
1447 	if (ifaddr->ifa_shared != NULL)
1448 		ipnetif_clone_release(ifaddr->ifa_shared);
1449 
1450 	list_remove(isv6 ?
1451 	    &ipnetif->if_ip6addr_list : &ipnetif->if_ip4addr_list, ifaddr);
1452 	mutex_exit(&ipnetif->if_addr_lock);
1453 	kmem_free(ifaddr, sizeof (*ifaddr));
1454 }
1455 
1456 static void
1457 ipnet_plumb_ev(ipnet_nicevent_t *ipne, ipnet_stack_t *ips, boolean_t isv6)
1458 {
1459 	ipnetif_t	*ipnetif;
1460 	boolean_t	refrele_needed = B_TRUE;
1461 	uint64_t	ifflags;
1462 	uint64_t	ifindex;
1463 	char		*ifname;
1464 
1465 	ifflags = 0;
1466 	ifname = ipne->ipne_ifname;
1467 	ifindex = ipne->ipne_ifindex;
1468 
1469 	(void) net_getlifflags(ipne->ipne_protocol, ifindex, 0, &ifflags);
1470 
1471 	if ((ipnetif = ipnetif_getby_index(ifindex, ips)) == NULL) {
1472 		ipnetif = ipnetif_create(ifname, ifindex, ips, ifflags);
1473 		refrele_needed = B_FALSE;
1474 	}
1475 	if (ipnetif != NULL) {
1476 		ipnetif->if_flags |=
1477 		    isv6 ? IPNETIF_IPV6PLUMBED : IPNETIF_IPV4PLUMBED;
1478 	}
1479 
1480 	if (ipnetif->if_multicnt != 0) {
1481 		if (ip_join_allmulti(ifindex, isv6,
1482 		    ips->ips_netstack->netstack_ip) == 0) {
1483 			ipnetif->if_flags |=
1484 			    isv6 ? IPNETIF_IPV6ALLMULTI : IPNETIF_IPV4ALLMULTI;
1485 		}
1486 	}
1487 
1488 	if (refrele_needed)
1489 		ipnetif_refrele(ipnetif);
1490 }
1491 
1492 static void
1493 ipnet_unplumb_ev(uint64_t ifindex, ipnet_stack_t *ips, boolean_t isv6)
1494 {
1495 	ipnetif_t	*ipnetif;
1496 
1497 	if ((ipnetif = ipnetif_getby_index(ifindex, ips)) == NULL)
1498 		return;
1499 
1500 	mutex_enter(&ipnetif->if_addr_lock);
1501 	ipnet_purge_addrlist(isv6 ?
1502 	    &ipnetif->if_ip6addr_list : &ipnetif->if_ip4addr_list);
1503 	mutex_exit(&ipnetif->if_addr_lock);
1504 
1505 	/*
1506 	 * Note that we have one ipnetif for both IPv4 and IPv6, but we receive
1507 	 * separate NE_UNPLUMB events for IPv4 and IPv6.  We remove the ipnetif
1508 	 * if both IPv4 and IPv6 interfaces have been unplumbed.
1509 	 */
1510 	ipnetif->if_flags &= isv6 ? ~IPNETIF_IPV6PLUMBED : ~IPNETIF_IPV4PLUMBED;
1511 	if (!(ipnetif->if_flags & (IPNETIF_IPV4PLUMBED | IPNETIF_IPV6PLUMBED)))
1512 		ipnetif_remove(ipnetif, ips);
1513 	ipnetif_refrele(ipnetif);
1514 }
1515 
1516 static void
1517 ipnet_lifup_ev(uint64_t ifindex, uint64_t lifindex, net_handle_t nd,
1518     ipnet_stack_t *ips, boolean_t isv6)
1519 {
1520 	ipnetif_t	*ipnetif;
1521 	ipnetif_addr_t	*ifaddr;
1522 
1523 	if ((ipnetif = ipnetif_getby_index(ifindex, ips)) == NULL)
1524 		return;
1525 	if ((ifaddr = ipnet_match_lif(ipnetif, lifindex, isv6)) != NULL) {
1526 		/*
1527 		 * We must have missed a NE_LIF_DOWN event.  Delete this
1528 		 * ifaddr and re-create it.
1529 		 */
1530 		ipnet_delete_ifaddr(ifaddr, ipnetif, isv6);
1531 	}
1532 
1533 	ipnet_add_ifaddr(lifindex, ipnetif, nd);
1534 	ipnetif_refrele(ipnetif);
1535 }
1536 
1537 static void
1538 ipnet_lifdown_ev(uint64_t ifindex, uint64_t lifindex, ipnet_stack_t *ips,
1539     boolean_t isv6)
1540 {
1541 	ipnetif_t	*ipnetif;
1542 	ipnetif_addr_t	*ifaddr;
1543 
1544 	if ((ipnetif = ipnetif_getby_index(ifindex, ips)) == NULL)
1545 		return;
1546 	if ((ifaddr = ipnet_match_lif(ipnetif, lifindex, isv6)) != NULL)
1547 		ipnet_delete_ifaddr(ifaddr, ipnetif, isv6);
1548 	ipnetif_refrele(ipnetif);
1549 	/*
1550 	 * Make sure that open streams on this ipnetif are still allowed to
1551 	 * have it open.
1552 	 */
1553 	ipnetif_zonecheck(ipnetif, ips);
1554 }
1555 
1556 /*
1557  * This callback from the NIC event framework dispatches a taskq as the event
1558  * handlers may block.
1559  */
1560 /* ARGSUSED */
1561 static int
1562 ipnet_nicevent_cb(hook_event_token_t token, hook_data_t info, void *arg)
1563 {
1564 	ipnet_stack_t		*ips = arg;
1565 	hook_nic_event_t	*hn = (hook_nic_event_t *)info;
1566 	ipnet_nicevent_t	*ipne;
1567 
1568 	if ((ipne = kmem_alloc(sizeof (ipnet_nicevent_t), KM_NOSLEEP)) == NULL)
1569 		return (0);
1570 	ipne->ipne_event = hn->hne_event;
1571 	ipne->ipne_protocol = hn->hne_protocol;
1572 	ipne->ipne_stackid = ips->ips_netstack->netstack_stackid;
1573 	ipne->ipne_ifindex = hn->hne_nic;
1574 	ipne->ipne_lifindex = hn->hne_lif;
1575 	if (hn->hne_datalen != 0) {
1576 		(void) strlcpy(ipne->ipne_ifname, hn->hne_data,
1577 		    sizeof (ipne->ipne_ifname));
1578 	}
1579 	(void) ddi_taskq_dispatch(ipnet_nicevent_taskq, ipnet_nicevent_task,
1580 	    ipne, DDI_NOSLEEP);
1581 	return (0);
1582 }
1583 
1584 static void
1585 ipnet_nicevent_task(void *arg)
1586 {
1587 	ipnet_nicevent_t	*ipne = arg;
1588 	netstack_t		*ns;
1589 	ipnet_stack_t		*ips;
1590 	boolean_t		isv6;
1591 
1592 	if ((ns = netstack_find_by_stackid(ipne->ipne_stackid)) == NULL)
1593 		goto done;
1594 	ips = ns->netstack_ipnet;
1595 	isv6 = (ipne->ipne_protocol == ips->ips_ndv6);
1596 
1597 	mutex_enter(&ips->ips_event_lock);
1598 	switch (ipne->ipne_event) {
1599 	case NE_PLUMB:
1600 		ipnet_plumb_ev(ipne, ips, isv6);
1601 		break;
1602 	case NE_UNPLUMB:
1603 		ipnet_unplumb_ev(ipne->ipne_ifindex, ips, isv6);
1604 		break;
1605 	case NE_LIF_UP:
1606 		ipnet_lifup_ev(ipne->ipne_ifindex, ipne->ipne_lifindex,
1607 		    ipne->ipne_protocol, ips, isv6);
1608 		break;
1609 	case NE_LIF_DOWN:
1610 		ipnet_lifdown_ev(ipne->ipne_ifindex, ipne->ipne_lifindex, ips,
1611 		    isv6);
1612 		break;
1613 	default:
1614 		break;
1615 	}
1616 	mutex_exit(&ips->ips_event_lock);
1617 done:
1618 	if (ns != NULL)
1619 		netstack_rele(ns);
1620 	kmem_free(ipne, sizeof (ipnet_nicevent_t));
1621 }
1622 
1623 dev_t
1624 ipnet_if_getdev(char *name, zoneid_t zoneid)
1625 {
1626 	netstack_t	*ns;
1627 	ipnet_stack_t	*ips;
1628 	ipnetif_t	*ipnetif;
1629 	dev_t		dev = (dev_t)-1;
1630 
1631 	if (is_system_labeled() && zoneid != GLOBAL_ZONEID)
1632 		return (dev);
1633 	if ((ns = netstack_find_by_zoneid(zoneid)) == NULL)
1634 		return (dev);
1635 
1636 	ips = ns->netstack_ipnet;
1637 	mutex_enter(&ips->ips_avl_lock);
1638 	if ((ipnetif = avl_find(&ips->ips_avl_by_name, name, NULL)) != NULL) {
1639 		if (ipnetif_in_zone(ipnetif, zoneid, ips))
1640 			dev = ipnetif->if_dev;
1641 	}
1642 	mutex_exit(&ips->ips_avl_lock);
1643 	netstack_rele(ns);
1644 
1645 	return (dev);
1646 }
1647 
1648 static ipnetif_t *
1649 ipnetif_getby_index(uint64_t id, ipnet_stack_t *ips)
1650 {
1651 	ipnetif_t	*ipnetif;
1652 
1653 	mutex_enter(&ips->ips_avl_lock);
1654 	if ((ipnetif = avl_find(&ips->ips_avl_by_index, &id, NULL)) != NULL)
1655 		ipnetif_refhold(ipnetif);
1656 	mutex_exit(&ips->ips_avl_lock);
1657 	return (ipnetif);
1658 }
1659 
1660 static ipnetif_t *
1661 ipnetif_getby_dev(dev_t dev, ipnet_stack_t *ips)
1662 {
1663 	ipnetif_t	*ipnetif;
1664 	avl_tree_t	*tree;
1665 
1666 	mutex_enter(&ips->ips_avl_lock);
1667 	tree = &ips->ips_avl_by_index;
1668 	for (ipnetif = avl_first(tree); ipnetif != NULL;
1669 	    ipnetif = avl_walk(tree, ipnetif, AVL_AFTER)) {
1670 		if (ipnetif->if_dev == dev) {
1671 			ipnetif_refhold(ipnetif);
1672 			break;
1673 		}
1674 	}
1675 	mutex_exit(&ips->ips_avl_lock);
1676 	return (ipnetif);
1677 }
1678 
1679 static ipnetif_addr_t *
1680 ipnet_match_lif(ipnetif_t *ipnetif, lif_if_t lid, boolean_t isv6)
1681 {
1682 	ipnetif_addr_t	*ifaddr;
1683 	list_t	*list;
1684 
1685 	mutex_enter(&ipnetif->if_addr_lock);
1686 	list = isv6 ? &ipnetif->if_ip6addr_list : &ipnetif->if_ip4addr_list;
1687 	for (ifaddr = list_head(list); ifaddr != NULL;
1688 	    ifaddr = list_next(list, ifaddr)) {
1689 		if (lid == ifaddr->ifa_id)
1690 			break;
1691 	}
1692 	mutex_exit(&ipnetif->if_addr_lock);
1693 	return (ifaddr);
1694 }
1695 
1696 /* ARGSUSED */
1697 static void *
1698 ipnet_stack_init(netstackid_t stackid, netstack_t *ns)
1699 {
1700 	ipnet_stack_t	*ips;
1701 
1702 	ips = kmem_zalloc(sizeof (*ips), KM_SLEEP);
1703 	ips->ips_netstack = ns;
1704 	mutex_init(&ips->ips_avl_lock, NULL, MUTEX_DEFAULT, 0);
1705 	avl_create(&ips->ips_avl_by_index, ipnetif_compare_index,
1706 	    sizeof (ipnetif_t), offsetof(ipnetif_t, if_avl_by_index));
1707 	avl_create(&ips->ips_avl_by_name, ipnetif_compare_name,
1708 	    sizeof (ipnetif_t), offsetof(ipnetif_t, if_avl_by_name));
1709 	avl_create(&ips->ips_avl_by_shared, ipnetif_compare_name_zone,
1710 	    sizeof (ipnetif_t), offsetof(ipnetif_t, if_avl_by_shared));
1711 	mutex_init(&ips->ips_walkers_lock, NULL, MUTEX_DEFAULT, NULL);
1712 	cv_init(&ips->ips_walkers_cv, NULL, CV_DRIVER, NULL);
1713 	list_create(&ips->ips_str_list, sizeof (ipnet_t),
1714 	    offsetof(ipnet_t, ipnet_next));
1715 	ipnet_register_netihook(ips);
1716 	return (ips);
1717 }
1718 
1719 /* ARGSUSED */
1720 static void
1721 ipnet_stack_fini(netstackid_t stackid, void *arg)
1722 {
1723 	ipnet_stack_t	*ips = arg;
1724 	ipnetif_t	*ipnetif, *nipnetif;
1725 
1726 	if (ips->ips_kstatp != NULL) {
1727 		zoneid_t zoneid;
1728 
1729 		zoneid = netstackid_to_zoneid(stackid);
1730 		net_kstat_delete(net_zoneidtonetid(zoneid), ips->ips_kstatp);
1731 	}
1732 	if (ips->ips_ndv4 != NULL) {
1733 		VERIFY(net_hook_unregister(ips->ips_ndv4, NH_NIC_EVENTS,
1734 		    ips->ips_nicevents) == 0);
1735 		VERIFY(net_protocol_release(ips->ips_ndv4) == 0);
1736 	}
1737 	if (ips->ips_ndv6 != NULL) {
1738 		VERIFY(net_hook_unregister(ips->ips_ndv6, NH_NIC_EVENTS,
1739 		    ips->ips_nicevents) == 0);
1740 		VERIFY(net_protocol_release(ips->ips_ndv6) == 0);
1741 	}
1742 	hook_free(ips->ips_nicevents);
1743 
1744 	for (ipnetif = avl_first(&ips->ips_avl_by_index); ipnetif != NULL;
1745 	    ipnetif = nipnetif) {
1746 		nipnetif = AVL_NEXT(&ips->ips_avl_by_index, ipnetif);
1747 		ipnetif_remove(ipnetif, ips);
1748 	}
1749 	avl_destroy(&ips->ips_avl_by_shared);
1750 	avl_destroy(&ips->ips_avl_by_index);
1751 	avl_destroy(&ips->ips_avl_by_name);
1752 	mutex_destroy(&ips->ips_avl_lock);
1753 	mutex_destroy(&ips->ips_walkers_lock);
1754 	cv_destroy(&ips->ips_walkers_cv);
1755 	list_destroy(&ips->ips_str_list);
1756 	kmem_free(ips, sizeof (*ips));
1757 }
1758 
1759 /* Do any of the addresses in addrlist belong the supplied zoneid? */
1760 static boolean_t
1761 ipnet_addrs_in_zone(list_t *addrlist, zoneid_t zoneid)
1762 {
1763 	ipnetif_addr_t	*ifa;
1764 
1765 	for (ifa = list_head(addrlist); ifa != NULL;
1766 	    ifa = list_next(addrlist, ifa)) {
1767 		if (ifa->ifa_zone == zoneid)
1768 			return (B_TRUE);
1769 	}
1770 	return (B_FALSE);
1771 }
1772 
1773 /* Should the supplied ipnetif be visible from the supplied zoneid? */
1774 static boolean_t
1775 ipnetif_in_zone(ipnetif_t *ipnetif, zoneid_t zoneid, ipnet_stack_t *ips)
1776 {
1777 	int	ret;
1778 
1779 	/*
1780 	 * The global zone has visibility into all interfaces in the global
1781 	 * stack, and exclusive stack zones have visibility into all
1782 	 * interfaces in their stack.
1783 	 */
1784 	if (zoneid == GLOBAL_ZONEID ||
1785 	    ips->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID)
1786 		return (B_TRUE);
1787 
1788 	/*
1789 	 * Shared-stack zones only have visibility for interfaces that have
1790 	 * addresses in their zone.
1791 	 */
1792 	mutex_enter(&ipnetif->if_addr_lock);
1793 	ret = ipnet_addrs_in_zone(&ipnetif->if_ip4addr_list, zoneid) ||
1794 	    ipnet_addrs_in_zone(&ipnetif->if_ip6addr_list, zoneid);
1795 	mutex_exit(&ipnetif->if_addr_lock);
1796 	return (ret);
1797 }
1798 
1799 /*
1800  * Verify that any ipnet_t that has a reference to the supplied ipnetif should
1801  * still be allowed to have it open.  A given ipnet_t may no longer be allowed
1802  * to have an ipnetif open if there are no longer any addresses that belong to
1803  * the ipnetif in the ipnet_t's non-global shared-stack zoneid.  If that's the
1804  * case, send the ipnet_t an M_HANGUP.
1805  */
1806 static void
1807 ipnetif_zonecheck(ipnetif_t *ipnetif, ipnet_stack_t *ips)
1808 {
1809 	list_t	*strlist = &ips->ips_str_list;
1810 	ipnet_t	*ipnet;
1811 
1812 	ipnet_walkers_inc(ips);
1813 	for (ipnet = list_head(strlist); ipnet != NULL;
1814 	    ipnet = list_next(strlist, ipnet)) {
1815 		if (ipnet->ipnet_if != ipnetif)
1816 			continue;
1817 		if (!ipnetif_in_zone(ipnetif, ipnet->ipnet_zoneid, ips))
1818 			(void) putnextctl(ipnet->ipnet_rq, M_HANGUP);
1819 	}
1820 	ipnet_walkers_dec(ips);
1821 }
1822 
1823 void
1824 ipnet_walk_if(ipnet_walkfunc_t *cb, void *arg, zoneid_t zoneid)
1825 {
1826 	ipnetif_t		*ipnetif;
1827 	list_t			cbdata;
1828 	ipnetif_cbdata_t	*cbnode;
1829 	netstack_t		*ns;
1830 	ipnet_stack_t		*ips;
1831 
1832 	/*
1833 	 * On labeled systems, non-global zones shouldn't see anything
1834 	 * in /dev/ipnet.
1835 	 */
1836 	if (is_system_labeled() && zoneid != GLOBAL_ZONEID)
1837 		return;
1838 
1839 	if ((ns = netstack_find_by_zoneid(zoneid)) == NULL)
1840 		return;
1841 
1842 	ips = ns->netstack_ipnet;
1843 	list_create(&cbdata, sizeof (ipnetif_cbdata_t),
1844 	    offsetof(ipnetif_cbdata_t, ic_next));
1845 
1846 	mutex_enter(&ips->ips_avl_lock);
1847 	for (ipnetif = avl_first(&ips->ips_avl_by_index); ipnetif != NULL;
1848 	    ipnetif = avl_walk(&ips->ips_avl_by_index, ipnetif, AVL_AFTER)) {
1849 		if (!ipnetif_in_zone(ipnetif, zoneid, ips))
1850 			continue;
1851 		cbnode = kmem_zalloc(sizeof (ipnetif_cbdata_t), KM_SLEEP);
1852 		(void) strlcpy(cbnode->ic_ifname, ipnetif->if_name, LIFNAMSIZ);
1853 		cbnode->ic_dev = ipnetif->if_dev;
1854 		list_insert_head(&cbdata, cbnode);
1855 	}
1856 	mutex_exit(&ips->ips_avl_lock);
1857 
1858 	while ((cbnode = list_head(&cbdata)) != NULL) {
1859 		cb(cbnode->ic_ifname, arg, cbnode->ic_dev);
1860 		list_remove(&cbdata, cbnode);
1861 		kmem_free(cbnode, sizeof (ipnetif_cbdata_t));
1862 	}
1863 	list_destroy(&cbdata);
1864 	netstack_rele(ns);
1865 }
1866 
1867 static int
1868 ipnetif_compare_index(const void *index_ptr, const void *ipnetifp)
1869 {
1870 	int64_t	index1 = *((int64_t *)index_ptr);
1871 	int64_t	index2 = (int64_t)((ipnetif_t *)ipnetifp)->if_index;
1872 
1873 	return (SIGNOF(index2 - index1));
1874 }
1875 
1876 static int
1877 ipnetif_compare_name(const void *name_ptr, const void *ipnetifp)
1878 {
1879 	int	res;
1880 
1881 	res = strcmp(((ipnetif_t *)ipnetifp)->if_name, name_ptr);
1882 	return (SIGNOF(res));
1883 }
1884 
1885 static int
1886 ipnetif_compare_name_zone(const void *key_ptr, const void *ipnetifp)
1887 {
1888 	const uintptr_t	*ptr = key_ptr;
1889 	const ipnetif_t	*ifp;
1890 	int		res;
1891 
1892 	ifp = ipnetifp;
1893 	res = ifp->if_zoneid - ptr[0];
1894 	if (res != 0)
1895 		return (SIGNOF(res));
1896 	res = strcmp(ifp->if_name, (char *)ptr[1]);
1897 	return (SIGNOF(res));
1898 }
1899 
1900 static void
1901 ipnetif_refhold(ipnetif_t *ipnetif)
1902 {
1903 	mutex_enter(&ipnetif->if_reflock);
1904 	ipnetif->if_refcnt++;
1905 	mutex_exit(&ipnetif->if_reflock);
1906 }
1907 
1908 static void
1909 ipnetif_refrele(ipnetif_t *ipnetif)
1910 {
1911 	mutex_enter(&ipnetif->if_reflock);
1912 	ASSERT(ipnetif->if_refcnt > 0);
1913 	if (--ipnetif->if_refcnt == 0)
1914 		ipnetif_free(ipnetif);
1915 	else
1916 		mutex_exit(&ipnetif->if_reflock);
1917 }
1918 
1919 static void
1920 ipnet_walkers_inc(ipnet_stack_t *ips)
1921 {
1922 	mutex_enter(&ips->ips_walkers_lock);
1923 	ips->ips_walkers_cnt++;
1924 	mutex_exit(&ips->ips_walkers_lock);
1925 }
1926 
1927 static void
1928 ipnet_walkers_dec(ipnet_stack_t *ips)
1929 {
1930 	mutex_enter(&ips->ips_walkers_lock);
1931 	ASSERT(ips->ips_walkers_cnt != 0);
1932 	if (--ips->ips_walkers_cnt == 0)
1933 		cv_broadcast(&ips->ips_walkers_cv);
1934 	mutex_exit(&ips->ips_walkers_lock);
1935 }
1936 
1937 /*ARGSUSED*/
1938 static int
1939 ipobs_bounce_func(hook_event_token_t token, hook_data_t info, void *arg)
1940 {
1941 	hook_pkt_observe_t	*hdr;
1942 	pfv_t			func = (pfv_t)arg;
1943 	mblk_t			*mp;
1944 
1945 	hdr = (hook_pkt_observe_t *)info;
1946 	/*
1947 	 * Code in ip_input() expects that it is the only one accessing the
1948 	 * packet.
1949 	 */
1950 	mp = copymsg(hdr->hpo_pkt);
1951 	if (mp == NULL)  {
1952 		netstack_t *ns = hdr->hpo_ctx;
1953 		ipnet_stack_t *ips = ns->netstack_ipnet;
1954 
1955 		IPSK_BUMP(ips, ik_dispatchDupDrop);
1956 		return (0);
1957 	}
1958 
1959 	hdr = (hook_pkt_observe_t *)mp->b_rptr;
1960 	hdr->hpo_pkt = mp;
1961 
1962 	func(mp);
1963 
1964 	return (0);
1965 }
1966 
1967 hook_t *
1968 ipobs_register_hook(netstack_t *ns, pfv_t func)
1969 {
1970 	ip_stack_t	*ipst = ns->netstack_ip;
1971 	char		name[32];
1972 	hook_t		*hook;
1973 
1974 	HOOK_INIT(hook, ipobs_bounce_func, "", (void *)func);
1975 	VERIFY(hook != NULL);
1976 
1977 	/*
1978 	 * To register multiple hooks with the same callback function,
1979 	 * a unique name is needed.
1980 	 */
1981 	(void) snprintf(name, sizeof (name), "ipobserve_%p", (void *)hook);
1982 	hook->h_name = strdup(name);
1983 
1984 	(void) net_hook_register(ipst->ips_ip4_observe_pr, NH_OBSERVE, hook);
1985 	(void) net_hook_register(ipst->ips_ip6_observe_pr, NH_OBSERVE, hook);
1986 
1987 	return (hook);
1988 }
1989 
1990 void
1991 ipobs_unregister_hook(netstack_t *ns, hook_t *hook)
1992 {
1993 	ip_stack_t	*ipst = ns->netstack_ip;
1994 
1995 	(void) net_hook_unregister(ipst->ips_ip4_observe_pr, NH_OBSERVE, hook);
1996 
1997 	(void) net_hook_unregister(ipst->ips_ip6_observe_pr, NH_OBSERVE, hook);
1998 
1999 	strfree(hook->h_name);
2000 
2001 	hook_free(hook);
2002 }
2003 
2004 /* ******************************************************************** */
2005 /* BPF Functions below							*/
2006 /* ******************************************************************** */
2007 
2008 /*
2009  * Convenience function to make mapping a zoneid to an ipnet_stack_t easy.
2010  */
2011 ipnet_stack_t *
2012 ipnet_find_by_zoneid(zoneid_t zoneid)
2013 {
2014 	netstack_t	*ns;
2015 
2016 	VERIFY((ns = netstack_find_by_zoneid(zoneid)) != NULL);
2017 	return (ns->netstack_ipnet);
2018 }
2019 
2020 /*
2021  * Functions, such as the above ipnet_find_by_zoneid(), will return a
2022  * pointer to ipnet_stack_t by calling a netstack lookup function.
2023  * The netstack_find_*() functions return a pointer after doing a "hold"
2024  * on the data structure and thereby require a "release" when the caller
2025  * is finished with it. We need to mirror that API here and thus a caller
2026  * of ipnet_find_by_zoneid() is required to call ipnet_rele().
2027  */
2028 void
2029 ipnet_rele(ipnet_stack_t *ips)
2030 {
2031 	netstack_rele(ips->ips_netstack);
2032 }
2033 
2034 /*
2035  */
2036 void
2037 ipnet_set_itap(bpf_itap_fn_t tapfunc)
2038 {
2039 	ipnet_itap = tapfunc;
2040 }
2041 
2042 /*
2043  * The list of interfaces available via ipnet is private for each zone,
2044  * so the AVL tree of each zone must be searched for a given name, even
2045  * if all names are unique.
2046  */
2047 int
2048 ipnet_open_byname(const char *name, ipnetif_t **ptr, zoneid_t zoneid)
2049 {
2050 	ipnet_stack_t	*ips;
2051 	ipnetif_t	*ipnetif;
2052 
2053 	ASSERT(ptr != NULL);
2054 	VERIFY((ips = ipnet_find_by_zoneid(zoneid)) != NULL);
2055 
2056 	mutex_enter(&ips->ips_avl_lock);
2057 
2058 	/*
2059 	 * Shared instance zone?
2060 	 */
2061 	if (netstackid_to_zoneid(zoneid_to_netstackid(zoneid)) != zoneid) {
2062 		uintptr_t key[2] = { zoneid, (uintptr_t)name };
2063 
2064 		ipnetif = avl_find(&ips->ips_avl_by_shared, (void *)key, NULL);
2065 	} else {
2066 		ipnetif = avl_find(&ips->ips_avl_by_name, (void *)name, NULL);
2067 	}
2068 	if (ipnetif != NULL)
2069 		ipnetif_refhold(ipnetif);
2070 	mutex_exit(&ips->ips_avl_lock);
2071 
2072 	*ptr = ipnetif;
2073 	ipnet_rele(ips);
2074 
2075 	if (ipnetif == NULL)
2076 		return (ESRCH);
2077 	return (0);
2078 }
2079 
2080 void
2081 ipnet_close_byhandle(ipnetif_t *ifp)
2082 {
2083 	ASSERT(ifp != NULL);
2084 	ipnetif_refrele(ifp);
2085 }
2086 
2087 const char *
2088 ipnet_name(ipnetif_t *ifp)
2089 {
2090 	ASSERT(ifp != NULL);
2091 	return (ifp->if_name);
2092 }
2093 
2094 /*
2095  * To find the linkid for a given name, it is necessary to know which zone
2096  * the interface name belongs to and to search the avl tree for that zone
2097  * as there is no master list of all interfaces and which zone they belong
2098  * to. It is assumed that the caller of this function is somehow already
2099  * working with the ipnet interfaces and hence the ips_event_lock is held.
2100  * When BPF calls into this function, it is doing so because of an event
2101  * in ipnet, and thus ipnet holds the ips_event_lock. Thus the datalink id
2102  * value returned has meaning without the need for grabbing a hold on the
2103  * owning structure.
2104  */
2105 int
2106 ipnet_get_linkid_byname(const char *name, uint_t *idp, zoneid_t zoneid)
2107 {
2108 	ipnet_stack_t	*ips;
2109 	ipnetif_t	*ifp;
2110 
2111 	VERIFY((ips = ipnet_find_by_zoneid(zoneid)) != NULL);
2112 	ASSERT(mutex_owned(&ips->ips_event_lock));
2113 
2114 	mutex_enter(&ips->ips_avl_lock);
2115 	ifp = avl_find(&ips->ips_avl_by_name, (void *)name, NULL);
2116 	if (ifp != NULL)
2117 		*idp = (uint_t)ifp->if_index;
2118 
2119 	/*
2120 	 * Shared instance zone?
2121 	 */
2122 	if (netstackid_to_zoneid(zoneid_to_netstackid(zoneid)) != zoneid) {
2123 		uintptr_t key[2] = { zoneid, (uintptr_t)name };
2124 
2125 		ifp = avl_find(&ips->ips_avl_by_shared, (void *)key, NULL);
2126 		if (ifp != NULL)
2127 			*idp = (uint_t)ifp->if_index;
2128 	}
2129 
2130 	mutex_exit(&ips->ips_avl_lock);
2131 	ipnet_rele(ips);
2132 
2133 	if (ifp == NULL)
2134 		return (ESRCH);
2135 	return (0);
2136 }
2137 
2138 /*
2139  * Strictly speaking, there is no such thing as a "client" in ipnet, like
2140  * there is in mac. BPF only needs to have this because it is required as
2141  * part of interfacing correctly with mac. The reuse of the original
2142  * ipnetif_t as a client poses no danger, so long as it is done with its
2143  * own ref-count'd hold that is given up on close.
2144  */
2145 int
2146 ipnet_client_open(ipnetif_t *ptr, ipnetif_t **result)
2147 {
2148 	ASSERT(ptr != NULL);
2149 	ASSERT(result != NULL);
2150 	ipnetif_refhold(ptr);
2151 	*result = ptr;
2152 
2153 	return (0);
2154 }
2155 
2156 void
2157 ipnet_client_close(ipnetif_t *ptr)
2158 {
2159 	ASSERT(ptr != NULL);
2160 	ipnetif_refrele(ptr);
2161 }
2162 
2163 /*
2164  * This is called from BPF when it needs to start receiving packets
2165  * from ipnet.
2166  *
2167  * The use of the ipnet_t structure here is somewhat lightweight when
2168  * compared to how it is used elsewhere but it already has all of the
2169  * right fields in it, so reuse here doesn't seem out of order. Its
2170  * primary purpose here is to provide the means to store pointers for
2171  * use when ipnet_promisc_remove() needs to be called.
2172  *
2173  * This should never be called for the IPNET_MINOR_LO device as it is
2174  * never created via ipnetif_create.
2175  */
2176 /*ARGSUSED*/
2177 int
2178 ipnet_promisc_add(void *handle, uint_t how, void *data, uintptr_t *mhandle,
2179     int flags)
2180 {
2181 	ip_stack_t	*ipst;
2182 	netstack_t	*ns;
2183 	ipnetif_t	*ifp;
2184 	ipnet_t		*ipnet;
2185 	char		name[32];
2186 	int		error;
2187 
2188 	ifp = (ipnetif_t *)handle;
2189 
2190 	if (how != DL_PROMISC_PHYS && how != DL_PROMISC_MULTI)
2191 		return (EINVAL);
2192 
2193 	ns = netstack_find_by_zoneid(ifp->if_zoneid);
2194 
2195 	if ((error = ipnet_join_allmulti(ifp, ns->netstack_ipnet)) != 0) {
2196 		netstack_rele(ns);
2197 		return (error);
2198 	}
2199 
2200 	ipnet = kmem_zalloc(sizeof (*ipnet), KM_SLEEP);
2201 	ipnet->ipnet_if = ifp;
2202 	ipnet->ipnet_ns = ns;
2203 	ipnet->ipnet_flags = flags;
2204 
2205 	if ((ifp->if_flags & IPNETIF_LOOPBACK) != 0) {
2206 		ipnet->ipnet_acceptfn = ipnet_loaccept;
2207 	} else {
2208 		ipnet->ipnet_acceptfn = ipnet_accept;
2209 	}
2210 
2211 	/*
2212 	 * To register multiple hooks with the same callback function,
2213 	 * a unique name is needed.
2214 	 */
2215 	HOOK_INIT(ipnet->ipnet_hook, ipnet_bpf_bounce, "", ipnet);
2216 	(void) snprintf(name, sizeof (name), "ipnet_promisc_%p",
2217 	    (void *)ipnet->ipnet_hook);
2218 	ipnet->ipnet_hook->h_name = strdup(name);
2219 	ipnet->ipnet_data = data;
2220 	ipnet->ipnet_zoneid = ifp->if_zoneid;
2221 
2222 	ipst = ns->netstack_ip;
2223 
2224 	error = net_hook_register(ipst->ips_ip4_observe_pr, NH_OBSERVE,
2225 	    ipnet->ipnet_hook);
2226 	if (error != 0)
2227 		goto regfail;
2228 
2229 	error = net_hook_register(ipst->ips_ip6_observe_pr, NH_OBSERVE,
2230 	    ipnet->ipnet_hook);
2231 	if (error != 0) {
2232 		(void) net_hook_unregister(ipst->ips_ip4_observe_pr,
2233 		    NH_OBSERVE, ipnet->ipnet_hook);
2234 		goto regfail;
2235 	}
2236 
2237 	*mhandle = (uintptr_t)ipnet;
2238 	netstack_rele(ns);
2239 
2240 	return (0);
2241 
2242 regfail:
2243 	cmn_err(CE_WARN, "net_hook_register failed: %d", error);
2244 	strfree(ipnet->ipnet_hook->h_name);
2245 	hook_free(ipnet->ipnet_hook);
2246 	netstack_rele(ns);
2247 	return (error);
2248 }
2249 
2250 void
2251 ipnet_promisc_remove(void *data)
2252 {
2253 	ip_stack_t	*ipst;
2254 	ipnet_t		*ipnet;
2255 	hook_t		*hook;
2256 
2257 	ipnet = data;
2258 	ipst = ipnet->ipnet_ns->netstack_ip;
2259 	hook = ipnet->ipnet_hook;
2260 
2261 	VERIFY(net_hook_unregister(ipst->ips_ip4_observe_pr, NH_OBSERVE,
2262 	    hook) == 0);
2263 
2264 	VERIFY(net_hook_unregister(ipst->ips_ip6_observe_pr, NH_OBSERVE,
2265 	    hook) == 0);
2266 
2267 	strfree(hook->h_name);
2268 
2269 	hook_free(hook);
2270 
2271 	kmem_free(ipnet, sizeof (*ipnet));
2272 }
2273 
2274 /*
2275  * arg here comes from the ipnet_t allocated in ipnet_promisc_add.
2276  * An important field from that structure is "ipnet_data" that
2277  * contains the "data" pointer passed into ipnet_promisc_add: it needs
2278  * to be passed back to bpf when we call into ipnet_itap.
2279  *
2280  * ipnet_itap is set by ipnet_set_bpfattach, which in turn is called
2281  * from BPF.
2282  */
2283 /*ARGSUSED*/
2284 static int
2285 ipnet_bpf_bounce(hook_event_token_t token, hook_data_t info, void *arg)
2286 {
2287 	hook_pkt_observe_t	*hdr;
2288 	ipnet_addrp_t		src;
2289 	ipnet_addrp_t		dst;
2290 	ipnet_stack_t		*ips;
2291 	ipnet_t			*ipnet;
2292 	mblk_t			*netmp;
2293 	mblk_t			*mp;
2294 
2295 	hdr = (hook_pkt_observe_t *)info;
2296 	mp = hdr->hpo_pkt;
2297 	ipnet = (ipnet_t *)arg;
2298 	ips = ((netstack_t *)hdr->hpo_ctx)->netstack_ipnet;
2299 
2300 	netmp = hdr->hpo_pkt->b_cont;
2301 	src.iap_family = hdr->hpo_family;
2302 	dst.iap_family = hdr->hpo_family;
2303 
2304 	if (hdr->hpo_family == AF_INET) {
2305 		src.iap_addr4 = &((ipha_t *)(netmp->b_rptr))->ipha_src;
2306 		dst.iap_addr4 = &((ipha_t *)(netmp->b_rptr))->ipha_dst;
2307 	} else {
2308 		src.iap_addr6 = &((ip6_t *)(netmp->b_rptr))->ip6_src;
2309 		dst.iap_addr6 = &((ip6_t *)(netmp->b_rptr))->ip6_dst;
2310 	}
2311 
2312 	if (!(*ipnet->ipnet_acceptfn)(ipnet, hdr, &src, &dst)) {
2313 		IPSK_BUMP(ips, ik_acceptFail);
2314 		return (0);
2315 	}
2316 	IPSK_BUMP(ips, ik_acceptOk);
2317 
2318 	ipnet_itap(ipnet->ipnet_data, mp,
2319 	    hdr->hpo_htype == htons(IPOBS_HOOK_OUTBOUND),
2320 	    ntohl(hdr->hpo_pktlen) + MBLKL(mp));
2321 
2322 	return (0);
2323 }
2324 
2325 /*
2326  * clone'd ipnetif_t's are created when a shared IP instance zone comes
2327  * to life and configures an IP address. The model that BPF uses is that
2328  * each interface must have a unique pointer and each interface must be
2329  * representative of what it can capture. They are limited to one DLT
2330  * per interface and one zone per interface. Thus every interface that
2331  * can be seen in a zone must be announced via an attach to bpf. For
2332  * shared instance zones, this means the ipnet driver needs to detect
2333  * when an address is added to an interface in a zone for the first
2334  * time (and also when the last address is removed.)
2335  */
2336 static ipnetif_t *
2337 ipnetif_clone_create(ipnetif_t *ifp, zoneid_t zoneid)
2338 {
2339 	uintptr_t	key[2] = { zoneid, (uintptr_t)ifp->if_name };
2340 	ipnet_stack_t	*ips = ifp->if_stackp;
2341 	avl_index_t	where = 0;
2342 	ipnetif_t	*newif;
2343 
2344 	mutex_enter(&ips->ips_avl_lock);
2345 	newif = avl_find(&ips->ips_avl_by_shared, (void *)key, &where);
2346 	if (newif != NULL) {
2347 		ipnetif_refhold(newif);
2348 		newif->if_sharecnt++;
2349 		mutex_exit(&ips->ips_avl_lock);
2350 		return (newif);
2351 	}
2352 
2353 	newif = ipnet_alloc_if(ips);
2354 	if (newif == NULL) {
2355 		mutex_exit(&ips->ips_avl_lock);
2356 		return (NULL);
2357 	}
2358 
2359 	newif->if_refcnt = 1;
2360 	newif->if_sharecnt = 1;
2361 	newif->if_zoneid = zoneid;
2362 	(void) strlcpy(newif->if_name, ifp->if_name, LIFNAMSIZ);
2363 	newif->if_flags = ifp->if_flags & IPNETIF_LOOPBACK;
2364 	newif->if_index = ifp->if_index;
2365 
2366 	avl_insert(&ips->ips_avl_by_shared, newif, where);
2367 	mutex_exit(&ips->ips_avl_lock);
2368 
2369 	return (newif);
2370 }
2371 
2372 static void
2373 ipnetif_clone_release(ipnetif_t *ipnetif)
2374 {
2375 	boolean_t	dofree = B_FALSE;
2376 	boolean_t	doremove = B_FALSE;
2377 	ipnet_stack_t	*ips = ipnetif->if_stackp;
2378 
2379 	mutex_enter(&ipnetif->if_reflock);
2380 	ASSERT(ipnetif->if_refcnt > 0);
2381 	if (--ipnetif->if_refcnt == 0)
2382 		dofree = B_TRUE;
2383 	ASSERT(ipnetif->if_sharecnt > 0);
2384 	if (--ipnetif->if_sharecnt == 0)
2385 		doremove = B_TRUE;
2386 	mutex_exit(&ipnetif->if_reflock);
2387 	if (doremove) {
2388 		mutex_enter(&ips->ips_avl_lock);
2389 		avl_remove(&ips->ips_avl_by_shared, ipnetif);
2390 		mutex_exit(&ips->ips_avl_lock);
2391 	}
2392 	if (dofree) {
2393 		ASSERT(ipnetif->if_sharecnt == 0);
2394 		ipnetif_free(ipnetif);
2395 	}
2396 }
2397