xref: /illumos-gate/usr/src/uts/common/inet/ipnet/ipnet.c (revision 4c28a617e3922d92a58e813a5b955eb526b9c386)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  * Copyright (c) 2016 by Delphix. All rights reserved.
26  */
27 
28 /*
29  * Copyright (c) 2016, Joyent, Inc. All rights reserved.
30  */
31 
32 /*
33  * The ipnet device defined here provides access to packets at the IP layer. To
34  * provide access to packets at this layer it registers a callback function in
35  * the ip module and when there are open instances of the device ip will pass
36  * packets into the device. Packets from ip are passed on the input, output and
37  * loopback paths. Internally the module returns to ip as soon as possible by
38  * deferring processing using a taskq.
39  *
40  * Management of the devices in /dev/ipnet/ is handled by the devname
41  * filesystem and use of the neti interfaces.  This module registers for NIC
42  * events using the neti framework so that when IP interfaces are brought up,
43  * taken down etc. the ipnet module is notified and its view of the interfaces
44  * configured on the system adjusted.  On attach, the module gets an initial
45  * view of the system again using the neti framework but as it has already
46  * registered for IP interface events, it is still up-to-date with any changes.
47  */
48 
49 #include <sys/types.h>
50 #include <sys/conf.h>
51 #include <sys/cred.h>
52 #include <sys/stat.h>
53 #include <sys/ddi.h>
54 #include <sys/sunddi.h>
55 #include <sys/modctl.h>
56 #include <sys/dlpi.h>
57 #include <sys/strsun.h>
58 #include <sys/id_space.h>
59 #include <sys/kmem.h>
60 #include <sys/mkdev.h>
61 #include <sys/neti.h>
62 #include <net/if.h>
63 #include <sys/errno.h>
64 #include <sys/list.h>
65 #include <sys/ksynch.h>
66 #include <sys/hook_event.h>
67 #include <sys/sdt.h>
68 #include <sys/stropts.h>
69 #include <sys/sysmacros.h>
70 #include <inet/ip.h>
71 #include <inet/ip_if.h>
72 #include <inet/ip_multi.h>
73 #include <inet/ip6.h>
74 #include <inet/ipnet.h>
75 #include <net/bpf.h>
76 #include <net/bpfdesc.h>
77 #include <net/dlt.h>
78 
/*
 * STREAMS module information for the ipnet driver.  Both queue
 * initialization structures below (ipnet_rinit and ipnet_winit) point at
 * this single instance.
 */
79 static struct module_info ipnet_minfo = {
80 	1,		/* mi_idnum: module ID number */
81 	"ipnet",	/* mi_idname: module name */
82 	0,		/* mi_minpsz: minimum packet size */
83 	INFPSZ,		/* mi_maxpsz: maximum packet size (no limit) */
84 	2048,		/* mi_hiwat: flow-control high-water mark */
85 	0		/* mi_lowat: flow-control low-water mark */
86 };
87 
88 /*
89  * List to hold static view of ipnetif_t's on the system. This is needed to
90  * avoid holding the lock protecting the avl tree of ipnetif's over the
91  * callback into the dev filesystem.
92  */
93 typedef struct ipnetif_cbdata {
94 	char		ic_ifname[LIFNAMSIZ];	/* interface name */
95 	dev_t		ic_dev;		/* device number (presumably of */
					/* the interface's ipnet node) */
96 	list_node_t	ic_next;	/* linkage within the snapshot list */
97 } ipnetif_cbdata_t;
98 
99 /*
100  * Convenience enumerated type for ipnet_accept().  It describes the
101  * properties of a given ipnet_addrp_t relative to a single ipnet_t
102  * client stream.  The values represent whether the address is ...
103  */
/* Values are produced by ipnet_get_addrtype(). */
104 typedef enum {
105 	IPNETADDR_MYADDR,	/* an address on my ipnetif_t. */
106 	IPNETADDR_MBCAST,	/* a multicast or broadcast address */
				/* (including a subnet broadcast address */
				/* configured on my ipnetif_t). */
107 	IPNETADDR_UNKNOWN	/* none of the above. */
108 } ipnet_addrtype_t;
109 
110 /* Argument used for the ipnet_nicevent_taskq callback. */
/*
 * NOTE(review): the producer (ipnet_nicevent_cb) and consumer
 * (ipnet_nicevent_task) are not visible in this part of the file; the field
 * descriptions below follow from the field names and types only.
 */
111 typedef struct ipnet_nicevent_s {
112 	nic_event_t		ipne_event;	/* which NIC event occurred */
113 	net_handle_t		ipne_protocol;	/* protocol handle for event */
114 	netstackid_t		ipne_stackid;	/* netstack the event is for */
115 	uint64_t		ipne_ifindex;	/* interface index */
116 	uint64_t		ipne_lifindex;	/* logical interface index */
117 	char			ipne_ifname[LIFNAMSIZ];	/* interface name */
118 } ipnet_nicevent_t;
119 
/*
 * Module-wide state.  ipnet_dip is set in ipnet_attach() and cleared in
 * ipnet_detach(); ipnet_major, the two taskqs and ipnet_minor_space are
 * set up in _init() and torn down in _fini().
 */
120 static dev_info_t	*ipnet_dip;
121 static major_t		ipnet_major;
122 static ddi_taskq_t	*ipnet_taskq;		/* taskq for packets */
123 static ddi_taskq_t	*ipnet_nicevent_taskq;	/* taskq for NIC events */
124 static id_space_t	*ipnet_minor_space;	/* minors handed out by open */
125 static const int	IPNET_MINOR_LO = 1; 	/* minor for /dev/ipnet/lo0 */
126 static const int 	IPNET_MINOR_MIN = 2; 	/* start of dynamic minors */
127 static dl_info_ack_t	ipnet_infoack = IPNET_INFO_ACK_INIT;
						/* DL_INFO_ACK reply template */
128 static ipnet_acceptfn_t	ipnet_accept, ipnet_loaccept;
129 static bpf_itap_fn_t	ipnet_itap;
130 
/*
 * Forward declarations of the file-local functions.
 */
131 static void	ipnet_input(mblk_t *);
132 static int	ipnet_wput(queue_t *, mblk_t *);
133 static int	ipnet_rsrv(queue_t *);
134 static int	ipnet_open(queue_t *, dev_t *, int, int, cred_t *);
135 static int	ipnet_close(queue_t *);
136 static void	ipnet_ioctl(queue_t *, mblk_t *);
137 static void	ipnet_iocdata(queue_t *, mblk_t *);
138 static void 	ipnet_wputnondata(queue_t *, mblk_t *);
139 static int	ipnet_attach(dev_info_t *, ddi_attach_cmd_t);
140 static int	ipnet_detach(dev_info_t *, ddi_detach_cmd_t);
141 static int	ipnet_devinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
142 static void	ipnet_inforeq(queue_t *q, mblk_t *mp);
143 static void	ipnet_bindreq(queue_t *q, mblk_t *mp);
144 static void	ipnet_unbindreq(queue_t *q, mblk_t *mp);
145 static void	ipnet_dlpromisconreq(queue_t *q, mblk_t *mp);
146 static void	ipnet_dlpromiscoffreq(queue_t *q, mblk_t *mp);
147 static int	ipnet_join_allmulti(ipnetif_t *, ipnet_stack_t *);
148 static void	ipnet_leave_allmulti(ipnetif_t *, ipnet_stack_t *);
149 static int	ipnet_nicevent_cb(hook_event_token_t, hook_data_t, void *);
150 static void	ipnet_nicevent_task(void *);
151 static ipnetif_t *ipnetif_create(const char *, uint64_t, ipnet_stack_t *,
152     uint64_t);
153 static void	ipnetif_remove(ipnetif_t *, ipnet_stack_t *);
154 static ipnetif_addr_t *ipnet_match_lif(ipnetif_t *, lif_if_t, boolean_t);
155 static ipnetif_t *ipnetif_getby_index(uint64_t, ipnet_stack_t *);
156 static ipnetif_t *ipnetif_getby_dev(dev_t, ipnet_stack_t *);
157 static boolean_t ipnetif_in_zone(ipnetif_t *, zoneid_t, ipnet_stack_t *);
158 static void	ipnetif_zonecheck(ipnetif_t *, ipnet_stack_t *);
159 static int	ipnet_populate_if(net_handle_t, ipnet_stack_t *, boolean_t);
160 static int 	ipnetif_compare_name(const void *, const void *);
161 static int 	ipnetif_compare_name_zone(const void *, const void *);
162 static int 	ipnetif_compare_index(const void *, const void *);
163 static void	ipnet_add_ifaddr(uint64_t, ipnetif_t *, net_handle_t);
164 static void	ipnet_delete_ifaddr(ipnetif_addr_t *, ipnetif_t *, boolean_t);
165 static void	ipnetif_refhold(ipnetif_t *);
166 static void	ipnetif_refrele(ipnetif_t *);
167 static void	ipnet_walkers_inc(ipnet_stack_t *);
168 static void	ipnet_walkers_dec(ipnet_stack_t *);
169 static void	ipnet_register_netihook(ipnet_stack_t *);
170 static void	*ipnet_stack_init(netstackid_t, netstack_t *);
171 static void	ipnet_stack_fini(netstackid_t, void *);
172 static void	ipnet_dispatch(void *);
173 static int	ipobs_bounce_func(hook_event_token_t, hook_data_t, void *);
174 static int	ipnet_bpf_bounce(hook_event_token_t, hook_data_t, void *);
175 static ipnetif_t *ipnetif_clone_create(ipnetif_t *, zoneid_t);
176 static void	ipnetif_clone_release(ipnetif_t *);
177 
/*
 * Read-side queue initialization.  There is no read put procedure; messages
 * placed on the read queue are forwarded upstream by the service procedure,
 * ipnet_rsrv().  The open and close entry points are registered here.
 */
178 static struct qinit ipnet_rinit = {
179 	NULL,		/* qi_putp */
180 	ipnet_rsrv,	/* qi_srvp */
181 	ipnet_open,	/* qi_qopen */
182 	ipnet_close,	/* qi_qclose */
183 	NULL,		/* qi_qadmin */
184 	&ipnet_minfo,	/* qi_minfo */
185 };
186 
/*
 * Write-side queue initialization.  Every downstream message is handled
 * directly in the put procedure, ipnet_wput(); there is no write service
 * procedure.
 */
187 static struct qinit ipnet_winit = {
188 	ipnet_wput,	/* qi_putp */
189 	NULL,		/* qi_srvp */
190 	NULL,		/* qi_qopen */
191 	NULL,		/* qi_qclose */
192 	NULL,		/* qi_qadmin */
193 	&ipnet_minfo,	/* qi_minfo */
194 };
195 
/*
 * STREAMS entry-point table: read-side qinit followed by write-side qinit.
 * It is handed to DDI_DEFINE_STREAM_OPS() below.
 */
196 static struct streamtab ipnet_info = {
197 	&ipnet_rinit, &ipnet_winit
198 };
199 
/*
 * Define the driver operations vector, ipnet_ops, for this STREAMS driver.
 * ipnet_attach(), ipnet_detach() and ipnet_devinfo() are the real entry
 * points; the remaining slots are filled with nulldev/nodev, and quiesce is
 * declared unsupported.
 */
200 DDI_DEFINE_STREAM_OPS(ipnet_ops, nulldev, nulldev, ipnet_attach,
201     ipnet_detach, nodev, ipnet_devinfo, D_MP | D_MTPERMOD, &ipnet_info,
202     ddi_quiesce_not_supported);
203 
/* Loadable-module linkage for a device driver, wrapping ipnet_ops. */
204 static struct modldrv modldrv = {
205 	&mod_driverops,			/* this module is a driver */
206 	"STREAMS ipnet driver",		/* module description string */
207 	&ipnet_ops			/* driver operations vector */
208 };
209 
/*
 * Module linkage passed to mod_install(), mod_remove() and mod_info() by
 * _init(), _fini() and _info() respectively.
 */
210 static struct modlinkage modlinkage = {
211 	MODREV_1, &modldrv, NULL
212 };
213 
214 /*
215  * This structure contains the template data (names and type) that is
216  * copied, in bulk, into the new kstats structure created by net_kstat_create.
217  * No actual statistical information is stored in this instance of the
218  * ipnet_kstats_t structure.
219  */
/*
 * The initializers are positional, so their order must match the member
 * order of ipnet_kstats_t (declared outside this file).
 */
220 static ipnet_kstats_t stats_template = {
221 	{ "duplicationFail",	KSTAT_DATA_UINT64 },
222 	{ "dispatchOk",		KSTAT_DATA_UINT64 },
223 	{ "dispatchFail",	KSTAT_DATA_UINT64 },
224 	{ "dispatchHeaderDrop",	KSTAT_DATA_UINT64 },
225 	{ "dispatchDupDrop",	KSTAT_DATA_UINT64 },
226 	{ "dispatchDeliver",	KSTAT_DATA_UINT64 },
227 	{ "acceptOk",		KSTAT_DATA_UINT64 },
228 	{ "acceptFail",		KSTAT_DATA_UINT64 }
229 };
230 
231 /*
232  * Walk the list of physical interfaces on the machine, for each
233  * interface create a new ipnetif_t and add any addresses to it. We
234  * need to do the walk twice, once for IPv4 and once for IPv6.
235  *
236  * The interfaces are destroyed as part of ipnet_stack_fini() for each
237  * stack.  Note that we cannot do this initialization in
238  * ipnet_stack_init(), since ipnet_stack_init() cannot fail.
239  */
240 static int
241 ipnetif_init(void)
242 {
243 	netstack_handle_t	nh;
244 	netstack_t		*ns;
245 	ipnet_stack_t		*ips;
246 	int			ret = 0;
247 
248 	netstack_next_init(&nh);
249 	while ((ns = netstack_next(&nh)) != NULL) {
250 		ips = ns->netstack_ipnet;
251 		if ((ret = ipnet_populate_if(ips->ips_ndv4, ips, B_FALSE)) == 0)
252 			ret = ipnet_populate_if(ips->ips_ndv6, ips, B_TRUE);
253 		netstack_rele(ns);
254 		if (ret != 0)
255 			break;
256 	}
257 	netstack_next_fini(&nh);
258 	return (ret);
259 }
260 
261 /*
262  * Standard module entry points.
263  */
264 int
265 _init(void)
266 {
267 	int		ret;
268 	boolean_t	netstack_registered = B_FALSE;
269 
270 	if ((ipnet_major = ddi_name_to_major("ipnet")) == (major_t)-1)
271 		return (ENODEV);
272 	ipnet_minor_space = id_space_create("ipnet_minor_space",
273 	    IPNET_MINOR_MIN, MAXMIN32);
274 
275 	/*
276 	 * We call ddi_taskq_create() with nthread == 1 to ensure in-order
277 	 * delivery of packets to clients.  Note that we need to create the
278 	 * taskqs before calling netstack_register() since ipnet_stack_init()
279 	 * registers callbacks that use 'em.
280 	 */
281 	ipnet_taskq = ddi_taskq_create(NULL, "ipnet", 1, TASKQ_DEFAULTPRI, 0);
282 	ipnet_nicevent_taskq = ddi_taskq_create(NULL, "ipnet_nic_event_queue",
283 	    1, TASKQ_DEFAULTPRI, 0);
284 	if (ipnet_taskq == NULL || ipnet_nicevent_taskq == NULL) {
285 		ret = ENOMEM;
286 		goto done;
287 	}
288 
289 	netstack_register(NS_IPNET, ipnet_stack_init, NULL, ipnet_stack_fini);
290 	netstack_registered = B_TRUE;
291 
292 	if ((ret = ipnetif_init()) == 0)
293 		ret = mod_install(&modlinkage);
294 done:
295 	if (ret != 0) {
296 		if (ipnet_taskq != NULL)
297 			ddi_taskq_destroy(ipnet_taskq);
298 		if (ipnet_nicevent_taskq != NULL)
299 			ddi_taskq_destroy(ipnet_nicevent_taskq);
300 		if (netstack_registered)
301 			netstack_unregister(NS_IPNET);
302 		id_space_destroy(ipnet_minor_space);
303 	}
304 	return (ret);
305 }
306 
307 int
308 _fini(void)
309 {
310 	int	err;
311 
312 	if ((err = mod_remove(&modlinkage)) != 0)
313 		return (err);
314 
315 	netstack_unregister(NS_IPNET);
316 	ddi_taskq_destroy(ipnet_nicevent_taskq);
317 	ddi_taskq_destroy(ipnet_taskq);
318 	id_space_destroy(ipnet_minor_space);
319 	return (0);
320 }
321 
322 int
323 _info(struct modinfo *modinfop)
324 {
325 	return (mod_info(&modlinkage, modinfop));
326 }
327 
328 static void
329 ipnet_register_netihook(ipnet_stack_t *ips)
330 {
331 	int		ret;
332 	zoneid_t	zoneid;
333 	netid_t		netid;
334 
335 	HOOK_INIT(ips->ips_nicevents, ipnet_nicevent_cb, "ipnet_nicevents",
336 	    ips);
337 
338 	/*
339 	 * It is possible for an exclusive stack to be in the process of
340 	 * shutting down here, and the netid and protocol lookups could fail
341 	 * in that case.
342 	 */
343 	zoneid = netstackid_to_zoneid(ips->ips_netstack->netstack_stackid);
344 	if ((netid = net_zoneidtonetid(zoneid)) == -1)
345 		return;
346 
347 	if ((ips->ips_ndv4 = net_protocol_lookup(netid, NHF_INET)) != NULL) {
348 		if ((ret = net_hook_register(ips->ips_ndv4, NH_NIC_EVENTS,
349 		    ips->ips_nicevents)) != 0) {
350 			VERIFY(net_protocol_release(ips->ips_ndv4) == 0);
351 			ips->ips_ndv4 = NULL;
352 			cmn_err(CE_WARN, "unable to register IPv4 netinfo hooks"
353 			    " in zone %d: %d", zoneid, ret);
354 		}
355 	}
356 	if ((ips->ips_ndv6 = net_protocol_lookup(netid, NHF_INET6)) != NULL) {
357 		if ((ret = net_hook_register(ips->ips_ndv6, NH_NIC_EVENTS,
358 		    ips->ips_nicevents)) != 0) {
359 			VERIFY(net_protocol_release(ips->ips_ndv6) == 0);
360 			ips->ips_ndv6 = NULL;
361 			cmn_err(CE_WARN, "unable to register IPv6 netinfo hooks"
362 			    " in zone %d: %d", zoneid, ret);
363 		}
364 	}
365 
366 	/*
367 	 * Create a local set of kstats for each zone.
368 	 */
369 	ips->ips_kstatp = net_kstat_create(netid, "ipnet", 0, "ipnet_stats",
370 	    "misc", KSTAT_TYPE_NAMED,
371 	    sizeof (ipnet_kstats_t) / sizeof (kstat_named_t), 0);
372 	if (ips->ips_kstatp != NULL) {
373 		bcopy(&stats_template, &ips->ips_stats,
374 		    sizeof (ips->ips_stats));
375 		ips->ips_kstatp->ks_data = &ips->ips_stats;
376 		ips->ips_kstatp->ks_private =
377 		    (void *)(uintptr_t)ips->ips_netstack->netstack_stackid;
378 		kstat_install(ips->ips_kstatp);
379 	} else {
380 		cmn_err(CE_WARN, "net_kstat_create(%s,%s,%s) failed",
381 		    "ipnet", "ipnet_stats", "misc");
382 	}
383 }
384 
385 /*
386  * This function is called on attach to build an initial view of the
387  * interfaces on the system. It will be called once for IPv4 and once
388  * for IPv6, although there is only one ipnet interface for both IPv4
389  * and IPv6 there are separate address lists.
390  */
391 static int
392 ipnet_populate_if(net_handle_t nd, ipnet_stack_t *ips, boolean_t isv6)
393 {
394 	phy_if_t	phyif;
395 	lif_if_t	lif;
396 	ipnetif_t	*ipnetif;
397 	char		name[LIFNAMSIZ];
398 	boolean_t	new_if = B_FALSE;
399 	uint64_t	ifflags;
400 	int		ret = 0;
401 
402 	/*
403 	 * If ipnet_register_netihook() was unable to initialize this
404 	 * stack's net_handle_t, then we cannot populate any interface
405 	 * information.  This usually happens when we attempted to
406 	 * grab a net_handle_t as a stack was shutting down.  We don't
407 	 * want to fail the entire _init() operation because of a
408 	 * stack shutdown (other stacks will continue to work just
409 	 * fine), so we silently return success here.
410 	 */
411 	if (nd == NULL)
412 		return (0);
413 
414 	/*
415 	 * Make sure we're not processing NIC events during the
416 	 * population of our interfaces and address lists.
417 	 */
418 	mutex_enter(&ips->ips_event_lock);
419 
420 	for (phyif = net_phygetnext(nd, 0); phyif != 0;
421 	    phyif = net_phygetnext(nd, phyif)) {
422 		if (net_getifname(nd, phyif, name, LIFNAMSIZ) != 0)
423 			continue;
424 		ifflags =  0;
425 		(void) net_getlifflags(nd, phyif, 0, &ifflags);
426 		if ((ipnetif = ipnetif_getby_index(phyif, ips)) == NULL) {
427 			ipnetif = ipnetif_create(name, phyif, ips, ifflags);
428 			if (ipnetif == NULL) {
429 				ret = ENOMEM;
430 				goto done;
431 			}
432 			new_if = B_TRUE;
433 		}
434 		ipnetif->if_flags |=
435 		    isv6 ? IPNETIF_IPV6PLUMBED : IPNETIF_IPV4PLUMBED;
436 
437 		for (lif = net_lifgetnext(nd, phyif, 0); lif != 0;
438 		    lif = net_lifgetnext(nd, phyif, lif)) {
439 			/*
440 			 * Skip addresses that aren't up.  We'll add
441 			 * them when we receive an NE_LIF_UP event.
442 			 */
443 			if (net_getlifflags(nd, phyif, lif, &ifflags) != 0 ||
444 			    !(ifflags & IFF_UP))
445 				continue;
446 			/* Don't add it if we already have it. */
447 			if (ipnet_match_lif(ipnetif, lif, isv6) != NULL)
448 				continue;
449 			ipnet_add_ifaddr(lif, ipnetif, nd);
450 		}
451 		if (!new_if)
452 			ipnetif_refrele(ipnetif);
453 	}
454 
455 done:
456 	mutex_exit(&ips->ips_event_lock);
457 	return (ret);
458 }
459 
460 static int
461 ipnet_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
462 {
463 	if (cmd != DDI_ATTACH)
464 		return (DDI_FAILURE);
465 
466 	if (ddi_create_minor_node(dip, "lo0", S_IFCHR, IPNET_MINOR_LO,
467 	    DDI_PSEUDO, 0) == DDI_FAILURE)
468 		return (DDI_FAILURE);
469 
470 	ipnet_dip = dip;
471 	return (DDI_SUCCESS);
472 }
473 
474 static int
475 ipnet_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
476 {
477 	if (cmd != DDI_DETACH)
478 		return (DDI_FAILURE);
479 
480 	ASSERT(dip == ipnet_dip);
481 	ddi_remove_minor_node(ipnet_dip, NULL);
482 	ipnet_dip = NULL;
483 	return (DDI_SUCCESS);
484 }
485 
486 /* ARGSUSED */
487 static int
488 ipnet_devinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
489 {
490 	int	error = DDI_FAILURE;
491 
492 	switch (infocmd) {
493 	case DDI_INFO_DEVT2INSTANCE:
494 		*result = (void *)0;
495 		error = DDI_SUCCESS;
496 		break;
497 	case DDI_INFO_DEVT2DEVINFO:
498 		if (ipnet_dip != NULL) {
499 			*result = ipnet_dip;
500 			error = DDI_SUCCESS;
501 		}
502 		break;
503 	}
504 	return (error);
505 }
506 
507 /* ARGSUSED */
508 static int
509 ipnet_open(queue_t *rq, dev_t *dev, int oflag, int sflag, cred_t *crp)
510 {
511 	ipnet_t		*ipnet;
512 	netstack_t	*ns = NULL;
513 	ipnet_stack_t	*ips;
514 	int		err = 0;
515 	zoneid_t	zoneid = crgetzoneid(crp);
516 
517 	/*
518 	 * If the system is labeled, only the global zone is allowed to open
519 	 * IP observability nodes.
520 	 */
521 	if (is_system_labeled() && zoneid != GLOBAL_ZONEID)
522 		return (EACCES);
523 
524 	/* We don't support open as a module */
525 	if (sflag & MODOPEN)
526 		return (ENOTSUP);
527 
528 	/* This driver is self-cloning, we don't support re-open. */
529 	if (rq->q_ptr != NULL)
530 		return (EBUSY);
531 
532 	if ((ipnet = kmem_zalloc(sizeof (*ipnet), KM_NOSLEEP)) == NULL)
533 		return (ENOMEM);
534 
535 	VERIFY((ns = netstack_find_by_cred(crp)) != NULL);
536 	ips = ns->netstack_ipnet;
537 
538 	rq->q_ptr = WR(rq)->q_ptr = ipnet;
539 	ipnet->ipnet_rq = rq;
540 	ipnet->ipnet_minor = (minor_t)id_alloc(ipnet_minor_space);
541 	ipnet->ipnet_zoneid = zoneid;
542 	ipnet->ipnet_dlstate = DL_UNBOUND;
543 	ipnet->ipnet_ns = ns;
544 
545 	/*
546 	 * We need to hold ips_event_lock here as any NE_LIF_DOWN events need
547 	 * to be processed after ipnet_if is set and the ipnet_t has been
548 	 * inserted in the ips_str_list.
549 	 */
550 	mutex_enter(&ips->ips_event_lock);
551 	if (getminor(*dev) == IPNET_MINOR_LO) {
552 		ipnet->ipnet_flags |= IPNET_LOMODE;
553 		ipnet->ipnet_acceptfn = ipnet_loaccept;
554 	} else {
555 		ipnet->ipnet_acceptfn = ipnet_accept;
556 		ipnet->ipnet_if = ipnetif_getby_dev(*dev, ips);
557 		if (ipnet->ipnet_if == NULL ||
558 		    !ipnetif_in_zone(ipnet->ipnet_if, zoneid, ips)) {
559 			err = ENODEV;
560 			goto done;
561 		}
562 	}
563 
564 	mutex_enter(&ips->ips_walkers_lock);
565 	while (ips->ips_walkers_cnt != 0)
566 		cv_wait(&ips->ips_walkers_cv, &ips->ips_walkers_lock);
567 	list_insert_head(&ips->ips_str_list, ipnet);
568 	*dev = makedevice(getmajor(*dev), ipnet->ipnet_minor);
569 	qprocson(rq);
570 
571 	/*
572 	 * Only register our callback if we're the first open client; we call
573 	 * unregister in close() for the last open client.
574 	 */
575 	if (list_head(&ips->ips_str_list) == list_tail(&ips->ips_str_list))
576 		ips->ips_hook = ipobs_register_hook(ns, ipnet_input);
577 	mutex_exit(&ips->ips_walkers_lock);
578 
579 done:
580 	mutex_exit(&ips->ips_event_lock);
581 	if (err != 0) {
582 		netstack_rele(ns);
583 		id_free(ipnet_minor_space, ipnet->ipnet_minor);
584 		if (ipnet->ipnet_if != NULL)
585 			ipnetif_refrele(ipnet->ipnet_if);
586 		kmem_free(ipnet, sizeof (*ipnet));
587 	}
588 	return (err);
589 }
590 
/*
 * STREAMS close entry point: undoes everything ipnet_open() and the DLPI
 * promiscuous requests set up for this stream, then frees the ipnet_t.
 * Always returns 0.
 */
591 static int
592 ipnet_close(queue_t *rq)
593 {
594 	ipnet_t		*ipnet = rq->q_ptr;
595 	ipnet_stack_t	*ips = ipnet->ipnet_ns->netstack_ipnet;
596 
	/*
	 * Each of the PHYS and MULTI promiscuous flags was set after its
	 * own successful ipnet_join_allmulti() call, so each one that is
	 * still set needs its own matching leave.
	 */
597 	if (ipnet->ipnet_flags & IPNET_PROMISC_PHYS)
598 		ipnet_leave_allmulti(ipnet->ipnet_if, ips);
599 	if (ipnet->ipnet_flags & IPNET_PROMISC_MULTI)
600 		ipnet_leave_allmulti(ipnet->ipnet_if, ips);
601 
	/*
	 * Wait for ips_walkers_cnt to drop to zero before touching the
	 * stream list (presumably the count of in-progress list walks,
	 * maintained by ipnet_walkers_inc()/ipnet_walkers_dec()).
	 */
602 	mutex_enter(&ips->ips_walkers_lock);
603 	while (ips->ips_walkers_cnt != 0)
604 		cv_wait(&ips->ips_walkers_cv, &ips->ips_walkers_lock);
605 
606 	qprocsoff(rq);
607 
608 	list_remove(&ips->ips_str_list, ipnet);
	/* ipnet_if is NULL for streams opened on the loopback node. */
609 	if (ipnet->ipnet_if != NULL)
610 		ipnetif_refrele(ipnet->ipnet_if);
611 	id_free(ipnet_minor_space, ipnet->ipnet_minor);
612 
	/* Last stream on this stack: stop receiving packets from ip. */
613 	if (list_is_empty(&ips->ips_str_list)) {
614 		ipobs_unregister_hook(ips->ips_netstack, ips->ips_hook);
615 		ips->ips_hook = NULL;
616 	}
617 
618 	kmem_free(ipnet, sizeof (*ipnet));
619 
620 	mutex_exit(&ips->ips_walkers_lock);
	/* Release the netstack hold taken in ipnet_open(). */
621 	netstack_rele(ips->ips_netstack);
622 	return (0);
623 }
624 
625 static int
626 ipnet_wput(queue_t *q, mblk_t *mp)
627 {
628 	switch (mp->b_datap->db_type) {
629 	case M_FLUSH:
630 		if (*mp->b_rptr & FLUSHW) {
631 			flushq(q, FLUSHDATA);
632 			*mp->b_rptr &= ~FLUSHW;
633 		}
634 		if (*mp->b_rptr & FLUSHR)
635 			qreply(q, mp);
636 		else
637 			freemsg(mp);
638 		break;
639 	case M_PROTO:
640 	case M_PCPROTO:
641 		ipnet_wputnondata(q, mp);
642 		break;
643 	case M_IOCTL:
644 		ipnet_ioctl(q, mp);
645 		break;
646 	case M_IOCDATA:
647 		ipnet_iocdata(q, mp);
648 		break;
649 	default:
650 		freemsg(mp);
651 		break;
652 	}
653 	return (0);
654 }
655 
656 static int
657 ipnet_rsrv(queue_t *q)
658 {
659 	mblk_t	*mp;
660 
661 	while ((mp = getq(q)) != NULL) {
662 		ASSERT(DB_TYPE(mp) == M_DATA);
663 		if (canputnext(q)) {
664 			putnext(q, mp);
665 		} else {
666 			(void) putbq(q, mp);
667 			break;
668 		}
669 	}
670 	return (0);
671 }
672 
673 static void
674 ipnet_ioctl(queue_t *q, mblk_t *mp)
675 {
676 	struct iocblk	*iocp = (struct iocblk *)mp->b_rptr;
677 
678 	switch (iocp->ioc_cmd) {
679 	case DLIOCRAW:
680 		miocack(q, mp, 0, 0);
681 		break;
682 	case DLIOCIPNETINFO:
683 		if (iocp->ioc_count == TRANSPARENT) {
684 			mcopyin(mp, NULL, sizeof (uint_t), NULL);
685 			qreply(q, mp);
686 			break;
687 		}
688 		/* We don't support I_STR with DLIOCIPNETINFO. */
689 		/* FALLTHROUGH */
690 	default:
691 		miocnak(q, mp, 0, EINVAL);
692 		break;
693 	}
694 }
695 
696 static void
697 ipnet_iocdata(queue_t *q, mblk_t *mp)
698 {
699 	struct iocblk	*iocp = (struct iocblk *)mp->b_rptr;
700 	ipnet_t	*ipnet = q->q_ptr;
701 
702 	switch (iocp->ioc_cmd) {
703 	case DLIOCIPNETINFO:
704 		if (*(int *)mp->b_cont->b_rptr == 1)
705 			ipnet->ipnet_flags |= IPNET_INFO;
706 		else if (*(int *)mp->b_cont->b_rptr == 0)
707 			ipnet->ipnet_flags &= ~IPNET_INFO;
708 		else
709 			goto iocnak;
710 		miocack(q, mp, 0, DL_IPNETINFO_VERSION);
711 		break;
712 	default:
713 iocnak:
714 		miocnak(q, mp, 0, EINVAL);
715 		break;
716 	}
717 }
718 
719 static void
720 ipnet_wputnondata(queue_t *q, mblk_t *mp)
721 {
722 	union DL_primitives	*dlp = (union DL_primitives *)mp->b_rptr;
723 	t_uscalar_t		prim = dlp->dl_primitive;
724 
725 	switch (prim) {
726 	case DL_INFO_REQ:
727 		ipnet_inforeq(q, mp);
728 		break;
729 	case DL_UNBIND_REQ:
730 		ipnet_unbindreq(q, mp);
731 		break;
732 	case DL_BIND_REQ:
733 		ipnet_bindreq(q, mp);
734 		break;
735 	case DL_PROMISCON_REQ:
736 		ipnet_dlpromisconreq(q, mp);
737 		break;
738 	case DL_PROMISCOFF_REQ:
739 		ipnet_dlpromiscoffreq(q, mp);
740 		break;
741 	case DL_UNITDATA_REQ:
742 	case DL_DETACH_REQ:
743 	case DL_PHYS_ADDR_REQ:
744 	case DL_SET_PHYS_ADDR_REQ:
745 	case DL_ENABMULTI_REQ:
746 	case DL_DISABMULTI_REQ:
747 	case DL_ATTACH_REQ:
748 		dlerrorack(q, mp, prim, DL_UNSUPPORTED, 0);
749 		break;
750 	default:
751 		dlerrorack(q, mp, prim, DL_BADPRIM, 0);
752 		break;
753 	}
754 }
755 
756 static void
757 ipnet_inforeq(queue_t *q, mblk_t *mp)
758 {
759 	dl_info_ack_t	*dlip;
760 	size_t		size = sizeof (dl_info_ack_t) + sizeof (ushort_t);
761 
762 	if (MBLKL(mp) < DL_INFO_REQ_SIZE) {
763 		dlerrorack(q, mp, DL_INFO_REQ, DL_BADPRIM, 0);
764 		return;
765 	}
766 
767 	if ((mp = mexchange(q, mp, size, M_PCPROTO, DL_INFO_ACK)) == NULL)
768 		return;
769 
770 	dlip = (dl_info_ack_t *)mp->b_rptr;
771 	*dlip = ipnet_infoack;
772 	qreply(q, mp);
773 }
774 
775 static void
776 ipnet_bindreq(queue_t *q, mblk_t *mp)
777 {
778 	union DL_primitives	*dlp = (union DL_primitives *)mp->b_rptr;
779 	ipnet_t			*ipnet = q->q_ptr;
780 
781 	if (MBLKL(mp) < DL_BIND_REQ_SIZE) {
782 		dlerrorack(q, mp, DL_BIND_REQ, DL_BADPRIM, 0);
783 		return;
784 	}
785 
786 	switch (dlp->bind_req.dl_sap) {
787 	case 0 :
788 		ipnet->ipnet_family = AF_UNSPEC;
789 		break;
790 	case IPV4_VERSION :
791 		ipnet->ipnet_family = AF_INET;
792 		break;
793 	case IPV6_VERSION :
794 		ipnet->ipnet_family = AF_INET6;
795 		break;
796 	default :
797 		dlerrorack(q, mp, DL_BIND_REQ, DL_BADSAP, 0);
798 		return;
799 		/*NOTREACHED*/
800 	}
801 
802 	ipnet->ipnet_dlstate = DL_IDLE;
803 	dlbindack(q, mp, dlp->bind_req.dl_sap, 0, 0, 0, 0);
804 }
805 
806 static void
807 ipnet_unbindreq(queue_t *q, mblk_t *mp)
808 {
809 	ipnet_t	*ipnet = q->q_ptr;
810 
811 	if (MBLKL(mp) < DL_UNBIND_REQ_SIZE) {
812 		dlerrorack(q, mp, DL_UNBIND_REQ, DL_BADPRIM, 0);
813 		return;
814 	}
815 
816 	if (ipnet->ipnet_dlstate != DL_IDLE) {
817 		dlerrorack(q, mp, DL_UNBIND_REQ, DL_OUTSTATE, 0);
818 	} else {
819 		ipnet->ipnet_dlstate = DL_UNBOUND;
820 		ipnet->ipnet_family = AF_UNSPEC;
821 		dlokack(q, mp, DL_UNBIND_REQ);
822 	}
823 }
824 
825 static void
826 ipnet_dlpromisconreq(queue_t *q, mblk_t *mp)
827 {
828 	ipnet_t		*ipnet = q->q_ptr;
829 	t_uscalar_t	level;
830 	int		err;
831 
832 	if (MBLKL(mp) < DL_PROMISCON_REQ_SIZE) {
833 		dlerrorack(q, mp, DL_PROMISCON_REQ, DL_BADPRIM, 0);
834 		return;
835 	}
836 
837 	if (ipnet->ipnet_flags & IPNET_LOMODE) {
838 		dlokack(q, mp, DL_PROMISCON_REQ);
839 		return;
840 	}
841 
842 	level = ((dl_promiscon_req_t *)mp->b_rptr)->dl_level;
843 	if (level == DL_PROMISC_PHYS || level == DL_PROMISC_MULTI) {
844 		if ((err = ipnet_join_allmulti(ipnet->ipnet_if,
845 		    ipnet->ipnet_ns->netstack_ipnet)) != 0) {
846 			dlerrorack(q, mp, DL_PROMISCON_REQ, DL_SYSERR, err);
847 			return;
848 		}
849 	}
850 
851 	switch (level) {
852 	case DL_PROMISC_PHYS:
853 		ipnet->ipnet_flags |= IPNET_PROMISC_PHYS;
854 		break;
855 	case DL_PROMISC_SAP:
856 		ipnet->ipnet_flags |= IPNET_PROMISC_SAP;
857 		break;
858 	case DL_PROMISC_MULTI:
859 		ipnet->ipnet_flags |= IPNET_PROMISC_MULTI;
860 		break;
861 	default:
862 		dlerrorack(q, mp, DL_PROMISCON_REQ, DL_BADPRIM, 0);
863 		return;
864 	}
865 
866 	dlokack(q, mp, DL_PROMISCON_REQ);
867 }
868 
869 static void
870 ipnet_dlpromiscoffreq(queue_t *q, mblk_t *mp)
871 {
872 	ipnet_t		*ipnet = q->q_ptr;
873 	t_uscalar_t	level;
874 	uint16_t	orig_ipnet_flags = ipnet->ipnet_flags;
875 
876 	if (MBLKL(mp) < DL_PROMISCOFF_REQ_SIZE) {
877 		dlerrorack(q, mp, DL_PROMISCOFF_REQ, DL_BADPRIM, 0);
878 		return;
879 	}
880 
881 	if (ipnet->ipnet_flags & IPNET_LOMODE) {
882 		dlokack(q, mp, DL_PROMISCOFF_REQ);
883 		return;
884 	}
885 
886 	level = ((dl_promiscon_req_t *)mp->b_rptr)->dl_level;
887 	switch (level) {
888 	case DL_PROMISC_PHYS:
889 		if (ipnet->ipnet_flags & IPNET_PROMISC_PHYS)
890 			ipnet->ipnet_flags &= ~IPNET_PROMISC_PHYS;
891 		break;
892 	case DL_PROMISC_SAP:
893 		if (ipnet->ipnet_flags & IPNET_PROMISC_SAP)
894 			ipnet->ipnet_flags &= ~IPNET_PROMISC_SAP;
895 		break;
896 	case DL_PROMISC_MULTI:
897 		if (ipnet->ipnet_flags & IPNET_PROMISC_MULTI)
898 			ipnet->ipnet_flags &= ~IPNET_PROMISC_MULTI;
899 		break;
900 	default:
901 		dlerrorack(q, mp, DL_PROMISCOFF_REQ, DL_BADPRIM, 0);
902 		return;
903 	}
904 
905 	if (orig_ipnet_flags == ipnet->ipnet_flags) {
906 		dlerrorack(q, mp, DL_PROMISCOFF_REQ, DL_NOTENAB, 0);
907 		return;
908 	}
909 
910 	if (level == DL_PROMISC_PHYS || level == DL_PROMISC_MULTI) {
911 		ipnet_leave_allmulti(ipnet->ipnet_if,
912 		    ipnet->ipnet_ns->netstack_ipnet);
913 	}
914 
915 	dlokack(q, mp, DL_PROMISCOFF_REQ);
916 }
917 
918 static int
919 ipnet_join_allmulti(ipnetif_t *ipnetif, ipnet_stack_t *ips)
920 {
921 	int		err = 0;
922 	ip_stack_t	*ipst = ips->ips_netstack->netstack_ip;
923 	uint64_t	index = ipnetif->if_index;
924 
925 	mutex_enter(&ips->ips_event_lock);
926 	if (ipnetif->if_multicnt == 0) {
927 		ASSERT((ipnetif->if_flags &
928 		    (IPNETIF_IPV4ALLMULTI | IPNETIF_IPV6ALLMULTI)) == 0);
929 		if (ipnetif->if_flags & IPNETIF_IPV4PLUMBED) {
930 			err = ip_join_allmulti(index, B_FALSE, ipst);
931 			if (err != 0)
932 				goto done;
933 			ipnetif->if_flags |= IPNETIF_IPV4ALLMULTI;
934 		}
935 		if (ipnetif->if_flags & IPNETIF_IPV6PLUMBED) {
936 			err = ip_join_allmulti(index, B_TRUE, ipst);
937 			if (err != 0 &&
938 			    (ipnetif->if_flags & IPNETIF_IPV4ALLMULTI)) {
939 				(void) ip_leave_allmulti(index, B_FALSE, ipst);
940 				ipnetif->if_flags &= ~IPNETIF_IPV4ALLMULTI;
941 				goto done;
942 			}
943 			ipnetif->if_flags |= IPNETIF_IPV6ALLMULTI;
944 		}
945 	}
946 	ipnetif->if_multicnt++;
947 
948 done:
949 	mutex_exit(&ips->ips_event_lock);
950 	return (err);
951 }
952 
953 static void
954 ipnet_leave_allmulti(ipnetif_t *ipnetif, ipnet_stack_t *ips)
955 {
956 	int		err;
957 	ip_stack_t	*ipst = ips->ips_netstack->netstack_ip;
958 	uint64_t	index = ipnetif->if_index;
959 
960 	mutex_enter(&ips->ips_event_lock);
961 	ASSERT(ipnetif->if_multicnt != 0);
962 	if (--ipnetif->if_multicnt == 0) {
963 		if (ipnetif->if_flags & IPNETIF_IPV4ALLMULTI) {
964 			err = ip_leave_allmulti(index, B_FALSE, ipst);
965 			ASSERT(err == 0 || err == ENODEV);
966 			ipnetif->if_flags &= ~IPNETIF_IPV4ALLMULTI;
967 		}
968 		if (ipnetif->if_flags & IPNETIF_IPV6ALLMULTI) {
969 			err = ip_leave_allmulti(index, B_TRUE, ipst);
970 			ASSERT(err == 0 || err == ENODEV);
971 			ipnetif->if_flags &= ~IPNETIF_IPV6ALLMULTI;
972 		}
973 	}
974 	mutex_exit(&ips->ips_event_lock);
975 }
976 
977 /*
978  * Allocate a new mblk_t and put a dl_ipnetinfo_t in it.
979  * The structure it copies the header information from,
980  * hook_pkt_observe_t, is constructed using network byte
981  * order in ipobs_hook(), so there is no conversion here.
982  */
983 static mblk_t *
984 ipnet_addheader(hook_pkt_observe_t *hdr, mblk_t *mp)
985 {
986 	mblk_t		*dlhdr;
987 	dl_ipnetinfo_t	*dl;
988 
989 	if ((dlhdr = allocb(sizeof (dl_ipnetinfo_t), BPRI_HI)) == NULL) {
990 		freemsg(mp);
991 		return (NULL);
992 	}
993 	dl = (dl_ipnetinfo_t *)dlhdr->b_rptr;
994 	dl->dli_version = DL_IPNETINFO_VERSION;
995 	dl->dli_family = hdr->hpo_family;
996 	dl->dli_htype = hdr->hpo_htype;
997 	dl->dli_pktlen = hdr->hpo_pktlen;
998 	dl->dli_ifindex = hdr->hpo_ifindex;
999 	dl->dli_grifindex = hdr->hpo_grifindex;
1000 	dl->dli_zsrc = hdr->hpo_zsrc;
1001 	dl->dli_zdst = hdr->hpo_zdst;
1002 	dlhdr->b_wptr += sizeof (*dl);
1003 	dlhdr->b_cont = mp;
1004 
1005 	return (dlhdr);
1006 }
1007 
1008 static ipnet_addrtype_t
1009 ipnet_get_addrtype(ipnet_t *ipnet, ipnet_addrp_t *addr)
1010 {
1011 	list_t			*list;
1012 	ipnetif_t		*ipnetif = ipnet->ipnet_if;
1013 	ipnetif_addr_t		*ifaddr;
1014 	ipnet_addrtype_t	addrtype = IPNETADDR_UNKNOWN;
1015 
1016 	/* First check if the address is multicast or limited broadcast. */
1017 	switch (addr->iap_family) {
1018 	case AF_INET:
1019 		if (CLASSD(*(addr->iap_addr4)) ||
1020 		    *(addr->iap_addr4) == INADDR_BROADCAST)
1021 			return (IPNETADDR_MBCAST);
1022 		break;
1023 	case AF_INET6:
1024 		if (IN6_IS_ADDR_MULTICAST(addr->iap_addr6))
1025 			return (IPNETADDR_MBCAST);
1026 		break;
1027 	}
1028 
1029 	/*
1030 	 * Walk the address list to see if the address belongs to our
1031 	 * interface or is one of our subnet broadcast addresses.
1032 	 */
1033 	mutex_enter(&ipnetif->if_addr_lock);
1034 	list = (addr->iap_family == AF_INET) ?
1035 	    &ipnetif->if_ip4addr_list : &ipnetif->if_ip6addr_list;
1036 	for (ifaddr = list_head(list);
1037 	    ifaddr != NULL && addrtype == IPNETADDR_UNKNOWN;
1038 	    ifaddr = list_next(list, ifaddr)) {
1039 		/*
1040 		 * If we're not in the global zone, then only look at
1041 		 * addresses in our zone.
1042 		 */
1043 		if (ipnet->ipnet_zoneid != GLOBAL_ZONEID &&
1044 		    ipnet->ipnet_zoneid != ifaddr->ifa_zone)
1045 			continue;
1046 		switch (addr->iap_family) {
1047 		case AF_INET:
1048 			if (ifaddr->ifa_ip4addr != INADDR_ANY &&
1049 			    *(addr->iap_addr4) == ifaddr->ifa_ip4addr)
1050 				addrtype = IPNETADDR_MYADDR;
1051 			else if (ifaddr->ifa_brdaddr != INADDR_ANY &&
1052 			    *(addr->iap_addr4) == ifaddr->ifa_brdaddr)
1053 				addrtype = IPNETADDR_MBCAST;
1054 			break;
1055 		case AF_INET6:
1056 			if (IN6_ARE_ADDR_EQUAL(addr->iap_addr6,
1057 			    &ifaddr->ifa_ip6addr))
1058 				addrtype = IPNETADDR_MYADDR;
1059 			break;
1060 		}
1061 	}
1062 	mutex_exit(&ipnetif->if_addr_lock);
1063 
1064 	return (addrtype);
1065 }
1066 
1067 /*
1068  * Verify if the packet contained in hdr should be passed up to the
1069  * ipnet client stream.
1070  */
1071 static boolean_t
1072 ipnet_accept(ipnet_t *ipnet, hook_pkt_observe_t *hdr, ipnet_addrp_t *src,
1073     ipnet_addrp_t *dst)
1074 {
1075 	boolean_t		obsif;
1076 	uint64_t		ifindex = ipnet->ipnet_if->if_index;
1077 	ipnet_addrtype_t	srctype;
1078 	ipnet_addrtype_t	dsttype;
1079 
1080 	srctype = ipnet_get_addrtype(ipnet, src);
1081 	dsttype = ipnet_get_addrtype(ipnet, dst);
1082 
1083 	/*
1084 	 * If the packet's ifindex matches ours, or the packet's group ifindex
1085 	 * matches ours, it's on the interface we're observing.  (Thus,
1086 	 * observing on the group ifindex matches all ifindexes in the group.)
1087 	 */
1088 	obsif = (ntohl(hdr->hpo_ifindex) == ifindex ||
1089 	    ntohl(hdr->hpo_grifindex) == ifindex);
1090 
1091 	DTRACE_PROBE5(ipnet_accept__addr,
1092 	    ipnet_addrtype_t, srctype, ipnet_addrp_t *, src,
1093 	    ipnet_addrtype_t, dsttype, ipnet_addrp_t *, dst,
1094 	    boolean_t, obsif);
1095 
1096 	/*
1097 	 * Do not allow an ipnet stream to see packets that are not from or to
1098 	 * its zone.  The exception is when zones are using the shared stack
1099 	 * model.  In this case, streams in the global zone have visibility
1100 	 * into other shared-stack zones, and broadcast and multicast traffic
1101 	 * is visible by all zones in the stack.
1102 	 */
1103 	if (ipnet->ipnet_zoneid != GLOBAL_ZONEID &&
1104 	    dsttype != IPNETADDR_MBCAST) {
1105 		if (ipnet->ipnet_zoneid != ntohl(hdr->hpo_zsrc) &&
1106 		    ipnet->ipnet_zoneid != ntohl(hdr->hpo_zdst))
1107 			return (B_FALSE);
1108 	}
1109 
1110 	/*
1111 	 * If DL_PROMISC_SAP isn't enabled, then the bound SAP must match the
1112 	 * packet's IP version.
1113 	 */
1114 	if (!(ipnet->ipnet_flags & IPNET_PROMISC_SAP) &&
1115 	    ipnet->ipnet_family != hdr->hpo_family)
1116 		return (B_FALSE);
1117 
1118 	/* If the destination address is ours, then accept the packet. */
1119 	if (dsttype == IPNETADDR_MYADDR)
1120 		return (B_TRUE);
1121 
1122 	/*
1123 	 * If DL_PROMISC_PHYS is enabled, then we can see all packets that are
1124 	 * sent or received on the interface we're observing, or packets that
1125 	 * have our source address (this allows us to see packets we send).
1126 	 */
1127 	if (ipnet->ipnet_flags & IPNET_PROMISC_PHYS) {
1128 		if (srctype == IPNETADDR_MYADDR || obsif)
1129 			return (B_TRUE);
1130 	}
1131 
1132 	/*
1133 	 * We accept multicast and broadcast packets transmitted or received
1134 	 * on the interface we're observing.
1135 	 */
1136 	if (dsttype == IPNETADDR_MBCAST && obsif)
1137 		return (B_TRUE);
1138 
1139 	return (B_FALSE);
1140 }
1141 
1142 /*
1143  * Verify if the packet contained in hdr should be passed up to the ipnet
1144  * client stream that's in IPNET_LOMODE.
1145  */
1146 /* ARGSUSED */
1147 static boolean_t
1148 ipnet_loaccept(ipnet_t *ipnet, hook_pkt_observe_t *hdr, ipnet_addrp_t *src,
1149     ipnet_addrp_t *dst)
1150 {
1151 	if (hdr->hpo_htype != htons(IPOBS_HOOK_LOCAL)) {
1152 		/*
1153 		 * ipnet_if is only NULL for IPNET_MINOR_LO devices.
1154 		 */
1155 		if (ipnet->ipnet_if == NULL)
1156 			return (B_FALSE);
1157 	}
1158 
1159 	/*
1160 	 * An ipnet stream must not see packets that are not from/to its zone.
1161 	 */
1162 	if (ipnet->ipnet_zoneid != GLOBAL_ZONEID) {
1163 		if (ipnet->ipnet_zoneid != ntohl(hdr->hpo_zsrc) &&
1164 		    ipnet->ipnet_zoneid != ntohl(hdr->hpo_zdst))
1165 			return (B_FALSE);
1166 	}
1167 
1168 	return (ipnet->ipnet_family == AF_UNSPEC ||
1169 	    ipnet->ipnet_family == hdr->hpo_family);
1170 }
1171 
/*
 * Taskq callback scheduled by ipnet_input().  arg is an mblk_t whose data
 * starts with a hook_pkt_observe_t; the observed IP packet hangs off
 * hdr->hpo_pkt->b_cont.  Each open stream in the stack whose accept
 * function approves the packet is given its own copy (or, for the last
 * stream on the list, the original payload).  mp is always freed before
 * returning.
 */
static void
ipnet_dispatch(void *arg)
{
	mblk_t			*mp = arg;
	hook_pkt_observe_t	*hdr = (hook_pkt_observe_t *)mp->b_rptr;
	ipnet_t			*ipnet;
	mblk_t			*netmp;
	list_t			*list;
	ipnet_stack_t		*ips;
	ipnet_addrp_t		src;
	ipnet_addrp_t		dst;

	ips = ((netstack_t *)hdr->hpo_ctx)->netstack_ipnet;

	/*
	 * Point src/dst at the addresses inside the IP header so the accept
	 * functions can classify them.  Anything that is not AF_INET is
	 * treated as an IPv6 header.
	 */
	netmp = hdr->hpo_pkt->b_cont;
	src.iap_family = hdr->hpo_family;
	dst.iap_family = hdr->hpo_family;

	if (hdr->hpo_family == AF_INET) {
		src.iap_addr4 = &((ipha_t *)(netmp->b_rptr))->ipha_src;
		dst.iap_addr4 = &((ipha_t *)(netmp->b_rptr))->ipha_dst;
	} else {
		src.iap_addr6 = &((ip6_t *)(netmp->b_rptr))->ip6_src;
		dst.iap_addr6 = &((ip6_t *)(netmp->b_rptr))->ip6_dst;
	}

	/* Register as a walker of the stream list for the loop below. */
	ipnet_walkers_inc(ips);

	list = &ips->ips_str_list;
	for (ipnet = list_head(list); ipnet != NULL;
	    ipnet = list_next(list, ipnet)) {
		if (!(*ipnet->ipnet_acceptfn)(ipnet, hdr, &src, &dst)) {
			IPSK_BUMP(ips, ik_acceptFail);
			continue;
		}
		IPSK_BUMP(ips, ik_acceptOk);

		if (list_next(list, ipnet) == NULL) {
			/*
			 * Last stream on the list: hand over the original
			 * payload instead of copying it.  Detaching b_cont
			 * keeps the final freemsg(mp) from freeing it.
			 */
			netmp = hdr->hpo_pkt->b_cont;
			hdr->hpo_pkt->b_cont = NULL;
		} else {
			/* Try dupmsg() first and fall back to copymsg(). */
			if ((netmp = dupmsg(hdr->hpo_pkt->b_cont)) == NULL &&
			    (netmp = copymsg(hdr->hpo_pkt->b_cont)) == NULL) {
				IPSK_BUMP(ips, ik_duplicationFail);
				continue;
			}
		}

		if (ipnet->ipnet_flags & IPNET_INFO) {
			/*
			 * ipnet_addheader() frees netmp itself when it
			 * fails, so there is nothing to clean up here.
			 */
			if ((netmp = ipnet_addheader(hdr, netmp)) == NULL) {
				IPSK_BUMP(ips, ik_dispatchHeaderDrop);
				continue;
			}
		}

		/*
		 * Pass the message straight upstream only when nothing is
		 * already queued on our read queue; otherwise queue it
		 * behind the existing messages, or drop it if the queue
		 * cannot take any more.
		 */
		if (ipnet->ipnet_rq->q_first == NULL &&
		    canputnext(ipnet->ipnet_rq)) {
			putnext(ipnet->ipnet_rq, netmp);
			IPSK_BUMP(ips, ik_dispatchDeliver);
		} else if (canput(ipnet->ipnet_rq)) {
			(void) putq(ipnet->ipnet_rq, netmp);
			IPSK_BUMP(ips, ik_dispatchDeliver);
		} else {
			freemsg(netmp);
			IPSK_BUMP(ips, ik_dispatchPutDrop);
		}
	}

	ipnet_walkers_dec(ips);

	/* Frees the observe header and any payload still chained to it. */
	freemsg(mp);
}
1244 
1245 static void
1246 ipnet_input(mblk_t *mp)
1247 {
1248 	hook_pkt_observe_t	*hdr = (hook_pkt_observe_t *)mp->b_rptr;
1249 	ipnet_stack_t		*ips;
1250 
1251 	ips = ((netstack_t *)hdr->hpo_ctx)->netstack_ipnet;
1252 
1253 	if (ddi_taskq_dispatch(ipnet_taskq, ipnet_dispatch, mp, DDI_NOSLEEP) !=
1254 	    DDI_SUCCESS) {
1255 		IPSK_BUMP(ips, ik_dispatchFail);
1256 		freemsg(mp);
1257 	} else {
1258 		IPSK_BUMP(ips, ik_dispatchOk);
1259 	}
1260 }
1261 
1262 static ipnetif_t *
1263 ipnet_alloc_if(ipnet_stack_t *ips)
1264 {
1265 	ipnetif_t	*ipnetif;
1266 
1267 	if ((ipnetif = kmem_zalloc(sizeof (*ipnetif), KM_NOSLEEP)) == NULL)
1268 		return (NULL);
1269 
1270 	mutex_init(&ipnetif->if_addr_lock, NULL, MUTEX_DEFAULT, 0);
1271 	list_create(&ipnetif->if_ip4addr_list, sizeof (ipnetif_addr_t),
1272 	    offsetof(ipnetif_addr_t, ifa_link));
1273 	list_create(&ipnetif->if_ip6addr_list, sizeof (ipnetif_addr_t),
1274 	    offsetof(ipnetif_addr_t, ifa_link));
1275 	mutex_init(&ipnetif->if_reflock, NULL, MUTEX_DEFAULT, 0);
1276 
1277 	ipnetif->if_stackp = ips;
1278 
1279 	return (ipnetif);
1280 }
1281 
1282 /*
1283  * Create a new ipnetif_t and new minor node for it.  If creation is
1284  * successful the new ipnetif_t is inserted into an avl_tree
1285  * containing ipnetif's for this stack instance.
1286  */
1287 static ipnetif_t *
1288 ipnetif_create(const char *name, uint64_t index, ipnet_stack_t *ips,
1289     uint64_t ifflags)
1290 {
1291 	ipnetif_t	*ipnetif;
1292 	avl_index_t	where = 0;
1293 	minor_t		ifminor;
1294 
1295 	/*
1296 	 * Because ipnetif_create() can be called from a NIC event
1297 	 * callback, it should not block.
1298 	 */
1299 	ifminor = (minor_t)id_alloc_nosleep(ipnet_minor_space);
1300 	if (ifminor == (minor_t)-1)
1301 		return (NULL);
1302 	if ((ipnetif = ipnet_alloc_if(ips)) == NULL) {
1303 		id_free(ipnet_minor_space, ifminor);
1304 		return (NULL);
1305 	}
1306 
1307 	(void) strlcpy(ipnetif->if_name, name, LIFNAMSIZ);
1308 	ipnetif->if_index = (uint_t)index;
1309 	ipnetif->if_zoneid = netstack_get_zoneid(ips->ips_netstack);
1310 	ipnetif->if_dev = makedevice(ipnet_major, ifminor);
1311 
1312 	ipnetif->if_refcnt = 1;
1313 	if ((ifflags & IFF_LOOPBACK) != 0)
1314 		ipnetif->if_flags = IPNETIF_LOOPBACK;
1315 
1316 	mutex_enter(&ips->ips_avl_lock);
1317 	VERIFY(avl_find(&ips->ips_avl_by_index, &index, &where) == NULL);
1318 	avl_insert(&ips->ips_avl_by_index, ipnetif, where);
1319 	VERIFY(avl_find(&ips->ips_avl_by_name, (void *)name, &where) == NULL);
1320 	avl_insert(&ips->ips_avl_by_name, ipnetif, where);
1321 	mutex_exit(&ips->ips_avl_lock);
1322 
1323 	return (ipnetif);
1324 }
1325 
1326 static void
1327 ipnetif_remove(ipnetif_t *ipnetif, ipnet_stack_t *ips)
1328 {
1329 	ipnet_t	*ipnet;
1330 
1331 	ipnet_walkers_inc(ips);
1332 	/* Send a SIGHUP to all open streams associated with this ipnetif. */
1333 	for (ipnet = list_head(&ips->ips_str_list); ipnet != NULL;
1334 	    ipnet = list_next(&ips->ips_str_list, ipnet)) {
1335 		if (ipnet->ipnet_if == ipnetif)
1336 			(void) putnextctl(ipnet->ipnet_rq, M_HANGUP);
1337 	}
1338 	ipnet_walkers_dec(ips);
1339 	mutex_enter(&ips->ips_avl_lock);
1340 	avl_remove(&ips->ips_avl_by_index, ipnetif);
1341 	avl_remove(&ips->ips_avl_by_name, ipnetif);
1342 	mutex_exit(&ips->ips_avl_lock);
1343 	/*
1344 	 * Release the reference we implicitly held in ipnetif_create().
1345 	 */
1346 	ipnetif_refrele(ipnetif);
1347 }
1348 
1349 static void
1350 ipnet_purge_addrlist(list_t *addrlist)
1351 {
1352 	ipnetif_addr_t	*ifa;
1353 
1354 	while ((ifa = list_head(addrlist)) != NULL) {
1355 		list_remove(addrlist, ifa);
1356 		if (ifa->ifa_shared != NULL)
1357 			ipnetif_clone_release(ifa->ifa_shared);
1358 		kmem_free(ifa, sizeof (*ifa));
1359 	}
1360 }
1361 
1362 static void
1363 ipnetif_free(ipnetif_t *ipnetif)
1364 {
1365 	ASSERT(ipnetif->if_refcnt == 0);
1366 	ASSERT(ipnetif->if_sharecnt == 0);
1367 
1368 	/* Remove IPv4/v6 address lists from the ipnetif */
1369 	ipnet_purge_addrlist(&ipnetif->if_ip4addr_list);
1370 	list_destroy(&ipnetif->if_ip4addr_list);
1371 	ipnet_purge_addrlist(&ipnetif->if_ip6addr_list);
1372 	list_destroy(&ipnetif->if_ip6addr_list);
1373 	mutex_destroy(&ipnetif->if_addr_lock);
1374 	mutex_destroy(&ipnetif->if_reflock);
1375 	if (ipnetif->if_dev != 0)
1376 		id_free(ipnet_minor_space, getminor(ipnetif->if_dev));
1377 	kmem_free(ipnetif, sizeof (*ipnetif));
1378 }
1379 
1380 /*
1381  * Create an ipnetif_addr_t with the given logical interface id (lif)
1382  * and add it to the supplied ipnetif.  The lif is the netinfo
1383  * representation of logical interface id, and we use this id to match
1384  * incoming netinfo events against our lists of addresses.
1385  */
1386 static void
1387 ipnet_add_ifaddr(uint64_t lif, ipnetif_t *ipnetif, net_handle_t nd)
1388 {
1389 	ipnetif_addr_t		*ifaddr;
1390 	zoneid_t		zoneid;
1391 	struct sockaddr_in	bcast;
1392 	struct sockaddr_storage	addr;
1393 	net_ifaddr_t		type = NA_ADDRESS;
1394 	uint64_t		phyif = ipnetif->if_index;
1395 
1396 	if (net_getlifaddr(nd, phyif, lif, 1, &type, &addr) != 0 ||
1397 	    net_getlifzone(nd, phyif, lif, &zoneid) != 0)
1398 		return;
1399 
1400 	if ((ifaddr = kmem_alloc(sizeof (*ifaddr), KM_NOSLEEP)) == NULL)
1401 		return;
1402 	ifaddr->ifa_zone = zoneid;
1403 	ifaddr->ifa_id = lif;
1404 	ifaddr->ifa_shared = NULL;
1405 
1406 	switch (addr.ss_family) {
1407 	case AF_INET:
1408 		ifaddr->ifa_ip4addr =
1409 		    ((struct sockaddr_in *)&addr)->sin_addr.s_addr;
1410 		/*
1411 		 * Try and get the broadcast address.  Note that it's okay for
1412 		 * an interface to not have a broadcast address, so we don't
1413 		 * fail the entire operation if net_getlifaddr() fails here.
1414 		 */
1415 		type = NA_BROADCAST;
1416 		if (net_getlifaddr(nd, phyif, lif, 1, &type, &bcast) == 0)
1417 			ifaddr->ifa_brdaddr = bcast.sin_addr.s_addr;
1418 		break;
1419 	case AF_INET6:
1420 		ifaddr->ifa_ip6addr = ((struct sockaddr_in6 *)&addr)->sin6_addr;
1421 		break;
1422 	}
1423 
1424 	/*
1425 	 * The zoneid stored in ipnetif_t needs to correspond to the actual
1426 	 * zone the address is being used in. This facilitates finding the
1427 	 * correct netstack_t pointer, amongst other things, later.
1428 	 */
1429 	if (zoneid == ALL_ZONES)
1430 		zoneid = GLOBAL_ZONEID;
1431 
1432 	mutex_enter(&ipnetif->if_addr_lock);
1433 	if (zoneid != ipnetif->if_zoneid) {
1434 		ipnetif_t *ifp2;
1435 
1436 		ifp2 = ipnetif_clone_create(ipnetif, zoneid);
1437 		ifaddr->ifa_shared = ifp2;
1438 	}
1439 	list_insert_tail(addr.ss_family == AF_INET ?
1440 	    &ipnetif->if_ip4addr_list : &ipnetif->if_ip6addr_list, ifaddr);
1441 	mutex_exit(&ipnetif->if_addr_lock);
1442 }
1443 
1444 static void
1445 ipnet_delete_ifaddr(ipnetif_addr_t *ifaddr, ipnetif_t *ipnetif, boolean_t isv6)
1446 {
1447 	mutex_enter(&ipnetif->if_addr_lock);
1448 	if (ifaddr->ifa_shared != NULL)
1449 		ipnetif_clone_release(ifaddr->ifa_shared);
1450 
1451 	list_remove(isv6 ?
1452 	    &ipnetif->if_ip6addr_list : &ipnetif->if_ip4addr_list, ifaddr);
1453 	mutex_exit(&ipnetif->if_addr_lock);
1454 	kmem_free(ifaddr, sizeof (*ifaddr));
1455 }
1456 
1457 static void
1458 ipnet_plumb_ev(ipnet_nicevent_t *ipne, ipnet_stack_t *ips, boolean_t isv6)
1459 {
1460 	ipnetif_t	*ipnetif;
1461 	boolean_t	refrele_needed = B_TRUE;
1462 	uint64_t	ifflags;
1463 	uint64_t	ifindex;
1464 	char		*ifname;
1465 
1466 	ifflags = 0;
1467 	ifname = ipne->ipne_ifname;
1468 	ifindex = ipne->ipne_ifindex;
1469 
1470 	(void) net_getlifflags(ipne->ipne_protocol, ifindex, 0, &ifflags);
1471 
1472 	if ((ipnetif = ipnetif_getby_index(ifindex, ips)) == NULL) {
1473 		ipnetif = ipnetif_create(ifname, ifindex, ips, ifflags);
1474 		refrele_needed = B_FALSE;
1475 	}
1476 	if (ipnetif != NULL) {
1477 		ipnetif->if_flags |=
1478 		    isv6 ? IPNETIF_IPV6PLUMBED : IPNETIF_IPV4PLUMBED;
1479 	}
1480 
1481 	if (ipnetif->if_multicnt != 0) {
1482 		if (ip_join_allmulti(ifindex, isv6,
1483 		    ips->ips_netstack->netstack_ip) == 0) {
1484 			ipnetif->if_flags |=
1485 			    isv6 ? IPNETIF_IPV6ALLMULTI : IPNETIF_IPV4ALLMULTI;
1486 		}
1487 	}
1488 
1489 	if (refrele_needed)
1490 		ipnetif_refrele(ipnetif);
1491 }
1492 
1493 static void
1494 ipnet_unplumb_ev(uint64_t ifindex, ipnet_stack_t *ips, boolean_t isv6)
1495 {
1496 	ipnetif_t	*ipnetif;
1497 
1498 	if ((ipnetif = ipnetif_getby_index(ifindex, ips)) == NULL)
1499 		return;
1500 
1501 	mutex_enter(&ipnetif->if_addr_lock);
1502 	ipnet_purge_addrlist(isv6 ?
1503 	    &ipnetif->if_ip6addr_list : &ipnetif->if_ip4addr_list);
1504 	mutex_exit(&ipnetif->if_addr_lock);
1505 
1506 	/*
1507 	 * Note that we have one ipnetif for both IPv4 and IPv6, but we receive
1508 	 * separate NE_UNPLUMB events for IPv4 and IPv6.  We remove the ipnetif
1509 	 * if both IPv4 and IPv6 interfaces have been unplumbed.
1510 	 */
1511 	ipnetif->if_flags &= isv6 ? ~IPNETIF_IPV6PLUMBED : ~IPNETIF_IPV4PLUMBED;
1512 	if (!(ipnetif->if_flags & (IPNETIF_IPV4PLUMBED | IPNETIF_IPV6PLUMBED)))
1513 		ipnetif_remove(ipnetif, ips);
1514 	ipnetif_refrele(ipnetif);
1515 }
1516 
1517 static void
1518 ipnet_lifup_ev(uint64_t ifindex, uint64_t lifindex, net_handle_t nd,
1519     ipnet_stack_t *ips, boolean_t isv6)
1520 {
1521 	ipnetif_t	*ipnetif;
1522 	ipnetif_addr_t	*ifaddr;
1523 
1524 	if ((ipnetif = ipnetif_getby_index(ifindex, ips)) == NULL)
1525 		return;
1526 	if ((ifaddr = ipnet_match_lif(ipnetif, lifindex, isv6)) != NULL) {
1527 		/*
1528 		 * We must have missed a NE_LIF_DOWN event.  Delete this
1529 		 * ifaddr and re-create it.
1530 		 */
1531 		ipnet_delete_ifaddr(ifaddr, ipnetif, isv6);
1532 	}
1533 
1534 	ipnet_add_ifaddr(lifindex, ipnetif, nd);
1535 	ipnetif_refrele(ipnetif);
1536 }
1537 
1538 static void
1539 ipnet_lifdown_ev(uint64_t ifindex, uint64_t lifindex, ipnet_stack_t *ips,
1540     boolean_t isv6)
1541 {
1542 	ipnetif_t	*ipnetif;
1543 	ipnetif_addr_t	*ifaddr;
1544 
1545 	if ((ipnetif = ipnetif_getby_index(ifindex, ips)) == NULL)
1546 		return;
1547 	if ((ifaddr = ipnet_match_lif(ipnetif, lifindex, isv6)) != NULL)
1548 		ipnet_delete_ifaddr(ifaddr, ipnetif, isv6);
1549 	ipnetif_refrele(ipnetif);
1550 	/*
1551 	 * Make sure that open streams on this ipnetif are still allowed to
1552 	 * have it open.
1553 	 */
1554 	ipnetif_zonecheck(ipnetif, ips);
1555 }
1556 
1557 /*
1558  * This callback from the NIC event framework dispatches a taskq as the event
1559  * handlers may block.
1560  */
1561 /* ARGSUSED */
1562 static int
1563 ipnet_nicevent_cb(hook_event_token_t token, hook_data_t info, void *arg)
1564 {
1565 	ipnet_stack_t		*ips = arg;
1566 	hook_nic_event_t	*hn = (hook_nic_event_t *)info;
1567 	ipnet_nicevent_t	*ipne;
1568 
1569 	if ((ipne = kmem_alloc(sizeof (ipnet_nicevent_t), KM_NOSLEEP)) == NULL)
1570 		return (0);
1571 	ipne->ipne_event = hn->hne_event;
1572 	ipne->ipne_protocol = hn->hne_protocol;
1573 	ipne->ipne_stackid = ips->ips_netstack->netstack_stackid;
1574 	ipne->ipne_ifindex = hn->hne_nic;
1575 	ipne->ipne_lifindex = hn->hne_lif;
1576 	if (hn->hne_datalen != 0) {
1577 		(void) strlcpy(ipne->ipne_ifname, hn->hne_data,
1578 		    sizeof (ipne->ipne_ifname));
1579 	}
1580 	(void) ddi_taskq_dispatch(ipnet_nicevent_taskq, ipnet_nicevent_task,
1581 	    ipne, DDI_NOSLEEP);
1582 	return (0);
1583 }
1584 
1585 static void
1586 ipnet_nicevent_task(void *arg)
1587 {
1588 	ipnet_nicevent_t	*ipne = arg;
1589 	netstack_t		*ns;
1590 	ipnet_stack_t		*ips;
1591 	boolean_t		isv6;
1592 
1593 	if ((ns = netstack_find_by_stackid(ipne->ipne_stackid)) == NULL)
1594 		goto done;
1595 	ips = ns->netstack_ipnet;
1596 	isv6 = (ipne->ipne_protocol == ips->ips_ndv6);
1597 
1598 	mutex_enter(&ips->ips_event_lock);
1599 	switch (ipne->ipne_event) {
1600 	case NE_PLUMB:
1601 		ipnet_plumb_ev(ipne, ips, isv6);
1602 		break;
1603 	case NE_UNPLUMB:
1604 		ipnet_unplumb_ev(ipne->ipne_ifindex, ips, isv6);
1605 		break;
1606 	case NE_LIF_UP:
1607 		ipnet_lifup_ev(ipne->ipne_ifindex, ipne->ipne_lifindex,
1608 		    ipne->ipne_protocol, ips, isv6);
1609 		break;
1610 	case NE_LIF_DOWN:
1611 		ipnet_lifdown_ev(ipne->ipne_ifindex, ipne->ipne_lifindex, ips,
1612 		    isv6);
1613 		break;
1614 	default:
1615 		break;
1616 	}
1617 	mutex_exit(&ips->ips_event_lock);
1618 done:
1619 	if (ns != NULL)
1620 		netstack_rele(ns);
1621 	kmem_free(ipne, sizeof (ipnet_nicevent_t));
1622 }
1623 
1624 dev_t
1625 ipnet_if_getdev(char *name, zoneid_t zoneid)
1626 {
1627 	netstack_t	*ns;
1628 	ipnet_stack_t	*ips;
1629 	ipnetif_t	*ipnetif;
1630 	dev_t		dev = (dev_t)-1;
1631 
1632 	if (is_system_labeled() && zoneid != GLOBAL_ZONEID)
1633 		return (dev);
1634 	if ((ns = netstack_find_by_zoneid(zoneid)) == NULL)
1635 		return (dev);
1636 
1637 	ips = ns->netstack_ipnet;
1638 	mutex_enter(&ips->ips_avl_lock);
1639 	if ((ipnetif = avl_find(&ips->ips_avl_by_name, name, NULL)) != NULL) {
1640 		if (ipnetif_in_zone(ipnetif, zoneid, ips))
1641 			dev = ipnetif->if_dev;
1642 	}
1643 	mutex_exit(&ips->ips_avl_lock);
1644 	netstack_rele(ns);
1645 
1646 	return (dev);
1647 }
1648 
1649 static ipnetif_t *
1650 ipnetif_getby_index(uint64_t id, ipnet_stack_t *ips)
1651 {
1652 	ipnetif_t	*ipnetif;
1653 
1654 	mutex_enter(&ips->ips_avl_lock);
1655 	if ((ipnetif = avl_find(&ips->ips_avl_by_index, &id, NULL)) != NULL)
1656 		ipnetif_refhold(ipnetif);
1657 	mutex_exit(&ips->ips_avl_lock);
1658 	return (ipnetif);
1659 }
1660 
1661 static ipnetif_t *
1662 ipnetif_getby_dev(dev_t dev, ipnet_stack_t *ips)
1663 {
1664 	ipnetif_t	*ipnetif;
1665 	avl_tree_t	*tree;
1666 
1667 	mutex_enter(&ips->ips_avl_lock);
1668 	tree = &ips->ips_avl_by_index;
1669 	for (ipnetif = avl_first(tree); ipnetif != NULL;
1670 	    ipnetif = avl_walk(tree, ipnetif, AVL_AFTER)) {
1671 		if (ipnetif->if_dev == dev) {
1672 			ipnetif_refhold(ipnetif);
1673 			break;
1674 		}
1675 	}
1676 	mutex_exit(&ips->ips_avl_lock);
1677 	return (ipnetif);
1678 }
1679 
1680 static ipnetif_addr_t *
1681 ipnet_match_lif(ipnetif_t *ipnetif, lif_if_t lid, boolean_t isv6)
1682 {
1683 	ipnetif_addr_t	*ifaddr;
1684 	list_t	*list;
1685 
1686 	mutex_enter(&ipnetif->if_addr_lock);
1687 	list = isv6 ? &ipnetif->if_ip6addr_list : &ipnetif->if_ip4addr_list;
1688 	for (ifaddr = list_head(list); ifaddr != NULL;
1689 	    ifaddr = list_next(list, ifaddr)) {
1690 		if (lid == ifaddr->ifa_id)
1691 			break;
1692 	}
1693 	mutex_exit(&ipnetif->if_addr_lock);
1694 	return (ifaddr);
1695 }
1696 
1697 /* ARGSUSED */
1698 static void *
1699 ipnet_stack_init(netstackid_t stackid, netstack_t *ns)
1700 {
1701 	ipnet_stack_t	*ips;
1702 
1703 	ips = kmem_zalloc(sizeof (*ips), KM_SLEEP);
1704 	ips->ips_netstack = ns;
1705 	mutex_init(&ips->ips_avl_lock, NULL, MUTEX_DEFAULT, 0);
1706 	avl_create(&ips->ips_avl_by_index, ipnetif_compare_index,
1707 	    sizeof (ipnetif_t), offsetof(ipnetif_t, if_avl_by_index));
1708 	avl_create(&ips->ips_avl_by_name, ipnetif_compare_name,
1709 	    sizeof (ipnetif_t), offsetof(ipnetif_t, if_avl_by_name));
1710 	avl_create(&ips->ips_avl_by_shared, ipnetif_compare_name_zone,
1711 	    sizeof (ipnetif_t), offsetof(ipnetif_t, if_avl_by_shared));
1712 	mutex_init(&ips->ips_walkers_lock, NULL, MUTEX_DEFAULT, NULL);
1713 	cv_init(&ips->ips_walkers_cv, NULL, CV_DRIVER, NULL);
1714 	list_create(&ips->ips_str_list, sizeof (ipnet_t),
1715 	    offsetof(ipnet_t, ipnet_next));
1716 	ipnet_register_netihook(ips);
1717 	return (ips);
1718 }
1719 
/*
 * Tear down the ipnet state (arg) of the netstack identified by stackid:
 * delete the kstat, unregister the NIC event hooks, remove every ipnetif
 * and destroy the trees, locks and stream list before freeing the
 * ipnet_stack_t itself.
 */
/* ARGSUSED */
static void
ipnet_stack_fini(netstackid_t stackid, void *arg)
{
	ipnet_stack_t	*ips = arg;
	ipnetif_t	*ipnetif, *nipnetif;

	if (ips->ips_kstatp != NULL) {
		zoneid_t zoneid;

		zoneid = netstackid_to_zoneid(stackid);
		net_kstat_delete(net_zoneidtonetid(zoneid), ips->ips_kstatp);
	}
	/*
	 * Unhook from NIC events for each protocol that has a handle, and
	 * release that handle.
	 */
	if (ips->ips_ndv4 != NULL) {
		VERIFY(net_hook_unregister(ips->ips_ndv4, NH_NIC_EVENTS,
		    ips->ips_nicevents) == 0);
		VERIFY(net_protocol_release(ips->ips_ndv4) == 0);
	}
	if (ips->ips_ndv6 != NULL) {
		VERIFY(net_hook_unregister(ips->ips_ndv6, NH_NIC_EVENTS,
		    ips->ips_nicevents) == 0);
		VERIFY(net_protocol_release(ips->ips_ndv6) == 0);
	}
	hook_free(ips->ips_nicevents);

	/*
	 * ipnetif_remove() unlinks the node from the tree, so the successor
	 * has to be fetched before each removal.
	 */
	for (ipnetif = avl_first(&ips->ips_avl_by_index); ipnetif != NULL;
	    ipnetif = nipnetif) {
		nipnetif = AVL_NEXT(&ips->ips_avl_by_index, ipnetif);
		ipnetif_remove(ipnetif, ips);
	}
	avl_destroy(&ips->ips_avl_by_shared);
	avl_destroy(&ips->ips_avl_by_index);
	avl_destroy(&ips->ips_avl_by_name);
	mutex_destroy(&ips->ips_avl_lock);
	mutex_destroy(&ips->ips_walkers_lock);
	cv_destroy(&ips->ips_walkers_cv);
	list_destroy(&ips->ips_str_list);
	kmem_free(ips, sizeof (*ips));
}
1759 
1760 /* Do any of the addresses in addrlist belong the supplied zoneid? */
1761 static boolean_t
1762 ipnet_addrs_in_zone(list_t *addrlist, zoneid_t zoneid)
1763 {
1764 	ipnetif_addr_t	*ifa;
1765 
1766 	for (ifa = list_head(addrlist); ifa != NULL;
1767 	    ifa = list_next(addrlist, ifa)) {
1768 		if (ifa->ifa_zone == zoneid)
1769 			return (B_TRUE);
1770 	}
1771 	return (B_FALSE);
1772 }
1773 
1774 /* Should the supplied ipnetif be visible from the supplied zoneid? */
1775 static boolean_t
1776 ipnetif_in_zone(ipnetif_t *ipnetif, zoneid_t zoneid, ipnet_stack_t *ips)
1777 {
1778 	int	ret;
1779 
1780 	/*
1781 	 * The global zone has visibility into all interfaces in the global
1782 	 * stack, and exclusive stack zones have visibility into all
1783 	 * interfaces in their stack.
1784 	 */
1785 	if (zoneid == GLOBAL_ZONEID ||
1786 	    ips->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID)
1787 		return (B_TRUE);
1788 
1789 	/*
1790 	 * Shared-stack zones only have visibility for interfaces that have
1791 	 * addresses in their zone.
1792 	 */
1793 	mutex_enter(&ipnetif->if_addr_lock);
1794 	ret = ipnet_addrs_in_zone(&ipnetif->if_ip4addr_list, zoneid) ||
1795 	    ipnet_addrs_in_zone(&ipnetif->if_ip6addr_list, zoneid);
1796 	mutex_exit(&ipnetif->if_addr_lock);
1797 	return (ret);
1798 }
1799 
1800 /*
1801  * Verify that any ipnet_t that has a reference to the supplied ipnetif should
1802  * still be allowed to have it open.  A given ipnet_t may no longer be allowed
1803  * to have an ipnetif open if there are no longer any addresses that belong to
1804  * the ipnetif in the ipnet_t's non-global shared-stack zoneid.  If that's the
1805  * case, send the ipnet_t an M_HANGUP.
1806  */
1807 static void
1808 ipnetif_zonecheck(ipnetif_t *ipnetif, ipnet_stack_t *ips)
1809 {
1810 	list_t	*strlist = &ips->ips_str_list;
1811 	ipnet_t	*ipnet;
1812 
1813 	ipnet_walkers_inc(ips);
1814 	for (ipnet = list_head(strlist); ipnet != NULL;
1815 	    ipnet = list_next(strlist, ipnet)) {
1816 		if (ipnet->ipnet_if != ipnetif)
1817 			continue;
1818 		if (!ipnetif_in_zone(ipnetif, ipnet->ipnet_zoneid, ips))
1819 			(void) putnextctl(ipnet->ipnet_rq, M_HANGUP);
1820 	}
1821 	ipnet_walkers_dec(ips);
1822 }
1823 
1824 void
1825 ipnet_walk_if(ipnet_walkfunc_t *cb, void *arg, zoneid_t zoneid)
1826 {
1827 	ipnetif_t		*ipnetif;
1828 	list_t			cbdata;
1829 	ipnetif_cbdata_t	*cbnode;
1830 	netstack_t		*ns;
1831 	ipnet_stack_t		*ips;
1832 
1833 	/*
1834 	 * On labeled systems, non-global zones shouldn't see anything
1835 	 * in /dev/ipnet.
1836 	 */
1837 	if (is_system_labeled() && zoneid != GLOBAL_ZONEID)
1838 		return;
1839 
1840 	if ((ns = netstack_find_by_zoneid(zoneid)) == NULL)
1841 		return;
1842 
1843 	ips = ns->netstack_ipnet;
1844 	list_create(&cbdata, sizeof (ipnetif_cbdata_t),
1845 	    offsetof(ipnetif_cbdata_t, ic_next));
1846 
1847 	mutex_enter(&ips->ips_avl_lock);
1848 	for (ipnetif = avl_first(&ips->ips_avl_by_index); ipnetif != NULL;
1849 	    ipnetif = avl_walk(&ips->ips_avl_by_index, ipnetif, AVL_AFTER)) {
1850 		if (!ipnetif_in_zone(ipnetif, zoneid, ips))
1851 			continue;
1852 		cbnode = kmem_zalloc(sizeof (ipnetif_cbdata_t), KM_SLEEP);
1853 		(void) strlcpy(cbnode->ic_ifname, ipnetif->if_name, LIFNAMSIZ);
1854 		cbnode->ic_dev = ipnetif->if_dev;
1855 		list_insert_head(&cbdata, cbnode);
1856 	}
1857 	mutex_exit(&ips->ips_avl_lock);
1858 
1859 	while ((cbnode = list_head(&cbdata)) != NULL) {
1860 		cb(cbnode->ic_ifname, arg, cbnode->ic_dev);
1861 		list_remove(&cbdata, cbnode);
1862 		kmem_free(cbnode, sizeof (ipnetif_cbdata_t));
1863 	}
1864 	list_destroy(&cbdata);
1865 	netstack_rele(ns);
1866 }
1867 
1868 static int
1869 ipnetif_compare_index(const void *index_ptr, const void *ipnetifp)
1870 {
1871 	int64_t	index1 = *((int64_t *)index_ptr);
1872 	int64_t	index2 = (int64_t)((ipnetif_t *)ipnetifp)->if_index;
1873 
1874 	return (SIGNOF(index2 - index1));
1875 }
1876 
1877 static int
1878 ipnetif_compare_name(const void *name_ptr, const void *ipnetifp)
1879 {
1880 	int	res;
1881 
1882 	res = strcmp(((ipnetif_t *)ipnetifp)->if_name, name_ptr);
1883 	return (SIGNOF(res));
1884 }
1885 
1886 static int
1887 ipnetif_compare_name_zone(const void *key_ptr, const void *ipnetifp)
1888 {
1889 	const uintptr_t	*ptr = key_ptr;
1890 	const ipnetif_t	*ifp;
1891 	int		res;
1892 
1893 	ifp = ipnetifp;
1894 	res = ifp->if_zoneid - ptr[0];
1895 	if (res != 0)
1896 		return (SIGNOF(res));
1897 	res = strcmp(ifp->if_name, (char *)ptr[1]);
1898 	return (SIGNOF(res));
1899 }
1900 
/*
 * Take an additional reference on ipnetif.  The count is protected by
 * if_reflock.
 */
static void
ipnetif_refhold(ipnetif_t *ipnetif)
{
	mutex_enter(&ipnetif->if_reflock);
	ipnetif->if_refcnt++;
	mutex_exit(&ipnetif->if_reflock);
}
1908 
1909 static void
1910 ipnetif_refrele(ipnetif_t *ipnetif)
1911 {
1912 	mutex_enter(&ipnetif->if_reflock);
1913 	ASSERT(ipnetif->if_refcnt > 0);
1914 	if (--ipnetif->if_refcnt == 0)
1915 		ipnetif_free(ipnetif);
1916 	else
1917 		mutex_exit(&ipnetif->if_reflock);
1918 }
1919 
/*
 * Count one more walker of this stack's stream list.  Each call must be
 * paired with ipnet_walkers_dec().
 */
static void
ipnet_walkers_inc(ipnet_stack_t *ips)
{
	mutex_enter(&ips->ips_walkers_lock);
	ips->ips_walkers_cnt++;
	mutex_exit(&ips->ips_walkers_lock);
}
1927 
1928 static void
1929 ipnet_walkers_dec(ipnet_stack_t *ips)
1930 {
1931 	mutex_enter(&ips->ips_walkers_lock);
1932 	ASSERT(ips->ips_walkers_cnt != 0);
1933 	if (--ips->ips_walkers_cnt == 0)
1934 		cv_broadcast(&ips->ips_walkers_cv);
1935 	mutex_exit(&ips->ips_walkers_lock);
1936 }
1937 
1938 /*ARGSUSED*/
1939 static int
1940 ipobs_bounce_func(hook_event_token_t token, hook_data_t info, void *arg)
1941 {
1942 	hook_pkt_observe_t	*hdr;
1943 	pfv_t			func = (pfv_t)arg;
1944 	mblk_t			*mp;
1945 
1946 	hdr = (hook_pkt_observe_t *)info;
1947 	/*
1948 	 * Code in ip_input() expects that it is the only one accessing the
1949 	 * packet.
1950 	 */
1951 	mp = copymsg(hdr->hpo_pkt);
1952 	if (mp == NULL)  {
1953 		netstack_t *ns = hdr->hpo_ctx;
1954 		ipnet_stack_t *ips = ns->netstack_ipnet;
1955 
1956 		IPSK_BUMP(ips, ik_dispatchDupDrop);
1957 		return (0);
1958 	}
1959 
1960 	hdr = (hook_pkt_observe_t *)mp->b_rptr;
1961 	hdr->hpo_pkt = mp;
1962 
1963 	func(mp);
1964 
1965 	return (0);
1966 }
1967 
1968 hook_t *
1969 ipobs_register_hook(netstack_t *ns, pfv_t func)
1970 {
1971 	ip_stack_t	*ipst = ns->netstack_ip;
1972 	char		name[32];
1973 	hook_t		*hook;
1974 
1975 	HOOK_INIT(hook, ipobs_bounce_func, "", (void *)func);
1976 	VERIFY(hook != NULL);
1977 
1978 	/*
1979 	 * To register multiple hooks with the same callback function,
1980 	 * a unique name is needed.
1981 	 */
1982 	(void) snprintf(name, sizeof (name), "ipobserve_%p", (void *)hook);
1983 	hook->h_name = strdup(name);
1984 
1985 	(void) net_hook_register(ipst->ips_ip4_observe_pr, NH_OBSERVE, hook);
1986 	(void) net_hook_register(ipst->ips_ip6_observe_pr, NH_OBSERVE, hook);
1987 
1988 	return (hook);
1989 }
1990 
1991 void
1992 ipobs_unregister_hook(netstack_t *ns, hook_t *hook)
1993 {
1994 	ip_stack_t	*ipst = ns->netstack_ip;
1995 
1996 	(void) net_hook_unregister(ipst->ips_ip4_observe_pr, NH_OBSERVE, hook);
1997 
1998 	(void) net_hook_unregister(ipst->ips_ip6_observe_pr, NH_OBSERVE, hook);
1999 
2000 	strfree(hook->h_name);
2001 
2002 	hook_free(hook);
2003 }
2004 
2005 /* ******************************************************************** */
2006 /* BPF Functions below							*/
2007 /* ******************************************************************** */
2008 
2009 /*
2010  * Convenience function to make mapping a zoneid to an ipnet_stack_t easy.
2011  */
2012 ipnet_stack_t *
2013 ipnet_find_by_zoneid(zoneid_t zoneid)
2014 {
2015 	netstack_t	*ns;
2016 
2017 	VERIFY((ns = netstack_find_by_zoneid(zoneid)) != NULL);
2018 	return (ns->netstack_ipnet);
2019 }
2020 
/*
 * Functions such as ipnet_find_by_zoneid() return a pointer to an
 * ipnet_stack_t by calling a netstack lookup function.  The
 * netstack_find_*() functions return a pointer after doing a "hold" on the
 * data structure and therefore require a "release" when the caller is
 * finished with it.  We mirror that API here: a caller of
 * ipnet_find_by_zoneid() is required to call ipnet_rele(), which releases
 * the hold on the netstack that the ipnet_stack_t belongs to.
 */
void
ipnet_rele(ipnet_stack_t *ips)
{
	netstack_rele(ips->ips_netstack);
}
2034 
2035 /*
2036  */
2037 void
2038 ipnet_set_itap(bpf_itap_fn_t tapfunc)
2039 {
2040 	ipnet_itap = tapfunc;
2041 }
2042 
2043 /*
2044  * The list of interfaces available via ipnet is private for each zone,
2045  * so the AVL tree of each zone must be searched for a given name, even
2046  * if all names are unique.
2047  */
2048 int
2049 ipnet_open_byname(const char *name, ipnetif_t **ptr, zoneid_t zoneid)
2050 {
2051 	ipnet_stack_t	*ips;
2052 	ipnetif_t	*ipnetif;
2053 
2054 	ASSERT(ptr != NULL);
2055 	VERIFY((ips = ipnet_find_by_zoneid(zoneid)) != NULL);
2056 
2057 	mutex_enter(&ips->ips_avl_lock);
2058 
2059 	/*
2060 	 * Shared instance zone?
2061 	 */
2062 	if (netstackid_to_zoneid(zoneid_to_netstackid(zoneid)) != zoneid) {
2063 		uintptr_t key[2] = { zoneid, (uintptr_t)name };
2064 
2065 		ipnetif = avl_find(&ips->ips_avl_by_shared, (void *)key, NULL);
2066 	} else {
2067 		ipnetif = avl_find(&ips->ips_avl_by_name, (void *)name, NULL);
2068 	}
2069 	if (ipnetif != NULL)
2070 		ipnetif_refhold(ipnetif);
2071 	mutex_exit(&ips->ips_avl_lock);
2072 
2073 	*ptr = ipnetif;
2074 	ipnet_rele(ips);
2075 
2076 	if (ipnetif == NULL)
2077 		return (ESRCH);
2078 	return (0);
2079 }
2080 
2081 void
2082 ipnet_close_byhandle(ipnetif_t *ifp)
2083 {
2084 	ASSERT(ifp != NULL);
2085 	ipnetif_refrele(ifp);
2086 }
2087 
2088 const char *
2089 ipnet_name(ipnetif_t *ifp)
2090 {
2091 	ASSERT(ifp != NULL);
2092 	return (ifp->if_name);
2093 }
2094 
2095 /*
2096  * To find the linkid for a given name, it is necessary to know which zone
2097  * the interface name belongs to and to search the avl tree for that zone
2098  * as there is no master list of all interfaces and which zone they belong
2099  * to. It is assumed that the caller of this function is somehow already
2100  * working with the ipnet interfaces and hence the ips_event_lock is held.
2101  * When BPF calls into this function, it is doing so because of an event
2102  * in ipnet, and thus ipnet holds the ips_event_lock. Thus the datalink id
2103  * value returned has meaning without the need for grabbing a hold on the
2104  * owning structure.
2105  */
2106 int
2107 ipnet_get_linkid_byname(const char *name, uint_t *idp, zoneid_t zoneid)
2108 {
2109 	ipnet_stack_t	*ips;
2110 	ipnetif_t	*ifp;
2111 
2112 	VERIFY((ips = ipnet_find_by_zoneid(zoneid)) != NULL);
2113 	ASSERT(mutex_owned(&ips->ips_event_lock));
2114 
2115 	mutex_enter(&ips->ips_avl_lock);
2116 	ifp = avl_find(&ips->ips_avl_by_name, (void *)name, NULL);
2117 	if (ifp != NULL)
2118 		*idp = (uint_t)ifp->if_index;
2119 
2120 	/*
2121 	 * Shared instance zone?
2122 	 */
2123 	if (netstackid_to_zoneid(zoneid_to_netstackid(zoneid)) != zoneid) {
2124 		uintptr_t key[2] = { zoneid, (uintptr_t)name };
2125 
2126 		ifp = avl_find(&ips->ips_avl_by_shared, (void *)key, NULL);
2127 		if (ifp != NULL)
2128 			*idp = (uint_t)ifp->if_index;
2129 	}
2130 
2131 	mutex_exit(&ips->ips_avl_lock);
2132 	ipnet_rele(ips);
2133 
2134 	if (ifp == NULL)
2135 		return (ESRCH);
2136 	return (0);
2137 }
2138 
2139 /*
2140  * Strictly speaking, there is no such thing as a "client" in ipnet, like
2141  * there is in mac. BPF only needs to have this because it is required as
2142  * part of interfacing correctly with mac. The reuse of the original
2143  * ipnetif_t as a client poses no danger, so long as it is done with its
2144  * own ref-count'd hold that is given up on close.
2145  */
2146 int
2147 ipnet_client_open(ipnetif_t *ptr, ipnetif_t **result)
2148 {
2149 	ASSERT(ptr != NULL);
2150 	ASSERT(result != NULL);
2151 	ipnetif_refhold(ptr);
2152 	*result = ptr;
2153 
2154 	return (0);
2155 }
2156 
2157 void
2158 ipnet_client_close(ipnetif_t *ptr)
2159 {
2160 	ASSERT(ptr != NULL);
2161 	ipnetif_refrele(ptr);
2162 }
2163 
2164 /*
2165  * This is called from BPF when it needs to start receiving packets
2166  * from ipnet.
2167  *
2168  * The use of the ipnet_t structure here is somewhat lightweight when
2169  * compared to how it is used elsewhere but it already has all of the
2170  * right fields in it, so reuse here doesn't seem out of order. Its
2171  * primary purpose here is to provide the means to store pointers for
2172  * use when ipnet_promisc_remove() needs to be called.
2173  *
2174  * This should never be called for the IPNET_MINOR_LO device as it is
2175  * never created via ipnetif_create.
2176  */
2177 /*ARGSUSED*/
2178 int
2179 ipnet_promisc_add(void *handle, uint_t how, void *data, uintptr_t *mhandle,
2180     int flags)
2181 {
2182 	ip_stack_t	*ipst;
2183 	netstack_t	*ns;
2184 	ipnetif_t	*ifp;
2185 	ipnet_t		*ipnet;
2186 	char		name[32];
2187 	int		error;
2188 
2189 	ifp = (ipnetif_t *)handle;
2190 
2191 	if (how != DL_PROMISC_PHYS && how != DL_PROMISC_MULTI)
2192 		return (EINVAL);
2193 
2194 	ns = netstack_find_by_zoneid(ifp->if_zoneid);
2195 
2196 	if ((error = ipnet_join_allmulti(ifp, ns->netstack_ipnet)) != 0) {
2197 		netstack_rele(ns);
2198 		return (error);
2199 	}
2200 
2201 	ipnet = kmem_zalloc(sizeof (*ipnet), KM_SLEEP);
2202 	ipnet->ipnet_if = ifp;
2203 	ipnet->ipnet_ns = ns;
2204 	ipnet->ipnet_flags = flags;
2205 
2206 	if ((ifp->if_flags & IPNETIF_LOOPBACK) != 0) {
2207 		ipnet->ipnet_acceptfn = ipnet_loaccept;
2208 	} else {
2209 		ipnet->ipnet_acceptfn = ipnet_accept;
2210 	}
2211 
2212 	/*
2213 	 * To register multiple hooks with the same callback function,
2214 	 * a unique name is needed.
2215 	 */
2216 	HOOK_INIT(ipnet->ipnet_hook, ipnet_bpf_bounce, "", ipnet);
2217 	(void) snprintf(name, sizeof (name), "ipnet_promisc_%p",
2218 	    (void *)ipnet->ipnet_hook);
2219 	ipnet->ipnet_hook->h_name = strdup(name);
2220 	ipnet->ipnet_data = data;
2221 	ipnet->ipnet_zoneid = ifp->if_zoneid;
2222 
2223 	ipst = ns->netstack_ip;
2224 
2225 	error = net_hook_register(ipst->ips_ip4_observe_pr, NH_OBSERVE,
2226 	    ipnet->ipnet_hook);
2227 	if (error != 0)
2228 		goto regfail;
2229 
2230 	error = net_hook_register(ipst->ips_ip6_observe_pr, NH_OBSERVE,
2231 	    ipnet->ipnet_hook);
2232 	if (error != 0) {
2233 		(void) net_hook_unregister(ipst->ips_ip4_observe_pr,
2234 		    NH_OBSERVE, ipnet->ipnet_hook);
2235 		goto regfail;
2236 	}
2237 
2238 	*mhandle = (uintptr_t)ipnet;
2239 	netstack_rele(ns);
2240 
2241 	return (0);
2242 
2243 regfail:
2244 	cmn_err(CE_WARN, "net_hook_register failed: %d", error);
2245 	strfree(ipnet->ipnet_hook->h_name);
2246 	hook_free(ipnet->ipnet_hook);
2247 	netstack_rele(ns);
2248 	return (error);
2249 }
2250 
2251 void
2252 ipnet_promisc_remove(void *data)
2253 {
2254 	ip_stack_t	*ipst;
2255 	ipnet_t		*ipnet;
2256 	hook_t		*hook;
2257 
2258 	ipnet = data;
2259 	ipst = ipnet->ipnet_ns->netstack_ip;
2260 	hook = ipnet->ipnet_hook;
2261 
2262 	VERIFY(net_hook_unregister(ipst->ips_ip4_observe_pr, NH_OBSERVE,
2263 	    hook) == 0);
2264 
2265 	VERIFY(net_hook_unregister(ipst->ips_ip6_observe_pr, NH_OBSERVE,
2266 	    hook) == 0);
2267 
2268 	strfree(hook->h_name);
2269 
2270 	hook_free(hook);
2271 
2272 	kmem_free(ipnet, sizeof (*ipnet));
2273 }
2274 
2275 /*
2276  * arg here comes from the ipnet_t allocated in ipnet_promisc_add.
2277  * An important field from that structure is "ipnet_data" that
2278  * contains the "data" pointer passed into ipnet_promisc_add: it needs
2279  * to be passed back to bpf when we call into ipnet_itap.
2280  *
2281  * ipnet_itap is set by ipnet_set_bpfattach, which in turn is called
2282  * from BPF.
2283  */
2284 /*ARGSUSED*/
2285 static int
2286 ipnet_bpf_bounce(hook_event_token_t token, hook_data_t info, void *arg)
2287 {
2288 	hook_pkt_observe_t	*hdr;
2289 	ipnet_addrp_t		src;
2290 	ipnet_addrp_t		dst;
2291 	ipnet_stack_t		*ips;
2292 	ipnet_t			*ipnet;
2293 	mblk_t			*netmp;
2294 	mblk_t			*mp;
2295 
2296 	hdr = (hook_pkt_observe_t *)info;
2297 	mp = hdr->hpo_pkt;
2298 	ipnet = (ipnet_t *)arg;
2299 	ips = ((netstack_t *)hdr->hpo_ctx)->netstack_ipnet;
2300 
2301 	netmp = hdr->hpo_pkt->b_cont;
2302 	src.iap_family = hdr->hpo_family;
2303 	dst.iap_family = hdr->hpo_family;
2304 
2305 	if (hdr->hpo_family == AF_INET) {
2306 		src.iap_addr4 = &((ipha_t *)(netmp->b_rptr))->ipha_src;
2307 		dst.iap_addr4 = &((ipha_t *)(netmp->b_rptr))->ipha_dst;
2308 	} else {
2309 		src.iap_addr6 = &((ip6_t *)(netmp->b_rptr))->ip6_src;
2310 		dst.iap_addr6 = &((ip6_t *)(netmp->b_rptr))->ip6_dst;
2311 	}
2312 
2313 	if (!(*ipnet->ipnet_acceptfn)(ipnet, hdr, &src, &dst)) {
2314 		IPSK_BUMP(ips, ik_acceptFail);
2315 		return (0);
2316 	}
2317 	IPSK_BUMP(ips, ik_acceptOk);
2318 
2319 	ipnet_itap(ipnet->ipnet_data, mp,
2320 	    hdr->hpo_htype == htons(IPOBS_HOOK_OUTBOUND),
2321 	    ntohl(hdr->hpo_pktlen) + MBLKL(mp));
2322 
2323 	return (0);
2324 }
2325 
2326 /*
2327  * clone'd ipnetif_t's are created when a shared IP instance zone comes
2328  * to life and configures an IP address. The model that BPF uses is that
2329  * each interface must have a unique pointer and each interface must be
2330  * representative of what it can capture. They are limited to one DLT
2331  * per interface and one zone per interface. Thus every interface that
2332  * can be seen in a zone must be announced via an attach to bpf. For
2333  * shared instance zones, this means the ipnet driver needs to detect
2334  * when an address is added to an interface in a zone for the first
2335  * time (and also when the last address is removed.)
2336  */
2337 static ipnetif_t *
2338 ipnetif_clone_create(ipnetif_t *ifp, zoneid_t zoneid)
2339 {
2340 	uintptr_t	key[2] = { zoneid, (uintptr_t)ifp->if_name };
2341 	ipnet_stack_t	*ips = ifp->if_stackp;
2342 	avl_index_t	where = 0;
2343 	ipnetif_t	*newif;
2344 
2345 	mutex_enter(&ips->ips_avl_lock);
2346 	newif = avl_find(&ips->ips_avl_by_shared, (void *)key, &where);
2347 	if (newif != NULL) {
2348 		ipnetif_refhold(newif);
2349 		newif->if_sharecnt++;
2350 		mutex_exit(&ips->ips_avl_lock);
2351 		return (newif);
2352 	}
2353 
2354 	newif = ipnet_alloc_if(ips);
2355 	if (newif == NULL) {
2356 		mutex_exit(&ips->ips_avl_lock);
2357 		return (NULL);
2358 	}
2359 
2360 	newif->if_refcnt = 1;
2361 	newif->if_sharecnt = 1;
2362 	newif->if_zoneid = zoneid;
2363 	(void) strlcpy(newif->if_name, ifp->if_name, LIFNAMSIZ);
2364 	newif->if_flags = ifp->if_flags & IPNETIF_LOOPBACK;
2365 	newif->if_index = ifp->if_index;
2366 
2367 	avl_insert(&ips->ips_avl_by_shared, newif, where);
2368 	mutex_exit(&ips->ips_avl_lock);
2369 
2370 	return (newif);
2371 }
2372 
2373 static void
2374 ipnetif_clone_release(ipnetif_t *ipnetif)
2375 {
2376 	boolean_t	dofree = B_FALSE;
2377 	boolean_t	doremove = B_FALSE;
2378 	ipnet_stack_t	*ips = ipnetif->if_stackp;
2379 
2380 	mutex_enter(&ipnetif->if_reflock);
2381 	ASSERT(ipnetif->if_refcnt > 0);
2382 	if (--ipnetif->if_refcnt == 0)
2383 		dofree = B_TRUE;
2384 	ASSERT(ipnetif->if_sharecnt > 0);
2385 	if (--ipnetif->if_sharecnt == 0)
2386 		doremove = B_TRUE;
2387 	mutex_exit(&ipnetif->if_reflock);
2388 	if (doremove) {
2389 		mutex_enter(&ips->ips_avl_lock);
2390 		avl_remove(&ips->ips_avl_by_shared, ipnetif);
2391 		mutex_exit(&ips->ips_avl_lock);
2392 	}
2393 	if (dofree) {
2394 		ASSERT(ipnetif->if_sharecnt == 0);
2395 		ipnetif_free(ipnetif);
2396 	}
2397 }
2398