xref: /titanic_52/usr/src/uts/common/inet/ipnet/ipnet.c (revision 906afcb89d0412cc073b95c2d701a804a8cdb62c)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * The ipnet device defined here provides access to packets at the IP layer. To
29  * provide access to packets at this layer it registers a callback function in
30  * the ip module and when there are open instances of the device ip will pass
31  * packets into the device. Packets from ip are passed on the input, output and
32  * loopback paths. Internally the module returns to ip as soon as possible by
33  * deferring processing using a taskq.
34  *
35  * Management of the devices in /dev/ipnet/ is handled by the devname
36  * filesystem and use of the neti interfaces.  This module registers for NIC
 * events using the neti framework so that when IP interfaces are brought up,
38  * taken down etc. the ipnet module is notified and its view of the interfaces
39  * configured on the system adjusted.  On attach, the module gets an initial
40  * view of the system again using the neti framework but as it has already
41  * registered for IP interface events, it is still up-to-date with any changes.
42  */
43 
44 #include <sys/types.h>
45 #include <sys/conf.h>
46 #include <sys/cred.h>
47 #include <sys/stat.h>
48 #include <sys/ddi.h>
49 #include <sys/sunddi.h>
50 #include <sys/modctl.h>
51 #include <sys/dlpi.h>
52 #include <sys/strsun.h>
53 #include <sys/id_space.h>
54 #include <sys/kmem.h>
55 #include <sys/mkdev.h>
56 #include <sys/neti.h>
57 #include <net/if.h>
58 #include <sys/errno.h>
59 #include <sys/list.h>
60 #include <sys/ksynch.h>
61 #include <sys/hook_event.h>
62 #include <sys/sdt.h>
63 #include <sys/stropts.h>
64 #include <sys/sysmacros.h>
65 #include <inet/ip.h>
66 #include <inet/ip_if.h>
67 #include <inet/ip_multi.h>
68 #include <inet/ip6.h>
69 #include <inet/ipnet.h>
70 #include <net/bpf.h>
71 #include <net/bpfdesc.h>
72 #include <net/dlt.h>
73 
/*
 * STREAMS module information for the ipnet driver.  This single instance is
 * referenced by both the read-side and the write-side qinit structures.
 */
static struct module_info ipnet_minfo = {
	1,		/* mi_idnum */
	"ipnet",	/* mi_idname */
	0,		/* mi_minpsz */
	INFPSZ,		/* mi_maxpsz */
	2048,		/* mi_hiwat */
	0		/* mi_lowat */
};
82 
83 /*
84  * List to hold static view of ipnetif_t's on the system. This is needed to
85  * avoid holding the lock protecting the avl tree of ipnetif's over the
86  * callback into the dev filesystem.
87  */
typedef struct ipnetif_cbdata {
	/* Interface name; the buffer holds at most LIFNAMSIZ bytes. */
	char		ic_ifname[LIFNAMSIZ];
	/* NOTE(review): presumably the ipnet dev_t of that interface. */
	dev_t		ic_dev;
	/* Linkage for the list_t that holds these entries. */
	list_node_t	ic_next;
} ipnetif_cbdata_t;
93 
94 /*
95  * Convenience enumerated type for ipnet_accept().  It describes the
96  * properties of a given ipnet_addrp_t relative to a single ipnet_t
97  * client stream.  The values represent whether the address is ...
98  */
typedef enum {
	/* An address configured on the client's own ipnetif_t. */
	IPNETADDR_MYADDR,
	/* A multicast or broadcast address (including subnet broadcast). */
	IPNETADDR_MBCAST,
	/* None of the above; also the initial value before classification. */
	IPNETADDR_UNKNOWN
} ipnet_addrtype_t;
104 
105 /* Argument used for the ipnet_nicevent_taskq callback. */
/*
 * NOTE(review): the field descriptions below are inferred from the field
 * names and types only; the code that fills this structure in is not
 * adjacent to this definition.  Confirm against ipnet_nicevent_cb() and
 * ipnet_nicevent_task().
 */
typedef struct ipnet_nicevent_s {
	nic_event_t		ipne_event;	/* which NIC event occurred */
	net_handle_t		ipne_protocol;	/* protocol handle of event */
	netstackid_t		ipne_stackid;	/* netstack of the event */
	uint64_t		ipne_ifindex;	/* physical interface index */
	uint64_t		ipne_lifindex;	/* logical interface index */
	char			ipne_ifname[LIFNAMSIZ];	/* interface name */
} ipnet_nicevent_t;
114 
/*
 * Module-wide state.
 *
 * ipnet_dip is set by ipnet_attach() and cleared by ipnet_detach().
 * ipnet_major, both taskqs and ipnet_minor_space are set up in _init();
 * the taskqs and the minor space are destroyed again in _fini() (or on a
 * failed _init()).  Minor numbers for opened streams are allocated from
 * ipnet_minor_space, which starts at IPNET_MINOR_MIN.
 */
static dev_info_t	*ipnet_dip;
static major_t		ipnet_major;
static ddi_taskq_t	*ipnet_taskq;		/* taskq for packets */
static ddi_taskq_t	*ipnet_nicevent_taskq;	/* taskq for NIC events */
static id_space_t	*ipnet_minor_space;
static const int	IPNET_MINOR_LO = 1; 	/* minor number for /dev/lo0 */
static const int 	IPNET_MINOR_MIN = 2; 	/* start of dynamic minors */
/* Template copied into every DL_INFO_ACK reply. */
static dl_info_ack_t	ipnet_infoack = IPNET_INFO_ACK_INIT;
/* Per-stream accept functions; ipnet_open() installs one of the two. */
static ipnet_acceptfn_t	ipnet_accept, ipnet_loaccept;
static bpf_itap_fn_t	ipnet_itap;
125 
/*
 * Forward declarations for the file-local functions: STREAMS and DDI entry
 * points first, then DLPI request handlers, multicast helpers, NIC event
 * handling, ipnetif_t management and per-netstack setup/teardown.
 */
static void	ipnet_input(mblk_t *);
static int	ipnet_wput(queue_t *, mblk_t *);
static int	ipnet_rsrv(queue_t *);
static int	ipnet_open(queue_t *, dev_t *, int, int, cred_t *);
static int	ipnet_close(queue_t *);
static void	ipnet_ioctl(queue_t *, mblk_t *);
static void	ipnet_iocdata(queue_t *, mblk_t *);
static void 	ipnet_wputnondata(queue_t *, mblk_t *);
static int	ipnet_attach(dev_info_t *, ddi_attach_cmd_t);
static int	ipnet_detach(dev_info_t *, ddi_detach_cmd_t);
static int	ipnet_devinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
static void	ipnet_inforeq(queue_t *q, mblk_t *mp);
static void	ipnet_bindreq(queue_t *q, mblk_t *mp);
static void	ipnet_unbindreq(queue_t *q, mblk_t *mp);
static void	ipnet_dlpromisconreq(queue_t *q, mblk_t *mp);
static void	ipnet_dlpromiscoffreq(queue_t *q, mblk_t *mp);
static int	ipnet_join_allmulti(ipnetif_t *, ipnet_stack_t *);
static void	ipnet_leave_allmulti(ipnetif_t *, ipnet_stack_t *);
static int	ipnet_nicevent_cb(hook_event_token_t, hook_data_t, void *);
static void	ipnet_nicevent_task(void *);
static ipnetif_t *ipnetif_create(const char *, uint64_t, ipnet_stack_t *,
    uint64_t);
static void	ipnetif_remove(ipnetif_t *, ipnet_stack_t *);
static ipnetif_addr_t *ipnet_match_lif(ipnetif_t *, lif_if_t, boolean_t);
static ipnetif_t *ipnetif_getby_index(uint64_t, ipnet_stack_t *);
static ipnetif_t *ipnetif_getby_dev(dev_t, ipnet_stack_t *);
static boolean_t ipnetif_in_zone(ipnetif_t *, zoneid_t, ipnet_stack_t *);
static void	ipnetif_zonecheck(ipnetif_t *, ipnet_stack_t *);
static int	ipnet_populate_if(net_handle_t, ipnet_stack_t *, boolean_t);
static int 	ipnetif_compare_name(const void *, const void *);
static int 	ipnetif_compare_name_zone(const void *, const void *);
static int 	ipnetif_compare_index(const void *, const void *);
static void	ipnet_add_ifaddr(uint64_t, ipnetif_t *, net_handle_t);
static void	ipnet_delete_ifaddr(ipnetif_addr_t *, ipnetif_t *, boolean_t);
static void	ipnetif_refhold(ipnetif_t *);
static void	ipnetif_refrele(ipnetif_t *);
static void	ipnet_walkers_inc(ipnet_stack_t *);
static void	ipnet_walkers_dec(ipnet_stack_t *);
static void	ipnet_register_netihook(ipnet_stack_t *);
static void	*ipnet_stack_init(netstackid_t, netstack_t *);
static void	ipnet_stack_fini(netstackid_t, void *);
static void	ipnet_dispatch(void *);
static int	ipobs_bounce_func(hook_event_token_t, hook_data_t, void *);
static int	ipnet_bpf_bounce(hook_event_token_t, hook_data_t, void *);
static ipnetif_t *ipnetif_clone_create(ipnetif_t *, zoneid_t);
static void	ipnetif_clone_release(ipnetif_t *);
172 
/*
 * Read-side queue: no put procedure, only a service procedure that drains
 * queued messages upstream.  The open and close entry points live here.
 */
static struct qinit ipnet_rinit = {
	NULL,		/* qi_putp */
	ipnet_rsrv,	/* qi_srvp */
	ipnet_open,	/* qi_qopen */
	ipnet_close,	/* qi_qclose */
	NULL,		/* qi_qadmin */
	&ipnet_minfo,	/* qi_minfo */
};

/*
 * Write-side queue: everything is handled directly in the put procedure;
 * there is no write-side service procedure.
 */
static struct qinit ipnet_winit = {
	ipnet_wput,	/* qi_putp */
	NULL,		/* qi_srvp */
	NULL,		/* qi_qopen */
	NULL,		/* qi_qclose */
	NULL,		/* qi_qadmin */
	&ipnet_minfo,	/* qi_minfo */
};

/* Stream table tying the read and write qinit structures together. */
static struct streamtab ipnet_info = {
	&ipnet_rinit, &ipnet_winit
};
194 
/*
 * Device operations for the driver: attach/detach/getinfo are provided by
 * this file, the STREAMS entry points come from ipnet_info, and quiesce is
 * explicitly not supported.
 */
DDI_DEFINE_STREAM_OPS(ipnet_ops, nulldev, nulldev, ipnet_attach,
    ipnet_detach, nodev, ipnet_devinfo, D_MP | D_MTPERMOD, &ipnet_info,
    ddi_quiesce_not_supported);

/* Loadable driver description handed to the module framework. */
static struct modldrv modldrv = {
	&mod_driverops,
	"STREAMS ipnet driver",
	&ipnet_ops
};

/* Linkage passed to mod_install(), mod_remove() and mod_info(). */
static struct modlinkage modlinkage = {
	MODREV_1, &modldrv, NULL
};
208 
209 /*
210  * This structure contains the template data (names and type) that is
211  * copied, in bulk, into the new kstats structure created by net_kstat_create.
212  * No actual statistical information is stored in this instance of the
213  * ipnet_kstats_t structure.
214  */
/*
 * The initializers are positional, so the entries must stay in the same
 * order as the members of ipnet_kstats_t.  The whole template is copied
 * into a stack's ips_stats when its kstat is created.
 */
static ipnet_kstats_t stats_template = {
	{ "duplicationFail",	KSTAT_DATA_UINT64 },
	{ "dispatchOk",		KSTAT_DATA_UINT64 },
	{ "dispatchFail",	KSTAT_DATA_UINT64 },
	{ "dispatchHeaderDrop",	KSTAT_DATA_UINT64 },
	{ "dispatchDupDrop",	KSTAT_DATA_UINT64 },
	{ "dispatchPutDrop",	KSTAT_DATA_UINT64 },
	{ "dispatchDeliver",	KSTAT_DATA_UINT64 },
	{ "acceptOk",		KSTAT_DATA_UINT64 },
	{ "acceptFail",		KSTAT_DATA_UINT64 }
};
226 
227 /*
228  * Walk the list of physical interfaces on the machine, for each
229  * interface create a new ipnetif_t and add any addresses to it. We
230  * need to do the walk twice, once for IPv4 and once for IPv6.
231  *
232  * The interfaces are destroyed as part of ipnet_stack_fini() for each
233  * stack.  Note that we cannot do this initialization in
234  * ipnet_stack_init(), since ipnet_stack_init() cannot fail.
235  */
236 static int
237 ipnetif_init(void)
238 {
239 	netstack_handle_t	nh;
240 	netstack_t		*ns;
241 	ipnet_stack_t		*ips;
242 	int			ret = 0;
243 
244 	netstack_next_init(&nh);
245 	while ((ns = netstack_next(&nh)) != NULL) {
246 		ips = ns->netstack_ipnet;
247 		if ((ret = ipnet_populate_if(ips->ips_ndv4, ips, B_FALSE)) == 0)
248 			ret = ipnet_populate_if(ips->ips_ndv6, ips, B_TRUE);
249 		netstack_rele(ns);
250 		if (ret != 0)
251 			break;
252 	}
253 	netstack_next_fini(&nh);
254 	return (ret);
255 }
256 
257 /*
258  * Standard module entry points.
259  */
260 int
261 _init(void)
262 {
263 	int		ret;
264 	boolean_t	netstack_registered = B_FALSE;
265 
266 	if ((ipnet_major = ddi_name_to_major("ipnet")) == (major_t)-1)
267 		return (ENODEV);
268 	ipnet_minor_space = id_space_create("ipnet_minor_space",
269 	    IPNET_MINOR_MIN, MAXMIN32);
270 
271 	/*
272 	 * We call ddi_taskq_create() with nthread == 1 to ensure in-order
273 	 * delivery of packets to clients.  Note that we need to create the
274 	 * taskqs before calling netstack_register() since ipnet_stack_init()
275 	 * registers callbacks that use 'em.
276 	 */
277 	ipnet_taskq = ddi_taskq_create(NULL, "ipnet", 1, TASKQ_DEFAULTPRI, 0);
278 	ipnet_nicevent_taskq = ddi_taskq_create(NULL, "ipnet_nic_event_queue",
279 	    1, TASKQ_DEFAULTPRI, 0);
280 	if (ipnet_taskq == NULL || ipnet_nicevent_taskq == NULL) {
281 		ret = ENOMEM;
282 		goto done;
283 	}
284 
285 	netstack_register(NS_IPNET, ipnet_stack_init, NULL, ipnet_stack_fini);
286 	netstack_registered = B_TRUE;
287 
288 	if ((ret = ipnetif_init()) == 0)
289 		ret = mod_install(&modlinkage);
290 done:
291 	if (ret != 0) {
292 		if (ipnet_taskq != NULL)
293 			ddi_taskq_destroy(ipnet_taskq);
294 		if (ipnet_nicevent_taskq != NULL)
295 			ddi_taskq_destroy(ipnet_nicevent_taskq);
296 		if (netstack_registered)
297 			netstack_unregister(NS_IPNET);
298 		id_space_destroy(ipnet_minor_space);
299 	}
300 	return (ret);
301 }
302 
303 int
304 _fini(void)
305 {
306 	int	err;
307 
308 	if ((err = mod_remove(&modlinkage)) != 0)
309 		return (err);
310 
311 	netstack_unregister(NS_IPNET);
312 	ddi_taskq_destroy(ipnet_nicevent_taskq);
313 	ddi_taskq_destroy(ipnet_taskq);
314 	id_space_destroy(ipnet_minor_space);
315 	return (0);
316 }
317 
318 int
319 _info(struct modinfo *modinfop)
320 {
321 	return (mod_info(&modlinkage, modinfop));
322 }
323 
324 static void
325 ipnet_register_netihook(ipnet_stack_t *ips)
326 {
327 	int		ret;
328 	zoneid_t	zoneid;
329 	netid_t		netid;
330 
331 	HOOK_INIT(ips->ips_nicevents, ipnet_nicevent_cb, "ipnet_nicevents",
332 	    ips);
333 
334 	/*
335 	 * It is possible for an exclusive stack to be in the process of
336 	 * shutting down here, and the netid and protocol lookups could fail
337 	 * in that case.
338 	 */
339 	zoneid = netstackid_to_zoneid(ips->ips_netstack->netstack_stackid);
340 	if ((netid = net_zoneidtonetid(zoneid)) == -1)
341 		return;
342 
343 	if ((ips->ips_ndv4 = net_protocol_lookup(netid, NHF_INET)) != NULL) {
344 		if ((ret = net_hook_register(ips->ips_ndv4, NH_NIC_EVENTS,
345 		    ips->ips_nicevents)) != 0) {
346 			VERIFY(net_protocol_release(ips->ips_ndv4) == 0);
347 			ips->ips_ndv4 = NULL;
348 			cmn_err(CE_WARN, "unable to register IPv4 netinfo hooks"
349 			    " in zone %d: %d", zoneid, ret);
350 		}
351 	}
352 	if ((ips->ips_ndv6 = net_protocol_lookup(netid, NHF_INET6)) != NULL) {
353 		if ((ret = net_hook_register(ips->ips_ndv6, NH_NIC_EVENTS,
354 		    ips->ips_nicevents)) != 0) {
355 			VERIFY(net_protocol_release(ips->ips_ndv6) == 0);
356 			ips->ips_ndv6 = NULL;
357 			cmn_err(CE_WARN, "unable to register IPv6 netinfo hooks"
358 			    " in zone %d: %d", zoneid, ret);
359 		}
360 	}
361 
362 	/*
363 	 * Create a local set of kstats for each zone.
364 	 */
365 	ips->ips_kstatp = net_kstat_create(netid, "ipnet", 0, "ipnet_stats",
366 	    "misc", KSTAT_TYPE_NAMED,
367 	    sizeof (ipnet_kstats_t) / sizeof (kstat_named_t), 0);
368 	if (ips->ips_kstatp != NULL) {
369 		bcopy(&stats_template, &ips->ips_stats,
370 		    sizeof (ips->ips_stats));
371 		ips->ips_kstatp->ks_data = &ips->ips_stats;
372 		ips->ips_kstatp->ks_private =
373 		    (void *)(uintptr_t)ips->ips_netstack->netstack_stackid;
374 		kstat_install(ips->ips_kstatp);
375 	} else {
376 		cmn_err(CE_WARN, "net_kstat_create(%s,%s,%s) failed",
377 		    "ipnet", "ipnet_stats", "misc");
378 	}
379 }
380 
381 /*
382  * This function is called on attach to build an initial view of the
383  * interfaces on the system. It will be called once for IPv4 and once
384  * for IPv6, although there is only one ipnet interface for both IPv4
385  * and IPv6 there are separate address lists.
386  */
387 static int
388 ipnet_populate_if(net_handle_t nd, ipnet_stack_t *ips, boolean_t isv6)
389 {
390 	phy_if_t	phyif;
391 	lif_if_t	lif;
392 	ipnetif_t	*ipnetif;
393 	char		name[LIFNAMSIZ];
394 	boolean_t	new_if = B_FALSE;
395 	uint64_t	ifflags;
396 	int		ret = 0;
397 
398 	/*
399 	 * If ipnet_register_netihook() was unable to initialize this
400 	 * stack's net_handle_t, then we cannot populate any interface
401 	 * information.  This usually happens when we attempted to
402 	 * grab a net_handle_t as a stack was shutting down.  We don't
403 	 * want to fail the entire _init() operation because of a
404 	 * stack shutdown (other stacks will continue to work just
405 	 * fine), so we silently return success here.
406 	 */
407 	if (nd == NULL)
408 		return (0);
409 
410 	/*
411 	 * Make sure we're not processing NIC events during the
412 	 * population of our interfaces and address lists.
413 	 */
414 	mutex_enter(&ips->ips_event_lock);
415 
416 	for (phyif = net_phygetnext(nd, 0); phyif != 0;
417 	    phyif = net_phygetnext(nd, phyif)) {
418 		if (net_getifname(nd, phyif, name, LIFNAMSIZ) != 0)
419 			continue;
420 		ifflags =  0;
421 		(void) net_getlifflags(nd, phyif, 0, &ifflags);
422 		if ((ipnetif = ipnetif_getby_index(phyif, ips)) == NULL) {
423 			ipnetif = ipnetif_create(name, phyif, ips, ifflags);
424 			if (ipnetif == NULL) {
425 				ret = ENOMEM;
426 				goto done;
427 			}
428 			new_if = B_TRUE;
429 		}
430 		ipnetif->if_flags |=
431 		    isv6 ? IPNETIF_IPV6PLUMBED : IPNETIF_IPV4PLUMBED;
432 
433 		for (lif = net_lifgetnext(nd, phyif, 0); lif != 0;
434 		    lif = net_lifgetnext(nd, phyif, lif)) {
435 			/*
436 			 * Skip addresses that aren't up.  We'll add
437 			 * them when we receive an NE_LIF_UP event.
438 			 */
439 			if (net_getlifflags(nd, phyif, lif, &ifflags) != 0 ||
440 			    !(ifflags & IFF_UP))
441 				continue;
442 			/* Don't add it if we already have it. */
443 			if (ipnet_match_lif(ipnetif, lif, isv6) != NULL)
444 				continue;
445 			ipnet_add_ifaddr(lif, ipnetif, nd);
446 		}
447 		if (!new_if)
448 			ipnetif_refrele(ipnetif);
449 	}
450 
451 done:
452 	mutex_exit(&ips->ips_event_lock);
453 	return (ret);
454 }
455 
456 static int
457 ipnet_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
458 {
459 	if (cmd != DDI_ATTACH)
460 		return (DDI_FAILURE);
461 
462 	if (ddi_create_minor_node(dip, "lo0", S_IFCHR, IPNET_MINOR_LO,
463 	    DDI_PSEUDO, 0) == DDI_FAILURE)
464 		return (DDI_FAILURE);
465 
466 	ipnet_dip = dip;
467 	return (DDI_SUCCESS);
468 }
469 
470 static int
471 ipnet_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
472 {
473 	if (cmd != DDI_DETACH)
474 		return (DDI_FAILURE);
475 
476 	ASSERT(dip == ipnet_dip);
477 	ddi_remove_minor_node(ipnet_dip, NULL);
478 	ipnet_dip = NULL;
479 	return (DDI_SUCCESS);
480 }
481 
482 /* ARGSUSED */
483 static int
484 ipnet_devinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
485 {
486 	int	error = DDI_FAILURE;
487 
488 	switch (infocmd) {
489 	case DDI_INFO_DEVT2INSTANCE:
490 		*result = (void *)0;
491 		error = DDI_SUCCESS;
492 		break;
493 	case DDI_INFO_DEVT2DEVINFO:
494 		if (ipnet_dip != NULL) {
495 			*result = ipnet_dip;
496 			error = DDI_SUCCESS;
497 		}
498 		break;
499 	}
500 	return (error);
501 }
502 
503 /* ARGSUSED */
504 static int
505 ipnet_open(queue_t *rq, dev_t *dev, int oflag, int sflag, cred_t *crp)
506 {
507 	ipnet_t		*ipnet;
508 	netstack_t	*ns = NULL;
509 	ipnet_stack_t	*ips;
510 	int		err = 0;
511 	zoneid_t	zoneid = crgetzoneid(crp);
512 
513 	/*
514 	 * If the system is labeled, only the global zone is allowed to open
515 	 * IP observability nodes.
516 	 */
517 	if (is_system_labeled() && zoneid != GLOBAL_ZONEID)
518 		return (EACCES);
519 
520 	/* We don't support open as a module */
521 	if (sflag & MODOPEN)
522 		return (ENOTSUP);
523 
524 	/* This driver is self-cloning, we don't support re-open. */
525 	if (rq->q_ptr != NULL)
526 		return (EBUSY);
527 
528 	if ((ipnet = kmem_zalloc(sizeof (*ipnet), KM_NOSLEEP)) == NULL)
529 		return (ENOMEM);
530 
531 	VERIFY((ns = netstack_find_by_cred(crp)) != NULL);
532 	ips = ns->netstack_ipnet;
533 
534 	rq->q_ptr = WR(rq)->q_ptr = ipnet;
535 	ipnet->ipnet_rq = rq;
536 	ipnet->ipnet_minor = (minor_t)id_alloc(ipnet_minor_space);
537 	ipnet->ipnet_zoneid = zoneid;
538 	ipnet->ipnet_dlstate = DL_UNBOUND;
539 	ipnet->ipnet_ns = ns;
540 
541 	/*
542 	 * We need to hold ips_event_lock here as any NE_LIF_DOWN events need
543 	 * to be processed after ipnet_if is set and the ipnet_t has been
544 	 * inserted in the ips_str_list.
545 	 */
546 	mutex_enter(&ips->ips_event_lock);
547 	if (getminor(*dev) == IPNET_MINOR_LO) {
548 		ipnet->ipnet_flags |= IPNET_LOMODE;
549 		ipnet->ipnet_acceptfn = ipnet_loaccept;
550 	} else {
551 		ipnet->ipnet_acceptfn = ipnet_accept;
552 		ipnet->ipnet_if = ipnetif_getby_dev(*dev, ips);
553 		if (ipnet->ipnet_if == NULL ||
554 		    !ipnetif_in_zone(ipnet->ipnet_if, zoneid, ips)) {
555 			err = ENODEV;
556 			goto done;
557 		}
558 	}
559 
560 	mutex_enter(&ips->ips_walkers_lock);
561 	while (ips->ips_walkers_cnt != 0)
562 		cv_wait(&ips->ips_walkers_cv, &ips->ips_walkers_lock);
563 	list_insert_head(&ips->ips_str_list, ipnet);
564 	*dev = makedevice(getmajor(*dev), ipnet->ipnet_minor);
565 	qprocson(rq);
566 
567 	/*
568 	 * Only register our callback if we're the first open client; we call
569 	 * unregister in close() for the last open client.
570 	 */
571 	if (list_head(&ips->ips_str_list) == list_tail(&ips->ips_str_list))
572 		ips->ips_hook = ipobs_register_hook(ns, ipnet_input);
573 	mutex_exit(&ips->ips_walkers_lock);
574 
575 done:
576 	mutex_exit(&ips->ips_event_lock);
577 	if (err != 0) {
578 		netstack_rele(ns);
579 		id_free(ipnet_minor_space, ipnet->ipnet_minor);
580 		if (ipnet->ipnet_if != NULL)
581 			ipnetif_refrele(ipnet->ipnet_if);
582 		kmem_free(ipnet, sizeof (*ipnet));
583 	}
584 	return (err);
585 }
586 
587 static int
588 ipnet_close(queue_t *rq)
589 {
590 	ipnet_t		*ipnet = rq->q_ptr;
591 	ipnet_stack_t	*ips = ipnet->ipnet_ns->netstack_ipnet;
592 
593 	if (ipnet->ipnet_flags & IPNET_PROMISC_PHYS)
594 		ipnet_leave_allmulti(ipnet->ipnet_if, ips);
595 	if (ipnet->ipnet_flags & IPNET_PROMISC_MULTI)
596 		ipnet_leave_allmulti(ipnet->ipnet_if, ips);
597 
598 	mutex_enter(&ips->ips_walkers_lock);
599 	while (ips->ips_walkers_cnt != 0)
600 		cv_wait(&ips->ips_walkers_cv, &ips->ips_walkers_lock);
601 
602 	qprocsoff(rq);
603 
604 	list_remove(&ips->ips_str_list, ipnet);
605 	if (ipnet->ipnet_if != NULL)
606 		ipnetif_refrele(ipnet->ipnet_if);
607 	id_free(ipnet_minor_space, ipnet->ipnet_minor);
608 
609 	if (list_is_empty(&ips->ips_str_list)) {
610 		ipobs_unregister_hook(ips->ips_netstack, ips->ips_hook);
611 		ips->ips_hook = NULL;
612 	}
613 
614 	kmem_free(ipnet, sizeof (*ipnet));
615 
616 	mutex_exit(&ips->ips_walkers_lock);
617 	netstack_rele(ips->ips_netstack);
618 	return (0);
619 }
620 
621 static int
622 ipnet_wput(queue_t *q, mblk_t *mp)
623 {
624 	switch (mp->b_datap->db_type) {
625 	case M_FLUSH:
626 		if (*mp->b_rptr & FLUSHW) {
627 			flushq(q, FLUSHDATA);
628 			*mp->b_rptr &= ~FLUSHW;
629 		}
630 		if (*mp->b_rptr & FLUSHR)
631 			qreply(q, mp);
632 		else
633 			freemsg(mp);
634 		break;
635 	case M_PROTO:
636 	case M_PCPROTO:
637 		ipnet_wputnondata(q, mp);
638 		break;
639 	case M_IOCTL:
640 		ipnet_ioctl(q, mp);
641 		break;
642 	case M_IOCDATA:
643 		ipnet_iocdata(q, mp);
644 		break;
645 	default:
646 		freemsg(mp);
647 		break;
648 	}
649 	return (0);
650 }
651 
652 static int
653 ipnet_rsrv(queue_t *q)
654 {
655 	mblk_t	*mp;
656 
657 	while ((mp = getq(q)) != NULL) {
658 		ASSERT(DB_TYPE(mp) == M_DATA);
659 		if (canputnext(q)) {
660 			putnext(q, mp);
661 		} else {
662 			(void) putbq(q, mp);
663 			break;
664 		}
665 	}
666 	return (0);
667 }
668 
669 static void
670 ipnet_ioctl(queue_t *q, mblk_t *mp)
671 {
672 	struct iocblk	*iocp = (struct iocblk *)mp->b_rptr;
673 
674 	switch (iocp->ioc_cmd) {
675 	case DLIOCRAW:
676 		miocack(q, mp, 0, 0);
677 		break;
678 	case DLIOCIPNETINFO:
679 		if (iocp->ioc_count == TRANSPARENT) {
680 			mcopyin(mp, NULL, sizeof (uint_t), NULL);
681 			qreply(q, mp);
682 			break;
683 		}
684 		/* Fallthrough, we don't support I_STR with DLIOCIPNETINFO. */
685 	default:
686 		miocnak(q, mp, 0, EINVAL);
687 		break;
688 	}
689 }
690 
691 static void
692 ipnet_iocdata(queue_t *q, mblk_t *mp)
693 {
694 	struct iocblk	*iocp = (struct iocblk *)mp->b_rptr;
695 	ipnet_t	*ipnet = q->q_ptr;
696 
697 	switch (iocp->ioc_cmd) {
698 	case DLIOCIPNETINFO:
699 		if (*(int *)mp->b_cont->b_rptr == 1)
700 			ipnet->ipnet_flags |= IPNET_INFO;
701 		else if (*(int *)mp->b_cont->b_rptr == 0)
702 			ipnet->ipnet_flags &= ~IPNET_INFO;
703 		else
704 			goto iocnak;
705 		miocack(q, mp, 0, DL_IPNETINFO_VERSION);
706 		break;
707 	default:
708 iocnak:
709 		miocnak(q, mp, 0, EINVAL);
710 		break;
711 	}
712 }
713 
714 static void
715 ipnet_wputnondata(queue_t *q, mblk_t *mp)
716 {
717 	union DL_primitives	*dlp = (union DL_primitives *)mp->b_rptr;
718 	t_uscalar_t		prim = dlp->dl_primitive;
719 
720 	switch (prim) {
721 	case DL_INFO_REQ:
722 		ipnet_inforeq(q, mp);
723 		break;
724 	case DL_UNBIND_REQ:
725 		ipnet_unbindreq(q, mp);
726 		break;
727 	case DL_BIND_REQ:
728 		ipnet_bindreq(q, mp);
729 		break;
730 	case DL_PROMISCON_REQ:
731 		ipnet_dlpromisconreq(q, mp);
732 		break;
733 	case DL_PROMISCOFF_REQ:
734 		ipnet_dlpromiscoffreq(q, mp);
735 		break;
736 	case DL_UNITDATA_REQ:
737 	case DL_DETACH_REQ:
738 	case DL_PHYS_ADDR_REQ:
739 	case DL_SET_PHYS_ADDR_REQ:
740 	case DL_ENABMULTI_REQ:
741 	case DL_DISABMULTI_REQ:
742 	case DL_ATTACH_REQ:
743 		dlerrorack(q, mp, prim, DL_UNSUPPORTED, 0);
744 		break;
745 	default:
746 		dlerrorack(q, mp, prim, DL_BADPRIM, 0);
747 		break;
748 	}
749 }
750 
751 static void
752 ipnet_inforeq(queue_t *q, mblk_t *mp)
753 {
754 	dl_info_ack_t	*dlip;
755 	size_t		size = sizeof (dl_info_ack_t) + sizeof (ushort_t);
756 
757 	if (MBLKL(mp) < DL_INFO_REQ_SIZE) {
758 		dlerrorack(q, mp, DL_INFO_REQ, DL_BADPRIM, 0);
759 		return;
760 	}
761 
762 	if ((mp = mexchange(q, mp, size, M_PCPROTO, DL_INFO_ACK)) == NULL)
763 		return;
764 
765 	dlip = (dl_info_ack_t *)mp->b_rptr;
766 	*dlip = ipnet_infoack;
767 	qreply(q, mp);
768 }
769 
770 static void
771 ipnet_bindreq(queue_t *q, mblk_t *mp)
772 {
773 	union DL_primitives	*dlp = (union DL_primitives *)mp->b_rptr;
774 	ipnet_t			*ipnet = q->q_ptr;
775 
776 	if (MBLKL(mp) < DL_BIND_REQ_SIZE) {
777 		dlerrorack(q, mp, DL_BIND_REQ, DL_BADPRIM, 0);
778 		return;
779 	}
780 
781 	switch (dlp->bind_req.dl_sap) {
782 	case 0 :
783 		ipnet->ipnet_family = AF_UNSPEC;
784 		break;
785 	case IPV4_VERSION :
786 		ipnet->ipnet_family = AF_INET;
787 		break;
788 	case IPV6_VERSION :
789 		ipnet->ipnet_family = AF_INET6;
790 		break;
791 	default :
792 		dlerrorack(q, mp, DL_BIND_REQ, DL_BADSAP, 0);
793 		return;
794 		/*NOTREACHED*/
795 	}
796 
797 	ipnet->ipnet_dlstate = DL_IDLE;
798 	dlbindack(q, mp, dlp->bind_req.dl_sap, 0, 0, 0, 0);
799 }
800 
801 static void
802 ipnet_unbindreq(queue_t *q, mblk_t *mp)
803 {
804 	ipnet_t	*ipnet = q->q_ptr;
805 
806 	if (MBLKL(mp) < DL_UNBIND_REQ_SIZE) {
807 		dlerrorack(q, mp, DL_UNBIND_REQ, DL_BADPRIM, 0);
808 		return;
809 	}
810 
811 	if (ipnet->ipnet_dlstate != DL_IDLE) {
812 		dlerrorack(q, mp, DL_UNBIND_REQ, DL_OUTSTATE, 0);
813 	} else {
814 		ipnet->ipnet_dlstate = DL_UNBOUND;
815 		ipnet->ipnet_family = AF_UNSPEC;
816 		dlokack(q, mp, DL_UNBIND_REQ);
817 	}
818 }
819 
820 static void
821 ipnet_dlpromisconreq(queue_t *q, mblk_t *mp)
822 {
823 	ipnet_t		*ipnet = q->q_ptr;
824 	t_uscalar_t	level;
825 	int		err;
826 
827 	if (MBLKL(mp) < DL_PROMISCON_REQ_SIZE) {
828 		dlerrorack(q, mp, DL_PROMISCON_REQ, DL_BADPRIM, 0);
829 		return;
830 	}
831 
832 	if (ipnet->ipnet_flags & IPNET_LOMODE) {
833 		dlokack(q, mp, DL_PROMISCON_REQ);
834 		return;
835 	}
836 
837 	level = ((dl_promiscon_req_t *)mp->b_rptr)->dl_level;
838 	if (level == DL_PROMISC_PHYS || level == DL_PROMISC_MULTI) {
839 		if ((err = ipnet_join_allmulti(ipnet->ipnet_if,
840 		    ipnet->ipnet_ns->netstack_ipnet)) != 0) {
841 			dlerrorack(q, mp, DL_PROMISCON_REQ, DL_SYSERR, err);
842 			return;
843 		}
844 	}
845 
846 	switch (level) {
847 	case DL_PROMISC_PHYS:
848 		ipnet->ipnet_flags |= IPNET_PROMISC_PHYS;
849 		break;
850 	case DL_PROMISC_SAP:
851 		ipnet->ipnet_flags |= IPNET_PROMISC_SAP;
852 		break;
853 	case DL_PROMISC_MULTI:
854 		ipnet->ipnet_flags |= IPNET_PROMISC_MULTI;
855 		break;
856 	default:
857 		dlerrorack(q, mp, DL_PROMISCON_REQ, DL_BADPRIM, 0);
858 		return;
859 	}
860 
861 	dlokack(q, mp, DL_PROMISCON_REQ);
862 }
863 
864 static void
865 ipnet_dlpromiscoffreq(queue_t *q, mblk_t *mp)
866 {
867 	ipnet_t		*ipnet = q->q_ptr;
868 	t_uscalar_t	level;
869 	uint16_t	orig_ipnet_flags = ipnet->ipnet_flags;
870 
871 	if (MBLKL(mp) < DL_PROMISCOFF_REQ_SIZE) {
872 		dlerrorack(q, mp, DL_PROMISCOFF_REQ, DL_BADPRIM, 0);
873 		return;
874 	}
875 
876 	if (ipnet->ipnet_flags & IPNET_LOMODE) {
877 		dlokack(q, mp, DL_PROMISCOFF_REQ);
878 		return;
879 	}
880 
881 	level = ((dl_promiscon_req_t *)mp->b_rptr)->dl_level;
882 	switch (level) {
883 	case DL_PROMISC_PHYS:
884 		if (ipnet->ipnet_flags & IPNET_PROMISC_PHYS)
885 			ipnet->ipnet_flags &= ~IPNET_PROMISC_PHYS;
886 		break;
887 	case DL_PROMISC_SAP:
888 		if (ipnet->ipnet_flags & IPNET_PROMISC_SAP)
889 			ipnet->ipnet_flags &= ~IPNET_PROMISC_SAP;
890 		break;
891 	case DL_PROMISC_MULTI:
892 		if (ipnet->ipnet_flags & IPNET_PROMISC_MULTI)
893 			ipnet->ipnet_flags &= ~IPNET_PROMISC_MULTI;
894 		break;
895 	default:
896 		dlerrorack(q, mp, DL_PROMISCOFF_REQ, DL_BADPRIM, 0);
897 		return;
898 	}
899 
900 	if (orig_ipnet_flags == ipnet->ipnet_flags) {
901 		dlerrorack(q, mp, DL_PROMISCOFF_REQ, DL_NOTENAB, 0);
902 		return;
903 	}
904 
905 	if (level == DL_PROMISC_PHYS || level == DL_PROMISC_MULTI) {
906 		ipnet_leave_allmulti(ipnet->ipnet_if,
907 		    ipnet->ipnet_ns->netstack_ipnet);
908 	}
909 
910 	dlokack(q, mp, DL_PROMISCOFF_REQ);
911 }
912 
913 static int
914 ipnet_join_allmulti(ipnetif_t *ipnetif, ipnet_stack_t *ips)
915 {
916 	int		err = 0;
917 	ip_stack_t	*ipst = ips->ips_netstack->netstack_ip;
918 	uint64_t	index = ipnetif->if_index;
919 
920 	mutex_enter(&ips->ips_event_lock);
921 	if (ipnetif->if_multicnt == 0) {
922 		ASSERT((ipnetif->if_flags &
923 		    (IPNETIF_IPV4ALLMULTI | IPNETIF_IPV6ALLMULTI)) == 0);
924 		if (ipnetif->if_flags & IPNETIF_IPV4PLUMBED) {
925 			err = ip_join_allmulti(index, B_FALSE, ipst);
926 			if (err != 0)
927 				goto done;
928 			ipnetif->if_flags |= IPNETIF_IPV4ALLMULTI;
929 		}
930 		if (ipnetif->if_flags & IPNETIF_IPV6PLUMBED) {
931 			err = ip_join_allmulti(index, B_TRUE, ipst);
932 			if (err != 0 &&
933 			    (ipnetif->if_flags & IPNETIF_IPV4ALLMULTI)) {
934 				(void) ip_leave_allmulti(index, B_FALSE, ipst);
935 				ipnetif->if_flags &= ~IPNETIF_IPV4ALLMULTI;
936 				goto done;
937 			}
938 			ipnetif->if_flags |= IPNETIF_IPV6ALLMULTI;
939 		}
940 	}
941 	ipnetif->if_multicnt++;
942 
943 done:
944 	mutex_exit(&ips->ips_event_lock);
945 	return (err);
946 }
947 
948 static void
949 ipnet_leave_allmulti(ipnetif_t *ipnetif, ipnet_stack_t *ips)
950 {
951 	int		err;
952 	ip_stack_t	*ipst = ips->ips_netstack->netstack_ip;
953 	uint64_t	index = ipnetif->if_index;
954 
955 	mutex_enter(&ips->ips_event_lock);
956 	ASSERT(ipnetif->if_multicnt != 0);
957 	if (--ipnetif->if_multicnt == 0) {
958 		if (ipnetif->if_flags & IPNETIF_IPV4ALLMULTI) {
959 			err = ip_leave_allmulti(index, B_FALSE, ipst);
960 			ASSERT(err == 0 || err == ENODEV);
961 			ipnetif->if_flags &= ~IPNETIF_IPV4ALLMULTI;
962 		}
963 		if (ipnetif->if_flags & IPNETIF_IPV6ALLMULTI) {
964 			err = ip_leave_allmulti(index, B_TRUE, ipst);
965 			ASSERT(err == 0 || err == ENODEV);
966 			ipnetif->if_flags &= ~IPNETIF_IPV6ALLMULTI;
967 		}
968 	}
969 	mutex_exit(&ips->ips_event_lock);
970 }
971 
972 /*
973  * Allocate a new mblk_t and put a dl_ipnetinfo_t in it.
974  * The structure it copies the header information from,
975  * hook_pkt_observe_t, is constructed using network byte
976  * order in ipobs_hook(), so there is no conversion here.
977  */
978 static mblk_t *
979 ipnet_addheader(hook_pkt_observe_t *hdr, mblk_t *mp)
980 {
981 	mblk_t		*dlhdr;
982 	dl_ipnetinfo_t	*dl;
983 
984 	if ((dlhdr = allocb(sizeof (dl_ipnetinfo_t), BPRI_HI)) == NULL) {
985 		freemsg(mp);
986 		return (NULL);
987 	}
988 	dl = (dl_ipnetinfo_t *)dlhdr->b_rptr;
989 	dl->dli_version = DL_IPNETINFO_VERSION;
990 	dl->dli_family = hdr->hpo_family;
991 	dl->dli_htype = hdr->hpo_htype;
992 	dl->dli_pktlen = hdr->hpo_pktlen;
993 	dl->dli_ifindex = hdr->hpo_ifindex;
994 	dl->dli_grifindex = hdr->hpo_grifindex;
995 	dl->dli_zsrc = hdr->hpo_zsrc;
996 	dl->dli_zdst = hdr->hpo_zdst;
997 	dlhdr->b_wptr += sizeof (*dl);
998 	dlhdr->b_cont = mp;
999 
1000 	return (dlhdr);
1001 }
1002 
1003 static ipnet_addrtype_t
1004 ipnet_get_addrtype(ipnet_t *ipnet, ipnet_addrp_t *addr)
1005 {
1006 	list_t			*list;
1007 	ipnetif_t		*ipnetif = ipnet->ipnet_if;
1008 	ipnetif_addr_t		*ifaddr;
1009 	ipnet_addrtype_t	addrtype = IPNETADDR_UNKNOWN;
1010 
1011 	/* First check if the address is multicast or limited broadcast. */
1012 	switch (addr->iap_family) {
1013 	case AF_INET:
1014 		if (CLASSD(*(addr->iap_addr4)) ||
1015 		    *(addr->iap_addr4) == INADDR_BROADCAST)
1016 			return (IPNETADDR_MBCAST);
1017 		break;
1018 	case AF_INET6:
1019 		if (IN6_IS_ADDR_MULTICAST(addr->iap_addr6))
1020 			return (IPNETADDR_MBCAST);
1021 		break;
1022 	}
1023 
1024 	/*
1025 	 * Walk the address list to see if the address belongs to our
1026 	 * interface or is one of our subnet broadcast addresses.
1027 	 */
1028 	mutex_enter(&ipnetif->if_addr_lock);
1029 	list = (addr->iap_family == AF_INET) ?
1030 	    &ipnetif->if_ip4addr_list : &ipnetif->if_ip6addr_list;
1031 	for (ifaddr = list_head(list);
1032 	    ifaddr != NULL && addrtype == IPNETADDR_UNKNOWN;
1033 	    ifaddr = list_next(list, ifaddr)) {
1034 		/*
1035 		 * If we're not in the global zone, then only look at
1036 		 * addresses in our zone.
1037 		 */
1038 		if (ipnet->ipnet_zoneid != GLOBAL_ZONEID &&
1039 		    ipnet->ipnet_zoneid != ifaddr->ifa_zone)
1040 			continue;
1041 		switch (addr->iap_family) {
1042 		case AF_INET:
1043 			if (ifaddr->ifa_ip4addr != INADDR_ANY &&
1044 			    *(addr->iap_addr4) == ifaddr->ifa_ip4addr)
1045 				addrtype = IPNETADDR_MYADDR;
1046 			else if (ifaddr->ifa_brdaddr != INADDR_ANY &&
1047 			    *(addr->iap_addr4) == ifaddr->ifa_brdaddr)
1048 				addrtype = IPNETADDR_MBCAST;
1049 			break;
1050 		case AF_INET6:
1051 			if (IN6_ARE_ADDR_EQUAL(addr->iap_addr6,
1052 			    &ifaddr->ifa_ip6addr))
1053 				addrtype = IPNETADDR_MYADDR;
1054 			break;
1055 		}
1056 	}
1057 	mutex_exit(&ipnetif->if_addr_lock);
1058 
1059 	return (addrtype);
1060 }
1061 
1062 /*
1063  * Verify if the packet contained in hdr should be passed up to the
1064  * ipnet client stream.
1065  */
1066 static boolean_t
1067 ipnet_accept(ipnet_t *ipnet, hook_pkt_observe_t *hdr, ipnet_addrp_t *src,
1068     ipnet_addrp_t *dst)
1069 {
1070 	boolean_t		obsif;
1071 	uint64_t		ifindex = ipnet->ipnet_if->if_index;
1072 	ipnet_addrtype_t	srctype;
1073 	ipnet_addrtype_t	dsttype;
1074 
1075 	srctype = ipnet_get_addrtype(ipnet, src);
1076 	dsttype = ipnet_get_addrtype(ipnet, dst);
1077 
1078 	/*
1079 	 * If the packet's ifindex matches ours, or the packet's group ifindex
1080 	 * matches ours, it's on the interface we're observing.  (Thus,
1081 	 * observing on the group ifindex matches all ifindexes in the group.)
1082 	 */
1083 	obsif = (ntohl(hdr->hpo_ifindex) == ifindex ||
1084 	    ntohl(hdr->hpo_grifindex) == ifindex);
1085 
1086 	DTRACE_PROBE5(ipnet_accept__addr,
1087 	    ipnet_addrtype_t, srctype, ipnet_addrp_t *, src,
1088 	    ipnet_addrtype_t, dsttype, ipnet_addrp_t *, dst,
1089 	    boolean_t, obsif);
1090 
1091 	/*
1092 	 * Do not allow an ipnet stream to see packets that are not from or to
1093 	 * its zone.  The exception is when zones are using the shared stack
1094 	 * model.  In this case, streams in the global zone have visibility
1095 	 * into other shared-stack zones, and broadcast and multicast traffic
1096 	 * is visible by all zones in the stack.
1097 	 */
1098 	if (ipnet->ipnet_zoneid != GLOBAL_ZONEID &&
1099 	    dsttype != IPNETADDR_MBCAST) {
1100 		if (ipnet->ipnet_zoneid != ntohl(hdr->hpo_zsrc) &&
1101 		    ipnet->ipnet_zoneid != ntohl(hdr->hpo_zdst))
1102 			return (B_FALSE);
1103 	}
1104 
1105 	/*
1106 	 * If DL_PROMISC_SAP isn't enabled, then the bound SAP must match the
1107 	 * packet's IP version.
1108 	 */
1109 	if (!(ipnet->ipnet_flags & IPNET_PROMISC_SAP) &&
1110 	    ipnet->ipnet_family != hdr->hpo_family)
1111 		return (B_FALSE);
1112 
1113 	/* If the destination address is ours, then accept the packet. */
1114 	if (dsttype == IPNETADDR_MYADDR)
1115 		return (B_TRUE);
1116 
1117 	/*
1118 	 * If DL_PROMISC_PHYS is enabled, then we can see all packets that are
1119 	 * sent or received on the interface we're observing, or packets that
1120 	 * have our source address (this allows us to see packets we send).
1121 	 */
1122 	if (ipnet->ipnet_flags & IPNET_PROMISC_PHYS) {
1123 		if (srctype == IPNETADDR_MYADDR || obsif)
1124 			return (B_TRUE);
1125 	}
1126 
1127 	/*
1128 	 * We accept multicast and broadcast packets transmitted or received
1129 	 * on the interface we're observing.
1130 	 */
1131 	if (dsttype == IPNETADDR_MBCAST && obsif)
1132 		return (B_TRUE);
1133 
1134 	return (B_FALSE);
1135 }
1136 
1137 /*
1138  * Verify if the packet contained in hdr should be passed up to the ipnet
1139  * client stream that's in IPNET_LOMODE.
1140  */
1141 /* ARGSUSED */
1142 static boolean_t
1143 ipnet_loaccept(ipnet_t *ipnet, hook_pkt_observe_t *hdr, ipnet_addrp_t *src,
1144     ipnet_addrp_t *dst)
1145 {
1146 	if (hdr->hpo_htype != htons(IPOBS_HOOK_LOCAL)) {
1147 		/*
1148 		 * ipnet_if is only NULL for IPNET_MINOR_LO devices.
1149 		 */
1150 		if (ipnet->ipnet_if == NULL)
1151 			return (B_FALSE);
1152 	}
1153 
1154 	/*
1155 	 * An ipnet stream must not see packets that are not from/to its zone.
1156 	 */
1157 	if (ipnet->ipnet_zoneid != GLOBAL_ZONEID) {
1158 		if (ipnet->ipnet_zoneid != ntohl(hdr->hpo_zsrc) &&
1159 		    ipnet->ipnet_zoneid != ntohl(hdr->hpo_zdst))
1160 			return (B_FALSE);
1161 	}
1162 
1163 	return (ipnet->ipnet_family == AF_UNSPEC ||
1164 	    ipnet->ipnet_family == hdr->hpo_family);
1165 }
1166 
/*
 * Taskq callback (scheduled by ipnet_input()) that delivers one observed
 * packet to every open ipnet stream whose accept function approves it.
 *
 * arg is an mblk_t whose data starts with a hook_pkt_observe_t; the packet
 * itself is reached through hdr->hpo_pkt->b_cont.  The message passed in is
 * always freed before returning.
 */
static void
ipnet_dispatch(void *arg)
{
	mblk_t			*mp = arg;
	hook_pkt_observe_t	*hdr = (hook_pkt_observe_t *)mp->b_rptr;
	ipnet_t			*ipnet;
	mblk_t			*netmp;
	list_t			*list;
	ipnet_stack_t		*ips;
	ipnet_addrp_t		src;
	ipnet_addrp_t		dst;

	ips = ((netstack_t *)hdr->hpo_ctx)->netstack_ipnet;

	/*
	 * Point src/dst at the addresses inside the IP header so that the
	 * accept functions can classify them.  Anything that is not AF_INET
	 * is treated as IPv6.
	 */
	netmp = hdr->hpo_pkt->b_cont;
	src.iap_family = hdr->hpo_family;
	dst.iap_family = hdr->hpo_family;

	if (hdr->hpo_family == AF_INET) {
		src.iap_addr4 = &((ipha_t *)(netmp->b_rptr))->ipha_src;
		dst.iap_addr4 = &((ipha_t *)(netmp->b_rptr))->ipha_dst;
	} else {
		src.iap_addr6 = &((ip6_t *)(netmp->b_rptr))->ip6_src;
		dst.iap_addr6 = &((ip6_t *)(netmp->b_rptr))->ip6_dst;
	}

	/* Register as a walker of the stream list for the whole loop. */
	ipnet_walkers_inc(ips);

	list = &ips->ips_str_list;
	for (ipnet = list_head(list); ipnet != NULL;
	    ipnet = list_next(list, ipnet)) {
		if (!(*ipnet->ipnet_acceptfn)(ipnet, hdr, &src, &dst)) {
			IPSK_BUMP(ips, ik_acceptFail);
			continue;
		}
		IPSK_BUMP(ips, ik_acceptOk);

		/*
		 * The last stream on the list takes the original packet
		 * (detached from the header block so it is not freed below);
		 * every other stream gets a dupmsg() of it, falling back to
		 * copymsg() if the dup fails.
		 */
		if (list_next(list, ipnet) == NULL) {
			netmp = hdr->hpo_pkt->b_cont;
			hdr->hpo_pkt->b_cont = NULL;
		} else {
			if ((netmp = dupmsg(hdr->hpo_pkt->b_cont)) == NULL &&
			    (netmp = copymsg(hdr->hpo_pkt->b_cont)) == NULL) {
				IPSK_BUMP(ips, ik_duplicationFail);
				continue;
			}
		}

		/*
		 * Streams with IPNET_INFO set get a dl_ipnetinfo_t header
		 * prepended.  ipnet_addheader() frees netmp on failure.
		 */
		if (ipnet->ipnet_flags & IPNET_INFO) {
			if ((netmp = ipnet_addheader(hdr, netmp)) == NULL) {
				IPSK_BUMP(ips, ik_dispatchHeaderDrop);
				continue;
			}
		}

		/*
		 * Pass the message straight upstream when nothing is queued
		 * on the read queue and the next queue can take it; otherwise
		 * queue it locally if there is room, else drop it.
		 */
		if (ipnet->ipnet_rq->q_first == NULL &&
		    canputnext(ipnet->ipnet_rq)) {
			putnext(ipnet->ipnet_rq, netmp);
			IPSK_BUMP(ips, ik_dispatchDeliver);
		} else if (canput(ipnet->ipnet_rq)) {
			(void) putq(ipnet->ipnet_rq, netmp);
			IPSK_BUMP(ips, ik_dispatchDeliver);
		} else {
			freemsg(netmp);
			IPSK_BUMP(ips, ik_dispatchPutDrop);
		}
	}

	ipnet_walkers_dec(ips);

	/*
	 * Free the message we were handed.  NOTE(review): this presumes
	 * hdr->hpo_pkt is mp itself (as set up by ipobs_bounce_func()), so
	 * that a packet handed to the last stream above is no longer chained
	 * to mp -- confirm for any other caller.
	 */
	freemsg(mp);
}
1239 
1240 static void
1241 ipnet_input(mblk_t *mp)
1242 {
1243 	hook_pkt_observe_t	*hdr = (hook_pkt_observe_t *)mp->b_rptr;
1244 	ipnet_stack_t		*ips;
1245 
1246 	ips = ((netstack_t *)hdr->hpo_ctx)->netstack_ipnet;
1247 
1248 	if (ddi_taskq_dispatch(ipnet_taskq, ipnet_dispatch, mp, DDI_NOSLEEP) !=
1249 	    DDI_SUCCESS) {
1250 		IPSK_BUMP(ips, ik_dispatchFail);
1251 		freemsg(mp);
1252 	} else {
1253 		IPSK_BUMP(ips, ik_dispatchOk);
1254 	}
1255 }
1256 
1257 static ipnetif_t *
1258 ipnet_alloc_if(ipnet_stack_t *ips)
1259 {
1260 	ipnetif_t	*ipnetif;
1261 
1262 	if ((ipnetif = kmem_zalloc(sizeof (*ipnetif), KM_NOSLEEP)) == NULL)
1263 		return (NULL);
1264 
1265 	mutex_init(&ipnetif->if_addr_lock, NULL, MUTEX_DEFAULT, 0);
1266 	list_create(&ipnetif->if_ip4addr_list, sizeof (ipnetif_addr_t),
1267 	    offsetof(ipnetif_addr_t, ifa_link));
1268 	list_create(&ipnetif->if_ip6addr_list, sizeof (ipnetif_addr_t),
1269 	    offsetof(ipnetif_addr_t, ifa_link));
1270 	mutex_init(&ipnetif->if_reflock, NULL, MUTEX_DEFAULT, 0);
1271 
1272 	ipnetif->if_stackp = ips;
1273 
1274 	return (ipnetif);
1275 }
1276 
1277 /*
1278  * Create a new ipnetif_t and new minor node for it.  If creation is
1279  * successful the new ipnetif_t is inserted into an avl_tree
1280  * containing ipnetif's for this stack instance.
1281  */
1282 static ipnetif_t *
1283 ipnetif_create(const char *name, uint64_t index, ipnet_stack_t *ips,
1284     uint64_t ifflags)
1285 {
1286 	ipnetif_t	*ipnetif;
1287 	avl_index_t	where = 0;
1288 	minor_t		ifminor;
1289 
1290 	/*
1291 	 * Because ipnetif_create() can be called from a NIC event
1292 	 * callback, it should not block.
1293 	 */
1294 	ifminor = (minor_t)id_alloc_nosleep(ipnet_minor_space);
1295 	if (ifminor == (minor_t)-1)
1296 		return (NULL);
1297 	if ((ipnetif = ipnet_alloc_if(ips)) == NULL) {
1298 		id_free(ipnet_minor_space, ifminor);
1299 		return (NULL);
1300 	}
1301 
1302 	(void) strlcpy(ipnetif->if_name, name, LIFNAMSIZ);
1303 	ipnetif->if_index = (uint_t)index;
1304 	ipnetif->if_zoneid = netstack_get_zoneid(ips->ips_netstack);
1305 	ipnetif->if_dev = makedevice(ipnet_major, ifminor);
1306 
1307 	ipnetif->if_refcnt = 1;
1308 	if ((ifflags & IFF_LOOPBACK) != 0)
1309 		ipnetif->if_flags = IPNETIF_LOOPBACK;
1310 
1311 	mutex_enter(&ips->ips_avl_lock);
1312 	VERIFY(avl_find(&ips->ips_avl_by_index, &index, &where) == NULL);
1313 	avl_insert(&ips->ips_avl_by_index, ipnetif, where);
1314 	VERIFY(avl_find(&ips->ips_avl_by_name, (void *)name, &where) == NULL);
1315 	avl_insert(&ips->ips_avl_by_name, ipnetif, where);
1316 	mutex_exit(&ips->ips_avl_lock);
1317 
1318 	return (ipnetif);
1319 }
1320 
1321 static void
1322 ipnetif_remove(ipnetif_t *ipnetif, ipnet_stack_t *ips)
1323 {
1324 	ipnet_t	*ipnet;
1325 
1326 	ipnet_walkers_inc(ips);
1327 	/* Send a SIGHUP to all open streams associated with this ipnetif. */
1328 	for (ipnet = list_head(&ips->ips_str_list); ipnet != NULL;
1329 	    ipnet = list_next(&ips->ips_str_list, ipnet)) {
1330 		if (ipnet->ipnet_if == ipnetif)
1331 			(void) putnextctl(ipnet->ipnet_rq, M_HANGUP);
1332 	}
1333 	ipnet_walkers_dec(ips);
1334 	mutex_enter(&ips->ips_avl_lock);
1335 	avl_remove(&ips->ips_avl_by_index, ipnetif);
1336 	avl_remove(&ips->ips_avl_by_name, ipnetif);
1337 	mutex_exit(&ips->ips_avl_lock);
1338 	/*
1339 	 * Release the reference we implicitly held in ipnetif_create().
1340 	 */
1341 	ipnetif_refrele(ipnetif);
1342 }
1343 
1344 static void
1345 ipnet_purge_addrlist(list_t *addrlist)
1346 {
1347 	ipnetif_addr_t	*ifa;
1348 
1349 	while ((ifa = list_head(addrlist)) != NULL) {
1350 		list_remove(addrlist, ifa);
1351 		if (ifa->ifa_shared != NULL)
1352 			ipnetif_clone_release(ifa->ifa_shared);
1353 		kmem_free(ifa, sizeof (*ifa));
1354 	}
1355 }
1356 
1357 static void
1358 ipnetif_free(ipnetif_t *ipnetif)
1359 {
1360 	ASSERT(ipnetif->if_refcnt == 0);
1361 	ASSERT(ipnetif->if_sharecnt == 0);
1362 
1363 	/* Remove IPv4/v6 address lists from the ipnetif */
1364 	ipnet_purge_addrlist(&ipnetif->if_ip4addr_list);
1365 	list_destroy(&ipnetif->if_ip4addr_list);
1366 	ipnet_purge_addrlist(&ipnetif->if_ip6addr_list);
1367 	list_destroy(&ipnetif->if_ip6addr_list);
1368 	mutex_destroy(&ipnetif->if_addr_lock);
1369 	mutex_destroy(&ipnetif->if_reflock);
1370 	if (ipnetif->if_dev != 0)
1371 		id_free(ipnet_minor_space, getminor(ipnetif->if_dev));
1372 	kmem_free(ipnetif, sizeof (*ipnetif));
1373 }
1374 
1375 /*
1376  * Create an ipnetif_addr_t with the given logical interface id (lif)
1377  * and add it to the supplied ipnetif.  The lif is the netinfo
1378  * representation of logical interface id, and we use this id to match
1379  * incoming netinfo events against our lists of addresses.
1380  */
1381 static void
1382 ipnet_add_ifaddr(uint64_t lif, ipnetif_t *ipnetif, net_handle_t nd)
1383 {
1384 	ipnetif_addr_t		*ifaddr;
1385 	zoneid_t		zoneid;
1386 	struct sockaddr_in	bcast;
1387 	struct sockaddr_storage	addr;
1388 	net_ifaddr_t		type = NA_ADDRESS;
1389 	uint64_t		phyif = ipnetif->if_index;
1390 
1391 	if (net_getlifaddr(nd, phyif, lif, 1, &type, &addr) != 0 ||
1392 	    net_getlifzone(nd, phyif, lif, &zoneid) != 0)
1393 		return;
1394 
1395 	if ((ifaddr = kmem_alloc(sizeof (*ifaddr), KM_NOSLEEP)) == NULL)
1396 		return;
1397 	ifaddr->ifa_zone = zoneid;
1398 	ifaddr->ifa_id = lif;
1399 	ifaddr->ifa_shared = NULL;
1400 
1401 	switch (addr.ss_family) {
1402 	case AF_INET:
1403 		ifaddr->ifa_ip4addr =
1404 		    ((struct sockaddr_in *)&addr)->sin_addr.s_addr;
1405 		/*
1406 		 * Try and get the broadcast address.  Note that it's okay for
1407 		 * an interface to not have a broadcast address, so we don't
1408 		 * fail the entire operation if net_getlifaddr() fails here.
1409 		 */
1410 		type = NA_BROADCAST;
1411 		if (net_getlifaddr(nd, phyif, lif, 1, &type, &bcast) == 0)
1412 			ifaddr->ifa_brdaddr = bcast.sin_addr.s_addr;
1413 		break;
1414 	case AF_INET6:
1415 		ifaddr->ifa_ip6addr = ((struct sockaddr_in6 *)&addr)->sin6_addr;
1416 		break;
1417 	}
1418 
1419 	/*
1420 	 * The zoneid stored in ipnetif_t needs to correspond to the actual
1421 	 * zone the address is being used in. This facilitates finding the
1422 	 * correct netstack_t pointer, amongst other things, later.
1423 	 */
1424 	if (zoneid == ALL_ZONES)
1425 		zoneid = GLOBAL_ZONEID;
1426 
1427 	mutex_enter(&ipnetif->if_addr_lock);
1428 	if (zoneid != ipnetif->if_zoneid) {
1429 		ipnetif_t *ifp2;
1430 
1431 		ifp2 = ipnetif_clone_create(ipnetif, zoneid);
1432 		ifaddr->ifa_shared = ifp2;
1433 	}
1434 	list_insert_tail(addr.ss_family == AF_INET ?
1435 	    &ipnetif->if_ip4addr_list : &ipnetif->if_ip6addr_list, ifaddr);
1436 	mutex_exit(&ipnetif->if_addr_lock);
1437 }
1438 
1439 static void
1440 ipnet_delete_ifaddr(ipnetif_addr_t *ifaddr, ipnetif_t *ipnetif, boolean_t isv6)
1441 {
1442 	mutex_enter(&ipnetif->if_addr_lock);
1443 	if (ifaddr->ifa_shared != NULL)
1444 		ipnetif_clone_release(ifaddr->ifa_shared);
1445 
1446 	list_remove(isv6 ?
1447 	    &ipnetif->if_ip6addr_list : &ipnetif->if_ip4addr_list, ifaddr);
1448 	mutex_exit(&ipnetif->if_addr_lock);
1449 	kmem_free(ifaddr, sizeof (*ifaddr));
1450 }
1451 
1452 static void
1453 ipnet_plumb_ev(ipnet_nicevent_t *ipne, ipnet_stack_t *ips, boolean_t isv6)
1454 {
1455 	ipnetif_t	*ipnetif;
1456 	boolean_t	refrele_needed = B_TRUE;
1457 	uint64_t	ifflags;
1458 	uint64_t	ifindex;
1459 	char		*ifname;
1460 
1461 	ifflags = 0;
1462 	ifname = ipne->ipne_ifname;
1463 	ifindex = ipne->ipne_ifindex;
1464 
1465 	(void) net_getlifflags(ipne->ipne_protocol, ifindex, 0, &ifflags);
1466 
1467 	if ((ipnetif = ipnetif_getby_index(ifindex, ips)) == NULL) {
1468 		ipnetif = ipnetif_create(ifname, ifindex, ips, ifflags);
1469 		refrele_needed = B_FALSE;
1470 	}
1471 	if (ipnetif != NULL) {
1472 		ipnetif->if_flags |=
1473 		    isv6 ? IPNETIF_IPV6PLUMBED : IPNETIF_IPV4PLUMBED;
1474 	}
1475 
1476 	if (ipnetif->if_multicnt != 0) {
1477 		if (ip_join_allmulti(ifindex, isv6,
1478 		    ips->ips_netstack->netstack_ip) == 0) {
1479 			ipnetif->if_flags |=
1480 			    isv6 ? IPNETIF_IPV6ALLMULTI : IPNETIF_IPV4ALLMULTI;
1481 		}
1482 	}
1483 
1484 	if (refrele_needed)
1485 		ipnetif_refrele(ipnetif);
1486 }
1487 
1488 static void
1489 ipnet_unplumb_ev(uint64_t ifindex, ipnet_stack_t *ips, boolean_t isv6)
1490 {
1491 	ipnetif_t	*ipnetif;
1492 
1493 	if ((ipnetif = ipnetif_getby_index(ifindex, ips)) == NULL)
1494 		return;
1495 
1496 	mutex_enter(&ipnetif->if_addr_lock);
1497 	ipnet_purge_addrlist(isv6 ?
1498 	    &ipnetif->if_ip6addr_list : &ipnetif->if_ip4addr_list);
1499 	mutex_exit(&ipnetif->if_addr_lock);
1500 
1501 	/*
1502 	 * Note that we have one ipnetif for both IPv4 and IPv6, but we receive
1503 	 * separate NE_UNPLUMB events for IPv4 and IPv6.  We remove the ipnetif
1504 	 * if both IPv4 and IPv6 interfaces have been unplumbed.
1505 	 */
1506 	ipnetif->if_flags &= isv6 ? ~IPNETIF_IPV6PLUMBED : ~IPNETIF_IPV4PLUMBED;
1507 	if (!(ipnetif->if_flags & (IPNETIF_IPV4PLUMBED | IPNETIF_IPV6PLUMBED)))
1508 		ipnetif_remove(ipnetif, ips);
1509 	ipnetif_refrele(ipnetif);
1510 }
1511 
1512 static void
1513 ipnet_lifup_ev(uint64_t ifindex, uint64_t lifindex, net_handle_t nd,
1514     ipnet_stack_t *ips, boolean_t isv6)
1515 {
1516 	ipnetif_t	*ipnetif;
1517 	ipnetif_addr_t	*ifaddr;
1518 
1519 	if ((ipnetif = ipnetif_getby_index(ifindex, ips)) == NULL)
1520 		return;
1521 	if ((ifaddr = ipnet_match_lif(ipnetif, lifindex, isv6)) != NULL) {
1522 		/*
1523 		 * We must have missed a NE_LIF_DOWN event.  Delete this
1524 		 * ifaddr and re-create it.
1525 		 */
1526 		ipnet_delete_ifaddr(ifaddr, ipnetif, isv6);
1527 	}
1528 
1529 	ipnet_add_ifaddr(lifindex, ipnetif, nd);
1530 	ipnetif_refrele(ipnetif);
1531 }
1532 
1533 static void
1534 ipnet_lifdown_ev(uint64_t ifindex, uint64_t lifindex, ipnet_stack_t *ips,
1535     boolean_t isv6)
1536 {
1537 	ipnetif_t	*ipnetif;
1538 	ipnetif_addr_t	*ifaddr;
1539 
1540 	if ((ipnetif = ipnetif_getby_index(ifindex, ips)) == NULL)
1541 		return;
1542 	if ((ifaddr = ipnet_match_lif(ipnetif, lifindex, isv6)) != NULL)
1543 		ipnet_delete_ifaddr(ifaddr, ipnetif, isv6);
1544 	ipnetif_refrele(ipnetif);
1545 	/*
1546 	 * Make sure that open streams on this ipnetif are still allowed to
1547 	 * have it open.
1548 	 */
1549 	ipnetif_zonecheck(ipnetif, ips);
1550 }
1551 
1552 /*
1553  * This callback from the NIC event framework dispatches a taskq as the event
1554  * handlers may block.
1555  */
1556 /* ARGSUSED */
1557 static int
1558 ipnet_nicevent_cb(hook_event_token_t token, hook_data_t info, void *arg)
1559 {
1560 	ipnet_stack_t		*ips = arg;
1561 	hook_nic_event_t	*hn = (hook_nic_event_t *)info;
1562 	ipnet_nicevent_t	*ipne;
1563 
1564 	if ((ipne = kmem_alloc(sizeof (ipnet_nicevent_t), KM_NOSLEEP)) == NULL)
1565 		return (0);
1566 	ipne->ipne_event = hn->hne_event;
1567 	ipne->ipne_protocol = hn->hne_protocol;
1568 	ipne->ipne_stackid = ips->ips_netstack->netstack_stackid;
1569 	ipne->ipne_ifindex = hn->hne_nic;
1570 	ipne->ipne_lifindex = hn->hne_lif;
1571 	if (hn->hne_datalen != 0) {
1572 		(void) strlcpy(ipne->ipne_ifname, hn->hne_data,
1573 		    sizeof (ipne->ipne_ifname));
1574 	}
1575 	(void) ddi_taskq_dispatch(ipnet_nicevent_taskq, ipnet_nicevent_task,
1576 	    ipne, DDI_NOSLEEP);
1577 	return (0);
1578 }
1579 
1580 static void
1581 ipnet_nicevent_task(void *arg)
1582 {
1583 	ipnet_nicevent_t	*ipne = arg;
1584 	netstack_t		*ns;
1585 	ipnet_stack_t		*ips;
1586 	boolean_t		isv6;
1587 
1588 	if ((ns = netstack_find_by_stackid(ipne->ipne_stackid)) == NULL)
1589 		goto done;
1590 	ips = ns->netstack_ipnet;
1591 	isv6 = (ipne->ipne_protocol == ips->ips_ndv6);
1592 
1593 	mutex_enter(&ips->ips_event_lock);
1594 	switch (ipne->ipne_event) {
1595 	case NE_PLUMB:
1596 		ipnet_plumb_ev(ipne, ips, isv6);
1597 		break;
1598 	case NE_UNPLUMB:
1599 		ipnet_unplumb_ev(ipne->ipne_ifindex, ips, isv6);
1600 		break;
1601 	case NE_LIF_UP:
1602 		ipnet_lifup_ev(ipne->ipne_ifindex, ipne->ipne_lifindex,
1603 		    ipne->ipne_protocol, ips, isv6);
1604 		break;
1605 	case NE_LIF_DOWN:
1606 		ipnet_lifdown_ev(ipne->ipne_ifindex, ipne->ipne_lifindex, ips,
1607 		    isv6);
1608 		break;
1609 	default:
1610 		break;
1611 	}
1612 	mutex_exit(&ips->ips_event_lock);
1613 done:
1614 	if (ns != NULL)
1615 		netstack_rele(ns);
1616 	kmem_free(ipne, sizeof (ipnet_nicevent_t));
1617 }
1618 
1619 dev_t
1620 ipnet_if_getdev(char *name, zoneid_t zoneid)
1621 {
1622 	netstack_t	*ns;
1623 	ipnet_stack_t	*ips;
1624 	ipnetif_t	*ipnetif;
1625 	dev_t		dev = (dev_t)-1;
1626 
1627 	if (is_system_labeled() && zoneid != GLOBAL_ZONEID)
1628 		return (dev);
1629 	if ((ns = netstack_find_by_zoneid(zoneid)) == NULL)
1630 		return (dev);
1631 
1632 	ips = ns->netstack_ipnet;
1633 	mutex_enter(&ips->ips_avl_lock);
1634 	if ((ipnetif = avl_find(&ips->ips_avl_by_name, name, NULL)) != NULL) {
1635 		if (ipnetif_in_zone(ipnetif, zoneid, ips))
1636 			dev = ipnetif->if_dev;
1637 	}
1638 	mutex_exit(&ips->ips_avl_lock);
1639 	netstack_rele(ns);
1640 
1641 	return (dev);
1642 }
1643 
1644 static ipnetif_t *
1645 ipnetif_getby_index(uint64_t id, ipnet_stack_t *ips)
1646 {
1647 	ipnetif_t	*ipnetif;
1648 
1649 	mutex_enter(&ips->ips_avl_lock);
1650 	if ((ipnetif = avl_find(&ips->ips_avl_by_index, &id, NULL)) != NULL)
1651 		ipnetif_refhold(ipnetif);
1652 	mutex_exit(&ips->ips_avl_lock);
1653 	return (ipnetif);
1654 }
1655 
1656 static ipnetif_t *
1657 ipnetif_getby_dev(dev_t dev, ipnet_stack_t *ips)
1658 {
1659 	ipnetif_t	*ipnetif;
1660 	avl_tree_t	*tree;
1661 
1662 	mutex_enter(&ips->ips_avl_lock);
1663 	tree = &ips->ips_avl_by_index;
1664 	for (ipnetif = avl_first(tree); ipnetif != NULL;
1665 	    ipnetif = avl_walk(tree, ipnetif, AVL_AFTER)) {
1666 		if (ipnetif->if_dev == dev) {
1667 			ipnetif_refhold(ipnetif);
1668 			break;
1669 		}
1670 	}
1671 	mutex_exit(&ips->ips_avl_lock);
1672 	return (ipnetif);
1673 }
1674 
1675 static ipnetif_addr_t *
1676 ipnet_match_lif(ipnetif_t *ipnetif, lif_if_t lid, boolean_t isv6)
1677 {
1678 	ipnetif_addr_t	*ifaddr;
1679 	list_t	*list;
1680 
1681 	mutex_enter(&ipnetif->if_addr_lock);
1682 	list = isv6 ? &ipnetif->if_ip6addr_list : &ipnetif->if_ip4addr_list;
1683 	for (ifaddr = list_head(list); ifaddr != NULL;
1684 	    ifaddr = list_next(list, ifaddr)) {
1685 		if (lid == ifaddr->ifa_id)
1686 			break;
1687 	}
1688 	mutex_exit(&ipnetif->if_addr_lock);
1689 	return (ifaddr);
1690 }
1691 
1692 /* ARGSUSED */
1693 static void *
1694 ipnet_stack_init(netstackid_t stackid, netstack_t *ns)
1695 {
1696 	ipnet_stack_t	*ips;
1697 
1698 	ips = kmem_zalloc(sizeof (*ips), KM_SLEEP);
1699 	ips->ips_netstack = ns;
1700 	mutex_init(&ips->ips_avl_lock, NULL, MUTEX_DEFAULT, 0);
1701 	avl_create(&ips->ips_avl_by_index, ipnetif_compare_index,
1702 	    sizeof (ipnetif_t), offsetof(ipnetif_t, if_avl_by_index));
1703 	avl_create(&ips->ips_avl_by_name, ipnetif_compare_name,
1704 	    sizeof (ipnetif_t), offsetof(ipnetif_t, if_avl_by_name));
1705 	avl_create(&ips->ips_avl_by_shared, ipnetif_compare_name_zone,
1706 	    sizeof (ipnetif_t), offsetof(ipnetif_t, if_avl_by_shared));
1707 	mutex_init(&ips->ips_walkers_lock, NULL, MUTEX_DEFAULT, NULL);
1708 	cv_init(&ips->ips_walkers_cv, NULL, CV_DRIVER, NULL);
1709 	list_create(&ips->ips_str_list, sizeof (ipnet_t),
1710 	    offsetof(ipnet_t, ipnet_next));
1711 	ipnet_register_netihook(ips);
1712 	return (ips);
1713 }
1714 
1715 /* ARGSUSED */
1716 static void
1717 ipnet_stack_fini(netstackid_t stackid, void *arg)
1718 {
1719 	ipnet_stack_t	*ips = arg;
1720 	ipnetif_t	*ipnetif, *nipnetif;
1721 
1722 	if (ips->ips_kstatp != NULL) {
1723 		zoneid_t zoneid;
1724 
1725 		zoneid = netstackid_to_zoneid(stackid);
1726 		net_kstat_delete(net_zoneidtonetid(zoneid), ips->ips_kstatp);
1727 	}
1728 	if (ips->ips_ndv4 != NULL) {
1729 		VERIFY(net_hook_unregister(ips->ips_ndv4, NH_NIC_EVENTS,
1730 		    ips->ips_nicevents) == 0);
1731 		VERIFY(net_protocol_release(ips->ips_ndv4) == 0);
1732 	}
1733 	if (ips->ips_ndv6 != NULL) {
1734 		VERIFY(net_hook_unregister(ips->ips_ndv6, NH_NIC_EVENTS,
1735 		    ips->ips_nicevents) == 0);
1736 		VERIFY(net_protocol_release(ips->ips_ndv6) == 0);
1737 	}
1738 	hook_free(ips->ips_nicevents);
1739 
1740 	for (ipnetif = avl_first(&ips->ips_avl_by_index); ipnetif != NULL;
1741 	    ipnetif = nipnetif) {
1742 		nipnetif = AVL_NEXT(&ips->ips_avl_by_index, ipnetif);
1743 		ipnetif_remove(ipnetif, ips);
1744 	}
1745 	avl_destroy(&ips->ips_avl_by_shared);
1746 	avl_destroy(&ips->ips_avl_by_index);
1747 	avl_destroy(&ips->ips_avl_by_name);
1748 	mutex_destroy(&ips->ips_avl_lock);
1749 	mutex_destroy(&ips->ips_walkers_lock);
1750 	cv_destroy(&ips->ips_walkers_cv);
1751 	list_destroy(&ips->ips_str_list);
1752 	kmem_free(ips, sizeof (*ips));
1753 }
1754 
1755 /* Do any of the addresses in addrlist belong the supplied zoneid? */
1756 static boolean_t
1757 ipnet_addrs_in_zone(list_t *addrlist, zoneid_t zoneid)
1758 {
1759 	ipnetif_addr_t	*ifa;
1760 
1761 	for (ifa = list_head(addrlist); ifa != NULL;
1762 	    ifa = list_next(addrlist, ifa)) {
1763 		if (ifa->ifa_zone == zoneid)
1764 			return (B_TRUE);
1765 	}
1766 	return (B_FALSE);
1767 }
1768 
1769 /* Should the supplied ipnetif be visible from the supplied zoneid? */
1770 static boolean_t
1771 ipnetif_in_zone(ipnetif_t *ipnetif, zoneid_t zoneid, ipnet_stack_t *ips)
1772 {
1773 	int	ret;
1774 
1775 	/*
1776 	 * The global zone has visibility into all interfaces in the global
1777 	 * stack, and exclusive stack zones have visibility into all
1778 	 * interfaces in their stack.
1779 	 */
1780 	if (zoneid == GLOBAL_ZONEID ||
1781 	    ips->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID)
1782 		return (B_TRUE);
1783 
1784 	/*
1785 	 * Shared-stack zones only have visibility for interfaces that have
1786 	 * addresses in their zone.
1787 	 */
1788 	mutex_enter(&ipnetif->if_addr_lock);
1789 	ret = ipnet_addrs_in_zone(&ipnetif->if_ip4addr_list, zoneid) ||
1790 	    ipnet_addrs_in_zone(&ipnetif->if_ip6addr_list, zoneid);
1791 	mutex_exit(&ipnetif->if_addr_lock);
1792 	return (ret);
1793 }
1794 
1795 /*
1796  * Verify that any ipnet_t that has a reference to the supplied ipnetif should
1797  * still be allowed to have it open.  A given ipnet_t may no longer be allowed
1798  * to have an ipnetif open if there are no longer any addresses that belong to
1799  * the ipnetif in the ipnet_t's non-global shared-stack zoneid.  If that's the
1800  * case, send the ipnet_t an M_HANGUP.
1801  */
1802 static void
1803 ipnetif_zonecheck(ipnetif_t *ipnetif, ipnet_stack_t *ips)
1804 {
1805 	list_t	*strlist = &ips->ips_str_list;
1806 	ipnet_t	*ipnet;
1807 
1808 	ipnet_walkers_inc(ips);
1809 	for (ipnet = list_head(strlist); ipnet != NULL;
1810 	    ipnet = list_next(strlist, ipnet)) {
1811 		if (ipnet->ipnet_if != ipnetif)
1812 			continue;
1813 		if (!ipnetif_in_zone(ipnetif, ipnet->ipnet_zoneid, ips))
1814 			(void) putnextctl(ipnet->ipnet_rq, M_HANGUP);
1815 	}
1816 	ipnet_walkers_dec(ips);
1817 }
1818 
1819 void
1820 ipnet_walk_if(ipnet_walkfunc_t *cb, void *arg, zoneid_t zoneid)
1821 {
1822 	ipnetif_t		*ipnetif;
1823 	list_t			cbdata;
1824 	ipnetif_cbdata_t	*cbnode;
1825 	netstack_t		*ns;
1826 	ipnet_stack_t		*ips;
1827 
1828 	/*
1829 	 * On labeled systems, non-global zones shouldn't see anything
1830 	 * in /dev/ipnet.
1831 	 */
1832 	if (is_system_labeled() && zoneid != GLOBAL_ZONEID)
1833 		return;
1834 
1835 	if ((ns = netstack_find_by_zoneid(zoneid)) == NULL)
1836 		return;
1837 
1838 	ips = ns->netstack_ipnet;
1839 	list_create(&cbdata, sizeof (ipnetif_cbdata_t),
1840 	    offsetof(ipnetif_cbdata_t, ic_next));
1841 
1842 	mutex_enter(&ips->ips_avl_lock);
1843 	for (ipnetif = avl_first(&ips->ips_avl_by_index); ipnetif != NULL;
1844 	    ipnetif = avl_walk(&ips->ips_avl_by_index, ipnetif, AVL_AFTER)) {
1845 		if (!ipnetif_in_zone(ipnetif, zoneid, ips))
1846 			continue;
1847 		cbnode = kmem_zalloc(sizeof (ipnetif_cbdata_t), KM_SLEEP);
1848 		(void) strlcpy(cbnode->ic_ifname, ipnetif->if_name, LIFNAMSIZ);
1849 		cbnode->ic_dev = ipnetif->if_dev;
1850 		list_insert_head(&cbdata, cbnode);
1851 	}
1852 	mutex_exit(&ips->ips_avl_lock);
1853 
1854 	while ((cbnode = list_head(&cbdata)) != NULL) {
1855 		cb(cbnode->ic_ifname, arg, cbnode->ic_dev);
1856 		list_remove(&cbdata, cbnode);
1857 		kmem_free(cbnode, sizeof (ipnetif_cbdata_t));
1858 	}
1859 	list_destroy(&cbdata);
1860 	netstack_rele(ns);
1861 }
1862 
1863 static int
1864 ipnetif_compare_index(const void *index_ptr, const void *ipnetifp)
1865 {
1866 	int64_t	index1 = *((int64_t *)index_ptr);
1867 	int64_t	index2 = (int64_t)((ipnetif_t *)ipnetifp)->if_index;
1868 
1869 	return (SIGNOF(index2 - index1));
1870 }
1871 
1872 static int
1873 ipnetif_compare_name(const void *name_ptr, const void *ipnetifp)
1874 {
1875 	int	res;
1876 
1877 	res = strcmp(((ipnetif_t *)ipnetifp)->if_name, name_ptr);
1878 	return (SIGNOF(res));
1879 }
1880 
1881 static int
1882 ipnetif_compare_name_zone(const void *key_ptr, const void *ipnetifp)
1883 {
1884 	const uintptr_t	*ptr = key_ptr;
1885 	const ipnetif_t	*ifp;
1886 	int		res;
1887 
1888 	ifp = ipnetifp;
1889 	res = ifp->if_zoneid - ptr[0];
1890 	if (res != 0)
1891 		return (SIGNOF(res));
1892 	res = strcmp(ifp->if_name, (char *)ptr[1]);
1893 	return (SIGNOF(res));
1894 }
1895 
1896 static void
1897 ipnetif_refhold(ipnetif_t *ipnetif)
1898 {
1899 	mutex_enter(&ipnetif->if_reflock);
1900 	ipnetif->if_refcnt++;
1901 	mutex_exit(&ipnetif->if_reflock);
1902 }
1903 
1904 static void
1905 ipnetif_refrele(ipnetif_t *ipnetif)
1906 {
1907 	mutex_enter(&ipnetif->if_reflock);
1908 	ASSERT(ipnetif->if_refcnt > 0);
1909 	if (--ipnetif->if_refcnt == 0)
1910 		ipnetif_free(ipnetif);
1911 	else
1912 		mutex_exit(&ipnetif->if_reflock);
1913 }
1914 
1915 static void
1916 ipnet_walkers_inc(ipnet_stack_t *ips)
1917 {
1918 	mutex_enter(&ips->ips_walkers_lock);
1919 	ips->ips_walkers_cnt++;
1920 	mutex_exit(&ips->ips_walkers_lock);
1921 }
1922 
1923 static void
1924 ipnet_walkers_dec(ipnet_stack_t *ips)
1925 {
1926 	mutex_enter(&ips->ips_walkers_lock);
1927 	ASSERT(ips->ips_walkers_cnt != 0);
1928 	if (--ips->ips_walkers_cnt == 0)
1929 		cv_broadcast(&ips->ips_walkers_cv);
1930 	mutex_exit(&ips->ips_walkers_lock);
1931 }
1932 
1933 /*ARGSUSED*/
1934 static int
1935 ipobs_bounce_func(hook_event_token_t token, hook_data_t info, void *arg)
1936 {
1937 	hook_pkt_observe_t	*hdr;
1938 	pfv_t			func = (pfv_t)arg;
1939 	mblk_t			*mp;
1940 
1941 	hdr = (hook_pkt_observe_t *)info;
1942 	/*
1943 	 * Code in ip_input() expects that it is the only one accessing the
1944 	 * packet.
1945 	 */
1946 	mp = copymsg(hdr->hpo_pkt);
1947 	if (mp == NULL)  {
1948 		netstack_t *ns = hdr->hpo_ctx;
1949 		ipnet_stack_t *ips = ns->netstack_ipnet;
1950 
1951 		IPSK_BUMP(ips, ik_dispatchDupDrop);
1952 		return (0);
1953 	}
1954 
1955 	hdr = (hook_pkt_observe_t *)mp->b_rptr;
1956 	hdr->hpo_pkt = mp;
1957 
1958 	func(mp);
1959 
1960 	return (0);
1961 }
1962 
1963 hook_t *
1964 ipobs_register_hook(netstack_t *ns, pfv_t func)
1965 {
1966 	ip_stack_t	*ipst = ns->netstack_ip;
1967 	char		name[32];
1968 	hook_t		*hook;
1969 
1970 	HOOK_INIT(hook, ipobs_bounce_func, "", (void *)func);
1971 	VERIFY(hook != NULL);
1972 
1973 	/*
1974 	 * To register multiple hooks with he same callback function,
1975 	 * a unique name is needed.
1976 	 */
1977 	(void) snprintf(name, sizeof (name), "ipobserve_%p", (void *)hook);
1978 	hook->h_name = strdup(name);
1979 
1980 	(void) net_hook_register(ipst->ips_ip4_observe_pr, NH_OBSERVE, hook);
1981 	(void) net_hook_register(ipst->ips_ip6_observe_pr, NH_OBSERVE, hook);
1982 
1983 	return (hook);
1984 }
1985 
1986 void
1987 ipobs_unregister_hook(netstack_t *ns, hook_t *hook)
1988 {
1989 	ip_stack_t	*ipst = ns->netstack_ip;
1990 
1991 	(void) net_hook_unregister(ipst->ips_ip4_observe_pr, NH_OBSERVE, hook);
1992 
1993 	(void) net_hook_unregister(ipst->ips_ip6_observe_pr, NH_OBSERVE, hook);
1994 
1995 	strfree(hook->h_name);
1996 
1997 	hook_free(hook);
1998 }
1999 
2000 /* ******************************************************************** */
2001 /* BPF Functions below							*/
2002 /* ******************************************************************** */
2003 
2004 /*
2005  * Convenience function to make mapping a zoneid to an ipnet_stack_t easy.
2006  */
2007 ipnet_stack_t *
2008 ipnet_find_by_zoneid(zoneid_t zoneid)
2009 {
2010 	netstack_t	*ns;
2011 
2012 	VERIFY((ns = netstack_find_by_zoneid(zoneid)) != NULL);
2013 	return (ns->netstack_ipnet);
2014 }
2015 
2016 /*
2017  * Functions, such as the above ipnet_find_by_zoneid(), will return a
2018  * pointer to ipnet_stack_t by calling a netstack lookup function.
2019  * The netstack_find_*() functions return a pointer after doing a "hold"
2020  * on the data structure and thereby require a "release" when the caller
2021  * is finished with it. We need to mirror that API here and thus a caller
2022  * of ipnet_find_by_zoneid() is required to call ipnet_rele().
2023  */
2024 void
2025 ipnet_rele(ipnet_stack_t *ips)
2026 {
2027 	netstack_rele(ips->ips_netstack);
2028 }
2029 
2030 /*
2031  */
2032 void
2033 ipnet_set_itap(bpf_itap_fn_t tapfunc)
2034 {
2035 	ipnet_itap = tapfunc;
2036 }
2037 
2038 /*
2039  * The list of interfaces available via ipnet is private for each zone,
2040  * so the AVL tree of each zone must be searched for a given name, even
2041  * if all names are unique.
2042  */
2043 int
2044 ipnet_open_byname(const char *name, ipnetif_t **ptr, zoneid_t zoneid)
2045 {
2046 	ipnet_stack_t	*ips;
2047 	ipnetif_t	*ipnetif;
2048 
2049 	ASSERT(ptr != NULL);
2050 	VERIFY((ips = ipnet_find_by_zoneid(zoneid)) != NULL);
2051 
2052 	mutex_enter(&ips->ips_avl_lock);
2053 
2054 	/*
2055 	 * Shared instance zone?
2056 	 */
2057 	if (netstackid_to_zoneid(zoneid_to_netstackid(zoneid)) != zoneid) {
2058 		uintptr_t key[2] = { zoneid, (uintptr_t)name };
2059 
2060 		ipnetif = avl_find(&ips->ips_avl_by_shared, (void *)key, NULL);
2061 	} else {
2062 		ipnetif = avl_find(&ips->ips_avl_by_name, (void *)name, NULL);
2063 	}
2064 	if (ipnetif != NULL)
2065 		ipnetif_refhold(ipnetif);
2066 	mutex_exit(&ips->ips_avl_lock);
2067 
2068 	*ptr = ipnetif;
2069 	ipnet_rele(ips);
2070 
2071 	if (ipnetif == NULL)
2072 		return (ESRCH);
2073 	return (0);
2074 }
2075 
2076 void
2077 ipnet_close_byhandle(ipnetif_t *ifp)
2078 {
2079 	ASSERT(ifp != NULL);
2080 	ipnetif_refrele(ifp);
2081 }
2082 
2083 const char *
2084 ipnet_name(ipnetif_t *ifp)
2085 {
2086 	ASSERT(ifp != NULL);
2087 	return (ifp->if_name);
2088 }
2089 
2090 /*
2091  * To find the linkid for a given name, it is necessary to know which zone
2092  * the interface name belongs to and to search the avl tree for that zone
2093  * as there is no master list of all interfaces and which zone they belong
2094  * to. It is assumed that the caller of this function is somehow already
2095  * working with the ipnet interfaces and hence the ips_event_lock is held.
2096  * When BPF calls into this function, it is doing so because of an event
2097  * in ipnet, and thus ipnet holds the ips_event_lock. Thus the datalink id
2098  * value returned has meaning without the need for grabbing a hold on the
2099  * owning structure.
2100  */
2101 int
2102 ipnet_get_linkid_byname(const char *name, uint_t *idp, zoneid_t zoneid)
2103 {
2104 	ipnet_stack_t	*ips;
2105 	ipnetif_t	*ifp;
2106 
2107 	VERIFY((ips = ipnet_find_by_zoneid(zoneid)) != NULL);
2108 	ASSERT(mutex_owned(&ips->ips_event_lock));
2109 
2110 	mutex_enter(&ips->ips_avl_lock);
2111 	ifp = avl_find(&ips->ips_avl_by_name, (void *)name, NULL);
2112 	if (ifp != NULL)
2113 		*idp = (uint_t)ifp->if_index;
2114 
2115 	/*
2116 	 * Shared instance zone?
2117 	 */
2118 	if (netstackid_to_zoneid(zoneid_to_netstackid(zoneid)) != zoneid) {
2119 		uintptr_t key[2] = { zoneid, (uintptr_t)name };
2120 
2121 		ifp = avl_find(&ips->ips_avl_by_shared, (void *)key, NULL);
2122 		if (ifp != NULL)
2123 			*idp = (uint_t)ifp->if_index;
2124 	}
2125 
2126 	mutex_exit(&ips->ips_avl_lock);
2127 	ipnet_rele(ips);
2128 
2129 	if (ifp == NULL)
2130 		return (ESRCH);
2131 	return (0);
2132 }
2133 
2134 /*
2135  * Strictly speaking, there is no such thing as a "client" in ipnet, like
2136  * there is in mac. BPF only needs to have this because it is required as
2137  * part of interfacing correctly with mac. The reuse of the original
2138  * ipnetif_t as a client poses no danger, so long as it is done with its
2139  * own ref-count'd hold that is given up on close.
2140  */
2141 int
2142 ipnet_client_open(ipnetif_t *ptr, ipnetif_t **result)
2143 {
2144 	ASSERT(ptr != NULL);
2145 	ASSERT(result != NULL);
2146 	ipnetif_refhold(ptr);
2147 	*result = ptr;
2148 
2149 	return (0);
2150 }
2151 
2152 void
2153 ipnet_client_close(ipnetif_t *ptr)
2154 {
2155 	ASSERT(ptr != NULL);
2156 	ipnetif_refrele(ptr);
2157 }
2158 
2159 /*
2160  * This is called from BPF when it needs to start receiving packets
2161  * from ipnet.
2162  *
2163  * The use of the ipnet_t structure here is somewhat lightweight when
2164  * compared to how it is used elsewhere but it already has all of the
2165  * right fields in it, so reuse here doesn't seem out of order. Its
2166  * primary purpose here is to provide the means to store pointers for
2167  * use when ipnet_promisc_remove() needs to be called.
2168  *
2169  * This should never be called for the IPNET_MINOR_LO device as it is
2170  * never created via ipnetif_create.
2171  */
2172 /*ARGSUSED*/
2173 int
2174 ipnet_promisc_add(void *handle, uint_t how, void *data, uintptr_t *mhandle,
2175     int flags)
2176 {
2177 	ip_stack_t	*ipst;
2178 	netstack_t	*ns;
2179 	ipnetif_t	*ifp;
2180 	ipnet_t		*ipnet;
2181 	char		name[32];
2182 	int		error;
2183 
2184 	ifp = (ipnetif_t *)handle;
2185 	ns = netstack_find_by_zoneid(ifp->if_zoneid);
2186 
2187 	if ((how == DL_PROMISC_PHYS) || (how == DL_PROMISC_MULTI)) {
2188 		error = ipnet_join_allmulti(ifp, ns->netstack_ipnet);
2189 		if (error != 0)
2190 			return (error);
2191 	} else {
2192 		return (EINVAL);
2193 	}
2194 
2195 	ipnet = kmem_zalloc(sizeof (*ipnet), KM_SLEEP);
2196 	ipnet->ipnet_if = ifp;
2197 	ipnet->ipnet_ns = ns;
2198 	ipnet->ipnet_flags = flags;
2199 
2200 	if ((ifp->if_flags & IPNETIF_LOOPBACK) != 0) {
2201 		ipnet->ipnet_acceptfn = ipnet_loaccept;
2202 	} else {
2203 		ipnet->ipnet_acceptfn = ipnet_accept;
2204 	}
2205 
2206 	/*
2207 	 * To register multiple hooks with the same callback function,
2208 	 * a unique name is needed.
2209 	 */
2210 	HOOK_INIT(ipnet->ipnet_hook, ipnet_bpf_bounce, "", ipnet);
2211 	(void) snprintf(name, sizeof (name), "ipnet_promisc_%p",
2212 	    (void *)ipnet->ipnet_hook);
2213 	ipnet->ipnet_hook->h_name = strdup(name);
2214 	ipnet->ipnet_data = data;
2215 	ipnet->ipnet_zoneid = ifp->if_zoneid;
2216 
2217 	ipst = ns->netstack_ip;
2218 
2219 	error = net_hook_register(ipst->ips_ip4_observe_pr, NH_OBSERVE,
2220 	    ipnet->ipnet_hook);
2221 	if (error != 0)
2222 		goto regfail;
2223 
2224 	error = net_hook_register(ipst->ips_ip6_observe_pr, NH_OBSERVE,
2225 	    ipnet->ipnet_hook);
2226 	if (error != 0) {
2227 		(void) net_hook_unregister(ipst->ips_ip4_observe_pr,
2228 		    NH_OBSERVE, ipnet->ipnet_hook);
2229 		goto regfail;
2230 	}
2231 
2232 	*mhandle = (uintptr_t)ipnet;
2233 
2234 	return (0);
2235 
2236 regfail:
2237 	cmn_err(CE_WARN, "net_hook_register failed: %d", error);
2238 	strfree(ipnet->ipnet_hook->h_name);
2239 	hook_free(ipnet->ipnet_hook);
2240 	ipnet_leave_allmulti(ifp, ns->netstack_ipnet);
2241 	netstack_rele(ns);
2242 	return (error);
2243 }
2244 
2245 void
2246 ipnet_promisc_remove(void *data)
2247 {
2248 	ip_stack_t	*ipst;
2249 	ipnet_t		*ipnet;
2250 	hook_t		*hook;
2251 
2252 	ipnet = data;
2253 	ipst = ipnet->ipnet_ns->netstack_ip;
2254 	hook = ipnet->ipnet_hook;
2255 
2256 	VERIFY(net_hook_unregister(ipst->ips_ip4_observe_pr, NH_OBSERVE,
2257 	    hook) == 0);
2258 
2259 	VERIFY(net_hook_unregister(ipst->ips_ip6_observe_pr, NH_OBSERVE,
2260 	    hook) == 0);
2261 
2262 	strfree(hook->h_name);
2263 
2264 	hook_free(hook);
2265 
2266 	ipnet_leave_allmulti(ipnet->ipnet_if, ipnet->ipnet_ns->netstack_ipnet);
2267 
2268 	netstack_rele(ipnet->ipnet_ns);
2269 
2270 	kmem_free(ipnet, sizeof (*ipnet));
2271 }
2272 
2273 /*
2274  * arg here comes from the ipnet_t allocated in ipnet_promisc_add.
2275  * An important field from that structure is "ipnet_data" that
2276  * contains the "data" pointer passed into ipnet_promisc_add: it needs
2277  * to be passed back to bpf when we call into ipnet_itap.
2278  *
2279  * ipnet_itap is set by ipnet_set_bpfattach, which in turn is called
2280  * from BPF.
2281  */
2282 /*ARGSUSED*/
2283 static int
2284 ipnet_bpf_bounce(hook_event_token_t token, hook_data_t info, void *arg)
2285 {
2286 	hook_pkt_observe_t	*hdr;
2287 	ipnet_addrp_t		src;
2288 	ipnet_addrp_t		dst;
2289 	ipnet_stack_t		*ips;
2290 	ipnet_t			*ipnet;
2291 	mblk_t			*netmp;
2292 	mblk_t			*mp;
2293 
2294 	hdr = (hook_pkt_observe_t *)info;
2295 	mp = hdr->hpo_pkt;
2296 	ipnet = (ipnet_t *)arg;
2297 	ips = ((netstack_t *)hdr->hpo_ctx)->netstack_ipnet;
2298 
2299 	netmp = hdr->hpo_pkt->b_cont;
2300 	src.iap_family = hdr->hpo_family;
2301 	dst.iap_family = hdr->hpo_family;
2302 
2303 	if (hdr->hpo_family == AF_INET) {
2304 		src.iap_addr4 = &((ipha_t *)(netmp->b_rptr))->ipha_src;
2305 		dst.iap_addr4 = &((ipha_t *)(netmp->b_rptr))->ipha_dst;
2306 	} else {
2307 		src.iap_addr6 = &((ip6_t *)(netmp->b_rptr))->ip6_src;
2308 		dst.iap_addr6 = &((ip6_t *)(netmp->b_rptr))->ip6_dst;
2309 	}
2310 
2311 	if (!(*ipnet->ipnet_acceptfn)(ipnet, hdr, &src, &dst)) {
2312 		IPSK_BUMP(ips, ik_acceptFail);
2313 		return (0);
2314 	}
2315 	IPSK_BUMP(ips, ik_acceptOk);
2316 
2317 	ipnet_itap(ipnet->ipnet_data, mp,
2318 	    hdr->hpo_htype == htons(IPOBS_HOOK_OUTBOUND),
2319 	    ntohl(hdr->hpo_pktlen) + MBLKL(mp));
2320 
2321 	return (0);
2322 }
2323 
2324 /*
2325  * clone'd ipnetif_t's are created when a shared IP instance zone comes
2326  * to life and configures an IP address. The model that BPF uses is that
2327  * each interface must have a unique pointer and each interface must be
2328  * representative of what it can capture. They are limited to one DLT
2329  * per interface and one zone per interface. Thus every interface that
2330  * can be seen in a zone must be announced via an attach to bpf. For
2331  * shared instance zones, this means the ipnet driver needs to detect
2332  * when an address is added to an interface in a zone for the first
2333  * time (and also when the last address is removed.)
2334  */
2335 static ipnetif_t *
2336 ipnetif_clone_create(ipnetif_t *ifp, zoneid_t zoneid)
2337 {
2338 	uintptr_t	key[2] = { zoneid, (uintptr_t)ifp->if_name };
2339 	ipnet_stack_t	*ips = ifp->if_stackp;
2340 	avl_index_t	where = 0;
2341 	ipnetif_t	*newif;
2342 
2343 	mutex_enter(&ips->ips_avl_lock);
2344 	newif = avl_find(&ips->ips_avl_by_shared, (void *)key, &where);
2345 	if (newif != NULL) {
2346 		ipnetif_refhold(newif);
2347 		newif->if_sharecnt++;
2348 		mutex_exit(&ips->ips_avl_lock);
2349 		return (newif);
2350 	}
2351 
2352 	newif = ipnet_alloc_if(ips);
2353 	if (newif == NULL) {
2354 		mutex_exit(&ips->ips_avl_lock);
2355 		return (NULL);
2356 	}
2357 
2358 	newif->if_refcnt = 1;
2359 	newif->if_sharecnt = 1;
2360 	newif->if_zoneid = zoneid;
2361 	(void) strlcpy(newif->if_name, ifp->if_name, LIFNAMSIZ);
2362 	newif->if_flags = ifp->if_flags & IPNETIF_LOOPBACK;
2363 	newif->if_index = ifp->if_index;
2364 
2365 	avl_insert(&ips->ips_avl_by_shared, newif, where);
2366 	mutex_exit(&ips->ips_avl_lock);
2367 
2368 	return (newif);
2369 }
2370 
2371 static void
2372 ipnetif_clone_release(ipnetif_t *ipnetif)
2373 {
2374 	boolean_t	dofree = B_FALSE;
2375 	boolean_t	doremove = B_FALSE;
2376 	ipnet_stack_t	*ips = ipnetif->if_stackp;
2377 
2378 	mutex_enter(&ipnetif->if_reflock);
2379 	ASSERT(ipnetif->if_refcnt > 0);
2380 	if (--ipnetif->if_refcnt == 0)
2381 		dofree = B_TRUE;
2382 	ASSERT(ipnetif->if_sharecnt > 0);
2383 	if (--ipnetif->if_sharecnt == 0)
2384 		doremove = B_TRUE;
2385 	mutex_exit(&ipnetif->if_reflock);
2386 	if (doremove) {
2387 		mutex_enter(&ips->ips_avl_lock);
2388 		avl_remove(&ips->ips_avl_by_shared, ipnetif);
2389 		mutex_exit(&ips->ips_avl_lock);
2390 	}
2391 	if (dofree) {
2392 		ASSERT(ipnetif->if_sharecnt == 0);
2393 		ipnetif_free(ipnetif);
2394 	}
2395 }
2396