xref: /titanic_50/usr/src/uts/common/inet/ipnet/ipnet.c (revision d5d7cf4e084ada61ab475b433429da88487a6725)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * The ipnet device defined here provides access to packets at the IP layer. To
29  * provide access to packets at this layer it registers a callback function in
30  * the ip module and when there are open instances of the device ip will pass
31  * packets into the device. Packets from ip are passed on the input, output and
32  * loopback paths. Internally the module returns to ip as soon as possible by
33  * deferring processing using a taskq.
34  *
 * Management of the devices in /dev/ipnet/ is handled by the devname
 * filesystem and use of the neti interfaces.  This module registers for NIC
 * events using the neti framework so that when IP interfaces are brought up,
 * taken down etc. the ipnet module is notified and its view of the interfaces
 * configured on the system adjusted.  On attach, the module gets an initial
 * view of the system again using the neti framework but as it has already
 * registered for IP interface events, it is still up-to-date with any changes.
42  */
43 
44 #include <sys/types.h>
45 #include <sys/conf.h>
46 #include <sys/cred.h>
47 #include <sys/stat.h>
48 #include <sys/ddi.h>
49 #include <sys/sunddi.h>
50 #include <sys/modctl.h>
51 #include <sys/dlpi.h>
52 #include <sys/strsun.h>
53 #include <sys/id_space.h>
54 #include <sys/kmem.h>
55 #include <sys/mkdev.h>
56 #include <sys/neti.h>
57 #include <net/if.h>
58 #include <sys/errno.h>
59 #include <sys/list.h>
60 #include <sys/ksynch.h>
61 #include <sys/hook_event.h>
62 #include <sys/sdt.h>
63 #include <sys/stropts.h>
64 #include <sys/sysmacros.h>
65 #include <inet/ip.h>
66 #include <inet/ip_if.h>
67 #include <inet/ip_multi.h>
68 #include <inet/ip6.h>
69 #include <inet/ipnet.h>
70 #include <net/bpf.h>
71 #include <net/bpfdesc.h>
72 #include <net/dlt.h>
73 
/*
 * STREAMS module information.  Both the read-side and write-side qinit
 * structures in this file point at this single instance.
 */
static struct module_info ipnet_minfo = {
	1,		/* mi_idnum */
	"ipnet",	/* mi_idname */
	0,		/* mi_minpsz */
	INFPSZ,		/* mi_maxpsz */
	2048,		/* mi_hiwat */
	0		/* mi_lowat */
};
82 
/*
 * List to hold static view of ipnetif_t's on the system. This is needed to
 * avoid holding the lock protecting the avl tree of ipnetif's over the
 * callback into the dev filesystem.
 */
typedef struct ipnetif_cbdata {
	char		ic_ifname[LIFNAMSIZ];	/* interface name */
	dev_t		ic_dev;			/* device number */
	list_node_t	ic_next;		/* list linkage */
} ipnetif_cbdata_t;
93 
/*
 * Convenience enumerated type for ipnet_accept().  It describes the
 * properties of a given ipnet_addrp_t relative to a single ipnet_t
 * client stream, and is the return type of ipnet_get_addrtype().  The
 * values represent whether the address is ...
 */
typedef enum {
	IPNETADDR_MYADDR,	/* an address on my ipnetif_t. */
	IPNETADDR_MBCAST,	/* a multicast or broadcast address. */
	IPNETADDR_UNKNOWN	/* none of the above. */
} ipnet_addrtype_t;
104 
/*
 * Argument used for the ipnet_nicevent_taskq callback.
 *
 * NOTE(review): presumably filled in by ipnet_nicevent_cb() and consumed by
 * ipnet_nicevent_task(); neither is visible from here, so the per-field
 * notes below are derived from the field names and types only.
 */
typedef struct ipnet_nicevent_s {
	nic_event_t		ipne_event;	/* NIC event type */
	net_handle_t		ipne_protocol;	/* protocol handle */
	netstackid_t		ipne_stackid;	/* netstack identifier */
	uint64_t		ipne_ifindex;	/* interface index */
	uint64_t		ipne_lifindex;	/* logical interface index */
	char			ipne_ifname[LIFNAMSIZ];	/* interface name */
} ipnet_nicevent_t;
114 
/*
 * Module-wide state.  ipnet_dip is set by ipnet_attach() and cleared by
 * ipnet_detach().  ipnet_major, both taskqs and ipnet_minor_space are set up
 * in _init(); the taskqs and the minor space are destroyed again in _fini().
 * ipnet_infoack is the template copied into every DL_INFO_ACK reply by
 * ipnet_inforeq().
 */
static dev_info_t	*ipnet_dip;
static major_t		ipnet_major;
static ddi_taskq_t	*ipnet_taskq;		/* taskq for packets */
static ddi_taskq_t	*ipnet_nicevent_taskq;	/* taskq for NIC events */
static id_space_t	*ipnet_minor_space;
static const int	IPNET_MINOR_LO = 1; 	/* minor number for /dev/lo0 */
static const int 	IPNET_MINOR_MIN = 2; 	/* start of dynamic minors */
static dl_info_ack_t	ipnet_infoack = IPNET_INFO_ACK_INIT;
static ipnet_acceptfn_t	ipnet_accept, ipnet_loaccept;
static bpf_itap_fn_t	ipnet_itap;
125 
/*
 * Forward declarations of the file-local (static) functions.
 */
static void	ipnet_input(mblk_t *);
static int	ipnet_wput(queue_t *, mblk_t *);
static int	ipnet_rsrv(queue_t *);
static int	ipnet_open(queue_t *, dev_t *, int, int, cred_t *);
static int	ipnet_close(queue_t *);
static void	ipnet_ioctl(queue_t *, mblk_t *);
static void	ipnet_iocdata(queue_t *, mblk_t *);
static void 	ipnet_wputnondata(queue_t *, mblk_t *);
static int	ipnet_attach(dev_info_t *, ddi_attach_cmd_t);
static int	ipnet_detach(dev_info_t *, ddi_detach_cmd_t);
static int	ipnet_devinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
static void	ipnet_inforeq(queue_t *q, mblk_t *mp);
static void	ipnet_bindreq(queue_t *q, mblk_t *mp);
static void	ipnet_unbindreq(queue_t *q, mblk_t *mp);
static void	ipnet_dlpromisconreq(queue_t *q, mblk_t *mp);
static void	ipnet_dlpromiscoffreq(queue_t *q, mblk_t *mp);
static int	ipnet_join_allmulti(ipnetif_t *, ipnet_stack_t *);
static void	ipnet_leave_allmulti(ipnetif_t *, ipnet_stack_t *);
static int	ipnet_nicevent_cb(hook_event_token_t, hook_data_t, void *);
static void	ipnet_nicevent_task(void *);
static ipnetif_t *ipnetif_create(const char *, uint64_t, ipnet_stack_t *,
    uint64_t);
static void	ipnetif_remove(ipnetif_t *, ipnet_stack_t *);
static ipnetif_addr_t *ipnet_match_lif(ipnetif_t *, lif_if_t, boolean_t);
static ipnetif_t *ipnetif_getby_index(uint64_t, ipnet_stack_t *);
static ipnetif_t *ipnetif_getby_dev(dev_t, ipnet_stack_t *);
static boolean_t ipnetif_in_zone(ipnetif_t *, zoneid_t, ipnet_stack_t *);
static void	ipnetif_zonecheck(ipnetif_t *, ipnet_stack_t *);
static int	ipnet_populate_if(net_handle_t, ipnet_stack_t *, boolean_t);
static int 	ipnetif_compare_name(const void *, const void *);
static int 	ipnetif_compare_name_zone(const void *, const void *);
static int 	ipnetif_compare_index(const void *, const void *);
static void	ipnet_add_ifaddr(uint64_t, ipnetif_t *, net_handle_t);
static void	ipnet_delete_ifaddr(ipnetif_addr_t *, ipnetif_t *, boolean_t);
static void	ipnetif_refhold(ipnetif_t *);
static void	ipnetif_refrele(ipnetif_t *);
static void	ipnet_walkers_inc(ipnet_stack_t *);
static void	ipnet_walkers_dec(ipnet_stack_t *);
static void	ipnet_register_netihook(ipnet_stack_t *);
static void	*ipnet_stack_init(netstackid_t, netstack_t *);
static void	ipnet_stack_fini(netstackid_t, void *);
static void	ipnet_dispatch(void *);
static int	ipobs_bounce_func(hook_event_token_t, hook_data_t, void *);
static void	ipnet_bpfattach(ipnetif_t *);
static void	ipnet_bpfdetach(ipnetif_t *);
static int	ipnet_bpf_bounce(hook_event_token_t, hook_data_t, void *);
static void	ipnet_bpf_probe_shared(ipnet_stack_t *);
static void	ipnet_bpf_release_shared(ipnet_stack_t *);
static ipnetif_t *ipnetif_clone_create(ipnetif_t *, zoneid_t);
static void	ipnetif_clone_release(ipnetif_t *);
176 
/*
 * Read-side queue initialization: no put procedure, ipnet_rsrv() as the
 * service procedure, and the driver's open/close entry points.
 */
static struct qinit ipnet_rinit = {
	NULL,		/* qi_putp */
	ipnet_rsrv,	/* qi_srvp */
	ipnet_open,	/* qi_qopen */
	ipnet_close,	/* qi_qclose */
	NULL,		/* qi_qadmin */
	&ipnet_minfo,	/* qi_minfo */
};
185 
/*
 * Write-side queue initialization: ipnet_wput() handles every message
 * synchronously, so there is no service procedure.
 */
static struct qinit ipnet_winit = {
	ipnet_wput,	/* qi_putp */
	NULL,		/* qi_srvp */
	NULL,		/* qi_qopen */
	NULL,		/* qi_qclose */
	NULL,		/* qi_qadmin */
	&ipnet_minfo,	/* qi_minfo */
};
194 
/* Stream table tying the read-side and write-side qinit structures together. */
static struct streamtab ipnet_info = {
	&ipnet_rinit, &ipnet_winit
};
198 
/*
 * Define the dev_ops structure "ipnet_ops" for this STREAMS driver.  It wires
 * in ipnet_attach(), ipnet_detach(), ipnet_devinfo() and the ipnet_info
 * stream table, with flags D_MP | D_MTPERMOD; quiesce is not supported.
 */
DDI_DEFINE_STREAM_OPS(ipnet_ops, nulldev, nulldev, ipnet_attach,
    ipnet_detach, nodev, ipnet_devinfo, D_MP | D_MTPERMOD, &ipnet_info,
    ddi_quiesce_not_supported);
202 
/* Driver linkage: names the module and points at its dev_ops (ipnet_ops). */
static struct modldrv modldrv = {
	&mod_driverops,
	"STREAMS ipnet driver",
	&ipnet_ops
};
208 
/* Module linkage handed to mod_install(), mod_remove() and mod_info(). */
static struct modlinkage modlinkage = {
	MODREV_1, &modldrv, NULL
};
212 
/*
 * This structure contains the template data (names and type) that is
 * copied, in bulk, into the new kstats structure created by net_kstat_create.
 * No actual statistical information is stored in this instance of the
 * ipnet_kstats_t structure.
 *
 * The initializers are positional, so they fill the members of
 * ipnet_kstats_t in declaration order.
 */
static ipnet_kstats_t stats_template = {
	{ "duplicationFail",	KSTAT_DATA_UINT64 },
	{ "dispatchOk",		KSTAT_DATA_UINT64 },
	{ "dispatchFail",	KSTAT_DATA_UINT64 },
	{ "dispatchHeaderDrop",	KSTAT_DATA_UINT64 },
	{ "dispatchDupDrop",	KSTAT_DATA_UINT64 },
	{ "dispatchDeliver",	KSTAT_DATA_UINT64 },
	{ "acceptOk",		KSTAT_DATA_UINT64 },
	{ "acceptFail",		KSTAT_DATA_UINT64 }
};
229 
230 /*
231  * Walk the list of physical interfaces on the machine, for each
232  * interface create a new ipnetif_t and add any addresses to it. We
233  * need to do the walk twice, once for IPv4 and once for IPv6.
234  *
235  * The interfaces are destroyed as part of ipnet_stack_fini() for each
236  * stack.  Note that we cannot do this initialization in
237  * ipnet_stack_init(), since ipnet_stack_init() cannot fail.
238  */
239 static int
240 ipnetif_init(void)
241 {
242 	netstack_handle_t	nh;
243 	netstack_t		*ns;
244 	ipnet_stack_t		*ips;
245 	int			ret = 0;
246 
247 	netstack_next_init(&nh);
248 	while ((ns = netstack_next(&nh)) != NULL) {
249 		ips = ns->netstack_ipnet;
250 		if ((ret = ipnet_populate_if(ips->ips_ndv4, ips, B_FALSE)) == 0)
251 			ret = ipnet_populate_if(ips->ips_ndv6, ips, B_TRUE);
252 		netstack_rele(ns);
253 		if (ret != 0)
254 			break;
255 	}
256 	netstack_next_fini(&nh);
257 	return (ret);
258 }
259 
260 /*
261  * Standard module entry points.
262  */
263 int
264 _init(void)
265 {
266 	int		ret;
267 	boolean_t	netstack_registered = B_FALSE;
268 
269 	if ((ipnet_major = ddi_name_to_major("ipnet")) == (major_t)-1)
270 		return (ENODEV);
271 	ipnet_minor_space = id_space_create("ipnet_minor_space",
272 	    IPNET_MINOR_MIN, MAXMIN32);
273 
274 	/*
275 	 * We call ddi_taskq_create() with nthread == 1 to ensure in-order
276 	 * delivery of packets to clients.  Note that we need to create the
277 	 * taskqs before calling netstack_register() since ipnet_stack_init()
278 	 * registers callbacks that use 'em.
279 	 */
280 	ipnet_taskq = ddi_taskq_create(NULL, "ipnet", 1, TASKQ_DEFAULTPRI, 0);
281 	ipnet_nicevent_taskq = ddi_taskq_create(NULL, "ipnet_nic_event_queue",
282 	    1, TASKQ_DEFAULTPRI, 0);
283 	if (ipnet_taskq == NULL || ipnet_nicevent_taskq == NULL) {
284 		ret = ENOMEM;
285 		goto done;
286 	}
287 
288 	netstack_register(NS_IPNET, ipnet_stack_init, NULL, ipnet_stack_fini);
289 	netstack_registered = B_TRUE;
290 
291 	if ((ret = ipnetif_init()) == 0)
292 		ret = mod_install(&modlinkage);
293 done:
294 	if (ret != 0) {
295 		if (ipnet_taskq != NULL)
296 			ddi_taskq_destroy(ipnet_taskq);
297 		if (ipnet_nicevent_taskq != NULL)
298 			ddi_taskq_destroy(ipnet_nicevent_taskq);
299 		if (netstack_registered)
300 			netstack_unregister(NS_IPNET);
301 		id_space_destroy(ipnet_minor_space);
302 	}
303 	return (ret);
304 }
305 
306 int
307 _fini(void)
308 {
309 	int	err;
310 
311 	if ((err = mod_remove(&modlinkage)) != 0)
312 		return (err);
313 
314 	netstack_unregister(NS_IPNET);
315 	ddi_taskq_destroy(ipnet_nicevent_taskq);
316 	ddi_taskq_destroy(ipnet_taskq);
317 	id_space_destroy(ipnet_minor_space);
318 	return (0);
319 }
320 
321 int
322 _info(struct modinfo *modinfop)
323 {
324 	return (mod_info(&modlinkage, modinfop));
325 }
326 
327 static void
328 ipnet_register_netihook(ipnet_stack_t *ips)
329 {
330 	int		ret;
331 	zoneid_t	zoneid;
332 	netid_t		netid;
333 
334 	HOOK_INIT(ips->ips_nicevents, ipnet_nicevent_cb, "ipnet_nicevents",
335 	    ips);
336 
337 	/*
338 	 * It is possible for an exclusive stack to be in the process of
339 	 * shutting down here, and the netid and protocol lookups could fail
340 	 * in that case.
341 	 */
342 	zoneid = netstackid_to_zoneid(ips->ips_netstack->netstack_stackid);
343 	if ((netid = net_zoneidtonetid(zoneid)) == -1)
344 		return;
345 
346 	if ((ips->ips_ndv4 = net_protocol_lookup(netid, NHF_INET)) != NULL) {
347 		if ((ret = net_hook_register(ips->ips_ndv4, NH_NIC_EVENTS,
348 		    ips->ips_nicevents)) != 0) {
349 			VERIFY(net_protocol_release(ips->ips_ndv4) == 0);
350 			ips->ips_ndv4 = NULL;
351 			cmn_err(CE_WARN, "unable to register IPv4 netinfo hooks"
352 			    " in zone %d: %d", zoneid, ret);
353 		}
354 	}
355 	if ((ips->ips_ndv6 = net_protocol_lookup(netid, NHF_INET6)) != NULL) {
356 		if ((ret = net_hook_register(ips->ips_ndv6, NH_NIC_EVENTS,
357 		    ips->ips_nicevents)) != 0) {
358 			VERIFY(net_protocol_release(ips->ips_ndv6) == 0);
359 			ips->ips_ndv6 = NULL;
360 			cmn_err(CE_WARN, "unable to register IPv6 netinfo hooks"
361 			    " in zone %d: %d", zoneid, ret);
362 		}
363 	}
364 
365 	/*
366 	 * Create a local set of kstats for each zone.
367 	 */
368 	ips->ips_kstatp = net_kstat_create(netid, "ipnet", 0, "ipnet_stats",
369 	    "misc", KSTAT_TYPE_NAMED,
370 	    sizeof (ipnet_kstats_t) / sizeof (kstat_named_t), 0);
371 	if (ips->ips_kstatp != NULL) {
372 		bcopy(&stats_template, &ips->ips_stats,
373 		    sizeof (ips->ips_stats));
374 		ips->ips_kstatp->ks_data = &ips->ips_stats;
375 		ips->ips_kstatp->ks_private =
376 		    (void *)(uintptr_t)ips->ips_netstack->netstack_stackid;
377 		kstat_install(ips->ips_kstatp);
378 	} else {
379 		cmn_err(CE_WARN, "net_kstat_create(%s,%s,%s) failed",
380 		    "ipnet", "ipnet_stats", "misc");
381 	}
382 }
383 
384 /*
385  * This function is called on attach to build an initial view of the
386  * interfaces on the system. It will be called once for IPv4 and once
387  * for IPv6, although there is only one ipnet interface for both IPv4
388  * and IPv6 there are separate address lists.
389  */
390 static int
391 ipnet_populate_if(net_handle_t nd, ipnet_stack_t *ips, boolean_t isv6)
392 {
393 	phy_if_t	phyif;
394 	lif_if_t	lif;
395 	ipnetif_t	*ipnetif;
396 	char		name[LIFNAMSIZ];
397 	boolean_t	new_if = B_FALSE;
398 	uint64_t	ifflags;
399 	int		ret = 0;
400 
401 	/*
402 	 * If ipnet_register_netihook() was unable to initialize this
403 	 * stack's net_handle_t, then we cannot populate any interface
404 	 * information.  This usually happens when we attempted to
405 	 * grab a net_handle_t as a stack was shutting down.  We don't
406 	 * want to fail the entire _init() operation because of a
407 	 * stack shutdown (other stacks will continue to work just
408 	 * fine), so we silently return success here.
409 	 */
410 	if (nd == NULL)
411 		return (0);
412 
413 	/*
414 	 * Make sure we're not processing NIC events during the
415 	 * population of our interfaces and address lists.
416 	 */
417 	mutex_enter(&ips->ips_event_lock);
418 
419 	for (phyif = net_phygetnext(nd, 0); phyif != 0;
420 	    phyif = net_phygetnext(nd, phyif)) {
421 		if (net_getifname(nd, phyif, name, LIFNAMSIZ) != 0)
422 			continue;
423 		ifflags =  0;
424 		(void) net_getlifflags(nd, phyif, 0, &ifflags);
425 		if ((ipnetif = ipnetif_getby_index(phyif, ips)) == NULL) {
426 			ipnetif = ipnetif_create(name, phyif, ips, ifflags);
427 			if (ipnetif == NULL) {
428 				ret = ENOMEM;
429 				goto done;
430 			}
431 			new_if = B_TRUE;
432 		}
433 		ipnetif->if_flags |=
434 		    isv6 ? IPNETIF_IPV6PLUMBED : IPNETIF_IPV4PLUMBED;
435 
436 		for (lif = net_lifgetnext(nd, phyif, 0); lif != 0;
437 		    lif = net_lifgetnext(nd, phyif, lif)) {
438 			/*
439 			 * Skip addresses that aren't up.  We'll add
440 			 * them when we receive an NE_LIF_UP event.
441 			 */
442 			if (net_getlifflags(nd, phyif, lif, &ifflags) != 0 ||
443 			    !(ifflags & IFF_UP))
444 				continue;
445 			/* Don't add it if we already have it. */
446 			if (ipnet_match_lif(ipnetif, lif, isv6) != NULL)
447 				continue;
448 			ipnet_add_ifaddr(lif, ipnetif, nd);
449 		}
450 		if (!new_if)
451 			ipnetif_refrele(ipnetif);
452 	}
453 
454 done:
455 	mutex_exit(&ips->ips_event_lock);
456 	return (ret);
457 }
458 
459 static int
460 ipnet_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
461 {
462 	if (cmd != DDI_ATTACH)
463 		return (DDI_FAILURE);
464 
465 	if (ddi_create_minor_node(dip, "lo0", S_IFCHR, IPNET_MINOR_LO,
466 	    DDI_PSEUDO, 0) == DDI_FAILURE)
467 		return (DDI_FAILURE);
468 
469 	ipnet_dip = dip;
470 	return (DDI_SUCCESS);
471 }
472 
473 static int
474 ipnet_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
475 {
476 	if (cmd != DDI_DETACH)
477 		return (DDI_FAILURE);
478 
479 	ASSERT(dip == ipnet_dip);
480 	ddi_remove_minor_node(ipnet_dip, NULL);
481 	ipnet_dip = NULL;
482 	return (DDI_SUCCESS);
483 }
484 
485 /* ARGSUSED */
486 static int
487 ipnet_devinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
488 {
489 	int	error = DDI_FAILURE;
490 
491 	switch (infocmd) {
492 	case DDI_INFO_DEVT2INSTANCE:
493 		*result = (void *)0;
494 		error = DDI_SUCCESS;
495 		break;
496 	case DDI_INFO_DEVT2DEVINFO:
497 		if (ipnet_dip != NULL) {
498 			*result = ipnet_dip;
499 			error = DDI_SUCCESS;
500 		}
501 		break;
502 	}
503 	return (error);
504 }
505 
506 /* ARGSUSED */
507 static int
508 ipnet_open(queue_t *rq, dev_t *dev, int oflag, int sflag, cred_t *crp)
509 {
510 	ipnet_t		*ipnet;
511 	netstack_t	*ns = NULL;
512 	ipnet_stack_t	*ips;
513 	int		err = 0;
514 	zoneid_t	zoneid = crgetzoneid(crp);
515 
516 	/*
517 	 * If the system is labeled, only the global zone is allowed to open
518 	 * IP observability nodes.
519 	 */
520 	if (is_system_labeled() && zoneid != GLOBAL_ZONEID)
521 		return (EACCES);
522 
523 	/* We don't support open as a module */
524 	if (sflag & MODOPEN)
525 		return (ENOTSUP);
526 
527 	/* This driver is self-cloning, we don't support re-open. */
528 	if (rq->q_ptr != NULL)
529 		return (EBUSY);
530 
531 	if ((ipnet = kmem_zalloc(sizeof (*ipnet), KM_NOSLEEP)) == NULL)
532 		return (ENOMEM);
533 
534 	VERIFY((ns = netstack_find_by_cred(crp)) != NULL);
535 	ips = ns->netstack_ipnet;
536 
537 	rq->q_ptr = WR(rq)->q_ptr = ipnet;
538 	ipnet->ipnet_rq = rq;
539 	ipnet->ipnet_minor = (minor_t)id_alloc(ipnet_minor_space);
540 	ipnet->ipnet_zoneid = zoneid;
541 	ipnet->ipnet_dlstate = DL_UNBOUND;
542 	ipnet->ipnet_ns = ns;
543 
544 	/*
545 	 * We need to hold ips_event_lock here as any NE_LIF_DOWN events need
546 	 * to be processed after ipnet_if is set and the ipnet_t has been
547 	 * inserted in the ips_str_list.
548 	 */
549 	mutex_enter(&ips->ips_event_lock);
550 	if (getminor(*dev) == IPNET_MINOR_LO) {
551 		ipnet->ipnet_flags |= IPNET_LOMODE;
552 		ipnet->ipnet_acceptfn = ipnet_loaccept;
553 	} else {
554 		ipnet->ipnet_acceptfn = ipnet_accept;
555 		ipnet->ipnet_if = ipnetif_getby_dev(*dev, ips);
556 		if (ipnet->ipnet_if == NULL ||
557 		    !ipnetif_in_zone(ipnet->ipnet_if, zoneid, ips)) {
558 			err = ENODEV;
559 			goto done;
560 		}
561 	}
562 
563 	mutex_enter(&ips->ips_walkers_lock);
564 	while (ips->ips_walkers_cnt != 0)
565 		cv_wait(&ips->ips_walkers_cv, &ips->ips_walkers_lock);
566 	list_insert_head(&ips->ips_str_list, ipnet);
567 	*dev = makedevice(getmajor(*dev), ipnet->ipnet_minor);
568 	qprocson(rq);
569 
570 	/*
571 	 * Only register our callback if we're the first open client; we call
572 	 * unregister in close() for the last open client.
573 	 */
574 	if (list_head(&ips->ips_str_list) == list_tail(&ips->ips_str_list))
575 		ips->ips_hook = ipobs_register_hook(ns, ipnet_input);
576 	mutex_exit(&ips->ips_walkers_lock);
577 
578 done:
579 	mutex_exit(&ips->ips_event_lock);
580 	if (err != 0) {
581 		netstack_rele(ns);
582 		id_free(ipnet_minor_space, ipnet->ipnet_minor);
583 		if (ipnet->ipnet_if != NULL)
584 			ipnetif_refrele(ipnet->ipnet_if);
585 		kmem_free(ipnet, sizeof (*ipnet));
586 	}
587 	return (err);
588 }
589 
/*
 * STREAMS close entry point for an ipnet client stream.  Releases any
 * all-multicast references the client holds, removes it from the stack's
 * stream list, frees its minor number and ipnet_t, and unregisters the
 * observability hook when the last client on the stack goes away.
 * Always returns 0.
 */
static int
ipnet_close(queue_t *rq)
{
	ipnet_t		*ipnet = rq->q_ptr;
	ipnet_stack_t	*ips = ipnet->ipnet_ns->netstack_ipnet;

	/* Drop one all-multicast reference per promiscuous level enabled. */
	if (ipnet->ipnet_flags & IPNET_PROMISC_PHYS)
		ipnet_leave_allmulti(ipnet->ipnet_if, ips);
	if (ipnet->ipnet_flags & IPNET_PROMISC_MULTI)
		ipnet_leave_allmulti(ipnet->ipnet_if, ips);

	/* Wait until no walkers are active before touching ips_str_list. */
	mutex_enter(&ips->ips_walkers_lock);
	while (ips->ips_walkers_cnt != 0)
		cv_wait(&ips->ips_walkers_cv, &ips->ips_walkers_lock);

	qprocsoff(rq);

	list_remove(&ips->ips_str_list, ipnet);
	if (ipnet->ipnet_if != NULL)
		ipnetif_refrele(ipnet->ipnet_if);
	id_free(ipnet_minor_space, ipnet->ipnet_minor);

	/* Last client gone: undo the hook registration done in ipnet_open(). */
	if (list_is_empty(&ips->ips_str_list)) {
		ipobs_unregister_hook(ips->ips_netstack, ips->ips_hook);
		ips->ips_hook = NULL;
	}

	kmem_free(ipnet, sizeof (*ipnet));

	/* Only the ipnet_t was freed above; ips is still used below. */
	mutex_exit(&ips->ips_walkers_lock);
	/*
	 * NOTE(review): presumably pairs with the netstack_find_by_cred()
	 * call in ipnet_open().
	 */
	netstack_rele(ips->ips_netstack);
	return (0);
}
623 
624 static int
625 ipnet_wput(queue_t *q, mblk_t *mp)
626 {
627 	switch (mp->b_datap->db_type) {
628 	case M_FLUSH:
629 		if (*mp->b_rptr & FLUSHW) {
630 			flushq(q, FLUSHDATA);
631 			*mp->b_rptr &= ~FLUSHW;
632 		}
633 		if (*mp->b_rptr & FLUSHR)
634 			qreply(q, mp);
635 		else
636 			freemsg(mp);
637 		break;
638 	case M_PROTO:
639 	case M_PCPROTO:
640 		ipnet_wputnondata(q, mp);
641 		break;
642 	case M_IOCTL:
643 		ipnet_ioctl(q, mp);
644 		break;
645 	case M_IOCDATA:
646 		ipnet_iocdata(q, mp);
647 		break;
648 	default:
649 		freemsg(mp);
650 		break;
651 	}
652 	return (0);
653 }
654 
655 static int
656 ipnet_rsrv(queue_t *q)
657 {
658 	mblk_t	*mp;
659 
660 	while ((mp = getq(q)) != NULL) {
661 		ASSERT(DB_TYPE(mp) == M_DATA);
662 		if (canputnext(q)) {
663 			putnext(q, mp);
664 		} else {
665 			(void) putbq(q, mp);
666 			break;
667 		}
668 	}
669 	return (0);
670 }
671 
672 static void
673 ipnet_ioctl(queue_t *q, mblk_t *mp)
674 {
675 	struct iocblk	*iocp = (struct iocblk *)mp->b_rptr;
676 
677 	switch (iocp->ioc_cmd) {
678 	case DLIOCRAW:
679 		miocack(q, mp, 0, 0);
680 		break;
681 	case DLIOCIPNETINFO:
682 		if (iocp->ioc_count == TRANSPARENT) {
683 			mcopyin(mp, NULL, sizeof (uint_t), NULL);
684 			qreply(q, mp);
685 			break;
686 		}
687 		/* Fallthrough, we don't support I_STR with DLIOCIPNETINFO. */
688 	default:
689 		miocnak(q, mp, 0, EINVAL);
690 		break;
691 	}
692 }
693 
694 static void
695 ipnet_iocdata(queue_t *q, mblk_t *mp)
696 {
697 	struct iocblk	*iocp = (struct iocblk *)mp->b_rptr;
698 	ipnet_t	*ipnet = q->q_ptr;
699 
700 	switch (iocp->ioc_cmd) {
701 	case DLIOCIPNETINFO:
702 		if (*(int *)mp->b_cont->b_rptr == 1)
703 			ipnet->ipnet_flags |= IPNET_INFO;
704 		else if (*(int *)mp->b_cont->b_rptr == 0)
705 			ipnet->ipnet_flags &= ~IPNET_INFO;
706 		else
707 			goto iocnak;
708 		miocack(q, mp, 0, DL_IPNETINFO_VERSION);
709 		break;
710 	default:
711 iocnak:
712 		miocnak(q, mp, 0, EINVAL);
713 		break;
714 	}
715 }
716 
717 static void
718 ipnet_wputnondata(queue_t *q, mblk_t *mp)
719 {
720 	union DL_primitives	*dlp = (union DL_primitives *)mp->b_rptr;
721 	t_uscalar_t		prim = dlp->dl_primitive;
722 
723 	switch (prim) {
724 	case DL_INFO_REQ:
725 		ipnet_inforeq(q, mp);
726 		break;
727 	case DL_UNBIND_REQ:
728 		ipnet_unbindreq(q, mp);
729 		break;
730 	case DL_BIND_REQ:
731 		ipnet_bindreq(q, mp);
732 		break;
733 	case DL_PROMISCON_REQ:
734 		ipnet_dlpromisconreq(q, mp);
735 		break;
736 	case DL_PROMISCOFF_REQ:
737 		ipnet_dlpromiscoffreq(q, mp);
738 		break;
739 	case DL_UNITDATA_REQ:
740 	case DL_DETACH_REQ:
741 	case DL_PHYS_ADDR_REQ:
742 	case DL_SET_PHYS_ADDR_REQ:
743 	case DL_ENABMULTI_REQ:
744 	case DL_DISABMULTI_REQ:
745 	case DL_ATTACH_REQ:
746 		dlerrorack(q, mp, prim, DL_UNSUPPORTED, 0);
747 		break;
748 	default:
749 		dlerrorack(q, mp, prim, DL_BADPRIM, 0);
750 		break;
751 	}
752 }
753 
754 static void
755 ipnet_inforeq(queue_t *q, mblk_t *mp)
756 {
757 	dl_info_ack_t	*dlip;
758 	size_t		size = sizeof (dl_info_ack_t) + sizeof (ushort_t);
759 
760 	if (MBLKL(mp) < DL_INFO_REQ_SIZE) {
761 		dlerrorack(q, mp, DL_INFO_REQ, DL_BADPRIM, 0);
762 		return;
763 	}
764 
765 	if ((mp = mexchange(q, mp, size, M_PCPROTO, DL_INFO_ACK)) == NULL)
766 		return;
767 
768 	dlip = (dl_info_ack_t *)mp->b_rptr;
769 	*dlip = ipnet_infoack;
770 	qreply(q, mp);
771 }
772 
773 static void
774 ipnet_bindreq(queue_t *q, mblk_t *mp)
775 {
776 	union DL_primitives	*dlp = (union DL_primitives *)mp->b_rptr;
777 	ipnet_t			*ipnet = q->q_ptr;
778 
779 	if (MBLKL(mp) < DL_BIND_REQ_SIZE) {
780 		dlerrorack(q, mp, DL_BIND_REQ, DL_BADPRIM, 0);
781 		return;
782 	}
783 
784 	switch (dlp->bind_req.dl_sap) {
785 	case 0 :
786 		ipnet->ipnet_family = AF_UNSPEC;
787 		break;
788 	case IPV4_VERSION :
789 		ipnet->ipnet_family = AF_INET;
790 		break;
791 	case IPV6_VERSION :
792 		ipnet->ipnet_family = AF_INET6;
793 		break;
794 	default :
795 		dlerrorack(q, mp, DL_BIND_REQ, DL_BADSAP, 0);
796 		return;
797 		/*NOTREACHED*/
798 	}
799 
800 	ipnet->ipnet_dlstate = DL_IDLE;
801 	dlbindack(q, mp, dlp->bind_req.dl_sap, 0, 0, 0, 0);
802 }
803 
804 static void
805 ipnet_unbindreq(queue_t *q, mblk_t *mp)
806 {
807 	ipnet_t	*ipnet = q->q_ptr;
808 
809 	if (MBLKL(mp) < DL_UNBIND_REQ_SIZE) {
810 		dlerrorack(q, mp, DL_UNBIND_REQ, DL_BADPRIM, 0);
811 		return;
812 	}
813 
814 	if (ipnet->ipnet_dlstate != DL_IDLE) {
815 		dlerrorack(q, mp, DL_UNBIND_REQ, DL_OUTSTATE, 0);
816 	} else {
817 		ipnet->ipnet_dlstate = DL_UNBOUND;
818 		ipnet->ipnet_family = AF_UNSPEC;
819 		dlokack(q, mp, DL_UNBIND_REQ);
820 	}
821 }
822 
823 static void
824 ipnet_dlpromisconreq(queue_t *q, mblk_t *mp)
825 {
826 	ipnet_t		*ipnet = q->q_ptr;
827 	t_uscalar_t	level;
828 	int		err;
829 
830 	if (MBLKL(mp) < DL_PROMISCON_REQ_SIZE) {
831 		dlerrorack(q, mp, DL_PROMISCON_REQ, DL_BADPRIM, 0);
832 		return;
833 	}
834 
835 	if (ipnet->ipnet_flags & IPNET_LOMODE) {
836 		dlokack(q, mp, DL_PROMISCON_REQ);
837 		return;
838 	}
839 
840 	level = ((dl_promiscon_req_t *)mp->b_rptr)->dl_level;
841 	if (level == DL_PROMISC_PHYS || level == DL_PROMISC_MULTI) {
842 		if ((err = ipnet_join_allmulti(ipnet->ipnet_if,
843 		    ipnet->ipnet_ns->netstack_ipnet)) != 0) {
844 			dlerrorack(q, mp, DL_PROMISCON_REQ, DL_SYSERR, err);
845 			return;
846 		}
847 	}
848 
849 	switch (level) {
850 	case DL_PROMISC_PHYS:
851 		ipnet->ipnet_flags |= IPNET_PROMISC_PHYS;
852 		break;
853 	case DL_PROMISC_SAP:
854 		ipnet->ipnet_flags |= IPNET_PROMISC_SAP;
855 		break;
856 	case DL_PROMISC_MULTI:
857 		ipnet->ipnet_flags |= IPNET_PROMISC_MULTI;
858 		break;
859 	default:
860 		dlerrorack(q, mp, DL_PROMISCON_REQ, DL_BADPRIM, 0);
861 		return;
862 	}
863 
864 	dlokack(q, mp, DL_PROMISCON_REQ);
865 }
866 
867 static void
868 ipnet_dlpromiscoffreq(queue_t *q, mblk_t *mp)
869 {
870 	ipnet_t		*ipnet = q->q_ptr;
871 	t_uscalar_t	level;
872 	uint16_t	orig_ipnet_flags = ipnet->ipnet_flags;
873 
874 	if (MBLKL(mp) < DL_PROMISCOFF_REQ_SIZE) {
875 		dlerrorack(q, mp, DL_PROMISCOFF_REQ, DL_BADPRIM, 0);
876 		return;
877 	}
878 
879 	if (ipnet->ipnet_flags & IPNET_LOMODE) {
880 		dlokack(q, mp, DL_PROMISCOFF_REQ);
881 		return;
882 	}
883 
884 	level = ((dl_promiscon_req_t *)mp->b_rptr)->dl_level;
885 	switch (level) {
886 	case DL_PROMISC_PHYS:
887 		if (ipnet->ipnet_flags & IPNET_PROMISC_PHYS)
888 			ipnet->ipnet_flags &= ~IPNET_PROMISC_PHYS;
889 		break;
890 	case DL_PROMISC_SAP:
891 		if (ipnet->ipnet_flags & IPNET_PROMISC_SAP)
892 			ipnet->ipnet_flags &= ~IPNET_PROMISC_SAP;
893 		break;
894 	case DL_PROMISC_MULTI:
895 		if (ipnet->ipnet_flags & IPNET_PROMISC_MULTI)
896 			ipnet->ipnet_flags &= ~IPNET_PROMISC_MULTI;
897 		break;
898 	default:
899 		dlerrorack(q, mp, DL_PROMISCOFF_REQ, DL_BADPRIM, 0);
900 		return;
901 	}
902 
903 	if (orig_ipnet_flags == ipnet->ipnet_flags) {
904 		dlerrorack(q, mp, DL_PROMISCOFF_REQ, DL_NOTENAB, 0);
905 		return;
906 	}
907 
908 	if (level == DL_PROMISC_PHYS || level == DL_PROMISC_MULTI) {
909 		ipnet_leave_allmulti(ipnet->ipnet_if,
910 		    ipnet->ipnet_ns->netstack_ipnet);
911 	}
912 
913 	dlokack(q, mp, DL_PROMISCOFF_REQ);
914 }
915 
916 static int
917 ipnet_join_allmulti(ipnetif_t *ipnetif, ipnet_stack_t *ips)
918 {
919 	int		err = 0;
920 	ip_stack_t	*ipst = ips->ips_netstack->netstack_ip;
921 	uint64_t	index = ipnetif->if_index;
922 
923 	mutex_enter(&ips->ips_event_lock);
924 	if (ipnetif->if_multicnt == 0) {
925 		ASSERT((ipnetif->if_flags &
926 		    (IPNETIF_IPV4ALLMULTI | IPNETIF_IPV6ALLMULTI)) == 0);
927 		if (ipnetif->if_flags & IPNETIF_IPV4PLUMBED) {
928 			err = ip_join_allmulti(index, B_FALSE, ipst);
929 			if (err != 0)
930 				goto done;
931 			ipnetif->if_flags |= IPNETIF_IPV4ALLMULTI;
932 		}
933 		if (ipnetif->if_flags & IPNETIF_IPV6PLUMBED) {
934 			err = ip_join_allmulti(index, B_TRUE, ipst);
935 			if (err != 0 &&
936 			    (ipnetif->if_flags & IPNETIF_IPV4ALLMULTI)) {
937 				(void) ip_leave_allmulti(index, B_FALSE, ipst);
938 				ipnetif->if_flags &= ~IPNETIF_IPV4ALLMULTI;
939 				goto done;
940 			}
941 			ipnetif->if_flags |= IPNETIF_IPV6ALLMULTI;
942 		}
943 	}
944 	ipnetif->if_multicnt++;
945 
946 done:
947 	mutex_exit(&ips->ips_event_lock);
948 	return (err);
949 }
950 
951 static void
952 ipnet_leave_allmulti(ipnetif_t *ipnetif, ipnet_stack_t *ips)
953 {
954 	int		err;
955 	ip_stack_t	*ipst = ips->ips_netstack->netstack_ip;
956 	uint64_t	index = ipnetif->if_index;
957 
958 	mutex_enter(&ips->ips_event_lock);
959 	ASSERT(ipnetif->if_multicnt != 0);
960 	if (--ipnetif->if_multicnt == 0) {
961 		if (ipnetif->if_flags & IPNETIF_IPV4ALLMULTI) {
962 			err = ip_leave_allmulti(index, B_FALSE, ipst);
963 			ASSERT(err == 0 || err == ENODEV);
964 			ipnetif->if_flags &= ~IPNETIF_IPV4ALLMULTI;
965 		}
966 		if (ipnetif->if_flags & IPNETIF_IPV6ALLMULTI) {
967 			err = ip_leave_allmulti(index, B_TRUE, ipst);
968 			ASSERT(err == 0 || err == ENODEV);
969 			ipnetif->if_flags &= ~IPNETIF_IPV6ALLMULTI;
970 		}
971 	}
972 	mutex_exit(&ips->ips_event_lock);
973 }
974 
975 /*
976  * Allocate a new mblk_t and put a dl_ipnetinfo_t in it.
977  * The structure it copies the header information from,
978  * hook_pkt_observe_t, is constructed using network byte
979  * order in ipobs_hook(), so there is no conversion here.
980  */
981 static mblk_t *
982 ipnet_addheader(hook_pkt_observe_t *hdr, mblk_t *mp)
983 {
984 	mblk_t		*dlhdr;
985 	dl_ipnetinfo_t	*dl;
986 
987 	if ((dlhdr = allocb(sizeof (dl_ipnetinfo_t), BPRI_HI)) == NULL) {
988 		freemsg(mp);
989 		return (NULL);
990 	}
991 	dl = (dl_ipnetinfo_t *)dlhdr->b_rptr;
992 	dl->dli_version = DL_IPNETINFO_VERSION;
993 	dl->dli_family = hdr->hpo_family;
994 	dl->dli_htype = hdr->hpo_htype;
995 	dl->dli_pktlen = hdr->hpo_pktlen;
996 	dl->dli_ifindex = hdr->hpo_ifindex;
997 	dl->dli_grifindex = hdr->hpo_grifindex;
998 	dl->dli_zsrc = hdr->hpo_zsrc;
999 	dl->dli_zdst = hdr->hpo_zdst;
1000 	dlhdr->b_wptr += sizeof (*dl);
1001 	dlhdr->b_cont = mp;
1002 
1003 	return (dlhdr);
1004 }
1005 
1006 static ipnet_addrtype_t
1007 ipnet_get_addrtype(ipnet_t *ipnet, ipnet_addrp_t *addr)
1008 {
1009 	list_t			*list;
1010 	ipnetif_t		*ipnetif = ipnet->ipnet_if;
1011 	ipnetif_addr_t		*ifaddr;
1012 	ipnet_addrtype_t	addrtype = IPNETADDR_UNKNOWN;
1013 
1014 	/* First check if the address is multicast or limited broadcast. */
1015 	switch (addr->iap_family) {
1016 	case AF_INET:
1017 		if (CLASSD(*(addr->iap_addr4)) ||
1018 		    *(addr->iap_addr4) == INADDR_BROADCAST)
1019 			return (IPNETADDR_MBCAST);
1020 		break;
1021 	case AF_INET6:
1022 		if (IN6_IS_ADDR_MULTICAST(addr->iap_addr6))
1023 			return (IPNETADDR_MBCAST);
1024 		break;
1025 	}
1026 
1027 	/*
1028 	 * Walk the address list to see if the address belongs to our
1029 	 * interface or is one of our subnet broadcast addresses.
1030 	 */
1031 	mutex_enter(&ipnetif->if_addr_lock);
1032 	list = (addr->iap_family == AF_INET) ?
1033 	    &ipnetif->if_ip4addr_list : &ipnetif->if_ip6addr_list;
1034 	for (ifaddr = list_head(list);
1035 	    ifaddr != NULL && addrtype == IPNETADDR_UNKNOWN;
1036 	    ifaddr = list_next(list, ifaddr)) {
1037 		/*
1038 		 * If we're not in the global zone, then only look at
1039 		 * addresses in our zone.
1040 		 */
1041 		if (ipnet->ipnet_zoneid != GLOBAL_ZONEID &&
1042 		    ipnet->ipnet_zoneid != ifaddr->ifa_zone)
1043 			continue;
1044 		switch (addr->iap_family) {
1045 		case AF_INET:
1046 			if (ifaddr->ifa_ip4addr != INADDR_ANY &&
1047 			    *(addr->iap_addr4) == ifaddr->ifa_ip4addr)
1048 				addrtype = IPNETADDR_MYADDR;
1049 			else if (ifaddr->ifa_brdaddr != INADDR_ANY &&
1050 			    *(addr->iap_addr4) == ifaddr->ifa_brdaddr)
1051 				addrtype = IPNETADDR_MBCAST;
1052 			break;
1053 		case AF_INET6:
1054 			if (IN6_ARE_ADDR_EQUAL(addr->iap_addr6,
1055 			    &ifaddr->ifa_ip6addr))
1056 				addrtype = IPNETADDR_MYADDR;
1057 			break;
1058 		}
1059 	}
1060 	mutex_exit(&ipnetif->if_addr_lock);
1061 
1062 	return (addrtype);
1063 }
1064 
1065 /*
1066  * Verify if the packet contained in hdr should be passed up to the
1067  * ipnet client stream.
1068  */
1069 static boolean_t
1070 ipnet_accept(ipnet_t *ipnet, hook_pkt_observe_t *hdr, ipnet_addrp_t *src,
1071     ipnet_addrp_t *dst)
1072 {
1073 	boolean_t		obsif;
1074 	uint64_t		ifindex = ipnet->ipnet_if->if_index;
1075 	ipnet_addrtype_t	srctype;
1076 	ipnet_addrtype_t	dsttype;
1077 
1078 	srctype = ipnet_get_addrtype(ipnet, src);
1079 	dsttype = ipnet_get_addrtype(ipnet, dst);
1080 
1081 	/*
1082 	 * If the packet's ifindex matches ours, or the packet's group ifindex
1083 	 * matches ours, it's on the interface we're observing.  (Thus,
1084 	 * observing on the group ifindex matches all ifindexes in the group.)
1085 	 */
1086 	obsif = (ntohl(hdr->hpo_ifindex) == ifindex ||
1087 	    ntohl(hdr->hpo_grifindex) == ifindex);
1088 
1089 	DTRACE_PROBE5(ipnet_accept__addr,
1090 	    ipnet_addrtype_t, srctype, ipnet_addrp_t *, src,
1091 	    ipnet_addrtype_t, dsttype, ipnet_addrp_t *, dst,
1092 	    boolean_t, obsif);
1093 
1094 	/*
1095 	 * Do not allow an ipnet stream to see packets that are not from or to
1096 	 * its zone.  The exception is when zones are using the shared stack
1097 	 * model.  In this case, streams in the global zone have visibility
1098 	 * into other shared-stack zones, and broadcast and multicast traffic
1099 	 * is visible by all zones in the stack.
1100 	 */
1101 	if (ipnet->ipnet_zoneid != GLOBAL_ZONEID &&
1102 	    dsttype != IPNETADDR_MBCAST) {
1103 		if (ipnet->ipnet_zoneid != ntohl(hdr->hpo_zsrc) &&
1104 		    ipnet->ipnet_zoneid != ntohl(hdr->hpo_zdst))
1105 			return (B_FALSE);
1106 	}
1107 
1108 	/*
1109 	 * If DL_PROMISC_SAP isn't enabled, then the bound SAP must match the
1110 	 * packet's IP version.
1111 	 */
1112 	if (!(ipnet->ipnet_flags & IPNET_PROMISC_SAP) &&
1113 	    ipnet->ipnet_family != hdr->hpo_family)
1114 		return (B_FALSE);
1115 
1116 	/* If the destination address is ours, then accept the packet. */
1117 	if (dsttype == IPNETADDR_MYADDR)
1118 		return (B_TRUE);
1119 
1120 	/*
1121 	 * If DL_PROMISC_PHYS is enabled, then we can see all packets that are
1122 	 * sent or received on the interface we're observing, or packets that
1123 	 * have our source address (this allows us to see packets we send).
1124 	 */
1125 	if (ipnet->ipnet_flags & IPNET_PROMISC_PHYS) {
1126 		if (srctype == IPNETADDR_MYADDR || obsif)
1127 			return (B_TRUE);
1128 	}
1129 
1130 	/*
1131 	 * We accept multicast and broadcast packets transmitted or received
1132 	 * on the interface we're observing.
1133 	 */
1134 	if (dsttype == IPNETADDR_MBCAST && obsif)
1135 		return (B_TRUE);
1136 
1137 	return (B_FALSE);
1138 }
1139 
1140 /*
1141  * Verify if the packet contained in hdr should be passed up to the ipnet
1142  * client stream that's in IPNET_LOMODE.
1143  */
1144 /* ARGSUSED */
1145 static boolean_t
1146 ipnet_loaccept(ipnet_t *ipnet, hook_pkt_observe_t *hdr, ipnet_addrp_t *src,
1147     ipnet_addrp_t *dst)
1148 {
1149 	if (hdr->hpo_htype != IPOBS_HOOK_LOCAL) {
1150 		/*
1151 		 * ipnet_if is only NULL for IPNET_MINOR_LO devices.
1152 		 */
1153 		if (ipnet->ipnet_if == NULL)
1154 			return (B_FALSE);
1155 	}
1156 
1157 	/*
1158 	 * An ipnet stream must not see packets that are not from/to its zone.
1159 	 */
1160 	if (ipnet->ipnet_zoneid != GLOBAL_ZONEID) {
1161 		if (ipnet->ipnet_zoneid != ntohl(hdr->hpo_zsrc) &&
1162 		    ipnet->ipnet_zoneid != ntohl(hdr->hpo_zdst))
1163 			return (B_FALSE);
1164 	}
1165 
1166 	return (ipnet->ipnet_family == AF_UNSPEC ||
1167 	    ipnet->ipnet_family == hdr->hpo_family);
1168 }
1169 
1170 static void
1171 ipnet_dispatch(void *arg)
1172 {
1173 	mblk_t			*mp = arg;
1174 	hook_pkt_observe_t	*hdr = (hook_pkt_observe_t *)mp->b_rptr;
1175 	ipnet_t			*ipnet;
1176 	mblk_t			*netmp;
1177 	list_t			*list;
1178 	ipnet_stack_t		*ips;
1179 	ipnet_addrp_t		src;
1180 	ipnet_addrp_t		dst;
1181 
1182 	ips = ((netstack_t *)hdr->hpo_ctx)->netstack_ipnet;
1183 
1184 	netmp = hdr->hpo_pkt->b_cont;
1185 	src.iap_family = hdr->hpo_family;
1186 	dst.iap_family = hdr->hpo_family;
1187 
1188 	if (hdr->hpo_family == AF_INET) {
1189 		src.iap_addr4 = &((ipha_t *)(netmp->b_rptr))->ipha_src;
1190 		dst.iap_addr4 = &((ipha_t *)(netmp->b_rptr))->ipha_dst;
1191 	} else {
1192 		src.iap_addr6 = &((ip6_t *)(netmp->b_rptr))->ip6_src;
1193 		dst.iap_addr6 = &((ip6_t *)(netmp->b_rptr))->ip6_dst;
1194 	}
1195 
1196 	ipnet_walkers_inc(ips);
1197 
1198 	list = &ips->ips_str_list;
1199 	for (ipnet = list_head(list); ipnet != NULL;
1200 	    ipnet = list_next(list, ipnet)) {
1201 		if (!(*ipnet->ipnet_acceptfn)(ipnet, hdr, &src, &dst)) {
1202 			IPSK_BUMP(ips, ik_acceptFail);
1203 			continue;
1204 		}
1205 		IPSK_BUMP(ips, ik_acceptOk);
1206 
1207 		if (list_next(list, ipnet) == NULL) {
1208 			netmp = hdr->hpo_pkt->b_cont;
1209 			hdr->hpo_pkt->b_cont = NULL;
1210 		} else {
1211 			if ((netmp = dupmsg(hdr->hpo_pkt->b_cont)) == NULL &&
1212 			    (netmp = copymsg(hdr->hpo_pkt->b_cont)) == NULL) {
1213 				IPSK_BUMP(ips, ik_duplicationFail);
1214 				continue;
1215 			}
1216 		}
1217 
1218 		if (ipnet->ipnet_flags & IPNET_INFO) {
1219 			if ((netmp = ipnet_addheader(hdr, netmp)) == NULL) {
1220 				IPSK_BUMP(ips, ik_dispatchHeaderDrop);
1221 				continue;
1222 			}
1223 		}
1224 
1225 		if (ipnet->ipnet_rq->q_first == NULL &&
1226 		    canputnext(ipnet->ipnet_rq)) {
1227 			putnext(ipnet->ipnet_rq, netmp);
1228 			IPSK_BUMP(ips, ik_dispatchDeliver);
1229 		} else if (canput(ipnet->ipnet_rq)) {
1230 			(void) putq(ipnet->ipnet_rq, netmp);
1231 			IPSK_BUMP(ips, ik_dispatchDeliver);
1232 		} else {
1233 			freemsg(netmp);
1234 			IPSK_BUMP(ips, ik_dispatchPutDrop);
1235 		}
1236 	}
1237 
1238 	ipnet_walkers_dec(ips);
1239 
1240 	freemsg(mp);
1241 }
1242 
1243 static void
1244 ipnet_input(mblk_t *mp)
1245 {
1246 	hook_pkt_observe_t	*hdr = (hook_pkt_observe_t *)mp->b_rptr;
1247 	ipnet_stack_t		*ips;
1248 
1249 	ips = ((netstack_t *)hdr->hpo_ctx)->netstack_ipnet;
1250 
1251 	if (ddi_taskq_dispatch(ipnet_taskq, ipnet_dispatch, mp, DDI_NOSLEEP) !=
1252 	    DDI_SUCCESS) {
1253 		IPSK_BUMP(ips, ik_dispatchFail);
1254 		freemsg(mp);
1255 	} else {
1256 		IPSK_BUMP(ips, ik_dispatchOk);
1257 	}
1258 }
1259 
1260 static ipnetif_t *
1261 ipnet_alloc_if(ipnet_stack_t *ips)
1262 {
1263 	ipnetif_t	*ipnetif;
1264 
1265 	if ((ipnetif = kmem_zalloc(sizeof (*ipnetif), KM_NOSLEEP)) == NULL)
1266 		return (NULL);
1267 
1268 	mutex_init(&ipnetif->if_addr_lock, NULL, MUTEX_DEFAULT, 0);
1269 	list_create(&ipnetif->if_ip4addr_list, sizeof (ipnetif_addr_t),
1270 	    offsetof(ipnetif_addr_t, ifa_link));
1271 	list_create(&ipnetif->if_ip6addr_list, sizeof (ipnetif_addr_t),
1272 	    offsetof(ipnetif_addr_t, ifa_link));
1273 	mutex_init(&ipnetif->if_reflock, NULL, MUTEX_DEFAULT, 0);
1274 
1275 	ipnetif->if_stackp = ips;
1276 
1277 	return (ipnetif);
1278 }
1279 
1280 /*
1281  * Create a new ipnetif_t and new minor node for it.  If creation is
1282  * successful the new ipnetif_t is inserted into an avl_tree
1283  * containing ipnetif's for this stack instance.
1284  */
1285 static ipnetif_t *
1286 ipnetif_create(const char *name, uint64_t index, ipnet_stack_t *ips,
1287     uint64_t ifflags)
1288 {
1289 	ipnetif_t	*ipnetif;
1290 	avl_index_t	where = 0;
1291 	minor_t		ifminor;
1292 
1293 	/*
1294 	 * Because ipnetif_create() can be called from a NIC event
1295 	 * callback, it should not block.
1296 	 */
1297 	ifminor = (minor_t)id_alloc_nosleep(ipnet_minor_space);
1298 	if (ifminor == (minor_t)-1)
1299 		return (NULL);
1300 	if ((ipnetif = ipnet_alloc_if(ips)) == NULL) {
1301 		id_free(ipnet_minor_space, ifminor);
1302 		return (NULL);
1303 	}
1304 
1305 	(void) strlcpy(ipnetif->if_name, name, LIFNAMSIZ);
1306 	ipnetif->if_index = (uint_t)index;
1307 	ipnetif->if_zoneid = netstack_get_zoneid(ips->ips_netstack);
1308 	ipnetif->if_dev = makedevice(ipnet_major, ifminor);
1309 
1310 	ipnetif->if_refcnt = 1;
1311 	if ((ifflags & IFF_LOOPBACK) != 0)
1312 		ipnetif->if_flags = IPNETIF_LOOPBACK;
1313 
1314 	mutex_enter(&ips->ips_avl_lock);
1315 	VERIFY(avl_find(&ips->ips_avl_by_index, &index, &where) == NULL);
1316 	avl_insert(&ips->ips_avl_by_index, ipnetif, where);
1317 	VERIFY(avl_find(&ips->ips_avl_by_name, (void *)name, &where) == NULL);
1318 	avl_insert(&ips->ips_avl_by_name, ipnetif, where);
1319 	mutex_exit(&ips->ips_avl_lock);
1320 	/*
1321 	 * Now that the interface can be found by lookups back into ipnet,
1322 	 * allowing for sanity checking, call the BPF attach.
1323 	 */
1324 	ipnet_bpfattach(ipnetif);
1325 
1326 	return (ipnetif);
1327 }
1328 
1329 static void
1330 ipnetif_remove(ipnetif_t *ipnetif, ipnet_stack_t *ips)
1331 {
1332 	ipnet_t	*ipnet;
1333 
1334 	ipnet_walkers_inc(ips);
1335 	/* Send a SIGHUP to all open streams associated with this ipnetif. */
1336 	for (ipnet = list_head(&ips->ips_str_list); ipnet != NULL;
1337 	    ipnet = list_next(&ips->ips_str_list, ipnet)) {
1338 		if (ipnet->ipnet_if == ipnetif)
1339 			(void) putnextctl(ipnet->ipnet_rq, M_HANGUP);
1340 	}
1341 	ipnet_walkers_dec(ips);
1342 	mutex_enter(&ips->ips_avl_lock);
1343 	avl_remove(&ips->ips_avl_by_index, ipnetif);
1344 	avl_remove(&ips->ips_avl_by_name, ipnetif);
1345 	mutex_exit(&ips->ips_avl_lock);
1346 	/*
1347 	 * Now that the interface can't be found, do a BPF detach
1348 	 */
1349 	ipnet_bpfdetach(ipnetif);
1350 	/*
1351 	 * Release the reference we implicitly held in ipnetif_create().
1352 	 */
1353 	ipnetif_refrele(ipnetif);
1354 }
1355 
1356 static void
1357 ipnet_purge_addrlist(list_t *addrlist)
1358 {
1359 	ipnetif_addr_t	*ifa;
1360 
1361 	while ((ifa = list_head(addrlist)) != NULL) {
1362 		list_remove(addrlist, ifa);
1363 		if (ifa->ifa_shared != NULL)
1364 			ipnetif_clone_release(ifa->ifa_shared);
1365 		kmem_free(ifa, sizeof (*ifa));
1366 	}
1367 }
1368 
1369 static void
1370 ipnetif_free(ipnetif_t *ipnetif)
1371 {
1372 	ASSERT(ipnetif->if_refcnt == 0);
1373 	ASSERT(ipnetif->if_sharecnt == 0);
1374 
1375 	/* Remove IPv4/v6 address lists from the ipnetif */
1376 	ipnet_purge_addrlist(&ipnetif->if_ip4addr_list);
1377 	list_destroy(&ipnetif->if_ip4addr_list);
1378 	ipnet_purge_addrlist(&ipnetif->if_ip6addr_list);
1379 	list_destroy(&ipnetif->if_ip6addr_list);
1380 	mutex_destroy(&ipnetif->if_addr_lock);
1381 	mutex_destroy(&ipnetif->if_reflock);
1382 	if (ipnetif->if_dev != 0)
1383 		id_free(ipnet_minor_space, getminor(ipnetif->if_dev));
1384 	kmem_free(ipnetif, sizeof (*ipnetif));
1385 }
1386 
1387 /*
1388  * Create an ipnetif_addr_t with the given logical interface id (lif)
1389  * and add it to the supplied ipnetif.  The lif is the netinfo
1390  * representation of logical interface id, and we use this id to match
1391  * incoming netinfo events against our lists of addresses.
1392  */
1393 static void
1394 ipnet_add_ifaddr(uint64_t lif, ipnetif_t *ipnetif, net_handle_t nd)
1395 {
1396 	ipnetif_addr_t		*ifaddr;
1397 	zoneid_t		zoneid;
1398 	struct sockaddr_in	bcast;
1399 	struct sockaddr_storage	addr;
1400 	net_ifaddr_t		type = NA_ADDRESS;
1401 	uint64_t		phyif = ipnetif->if_index;
1402 
1403 	if (net_getlifaddr(nd, phyif, lif, 1, &type, &addr) != 0 ||
1404 	    net_getlifzone(nd, phyif, lif, &zoneid) != 0)
1405 		return;
1406 
1407 	if ((ifaddr = kmem_alloc(sizeof (*ifaddr), KM_NOSLEEP)) == NULL)
1408 		return;
1409 	ifaddr->ifa_zone = zoneid;
1410 	ifaddr->ifa_id = lif;
1411 	ifaddr->ifa_shared = NULL;
1412 
1413 	switch (addr.ss_family) {
1414 	case AF_INET:
1415 		ifaddr->ifa_ip4addr =
1416 		    ((struct sockaddr_in *)&addr)->sin_addr.s_addr;
1417 		/*
1418 		 * Try and get the broadcast address.  Note that it's okay for
1419 		 * an interface to not have a broadcast address, so we don't
1420 		 * fail the entire operation if net_getlifaddr() fails here.
1421 		 */
1422 		type = NA_BROADCAST;
1423 		if (net_getlifaddr(nd, phyif, lif, 1, &type, &bcast) == 0)
1424 			ifaddr->ifa_brdaddr = bcast.sin_addr.s_addr;
1425 		break;
1426 	case AF_INET6:
1427 		ifaddr->ifa_ip6addr = ((struct sockaddr_in6 *)&addr)->sin6_addr;
1428 		break;
1429 	}
1430 
1431 	/*
1432 	 * The zoneid stored in ipnetif_t needs to correspond to the actual
1433 	 * zone the address is being used in. This facilitates finding the
1434 	 * correct netstack_t pointer, amongst other things, later.
1435 	 */
1436 	if (zoneid == ALL_ZONES)
1437 		zoneid = GLOBAL_ZONEID;
1438 
1439 	mutex_enter(&ipnetif->if_addr_lock);
1440 	if (zoneid != ipnetif->if_zoneid) {
1441 		ipnetif_t *ifp2;
1442 
1443 		ifp2 = ipnetif_clone_create(ipnetif, zoneid);
1444 		ifaddr->ifa_shared = ifp2;
1445 	}
1446 	list_insert_tail(addr.ss_family == AF_INET ?
1447 	    &ipnetif->if_ip4addr_list : &ipnetif->if_ip6addr_list, ifaddr);
1448 	mutex_exit(&ipnetif->if_addr_lock);
1449 }
1450 
1451 static void
1452 ipnet_delete_ifaddr(ipnetif_addr_t *ifaddr, ipnetif_t *ipnetif, boolean_t isv6)
1453 {
1454 	mutex_enter(&ipnetif->if_addr_lock);
1455 	if (ifaddr->ifa_shared != NULL)
1456 		ipnetif_clone_release(ifaddr->ifa_shared);
1457 
1458 	list_remove(isv6 ?
1459 	    &ipnetif->if_ip6addr_list : &ipnetif->if_ip4addr_list, ifaddr);
1460 	mutex_exit(&ipnetif->if_addr_lock);
1461 	kmem_free(ifaddr, sizeof (*ifaddr));
1462 }
1463 
1464 static void
1465 ipnet_plumb_ev(ipnet_nicevent_t *ipne, ipnet_stack_t *ips, boolean_t isv6)
1466 {
1467 	ipnetif_t	*ipnetif;
1468 	boolean_t	refrele_needed = B_TRUE;
1469 	uint64_t	ifflags;
1470 	uint64_t	ifindex;
1471 	char		*ifname;
1472 
1473 	ifflags = 0;
1474 	ifname = ipne->ipne_ifname;
1475 	ifindex = ipne->ipne_ifindex;
1476 
1477 	(void) net_getlifflags(ipne->ipne_protocol, ifindex, 0, &ifflags);
1478 
1479 	if ((ipnetif = ipnetif_getby_index(ifindex, ips)) == NULL) {
1480 		ipnetif = ipnetif_create(ifname, ifindex, ips, ifflags);
1481 		refrele_needed = B_FALSE;
1482 	}
1483 	if (ipnetif != NULL) {
1484 		ipnetif->if_flags |=
1485 		    isv6 ? IPNETIF_IPV6PLUMBED : IPNETIF_IPV4PLUMBED;
1486 	}
1487 
1488 	if (ipnetif->if_multicnt != 0) {
1489 		if (ip_join_allmulti(ifindex, isv6,
1490 		    ips->ips_netstack->netstack_ip) == 0) {
1491 			ipnetif->if_flags |=
1492 			    isv6 ? IPNETIF_IPV6ALLMULTI : IPNETIF_IPV4ALLMULTI;
1493 		}
1494 	}
1495 
1496 	if (refrele_needed)
1497 		ipnetif_refrele(ipnetif);
1498 }
1499 
1500 static void
1501 ipnet_unplumb_ev(uint64_t ifindex, ipnet_stack_t *ips, boolean_t isv6)
1502 {
1503 	ipnetif_t	*ipnetif;
1504 
1505 	if ((ipnetif = ipnetif_getby_index(ifindex, ips)) == NULL)
1506 		return;
1507 
1508 	mutex_enter(&ipnetif->if_addr_lock);
1509 	ipnet_purge_addrlist(isv6 ?
1510 	    &ipnetif->if_ip6addr_list : &ipnetif->if_ip4addr_list);
1511 	mutex_exit(&ipnetif->if_addr_lock);
1512 
1513 	/*
1514 	 * Note that we have one ipnetif for both IPv4 and IPv6, but we receive
1515 	 * separate NE_UNPLUMB events for IPv4 and IPv6.  We remove the ipnetif
1516 	 * if both IPv4 and IPv6 interfaces have been unplumbed.
1517 	 */
1518 	ipnetif->if_flags &= isv6 ? ~IPNETIF_IPV6PLUMBED : ~IPNETIF_IPV4PLUMBED;
1519 	if (!(ipnetif->if_flags & (IPNETIF_IPV4PLUMBED | IPNETIF_IPV6PLUMBED)))
1520 		ipnetif_remove(ipnetif, ips);
1521 	ipnetif_refrele(ipnetif);
1522 }
1523 
1524 static void
1525 ipnet_lifup_ev(uint64_t ifindex, uint64_t lifindex, net_handle_t nd,
1526     ipnet_stack_t *ips, boolean_t isv6)
1527 {
1528 	ipnetif_t	*ipnetif;
1529 	ipnetif_addr_t	*ifaddr;
1530 
1531 	if ((ipnetif = ipnetif_getby_index(ifindex, ips)) == NULL)
1532 		return;
1533 	if ((ifaddr = ipnet_match_lif(ipnetif, lifindex, isv6)) != NULL) {
1534 		/*
1535 		 * We must have missed a NE_LIF_DOWN event.  Delete this
1536 		 * ifaddr and re-create it.
1537 		 */
1538 		ipnet_delete_ifaddr(ifaddr, ipnetif, isv6);
1539 	}
1540 
1541 	ipnet_add_ifaddr(lifindex, ipnetif, nd);
1542 	ipnetif_refrele(ipnetif);
1543 }
1544 
1545 static void
1546 ipnet_lifdown_ev(uint64_t ifindex, uint64_t lifindex, ipnet_stack_t *ips,
1547     boolean_t isv6)
1548 {
1549 	ipnetif_t	*ipnetif;
1550 	ipnetif_addr_t	*ifaddr;
1551 
1552 	if ((ipnetif = ipnetif_getby_index(ifindex, ips)) == NULL)
1553 		return;
1554 	if ((ifaddr = ipnet_match_lif(ipnetif, lifindex, isv6)) != NULL)
1555 		ipnet_delete_ifaddr(ifaddr, ipnetif, isv6);
1556 	ipnetif_refrele(ipnetif);
1557 	/*
1558 	 * Make sure that open streams on this ipnetif are still allowed to
1559 	 * have it open.
1560 	 */
1561 	ipnetif_zonecheck(ipnetif, ips);
1562 }
1563 
1564 /*
1565  * This callback from the NIC event framework dispatches a taskq as the event
1566  * handlers may block.
1567  */
1568 /* ARGSUSED */
1569 static int
1570 ipnet_nicevent_cb(hook_event_token_t token, hook_data_t info, void *arg)
1571 {
1572 	ipnet_stack_t		*ips = arg;
1573 	hook_nic_event_t	*hn = (hook_nic_event_t *)info;
1574 	ipnet_nicevent_t	*ipne;
1575 
1576 	if ((ipne = kmem_alloc(sizeof (ipnet_nicevent_t), KM_NOSLEEP)) == NULL)
1577 		return (0);
1578 	ipne->ipne_event = hn->hne_event;
1579 	ipne->ipne_protocol = hn->hne_protocol;
1580 	ipne->ipne_stackid = ips->ips_netstack->netstack_stackid;
1581 	ipne->ipne_ifindex = hn->hne_nic;
1582 	ipne->ipne_lifindex = hn->hne_lif;
1583 	if (hn->hne_datalen != 0) {
1584 		(void) strlcpy(ipne->ipne_ifname, hn->hne_data,
1585 		    sizeof (ipne->ipne_ifname));
1586 	}
1587 	(void) ddi_taskq_dispatch(ipnet_nicevent_taskq, ipnet_nicevent_task,
1588 	    ipne, DDI_NOSLEEP);
1589 	return (0);
1590 }
1591 
1592 static void
1593 ipnet_nicevent_task(void *arg)
1594 {
1595 	ipnet_nicevent_t	*ipne = arg;
1596 	netstack_t		*ns;
1597 	ipnet_stack_t		*ips;
1598 	boolean_t		isv6;
1599 
1600 	if ((ns = netstack_find_by_stackid(ipne->ipne_stackid)) == NULL)
1601 		goto done;
1602 	ips = ns->netstack_ipnet;
1603 	isv6 = (ipne->ipne_protocol == ips->ips_ndv6);
1604 
1605 	mutex_enter(&ips->ips_event_lock);
1606 	switch (ipne->ipne_event) {
1607 	case NE_PLUMB:
1608 		ipnet_plumb_ev(ipne, ips, isv6);
1609 		break;
1610 	case NE_UNPLUMB:
1611 		ipnet_unplumb_ev(ipne->ipne_ifindex, ips, isv6);
1612 		break;
1613 	case NE_LIF_UP:
1614 		ipnet_lifup_ev(ipne->ipne_ifindex, ipne->ipne_lifindex,
1615 		    ipne->ipne_protocol, ips, isv6);
1616 		break;
1617 	case NE_LIF_DOWN:
1618 		ipnet_lifdown_ev(ipne->ipne_ifindex, ipne->ipne_lifindex, ips,
1619 		    isv6);
1620 		break;
1621 	default:
1622 		break;
1623 	}
1624 	mutex_exit(&ips->ips_event_lock);
1625 done:
1626 	if (ns != NULL)
1627 		netstack_rele(ns);
1628 	kmem_free(ipne, sizeof (ipnet_nicevent_t));
1629 }
1630 
1631 dev_t
1632 ipnet_if_getdev(char *name, zoneid_t zoneid)
1633 {
1634 	netstack_t	*ns;
1635 	ipnet_stack_t	*ips;
1636 	ipnetif_t	*ipnetif;
1637 	dev_t		dev = (dev_t)-1;
1638 
1639 	if (is_system_labeled() && zoneid != GLOBAL_ZONEID)
1640 		return (dev);
1641 	if ((ns = netstack_find_by_zoneid(zoneid)) == NULL)
1642 		return (dev);
1643 
1644 	ips = ns->netstack_ipnet;
1645 	mutex_enter(&ips->ips_avl_lock);
1646 	if ((ipnetif = avl_find(&ips->ips_avl_by_name, name, NULL)) != NULL) {
1647 		if (ipnetif_in_zone(ipnetif, zoneid, ips))
1648 			dev = ipnetif->if_dev;
1649 	}
1650 	mutex_exit(&ips->ips_avl_lock);
1651 	netstack_rele(ns);
1652 
1653 	return (dev);
1654 }
1655 
1656 static ipnetif_t *
1657 ipnetif_getby_index(uint64_t id, ipnet_stack_t *ips)
1658 {
1659 	ipnetif_t	*ipnetif;
1660 
1661 	mutex_enter(&ips->ips_avl_lock);
1662 	if ((ipnetif = avl_find(&ips->ips_avl_by_index, &id, NULL)) != NULL)
1663 		ipnetif_refhold(ipnetif);
1664 	mutex_exit(&ips->ips_avl_lock);
1665 	return (ipnetif);
1666 }
1667 
1668 static ipnetif_t *
1669 ipnetif_getby_dev(dev_t dev, ipnet_stack_t *ips)
1670 {
1671 	ipnetif_t	*ipnetif;
1672 	avl_tree_t	*tree;
1673 
1674 	mutex_enter(&ips->ips_avl_lock);
1675 	tree = &ips->ips_avl_by_index;
1676 	for (ipnetif = avl_first(tree); ipnetif != NULL;
1677 	    ipnetif = avl_walk(tree, ipnetif, AVL_AFTER)) {
1678 		if (ipnetif->if_dev == dev) {
1679 			ipnetif_refhold(ipnetif);
1680 			break;
1681 		}
1682 	}
1683 	mutex_exit(&ips->ips_avl_lock);
1684 	return (ipnetif);
1685 }
1686 
1687 static ipnetif_addr_t *
1688 ipnet_match_lif(ipnetif_t *ipnetif, lif_if_t lid, boolean_t isv6)
1689 {
1690 	ipnetif_addr_t	*ifaddr;
1691 	list_t	*list;
1692 
1693 	mutex_enter(&ipnetif->if_addr_lock);
1694 	list = isv6 ? &ipnetif->if_ip6addr_list : &ipnetif->if_ip4addr_list;
1695 	for (ifaddr = list_head(list); ifaddr != NULL;
1696 	    ifaddr = list_next(list, ifaddr)) {
1697 		if (lid == ifaddr->ifa_id)
1698 			break;
1699 	}
1700 	mutex_exit(&ipnetif->if_addr_lock);
1701 	return (ifaddr);
1702 }
1703 
1704 /* ARGSUSED */
1705 static void *
1706 ipnet_stack_init(netstackid_t stackid, netstack_t *ns)
1707 {
1708 	ipnet_stack_t	*ips;
1709 
1710 	ips = kmem_zalloc(sizeof (*ips), KM_SLEEP);
1711 	ips->ips_netstack = ns;
1712 	mutex_init(&ips->ips_avl_lock, NULL, MUTEX_DEFAULT, 0);
1713 	avl_create(&ips->ips_avl_by_index, ipnetif_compare_index,
1714 	    sizeof (ipnetif_t), offsetof(ipnetif_t, if_avl_by_index));
1715 	avl_create(&ips->ips_avl_by_name, ipnetif_compare_name,
1716 	    sizeof (ipnetif_t), offsetof(ipnetif_t, if_avl_by_name));
1717 	avl_create(&ips->ips_avl_by_shared, ipnetif_compare_name_zone,
1718 	    sizeof (ipnetif_t), offsetof(ipnetif_t, if_avl_by_shared));
1719 	mutex_init(&ips->ips_walkers_lock, NULL, MUTEX_DEFAULT, NULL);
1720 	cv_init(&ips->ips_walkers_cv, NULL, CV_DRIVER, NULL);
1721 	list_create(&ips->ips_str_list, sizeof (ipnet_t),
1722 	    offsetof(ipnet_t, ipnet_next));
1723 	ipnet_register_netihook(ips);
1724 	return (ips);
1725 }
1726 
1727 /* ARGSUSED */
1728 static void
1729 ipnet_stack_fini(netstackid_t stackid, void *arg)
1730 {
1731 	ipnet_stack_t	*ips = arg;
1732 	ipnetif_t	*ipnetif, *nipnetif;
1733 
1734 	if (ips->ips_kstatp != NULL) {
1735 		zoneid_t zoneid;
1736 
1737 		zoneid = netstackid_to_zoneid(stackid);
1738 		net_kstat_delete(net_zoneidtonetid(zoneid), ips->ips_kstatp);
1739 	}
1740 	if (ips->ips_ndv4 != NULL) {
1741 		VERIFY(net_hook_unregister(ips->ips_ndv4, NH_NIC_EVENTS,
1742 		    ips->ips_nicevents) == 0);
1743 		VERIFY(net_protocol_release(ips->ips_ndv4) == 0);
1744 	}
1745 	if (ips->ips_ndv6 != NULL) {
1746 		VERIFY(net_hook_unregister(ips->ips_ndv6, NH_NIC_EVENTS,
1747 		    ips->ips_nicevents) == 0);
1748 		VERIFY(net_protocol_release(ips->ips_ndv6) == 0);
1749 	}
1750 	hook_free(ips->ips_nicevents);
1751 
1752 	for (ipnetif = avl_first(&ips->ips_avl_by_index); ipnetif != NULL;
1753 	    ipnetif = nipnetif) {
1754 		nipnetif = AVL_NEXT(&ips->ips_avl_by_index, ipnetif);
1755 		ipnetif_remove(ipnetif, ips);
1756 	}
1757 	avl_destroy(&ips->ips_avl_by_shared);
1758 	avl_destroy(&ips->ips_avl_by_index);
1759 	avl_destroy(&ips->ips_avl_by_name);
1760 	mutex_destroy(&ips->ips_avl_lock);
1761 	mutex_destroy(&ips->ips_walkers_lock);
1762 	cv_destroy(&ips->ips_walkers_cv);
1763 	list_destroy(&ips->ips_str_list);
1764 	kmem_free(ips, sizeof (*ips));
1765 }
1766 
1767 /* Do any of the addresses in addrlist belong the supplied zoneid? */
1768 static boolean_t
1769 ipnet_addrs_in_zone(list_t *addrlist, zoneid_t zoneid)
1770 {
1771 	ipnetif_addr_t	*ifa;
1772 
1773 	for (ifa = list_head(addrlist); ifa != NULL;
1774 	    ifa = list_next(addrlist, ifa)) {
1775 		if (ifa->ifa_zone == zoneid)
1776 			return (B_TRUE);
1777 	}
1778 	return (B_FALSE);
1779 }
1780 
1781 /* Should the supplied ipnetif be visible from the supplied zoneid? */
1782 static boolean_t
1783 ipnetif_in_zone(ipnetif_t *ipnetif, zoneid_t zoneid, ipnet_stack_t *ips)
1784 {
1785 	int	ret;
1786 
1787 	/*
1788 	 * The global zone has visibility into all interfaces in the global
1789 	 * stack, and exclusive stack zones have visibility into all
1790 	 * interfaces in their stack.
1791 	 */
1792 	if (zoneid == GLOBAL_ZONEID ||
1793 	    ips->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID)
1794 		return (B_TRUE);
1795 
1796 	/*
1797 	 * Shared-stack zones only have visibility for interfaces that have
1798 	 * addresses in their zone.
1799 	 */
1800 	mutex_enter(&ipnetif->if_addr_lock);
1801 	ret = ipnet_addrs_in_zone(&ipnetif->if_ip4addr_list, zoneid) ||
1802 	    ipnet_addrs_in_zone(&ipnetif->if_ip6addr_list, zoneid);
1803 	mutex_exit(&ipnetif->if_addr_lock);
1804 	return (ret);
1805 }
1806 
1807 /*
1808  * Verify that any ipnet_t that has a reference to the supplied ipnetif should
1809  * still be allowed to have it open.  A given ipnet_t may no longer be allowed
1810  * to have an ipnetif open if there are no longer any addresses that belong to
1811  * the ipnetif in the ipnet_t's non-global shared-stack zoneid.  If that's the
1812  * case, send the ipnet_t an M_HANGUP.
1813  */
1814 static void
1815 ipnetif_zonecheck(ipnetif_t *ipnetif, ipnet_stack_t *ips)
1816 {
1817 	list_t	*strlist = &ips->ips_str_list;
1818 	ipnet_t	*ipnet;
1819 
1820 	ipnet_walkers_inc(ips);
1821 	for (ipnet = list_head(strlist); ipnet != NULL;
1822 	    ipnet = list_next(strlist, ipnet)) {
1823 		if (ipnet->ipnet_if != ipnetif)
1824 			continue;
1825 		if (!ipnetif_in_zone(ipnetif, ipnet->ipnet_zoneid, ips))
1826 			(void) putnextctl(ipnet->ipnet_rq, M_HANGUP);
1827 	}
1828 	ipnet_walkers_dec(ips);
1829 }
1830 
1831 void
1832 ipnet_walk_if(ipnet_walkfunc_t *cb, void *arg, zoneid_t zoneid)
1833 {
1834 	ipnetif_t		*ipnetif;
1835 	list_t			cbdata;
1836 	ipnetif_cbdata_t	*cbnode;
1837 	netstack_t		*ns;
1838 	ipnet_stack_t		*ips;
1839 
1840 	/*
1841 	 * On labeled systems, non-global zones shouldn't see anything
1842 	 * in /dev/ipnet.
1843 	 */
1844 	if (is_system_labeled() && zoneid != GLOBAL_ZONEID)
1845 		return;
1846 
1847 	if ((ns = netstack_find_by_zoneid(zoneid)) == NULL)
1848 		return;
1849 
1850 	ips = ns->netstack_ipnet;
1851 	list_create(&cbdata, sizeof (ipnetif_cbdata_t),
1852 	    offsetof(ipnetif_cbdata_t, ic_next));
1853 
1854 	mutex_enter(&ips->ips_avl_lock);
1855 	for (ipnetif = avl_first(&ips->ips_avl_by_index); ipnetif != NULL;
1856 	    ipnetif = avl_walk(&ips->ips_avl_by_index, ipnetif, AVL_AFTER)) {
1857 		if (!ipnetif_in_zone(ipnetif, zoneid, ips))
1858 			continue;
1859 		cbnode = kmem_zalloc(sizeof (ipnetif_cbdata_t), KM_SLEEP);
1860 		(void) strlcpy(cbnode->ic_ifname, ipnetif->if_name, LIFNAMSIZ);
1861 		cbnode->ic_dev = ipnetif->if_dev;
1862 		list_insert_head(&cbdata, cbnode);
1863 	}
1864 	mutex_exit(&ips->ips_avl_lock);
1865 
1866 	while ((cbnode = list_head(&cbdata)) != NULL) {
1867 		cb(cbnode->ic_ifname, arg, cbnode->ic_dev);
1868 		list_remove(&cbdata, cbnode);
1869 		kmem_free(cbnode, sizeof (ipnetif_cbdata_t));
1870 	}
1871 	list_destroy(&cbdata);
1872 	netstack_rele(ns);
1873 }
1874 
1875 static int
1876 ipnetif_compare_index(const void *index_ptr, const void *ipnetifp)
1877 {
1878 	int64_t	index1 = *((int64_t *)index_ptr);
1879 	int64_t	index2 = (int64_t)((ipnetif_t *)ipnetifp)->if_index;
1880 
1881 	return (SIGNOF(index2 - index1));
1882 }
1883 
1884 static int
1885 ipnetif_compare_name(const void *name_ptr, const void *ipnetifp)
1886 {
1887 	int	res;
1888 
1889 	res = strcmp(((ipnetif_t *)ipnetifp)->if_name, name_ptr);
1890 	return (SIGNOF(res));
1891 }
1892 
1893 static int
1894 ipnetif_compare_name_zone(const void *key_ptr, const void *ipnetifp)
1895 {
1896 	const uintptr_t	*ptr = key_ptr;
1897 	const ipnetif_t	*ifp;
1898 	int		res;
1899 
1900 	ifp = ipnetifp;
1901 	res = ifp->if_zoneid - ptr[0];
1902 	if (res != 0)
1903 		return (SIGNOF(res));
1904 	res = strcmp(ifp->if_name, (char *)ptr[1]);
1905 	return (SIGNOF(res));
1906 }
1907 
1908 static void
1909 ipnetif_refhold(ipnetif_t *ipnetif)
1910 {
1911 	mutex_enter(&ipnetif->if_reflock);
1912 	ipnetif->if_refcnt++;
1913 	mutex_exit(&ipnetif->if_reflock);
1914 }
1915 
1916 static void
1917 ipnetif_refrele(ipnetif_t *ipnetif)
1918 {
1919 	mutex_enter(&ipnetif->if_reflock);
1920 	ASSERT(ipnetif->if_refcnt > 0);
1921 	if (--ipnetif->if_refcnt == 0)
1922 		ipnetif_free(ipnetif);
1923 	else
1924 		mutex_exit(&ipnetif->if_reflock);
1925 }
1926 
1927 static void
1928 ipnet_walkers_inc(ipnet_stack_t *ips)
1929 {
1930 	mutex_enter(&ips->ips_walkers_lock);
1931 	ips->ips_walkers_cnt++;
1932 	mutex_exit(&ips->ips_walkers_lock);
1933 }
1934 
1935 static void
1936 ipnet_walkers_dec(ipnet_stack_t *ips)
1937 {
1938 	mutex_enter(&ips->ips_walkers_lock);
1939 	ASSERT(ips->ips_walkers_cnt != 0);
1940 	if (--ips->ips_walkers_cnt == 0)
1941 		cv_broadcast(&ips->ips_walkers_cv);
1942 	mutex_exit(&ips->ips_walkers_lock);
1943 }
1944 
1945 /*ARGSUSED*/
1946 static int
1947 ipobs_bounce_func(hook_event_token_t token, hook_data_t info, void *arg)
1948 {
1949 	hook_pkt_observe_t	*hdr;
1950 	pfv_t			func = (pfv_t)arg;
1951 	mblk_t			*mp;
1952 
1953 	hdr = (hook_pkt_observe_t *)info;
1954 	mp = dupmsg(hdr->hpo_pkt);
1955 	if (mp == NULL) {
1956 		mp = copymsg(hdr->hpo_pkt);
1957 		if (mp == NULL)  {
1958 			netstack_t *ns = hdr->hpo_ctx;
1959 			ipnet_stack_t *ips = ns->netstack_ipnet;
1960 
1961 			IPSK_BUMP(ips, ik_dispatchDupDrop);
1962 			return (0);
1963 		}
1964 	}
1965 
1966 	hdr = (hook_pkt_observe_t *)mp->b_rptr;
1967 	hdr->hpo_pkt = mp;
1968 
1969 	func(mp);
1970 
1971 	return (0);
1972 }
1973 
1974 hook_t *
1975 ipobs_register_hook(netstack_t *ns, pfv_t func)
1976 {
1977 	ip_stack_t	*ipst = ns->netstack_ip;
1978 	char		name[32];
1979 	hook_t		*hook;
1980 
1981 	HOOK_INIT(hook, ipobs_bounce_func, "", (void *)func);
1982 	VERIFY(hook != NULL);
1983 
1984 	/*
1985 	 * To register multiple hooks with he same callback function,
1986 	 * a unique name is needed.
1987 	 */
1988 	(void) snprintf(name, sizeof (name), "ipobserve_%p", hook);
1989 	hook->h_name = strdup(name);
1990 
1991 	(void) net_hook_register(ipst->ips_ip4_observe_pr, NH_OBSERVE, hook);
1992 	(void) net_hook_register(ipst->ips_ip6_observe_pr, NH_OBSERVE, hook);
1993 
1994 	return (hook);
1995 }
1996 
1997 void
1998 ipobs_unregister_hook(netstack_t *ns, hook_t *hook)
1999 {
2000 	ip_stack_t	*ipst = ns->netstack_ip;
2001 
2002 	(void) net_hook_unregister(ipst->ips_ip4_observe_pr, NH_OBSERVE, hook);
2003 
2004 	(void) net_hook_unregister(ipst->ips_ip6_observe_pr, NH_OBSERVE, hook);
2005 
2006 	strfree(hook->h_name);
2007 
2008 	hook_free(hook);
2009 }
2010 
2011 /* ******************************************************************** */
2012 /* BPF Functions below							*/
2013 /* ******************************************************************** */
2014 
2015 /*
2016  * Convenience function to make mapping a zoneid to an ipnet_stack_t easy.
2017  */
2018 static ipnet_stack_t *
2019 ipnet_find_by_zoneid(zoneid_t zoneid)
2020 {
2021 	netstack_t	*ns;
2022 
2023 	VERIFY((ns = netstack_find_by_zoneid(zoneid)) != NULL);
2024 	return (ns->netstack_ipnet);
2025 }
2026 
2027 /*
2028  * Rather than weave the complexity of what needs to be done for a BPF
2029  * device attach or detach into the code paths of where they're used,
2030  * it is presented here in a couple of simple functions, along with
2031  * other similar code.
2032  *
2033  * The refrele/refhold here provide the means by which it is known
2034  * when the clone structures can be free'd.
2035  */
2036 static void
2037 ipnet_bpfdetach(ipnetif_t *ifp)
2038 {
2039 	if (ifp->if_stackp->ips_bpfdetach_fn != NULL) {
2040 		ifp->if_stackp->ips_bpfdetach_fn((uintptr_t)ifp);
2041 		ipnetif_refrele(ifp);
2042 	}
2043 }
2044 
2045 static void
2046 ipnet_bpfattach(ipnetif_t *ifp)
2047 {
2048 	if (ifp->if_stackp->ips_bpfattach_fn != NULL) {
2049 		ipnetif_refhold(ifp);
2050 		ifp->if_stackp->ips_bpfattach_fn((uintptr_t)ifp, DL_IPNET,
2051 		    ifp->if_zoneid, BPR_IPNET);
2052 	}
2053 }
2054 
2055 /*
2056  * Set the functions to call back to when adding or removing an interface so
2057  * that BPF can keep its internal list of these up to date.
2058  */
2059 void
2060 ipnet_set_bpfattach(bpf_attach_fn_t attach, bpf_detach_fn_t detach,
2061     zoneid_t zoneid, bpf_itap_fn_t tapfunc, bpf_provider_reg_fn_t provider)
2062 {
2063 	ipnet_stack_t	*ips;
2064 	ipnetif_t	*ipnetif;
2065 	avl_tree_t	*tree;
2066 	ipnetif_t	*next;
2067 
2068 	if (zoneid == GLOBAL_ZONEID) {
2069 		ipnet_itap = tapfunc;
2070 	}
2071 
2072 	VERIFY((ips = ipnet_find_by_zoneid(zoneid)) != NULL);
2073 
2074 	/*
2075 	 * If we're setting a new attach function, call it for every
2076 	 * mac that has already been attached.
2077 	 */
2078 	if (attach != NULL && ips->ips_bpfattach_fn == NULL) {
2079 		ASSERT(detach != NULL);
2080 		if (provider != NULL) {
2081 			(void) provider(&bpf_ipnet);
2082 		}
2083 		/*
2084 		 * The call to ipnet_bpfattach() calls into bpf`bpfattach
2085 		 * which then wants to resolve the link name into a link id.
2086 		 * For ipnet, this results in a call back to
2087 		 * ipnet_get_linkid_byname which also needs to lock and walk
2088 		 * the AVL tree. Thus the call to ipnet_bpfattach needs to
2089 		 * be made without the avl_lock held.
2090 		 */
2091 		mutex_enter(&ips->ips_event_lock);
2092 		ips->ips_bpfattach_fn = attach;
2093 		ips->ips_bpfdetach_fn = detach;
2094 		mutex_enter(&ips->ips_avl_lock);
2095 		tree = &ips->ips_avl_by_index;
2096 		for (ipnetif = avl_first(tree); ipnetif != NULL;
2097 		    ipnetif = next) {
2098 			ipnetif_refhold(ipnetif);
2099 			mutex_exit(&ips->ips_avl_lock);
2100 			ipnet_bpfattach(ipnetif);
2101 			mutex_enter(&ips->ips_avl_lock);
2102 			next = avl_walk(tree, ipnetif, AVL_AFTER);
2103 			ipnetif_refrele(ipnetif);
2104 		}
2105 		mutex_exit(&ips->ips_avl_lock);
2106 		ipnet_bpf_probe_shared(ips);
2107 		mutex_exit(&ips->ips_event_lock);
2108 
2109 	} else if (attach == NULL && ips->ips_bpfattach_fn != NULL) {
2110 		ASSERT(ips->ips_bpfdetach_fn != NULL);
2111 		mutex_enter(&ips->ips_event_lock);
2112 		ips->ips_bpfattach_fn = NULL;
2113 		mutex_enter(&ips->ips_avl_lock);
2114 		tree = &ips->ips_avl_by_index;
2115 		for (ipnetif = avl_first(tree); ipnetif != NULL;
2116 		    ipnetif = next) {
2117 			ipnetif_refhold(ipnetif);
2118 			mutex_exit(&ips->ips_avl_lock);
2119 			ipnet_bpfdetach((ipnetif_t *)ipnetif);
2120 			mutex_enter(&ips->ips_avl_lock);
2121 			next = avl_walk(tree, ipnetif, AVL_AFTER);
2122 			ipnetif_refrele(ipnetif);
2123 		}
2124 		mutex_exit(&ips->ips_avl_lock);
2125 		ipnet_bpf_release_shared(ips);
2126 		ips->ips_bpfdetach_fn = NULL;
2127 		mutex_exit(&ips->ips_event_lock);
2128 
2129 		if (provider != NULL) {
2130 			(void) provider(&bpf_ipnet);
2131 		}
2132 	}
2133 }
2134 
2135 /*
2136  * The list of interfaces available via ipnet is private for each zone,
2137  * so the AVL tree of each zone must be searched for a given name, even
2138  * if all names are unique.
2139  */
2140 int
2141 ipnet_open_byname(const char *name, ipnetif_t **ptr, zoneid_t zoneid)
2142 {
2143 	ipnet_stack_t	*ips;
2144 	ipnetif_t	*ipnetif;
2145 
2146 	ASSERT(ptr != NULL);
2147 	VERIFY((ips = ipnet_find_by_zoneid(zoneid)) != NULL);
2148 
2149 	mutex_enter(&ips->ips_avl_lock);
2150 	ipnetif = avl_find(&ips->ips_avl_by_name, (char *)name, NULL);
2151 	if (ipnetif != NULL) {
2152 		ipnetif_refhold(ipnetif);
2153 	}
2154 	mutex_exit(&ips->ips_avl_lock);
2155 
2156 	*ptr = ipnetif;
2157 
2158 	if (ipnetif == NULL)
2159 		return (ESRCH);
2160 	return (0);
2161 }
2162 
2163 void
2164 ipnet_close_byhandle(ipnetif_t *ifp)
2165 {
2166 	ASSERT(ifp != NULL);
2167 	ipnetif_refrele(ifp);
2168 }
2169 
2170 const char *
2171 ipnet_name(ipnetif_t *ifp)
2172 {
2173 	ASSERT(ifp != NULL);
2174 	return (ifp->if_name);
2175 }
2176 
2177 /*
2178  * To find the linkid for a given name, it is necessary to know which zone
2179  * the interface name belongs to and to search the avl tree for that zone
2180  * as there is no master list of all interfaces and which zone they belong
2181  * to. It is assumed that the caller of this function is somehow already
2182  * working with the ipnet interfaces and hence the ips_event_lock is held.
2183  * When BPF calls into this function, it is doing so because of an event
2184  * in ipnet, and thus ipnet holds the ips_event_lock. Thus the datalink id
2185  * value returned has meaning without the need for grabbing a hold on the
2186  * owning structure.
2187  */
2188 int
2189 ipnet_get_linkid_byname(const char *name, uint_t *idp, zoneid_t zoneid)
2190 {
2191 	ipnet_stack_t	*ips;
2192 	ipnetif_t	*ifp;
2193 
2194 	VERIFY((ips = ipnet_find_by_zoneid(zoneid)) != NULL);
2195 	ASSERT(mutex_owned(&ips->ips_event_lock));
2196 
2197 	mutex_enter(&ips->ips_avl_lock);
2198 	ifp = avl_find(&ips->ips_avl_by_name, (void *)name, NULL);
2199 	if (ifp != NULL)
2200 		*idp = (uint_t)ifp->if_index;
2201 
2202 	/*
2203 	 * Shared instance zone?
2204 	 */
2205 	if (netstackid_to_zoneid(zoneid_to_netstackid(zoneid)) != zoneid) {
2206 		uintptr_t key[2] = { zoneid, (uintptr_t)name };
2207 
2208 		ifp = avl_find(&ips->ips_avl_by_shared, (void *)key, NULL);
2209 		if (ifp != NULL)
2210 			*idp = (uint_t)ifp->if_index;
2211 	}
2212 
2213 	mutex_exit(&ips->ips_avl_lock);
2214 
2215 	if (ifp == NULL)
2216 		return (ESRCH);
2217 	return (0);
2218 }
2219 
2220 /*
2221  * Strictly speaking, there is no such thing as a "client" in ipnet, like
2222  * there is in mac. BPF only needs to have this because it is required as
2223  * part of interfacing correctly with mac. The reuse of the original
2224  * ipnetif_t as a client poses no danger, so long as it is done with its
2225  * own ref-count'd hold that is given up on close.
2226  */
2227 int
2228 ipnet_client_open(ipnetif_t *ptr, ipnetif_t **result)
2229 {
2230 	ASSERT(ptr != NULL);
2231 	ASSERT(result != NULL);
2232 	ipnetif_refhold(ptr);
2233 	*result = ptr;
2234 
2235 	return (0);
2236 }
2237 
2238 void
2239 ipnet_client_close(ipnetif_t *ptr)
2240 {
2241 	ASSERT(ptr != NULL);
2242 	ipnetif_refrele(ptr);
2243 }
2244 
2245 /*
2246  * This is called from BPF when it needs to start receiving packets
2247  * from ipnet.
2248  *
2249  * The use of the ipnet_t structure here is somewhat lightweight when
2250  * compared to how it is used elsewhere but it already has all of the
2251  * right fields in it, so reuse here doesn't seem out of order. Its
2252  * primary purpose here is to provide the means to store pointers for
2253  * use when ipnet_promisc_remove() needs to be called.
2254  *
2255  * This should never be called for the IPNET_MINOR_LO device as it is
2256  * never created via ipnetif_create.
2257  */
2258 /*ARGSUSED*/
2259 int
2260 ipnet_promisc_add(void *handle, uint_t how, void *data, uintptr_t *mhandle,
2261     int flags)
2262 {
2263 	ip_stack_t	*ipst;
2264 	netstack_t	*ns;
2265 	ipnetif_t	*ifp;
2266 	ipnet_t		*ipnet;
2267 	char		name[32];
2268 	int		error;
2269 
2270 	ifp = (ipnetif_t *)handle;
2271 	ns = netstack_find_by_zoneid(ifp->if_zoneid);
2272 
2273 	if ((how == DL_PROMISC_PHYS) || (how == DL_PROMISC_MULTI)) {
2274 		error = ipnet_join_allmulti(ifp, ns->netstack_ipnet);
2275 		if (error != 0)
2276 			return (error);
2277 	} else {
2278 		return (EINVAL);
2279 	}
2280 
2281 	ipnet = kmem_zalloc(sizeof (*ipnet), KM_SLEEP);
2282 	ipnet->ipnet_if = ifp;
2283 	ipnet->ipnet_ns = ns;
2284 	ipnet->ipnet_flags = flags;
2285 
2286 	if ((ifp->if_flags & IPNETIF_LOOPBACK) != 0) {
2287 		ipnet->ipnet_acceptfn = ipnet_loaccept;
2288 	} else {
2289 		ipnet->ipnet_acceptfn = ipnet_accept;
2290 	}
2291 
2292 	/*
2293 	 * To register multiple hooks with the same callback function,
2294 	 * a unique name is needed.
2295 	 */
2296 	HOOK_INIT(ipnet->ipnet_hook, ipnet_bpf_bounce, "", ipnet);
2297 	(void) snprintf(name, sizeof (name), "ipnet_promisc_%p",
2298 	    ipnet->ipnet_hook);
2299 	ipnet->ipnet_hook->h_name = strdup(name);
2300 	ipnet->ipnet_data = data;
2301 	ipnet->ipnet_zoneid = ifp->if_zoneid;
2302 
2303 	ipst = ns->netstack_ip;
2304 
2305 	error = net_hook_register(ipst->ips_ip4_observe_pr, NH_OBSERVE,
2306 	    ipnet->ipnet_hook);
2307 	if (error != 0)
2308 		goto regfail;
2309 
2310 	error = net_hook_register(ipst->ips_ip6_observe_pr, NH_OBSERVE,
2311 	    ipnet->ipnet_hook);
2312 	if (error != 0) {
2313 		(void) net_hook_unregister(ipst->ips_ip4_observe_pr,
2314 		    NH_OBSERVE, ipnet->ipnet_hook);
2315 		goto regfail;
2316 	}
2317 
2318 	*mhandle = (uintptr_t)ipnet;
2319 
2320 	return (0);
2321 
2322 regfail:
2323 	cmn_err(CE_WARN, "net_hook_register failed: %d", error);
2324 	strfree(ipnet->ipnet_hook->h_name);
2325 	hook_free(ipnet->ipnet_hook);
2326 	return (error);
2327 }
2328 
2329 void
2330 ipnet_promisc_remove(void *data)
2331 {
2332 	ip_stack_t	*ipst;
2333 	ipnet_t		*ipnet;
2334 	hook_t		*hook;
2335 
2336 	ipnet = data;
2337 	ipst = ipnet->ipnet_ns->netstack_ip;
2338 	hook = ipnet->ipnet_hook;
2339 
2340 	VERIFY(net_hook_unregister(ipst->ips_ip4_observe_pr, NH_OBSERVE,
2341 	    hook) == 0);
2342 
2343 	VERIFY(net_hook_unregister(ipst->ips_ip6_observe_pr, NH_OBSERVE,
2344 	    hook) == 0);
2345 
2346 	strfree(hook->h_name);
2347 
2348 	hook_free(hook);
2349 
2350 	kmem_free(ipnet, sizeof (*ipnet));
2351 }
2352 
2353 /*
2354  * arg here comes from the ipnet_t allocated in ipnet_promisc_add.
2355  * An important field from that structure is "ipnet_data" that
2356  * contains the "data" pointer passed into ipnet_promisc_add: it needs
2357  * to be passed back to bpf when we call into ipnet_itap.
2358  *
2359  * ipnet_itap is set by ipnet_set_bpfattach, which in turn is called
2360  * from BPF.
2361  */
2362 /*ARGSUSED*/
2363 static int
2364 ipnet_bpf_bounce(hook_event_token_t token, hook_data_t info, void *arg)
2365 {
2366 	hook_pkt_observe_t	*hdr;
2367 	ipnet_addrp_t		src;
2368 	ipnet_addrp_t		dst;
2369 	ipnet_stack_t		*ips;
2370 	ipnet_t			*ipnet;
2371 	mblk_t			*netmp;
2372 	mblk_t			*mp;
2373 
2374 	hdr = (hook_pkt_observe_t *)info;
2375 	mp = hdr->hpo_pkt;
2376 	ipnet = (ipnet_t *)arg;
2377 	ips = ((netstack_t *)hdr->hpo_ctx)->netstack_ipnet;
2378 
2379 	netmp = hdr->hpo_pkt->b_cont;
2380 	src.iap_family = hdr->hpo_family;
2381 	dst.iap_family = hdr->hpo_family;
2382 
2383 	if (hdr->hpo_family == AF_INET) {
2384 		src.iap_addr4 = &((ipha_t *)(netmp->b_rptr))->ipha_src;
2385 		dst.iap_addr4 = &((ipha_t *)(netmp->b_rptr))->ipha_dst;
2386 	} else {
2387 		src.iap_addr6 = &((ip6_t *)(netmp->b_rptr))->ip6_src;
2388 		dst.iap_addr6 = &((ip6_t *)(netmp->b_rptr))->ip6_dst;
2389 	}
2390 
2391 	if (!(*ipnet->ipnet_acceptfn)(ipnet, hdr, &src, &dst)) {
2392 		IPSK_BUMP(ips, ik_acceptFail);
2393 		return (0);
2394 	}
2395 	IPSK_BUMP(ips, ik_acceptOk);
2396 
2397 	ipnet_itap(ipnet->ipnet_data, mp,
2398 	    hdr->hpo_htype == IPOBS_HOOK_OUTBOUND,
2399 	    ntohs(hdr->hpo_pktlen) + (mp->b_wptr - mp->b_rptr));
2400 
2401 	return (0);
2402 }
2403 
2404 /*
2405  * clone'd ipnetif_t's are created when a shared IP instance zone comes
2406  * to life and configures an IP address. The model that BPF uses is that
2407  * each interface must have a unique pointer and each interface must be
2408  * representative of what it can capture. They are limited to one DLT
2409  * per interface and one zone per interface. Thus every interface that
2410  * can be seen in a zone must be announced via an attach to bpf. For
2411  * shared instance zones, this means the ipnet driver needs to detect
2412  * when an address is added to an interface in a zone for the first
2413  * time (and also when the last address is removed.)
2414  */
2415 static ipnetif_t *
2416 ipnetif_clone_create(ipnetif_t *ifp, zoneid_t zoneid)
2417 {
2418 	uintptr_t	key[2] = { zoneid, (uintptr_t)ifp->if_name };
2419 	ipnet_stack_t	*ips = ifp->if_stackp;
2420 	avl_index_t	where = 0;
2421 	ipnetif_t	*newif;
2422 
2423 	mutex_enter(&ips->ips_avl_lock);
2424 	newif = avl_find(&ips->ips_avl_by_shared, (void *)key, &where);
2425 	if (newif != NULL) {
2426 		ipnetif_refhold(newif);
2427 		newif->if_sharecnt++;
2428 		mutex_exit(&ips->ips_avl_lock);
2429 		return (newif);
2430 	}
2431 
2432 	newif = ipnet_alloc_if(ips);
2433 	if (newif == NULL) {
2434 		mutex_exit(&ips->ips_avl_lock);
2435 		return (NULL);
2436 	}
2437 
2438 	newif->if_refcnt = 1;
2439 	newif->if_sharecnt = 1;
2440 	newif->if_zoneid = zoneid;
2441 	(void) strlcpy(newif->if_name, ifp->if_name, LIFNAMSIZ);
2442 	newif->if_flags = ifp->if_flags & IPNETIF_LOOPBACK;
2443 	newif->if_index = ifp->if_index;
2444 
2445 	avl_insert(&ips->ips_avl_by_shared, newif, where);
2446 	mutex_exit(&ips->ips_avl_lock);
2447 
2448 	ipnet_bpfattach(newif);
2449 
2450 	return (newif);
2451 }
2452 
2453 static void
2454 ipnetif_clone_release(ipnetif_t *ipnetif)
2455 {
2456 	boolean_t	dofree = B_FALSE;
2457 	boolean_t	doremove = B_FALSE;
2458 	ipnet_stack_t	*ips = ipnetif->if_stackp;
2459 
2460 	mutex_enter(&ipnetif->if_reflock);
2461 	ASSERT(ipnetif->if_refcnt > 0);
2462 	if (--ipnetif->if_refcnt == 0)
2463 		dofree = B_TRUE;
2464 	ASSERT(ipnetif->if_sharecnt > 0);
2465 	if (--ipnetif->if_sharecnt == 0)
2466 		doremove = B_TRUE;
2467 	mutex_exit(&ipnetif->if_reflock);
2468 	if (doremove) {
2469 		mutex_enter(&ips->ips_avl_lock);
2470 		avl_remove(&ips->ips_avl_by_shared, ipnetif);
2471 		mutex_exit(&ips->ips_avl_lock);
2472 		ipnet_bpfdetach(ipnetif);
2473 	}
2474 	if (dofree) {
2475 		ASSERT(ipnetif->if_sharecnt == 0);
2476 		ipnetif_free(ipnetif);
2477 	}
2478 }
2479 
2480 /*
2481  * Called when BPF loads, the goal is to tell BPF about all of the interfaces
2482  * in use by zones that have a shared IP stack. These interfaces are stored
2483  * in the ips_avl_by_shared tree. Note that if there are 1000 bge0's in use
2484  * as bge0:1 through to bge0:1000, then this would be represented by a single
2485  * bge0 on that AVL tree.
2486  */
2487 static void
2488 ipnet_bpf_probe_shared(ipnet_stack_t *ips)
2489 {
2490 	ipnetif_t	*next;
2491 	ipnetif_t	*ifp;
2492 
2493 	mutex_enter(&ips->ips_avl_lock);
2494 
2495 	for (ifp = avl_first(&ips->ips_avl_by_shared); ifp != NULL;
2496 	    ifp = next) {
2497 		ipnetif_refhold(ifp);
2498 		mutex_exit(&ips->ips_avl_lock);
2499 		ipnet_bpfattach(ifp);
2500 		mutex_enter(&ips->ips_avl_lock);
2501 		next = avl_walk(&ips->ips_avl_by_shared, ifp, AVL_AFTER);
2502 		ipnetif_refrele(ifp);
2503 	}
2504 	mutex_exit(&ips->ips_avl_lock);
2505 }
2506 
2507 static void
2508 ipnet_bpf_release_shared(ipnet_stack_t *ips)
2509 {
2510 	ipnetif_t	*next;
2511 	ipnetif_t	*ifp;
2512 
2513 	mutex_enter(&ips->ips_avl_lock);
2514 
2515 	for (ifp = avl_first(&ips->ips_avl_by_shared); ifp != NULL;
2516 	    ifp = next) {
2517 		ipnetif_refhold(ifp);
2518 		mutex_exit(&ips->ips_avl_lock);
2519 		ipnet_bpfdetach(ifp);
2520 		mutex_enter(&ips->ips_avl_lock);
2521 		next = avl_walk(&ips->ips_avl_by_shared, ifp, AVL_AFTER);
2522 		ipnetif_refrele(ifp);
2523 	}
2524 	mutex_exit(&ips->ips_avl_lock);
2525 }
2526