xref: /illumos-gate/usr/src/uts/common/inet/ipnet/ipnet.c (revision a61ed2ce7a86a4d6428f2a83eb4739fae945447e)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  * Copyright (c) 2016 by Delphix. All rights reserved.
26  */
27 
28 /*
29  * Copyright (c) 2016, Joyent, Inc. All rights reserved.
30  */
31 
32 /*
33  * The ipnet device defined here provides access to packets at the IP layer. To
34  * provide access to packets at this layer it registers a callback function in
35  * the ip module and when there are open instances of the device ip will pass
36  * packets into the device. Packets from ip are passed on the input, output and
37  * loopback paths. Internally the module returns to ip as soon as possible by
38  * deferring processing using a taskq.
39  *
40  * Management of the devices in /dev/ipnet/ is handled by the devname
41  * filesystem and use of the neti interfaces.  This module registers for NIC
 * events using the neti framework so that when IP interfaces are brought up,
43  * taken down etc. the ipnet module is notified and its view of the interfaces
44  * configured on the system adjusted.  On attach, the module gets an initial
45  * view of the system again using the neti framework but as it has already
46  * registered for IP interface events, it is still up-to-date with any changes.
47  */
48 
49 #include <sys/types.h>
50 #include <sys/conf.h>
51 #include <sys/cred.h>
52 #include <sys/stat.h>
53 #include <sys/ddi.h>
54 #include <sys/sunddi.h>
55 #include <sys/modctl.h>
56 #include <sys/dlpi.h>
57 #include <sys/strsun.h>
58 #include <sys/id_space.h>
59 #include <sys/kmem.h>
60 #include <sys/mkdev.h>
61 #include <sys/neti.h>
62 #include <net/if.h>
63 #include <sys/errno.h>
64 #include <sys/list.h>
65 #include <sys/ksynch.h>
66 #include <sys/hook_event.h>
67 #include <sys/sdt.h>
68 #include <sys/stropts.h>
69 #include <sys/sysmacros.h>
70 #include <inet/ip.h>
71 #include <inet/ip_if.h>
72 #include <inet/ip_multi.h>
73 #include <inet/ip6.h>
74 #include <inet/ipnet.h>
75 #include <net/bpf.h>
76 #include <net/bpfdesc.h>
77 #include <net/dlt.h>
78 
/*
 * STREAMS module information for the ipnet driver.  The same structure is
 * referenced by both the read-side (ipnet_rinit) and write-side
 * (ipnet_winit) queue initialization tables.
 */
static struct module_info ipnet_minfo = {
	1,		/* mi_idnum */
	"ipnet",	/* mi_idname */
	0,		/* mi_minpsz */
	INFPSZ,		/* mi_maxpsz */
	2048,		/* mi_hiwat */
	0		/* mi_lowat */
};
87 
/*
 * List to hold static view of ipnetif_t's on the system. This is needed to
 * avoid holding the lock protecting the avl tree of ipnetif's over the
 * callback into the dev filesystem.
 *
 * Each entry is a snapshot of one interface: its name and a device number.
 * NOTE(review): the code that fills in and consumes these entries is not
 * in this part of the file; confirm field usage there.
 */
typedef struct ipnetif_cbdata {
	char		ic_ifname[LIFNAMSIZ];	/* interface name */
	dev_t		ic_dev;			/* device number for the name */
	list_node_t	ic_next;		/* list linkage */
} ipnetif_cbdata_t;
98 
/*
 * Convenience enumerated type for ipnet_accept().  It describes the
 * properties of a given ipnet_addrp_t relative to a single ipnet_t
 * client stream.  The values represent whether the address is ...
 *
 * Values of this type are produced by ipnet_get_addrtype(), which
 * ipnet_accept() calls once for the source and once for the destination
 * address of a packet.
 */
typedef enum {
	IPNETADDR_MYADDR,	/* an address on my ipnetif_t. */
	IPNETADDR_MBCAST,	/* a multicast or broadcast address. */
	IPNETADDR_UNKNOWN	/* none of the above. */
} ipnet_addrtype_t;
109 
/*
 * Argument used for the ipnet_nicevent_taskq callback.
 *
 * It carries a copy of the details of one NIC event (event type, protocol
 * handle, stack id, interface/logical-interface index and interface name).
 * NOTE(review): presumably filled in by ipnet_nicevent_cb() and consumed
 * by ipnet_nicevent_task(); neither is visible in this part of the file.
 */
typedef struct ipnet_nicevent_s {
	nic_event_t		ipne_event;
	net_handle_t		ipne_protocol;
	netstackid_t		ipne_stackid;
	uint64_t		ipne_ifindex;
	uint64_t		ipne_lifindex;
	char			ipne_ifname[LIFNAMSIZ];
} ipnet_nicevent_t;
119 
/* Driver devinfo node; set by ipnet_attach(), cleared by ipnet_detach(). */
static dev_info_t	*ipnet_dip;
/* Major number of the "ipnet" driver, looked up in _init(). */
static major_t		ipnet_major;
static ddi_taskq_t	*ipnet_taskq;		/* taskq for packets */
static ddi_taskq_t	*ipnet_nicevent_taskq;	/* taskq for NIC events */
/* Id space from which ipnet_open() allocates a minor for each open stream. */
static id_space_t	*ipnet_minor_space;
/* Minor number of the "lo0" minor node created by ipnet_attach(). */
static const int	IPNET_MINOR_LO = 1;
static const int 	IPNET_MINOR_MIN = 2; 	/* start of dynamic minors */
/* Template copied into every DL_INFO_ACK reply by ipnet_inforeq(). */
static dl_info_ack_t	ipnet_infoack = IPNET_INFO_ACK_INIT;
/* Per-stream packet filters; ipnet_open() installs one as ipnet_acceptfn. */
static ipnet_acceptfn_t	ipnet_accept, ipnet_loaccept;
static bpf_itap_fn_t	ipnet_itap;
130 
131 static void	ipnet_input(mblk_t *);
132 static int	ipnet_wput(queue_t *, mblk_t *);
133 static int	ipnet_rsrv(queue_t *);
134 static int	ipnet_open(queue_t *, dev_t *, int, int, cred_t *);
135 static int	ipnet_close(queue_t *, int, cred_t *);
136 static void	ipnet_ioctl(queue_t *, mblk_t *);
137 static void	ipnet_iocdata(queue_t *, mblk_t *);
138 static void 	ipnet_wputnondata(queue_t *, mblk_t *);
139 static int	ipnet_attach(dev_info_t *, ddi_attach_cmd_t);
140 static int	ipnet_detach(dev_info_t *, ddi_detach_cmd_t);
141 static int	ipnet_devinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
142 static void	ipnet_inforeq(queue_t *q, mblk_t *mp);
143 static void	ipnet_bindreq(queue_t *q, mblk_t *mp);
144 static void	ipnet_unbindreq(queue_t *q, mblk_t *mp);
145 static void	ipnet_dlpromisconreq(queue_t *q, mblk_t *mp);
146 static void	ipnet_dlpromiscoffreq(queue_t *q, mblk_t *mp);
147 static int	ipnet_join_allmulti(ipnetif_t *, ipnet_stack_t *);
148 static void	ipnet_leave_allmulti(ipnetif_t *, ipnet_stack_t *);
149 static int	ipnet_nicevent_cb(hook_event_token_t, hook_data_t, void *);
150 static void	ipnet_nicevent_task(void *);
151 static ipnetif_t *ipnetif_create(const char *, uint64_t, ipnet_stack_t *,
152     uint64_t);
153 static void	ipnetif_remove(ipnetif_t *, ipnet_stack_t *);
154 static ipnetif_addr_t *ipnet_match_lif(ipnetif_t *, lif_if_t, boolean_t);
155 static ipnetif_t *ipnetif_getby_index(uint64_t, ipnet_stack_t *);
156 static ipnetif_t *ipnetif_getby_dev(dev_t, ipnet_stack_t *);
157 static boolean_t ipnetif_in_zone(ipnetif_t *, zoneid_t, ipnet_stack_t *);
158 static void	ipnetif_zonecheck(ipnetif_t *, ipnet_stack_t *);
159 static int	ipnet_populate_if(net_handle_t, ipnet_stack_t *, boolean_t);
160 static int 	ipnetif_compare_name(const void *, const void *);
161 static int 	ipnetif_compare_name_zone(const void *, const void *);
162 static int 	ipnetif_compare_index(const void *, const void *);
163 static void	ipnet_add_ifaddr(uint64_t, ipnetif_t *, net_handle_t);
164 static void	ipnet_delete_ifaddr(ipnetif_addr_t *, ipnetif_t *, boolean_t);
165 static void	ipnetif_refhold(ipnetif_t *);
166 static void	ipnetif_refrele(ipnetif_t *);
167 static void	ipnet_walkers_inc(ipnet_stack_t *);
168 static void	ipnet_walkers_dec(ipnet_stack_t *);
169 static void	ipnet_register_netihook(ipnet_stack_t *);
170 static void	*ipnet_stack_init(netstackid_t, netstack_t *);
171 static void	ipnet_stack_fini(netstackid_t, void *);
172 static void	ipnet_dispatch(void *);
173 static int	ipobs_bounce_func(hook_event_token_t, hook_data_t, void *);
174 static int	ipnet_bpf_bounce(hook_event_token_t, hook_data_t, void *);
175 static ipnetif_t *ipnetif_clone_create(ipnetif_t *, zoneid_t);
176 static void	ipnetif_clone_release(ipnetif_t *);
177 
/*
 * Read-side queue entry points.  There is no read put procedure; queued
 * messages are drained by the service routine, and open/close live here.
 */
static struct qinit ipnet_rinit = {
	NULL,		/* qi_putp */
	ipnet_rsrv,	/* qi_srvp */
	ipnet_open,	/* qi_qopen */
	ipnet_close,	/* qi_qclose */
	NULL,		/* qi_qadmin */
	&ipnet_minfo,	/* qi_minfo */
};

/* Write-side queue entry points: only a put procedure is provided. */
static struct qinit ipnet_winit = {
	ipnet_wput,	/* qi_putp */
	NULL,		/* qi_srvp */
	NULL,		/* qi_qopen */
	NULL,		/* qi_qclose */
	NULL,		/* qi_qadmin */
	&ipnet_minfo,	/* qi_minfo */
};

/* STREAMS table tying the read and write queue initializers together. */
static struct streamtab ipnet_info = {
	&ipnet_rinit, &ipnet_winit
};

/* Device operations: attach/detach/getinfo plus the streamtab above. */
DDI_DEFINE_STREAM_OPS(ipnet_ops, nulldev, nulldev, ipnet_attach,
    ipnet_detach, nodev, ipnet_devinfo, D_MP | D_MTPERMOD, &ipnet_info,
    ddi_quiesce_not_supported);

/* Loadable driver linkage used by _init(), _fini() and _info(). */
static struct modldrv modldrv = {
	&mod_driverops,
	"STREAMS ipnet driver",
	&ipnet_ops
};

static struct modlinkage modlinkage = {
	MODREV_1, &modldrv, NULL
};
213 
/*
 * This structure contains the template data (names and type) that is
 * copied, in bulk, into the new kstats structure created by net_kstat_create.
 * No actual statistical information is stored in this instance of the
 * ipnet_kstats_t structure.
 *
 * The initializers are positional, so their order must match the order in
 * which the members of ipnet_kstats_t are declared.
 */
static ipnet_kstats_t stats_template = {
	{ "duplicationFail",	KSTAT_DATA_UINT64 },
	{ "dispatchOk",		KSTAT_DATA_UINT64 },
	{ "dispatchFail",	KSTAT_DATA_UINT64 },
	{ "dispatchHeaderDrop",	KSTAT_DATA_UINT64 },
	{ "dispatchDupDrop",	KSTAT_DATA_UINT64 },
	{ "dispatchDeliver",	KSTAT_DATA_UINT64 },
	{ "acceptOk",		KSTAT_DATA_UINT64 },
	{ "acceptFail",		KSTAT_DATA_UINT64 }
};
230 
231 /*
232  * Walk the list of physical interfaces on the machine, for each
233  * interface create a new ipnetif_t and add any addresses to it. We
234  * need to do the walk twice, once for IPv4 and once for IPv6.
235  *
236  * The interfaces are destroyed as part of ipnet_stack_fini() for each
237  * stack.  Note that we cannot do this initialization in
238  * ipnet_stack_init(), since ipnet_stack_init() cannot fail.
239  */
240 static int
241 ipnetif_init(void)
242 {
243 	netstack_handle_t	nh;
244 	netstack_t		*ns;
245 	ipnet_stack_t		*ips;
246 	int			ret = 0;
247 
248 	netstack_next_init(&nh);
249 	while ((ns = netstack_next(&nh)) != NULL) {
250 		ips = ns->netstack_ipnet;
251 		if ((ret = ipnet_populate_if(ips->ips_ndv4, ips, B_FALSE)) == 0)
252 			ret = ipnet_populate_if(ips->ips_ndv6, ips, B_TRUE);
253 		netstack_rele(ns);
254 		if (ret != 0)
255 			break;
256 	}
257 	netstack_next_fini(&nh);
258 	return (ret);
259 }
260 
261 /*
262  * Standard module entry points.
263  */
264 int
265 _init(void)
266 {
267 	int		ret;
268 	boolean_t	netstack_registered = B_FALSE;
269 
270 	if ((ipnet_major = ddi_name_to_major("ipnet")) == (major_t)-1)
271 		return (ENODEV);
272 	ipnet_minor_space = id_space_create("ipnet_minor_space",
273 	    IPNET_MINOR_MIN, MAXMIN32);
274 
275 	/*
276 	 * We call ddi_taskq_create() with nthread == 1 to ensure in-order
277 	 * delivery of packets to clients.  Note that we need to create the
278 	 * taskqs before calling netstack_register() since ipnet_stack_init()
279 	 * registers callbacks that use 'em.
280 	 */
281 	ipnet_taskq = ddi_taskq_create(NULL, "ipnet", 1, TASKQ_DEFAULTPRI, 0);
282 	ipnet_nicevent_taskq = ddi_taskq_create(NULL, "ipnet_nic_event_queue",
283 	    1, TASKQ_DEFAULTPRI, 0);
284 	if (ipnet_taskq == NULL || ipnet_nicevent_taskq == NULL) {
285 		ret = ENOMEM;
286 		goto done;
287 	}
288 
289 	netstack_register(NS_IPNET, ipnet_stack_init, NULL, ipnet_stack_fini);
290 	netstack_registered = B_TRUE;
291 
292 	if ((ret = ipnetif_init()) == 0)
293 		ret = mod_install(&modlinkage);
294 done:
295 	if (ret != 0) {
296 		if (ipnet_taskq != NULL)
297 			ddi_taskq_destroy(ipnet_taskq);
298 		if (ipnet_nicevent_taskq != NULL)
299 			ddi_taskq_destroy(ipnet_nicevent_taskq);
300 		if (netstack_registered)
301 			netstack_unregister(NS_IPNET);
302 		id_space_destroy(ipnet_minor_space);
303 	}
304 	return (ret);
305 }
306 
307 int
308 _fini(void)
309 {
310 	int	err;
311 
312 	if ((err = mod_remove(&modlinkage)) != 0)
313 		return (err);
314 
315 	netstack_unregister(NS_IPNET);
316 	ddi_taskq_destroy(ipnet_nicevent_taskq);
317 	ddi_taskq_destroy(ipnet_taskq);
318 	id_space_destroy(ipnet_minor_space);
319 	return (0);
320 }
321 
/*
 * Module information entry point: reports on this module by handing the
 * module linkage and the caller's modinfo buffer to mod_info().
 */
int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}
327 
328 static void
329 ipnet_register_netihook(ipnet_stack_t *ips)
330 {
331 	int		ret;
332 	zoneid_t	zoneid;
333 	netid_t		netid;
334 
335 	HOOK_INIT(ips->ips_nicevents, ipnet_nicevent_cb, "ipnet_nicevents",
336 	    ips);
337 
338 	/*
339 	 * It is possible for an exclusive stack to be in the process of
340 	 * shutting down here, and the netid and protocol lookups could fail
341 	 * in that case.
342 	 */
343 	zoneid = netstackid_to_zoneid(ips->ips_netstack->netstack_stackid);
344 	if ((netid = net_zoneidtonetid(zoneid)) == -1)
345 		return;
346 
347 	if ((ips->ips_ndv4 = net_protocol_lookup(netid, NHF_INET)) != NULL) {
348 		if ((ret = net_hook_register(ips->ips_ndv4, NH_NIC_EVENTS,
349 		    ips->ips_nicevents)) != 0) {
350 			VERIFY(net_protocol_release(ips->ips_ndv4) == 0);
351 			ips->ips_ndv4 = NULL;
352 			cmn_err(CE_WARN, "unable to register IPv4 netinfo hooks"
353 			    " in zone %d: %d", zoneid, ret);
354 		}
355 	}
356 	if ((ips->ips_ndv6 = net_protocol_lookup(netid, NHF_INET6)) != NULL) {
357 		if ((ret = net_hook_register(ips->ips_ndv6, NH_NIC_EVENTS,
358 		    ips->ips_nicevents)) != 0) {
359 			VERIFY(net_protocol_release(ips->ips_ndv6) == 0);
360 			ips->ips_ndv6 = NULL;
361 			cmn_err(CE_WARN, "unable to register IPv6 netinfo hooks"
362 			    " in zone %d: %d", zoneid, ret);
363 		}
364 	}
365 
366 	/*
367 	 * Create a local set of kstats for each zone.
368 	 */
369 	ips->ips_kstatp = net_kstat_create(netid, "ipnet", 0, "ipnet_stats",
370 	    "misc", KSTAT_TYPE_NAMED,
371 	    sizeof (ipnet_kstats_t) / sizeof (kstat_named_t), 0);
372 	if (ips->ips_kstatp != NULL) {
373 		bcopy(&stats_template, &ips->ips_stats,
374 		    sizeof (ips->ips_stats));
375 		ips->ips_kstatp->ks_data = &ips->ips_stats;
376 		ips->ips_kstatp->ks_private =
377 		    (void *)(uintptr_t)ips->ips_netstack->netstack_stackid;
378 		kstat_install(ips->ips_kstatp);
379 	} else {
380 		cmn_err(CE_WARN, "net_kstat_create(%s,%s,%s) failed",
381 		    "ipnet", "ipnet_stats", "misc");
382 	}
383 }
384 
385 /*
386  * This function is called on attach to build an initial view of the
387  * interfaces on the system. It will be called once for IPv4 and once
388  * for IPv6, although there is only one ipnet interface for both IPv4
389  * and IPv6 there are separate address lists.
390  */
391 static int
392 ipnet_populate_if(net_handle_t nd, ipnet_stack_t *ips, boolean_t isv6)
393 {
394 	phy_if_t	phyif;
395 	lif_if_t	lif;
396 	ipnetif_t	*ipnetif;
397 	char		name[LIFNAMSIZ];
398 	boolean_t	new_if = B_FALSE;
399 	uint64_t	ifflags;
400 	int		ret = 0;
401 
402 	/*
403 	 * If ipnet_register_netihook() was unable to initialize this
404 	 * stack's net_handle_t, then we cannot populate any interface
405 	 * information.  This usually happens when we attempted to
406 	 * grab a net_handle_t as a stack was shutting down.  We don't
407 	 * want to fail the entire _init() operation because of a
408 	 * stack shutdown (other stacks will continue to work just
409 	 * fine), so we silently return success here.
410 	 */
411 	if (nd == NULL)
412 		return (0);
413 
414 	/*
415 	 * Make sure we're not processing NIC events during the
416 	 * population of our interfaces and address lists.
417 	 */
418 	mutex_enter(&ips->ips_event_lock);
419 
420 	for (phyif = net_phygetnext(nd, 0); phyif != 0;
421 	    phyif = net_phygetnext(nd, phyif)) {
422 		if (net_getifname(nd, phyif, name, LIFNAMSIZ) != 0)
423 			continue;
424 		ifflags =  0;
425 		(void) net_getlifflags(nd, phyif, 0, &ifflags);
426 		if ((ipnetif = ipnetif_getby_index(phyif, ips)) == NULL) {
427 			ipnetif = ipnetif_create(name, phyif, ips, ifflags);
428 			if (ipnetif == NULL) {
429 				ret = ENOMEM;
430 				goto done;
431 			}
432 			new_if = B_TRUE;
433 		}
434 		ipnetif->if_flags |=
435 		    isv6 ? IPNETIF_IPV6PLUMBED : IPNETIF_IPV4PLUMBED;
436 
437 		for (lif = net_lifgetnext(nd, phyif, 0); lif != 0;
438 		    lif = net_lifgetnext(nd, phyif, lif)) {
439 			/*
440 			 * Skip addresses that aren't up.  We'll add
441 			 * them when we receive an NE_LIF_UP event.
442 			 */
443 			if (net_getlifflags(nd, phyif, lif, &ifflags) != 0 ||
444 			    !(ifflags & IFF_UP))
445 				continue;
446 			/* Don't add it if we already have it. */
447 			if (ipnet_match_lif(ipnetif, lif, isv6) != NULL)
448 				continue;
449 			ipnet_add_ifaddr(lif, ipnetif, nd);
450 		}
451 		if (!new_if)
452 			ipnetif_refrele(ipnetif);
453 	}
454 
455 done:
456 	mutex_exit(&ips->ips_event_lock);
457 	return (ret);
458 }
459 
460 static int
461 ipnet_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
462 {
463 	if (cmd != DDI_ATTACH)
464 		return (DDI_FAILURE);
465 
466 	if (ddi_create_minor_node(dip, "lo0", S_IFCHR, IPNET_MINOR_LO,
467 	    DDI_PSEUDO, 0) == DDI_FAILURE)
468 		return (DDI_FAILURE);
469 
470 	ipnet_dip = dip;
471 	return (DDI_SUCCESS);
472 }
473 
474 static int
475 ipnet_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
476 {
477 	if (cmd != DDI_DETACH)
478 		return (DDI_FAILURE);
479 
480 	ASSERT(dip == ipnet_dip);
481 	ddi_remove_minor_node(ipnet_dip, NULL);
482 	ipnet_dip = NULL;
483 	return (DDI_SUCCESS);
484 }
485 
486 /* ARGSUSED */
487 static int
488 ipnet_devinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
489 {
490 	int	error = DDI_FAILURE;
491 
492 	switch (infocmd) {
493 	case DDI_INFO_DEVT2INSTANCE:
494 		*result = (void *)0;
495 		error = DDI_SUCCESS;
496 		break;
497 	case DDI_INFO_DEVT2DEVINFO:
498 		if (ipnet_dip != NULL) {
499 			*result = ipnet_dip;
500 			error = DDI_SUCCESS;
501 		}
502 		break;
503 	}
504 	return (error);
505 }
506 
/*
 * STREAMS open entry point.  Allocates the per-stream ipnet_t, binds it to
 * the caller's netstack and zone, assigns it a fresh minor number and adds
 * it to the stack's list of open streams.  Opening the "lo0" minor puts
 * the stream in loopback mode; any other minor is resolved to an ipnetif_t
 * which must be visible in the caller's zone.
 *
 * Returns 0 on success, or EACCES (labeled system, non-global zone),
 * ENOTSUP (module open), EBUSY (re-open), ENOMEM (allocation failure) or
 * ENODEV (no such interface for this zone).
 */
/* ARGSUSED */
static int
ipnet_open(queue_t *rq, dev_t *dev, int oflag, int sflag, cred_t *crp)
{
	ipnet_t		*ipnet;
	netstack_t	*ns = NULL;
	ipnet_stack_t	*ips;
	int		err = 0;
	zoneid_t	zoneid = crgetzoneid(crp);

	/*
	 * If the system is labeled, only the global zone is allowed to open
	 * IP observability nodes.
	 */
	if (is_system_labeled() && zoneid != GLOBAL_ZONEID)
		return (EACCES);

	/* We don't support open as a module */
	if (sflag & MODOPEN)
		return (ENOTSUP);

	/* This driver is self-cloning, we don't support re-open. */
	if (rq->q_ptr != NULL)
		return (EBUSY);

	if ((ipnet = kmem_zalloc(sizeof (*ipnet), KM_NOSLEEP)) == NULL)
		return (ENOMEM);

	/* The netstack hold taken here is kept for the life of the stream. */
	VERIFY((ns = netstack_find_by_cred(crp)) != NULL);
	ips = ns->netstack_ipnet;

	rq->q_ptr = WR(rq)->q_ptr = ipnet;
	ipnet->ipnet_rq = rq;
	ipnet->ipnet_minor = (minor_t)id_alloc(ipnet_minor_space);
	ipnet->ipnet_zoneid = zoneid;
	ipnet->ipnet_dlstate = DL_UNBOUND;
	ipnet->ipnet_ns = ns;

	/*
	 * We need to hold ips_event_lock here as any NE_LIF_DOWN events need
	 * to be processed after ipnet_if is set and the ipnet_t has been
	 * inserted in the ips_str_list.
	 */
	mutex_enter(&ips->ips_event_lock);
	if (getminor(*dev) == IPNET_MINOR_LO) {
		ipnet->ipnet_flags |= IPNET_LOMODE;
		ipnet->ipnet_acceptfn = ipnet_loaccept;
	} else {
		ipnet->ipnet_acceptfn = ipnet_accept;
		/*
		 * The error path below releases a non-NULL ipnet_if, so the
		 * lookup presumably returns a held ipnetif_t.
		 */
		ipnet->ipnet_if = ipnetif_getby_dev(*dev, ips);
		if (ipnet->ipnet_if == NULL ||
		    !ipnetif_in_zone(ipnet->ipnet_if, zoneid, ips)) {
			err = ENODEV;
			goto done;
		}
	}

	/* Wait until nobody is walking the stream list before changing it. */
	mutex_enter(&ips->ips_walkers_lock);
	while (ips->ips_walkers_cnt != 0)
		cv_wait(&ips->ips_walkers_cv, &ips->ips_walkers_lock);
	list_insert_head(&ips->ips_str_list, ipnet);
	/* Hand the newly allocated minor back to the caller (self-clone). */
	*dev = makedevice(getmajor(*dev), ipnet->ipnet_minor);
	qprocson(rq);

	/*
	 * Only register our callback if we're the first open client; we call
	 * unregister in close() for the last open client.
	 */
	if (list_head(&ips->ips_str_list) == list_tail(&ips->ips_str_list))
		ips->ips_hook = ipobs_register_hook(ns, ipnet_input);
	mutex_exit(&ips->ips_walkers_lock);

done:
	mutex_exit(&ips->ips_event_lock);
	if (err != 0) {
		/*
		 * Undo everything acquired above.
		 * NOTE(review): rq->q_ptr and WR(rq)->q_ptr still point at
		 * the ipnet_t freed here; confirm the framework never looks
		 * at q_ptr after a failed open.
		 */
		netstack_rele(ns);
		id_free(ipnet_minor_space, ipnet->ipnet_minor);
		if (ipnet->ipnet_if != NULL)
			ipnetif_refrele(ipnet->ipnet_if);
		kmem_free(ipnet, sizeof (*ipnet));
	}
	return (err);
}
590 
/*
 * STREAMS close entry point.  Drops any all-multicast references the
 * stream still holds, removes the stream from the stack's list of open
 * streams, releases its interface reference and minor number, unregisters
 * the observability hook when the last stream goes away, and finally frees
 * the ipnet_t and releases the netstack.  Always returns 0.
 */
/* ARGSUSED */
static int
ipnet_close(queue_t *rq, int flags __unused, cred_t *credp __unused)
{
	ipnet_t		*ipnet = rq->q_ptr;
	ipnet_stack_t	*ips = ipnet->ipnet_ns->netstack_ipnet;

	/* One all-multi reference is dropped per promiscuous flag still set. */
	if (ipnet->ipnet_flags & IPNET_PROMISC_PHYS)
		ipnet_leave_allmulti(ipnet->ipnet_if, ips);
	if (ipnet->ipnet_flags & IPNET_PROMISC_MULTI)
		ipnet_leave_allmulti(ipnet->ipnet_if, ips);

	/* Wait until nobody is walking the stream list before changing it. */
	mutex_enter(&ips->ips_walkers_lock);
	while (ips->ips_walkers_cnt != 0)
		cv_wait(&ips->ips_walkers_cv, &ips->ips_walkers_lock);

	qprocsoff(rq);

	list_remove(&ips->ips_str_list, ipnet);
	/* ipnet_if is NULL for loopback-mode streams (see ipnet_open()). */
	if (ipnet->ipnet_if != NULL)
		ipnetif_refrele(ipnet->ipnet_if);
	id_free(ipnet_minor_space, ipnet->ipnet_minor);

	/* Last client gone: undo the registration done by the first open. */
	if (list_is_empty(&ips->ips_str_list)) {
		ipobs_unregister_hook(ips->ips_netstack, ips->ips_hook);
		ips->ips_hook = NULL;
	}

	kmem_free(ipnet, sizeof (*ipnet));

	mutex_exit(&ips->ips_walkers_lock);
	/*
	 * ipnet has been freed, so the netstack is reached through ips.
	 * This presumably balances the hold taken in ipnet_open().
	 */
	netstack_rele(ips->ips_netstack);
	return (0);
}
625 
626 static int
627 ipnet_wput(queue_t *q, mblk_t *mp)
628 {
629 	switch (mp->b_datap->db_type) {
630 	case M_FLUSH:
631 		if (*mp->b_rptr & FLUSHW) {
632 			flushq(q, FLUSHDATA);
633 			*mp->b_rptr &= ~FLUSHW;
634 		}
635 		if (*mp->b_rptr & FLUSHR)
636 			qreply(q, mp);
637 		else
638 			freemsg(mp);
639 		break;
640 	case M_PROTO:
641 	case M_PCPROTO:
642 		ipnet_wputnondata(q, mp);
643 		break;
644 	case M_IOCTL:
645 		ipnet_ioctl(q, mp);
646 		break;
647 	case M_IOCDATA:
648 		ipnet_iocdata(q, mp);
649 		break;
650 	default:
651 		freemsg(mp);
652 		break;
653 	}
654 	return (0);
655 }
656 
657 static int
658 ipnet_rsrv(queue_t *q)
659 {
660 	mblk_t	*mp;
661 
662 	while ((mp = getq(q)) != NULL) {
663 		ASSERT(DB_TYPE(mp) == M_DATA);
664 		if (canputnext(q)) {
665 			putnext(q, mp);
666 		} else {
667 			(void) putbq(q, mp);
668 			break;
669 		}
670 	}
671 	return (0);
672 }
673 
674 static void
675 ipnet_ioctl(queue_t *q, mblk_t *mp)
676 {
677 	struct iocblk	*iocp = (struct iocblk *)mp->b_rptr;
678 
679 	switch (iocp->ioc_cmd) {
680 	case DLIOCRAW:
681 		miocack(q, mp, 0, 0);
682 		break;
683 	case DLIOCIPNETINFO:
684 		if (iocp->ioc_count == TRANSPARENT) {
685 			mcopyin(mp, NULL, sizeof (uint_t), NULL);
686 			qreply(q, mp);
687 			break;
688 		}
689 		/* We don't support I_STR with DLIOCIPNETINFO. */
690 		/* FALLTHROUGH */
691 	default:
692 		miocnak(q, mp, 0, EINVAL);
693 		break;
694 	}
695 }
696 
697 static void
698 ipnet_iocdata(queue_t *q, mblk_t *mp)
699 {
700 	struct iocblk	*iocp = (struct iocblk *)mp->b_rptr;
701 	ipnet_t	*ipnet = q->q_ptr;
702 
703 	switch (iocp->ioc_cmd) {
704 	case DLIOCIPNETINFO:
705 		if (*(int *)mp->b_cont->b_rptr == 1)
706 			ipnet->ipnet_flags |= IPNET_INFO;
707 		else if (*(int *)mp->b_cont->b_rptr == 0)
708 			ipnet->ipnet_flags &= ~IPNET_INFO;
709 		else
710 			goto iocnak;
711 		miocack(q, mp, 0, DL_IPNETINFO_VERSION);
712 		break;
713 	default:
714 iocnak:
715 		miocnak(q, mp, 0, EINVAL);
716 		break;
717 	}
718 }
719 
720 static void
721 ipnet_wputnondata(queue_t *q, mblk_t *mp)
722 {
723 	union DL_primitives	*dlp = (union DL_primitives *)mp->b_rptr;
724 	t_uscalar_t		prim = dlp->dl_primitive;
725 
726 	switch (prim) {
727 	case DL_INFO_REQ:
728 		ipnet_inforeq(q, mp);
729 		break;
730 	case DL_UNBIND_REQ:
731 		ipnet_unbindreq(q, mp);
732 		break;
733 	case DL_BIND_REQ:
734 		ipnet_bindreq(q, mp);
735 		break;
736 	case DL_PROMISCON_REQ:
737 		ipnet_dlpromisconreq(q, mp);
738 		break;
739 	case DL_PROMISCOFF_REQ:
740 		ipnet_dlpromiscoffreq(q, mp);
741 		break;
742 	case DL_UNITDATA_REQ:
743 	case DL_DETACH_REQ:
744 	case DL_PHYS_ADDR_REQ:
745 	case DL_SET_PHYS_ADDR_REQ:
746 	case DL_ENABMULTI_REQ:
747 	case DL_DISABMULTI_REQ:
748 	case DL_ATTACH_REQ:
749 		dlerrorack(q, mp, prim, DL_UNSUPPORTED, 0);
750 		break;
751 	default:
752 		dlerrorack(q, mp, prim, DL_BADPRIM, 0);
753 		break;
754 	}
755 }
756 
757 static void
758 ipnet_inforeq(queue_t *q, mblk_t *mp)
759 {
760 	dl_info_ack_t	*dlip;
761 	size_t		size = sizeof (dl_info_ack_t) + sizeof (ushort_t);
762 
763 	if (MBLKL(mp) < DL_INFO_REQ_SIZE) {
764 		dlerrorack(q, mp, DL_INFO_REQ, DL_BADPRIM, 0);
765 		return;
766 	}
767 
768 	if ((mp = mexchange(q, mp, size, M_PCPROTO, DL_INFO_ACK)) == NULL)
769 		return;
770 
771 	dlip = (dl_info_ack_t *)mp->b_rptr;
772 	*dlip = ipnet_infoack;
773 	qreply(q, mp);
774 }
775 
776 static void
777 ipnet_bindreq(queue_t *q, mblk_t *mp)
778 {
779 	union DL_primitives	*dlp = (union DL_primitives *)mp->b_rptr;
780 	ipnet_t			*ipnet = q->q_ptr;
781 
782 	if (MBLKL(mp) < DL_BIND_REQ_SIZE) {
783 		dlerrorack(q, mp, DL_BIND_REQ, DL_BADPRIM, 0);
784 		return;
785 	}
786 
787 	switch (dlp->bind_req.dl_sap) {
788 	case 0 :
789 		ipnet->ipnet_family = AF_UNSPEC;
790 		break;
791 	case IPV4_VERSION :
792 		ipnet->ipnet_family = AF_INET;
793 		break;
794 	case IPV6_VERSION :
795 		ipnet->ipnet_family = AF_INET6;
796 		break;
797 	default :
798 		dlerrorack(q, mp, DL_BIND_REQ, DL_BADSAP, 0);
799 		return;
800 		/*NOTREACHED*/
801 	}
802 
803 	ipnet->ipnet_dlstate = DL_IDLE;
804 	dlbindack(q, mp, dlp->bind_req.dl_sap, 0, 0, 0, 0);
805 }
806 
807 static void
808 ipnet_unbindreq(queue_t *q, mblk_t *mp)
809 {
810 	ipnet_t	*ipnet = q->q_ptr;
811 
812 	if (MBLKL(mp) < DL_UNBIND_REQ_SIZE) {
813 		dlerrorack(q, mp, DL_UNBIND_REQ, DL_BADPRIM, 0);
814 		return;
815 	}
816 
817 	if (ipnet->ipnet_dlstate != DL_IDLE) {
818 		dlerrorack(q, mp, DL_UNBIND_REQ, DL_OUTSTATE, 0);
819 	} else {
820 		ipnet->ipnet_dlstate = DL_UNBOUND;
821 		ipnet->ipnet_family = AF_UNSPEC;
822 		dlokack(q, mp, DL_UNBIND_REQ);
823 	}
824 }
825 
826 static void
827 ipnet_dlpromisconreq(queue_t *q, mblk_t *mp)
828 {
829 	ipnet_t		*ipnet = q->q_ptr;
830 	t_uscalar_t	level;
831 	int		err;
832 
833 	if (MBLKL(mp) < DL_PROMISCON_REQ_SIZE) {
834 		dlerrorack(q, mp, DL_PROMISCON_REQ, DL_BADPRIM, 0);
835 		return;
836 	}
837 
838 	if (ipnet->ipnet_flags & IPNET_LOMODE) {
839 		dlokack(q, mp, DL_PROMISCON_REQ);
840 		return;
841 	}
842 
843 	level = ((dl_promiscon_req_t *)mp->b_rptr)->dl_level;
844 	if (level == DL_PROMISC_PHYS || level == DL_PROMISC_MULTI) {
845 		if ((err = ipnet_join_allmulti(ipnet->ipnet_if,
846 		    ipnet->ipnet_ns->netstack_ipnet)) != 0) {
847 			dlerrorack(q, mp, DL_PROMISCON_REQ, DL_SYSERR, err);
848 			return;
849 		}
850 	}
851 
852 	switch (level) {
853 	case DL_PROMISC_PHYS:
854 		ipnet->ipnet_flags |= IPNET_PROMISC_PHYS;
855 		break;
856 	case DL_PROMISC_SAP:
857 		ipnet->ipnet_flags |= IPNET_PROMISC_SAP;
858 		break;
859 	case DL_PROMISC_MULTI:
860 		ipnet->ipnet_flags |= IPNET_PROMISC_MULTI;
861 		break;
862 	default:
863 		dlerrorack(q, mp, DL_PROMISCON_REQ, DL_BADPRIM, 0);
864 		return;
865 	}
866 
867 	dlokack(q, mp, DL_PROMISCON_REQ);
868 }
869 
870 static void
871 ipnet_dlpromiscoffreq(queue_t *q, mblk_t *mp)
872 {
873 	ipnet_t		*ipnet = q->q_ptr;
874 	t_uscalar_t	level;
875 	uint16_t	orig_ipnet_flags = ipnet->ipnet_flags;
876 
877 	if (MBLKL(mp) < DL_PROMISCOFF_REQ_SIZE) {
878 		dlerrorack(q, mp, DL_PROMISCOFF_REQ, DL_BADPRIM, 0);
879 		return;
880 	}
881 
882 	if (ipnet->ipnet_flags & IPNET_LOMODE) {
883 		dlokack(q, mp, DL_PROMISCOFF_REQ);
884 		return;
885 	}
886 
887 	level = ((dl_promiscon_req_t *)mp->b_rptr)->dl_level;
888 	switch (level) {
889 	case DL_PROMISC_PHYS:
890 		if (ipnet->ipnet_flags & IPNET_PROMISC_PHYS)
891 			ipnet->ipnet_flags &= ~IPNET_PROMISC_PHYS;
892 		break;
893 	case DL_PROMISC_SAP:
894 		if (ipnet->ipnet_flags & IPNET_PROMISC_SAP)
895 			ipnet->ipnet_flags &= ~IPNET_PROMISC_SAP;
896 		break;
897 	case DL_PROMISC_MULTI:
898 		if (ipnet->ipnet_flags & IPNET_PROMISC_MULTI)
899 			ipnet->ipnet_flags &= ~IPNET_PROMISC_MULTI;
900 		break;
901 	default:
902 		dlerrorack(q, mp, DL_PROMISCOFF_REQ, DL_BADPRIM, 0);
903 		return;
904 	}
905 
906 	if (orig_ipnet_flags == ipnet->ipnet_flags) {
907 		dlerrorack(q, mp, DL_PROMISCOFF_REQ, DL_NOTENAB, 0);
908 		return;
909 	}
910 
911 	if (level == DL_PROMISC_PHYS || level == DL_PROMISC_MULTI) {
912 		ipnet_leave_allmulti(ipnet->ipnet_if,
913 		    ipnet->ipnet_ns->netstack_ipnet);
914 	}
915 
916 	dlokack(q, mp, DL_PROMISCOFF_REQ);
917 }
918 
919 static int
920 ipnet_join_allmulti(ipnetif_t *ipnetif, ipnet_stack_t *ips)
921 {
922 	int		err = 0;
923 	ip_stack_t	*ipst = ips->ips_netstack->netstack_ip;
924 	uint64_t	index = ipnetif->if_index;
925 
926 	mutex_enter(&ips->ips_event_lock);
927 	if (ipnetif->if_multicnt == 0) {
928 		ASSERT((ipnetif->if_flags &
929 		    (IPNETIF_IPV4ALLMULTI | IPNETIF_IPV6ALLMULTI)) == 0);
930 		if (ipnetif->if_flags & IPNETIF_IPV4PLUMBED) {
931 			err = ip_join_allmulti(index, B_FALSE, ipst);
932 			if (err != 0)
933 				goto done;
934 			ipnetif->if_flags |= IPNETIF_IPV4ALLMULTI;
935 		}
936 		if (ipnetif->if_flags & IPNETIF_IPV6PLUMBED) {
937 			err = ip_join_allmulti(index, B_TRUE, ipst);
938 			if (err != 0 &&
939 			    (ipnetif->if_flags & IPNETIF_IPV4ALLMULTI)) {
940 				(void) ip_leave_allmulti(index, B_FALSE, ipst);
941 				ipnetif->if_flags &= ~IPNETIF_IPV4ALLMULTI;
942 				goto done;
943 			}
944 			ipnetif->if_flags |= IPNETIF_IPV6ALLMULTI;
945 		}
946 	}
947 	ipnetif->if_multicnt++;
948 
949 done:
950 	mutex_exit(&ips->ips_event_lock);
951 	return (err);
952 }
953 
954 static void
955 ipnet_leave_allmulti(ipnetif_t *ipnetif, ipnet_stack_t *ips)
956 {
957 	int		err;
958 	ip_stack_t	*ipst = ips->ips_netstack->netstack_ip;
959 	uint64_t	index = ipnetif->if_index;
960 
961 	mutex_enter(&ips->ips_event_lock);
962 	ASSERT(ipnetif->if_multicnt != 0);
963 	if (--ipnetif->if_multicnt == 0) {
964 		if (ipnetif->if_flags & IPNETIF_IPV4ALLMULTI) {
965 			err = ip_leave_allmulti(index, B_FALSE, ipst);
966 			ASSERT(err == 0 || err == ENODEV);
967 			ipnetif->if_flags &= ~IPNETIF_IPV4ALLMULTI;
968 		}
969 		if (ipnetif->if_flags & IPNETIF_IPV6ALLMULTI) {
970 			err = ip_leave_allmulti(index, B_TRUE, ipst);
971 			ASSERT(err == 0 || err == ENODEV);
972 			ipnetif->if_flags &= ~IPNETIF_IPV6ALLMULTI;
973 		}
974 	}
975 	mutex_exit(&ips->ips_event_lock);
976 }
977 
978 /*
979  * Allocate a new mblk_t and put a dl_ipnetinfo_t in it.
980  * The structure it copies the header information from,
981  * hook_pkt_observe_t, is constructed using network byte
982  * order in ipobs_hook(), so there is no conversion here.
983  */
984 static mblk_t *
985 ipnet_addheader(hook_pkt_observe_t *hdr, mblk_t *mp)
986 {
987 	mblk_t		*dlhdr;
988 	dl_ipnetinfo_t	*dl;
989 
990 	if ((dlhdr = allocb(sizeof (dl_ipnetinfo_t), BPRI_HI)) == NULL) {
991 		freemsg(mp);
992 		return (NULL);
993 	}
994 	dl = (dl_ipnetinfo_t *)dlhdr->b_rptr;
995 	dl->dli_version = DL_IPNETINFO_VERSION;
996 	dl->dli_family = hdr->hpo_family;
997 	dl->dli_htype = hdr->hpo_htype;
998 	dl->dli_pktlen = hdr->hpo_pktlen;
999 	dl->dli_ifindex = hdr->hpo_ifindex;
1000 	dl->dli_grifindex = hdr->hpo_grifindex;
1001 	dl->dli_zsrc = hdr->hpo_zsrc;
1002 	dl->dli_zdst = hdr->hpo_zdst;
1003 	dlhdr->b_wptr += sizeof (*dl);
1004 	dlhdr->b_cont = mp;
1005 
1006 	return (dlhdr);
1007 }
1008 
1009 static ipnet_addrtype_t
1010 ipnet_get_addrtype(ipnet_t *ipnet, ipnet_addrp_t *addr)
1011 {
1012 	list_t			*list;
1013 	ipnetif_t		*ipnetif = ipnet->ipnet_if;
1014 	ipnetif_addr_t		*ifaddr;
1015 	ipnet_addrtype_t	addrtype = IPNETADDR_UNKNOWN;
1016 
1017 	/* First check if the address is multicast or limited broadcast. */
1018 	switch (addr->iap_family) {
1019 	case AF_INET:
1020 		if (CLASSD(*(addr->iap_addr4)) ||
1021 		    *(addr->iap_addr4) == INADDR_BROADCAST)
1022 			return (IPNETADDR_MBCAST);
1023 		break;
1024 	case AF_INET6:
1025 		if (IN6_IS_ADDR_MULTICAST(addr->iap_addr6))
1026 			return (IPNETADDR_MBCAST);
1027 		break;
1028 	}
1029 
1030 	/*
1031 	 * Walk the address list to see if the address belongs to our
1032 	 * interface or is one of our subnet broadcast addresses.
1033 	 */
1034 	mutex_enter(&ipnetif->if_addr_lock);
1035 	list = (addr->iap_family == AF_INET) ?
1036 	    &ipnetif->if_ip4addr_list : &ipnetif->if_ip6addr_list;
1037 	for (ifaddr = list_head(list);
1038 	    ifaddr != NULL && addrtype == IPNETADDR_UNKNOWN;
1039 	    ifaddr = list_next(list, ifaddr)) {
1040 		/*
1041 		 * If we're not in the global zone, then only look at
1042 		 * addresses in our zone.
1043 		 */
1044 		if (ipnet->ipnet_zoneid != GLOBAL_ZONEID &&
1045 		    ipnet->ipnet_zoneid != ifaddr->ifa_zone)
1046 			continue;
1047 		switch (addr->iap_family) {
1048 		case AF_INET:
1049 			if (ifaddr->ifa_ip4addr != INADDR_ANY &&
1050 			    *(addr->iap_addr4) == ifaddr->ifa_ip4addr)
1051 				addrtype = IPNETADDR_MYADDR;
1052 			else if (ifaddr->ifa_brdaddr != INADDR_ANY &&
1053 			    *(addr->iap_addr4) == ifaddr->ifa_brdaddr)
1054 				addrtype = IPNETADDR_MBCAST;
1055 			break;
1056 		case AF_INET6:
1057 			if (IN6_ARE_ADDR_EQUAL(addr->iap_addr6,
1058 			    &ifaddr->ifa_ip6addr))
1059 				addrtype = IPNETADDR_MYADDR;
1060 			break;
1061 		}
1062 	}
1063 	mutex_exit(&ipnetif->if_addr_lock);
1064 
1065 	return (addrtype);
1066 }
1067 
1068 /*
1069  * Verify if the packet contained in hdr should be passed up to the
1070  * ipnet client stream.
1071  */
1072 static boolean_t
1073 ipnet_accept(ipnet_t *ipnet, hook_pkt_observe_t *hdr, ipnet_addrp_t *src,
1074     ipnet_addrp_t *dst)
1075 {
1076 	boolean_t		obsif;
1077 	uint64_t		ifindex = ipnet->ipnet_if->if_index;
1078 	ipnet_addrtype_t	srctype;
1079 	ipnet_addrtype_t	dsttype;
1080 
1081 	srctype = ipnet_get_addrtype(ipnet, src);
1082 	dsttype = ipnet_get_addrtype(ipnet, dst);
1083 
1084 	/*
1085 	 * If the packet's ifindex matches ours, or the packet's group ifindex
1086 	 * matches ours, it's on the interface we're observing.  (Thus,
1087 	 * observing on the group ifindex matches all ifindexes in the group.)
1088 	 */
1089 	obsif = (ntohl(hdr->hpo_ifindex) == ifindex ||
1090 	    ntohl(hdr->hpo_grifindex) == ifindex);
1091 
1092 	DTRACE_PROBE5(ipnet_accept__addr,
1093 	    ipnet_addrtype_t, srctype, ipnet_addrp_t *, src,
1094 	    ipnet_addrtype_t, dsttype, ipnet_addrp_t *, dst,
1095 	    boolean_t, obsif);
1096 
1097 	/*
1098 	 * Do not allow an ipnet stream to see packets that are not from or to
1099 	 * its zone.  The exception is when zones are using the shared stack
1100 	 * model.  In this case, streams in the global zone have visibility
1101 	 * into other shared-stack zones, and broadcast and multicast traffic
1102 	 * is visible by all zones in the stack.
1103 	 */
1104 	if (ipnet->ipnet_zoneid != GLOBAL_ZONEID &&
1105 	    dsttype != IPNETADDR_MBCAST) {
1106 		if (ipnet->ipnet_zoneid != ntohl(hdr->hpo_zsrc) &&
1107 		    ipnet->ipnet_zoneid != ntohl(hdr->hpo_zdst))
1108 			return (B_FALSE);
1109 	}
1110 
1111 	/*
1112 	 * If DL_PROMISC_SAP isn't enabled, then the bound SAP must match the
1113 	 * packet's IP version.
1114 	 */
1115 	if (!(ipnet->ipnet_flags & IPNET_PROMISC_SAP) &&
1116 	    ipnet->ipnet_family != hdr->hpo_family)
1117 		return (B_FALSE);
1118 
1119 	/* If the destination address is ours, then accept the packet. */
1120 	if (dsttype == IPNETADDR_MYADDR)
1121 		return (B_TRUE);
1122 
1123 	/*
1124 	 * If DL_PROMISC_PHYS is enabled, then we can see all packets that are
1125 	 * sent or received on the interface we're observing, or packets that
1126 	 * have our source address (this allows us to see packets we send).
1127 	 */
1128 	if (ipnet->ipnet_flags & IPNET_PROMISC_PHYS) {
1129 		if (srctype == IPNETADDR_MYADDR || obsif)
1130 			return (B_TRUE);
1131 	}
1132 
1133 	/*
1134 	 * We accept multicast and broadcast packets transmitted or received
1135 	 * on the interface we're observing.
1136 	 */
1137 	if (dsttype == IPNETADDR_MBCAST && obsif)
1138 		return (B_TRUE);
1139 
1140 	return (B_FALSE);
1141 }
1142 
1143 /*
1144  * Verify if the packet contained in hdr should be passed up to the ipnet
1145  * client stream that's in IPNET_LOMODE.
1146  */
1147 /* ARGSUSED */
1148 static boolean_t
1149 ipnet_loaccept(ipnet_t *ipnet, hook_pkt_observe_t *hdr, ipnet_addrp_t *src,
1150     ipnet_addrp_t *dst)
1151 {
1152 	if (hdr->hpo_htype != htons(IPOBS_HOOK_LOCAL)) {
1153 		/*
1154 		 * ipnet_if is only NULL for IPNET_MINOR_LO devices.
1155 		 */
1156 		if (ipnet->ipnet_if == NULL)
1157 			return (B_FALSE);
1158 	}
1159 
1160 	/*
1161 	 * An ipnet stream must not see packets that are not from/to its zone.
1162 	 */
1163 	if (ipnet->ipnet_zoneid != GLOBAL_ZONEID) {
1164 		if (ipnet->ipnet_zoneid != ntohl(hdr->hpo_zsrc) &&
1165 		    ipnet->ipnet_zoneid != ntohl(hdr->hpo_zdst))
1166 			return (B_FALSE);
1167 	}
1168 
1169 	return (ipnet->ipnet_family == AF_UNSPEC ||
1170 	    ipnet->ipnet_family == hdr->hpo_family);
1171 }
1172 
1173 static void
1174 ipnet_dispatch(void *arg)
1175 {
1176 	mblk_t			*mp = arg;
1177 	hook_pkt_observe_t	*hdr = (hook_pkt_observe_t *)mp->b_rptr;
1178 	ipnet_t			*ipnet;
1179 	mblk_t			*netmp;
1180 	list_t			*list;
1181 	ipnet_stack_t		*ips;
1182 	ipnet_addrp_t		src;
1183 	ipnet_addrp_t		dst;
1184 
1185 	ips = ((netstack_t *)hdr->hpo_ctx)->netstack_ipnet;
1186 
1187 	netmp = hdr->hpo_pkt->b_cont;
1188 	src.iap_family = hdr->hpo_family;
1189 	dst.iap_family = hdr->hpo_family;
1190 
1191 	if (hdr->hpo_family == AF_INET) {
1192 		src.iap_addr4 = &((ipha_t *)(netmp->b_rptr))->ipha_src;
1193 		dst.iap_addr4 = &((ipha_t *)(netmp->b_rptr))->ipha_dst;
1194 	} else {
1195 		src.iap_addr6 = &((ip6_t *)(netmp->b_rptr))->ip6_src;
1196 		dst.iap_addr6 = &((ip6_t *)(netmp->b_rptr))->ip6_dst;
1197 	}
1198 
1199 	ipnet_walkers_inc(ips);
1200 
1201 	list = &ips->ips_str_list;
1202 	for (ipnet = list_head(list); ipnet != NULL;
1203 	    ipnet = list_next(list, ipnet)) {
1204 		if (!(*ipnet->ipnet_acceptfn)(ipnet, hdr, &src, &dst)) {
1205 			IPSK_BUMP(ips, ik_acceptFail);
1206 			continue;
1207 		}
1208 		IPSK_BUMP(ips, ik_acceptOk);
1209 
1210 		if (list_next(list, ipnet) == NULL) {
1211 			netmp = hdr->hpo_pkt->b_cont;
1212 			hdr->hpo_pkt->b_cont = NULL;
1213 		} else {
1214 			if ((netmp = dupmsg(hdr->hpo_pkt->b_cont)) == NULL &&
1215 			    (netmp = copymsg(hdr->hpo_pkt->b_cont)) == NULL) {
1216 				IPSK_BUMP(ips, ik_duplicationFail);
1217 				continue;
1218 			}
1219 		}
1220 
1221 		if (ipnet->ipnet_flags & IPNET_INFO) {
1222 			if ((netmp = ipnet_addheader(hdr, netmp)) == NULL) {
1223 				IPSK_BUMP(ips, ik_dispatchHeaderDrop);
1224 				continue;
1225 			}
1226 		}
1227 
1228 		if (ipnet->ipnet_rq->q_first == NULL &&
1229 		    canputnext(ipnet->ipnet_rq)) {
1230 			putnext(ipnet->ipnet_rq, netmp);
1231 			IPSK_BUMP(ips, ik_dispatchDeliver);
1232 		} else if (canput(ipnet->ipnet_rq)) {
1233 			(void) putq(ipnet->ipnet_rq, netmp);
1234 			IPSK_BUMP(ips, ik_dispatchDeliver);
1235 		} else {
1236 			freemsg(netmp);
1237 			IPSK_BUMP(ips, ik_dispatchPutDrop);
1238 		}
1239 	}
1240 
1241 	ipnet_walkers_dec(ips);
1242 
1243 	freemsg(mp);
1244 }
1245 
1246 static void
1247 ipnet_input(mblk_t *mp)
1248 {
1249 	hook_pkt_observe_t	*hdr = (hook_pkt_observe_t *)mp->b_rptr;
1250 	ipnet_stack_t		*ips;
1251 
1252 	ips = ((netstack_t *)hdr->hpo_ctx)->netstack_ipnet;
1253 
1254 	if (ddi_taskq_dispatch(ipnet_taskq, ipnet_dispatch, mp, DDI_NOSLEEP) !=
1255 	    DDI_SUCCESS) {
1256 		IPSK_BUMP(ips, ik_dispatchFail);
1257 		freemsg(mp);
1258 	} else {
1259 		IPSK_BUMP(ips, ik_dispatchOk);
1260 	}
1261 }
1262 
1263 static ipnetif_t *
1264 ipnet_alloc_if(ipnet_stack_t *ips)
1265 {
1266 	ipnetif_t	*ipnetif;
1267 
1268 	if ((ipnetif = kmem_zalloc(sizeof (*ipnetif), KM_NOSLEEP)) == NULL)
1269 		return (NULL);
1270 
1271 	mutex_init(&ipnetif->if_addr_lock, NULL, MUTEX_DEFAULT, 0);
1272 	list_create(&ipnetif->if_ip4addr_list, sizeof (ipnetif_addr_t),
1273 	    offsetof(ipnetif_addr_t, ifa_link));
1274 	list_create(&ipnetif->if_ip6addr_list, sizeof (ipnetif_addr_t),
1275 	    offsetof(ipnetif_addr_t, ifa_link));
1276 	mutex_init(&ipnetif->if_reflock, NULL, MUTEX_DEFAULT, 0);
1277 
1278 	ipnetif->if_stackp = ips;
1279 
1280 	return (ipnetif);
1281 }
1282 
1283 /*
1284  * Create a new ipnetif_t and new minor node for it.  If creation is
1285  * successful the new ipnetif_t is inserted into an avl_tree
1286  * containing ipnetif's for this stack instance.
1287  */
1288 static ipnetif_t *
1289 ipnetif_create(const char *name, uint64_t index, ipnet_stack_t *ips,
1290     uint64_t ifflags)
1291 {
1292 	ipnetif_t	*ipnetif;
1293 	avl_index_t	where = 0;
1294 	minor_t		ifminor;
1295 
1296 	/*
1297 	 * Because ipnetif_create() can be called from a NIC event
1298 	 * callback, it should not block.
1299 	 */
1300 	ifminor = (minor_t)id_alloc_nosleep(ipnet_minor_space);
1301 	if (ifminor == (minor_t)-1)
1302 		return (NULL);
1303 	if ((ipnetif = ipnet_alloc_if(ips)) == NULL) {
1304 		id_free(ipnet_minor_space, ifminor);
1305 		return (NULL);
1306 	}
1307 
1308 	(void) strlcpy(ipnetif->if_name, name, LIFNAMSIZ);
1309 	ipnetif->if_index = (uint_t)index;
1310 	ipnetif->if_zoneid = netstack_get_zoneid(ips->ips_netstack);
1311 	ipnetif->if_dev = makedevice(ipnet_major, ifminor);
1312 
1313 	ipnetif->if_refcnt = 1;
1314 	if ((ifflags & IFF_LOOPBACK) != 0)
1315 		ipnetif->if_flags = IPNETIF_LOOPBACK;
1316 
1317 	mutex_enter(&ips->ips_avl_lock);
1318 	VERIFY(avl_find(&ips->ips_avl_by_index, &index, &where) == NULL);
1319 	avl_insert(&ips->ips_avl_by_index, ipnetif, where);
1320 	VERIFY(avl_find(&ips->ips_avl_by_name, (void *)name, &where) == NULL);
1321 	avl_insert(&ips->ips_avl_by_name, ipnetif, where);
1322 	mutex_exit(&ips->ips_avl_lock);
1323 
1324 	return (ipnetif);
1325 }
1326 
1327 static void
1328 ipnetif_remove(ipnetif_t *ipnetif, ipnet_stack_t *ips)
1329 {
1330 	ipnet_t	*ipnet;
1331 
1332 	ipnet_walkers_inc(ips);
1333 	/* Send a SIGHUP to all open streams associated with this ipnetif. */
1334 	for (ipnet = list_head(&ips->ips_str_list); ipnet != NULL;
1335 	    ipnet = list_next(&ips->ips_str_list, ipnet)) {
1336 		if (ipnet->ipnet_if == ipnetif)
1337 			(void) putnextctl(ipnet->ipnet_rq, M_HANGUP);
1338 	}
1339 	ipnet_walkers_dec(ips);
1340 	mutex_enter(&ips->ips_avl_lock);
1341 	avl_remove(&ips->ips_avl_by_index, ipnetif);
1342 	avl_remove(&ips->ips_avl_by_name, ipnetif);
1343 	mutex_exit(&ips->ips_avl_lock);
1344 	/*
1345 	 * Release the reference we implicitly held in ipnetif_create().
1346 	 */
1347 	ipnetif_refrele(ipnetif);
1348 }
1349 
1350 static void
1351 ipnet_purge_addrlist(list_t *addrlist)
1352 {
1353 	ipnetif_addr_t	*ifa;
1354 
1355 	while ((ifa = list_head(addrlist)) != NULL) {
1356 		list_remove(addrlist, ifa);
1357 		if (ifa->ifa_shared != NULL)
1358 			ipnetif_clone_release(ifa->ifa_shared);
1359 		kmem_free(ifa, sizeof (*ifa));
1360 	}
1361 }
1362 
1363 static void
1364 ipnetif_free(ipnetif_t *ipnetif)
1365 {
1366 	ASSERT(ipnetif->if_refcnt == 0);
1367 	ASSERT(ipnetif->if_sharecnt == 0);
1368 
1369 	/* Remove IPv4/v6 address lists from the ipnetif */
1370 	ipnet_purge_addrlist(&ipnetif->if_ip4addr_list);
1371 	list_destroy(&ipnetif->if_ip4addr_list);
1372 	ipnet_purge_addrlist(&ipnetif->if_ip6addr_list);
1373 	list_destroy(&ipnetif->if_ip6addr_list);
1374 	mutex_destroy(&ipnetif->if_addr_lock);
1375 	mutex_destroy(&ipnetif->if_reflock);
1376 	if (ipnetif->if_dev != 0)
1377 		id_free(ipnet_minor_space, getminor(ipnetif->if_dev));
1378 	kmem_free(ipnetif, sizeof (*ipnetif));
1379 }
1380 
1381 /*
1382  * Create an ipnetif_addr_t with the given logical interface id (lif)
1383  * and add it to the supplied ipnetif.  The lif is the netinfo
1384  * representation of logical interface id, and we use this id to match
1385  * incoming netinfo events against our lists of addresses.
1386  */
1387 static void
1388 ipnet_add_ifaddr(uint64_t lif, ipnetif_t *ipnetif, net_handle_t nd)
1389 {
1390 	ipnetif_addr_t		*ifaddr;
1391 	zoneid_t		zoneid;
1392 	struct sockaddr_in	bcast;
1393 	struct sockaddr_storage	addr;
1394 	net_ifaddr_t		type = NA_ADDRESS;
1395 	uint64_t		phyif = ipnetif->if_index;
1396 
1397 	if (net_getlifaddr(nd, phyif, lif, 1, &type, &addr) != 0 ||
1398 	    net_getlifzone(nd, phyif, lif, &zoneid) != 0)
1399 		return;
1400 
1401 	if ((ifaddr = kmem_alloc(sizeof (*ifaddr), KM_NOSLEEP)) == NULL)
1402 		return;
1403 	ifaddr->ifa_zone = zoneid;
1404 	ifaddr->ifa_id = lif;
1405 	ifaddr->ifa_shared = NULL;
1406 
1407 	switch (addr.ss_family) {
1408 	case AF_INET:
1409 		ifaddr->ifa_ip4addr =
1410 		    ((struct sockaddr_in *)&addr)->sin_addr.s_addr;
1411 		/*
1412 		 * Try and get the broadcast address.  Note that it's okay for
1413 		 * an interface to not have a broadcast address, so we don't
1414 		 * fail the entire operation if net_getlifaddr() fails here.
1415 		 */
1416 		type = NA_BROADCAST;
1417 		if (net_getlifaddr(nd, phyif, lif, 1, &type, &bcast) == 0)
1418 			ifaddr->ifa_brdaddr = bcast.sin_addr.s_addr;
1419 		break;
1420 	case AF_INET6:
1421 		ifaddr->ifa_ip6addr = ((struct sockaddr_in6 *)&addr)->sin6_addr;
1422 		break;
1423 	}
1424 
1425 	/*
1426 	 * The zoneid stored in ipnetif_t needs to correspond to the actual
1427 	 * zone the address is being used in. This facilitates finding the
1428 	 * correct netstack_t pointer, amongst other things, later.
1429 	 */
1430 	if (zoneid == ALL_ZONES)
1431 		zoneid = GLOBAL_ZONEID;
1432 
1433 	mutex_enter(&ipnetif->if_addr_lock);
1434 	if (zoneid != ipnetif->if_zoneid) {
1435 		ipnetif_t *ifp2;
1436 
1437 		ifp2 = ipnetif_clone_create(ipnetif, zoneid);
1438 		ifaddr->ifa_shared = ifp2;
1439 	}
1440 	list_insert_tail(addr.ss_family == AF_INET ?
1441 	    &ipnetif->if_ip4addr_list : &ipnetif->if_ip6addr_list, ifaddr);
1442 	mutex_exit(&ipnetif->if_addr_lock);
1443 }
1444 
1445 static void
1446 ipnet_delete_ifaddr(ipnetif_addr_t *ifaddr, ipnetif_t *ipnetif, boolean_t isv6)
1447 {
1448 	mutex_enter(&ipnetif->if_addr_lock);
1449 	if (ifaddr->ifa_shared != NULL)
1450 		ipnetif_clone_release(ifaddr->ifa_shared);
1451 
1452 	list_remove(isv6 ?
1453 	    &ipnetif->if_ip6addr_list : &ipnetif->if_ip4addr_list, ifaddr);
1454 	mutex_exit(&ipnetif->if_addr_lock);
1455 	kmem_free(ifaddr, sizeof (*ifaddr));
1456 }
1457 
1458 static void
1459 ipnet_plumb_ev(ipnet_nicevent_t *ipne, ipnet_stack_t *ips, boolean_t isv6)
1460 {
1461 	ipnetif_t	*ipnetif;
1462 	boolean_t	refrele_needed = B_TRUE;
1463 	uint64_t	ifflags;
1464 	uint64_t	ifindex;
1465 	char		*ifname;
1466 
1467 	ifflags = 0;
1468 	ifname = ipne->ipne_ifname;
1469 	ifindex = ipne->ipne_ifindex;
1470 
1471 	(void) net_getlifflags(ipne->ipne_protocol, ifindex, 0, &ifflags);
1472 
1473 	if ((ipnetif = ipnetif_getby_index(ifindex, ips)) == NULL) {
1474 		ipnetif = ipnetif_create(ifname, ifindex, ips, ifflags);
1475 		refrele_needed = B_FALSE;
1476 	}
1477 	if (ipnetif != NULL) {
1478 		ipnetif->if_flags |=
1479 		    isv6 ? IPNETIF_IPV6PLUMBED : IPNETIF_IPV4PLUMBED;
1480 	}
1481 
1482 	if (ipnetif->if_multicnt != 0) {
1483 		if (ip_join_allmulti(ifindex, isv6,
1484 		    ips->ips_netstack->netstack_ip) == 0) {
1485 			ipnetif->if_flags |=
1486 			    isv6 ? IPNETIF_IPV6ALLMULTI : IPNETIF_IPV4ALLMULTI;
1487 		}
1488 	}
1489 
1490 	if (refrele_needed)
1491 		ipnetif_refrele(ipnetif);
1492 }
1493 
1494 static void
1495 ipnet_unplumb_ev(uint64_t ifindex, ipnet_stack_t *ips, boolean_t isv6)
1496 {
1497 	ipnetif_t	*ipnetif;
1498 
1499 	if ((ipnetif = ipnetif_getby_index(ifindex, ips)) == NULL)
1500 		return;
1501 
1502 	mutex_enter(&ipnetif->if_addr_lock);
1503 	ipnet_purge_addrlist(isv6 ?
1504 	    &ipnetif->if_ip6addr_list : &ipnetif->if_ip4addr_list);
1505 	mutex_exit(&ipnetif->if_addr_lock);
1506 
1507 	/*
1508 	 * Note that we have one ipnetif for both IPv4 and IPv6, but we receive
1509 	 * separate NE_UNPLUMB events for IPv4 and IPv6.  We remove the ipnetif
1510 	 * if both IPv4 and IPv6 interfaces have been unplumbed.
1511 	 */
1512 	ipnetif->if_flags &= isv6 ? ~IPNETIF_IPV6PLUMBED : ~IPNETIF_IPV4PLUMBED;
1513 	if (!(ipnetif->if_flags & (IPNETIF_IPV4PLUMBED | IPNETIF_IPV6PLUMBED)))
1514 		ipnetif_remove(ipnetif, ips);
1515 	ipnetif_refrele(ipnetif);
1516 }
1517 
1518 static void
1519 ipnet_lifup_ev(uint64_t ifindex, uint64_t lifindex, net_handle_t nd,
1520     ipnet_stack_t *ips, boolean_t isv6)
1521 {
1522 	ipnetif_t	*ipnetif;
1523 	ipnetif_addr_t	*ifaddr;
1524 
1525 	if ((ipnetif = ipnetif_getby_index(ifindex, ips)) == NULL)
1526 		return;
1527 	if ((ifaddr = ipnet_match_lif(ipnetif, lifindex, isv6)) != NULL) {
1528 		/*
1529 		 * We must have missed a NE_LIF_DOWN event.  Delete this
1530 		 * ifaddr and re-create it.
1531 		 */
1532 		ipnet_delete_ifaddr(ifaddr, ipnetif, isv6);
1533 	}
1534 
1535 	ipnet_add_ifaddr(lifindex, ipnetif, nd);
1536 	ipnetif_refrele(ipnetif);
1537 }
1538 
1539 static void
1540 ipnet_lifdown_ev(uint64_t ifindex, uint64_t lifindex, ipnet_stack_t *ips,
1541     boolean_t isv6)
1542 {
1543 	ipnetif_t	*ipnetif;
1544 	ipnetif_addr_t	*ifaddr;
1545 
1546 	if ((ipnetif = ipnetif_getby_index(ifindex, ips)) == NULL)
1547 		return;
1548 	if ((ifaddr = ipnet_match_lif(ipnetif, lifindex, isv6)) != NULL)
1549 		ipnet_delete_ifaddr(ifaddr, ipnetif, isv6);
1550 	ipnetif_refrele(ipnetif);
1551 	/*
1552 	 * Make sure that open streams on this ipnetif are still allowed to
1553 	 * have it open.
1554 	 */
1555 	ipnetif_zonecheck(ipnetif, ips);
1556 }
1557 
1558 /*
1559  * This callback from the NIC event framework dispatches a taskq as the event
1560  * handlers may block.
1561  */
1562 /* ARGSUSED */
1563 static int
1564 ipnet_nicevent_cb(hook_event_token_t token, hook_data_t info, void *arg)
1565 {
1566 	ipnet_stack_t		*ips = arg;
1567 	hook_nic_event_t	*hn = (hook_nic_event_t *)info;
1568 	ipnet_nicevent_t	*ipne;
1569 
1570 	if ((ipne = kmem_alloc(sizeof (ipnet_nicevent_t), KM_NOSLEEP)) == NULL)
1571 		return (0);
1572 	ipne->ipne_event = hn->hne_event;
1573 	ipne->ipne_protocol = hn->hne_protocol;
1574 	ipne->ipne_stackid = ips->ips_netstack->netstack_stackid;
1575 	ipne->ipne_ifindex = hn->hne_nic;
1576 	ipne->ipne_lifindex = hn->hne_lif;
1577 	if (hn->hne_datalen != 0) {
1578 		(void) strlcpy(ipne->ipne_ifname, hn->hne_data,
1579 		    sizeof (ipne->ipne_ifname));
1580 	}
1581 	(void) ddi_taskq_dispatch(ipnet_nicevent_taskq, ipnet_nicevent_task,
1582 	    ipne, DDI_NOSLEEP);
1583 	return (0);
1584 }
1585 
1586 static void
1587 ipnet_nicevent_task(void *arg)
1588 {
1589 	ipnet_nicevent_t	*ipne = arg;
1590 	netstack_t		*ns;
1591 	ipnet_stack_t		*ips;
1592 	boolean_t		isv6;
1593 
1594 	if ((ns = netstack_find_by_stackid(ipne->ipne_stackid)) == NULL)
1595 		goto done;
1596 	ips = ns->netstack_ipnet;
1597 	isv6 = (ipne->ipne_protocol == ips->ips_ndv6);
1598 
1599 	mutex_enter(&ips->ips_event_lock);
1600 	switch (ipne->ipne_event) {
1601 	case NE_PLUMB:
1602 		ipnet_plumb_ev(ipne, ips, isv6);
1603 		break;
1604 	case NE_UNPLUMB:
1605 		ipnet_unplumb_ev(ipne->ipne_ifindex, ips, isv6);
1606 		break;
1607 	case NE_LIF_UP:
1608 		ipnet_lifup_ev(ipne->ipne_ifindex, ipne->ipne_lifindex,
1609 		    ipne->ipne_protocol, ips, isv6);
1610 		break;
1611 	case NE_LIF_DOWN:
1612 		ipnet_lifdown_ev(ipne->ipne_ifindex, ipne->ipne_lifindex, ips,
1613 		    isv6);
1614 		break;
1615 	default:
1616 		break;
1617 	}
1618 	mutex_exit(&ips->ips_event_lock);
1619 done:
1620 	if (ns != NULL)
1621 		netstack_rele(ns);
1622 	kmem_free(ipne, sizeof (ipnet_nicevent_t));
1623 }
1624 
1625 dev_t
1626 ipnet_if_getdev(char *name, zoneid_t zoneid)
1627 {
1628 	netstack_t	*ns;
1629 	ipnet_stack_t	*ips;
1630 	ipnetif_t	*ipnetif;
1631 	dev_t		dev = (dev_t)-1;
1632 
1633 	if (is_system_labeled() && zoneid != GLOBAL_ZONEID)
1634 		return (dev);
1635 	if ((ns = netstack_find_by_zoneid(zoneid)) == NULL)
1636 		return (dev);
1637 
1638 	ips = ns->netstack_ipnet;
1639 	mutex_enter(&ips->ips_avl_lock);
1640 	if ((ipnetif = avl_find(&ips->ips_avl_by_name, name, NULL)) != NULL) {
1641 		if (ipnetif_in_zone(ipnetif, zoneid, ips))
1642 			dev = ipnetif->if_dev;
1643 	}
1644 	mutex_exit(&ips->ips_avl_lock);
1645 	netstack_rele(ns);
1646 
1647 	return (dev);
1648 }
1649 
1650 static ipnetif_t *
1651 ipnetif_getby_index(uint64_t id, ipnet_stack_t *ips)
1652 {
1653 	ipnetif_t	*ipnetif;
1654 
1655 	mutex_enter(&ips->ips_avl_lock);
1656 	if ((ipnetif = avl_find(&ips->ips_avl_by_index, &id, NULL)) != NULL)
1657 		ipnetif_refhold(ipnetif);
1658 	mutex_exit(&ips->ips_avl_lock);
1659 	return (ipnetif);
1660 }
1661 
1662 static ipnetif_t *
1663 ipnetif_getby_dev(dev_t dev, ipnet_stack_t *ips)
1664 {
1665 	ipnetif_t	*ipnetif;
1666 	avl_tree_t	*tree;
1667 
1668 	mutex_enter(&ips->ips_avl_lock);
1669 	tree = &ips->ips_avl_by_index;
1670 	for (ipnetif = avl_first(tree); ipnetif != NULL;
1671 	    ipnetif = avl_walk(tree, ipnetif, AVL_AFTER)) {
1672 		if (ipnetif->if_dev == dev) {
1673 			ipnetif_refhold(ipnetif);
1674 			break;
1675 		}
1676 	}
1677 	mutex_exit(&ips->ips_avl_lock);
1678 	return (ipnetif);
1679 }
1680 
1681 static ipnetif_addr_t *
1682 ipnet_match_lif(ipnetif_t *ipnetif, lif_if_t lid, boolean_t isv6)
1683 {
1684 	ipnetif_addr_t	*ifaddr;
1685 	list_t	*list;
1686 
1687 	mutex_enter(&ipnetif->if_addr_lock);
1688 	list = isv6 ? &ipnetif->if_ip6addr_list : &ipnetif->if_ip4addr_list;
1689 	for (ifaddr = list_head(list); ifaddr != NULL;
1690 	    ifaddr = list_next(list, ifaddr)) {
1691 		if (lid == ifaddr->ifa_id)
1692 			break;
1693 	}
1694 	mutex_exit(&ipnetif->if_addr_lock);
1695 	return (ifaddr);
1696 }
1697 
1698 /* ARGSUSED */
1699 static void *
1700 ipnet_stack_init(netstackid_t stackid, netstack_t *ns)
1701 {
1702 	ipnet_stack_t	*ips;
1703 
1704 	ips = kmem_zalloc(sizeof (*ips), KM_SLEEP);
1705 	ips->ips_netstack = ns;
1706 	mutex_init(&ips->ips_avl_lock, NULL, MUTEX_DEFAULT, 0);
1707 	avl_create(&ips->ips_avl_by_index, ipnetif_compare_index,
1708 	    sizeof (ipnetif_t), offsetof(ipnetif_t, if_avl_by_index));
1709 	avl_create(&ips->ips_avl_by_name, ipnetif_compare_name,
1710 	    sizeof (ipnetif_t), offsetof(ipnetif_t, if_avl_by_name));
1711 	avl_create(&ips->ips_avl_by_shared, ipnetif_compare_name_zone,
1712 	    sizeof (ipnetif_t), offsetof(ipnetif_t, if_avl_by_shared));
1713 	mutex_init(&ips->ips_walkers_lock, NULL, MUTEX_DEFAULT, NULL);
1714 	cv_init(&ips->ips_walkers_cv, NULL, CV_DRIVER, NULL);
1715 	list_create(&ips->ips_str_list, sizeof (ipnet_t),
1716 	    offsetof(ipnet_t, ipnet_next));
1717 	ipnet_register_netihook(ips);
1718 	return (ips);
1719 }
1720 
1721 /* ARGSUSED */
1722 static void
1723 ipnet_stack_fini(netstackid_t stackid, void *arg)
1724 {
1725 	ipnet_stack_t	*ips = arg;
1726 	ipnetif_t	*ipnetif, *nipnetif;
1727 
1728 	if (ips->ips_kstatp != NULL) {
1729 		zoneid_t zoneid;
1730 
1731 		zoneid = netstackid_to_zoneid(stackid);
1732 		net_kstat_delete(net_zoneidtonetid(zoneid), ips->ips_kstatp);
1733 	}
1734 	if (ips->ips_ndv4 != NULL) {
1735 		VERIFY(net_hook_unregister(ips->ips_ndv4, NH_NIC_EVENTS,
1736 		    ips->ips_nicevents) == 0);
1737 		VERIFY(net_protocol_release(ips->ips_ndv4) == 0);
1738 	}
1739 	if (ips->ips_ndv6 != NULL) {
1740 		VERIFY(net_hook_unregister(ips->ips_ndv6, NH_NIC_EVENTS,
1741 		    ips->ips_nicevents) == 0);
1742 		VERIFY(net_protocol_release(ips->ips_ndv6) == 0);
1743 	}
1744 	hook_free(ips->ips_nicevents);
1745 
1746 	for (ipnetif = avl_first(&ips->ips_avl_by_index); ipnetif != NULL;
1747 	    ipnetif = nipnetif) {
1748 		nipnetif = AVL_NEXT(&ips->ips_avl_by_index, ipnetif);
1749 		ipnetif_remove(ipnetif, ips);
1750 	}
1751 	avl_destroy(&ips->ips_avl_by_shared);
1752 	avl_destroy(&ips->ips_avl_by_index);
1753 	avl_destroy(&ips->ips_avl_by_name);
1754 	mutex_destroy(&ips->ips_avl_lock);
1755 	mutex_destroy(&ips->ips_walkers_lock);
1756 	cv_destroy(&ips->ips_walkers_cv);
1757 	list_destroy(&ips->ips_str_list);
1758 	kmem_free(ips, sizeof (*ips));
1759 }
1760 
1761 /* Do any of the addresses in addrlist belong the supplied zoneid? */
1762 static boolean_t
1763 ipnet_addrs_in_zone(list_t *addrlist, zoneid_t zoneid)
1764 {
1765 	ipnetif_addr_t	*ifa;
1766 
1767 	for (ifa = list_head(addrlist); ifa != NULL;
1768 	    ifa = list_next(addrlist, ifa)) {
1769 		if (ifa->ifa_zone == zoneid)
1770 			return (B_TRUE);
1771 	}
1772 	return (B_FALSE);
1773 }
1774 
1775 /* Should the supplied ipnetif be visible from the supplied zoneid? */
1776 static boolean_t
1777 ipnetif_in_zone(ipnetif_t *ipnetif, zoneid_t zoneid, ipnet_stack_t *ips)
1778 {
1779 	int	ret;
1780 
1781 	/*
1782 	 * The global zone has visibility into all interfaces in the global
1783 	 * stack, and exclusive stack zones have visibility into all
1784 	 * interfaces in their stack.
1785 	 */
1786 	if (zoneid == GLOBAL_ZONEID ||
1787 	    ips->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID)
1788 		return (B_TRUE);
1789 
1790 	/*
1791 	 * Shared-stack zones only have visibility for interfaces that have
1792 	 * addresses in their zone.
1793 	 */
1794 	mutex_enter(&ipnetif->if_addr_lock);
1795 	ret = ipnet_addrs_in_zone(&ipnetif->if_ip4addr_list, zoneid) ||
1796 	    ipnet_addrs_in_zone(&ipnetif->if_ip6addr_list, zoneid);
1797 	mutex_exit(&ipnetif->if_addr_lock);
1798 	return (ret);
1799 }
1800 
1801 /*
1802  * Verify that any ipnet_t that has a reference to the supplied ipnetif should
1803  * still be allowed to have it open.  A given ipnet_t may no longer be allowed
1804  * to have an ipnetif open if there are no longer any addresses that belong to
1805  * the ipnetif in the ipnet_t's non-global shared-stack zoneid.  If that's the
1806  * case, send the ipnet_t an M_HANGUP.
1807  */
1808 static void
1809 ipnetif_zonecheck(ipnetif_t *ipnetif, ipnet_stack_t *ips)
1810 {
1811 	list_t	*strlist = &ips->ips_str_list;
1812 	ipnet_t	*ipnet;
1813 
1814 	ipnet_walkers_inc(ips);
1815 	for (ipnet = list_head(strlist); ipnet != NULL;
1816 	    ipnet = list_next(strlist, ipnet)) {
1817 		if (ipnet->ipnet_if != ipnetif)
1818 			continue;
1819 		if (!ipnetif_in_zone(ipnetif, ipnet->ipnet_zoneid, ips))
1820 			(void) putnextctl(ipnet->ipnet_rq, M_HANGUP);
1821 	}
1822 	ipnet_walkers_dec(ips);
1823 }
1824 
1825 void
1826 ipnet_walk_if(ipnet_walkfunc_t *cb, void *arg, zoneid_t zoneid)
1827 {
1828 	ipnetif_t		*ipnetif;
1829 	list_t			cbdata;
1830 	ipnetif_cbdata_t	*cbnode;
1831 	netstack_t		*ns;
1832 	ipnet_stack_t		*ips;
1833 
1834 	/*
1835 	 * On labeled systems, non-global zones shouldn't see anything
1836 	 * in /dev/ipnet.
1837 	 */
1838 	if (is_system_labeled() && zoneid != GLOBAL_ZONEID)
1839 		return;
1840 
1841 	if ((ns = netstack_find_by_zoneid(zoneid)) == NULL)
1842 		return;
1843 
1844 	ips = ns->netstack_ipnet;
1845 	list_create(&cbdata, sizeof (ipnetif_cbdata_t),
1846 	    offsetof(ipnetif_cbdata_t, ic_next));
1847 
1848 	mutex_enter(&ips->ips_avl_lock);
1849 	for (ipnetif = avl_first(&ips->ips_avl_by_index); ipnetif != NULL;
1850 	    ipnetif = avl_walk(&ips->ips_avl_by_index, ipnetif, AVL_AFTER)) {
1851 		if (!ipnetif_in_zone(ipnetif, zoneid, ips))
1852 			continue;
1853 		cbnode = kmem_zalloc(sizeof (ipnetif_cbdata_t), KM_SLEEP);
1854 		(void) strlcpy(cbnode->ic_ifname, ipnetif->if_name, LIFNAMSIZ);
1855 		cbnode->ic_dev = ipnetif->if_dev;
1856 		list_insert_head(&cbdata, cbnode);
1857 	}
1858 	mutex_exit(&ips->ips_avl_lock);
1859 
1860 	while ((cbnode = list_head(&cbdata)) != NULL) {
1861 		cb(cbnode->ic_ifname, arg, cbnode->ic_dev);
1862 		list_remove(&cbdata, cbnode);
1863 		kmem_free(cbnode, sizeof (ipnetif_cbdata_t));
1864 	}
1865 	list_destroy(&cbdata);
1866 	netstack_rele(ns);
1867 }
1868 
1869 static int
1870 ipnetif_compare_index(const void *index_ptr, const void *ipnetifp)
1871 {
1872 	int64_t	index1 = *((int64_t *)index_ptr);
1873 	int64_t	index2 = (int64_t)((ipnetif_t *)ipnetifp)->if_index;
1874 
1875 	return (SIGNOF(index2 - index1));
1876 }
1877 
1878 static int
1879 ipnetif_compare_name(const void *name_ptr, const void *ipnetifp)
1880 {
1881 	int	res;
1882 
1883 	res = strcmp(((ipnetif_t *)ipnetifp)->if_name, name_ptr);
1884 	return (SIGNOF(res));
1885 }
1886 
1887 static int
1888 ipnetif_compare_name_zone(const void *key_ptr, const void *ipnetifp)
1889 {
1890 	const uintptr_t	*ptr = key_ptr;
1891 	const ipnetif_t	*ifp;
1892 	int		res;
1893 
1894 	ifp = ipnetifp;
1895 	res = ifp->if_zoneid - ptr[0];
1896 	if (res != 0)
1897 		return (SIGNOF(res));
1898 	res = strcmp(ifp->if_name, (char *)ptr[1]);
1899 	return (SIGNOF(res));
1900 }
1901 
1902 static void
1903 ipnetif_refhold(ipnetif_t *ipnetif)
1904 {
1905 	mutex_enter(&ipnetif->if_reflock);
1906 	ipnetif->if_refcnt++;
1907 	mutex_exit(&ipnetif->if_reflock);
1908 }
1909 
1910 static void
1911 ipnetif_refrele(ipnetif_t *ipnetif)
1912 {
1913 	mutex_enter(&ipnetif->if_reflock);
1914 	ASSERT(ipnetif->if_refcnt > 0);
1915 	if (--ipnetif->if_refcnt == 0)
1916 		ipnetif_free(ipnetif);
1917 	else
1918 		mutex_exit(&ipnetif->if_reflock);
1919 }
1920 
1921 static void
1922 ipnet_walkers_inc(ipnet_stack_t *ips)
1923 {
1924 	mutex_enter(&ips->ips_walkers_lock);
1925 	ips->ips_walkers_cnt++;
1926 	mutex_exit(&ips->ips_walkers_lock);
1927 }
1928 
1929 static void
1930 ipnet_walkers_dec(ipnet_stack_t *ips)
1931 {
1932 	mutex_enter(&ips->ips_walkers_lock);
1933 	ASSERT(ips->ips_walkers_cnt != 0);
1934 	if (--ips->ips_walkers_cnt == 0)
1935 		cv_broadcast(&ips->ips_walkers_cv);
1936 	mutex_exit(&ips->ips_walkers_lock);
1937 }
1938 
1939 /*ARGSUSED*/
1940 static int
1941 ipobs_bounce_func(hook_event_token_t token, hook_data_t info, void *arg)
1942 {
1943 	hook_pkt_observe_t	*hdr;
1944 	pfv_t			func = (pfv_t)arg;
1945 	mblk_t			*mp;
1946 
1947 	hdr = (hook_pkt_observe_t *)info;
1948 	/*
1949 	 * Code in ip_input() expects that it is the only one accessing the
1950 	 * packet.
1951 	 */
1952 	mp = copymsg(hdr->hpo_pkt);
1953 	if (mp == NULL)  {
1954 		netstack_t *ns = hdr->hpo_ctx;
1955 		ipnet_stack_t *ips = ns->netstack_ipnet;
1956 
1957 		IPSK_BUMP(ips, ik_dispatchDupDrop);
1958 		return (0);
1959 	}
1960 
1961 	hdr = (hook_pkt_observe_t *)mp->b_rptr;
1962 	hdr->hpo_pkt = mp;
1963 
1964 	func(mp);
1965 
1966 	return (0);
1967 }
1968 
1969 hook_t *
1970 ipobs_register_hook(netstack_t *ns, pfv_t func)
1971 {
1972 	ip_stack_t	*ipst = ns->netstack_ip;
1973 	char		name[32];
1974 	hook_t		*hook;
1975 
1976 	HOOK_INIT(hook, ipobs_bounce_func, "", (void *)func);
1977 	VERIFY(hook != NULL);
1978 
1979 	/*
1980 	 * To register multiple hooks with the same callback function,
1981 	 * a unique name is needed.
1982 	 */
1983 	(void) snprintf(name, sizeof (name), "ipobserve_%p", (void *)hook);
1984 	hook->h_name = strdup(name);
1985 
1986 	(void) net_hook_register(ipst->ips_ip4_observe_pr, NH_OBSERVE, hook);
1987 	(void) net_hook_register(ipst->ips_ip6_observe_pr, NH_OBSERVE, hook);
1988 
1989 	return (hook);
1990 }
1991 
1992 void
1993 ipobs_unregister_hook(netstack_t *ns, hook_t *hook)
1994 {
1995 	ip_stack_t	*ipst = ns->netstack_ip;
1996 
1997 	(void) net_hook_unregister(ipst->ips_ip4_observe_pr, NH_OBSERVE, hook);
1998 
1999 	(void) net_hook_unregister(ipst->ips_ip6_observe_pr, NH_OBSERVE, hook);
2000 
2001 	strfree(hook->h_name);
2002 
2003 	hook_free(hook);
2004 }
2005 
2006 /* ******************************************************************** */
2007 /* BPF Functions below							*/
2008 /* ******************************************************************** */
2009 
2010 /*
2011  * Convenience function to make mapping a zoneid to an ipnet_stack_t easy.
2012  */
2013 ipnet_stack_t *
2014 ipnet_find_by_zoneid(zoneid_t zoneid)
2015 {
2016 	netstack_t	*ns;
2017 
2018 	VERIFY((ns = netstack_find_by_zoneid(zoneid)) != NULL);
2019 	return (ns->netstack_ipnet);
2020 }
2021 
2022 /*
2023  * Functions, such as the above ipnet_find_by_zoneid(), will return a
2024  * pointer to ipnet_stack_t by calling a netstack lookup function.
2025  * The netstack_find_*() functions return a pointer after doing a "hold"
2026  * on the data structure and thereby require a "release" when the caller
2027  * is finished with it. We need to mirror that API here and thus a caller
2028  * of ipnet_find_by_zoneid() is required to call ipnet_rele().
2029  */
2030 void
2031 ipnet_rele(ipnet_stack_t *ips)
2032 {
2033 	netstack_rele(ips->ips_netstack);
2034 }
2035 
2036 /*
2037  */
2038 void
2039 ipnet_set_itap(bpf_itap_fn_t tapfunc)
2040 {
2041 	ipnet_itap = tapfunc;
2042 }
2043 
2044 /*
2045  * The list of interfaces available via ipnet is private for each zone,
2046  * so the AVL tree of each zone must be searched for a given name, even
2047  * if all names are unique.
2048  */
2049 int
2050 ipnet_open_byname(const char *name, ipnetif_t **ptr, zoneid_t zoneid)
2051 {
2052 	ipnet_stack_t	*ips;
2053 	ipnetif_t	*ipnetif;
2054 
2055 	ASSERT(ptr != NULL);
2056 	VERIFY((ips = ipnet_find_by_zoneid(zoneid)) != NULL);
2057 
2058 	mutex_enter(&ips->ips_avl_lock);
2059 
2060 	/*
2061 	 * Shared instance zone?
2062 	 */
2063 	if (netstackid_to_zoneid(zoneid_to_netstackid(zoneid)) != zoneid) {
2064 		uintptr_t key[2] = { zoneid, (uintptr_t)name };
2065 
2066 		ipnetif = avl_find(&ips->ips_avl_by_shared, (void *)key, NULL);
2067 	} else {
2068 		ipnetif = avl_find(&ips->ips_avl_by_name, (void *)name, NULL);
2069 	}
2070 	if (ipnetif != NULL)
2071 		ipnetif_refhold(ipnetif);
2072 	mutex_exit(&ips->ips_avl_lock);
2073 
2074 	*ptr = ipnetif;
2075 	ipnet_rele(ips);
2076 
2077 	if (ipnetif == NULL)
2078 		return (ESRCH);
2079 	return (0);
2080 }
2081 
2082 void
2083 ipnet_close_byhandle(ipnetif_t *ifp)
2084 {
2085 	ASSERT(ifp != NULL);
2086 	ipnetif_refrele(ifp);
2087 }
2088 
2089 const char *
2090 ipnet_name(ipnetif_t *ifp)
2091 {
2092 	ASSERT(ifp != NULL);
2093 	return (ifp->if_name);
2094 }
2095 
2096 /*
2097  * To find the linkid for a given name, it is necessary to know which zone
2098  * the interface name belongs to and to search the avl tree for that zone
2099  * as there is no master list of all interfaces and which zone they belong
2100  * to. It is assumed that the caller of this function is somehow already
2101  * working with the ipnet interfaces and hence the ips_event_lock is held.
2102  * When BPF calls into this function, it is doing so because of an event
2103  * in ipnet, and thus ipnet holds the ips_event_lock. Thus the datalink id
2104  * value returned has meaning without the need for grabbing a hold on the
2105  * owning structure.
2106  */
2107 int
2108 ipnet_get_linkid_byname(const char *name, uint_t *idp, zoneid_t zoneid)
2109 {
2110 	ipnet_stack_t	*ips;
2111 	ipnetif_t	*ifp;
2112 
2113 	VERIFY((ips = ipnet_find_by_zoneid(zoneid)) != NULL);
2114 	ASSERT(mutex_owned(&ips->ips_event_lock));
2115 
2116 	mutex_enter(&ips->ips_avl_lock);
2117 	ifp = avl_find(&ips->ips_avl_by_name, (void *)name, NULL);
2118 	if (ifp != NULL)
2119 		*idp = (uint_t)ifp->if_index;
2120 
2121 	/*
2122 	 * Shared instance zone?
2123 	 */
2124 	if (netstackid_to_zoneid(zoneid_to_netstackid(zoneid)) != zoneid) {
2125 		uintptr_t key[2] = { zoneid, (uintptr_t)name };
2126 
2127 		ifp = avl_find(&ips->ips_avl_by_shared, (void *)key, NULL);
2128 		if (ifp != NULL)
2129 			*idp = (uint_t)ifp->if_index;
2130 	}
2131 
2132 	mutex_exit(&ips->ips_avl_lock);
2133 	ipnet_rele(ips);
2134 
2135 	if (ifp == NULL)
2136 		return (ESRCH);
2137 	return (0);
2138 }
2139 
2140 /*
2141  * Strictly speaking, there is no such thing as a "client" in ipnet, like
2142  * there is in mac. BPF only needs to have this because it is required as
2143  * part of interfacing correctly with mac. The reuse of the original
2144  * ipnetif_t as a client poses no danger, so long as it is done with its
2145  * own ref-count'd hold that is given up on close.
2146  */
2147 int
2148 ipnet_client_open(ipnetif_t *ptr, ipnetif_t **result)
2149 {
2150 	ASSERT(ptr != NULL);
2151 	ASSERT(result != NULL);
2152 	ipnetif_refhold(ptr);
2153 	*result = ptr;
2154 
2155 	return (0);
2156 }
2157 
2158 void
2159 ipnet_client_close(ipnetif_t *ptr)
2160 {
2161 	ASSERT(ptr != NULL);
2162 	ipnetif_refrele(ptr);
2163 }
2164 
2165 /*
2166  * This is called from BPF when it needs to start receiving packets
2167  * from ipnet.
2168  *
2169  * The use of the ipnet_t structure here is somewhat lightweight when
2170  * compared to how it is used elsewhere but it already has all of the
2171  * right fields in it, so reuse here doesn't seem out of order. Its
2172  * primary purpose here is to provide the means to store pointers for
2173  * use when ipnet_promisc_remove() needs to be called.
2174  *
2175  * This should never be called for the IPNET_MINOR_LO device as it is
2176  * never created via ipnetif_create.
2177  */
2178 /*ARGSUSED*/
2179 int
2180 ipnet_promisc_add(void *handle, uint_t how, void *data, uintptr_t *mhandle,
2181     int flags)
2182 {
2183 	ip_stack_t	*ipst;
2184 	netstack_t	*ns;
2185 	ipnetif_t	*ifp;
2186 	ipnet_t		*ipnet;
2187 	char		name[32];
2188 	int		error;
2189 
2190 	ifp = (ipnetif_t *)handle;
2191 
2192 	if (how != DL_PROMISC_PHYS && how != DL_PROMISC_MULTI)
2193 		return (EINVAL);
2194 
2195 	ns = netstack_find_by_zoneid(ifp->if_zoneid);
2196 
2197 	if ((error = ipnet_join_allmulti(ifp, ns->netstack_ipnet)) != 0) {
2198 		netstack_rele(ns);
2199 		return (error);
2200 	}
2201 
2202 	ipnet = kmem_zalloc(sizeof (*ipnet), KM_SLEEP);
2203 	ipnet->ipnet_if = ifp;
2204 	ipnet->ipnet_ns = ns;
2205 	ipnet->ipnet_flags = flags;
2206 
2207 	if ((ifp->if_flags & IPNETIF_LOOPBACK) != 0) {
2208 		ipnet->ipnet_acceptfn = ipnet_loaccept;
2209 	} else {
2210 		ipnet->ipnet_acceptfn = ipnet_accept;
2211 	}
2212 
2213 	/*
2214 	 * To register multiple hooks with the same callback function,
2215 	 * a unique name is needed.
2216 	 */
2217 	HOOK_INIT(ipnet->ipnet_hook, ipnet_bpf_bounce, "", ipnet);
2218 	(void) snprintf(name, sizeof (name), "ipnet_promisc_%p",
2219 	    (void *)ipnet->ipnet_hook);
2220 	ipnet->ipnet_hook->h_name = strdup(name);
2221 	ipnet->ipnet_data = data;
2222 	ipnet->ipnet_zoneid = ifp->if_zoneid;
2223 
2224 	ipst = ns->netstack_ip;
2225 
2226 	error = net_hook_register(ipst->ips_ip4_observe_pr, NH_OBSERVE,
2227 	    ipnet->ipnet_hook);
2228 	if (error != 0)
2229 		goto regfail;
2230 
2231 	error = net_hook_register(ipst->ips_ip6_observe_pr, NH_OBSERVE,
2232 	    ipnet->ipnet_hook);
2233 	if (error != 0) {
2234 		(void) net_hook_unregister(ipst->ips_ip4_observe_pr,
2235 		    NH_OBSERVE, ipnet->ipnet_hook);
2236 		goto regfail;
2237 	}
2238 
2239 	*mhandle = (uintptr_t)ipnet;
2240 	netstack_rele(ns);
2241 
2242 	return (0);
2243 
2244 regfail:
2245 	cmn_err(CE_WARN, "net_hook_register failed: %d", error);
2246 	strfree(ipnet->ipnet_hook->h_name);
2247 	hook_free(ipnet->ipnet_hook);
2248 	netstack_rele(ns);
2249 	return (error);
2250 }
2251 
2252 void
2253 ipnet_promisc_remove(void *data)
2254 {
2255 	ip_stack_t	*ipst;
2256 	ipnet_t		*ipnet;
2257 	hook_t		*hook;
2258 
2259 	ipnet = data;
2260 	ipst = ipnet->ipnet_ns->netstack_ip;
2261 	hook = ipnet->ipnet_hook;
2262 
2263 	VERIFY(net_hook_unregister(ipst->ips_ip4_observe_pr, NH_OBSERVE,
2264 	    hook) == 0);
2265 
2266 	VERIFY(net_hook_unregister(ipst->ips_ip6_observe_pr, NH_OBSERVE,
2267 	    hook) == 0);
2268 
2269 	strfree(hook->h_name);
2270 
2271 	hook_free(hook);
2272 
2273 	kmem_free(ipnet, sizeof (*ipnet));
2274 }
2275 
2276 /*
2277  * arg here comes from the ipnet_t allocated in ipnet_promisc_add.
2278  * An important field from that structure is "ipnet_data" that
2279  * contains the "data" pointer passed into ipnet_promisc_add: it needs
2280  * to be passed back to bpf when we call into ipnet_itap.
2281  *
2282  * ipnet_itap is set by ipnet_set_bpfattach, which in turn is called
2283  * from BPF.
2284  */
2285 /*ARGSUSED*/
2286 static int
2287 ipnet_bpf_bounce(hook_event_token_t token, hook_data_t info, void *arg)
2288 {
2289 	hook_pkt_observe_t	*hdr;
2290 	ipnet_addrp_t		src;
2291 	ipnet_addrp_t		dst;
2292 	ipnet_stack_t		*ips;
2293 	ipnet_t			*ipnet;
2294 	mblk_t			*netmp;
2295 	mblk_t			*mp;
2296 
2297 	hdr = (hook_pkt_observe_t *)info;
2298 	mp = hdr->hpo_pkt;
2299 	ipnet = (ipnet_t *)arg;
2300 	ips = ((netstack_t *)hdr->hpo_ctx)->netstack_ipnet;
2301 
2302 	netmp = hdr->hpo_pkt->b_cont;
2303 	src.iap_family = hdr->hpo_family;
2304 	dst.iap_family = hdr->hpo_family;
2305 
2306 	if (hdr->hpo_family == AF_INET) {
2307 		src.iap_addr4 = &((ipha_t *)(netmp->b_rptr))->ipha_src;
2308 		dst.iap_addr4 = &((ipha_t *)(netmp->b_rptr))->ipha_dst;
2309 	} else {
2310 		src.iap_addr6 = &((ip6_t *)(netmp->b_rptr))->ip6_src;
2311 		dst.iap_addr6 = &((ip6_t *)(netmp->b_rptr))->ip6_dst;
2312 	}
2313 
2314 	if (!(*ipnet->ipnet_acceptfn)(ipnet, hdr, &src, &dst)) {
2315 		IPSK_BUMP(ips, ik_acceptFail);
2316 		return (0);
2317 	}
2318 	IPSK_BUMP(ips, ik_acceptOk);
2319 
2320 	ipnet_itap(ipnet->ipnet_data, mp,
2321 	    hdr->hpo_htype == htons(IPOBS_HOOK_OUTBOUND),
2322 	    ntohl(hdr->hpo_pktlen) + MBLKL(mp));
2323 
2324 	return (0);
2325 }
2326 
2327 /*
2328  * clone'd ipnetif_t's are created when a shared IP instance zone comes
2329  * to life and configures an IP address. The model that BPF uses is that
2330  * each interface must have a unique pointer and each interface must be
2331  * representative of what it can capture. They are limited to one DLT
2332  * per interface and one zone per interface. Thus every interface that
2333  * can be seen in a zone must be announced via an attach to bpf. For
2334  * shared instance zones, this means the ipnet driver needs to detect
2335  * when an address is added to an interface in a zone for the first
2336  * time (and also when the last address is removed.)
2337  */
2338 static ipnetif_t *
2339 ipnetif_clone_create(ipnetif_t *ifp, zoneid_t zoneid)
2340 {
2341 	uintptr_t	key[2] = { zoneid, (uintptr_t)ifp->if_name };
2342 	ipnet_stack_t	*ips = ifp->if_stackp;
2343 	avl_index_t	where = 0;
2344 	ipnetif_t	*newif;
2345 
2346 	mutex_enter(&ips->ips_avl_lock);
2347 	newif = avl_find(&ips->ips_avl_by_shared, (void *)key, &where);
2348 	if (newif != NULL) {
2349 		ipnetif_refhold(newif);
2350 		newif->if_sharecnt++;
2351 		mutex_exit(&ips->ips_avl_lock);
2352 		return (newif);
2353 	}
2354 
2355 	newif = ipnet_alloc_if(ips);
2356 	if (newif == NULL) {
2357 		mutex_exit(&ips->ips_avl_lock);
2358 		return (NULL);
2359 	}
2360 
2361 	newif->if_refcnt = 1;
2362 	newif->if_sharecnt = 1;
2363 	newif->if_zoneid = zoneid;
2364 	(void) strlcpy(newif->if_name, ifp->if_name, LIFNAMSIZ);
2365 	newif->if_flags = ifp->if_flags & IPNETIF_LOOPBACK;
2366 	newif->if_index = ifp->if_index;
2367 
2368 	avl_insert(&ips->ips_avl_by_shared, newif, where);
2369 	mutex_exit(&ips->ips_avl_lock);
2370 
2371 	return (newif);
2372 }
2373 
2374 static void
2375 ipnetif_clone_release(ipnetif_t *ipnetif)
2376 {
2377 	boolean_t	dofree = B_FALSE;
2378 	boolean_t	doremove = B_FALSE;
2379 	ipnet_stack_t	*ips = ipnetif->if_stackp;
2380 
2381 	mutex_enter(&ipnetif->if_reflock);
2382 	ASSERT(ipnetif->if_refcnt > 0);
2383 	if (--ipnetif->if_refcnt == 0)
2384 		dofree = B_TRUE;
2385 	ASSERT(ipnetif->if_sharecnt > 0);
2386 	if (--ipnetif->if_sharecnt == 0)
2387 		doremove = B_TRUE;
2388 	mutex_exit(&ipnetif->if_reflock);
2389 	if (doremove) {
2390 		mutex_enter(&ips->ips_avl_lock);
2391 		avl_remove(&ips->ips_avl_by_shared, ipnetif);
2392 		mutex_exit(&ips->ips_avl_lock);
2393 	}
2394 	if (dofree) {
2395 		ASSERT(ipnetif->if_sharecnt == 0);
2396 		ipnetif_free(ipnetif);
2397 	}
2398 }
2399