xref: /titanic_50/usr/src/uts/common/inet/ipnet/ipnet.c (revision fc5884fc5cbeced353b19c8153bd02be0d801d97)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 
22 /*
23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 /*
28  * The ipnet device defined here provides access to packets at the IP layer. To
29  * provide access to packets at this layer it registers a callback function in
30  * the ip module and when there are open instances of the device ip will pass
31  * packets into the device. Packets from ip are passed on the input, output and
32  * loopback paths. Internally the module returns to ip as soon as possible by
33  * deferring processing using a taskq.
34  *
35  * Management of the devices in /dev/ipnet/ is handled by the devname
36  * filesystem and use of the neti interfaces.  This module registers for NIC
37  * events using the neti framework so that when IP interfaces are brought up,
38  * taken down etc. the ipnet module is notified and its view of the interfaces
39  * configured on the system adjusted.  On attach, the module gets an initial
40  * view of the system again using the neti framework but as it has already
41  * registered for IP interface events, it is still up-to-date with any changes.
42  */
43 
44 #include <sys/types.h>
45 #include <sys/conf.h>
46 #include <sys/cred.h>
47 #include <sys/stat.h>
48 #include <sys/ddi.h>
49 #include <sys/sunddi.h>
50 #include <sys/modctl.h>
51 #include <sys/dlpi.h>
52 #include <sys/strsun.h>
53 #include <sys/id_space.h>
54 #include <sys/kmem.h>
55 #include <sys/mkdev.h>
56 #include <sys/neti.h>
57 #include <net/if.h>
58 #include <sys/errno.h>
59 #include <sys/list.h>
60 #include <sys/ksynch.h>
61 #include <sys/hook_event.h>
62 #include <sys/sdt.h>
63 #include <sys/stropts.h>
64 #include <sys/sysmacros.h>
65 #include <inet/ip.h>
66 #include <inet/ip_if.h>
67 #include <inet/ip_multi.h>
68 #include <inet/ip6.h>
69 #include <inet/ipnet.h>
70 #include <net/bpf.h>
71 #include <net/bpfdesc.h>
72 #include <net/dlt.h>
73 
74 static struct module_info ipnet_minfo = {
75 	1,		/* mi_idnum */
76 	"ipnet",	/* mi_idname */
77 	0,		/* mi_minpsz */
78 	INFPSZ,		/* mi_maxpsz */
79 	2048,		/* mi_hiwat */
80 	0		/* mi_lowat */
81 };
82 
83 /*
84  * List to hold static view of ipnetif_t's on the system. This is needed to
85  * avoid holding the lock protecting the avl tree of ipnetif's over the
86  * callback into the dev filesystem.
87  */
88 typedef struct ipnetif_cbdata {
89 	char		ic_ifname[LIFNAMSIZ];
90 	dev_t		ic_dev;
91 	list_node_t	ic_next;
92 } ipnetif_cbdata_t;
93 
/*
 * Address classification used by ipnet_accept().  Each value says how a
 * given ipnet_addrp_t relates to a single ipnet_t client stream.
 */
typedef enum {
	IPNETADDR_MYADDR,	/* address is on my ipnetif_t */
	IPNETADDR_MBCAST,	/* address is multicast or broadcast */
	IPNETADDR_UNKNOWN	/* address is neither of the above */
} ipnet_addrtype_t;
104 
105 /* Argument used for the ipnet_nicevent_taskq callback. */
106 typedef struct ipnet_nicevent_s {
107 	nic_event_t		ipne_event;
108 	net_handle_t		ipne_protocol;
109 	netstackid_t		ipne_stackid;
110 	uint64_t		ipne_ifindex;
111 	uint64_t		ipne_lifindex;
112 	char			ipne_ifname[LIFNAMSIZ];
113 } ipnet_nicevent_t;
114 
115 static dev_info_t	*ipnet_dip;
116 static major_t		ipnet_major;
117 static ddi_taskq_t	*ipnet_taskq;		/* taskq for packets */
118 static ddi_taskq_t	*ipnet_nicevent_taskq;	/* taskq for NIC events */
119 static id_space_t	*ipnet_minor_space;
120 static const int	IPNET_MINOR_LO = 1; 	/* minor number for /dev/lo0 */
121 static const int 	IPNET_MINOR_MIN = 2; 	/* start of dynamic minors */
122 static dl_info_ack_t	ipnet_infoack = IPNET_INFO_ACK_INIT;
123 static ipnet_acceptfn_t	ipnet_accept, ipnet_loaccept;
124 static bpf_itap_fn_t	ipnet_itap;
125 
126 static void	ipnet_input(mblk_t *);
127 static int	ipnet_wput(queue_t *, mblk_t *);
128 static int	ipnet_rsrv(queue_t *);
129 static int	ipnet_open(queue_t *, dev_t *, int, int, cred_t *);
130 static int	ipnet_close(queue_t *);
131 static void	ipnet_ioctl(queue_t *, mblk_t *);
132 static void	ipnet_iocdata(queue_t *, mblk_t *);
133 static void 	ipnet_wputnondata(queue_t *, mblk_t *);
134 static int	ipnet_attach(dev_info_t *, ddi_attach_cmd_t);
135 static int	ipnet_detach(dev_info_t *, ddi_detach_cmd_t);
136 static int	ipnet_devinfo(dev_info_t *, ddi_info_cmd_t, void *, void **);
137 static void	ipnet_inforeq(queue_t *q, mblk_t *mp);
138 static void	ipnet_bindreq(queue_t *q, mblk_t *mp);
139 static void	ipnet_unbindreq(queue_t *q, mblk_t *mp);
140 static void	ipnet_dlpromisconreq(queue_t *q, mblk_t *mp);
141 static void	ipnet_dlpromiscoffreq(queue_t *q, mblk_t *mp);
142 static int	ipnet_join_allmulti(ipnetif_t *, ipnet_stack_t *);
143 static void	ipnet_leave_allmulti(ipnetif_t *, ipnet_stack_t *);
144 static int	ipnet_nicevent_cb(hook_event_token_t, hook_data_t, void *);
145 static void	ipnet_nicevent_task(void *);
146 static ipnetif_t *ipnetif_create(const char *, uint64_t, ipnet_stack_t *,
147     uint64_t);
148 static void	ipnetif_remove(ipnetif_t *, ipnet_stack_t *);
149 static ipnetif_addr_t *ipnet_match_lif(ipnetif_t *, lif_if_t, boolean_t);
150 static ipnetif_t *ipnetif_getby_index(uint64_t, ipnet_stack_t *);
151 static ipnetif_t *ipnetif_getby_dev(dev_t, ipnet_stack_t *);
152 static boolean_t ipnetif_in_zone(ipnetif_t *, zoneid_t, ipnet_stack_t *);
153 static void	ipnetif_zonecheck(ipnetif_t *, ipnet_stack_t *);
154 static int	ipnet_populate_if(net_handle_t, ipnet_stack_t *, boolean_t);
155 static int 	ipnetif_compare_name(const void *, const void *);
156 static int 	ipnetif_compare_name_zone(const void *, const void *);
157 static int 	ipnetif_compare_index(const void *, const void *);
158 static void	ipnet_add_ifaddr(uint64_t, ipnetif_t *, net_handle_t);
159 static void	ipnet_delete_ifaddr(ipnetif_addr_t *, ipnetif_t *, boolean_t);
160 static void	ipnetif_refhold(ipnetif_t *);
161 static void	ipnetif_refrele(ipnetif_t *);
162 static void	ipnet_walkers_inc(ipnet_stack_t *);
163 static void	ipnet_walkers_dec(ipnet_stack_t *);
164 static void	ipnet_register_netihook(ipnet_stack_t *);
165 static void	*ipnet_stack_init(netstackid_t, netstack_t *);
166 static void	ipnet_stack_fini(netstackid_t, void *);
167 static void	ipnet_dispatch(void *);
168 static int	ipobs_bounce_func(hook_event_token_t, hook_data_t, void *);
169 static int	ipnet_bpf_bounce(hook_event_token_t, hook_data_t, void *);
170 static ipnetif_t *ipnetif_clone_create(ipnetif_t *, zoneid_t);
171 static void	ipnetif_clone_release(ipnetif_t *);
172 
173 static struct qinit ipnet_rinit = {
174 	NULL,		/* qi_putp */
175 	ipnet_rsrv,	/* qi_srvp */
176 	ipnet_open,	/* qi_qopen */
177 	ipnet_close,	/* qi_qclose */
178 	NULL,		/* qi_qadmin */
179 	&ipnet_minfo,	/* qi_minfo */
180 };
181 
182 static struct qinit ipnet_winit = {
183 	ipnet_wput,	/* qi_putp */
184 	NULL,		/* qi_srvp */
185 	NULL,		/* qi_qopen */
186 	NULL,		/* qi_qclose */
187 	NULL,		/* qi_qadmin */
188 	&ipnet_minfo,	/* qi_minfo */
189 };
190 
191 static struct streamtab ipnet_info = {
192 	&ipnet_rinit, &ipnet_winit
193 };
194 
195 DDI_DEFINE_STREAM_OPS(ipnet_ops, nulldev, nulldev, ipnet_attach,
196     ipnet_detach, nodev, ipnet_devinfo, D_MP | D_MTPERMOD, &ipnet_info,
197     ddi_quiesce_not_supported);
198 
199 static struct modldrv modldrv = {
200 	&mod_driverops,
201 	"STREAMS ipnet driver",
202 	&ipnet_ops
203 };
204 
205 static struct modlinkage modlinkage = {
206 	MODREV_1, &modldrv, NULL
207 };
208 
209 /*
210  * This structure contains the template data (names and type) that is
211  * copied, in bulk, into the new kstats structure created by net_kstat_create.
212  * No actual statistical information is stored in this instance of the
213  * ipnet_kstats_t structure.
214  */
215 static ipnet_kstats_t stats_template = {
216 	{ "duplicationFail",	KSTAT_DATA_UINT64 },
217 	{ "dispatchOk",		KSTAT_DATA_UINT64 },
218 	{ "dispatchFail",	KSTAT_DATA_UINT64 },
219 	{ "dispatchHeaderDrop",	KSTAT_DATA_UINT64 },
220 	{ "dispatchDupDrop",	KSTAT_DATA_UINT64 },
221 	{ "dispatchDeliver",	KSTAT_DATA_UINT64 },
222 	{ "acceptOk",		KSTAT_DATA_UINT64 },
223 	{ "acceptFail",		KSTAT_DATA_UINT64 }
224 };
225 
226 /*
227  * Walk the list of physical interfaces on the machine, for each
228  * interface create a new ipnetif_t and add any addresses to it. We
229  * need to do the walk twice, once for IPv4 and once for IPv6.
230  *
231  * The interfaces are destroyed as part of ipnet_stack_fini() for each
232  * stack.  Note that we cannot do this initialization in
233  * ipnet_stack_init(), since ipnet_stack_init() cannot fail.
234  */
235 static int
ipnetif_init(void)236 ipnetif_init(void)
237 {
238 	netstack_handle_t	nh;
239 	netstack_t		*ns;
240 	ipnet_stack_t		*ips;
241 	int			ret = 0;
242 
243 	netstack_next_init(&nh);
244 	while ((ns = netstack_next(&nh)) != NULL) {
245 		ips = ns->netstack_ipnet;
246 		if ((ret = ipnet_populate_if(ips->ips_ndv4, ips, B_FALSE)) == 0)
247 			ret = ipnet_populate_if(ips->ips_ndv6, ips, B_TRUE);
248 		netstack_rele(ns);
249 		if (ret != 0)
250 			break;
251 	}
252 	netstack_next_fini(&nh);
253 	return (ret);
254 }
255 
256 /*
257  * Standard module entry points.
258  */
259 int
_init(void)260 _init(void)
261 {
262 	int		ret;
263 	boolean_t	netstack_registered = B_FALSE;
264 
265 	if ((ipnet_major = ddi_name_to_major("ipnet")) == (major_t)-1)
266 		return (ENODEV);
267 	ipnet_minor_space = id_space_create("ipnet_minor_space",
268 	    IPNET_MINOR_MIN, MAXMIN32);
269 
270 	/*
271 	 * We call ddi_taskq_create() with nthread == 1 to ensure in-order
272 	 * delivery of packets to clients.  Note that we need to create the
273 	 * taskqs before calling netstack_register() since ipnet_stack_init()
274 	 * registers callbacks that use 'em.
275 	 */
276 	ipnet_taskq = ddi_taskq_create(NULL, "ipnet", 1, TASKQ_DEFAULTPRI, 0);
277 	ipnet_nicevent_taskq = ddi_taskq_create(NULL, "ipnet_nic_event_queue",
278 	    1, TASKQ_DEFAULTPRI, 0);
279 	if (ipnet_taskq == NULL || ipnet_nicevent_taskq == NULL) {
280 		ret = ENOMEM;
281 		goto done;
282 	}
283 
284 	netstack_register(NS_IPNET, ipnet_stack_init, NULL, ipnet_stack_fini);
285 	netstack_registered = B_TRUE;
286 
287 	if ((ret = ipnetif_init()) == 0)
288 		ret = mod_install(&modlinkage);
289 done:
290 	if (ret != 0) {
291 		if (ipnet_taskq != NULL)
292 			ddi_taskq_destroy(ipnet_taskq);
293 		if (ipnet_nicevent_taskq != NULL)
294 			ddi_taskq_destroy(ipnet_nicevent_taskq);
295 		if (netstack_registered)
296 			netstack_unregister(NS_IPNET);
297 		id_space_destroy(ipnet_minor_space);
298 	}
299 	return (ret);
300 }
301 
302 int
_fini(void)303 _fini(void)
304 {
305 	int	err;
306 
307 	if ((err = mod_remove(&modlinkage)) != 0)
308 		return (err);
309 
310 	netstack_unregister(NS_IPNET);
311 	ddi_taskq_destroy(ipnet_nicevent_taskq);
312 	ddi_taskq_destroy(ipnet_taskq);
313 	id_space_destroy(ipnet_minor_space);
314 	return (0);
315 }
316 
317 int
_info(struct modinfo * modinfop)318 _info(struct modinfo *modinfop)
319 {
320 	return (mod_info(&modlinkage, modinfop));
321 }
322 
323 static void
ipnet_register_netihook(ipnet_stack_t * ips)324 ipnet_register_netihook(ipnet_stack_t *ips)
325 {
326 	int		ret;
327 	zoneid_t	zoneid;
328 	netid_t		netid;
329 
330 	HOOK_INIT(ips->ips_nicevents, ipnet_nicevent_cb, "ipnet_nicevents",
331 	    ips);
332 
333 	/*
334 	 * It is possible for an exclusive stack to be in the process of
335 	 * shutting down here, and the netid and protocol lookups could fail
336 	 * in that case.
337 	 */
338 	zoneid = netstackid_to_zoneid(ips->ips_netstack->netstack_stackid);
339 	if ((netid = net_zoneidtonetid(zoneid)) == -1)
340 		return;
341 
342 	if ((ips->ips_ndv4 = net_protocol_lookup(netid, NHF_INET)) != NULL) {
343 		if ((ret = net_hook_register(ips->ips_ndv4, NH_NIC_EVENTS,
344 		    ips->ips_nicevents)) != 0) {
345 			VERIFY(net_protocol_release(ips->ips_ndv4) == 0);
346 			ips->ips_ndv4 = NULL;
347 			cmn_err(CE_WARN, "unable to register IPv4 netinfo hooks"
348 			    " in zone %d: %d", zoneid, ret);
349 		}
350 	}
351 	if ((ips->ips_ndv6 = net_protocol_lookup(netid, NHF_INET6)) != NULL) {
352 		if ((ret = net_hook_register(ips->ips_ndv6, NH_NIC_EVENTS,
353 		    ips->ips_nicevents)) != 0) {
354 			VERIFY(net_protocol_release(ips->ips_ndv6) == 0);
355 			ips->ips_ndv6 = NULL;
356 			cmn_err(CE_WARN, "unable to register IPv6 netinfo hooks"
357 			    " in zone %d: %d", zoneid, ret);
358 		}
359 	}
360 
361 	/*
362 	 * Create a local set of kstats for each zone.
363 	 */
364 	ips->ips_kstatp = net_kstat_create(netid, "ipnet", 0, "ipnet_stats",
365 	    "misc", KSTAT_TYPE_NAMED,
366 	    sizeof (ipnet_kstats_t) / sizeof (kstat_named_t), 0);
367 	if (ips->ips_kstatp != NULL) {
368 		bcopy(&stats_template, &ips->ips_stats,
369 		    sizeof (ips->ips_stats));
370 		ips->ips_kstatp->ks_data = &ips->ips_stats;
371 		ips->ips_kstatp->ks_private =
372 		    (void *)(uintptr_t)ips->ips_netstack->netstack_stackid;
373 		kstat_install(ips->ips_kstatp);
374 	} else {
375 		cmn_err(CE_WARN, "net_kstat_create(%s,%s,%s) failed",
376 		    "ipnet", "ipnet_stats", "misc");
377 	}
378 }
379 
380 /*
381  * This function is called on attach to build an initial view of the
382  * interfaces on the system. It will be called once for IPv4 and once
383  * for IPv6, although there is only one ipnet interface for both IPv4
384  * and IPv6 there are separate address lists.
385  */
386 static int
ipnet_populate_if(net_handle_t nd,ipnet_stack_t * ips,boolean_t isv6)387 ipnet_populate_if(net_handle_t nd, ipnet_stack_t *ips, boolean_t isv6)
388 {
389 	phy_if_t	phyif;
390 	lif_if_t	lif;
391 	ipnetif_t	*ipnetif;
392 	char		name[LIFNAMSIZ];
393 	boolean_t	new_if = B_FALSE;
394 	uint64_t	ifflags;
395 	int		ret = 0;
396 
397 	/*
398 	 * If ipnet_register_netihook() was unable to initialize this
399 	 * stack's net_handle_t, then we cannot populate any interface
400 	 * information.  This usually happens when we attempted to
401 	 * grab a net_handle_t as a stack was shutting down.  We don't
402 	 * want to fail the entire _init() operation because of a
403 	 * stack shutdown (other stacks will continue to work just
404 	 * fine), so we silently return success here.
405 	 */
406 	if (nd == NULL)
407 		return (0);
408 
409 	/*
410 	 * Make sure we're not processing NIC events during the
411 	 * population of our interfaces and address lists.
412 	 */
413 	mutex_enter(&ips->ips_event_lock);
414 
415 	for (phyif = net_phygetnext(nd, 0); phyif != 0;
416 	    phyif = net_phygetnext(nd, phyif)) {
417 		if (net_getifname(nd, phyif, name, LIFNAMSIZ) != 0)
418 			continue;
419 		ifflags =  0;
420 		(void) net_getlifflags(nd, phyif, 0, &ifflags);
421 		if ((ipnetif = ipnetif_getby_index(phyif, ips)) == NULL) {
422 			ipnetif = ipnetif_create(name, phyif, ips, ifflags);
423 			if (ipnetif == NULL) {
424 				ret = ENOMEM;
425 				goto done;
426 			}
427 			new_if = B_TRUE;
428 		}
429 		ipnetif->if_flags |=
430 		    isv6 ? IPNETIF_IPV6PLUMBED : IPNETIF_IPV4PLUMBED;
431 
432 		for (lif = net_lifgetnext(nd, phyif, 0); lif != 0;
433 		    lif = net_lifgetnext(nd, phyif, lif)) {
434 			/*
435 			 * Skip addresses that aren't up.  We'll add
436 			 * them when we receive an NE_LIF_UP event.
437 			 */
438 			if (net_getlifflags(nd, phyif, lif, &ifflags) != 0 ||
439 			    !(ifflags & IFF_UP))
440 				continue;
441 			/* Don't add it if we already have it. */
442 			if (ipnet_match_lif(ipnetif, lif, isv6) != NULL)
443 				continue;
444 			ipnet_add_ifaddr(lif, ipnetif, nd);
445 		}
446 		if (!new_if)
447 			ipnetif_refrele(ipnetif);
448 	}
449 
450 done:
451 	mutex_exit(&ips->ips_event_lock);
452 	return (ret);
453 }
454 
455 static int
ipnet_attach(dev_info_t * dip,ddi_attach_cmd_t cmd)456 ipnet_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
457 {
458 	if (cmd != DDI_ATTACH)
459 		return (DDI_FAILURE);
460 
461 	if (ddi_create_minor_node(dip, "lo0", S_IFCHR, IPNET_MINOR_LO,
462 	    DDI_PSEUDO, 0) == DDI_FAILURE)
463 		return (DDI_FAILURE);
464 
465 	ipnet_dip = dip;
466 	return (DDI_SUCCESS);
467 }
468 
469 static int
ipnet_detach(dev_info_t * dip,ddi_detach_cmd_t cmd)470 ipnet_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
471 {
472 	if (cmd != DDI_DETACH)
473 		return (DDI_FAILURE);
474 
475 	ASSERT(dip == ipnet_dip);
476 	ddi_remove_minor_node(ipnet_dip, NULL);
477 	ipnet_dip = NULL;
478 	return (DDI_SUCCESS);
479 }
480 
481 /* ARGSUSED */
482 static int
ipnet_devinfo(dev_info_t * dip,ddi_info_cmd_t infocmd,void * arg,void ** result)483 ipnet_devinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
484 {
485 	int	error = DDI_FAILURE;
486 
487 	switch (infocmd) {
488 	case DDI_INFO_DEVT2INSTANCE:
489 		*result = (void *)0;
490 		error = DDI_SUCCESS;
491 		break;
492 	case DDI_INFO_DEVT2DEVINFO:
493 		if (ipnet_dip != NULL) {
494 			*result = ipnet_dip;
495 			error = DDI_SUCCESS;
496 		}
497 		break;
498 	}
499 	return (error);
500 }
501 
502 /* ARGSUSED */
503 static int
ipnet_open(queue_t * rq,dev_t * dev,int oflag,int sflag,cred_t * crp)504 ipnet_open(queue_t *rq, dev_t *dev, int oflag, int sflag, cred_t *crp)
505 {
506 	ipnet_t		*ipnet;
507 	netstack_t	*ns = NULL;
508 	ipnet_stack_t	*ips;
509 	int		err = 0;
510 	zoneid_t	zoneid = crgetzoneid(crp);
511 
512 	/*
513 	 * If the system is labeled, only the global zone is allowed to open
514 	 * IP observability nodes.
515 	 */
516 	if (is_system_labeled() && zoneid != GLOBAL_ZONEID)
517 		return (EACCES);
518 
519 	/* We don't support open as a module */
520 	if (sflag & MODOPEN)
521 		return (ENOTSUP);
522 
523 	/* This driver is self-cloning, we don't support re-open. */
524 	if (rq->q_ptr != NULL)
525 		return (EBUSY);
526 
527 	if ((ipnet = kmem_zalloc(sizeof (*ipnet), KM_NOSLEEP)) == NULL)
528 		return (ENOMEM);
529 
530 	VERIFY((ns = netstack_find_by_cred(crp)) != NULL);
531 	ips = ns->netstack_ipnet;
532 
533 	rq->q_ptr = WR(rq)->q_ptr = ipnet;
534 	ipnet->ipnet_rq = rq;
535 	ipnet->ipnet_minor = (minor_t)id_alloc(ipnet_minor_space);
536 	ipnet->ipnet_zoneid = zoneid;
537 	ipnet->ipnet_dlstate = DL_UNBOUND;
538 	ipnet->ipnet_ns = ns;
539 
540 	/*
541 	 * We need to hold ips_event_lock here as any NE_LIF_DOWN events need
542 	 * to be processed after ipnet_if is set and the ipnet_t has been
543 	 * inserted in the ips_str_list.
544 	 */
545 	mutex_enter(&ips->ips_event_lock);
546 	if (getminor(*dev) == IPNET_MINOR_LO) {
547 		ipnet->ipnet_flags |= IPNET_LOMODE;
548 		ipnet->ipnet_acceptfn = ipnet_loaccept;
549 	} else {
550 		ipnet->ipnet_acceptfn = ipnet_accept;
551 		ipnet->ipnet_if = ipnetif_getby_dev(*dev, ips);
552 		if (ipnet->ipnet_if == NULL ||
553 		    !ipnetif_in_zone(ipnet->ipnet_if, zoneid, ips)) {
554 			err = ENODEV;
555 			goto done;
556 		}
557 	}
558 
559 	mutex_enter(&ips->ips_walkers_lock);
560 	while (ips->ips_walkers_cnt != 0)
561 		cv_wait(&ips->ips_walkers_cv, &ips->ips_walkers_lock);
562 	list_insert_head(&ips->ips_str_list, ipnet);
563 	*dev = makedevice(getmajor(*dev), ipnet->ipnet_minor);
564 	qprocson(rq);
565 
566 	/*
567 	 * Only register our callback if we're the first open client; we call
568 	 * unregister in close() for the last open client.
569 	 */
570 	if (list_head(&ips->ips_str_list) == list_tail(&ips->ips_str_list))
571 		ips->ips_hook = ipobs_register_hook(ns, ipnet_input);
572 	mutex_exit(&ips->ips_walkers_lock);
573 
574 done:
575 	mutex_exit(&ips->ips_event_lock);
576 	if (err != 0) {
577 		netstack_rele(ns);
578 		id_free(ipnet_minor_space, ipnet->ipnet_minor);
579 		if (ipnet->ipnet_if != NULL)
580 			ipnetif_refrele(ipnet->ipnet_if);
581 		kmem_free(ipnet, sizeof (*ipnet));
582 	}
583 	return (err);
584 }
585 
586 static int
ipnet_close(queue_t * rq)587 ipnet_close(queue_t *rq)
588 {
589 	ipnet_t		*ipnet = rq->q_ptr;
590 	ipnet_stack_t	*ips = ipnet->ipnet_ns->netstack_ipnet;
591 
592 	if (ipnet->ipnet_flags & IPNET_PROMISC_PHYS)
593 		ipnet_leave_allmulti(ipnet->ipnet_if, ips);
594 	if (ipnet->ipnet_flags & IPNET_PROMISC_MULTI)
595 		ipnet_leave_allmulti(ipnet->ipnet_if, ips);
596 
597 	mutex_enter(&ips->ips_walkers_lock);
598 	while (ips->ips_walkers_cnt != 0)
599 		cv_wait(&ips->ips_walkers_cv, &ips->ips_walkers_lock);
600 
601 	qprocsoff(rq);
602 
603 	list_remove(&ips->ips_str_list, ipnet);
604 	if (ipnet->ipnet_if != NULL)
605 		ipnetif_refrele(ipnet->ipnet_if);
606 	id_free(ipnet_minor_space, ipnet->ipnet_minor);
607 
608 	if (list_is_empty(&ips->ips_str_list)) {
609 		ipobs_unregister_hook(ips->ips_netstack, ips->ips_hook);
610 		ips->ips_hook = NULL;
611 	}
612 
613 	kmem_free(ipnet, sizeof (*ipnet));
614 
615 	mutex_exit(&ips->ips_walkers_lock);
616 	netstack_rele(ips->ips_netstack);
617 	return (0);
618 }
619 
620 static int
ipnet_wput(queue_t * q,mblk_t * mp)621 ipnet_wput(queue_t *q, mblk_t *mp)
622 {
623 	switch (mp->b_datap->db_type) {
624 	case M_FLUSH:
625 		if (*mp->b_rptr & FLUSHW) {
626 			flushq(q, FLUSHDATA);
627 			*mp->b_rptr &= ~FLUSHW;
628 		}
629 		if (*mp->b_rptr & FLUSHR)
630 			qreply(q, mp);
631 		else
632 			freemsg(mp);
633 		break;
634 	case M_PROTO:
635 	case M_PCPROTO:
636 		ipnet_wputnondata(q, mp);
637 		break;
638 	case M_IOCTL:
639 		ipnet_ioctl(q, mp);
640 		break;
641 	case M_IOCDATA:
642 		ipnet_iocdata(q, mp);
643 		break;
644 	default:
645 		freemsg(mp);
646 		break;
647 	}
648 	return (0);
649 }
650 
651 static int
ipnet_rsrv(queue_t * q)652 ipnet_rsrv(queue_t *q)
653 {
654 	mblk_t	*mp;
655 
656 	while ((mp = getq(q)) != NULL) {
657 		ASSERT(DB_TYPE(mp) == M_DATA);
658 		if (canputnext(q)) {
659 			putnext(q, mp);
660 		} else {
661 			(void) putbq(q, mp);
662 			break;
663 		}
664 	}
665 	return (0);
666 }
667 
668 static void
ipnet_ioctl(queue_t * q,mblk_t * mp)669 ipnet_ioctl(queue_t *q, mblk_t *mp)
670 {
671 	struct iocblk	*iocp = (struct iocblk *)mp->b_rptr;
672 
673 	switch (iocp->ioc_cmd) {
674 	case DLIOCRAW:
675 		miocack(q, mp, 0, 0);
676 		break;
677 	case DLIOCIPNETINFO:
678 		if (iocp->ioc_count == TRANSPARENT) {
679 			mcopyin(mp, NULL, sizeof (uint_t), NULL);
680 			qreply(q, mp);
681 			break;
682 		}
683 		/* Fallthrough, we don't support I_STR with DLIOCIPNETINFO. */
684 	default:
685 		miocnak(q, mp, 0, EINVAL);
686 		break;
687 	}
688 }
689 
690 static void
ipnet_iocdata(queue_t * q,mblk_t * mp)691 ipnet_iocdata(queue_t *q, mblk_t *mp)
692 {
693 	struct iocblk	*iocp = (struct iocblk *)mp->b_rptr;
694 	ipnet_t	*ipnet = q->q_ptr;
695 
696 	switch (iocp->ioc_cmd) {
697 	case DLIOCIPNETINFO:
698 		if (*(int *)mp->b_cont->b_rptr == 1)
699 			ipnet->ipnet_flags |= IPNET_INFO;
700 		else if (*(int *)mp->b_cont->b_rptr == 0)
701 			ipnet->ipnet_flags &= ~IPNET_INFO;
702 		else
703 			goto iocnak;
704 		miocack(q, mp, 0, DL_IPNETINFO_VERSION);
705 		break;
706 	default:
707 iocnak:
708 		miocnak(q, mp, 0, EINVAL);
709 		break;
710 	}
711 }
712 
713 static void
ipnet_wputnondata(queue_t * q,mblk_t * mp)714 ipnet_wputnondata(queue_t *q, mblk_t *mp)
715 {
716 	union DL_primitives	*dlp = (union DL_primitives *)mp->b_rptr;
717 	t_uscalar_t		prim = dlp->dl_primitive;
718 
719 	switch (prim) {
720 	case DL_INFO_REQ:
721 		ipnet_inforeq(q, mp);
722 		break;
723 	case DL_UNBIND_REQ:
724 		ipnet_unbindreq(q, mp);
725 		break;
726 	case DL_BIND_REQ:
727 		ipnet_bindreq(q, mp);
728 		break;
729 	case DL_PROMISCON_REQ:
730 		ipnet_dlpromisconreq(q, mp);
731 		break;
732 	case DL_PROMISCOFF_REQ:
733 		ipnet_dlpromiscoffreq(q, mp);
734 		break;
735 	case DL_UNITDATA_REQ:
736 	case DL_DETACH_REQ:
737 	case DL_PHYS_ADDR_REQ:
738 	case DL_SET_PHYS_ADDR_REQ:
739 	case DL_ENABMULTI_REQ:
740 	case DL_DISABMULTI_REQ:
741 	case DL_ATTACH_REQ:
742 		dlerrorack(q, mp, prim, DL_UNSUPPORTED, 0);
743 		break;
744 	default:
745 		dlerrorack(q, mp, prim, DL_BADPRIM, 0);
746 		break;
747 	}
748 }
749 
750 static void
ipnet_inforeq(queue_t * q,mblk_t * mp)751 ipnet_inforeq(queue_t *q, mblk_t *mp)
752 {
753 	dl_info_ack_t	*dlip;
754 	size_t		size = sizeof (dl_info_ack_t) + sizeof (ushort_t);
755 
756 	if (MBLKL(mp) < DL_INFO_REQ_SIZE) {
757 		dlerrorack(q, mp, DL_INFO_REQ, DL_BADPRIM, 0);
758 		return;
759 	}
760 
761 	if ((mp = mexchange(q, mp, size, M_PCPROTO, DL_INFO_ACK)) == NULL)
762 		return;
763 
764 	dlip = (dl_info_ack_t *)mp->b_rptr;
765 	*dlip = ipnet_infoack;
766 	qreply(q, mp);
767 }
768 
769 static void
ipnet_bindreq(queue_t * q,mblk_t * mp)770 ipnet_bindreq(queue_t *q, mblk_t *mp)
771 {
772 	union DL_primitives	*dlp = (union DL_primitives *)mp->b_rptr;
773 	ipnet_t			*ipnet = q->q_ptr;
774 
775 	if (MBLKL(mp) < DL_BIND_REQ_SIZE) {
776 		dlerrorack(q, mp, DL_BIND_REQ, DL_BADPRIM, 0);
777 		return;
778 	}
779 
780 	switch (dlp->bind_req.dl_sap) {
781 	case 0 :
782 		ipnet->ipnet_family = AF_UNSPEC;
783 		break;
784 	case IPV4_VERSION :
785 		ipnet->ipnet_family = AF_INET;
786 		break;
787 	case IPV6_VERSION :
788 		ipnet->ipnet_family = AF_INET6;
789 		break;
790 	default :
791 		dlerrorack(q, mp, DL_BIND_REQ, DL_BADSAP, 0);
792 		return;
793 		/*NOTREACHED*/
794 	}
795 
796 	ipnet->ipnet_dlstate = DL_IDLE;
797 	dlbindack(q, mp, dlp->bind_req.dl_sap, 0, 0, 0, 0);
798 }
799 
800 static void
ipnet_unbindreq(queue_t * q,mblk_t * mp)801 ipnet_unbindreq(queue_t *q, mblk_t *mp)
802 {
803 	ipnet_t	*ipnet = q->q_ptr;
804 
805 	if (MBLKL(mp) < DL_UNBIND_REQ_SIZE) {
806 		dlerrorack(q, mp, DL_UNBIND_REQ, DL_BADPRIM, 0);
807 		return;
808 	}
809 
810 	if (ipnet->ipnet_dlstate != DL_IDLE) {
811 		dlerrorack(q, mp, DL_UNBIND_REQ, DL_OUTSTATE, 0);
812 	} else {
813 		ipnet->ipnet_dlstate = DL_UNBOUND;
814 		ipnet->ipnet_family = AF_UNSPEC;
815 		dlokack(q, mp, DL_UNBIND_REQ);
816 	}
817 }
818 
819 static void
ipnet_dlpromisconreq(queue_t * q,mblk_t * mp)820 ipnet_dlpromisconreq(queue_t *q, mblk_t *mp)
821 {
822 	ipnet_t		*ipnet = q->q_ptr;
823 	t_uscalar_t	level;
824 	int		err;
825 
826 	if (MBLKL(mp) < DL_PROMISCON_REQ_SIZE) {
827 		dlerrorack(q, mp, DL_PROMISCON_REQ, DL_BADPRIM, 0);
828 		return;
829 	}
830 
831 	if (ipnet->ipnet_flags & IPNET_LOMODE) {
832 		dlokack(q, mp, DL_PROMISCON_REQ);
833 		return;
834 	}
835 
836 	level = ((dl_promiscon_req_t *)mp->b_rptr)->dl_level;
837 	if (level == DL_PROMISC_PHYS || level == DL_PROMISC_MULTI) {
838 		if ((err = ipnet_join_allmulti(ipnet->ipnet_if,
839 		    ipnet->ipnet_ns->netstack_ipnet)) != 0) {
840 			dlerrorack(q, mp, DL_PROMISCON_REQ, DL_SYSERR, err);
841 			return;
842 		}
843 	}
844 
845 	switch (level) {
846 	case DL_PROMISC_PHYS:
847 		ipnet->ipnet_flags |= IPNET_PROMISC_PHYS;
848 		break;
849 	case DL_PROMISC_SAP:
850 		ipnet->ipnet_flags |= IPNET_PROMISC_SAP;
851 		break;
852 	case DL_PROMISC_MULTI:
853 		ipnet->ipnet_flags |= IPNET_PROMISC_MULTI;
854 		break;
855 	default:
856 		dlerrorack(q, mp, DL_PROMISCON_REQ, DL_BADPRIM, 0);
857 		return;
858 	}
859 
860 	dlokack(q, mp, DL_PROMISCON_REQ);
861 }
862 
863 static void
ipnet_dlpromiscoffreq(queue_t * q,mblk_t * mp)864 ipnet_dlpromiscoffreq(queue_t *q, mblk_t *mp)
865 {
866 	ipnet_t		*ipnet = q->q_ptr;
867 	t_uscalar_t	level;
868 	uint16_t	orig_ipnet_flags = ipnet->ipnet_flags;
869 
870 	if (MBLKL(mp) < DL_PROMISCOFF_REQ_SIZE) {
871 		dlerrorack(q, mp, DL_PROMISCOFF_REQ, DL_BADPRIM, 0);
872 		return;
873 	}
874 
875 	if (ipnet->ipnet_flags & IPNET_LOMODE) {
876 		dlokack(q, mp, DL_PROMISCOFF_REQ);
877 		return;
878 	}
879 
880 	level = ((dl_promiscon_req_t *)mp->b_rptr)->dl_level;
881 	switch (level) {
882 	case DL_PROMISC_PHYS:
883 		if (ipnet->ipnet_flags & IPNET_PROMISC_PHYS)
884 			ipnet->ipnet_flags &= ~IPNET_PROMISC_PHYS;
885 		break;
886 	case DL_PROMISC_SAP:
887 		if (ipnet->ipnet_flags & IPNET_PROMISC_SAP)
888 			ipnet->ipnet_flags &= ~IPNET_PROMISC_SAP;
889 		break;
890 	case DL_PROMISC_MULTI:
891 		if (ipnet->ipnet_flags & IPNET_PROMISC_MULTI)
892 			ipnet->ipnet_flags &= ~IPNET_PROMISC_MULTI;
893 		break;
894 	default:
895 		dlerrorack(q, mp, DL_PROMISCOFF_REQ, DL_BADPRIM, 0);
896 		return;
897 	}
898 
899 	if (orig_ipnet_flags == ipnet->ipnet_flags) {
900 		dlerrorack(q, mp, DL_PROMISCOFF_REQ, DL_NOTENAB, 0);
901 		return;
902 	}
903 
904 	if (level == DL_PROMISC_PHYS || level == DL_PROMISC_MULTI) {
905 		ipnet_leave_allmulti(ipnet->ipnet_if,
906 		    ipnet->ipnet_ns->netstack_ipnet);
907 	}
908 
909 	dlokack(q, mp, DL_PROMISCOFF_REQ);
910 }
911 
912 static int
ipnet_join_allmulti(ipnetif_t * ipnetif,ipnet_stack_t * ips)913 ipnet_join_allmulti(ipnetif_t *ipnetif, ipnet_stack_t *ips)
914 {
915 	int		err = 0;
916 	ip_stack_t	*ipst = ips->ips_netstack->netstack_ip;
917 	uint64_t	index = ipnetif->if_index;
918 
919 	mutex_enter(&ips->ips_event_lock);
920 	if (ipnetif->if_multicnt == 0) {
921 		ASSERT((ipnetif->if_flags &
922 		    (IPNETIF_IPV4ALLMULTI | IPNETIF_IPV6ALLMULTI)) == 0);
923 		if (ipnetif->if_flags & IPNETIF_IPV4PLUMBED) {
924 			err = ip_join_allmulti(index, B_FALSE, ipst);
925 			if (err != 0)
926 				goto done;
927 			ipnetif->if_flags |= IPNETIF_IPV4ALLMULTI;
928 		}
929 		if (ipnetif->if_flags & IPNETIF_IPV6PLUMBED) {
930 			err = ip_join_allmulti(index, B_TRUE, ipst);
931 			if (err != 0 &&
932 			    (ipnetif->if_flags & IPNETIF_IPV4ALLMULTI)) {
933 				(void) ip_leave_allmulti(index, B_FALSE, ipst);
934 				ipnetif->if_flags &= ~IPNETIF_IPV4ALLMULTI;
935 				goto done;
936 			}
937 			ipnetif->if_flags |= IPNETIF_IPV6ALLMULTI;
938 		}
939 	}
940 	ipnetif->if_multicnt++;
941 
942 done:
943 	mutex_exit(&ips->ips_event_lock);
944 	return (err);
945 }
946 
947 static void
ipnet_leave_allmulti(ipnetif_t * ipnetif,ipnet_stack_t * ips)948 ipnet_leave_allmulti(ipnetif_t *ipnetif, ipnet_stack_t *ips)
949 {
950 	int		err;
951 	ip_stack_t	*ipst = ips->ips_netstack->netstack_ip;
952 	uint64_t	index = ipnetif->if_index;
953 
954 	mutex_enter(&ips->ips_event_lock);
955 	ASSERT(ipnetif->if_multicnt != 0);
956 	if (--ipnetif->if_multicnt == 0) {
957 		if (ipnetif->if_flags & IPNETIF_IPV4ALLMULTI) {
958 			err = ip_leave_allmulti(index, B_FALSE, ipst);
959 			ASSERT(err == 0 || err == ENODEV);
960 			ipnetif->if_flags &= ~IPNETIF_IPV4ALLMULTI;
961 		}
962 		if (ipnetif->if_flags & IPNETIF_IPV6ALLMULTI) {
963 			err = ip_leave_allmulti(index, B_TRUE, ipst);
964 			ASSERT(err == 0 || err == ENODEV);
965 			ipnetif->if_flags &= ~IPNETIF_IPV6ALLMULTI;
966 		}
967 	}
968 	mutex_exit(&ips->ips_event_lock);
969 }
970 
971 /*
972  * Allocate a new mblk_t and put a dl_ipnetinfo_t in it.
973  * The structure it copies the header information from,
974  * hook_pkt_observe_t, is constructed using network byte
975  * order in ipobs_hook(), so there is no conversion here.
976  */
977 static mblk_t *
ipnet_addheader(hook_pkt_observe_t * hdr,mblk_t * mp)978 ipnet_addheader(hook_pkt_observe_t *hdr, mblk_t *mp)
979 {
980 	mblk_t		*dlhdr;
981 	dl_ipnetinfo_t	*dl;
982 
983 	if ((dlhdr = allocb(sizeof (dl_ipnetinfo_t), BPRI_HI)) == NULL) {
984 		freemsg(mp);
985 		return (NULL);
986 	}
987 	dl = (dl_ipnetinfo_t *)dlhdr->b_rptr;
988 	dl->dli_version = DL_IPNETINFO_VERSION;
989 	dl->dli_family = hdr->hpo_family;
990 	dl->dli_htype = hdr->hpo_htype;
991 	dl->dli_pktlen = hdr->hpo_pktlen;
992 	dl->dli_ifindex = hdr->hpo_ifindex;
993 	dl->dli_grifindex = hdr->hpo_grifindex;
994 	dl->dli_zsrc = hdr->hpo_zsrc;
995 	dl->dli_zdst = hdr->hpo_zdst;
996 	dlhdr->b_wptr += sizeof (*dl);
997 	dlhdr->b_cont = mp;
998 
999 	return (dlhdr);
1000 }
1001 
1002 static ipnet_addrtype_t
ipnet_get_addrtype(ipnet_t * ipnet,ipnet_addrp_t * addr)1003 ipnet_get_addrtype(ipnet_t *ipnet, ipnet_addrp_t *addr)
1004 {
1005 	list_t			*list;
1006 	ipnetif_t		*ipnetif = ipnet->ipnet_if;
1007 	ipnetif_addr_t		*ifaddr;
1008 	ipnet_addrtype_t	addrtype = IPNETADDR_UNKNOWN;
1009 
1010 	/* First check if the address is multicast or limited broadcast. */
1011 	switch (addr->iap_family) {
1012 	case AF_INET:
1013 		if (CLASSD(*(addr->iap_addr4)) ||
1014 		    *(addr->iap_addr4) == INADDR_BROADCAST)
1015 			return (IPNETADDR_MBCAST);
1016 		break;
1017 	case AF_INET6:
1018 		if (IN6_IS_ADDR_MULTICAST(addr->iap_addr6))
1019 			return (IPNETADDR_MBCAST);
1020 		break;
1021 	}
1022 
1023 	/*
1024 	 * Walk the address list to see if the address belongs to our
1025 	 * interface or is one of our subnet broadcast addresses.
1026 	 */
1027 	mutex_enter(&ipnetif->if_addr_lock);
1028 	list = (addr->iap_family == AF_INET) ?
1029 	    &ipnetif->if_ip4addr_list : &ipnetif->if_ip6addr_list;
1030 	for (ifaddr = list_head(list);
1031 	    ifaddr != NULL && addrtype == IPNETADDR_UNKNOWN;
1032 	    ifaddr = list_next(list, ifaddr)) {
1033 		/*
1034 		 * If we're not in the global zone, then only look at
1035 		 * addresses in our zone.
1036 		 */
1037 		if (ipnet->ipnet_zoneid != GLOBAL_ZONEID &&
1038 		    ipnet->ipnet_zoneid != ifaddr->ifa_zone)
1039 			continue;
1040 		switch (addr->iap_family) {
1041 		case AF_INET:
1042 			if (ifaddr->ifa_ip4addr != INADDR_ANY &&
1043 			    *(addr->iap_addr4) == ifaddr->ifa_ip4addr)
1044 				addrtype = IPNETADDR_MYADDR;
1045 			else if (ifaddr->ifa_brdaddr != INADDR_ANY &&
1046 			    *(addr->iap_addr4) == ifaddr->ifa_brdaddr)
1047 				addrtype = IPNETADDR_MBCAST;
1048 			break;
1049 		case AF_INET6:
1050 			if (IN6_ARE_ADDR_EQUAL(addr->iap_addr6,
1051 			    &ifaddr->ifa_ip6addr))
1052 				addrtype = IPNETADDR_MYADDR;
1053 			break;
1054 		}
1055 	}
1056 	mutex_exit(&ipnetif->if_addr_lock);
1057 
1058 	return (addrtype);
1059 }
1060 
1061 /*
1062  * Verify if the packet contained in hdr should be passed up to the
1063  * ipnet client stream.
1064  */
1065 static boolean_t
ipnet_accept(ipnet_t * ipnet,hook_pkt_observe_t * hdr,ipnet_addrp_t * src,ipnet_addrp_t * dst)1066 ipnet_accept(ipnet_t *ipnet, hook_pkt_observe_t *hdr, ipnet_addrp_t *src,
1067     ipnet_addrp_t *dst)
1068 {
1069 	boolean_t		obsif;
1070 	uint64_t		ifindex = ipnet->ipnet_if->if_index;
1071 	ipnet_addrtype_t	srctype;
1072 	ipnet_addrtype_t	dsttype;
1073 
1074 	srctype = ipnet_get_addrtype(ipnet, src);
1075 	dsttype = ipnet_get_addrtype(ipnet, dst);
1076 
1077 	/*
1078 	 * If the packet's ifindex matches ours, or the packet's group ifindex
1079 	 * matches ours, it's on the interface we're observing.  (Thus,
1080 	 * observing on the group ifindex matches all ifindexes in the group.)
1081 	 */
1082 	obsif = (ntohl(hdr->hpo_ifindex) == ifindex ||
1083 	    ntohl(hdr->hpo_grifindex) == ifindex);
1084 
1085 	DTRACE_PROBE5(ipnet_accept__addr,
1086 	    ipnet_addrtype_t, srctype, ipnet_addrp_t *, src,
1087 	    ipnet_addrtype_t, dsttype, ipnet_addrp_t *, dst,
1088 	    boolean_t, obsif);
1089 
1090 	/*
1091 	 * Do not allow an ipnet stream to see packets that are not from or to
1092 	 * its zone.  The exception is when zones are using the shared stack
1093 	 * model.  In this case, streams in the global zone have visibility
1094 	 * into other shared-stack zones, and broadcast and multicast traffic
1095 	 * is visible by all zones in the stack.
1096 	 */
1097 	if (ipnet->ipnet_zoneid != GLOBAL_ZONEID &&
1098 	    dsttype != IPNETADDR_MBCAST) {
1099 		if (ipnet->ipnet_zoneid != ntohl(hdr->hpo_zsrc) &&
1100 		    ipnet->ipnet_zoneid != ntohl(hdr->hpo_zdst))
1101 			return (B_FALSE);
1102 	}
1103 
1104 	/*
1105 	 * If DL_PROMISC_SAP isn't enabled, then the bound SAP must match the
1106 	 * packet's IP version.
1107 	 */
1108 	if (!(ipnet->ipnet_flags & IPNET_PROMISC_SAP) &&
1109 	    ipnet->ipnet_family != hdr->hpo_family)
1110 		return (B_FALSE);
1111 
1112 	/* If the destination address is ours, then accept the packet. */
1113 	if (dsttype == IPNETADDR_MYADDR)
1114 		return (B_TRUE);
1115 
1116 	/*
1117 	 * If DL_PROMISC_PHYS is enabled, then we can see all packets that are
1118 	 * sent or received on the interface we're observing, or packets that
1119 	 * have our source address (this allows us to see packets we send).
1120 	 */
1121 	if (ipnet->ipnet_flags & IPNET_PROMISC_PHYS) {
1122 		if (srctype == IPNETADDR_MYADDR || obsif)
1123 			return (B_TRUE);
1124 	}
1125 
1126 	/*
1127 	 * We accept multicast and broadcast packets transmitted or received
1128 	 * on the interface we're observing.
1129 	 */
1130 	if (dsttype == IPNETADDR_MBCAST && obsif)
1131 		return (B_TRUE);
1132 
1133 	return (B_FALSE);
1134 }
1135 
1136 /*
1137  * Verify if the packet contained in hdr should be passed up to the ipnet
1138  * client stream that's in IPNET_LOMODE.
1139  */
1140 /* ARGSUSED */
1141 static boolean_t
ipnet_loaccept(ipnet_t * ipnet,hook_pkt_observe_t * hdr,ipnet_addrp_t * src,ipnet_addrp_t * dst)1142 ipnet_loaccept(ipnet_t *ipnet, hook_pkt_observe_t *hdr, ipnet_addrp_t *src,
1143     ipnet_addrp_t *dst)
1144 {
1145 	if (hdr->hpo_htype != htons(IPOBS_HOOK_LOCAL)) {
1146 		/*
1147 		 * ipnet_if is only NULL for IPNET_MINOR_LO devices.
1148 		 */
1149 		if (ipnet->ipnet_if == NULL)
1150 			return (B_FALSE);
1151 	}
1152 
1153 	/*
1154 	 * An ipnet stream must not see packets that are not from/to its zone.
1155 	 */
1156 	if (ipnet->ipnet_zoneid != GLOBAL_ZONEID) {
1157 		if (ipnet->ipnet_zoneid != ntohl(hdr->hpo_zsrc) &&
1158 		    ipnet->ipnet_zoneid != ntohl(hdr->hpo_zdst))
1159 			return (B_FALSE);
1160 	}
1161 
1162 	return (ipnet->ipnet_family == AF_UNSPEC ||
1163 	    ipnet->ipnet_family == hdr->hpo_family);
1164 }
1165 
1166 static void
ipnet_dispatch(void * arg)1167 ipnet_dispatch(void *arg)
1168 {
1169 	mblk_t			*mp = arg;
1170 	hook_pkt_observe_t	*hdr = (hook_pkt_observe_t *)mp->b_rptr;
1171 	ipnet_t			*ipnet;
1172 	mblk_t			*netmp;
1173 	list_t			*list;
1174 	ipnet_stack_t		*ips;
1175 	ipnet_addrp_t		src;
1176 	ipnet_addrp_t		dst;
1177 
1178 	ips = ((netstack_t *)hdr->hpo_ctx)->netstack_ipnet;
1179 
1180 	netmp = hdr->hpo_pkt->b_cont;
1181 	src.iap_family = hdr->hpo_family;
1182 	dst.iap_family = hdr->hpo_family;
1183 
1184 	if (hdr->hpo_family == AF_INET) {
1185 		src.iap_addr4 = &((ipha_t *)(netmp->b_rptr))->ipha_src;
1186 		dst.iap_addr4 = &((ipha_t *)(netmp->b_rptr))->ipha_dst;
1187 	} else {
1188 		src.iap_addr6 = &((ip6_t *)(netmp->b_rptr))->ip6_src;
1189 		dst.iap_addr6 = &((ip6_t *)(netmp->b_rptr))->ip6_dst;
1190 	}
1191 
1192 	ipnet_walkers_inc(ips);
1193 
1194 	list = &ips->ips_str_list;
1195 	for (ipnet = list_head(list); ipnet != NULL;
1196 	    ipnet = list_next(list, ipnet)) {
1197 		if (!(*ipnet->ipnet_acceptfn)(ipnet, hdr, &src, &dst)) {
1198 			IPSK_BUMP(ips, ik_acceptFail);
1199 			continue;
1200 		}
1201 		IPSK_BUMP(ips, ik_acceptOk);
1202 
1203 		if (list_next(list, ipnet) == NULL) {
1204 			netmp = hdr->hpo_pkt->b_cont;
1205 			hdr->hpo_pkt->b_cont = NULL;
1206 		} else {
1207 			if ((netmp = dupmsg(hdr->hpo_pkt->b_cont)) == NULL &&
1208 			    (netmp = copymsg(hdr->hpo_pkt->b_cont)) == NULL) {
1209 				IPSK_BUMP(ips, ik_duplicationFail);
1210 				continue;
1211 			}
1212 		}
1213 
1214 		if (ipnet->ipnet_flags & IPNET_INFO) {
1215 			if ((netmp = ipnet_addheader(hdr, netmp)) == NULL) {
1216 				IPSK_BUMP(ips, ik_dispatchHeaderDrop);
1217 				continue;
1218 			}
1219 		}
1220 
1221 		if (ipnet->ipnet_rq->q_first == NULL &&
1222 		    canputnext(ipnet->ipnet_rq)) {
1223 			putnext(ipnet->ipnet_rq, netmp);
1224 			IPSK_BUMP(ips, ik_dispatchDeliver);
1225 		} else if (canput(ipnet->ipnet_rq)) {
1226 			(void) putq(ipnet->ipnet_rq, netmp);
1227 			IPSK_BUMP(ips, ik_dispatchDeliver);
1228 		} else {
1229 			freemsg(netmp);
1230 			IPSK_BUMP(ips, ik_dispatchPutDrop);
1231 		}
1232 	}
1233 
1234 	ipnet_walkers_dec(ips);
1235 
1236 	freemsg(mp);
1237 }
1238 
1239 static void
ipnet_input(mblk_t * mp)1240 ipnet_input(mblk_t *mp)
1241 {
1242 	hook_pkt_observe_t	*hdr = (hook_pkt_observe_t *)mp->b_rptr;
1243 	ipnet_stack_t		*ips;
1244 
1245 	ips = ((netstack_t *)hdr->hpo_ctx)->netstack_ipnet;
1246 
1247 	if (ddi_taskq_dispatch(ipnet_taskq, ipnet_dispatch, mp, DDI_NOSLEEP) !=
1248 	    DDI_SUCCESS) {
1249 		IPSK_BUMP(ips, ik_dispatchFail);
1250 		freemsg(mp);
1251 	} else {
1252 		IPSK_BUMP(ips, ik_dispatchOk);
1253 	}
1254 }
1255 
1256 static ipnetif_t *
ipnet_alloc_if(ipnet_stack_t * ips)1257 ipnet_alloc_if(ipnet_stack_t *ips)
1258 {
1259 	ipnetif_t	*ipnetif;
1260 
1261 	if ((ipnetif = kmem_zalloc(sizeof (*ipnetif), KM_NOSLEEP)) == NULL)
1262 		return (NULL);
1263 
1264 	mutex_init(&ipnetif->if_addr_lock, NULL, MUTEX_DEFAULT, 0);
1265 	list_create(&ipnetif->if_ip4addr_list, sizeof (ipnetif_addr_t),
1266 	    offsetof(ipnetif_addr_t, ifa_link));
1267 	list_create(&ipnetif->if_ip6addr_list, sizeof (ipnetif_addr_t),
1268 	    offsetof(ipnetif_addr_t, ifa_link));
1269 	mutex_init(&ipnetif->if_reflock, NULL, MUTEX_DEFAULT, 0);
1270 
1271 	ipnetif->if_stackp = ips;
1272 
1273 	return (ipnetif);
1274 }
1275 
1276 /*
1277  * Create a new ipnetif_t and new minor node for it.  If creation is
1278  * successful the new ipnetif_t is inserted into an avl_tree
1279  * containing ipnetif's for this stack instance.
1280  */
1281 static ipnetif_t *
ipnetif_create(const char * name,uint64_t index,ipnet_stack_t * ips,uint64_t ifflags)1282 ipnetif_create(const char *name, uint64_t index, ipnet_stack_t *ips,
1283     uint64_t ifflags)
1284 {
1285 	ipnetif_t	*ipnetif;
1286 	avl_index_t	where = 0;
1287 	minor_t		ifminor;
1288 
1289 	/*
1290 	 * Because ipnetif_create() can be called from a NIC event
1291 	 * callback, it should not block.
1292 	 */
1293 	ifminor = (minor_t)id_alloc_nosleep(ipnet_minor_space);
1294 	if (ifminor == (minor_t)-1)
1295 		return (NULL);
1296 	if ((ipnetif = ipnet_alloc_if(ips)) == NULL) {
1297 		id_free(ipnet_minor_space, ifminor);
1298 		return (NULL);
1299 	}
1300 
1301 	(void) strlcpy(ipnetif->if_name, name, LIFNAMSIZ);
1302 	ipnetif->if_index = (uint_t)index;
1303 	ipnetif->if_zoneid = netstack_get_zoneid(ips->ips_netstack);
1304 	ipnetif->if_dev = makedevice(ipnet_major, ifminor);
1305 
1306 	ipnetif->if_refcnt = 1;
1307 	if ((ifflags & IFF_LOOPBACK) != 0)
1308 		ipnetif->if_flags = IPNETIF_LOOPBACK;
1309 
1310 	mutex_enter(&ips->ips_avl_lock);
1311 	VERIFY(avl_find(&ips->ips_avl_by_index, &index, &where) == NULL);
1312 	avl_insert(&ips->ips_avl_by_index, ipnetif, where);
1313 	VERIFY(avl_find(&ips->ips_avl_by_name, (void *)name, &where) == NULL);
1314 	avl_insert(&ips->ips_avl_by_name, ipnetif, where);
1315 	mutex_exit(&ips->ips_avl_lock);
1316 
1317 	return (ipnetif);
1318 }
1319 
1320 static void
ipnetif_remove(ipnetif_t * ipnetif,ipnet_stack_t * ips)1321 ipnetif_remove(ipnetif_t *ipnetif, ipnet_stack_t *ips)
1322 {
1323 	ipnet_t	*ipnet;
1324 
1325 	ipnet_walkers_inc(ips);
1326 	/* Send a SIGHUP to all open streams associated with this ipnetif. */
1327 	for (ipnet = list_head(&ips->ips_str_list); ipnet != NULL;
1328 	    ipnet = list_next(&ips->ips_str_list, ipnet)) {
1329 		if (ipnet->ipnet_if == ipnetif)
1330 			(void) putnextctl(ipnet->ipnet_rq, M_HANGUP);
1331 	}
1332 	ipnet_walkers_dec(ips);
1333 	mutex_enter(&ips->ips_avl_lock);
1334 	avl_remove(&ips->ips_avl_by_index, ipnetif);
1335 	avl_remove(&ips->ips_avl_by_name, ipnetif);
1336 	mutex_exit(&ips->ips_avl_lock);
1337 	/*
1338 	 * Release the reference we implicitly held in ipnetif_create().
1339 	 */
1340 	ipnetif_refrele(ipnetif);
1341 }
1342 
1343 static void
ipnet_purge_addrlist(list_t * addrlist)1344 ipnet_purge_addrlist(list_t *addrlist)
1345 {
1346 	ipnetif_addr_t	*ifa;
1347 
1348 	while ((ifa = list_head(addrlist)) != NULL) {
1349 		list_remove(addrlist, ifa);
1350 		if (ifa->ifa_shared != NULL)
1351 			ipnetif_clone_release(ifa->ifa_shared);
1352 		kmem_free(ifa, sizeof (*ifa));
1353 	}
1354 }
1355 
1356 static void
ipnetif_free(ipnetif_t * ipnetif)1357 ipnetif_free(ipnetif_t *ipnetif)
1358 {
1359 	ASSERT(ipnetif->if_refcnt == 0);
1360 	ASSERT(ipnetif->if_sharecnt == 0);
1361 
1362 	/* Remove IPv4/v6 address lists from the ipnetif */
1363 	ipnet_purge_addrlist(&ipnetif->if_ip4addr_list);
1364 	list_destroy(&ipnetif->if_ip4addr_list);
1365 	ipnet_purge_addrlist(&ipnetif->if_ip6addr_list);
1366 	list_destroy(&ipnetif->if_ip6addr_list);
1367 	mutex_destroy(&ipnetif->if_addr_lock);
1368 	mutex_destroy(&ipnetif->if_reflock);
1369 	if (ipnetif->if_dev != 0)
1370 		id_free(ipnet_minor_space, getminor(ipnetif->if_dev));
1371 	kmem_free(ipnetif, sizeof (*ipnetif));
1372 }
1373 
1374 /*
1375  * Create an ipnetif_addr_t with the given logical interface id (lif)
1376  * and add it to the supplied ipnetif.  The lif is the netinfo
1377  * representation of logical interface id, and we use this id to match
1378  * incoming netinfo events against our lists of addresses.
1379  */
1380 static void
ipnet_add_ifaddr(uint64_t lif,ipnetif_t * ipnetif,net_handle_t nd)1381 ipnet_add_ifaddr(uint64_t lif, ipnetif_t *ipnetif, net_handle_t nd)
1382 {
1383 	ipnetif_addr_t		*ifaddr;
1384 	zoneid_t		zoneid;
1385 	struct sockaddr_in	bcast;
1386 	struct sockaddr_storage	addr;
1387 	net_ifaddr_t		type = NA_ADDRESS;
1388 	uint64_t		phyif = ipnetif->if_index;
1389 
1390 	if (net_getlifaddr(nd, phyif, lif, 1, &type, &addr) != 0 ||
1391 	    net_getlifzone(nd, phyif, lif, &zoneid) != 0)
1392 		return;
1393 
1394 	if ((ifaddr = kmem_alloc(sizeof (*ifaddr), KM_NOSLEEP)) == NULL)
1395 		return;
1396 	ifaddr->ifa_zone = zoneid;
1397 	ifaddr->ifa_id = lif;
1398 	ifaddr->ifa_shared = NULL;
1399 
1400 	switch (addr.ss_family) {
1401 	case AF_INET:
1402 		ifaddr->ifa_ip4addr =
1403 		    ((struct sockaddr_in *)&addr)->sin_addr.s_addr;
1404 		/*
1405 		 * Try and get the broadcast address.  Note that it's okay for
1406 		 * an interface to not have a broadcast address, so we don't
1407 		 * fail the entire operation if net_getlifaddr() fails here.
1408 		 */
1409 		type = NA_BROADCAST;
1410 		if (net_getlifaddr(nd, phyif, lif, 1, &type, &bcast) == 0)
1411 			ifaddr->ifa_brdaddr = bcast.sin_addr.s_addr;
1412 		break;
1413 	case AF_INET6:
1414 		ifaddr->ifa_ip6addr = ((struct sockaddr_in6 *)&addr)->sin6_addr;
1415 		break;
1416 	}
1417 
1418 	/*
1419 	 * The zoneid stored in ipnetif_t needs to correspond to the actual
1420 	 * zone the address is being used in. This facilitates finding the
1421 	 * correct netstack_t pointer, amongst other things, later.
1422 	 */
1423 	if (zoneid == ALL_ZONES)
1424 		zoneid = GLOBAL_ZONEID;
1425 
1426 	mutex_enter(&ipnetif->if_addr_lock);
1427 	if (zoneid != ipnetif->if_zoneid) {
1428 		ipnetif_t *ifp2;
1429 
1430 		ifp2 = ipnetif_clone_create(ipnetif, zoneid);
1431 		ifaddr->ifa_shared = ifp2;
1432 	}
1433 	list_insert_tail(addr.ss_family == AF_INET ?
1434 	    &ipnetif->if_ip4addr_list : &ipnetif->if_ip6addr_list, ifaddr);
1435 	mutex_exit(&ipnetif->if_addr_lock);
1436 }
1437 
1438 static void
ipnet_delete_ifaddr(ipnetif_addr_t * ifaddr,ipnetif_t * ipnetif,boolean_t isv6)1439 ipnet_delete_ifaddr(ipnetif_addr_t *ifaddr, ipnetif_t *ipnetif, boolean_t isv6)
1440 {
1441 	mutex_enter(&ipnetif->if_addr_lock);
1442 	if (ifaddr->ifa_shared != NULL)
1443 		ipnetif_clone_release(ifaddr->ifa_shared);
1444 
1445 	list_remove(isv6 ?
1446 	    &ipnetif->if_ip6addr_list : &ipnetif->if_ip4addr_list, ifaddr);
1447 	mutex_exit(&ipnetif->if_addr_lock);
1448 	kmem_free(ifaddr, sizeof (*ifaddr));
1449 }
1450 
1451 static void
ipnet_plumb_ev(ipnet_nicevent_t * ipne,ipnet_stack_t * ips,boolean_t isv6)1452 ipnet_plumb_ev(ipnet_nicevent_t *ipne, ipnet_stack_t *ips, boolean_t isv6)
1453 {
1454 	ipnetif_t	*ipnetif;
1455 	boolean_t	refrele_needed = B_TRUE;
1456 	uint64_t	ifflags;
1457 	uint64_t	ifindex;
1458 	char		*ifname;
1459 
1460 	ifflags = 0;
1461 	ifname = ipne->ipne_ifname;
1462 	ifindex = ipne->ipne_ifindex;
1463 
1464 	(void) net_getlifflags(ipne->ipne_protocol, ifindex, 0, &ifflags);
1465 
1466 	if ((ipnetif = ipnetif_getby_index(ifindex, ips)) == NULL) {
1467 		ipnetif = ipnetif_create(ifname, ifindex, ips, ifflags);
1468 		refrele_needed = B_FALSE;
1469 	}
1470 	if (ipnetif != NULL) {
1471 		ipnetif->if_flags |=
1472 		    isv6 ? IPNETIF_IPV6PLUMBED : IPNETIF_IPV4PLUMBED;
1473 	}
1474 
1475 	if (ipnetif->if_multicnt != 0) {
1476 		if (ip_join_allmulti(ifindex, isv6,
1477 		    ips->ips_netstack->netstack_ip) == 0) {
1478 			ipnetif->if_flags |=
1479 			    isv6 ? IPNETIF_IPV6ALLMULTI : IPNETIF_IPV4ALLMULTI;
1480 		}
1481 	}
1482 
1483 	if (refrele_needed)
1484 		ipnetif_refrele(ipnetif);
1485 }
1486 
1487 static void
ipnet_unplumb_ev(uint64_t ifindex,ipnet_stack_t * ips,boolean_t isv6)1488 ipnet_unplumb_ev(uint64_t ifindex, ipnet_stack_t *ips, boolean_t isv6)
1489 {
1490 	ipnetif_t	*ipnetif;
1491 
1492 	if ((ipnetif = ipnetif_getby_index(ifindex, ips)) == NULL)
1493 		return;
1494 
1495 	mutex_enter(&ipnetif->if_addr_lock);
1496 	ipnet_purge_addrlist(isv6 ?
1497 	    &ipnetif->if_ip6addr_list : &ipnetif->if_ip4addr_list);
1498 	mutex_exit(&ipnetif->if_addr_lock);
1499 
1500 	/*
1501 	 * Note that we have one ipnetif for both IPv4 and IPv6, but we receive
1502 	 * separate NE_UNPLUMB events for IPv4 and IPv6.  We remove the ipnetif
1503 	 * if both IPv4 and IPv6 interfaces have been unplumbed.
1504 	 */
1505 	ipnetif->if_flags &= isv6 ? ~IPNETIF_IPV6PLUMBED : ~IPNETIF_IPV4PLUMBED;
1506 	if (!(ipnetif->if_flags & (IPNETIF_IPV4PLUMBED | IPNETIF_IPV6PLUMBED)))
1507 		ipnetif_remove(ipnetif, ips);
1508 	ipnetif_refrele(ipnetif);
1509 }
1510 
1511 static void
ipnet_lifup_ev(uint64_t ifindex,uint64_t lifindex,net_handle_t nd,ipnet_stack_t * ips,boolean_t isv6)1512 ipnet_lifup_ev(uint64_t ifindex, uint64_t lifindex, net_handle_t nd,
1513     ipnet_stack_t *ips, boolean_t isv6)
1514 {
1515 	ipnetif_t	*ipnetif;
1516 	ipnetif_addr_t	*ifaddr;
1517 
1518 	if ((ipnetif = ipnetif_getby_index(ifindex, ips)) == NULL)
1519 		return;
1520 	if ((ifaddr = ipnet_match_lif(ipnetif, lifindex, isv6)) != NULL) {
1521 		/*
1522 		 * We must have missed a NE_LIF_DOWN event.  Delete this
1523 		 * ifaddr and re-create it.
1524 		 */
1525 		ipnet_delete_ifaddr(ifaddr, ipnetif, isv6);
1526 	}
1527 
1528 	ipnet_add_ifaddr(lifindex, ipnetif, nd);
1529 	ipnetif_refrele(ipnetif);
1530 }
1531 
1532 static void
ipnet_lifdown_ev(uint64_t ifindex,uint64_t lifindex,ipnet_stack_t * ips,boolean_t isv6)1533 ipnet_lifdown_ev(uint64_t ifindex, uint64_t lifindex, ipnet_stack_t *ips,
1534     boolean_t isv6)
1535 {
1536 	ipnetif_t	*ipnetif;
1537 	ipnetif_addr_t	*ifaddr;
1538 
1539 	if ((ipnetif = ipnetif_getby_index(ifindex, ips)) == NULL)
1540 		return;
1541 	if ((ifaddr = ipnet_match_lif(ipnetif, lifindex, isv6)) != NULL)
1542 		ipnet_delete_ifaddr(ifaddr, ipnetif, isv6);
1543 	ipnetif_refrele(ipnetif);
1544 	/*
1545 	 * Make sure that open streams on this ipnetif are still allowed to
1546 	 * have it open.
1547 	 */
1548 	ipnetif_zonecheck(ipnetif, ips);
1549 }
1550 
1551 /*
1552  * This callback from the NIC event framework dispatches a taskq as the event
1553  * handlers may block.
1554  */
1555 /* ARGSUSED */
1556 static int
ipnet_nicevent_cb(hook_event_token_t token,hook_data_t info,void * arg)1557 ipnet_nicevent_cb(hook_event_token_t token, hook_data_t info, void *arg)
1558 {
1559 	ipnet_stack_t		*ips = arg;
1560 	hook_nic_event_t	*hn = (hook_nic_event_t *)info;
1561 	ipnet_nicevent_t	*ipne;
1562 
1563 	if ((ipne = kmem_alloc(sizeof (ipnet_nicevent_t), KM_NOSLEEP)) == NULL)
1564 		return (0);
1565 	ipne->ipne_event = hn->hne_event;
1566 	ipne->ipne_protocol = hn->hne_protocol;
1567 	ipne->ipne_stackid = ips->ips_netstack->netstack_stackid;
1568 	ipne->ipne_ifindex = hn->hne_nic;
1569 	ipne->ipne_lifindex = hn->hne_lif;
1570 	if (hn->hne_datalen != 0) {
1571 		(void) strlcpy(ipne->ipne_ifname, hn->hne_data,
1572 		    sizeof (ipne->ipne_ifname));
1573 	}
1574 	(void) ddi_taskq_dispatch(ipnet_nicevent_taskq, ipnet_nicevent_task,
1575 	    ipne, DDI_NOSLEEP);
1576 	return (0);
1577 }
1578 
1579 static void
ipnet_nicevent_task(void * arg)1580 ipnet_nicevent_task(void *arg)
1581 {
1582 	ipnet_nicevent_t	*ipne = arg;
1583 	netstack_t		*ns;
1584 	ipnet_stack_t		*ips;
1585 	boolean_t		isv6;
1586 
1587 	if ((ns = netstack_find_by_stackid(ipne->ipne_stackid)) == NULL)
1588 		goto done;
1589 	ips = ns->netstack_ipnet;
1590 	isv6 = (ipne->ipne_protocol == ips->ips_ndv6);
1591 
1592 	mutex_enter(&ips->ips_event_lock);
1593 	switch (ipne->ipne_event) {
1594 	case NE_PLUMB:
1595 		ipnet_plumb_ev(ipne, ips, isv6);
1596 		break;
1597 	case NE_UNPLUMB:
1598 		ipnet_unplumb_ev(ipne->ipne_ifindex, ips, isv6);
1599 		break;
1600 	case NE_LIF_UP:
1601 		ipnet_lifup_ev(ipne->ipne_ifindex, ipne->ipne_lifindex,
1602 		    ipne->ipne_protocol, ips, isv6);
1603 		break;
1604 	case NE_LIF_DOWN:
1605 		ipnet_lifdown_ev(ipne->ipne_ifindex, ipne->ipne_lifindex, ips,
1606 		    isv6);
1607 		break;
1608 	default:
1609 		break;
1610 	}
1611 	mutex_exit(&ips->ips_event_lock);
1612 done:
1613 	if (ns != NULL)
1614 		netstack_rele(ns);
1615 	kmem_free(ipne, sizeof (ipnet_nicevent_t));
1616 }
1617 
1618 dev_t
ipnet_if_getdev(char * name,zoneid_t zoneid)1619 ipnet_if_getdev(char *name, zoneid_t zoneid)
1620 {
1621 	netstack_t	*ns;
1622 	ipnet_stack_t	*ips;
1623 	ipnetif_t	*ipnetif;
1624 	dev_t		dev = (dev_t)-1;
1625 
1626 	if (is_system_labeled() && zoneid != GLOBAL_ZONEID)
1627 		return (dev);
1628 	if ((ns = netstack_find_by_zoneid(zoneid)) == NULL)
1629 		return (dev);
1630 
1631 	ips = ns->netstack_ipnet;
1632 	mutex_enter(&ips->ips_avl_lock);
1633 	if ((ipnetif = avl_find(&ips->ips_avl_by_name, name, NULL)) != NULL) {
1634 		if (ipnetif_in_zone(ipnetif, zoneid, ips))
1635 			dev = ipnetif->if_dev;
1636 	}
1637 	mutex_exit(&ips->ips_avl_lock);
1638 	netstack_rele(ns);
1639 
1640 	return (dev);
1641 }
1642 
1643 static ipnetif_t *
ipnetif_getby_index(uint64_t id,ipnet_stack_t * ips)1644 ipnetif_getby_index(uint64_t id, ipnet_stack_t *ips)
1645 {
1646 	ipnetif_t	*ipnetif;
1647 
1648 	mutex_enter(&ips->ips_avl_lock);
1649 	if ((ipnetif = avl_find(&ips->ips_avl_by_index, &id, NULL)) != NULL)
1650 		ipnetif_refhold(ipnetif);
1651 	mutex_exit(&ips->ips_avl_lock);
1652 	return (ipnetif);
1653 }
1654 
1655 static ipnetif_t *
ipnetif_getby_dev(dev_t dev,ipnet_stack_t * ips)1656 ipnetif_getby_dev(dev_t dev, ipnet_stack_t *ips)
1657 {
1658 	ipnetif_t	*ipnetif;
1659 	avl_tree_t	*tree;
1660 
1661 	mutex_enter(&ips->ips_avl_lock);
1662 	tree = &ips->ips_avl_by_index;
1663 	for (ipnetif = avl_first(tree); ipnetif != NULL;
1664 	    ipnetif = avl_walk(tree, ipnetif, AVL_AFTER)) {
1665 		if (ipnetif->if_dev == dev) {
1666 			ipnetif_refhold(ipnetif);
1667 			break;
1668 		}
1669 	}
1670 	mutex_exit(&ips->ips_avl_lock);
1671 	return (ipnetif);
1672 }
1673 
1674 static ipnetif_addr_t *
ipnet_match_lif(ipnetif_t * ipnetif,lif_if_t lid,boolean_t isv6)1675 ipnet_match_lif(ipnetif_t *ipnetif, lif_if_t lid, boolean_t isv6)
1676 {
1677 	ipnetif_addr_t	*ifaddr;
1678 	list_t	*list;
1679 
1680 	mutex_enter(&ipnetif->if_addr_lock);
1681 	list = isv6 ? &ipnetif->if_ip6addr_list : &ipnetif->if_ip4addr_list;
1682 	for (ifaddr = list_head(list); ifaddr != NULL;
1683 	    ifaddr = list_next(list, ifaddr)) {
1684 		if (lid == ifaddr->ifa_id)
1685 			break;
1686 	}
1687 	mutex_exit(&ipnetif->if_addr_lock);
1688 	return (ifaddr);
1689 }
1690 
1691 /* ARGSUSED */
1692 static void *
ipnet_stack_init(netstackid_t stackid,netstack_t * ns)1693 ipnet_stack_init(netstackid_t stackid, netstack_t *ns)
1694 {
1695 	ipnet_stack_t	*ips;
1696 
1697 	ips = kmem_zalloc(sizeof (*ips), KM_SLEEP);
1698 	ips->ips_netstack = ns;
1699 	mutex_init(&ips->ips_avl_lock, NULL, MUTEX_DEFAULT, 0);
1700 	avl_create(&ips->ips_avl_by_index, ipnetif_compare_index,
1701 	    sizeof (ipnetif_t), offsetof(ipnetif_t, if_avl_by_index));
1702 	avl_create(&ips->ips_avl_by_name, ipnetif_compare_name,
1703 	    sizeof (ipnetif_t), offsetof(ipnetif_t, if_avl_by_name));
1704 	avl_create(&ips->ips_avl_by_shared, ipnetif_compare_name_zone,
1705 	    sizeof (ipnetif_t), offsetof(ipnetif_t, if_avl_by_shared));
1706 	mutex_init(&ips->ips_walkers_lock, NULL, MUTEX_DEFAULT, NULL);
1707 	cv_init(&ips->ips_walkers_cv, NULL, CV_DRIVER, NULL);
1708 	list_create(&ips->ips_str_list, sizeof (ipnet_t),
1709 	    offsetof(ipnet_t, ipnet_next));
1710 	ipnet_register_netihook(ips);
1711 	return (ips);
1712 }
1713 
1714 /* ARGSUSED */
1715 static void
ipnet_stack_fini(netstackid_t stackid,void * arg)1716 ipnet_stack_fini(netstackid_t stackid, void *arg)
1717 {
1718 	ipnet_stack_t	*ips = arg;
1719 	ipnetif_t	*ipnetif, *nipnetif;
1720 
1721 	if (ips->ips_kstatp != NULL) {
1722 		zoneid_t zoneid;
1723 
1724 		zoneid = netstackid_to_zoneid(stackid);
1725 		net_kstat_delete(net_zoneidtonetid(zoneid), ips->ips_kstatp);
1726 	}
1727 	if (ips->ips_ndv4 != NULL) {
1728 		VERIFY(net_hook_unregister(ips->ips_ndv4, NH_NIC_EVENTS,
1729 		    ips->ips_nicevents) == 0);
1730 		VERIFY(net_protocol_release(ips->ips_ndv4) == 0);
1731 	}
1732 	if (ips->ips_ndv6 != NULL) {
1733 		VERIFY(net_hook_unregister(ips->ips_ndv6, NH_NIC_EVENTS,
1734 		    ips->ips_nicevents) == 0);
1735 		VERIFY(net_protocol_release(ips->ips_ndv6) == 0);
1736 	}
1737 	hook_free(ips->ips_nicevents);
1738 
1739 	for (ipnetif = avl_first(&ips->ips_avl_by_index); ipnetif != NULL;
1740 	    ipnetif = nipnetif) {
1741 		nipnetif = AVL_NEXT(&ips->ips_avl_by_index, ipnetif);
1742 		ipnetif_remove(ipnetif, ips);
1743 	}
1744 	avl_destroy(&ips->ips_avl_by_shared);
1745 	avl_destroy(&ips->ips_avl_by_index);
1746 	avl_destroy(&ips->ips_avl_by_name);
1747 	mutex_destroy(&ips->ips_avl_lock);
1748 	mutex_destroy(&ips->ips_walkers_lock);
1749 	cv_destroy(&ips->ips_walkers_cv);
1750 	list_destroy(&ips->ips_str_list);
1751 	kmem_free(ips, sizeof (*ips));
1752 }
1753 
1754 /* Do any of the addresses in addrlist belong the supplied zoneid? */
1755 static boolean_t
ipnet_addrs_in_zone(list_t * addrlist,zoneid_t zoneid)1756 ipnet_addrs_in_zone(list_t *addrlist, zoneid_t zoneid)
1757 {
1758 	ipnetif_addr_t	*ifa;
1759 
1760 	for (ifa = list_head(addrlist); ifa != NULL;
1761 	    ifa = list_next(addrlist, ifa)) {
1762 		if (ifa->ifa_zone == zoneid)
1763 			return (B_TRUE);
1764 	}
1765 	return (B_FALSE);
1766 }
1767 
1768 /* Should the supplied ipnetif be visible from the supplied zoneid? */
1769 static boolean_t
ipnetif_in_zone(ipnetif_t * ipnetif,zoneid_t zoneid,ipnet_stack_t * ips)1770 ipnetif_in_zone(ipnetif_t *ipnetif, zoneid_t zoneid, ipnet_stack_t *ips)
1771 {
1772 	int	ret;
1773 
1774 	/*
1775 	 * The global zone has visibility into all interfaces in the global
1776 	 * stack, and exclusive stack zones have visibility into all
1777 	 * interfaces in their stack.
1778 	 */
1779 	if (zoneid == GLOBAL_ZONEID ||
1780 	    ips->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID)
1781 		return (B_TRUE);
1782 
1783 	/*
1784 	 * Shared-stack zones only have visibility for interfaces that have
1785 	 * addresses in their zone.
1786 	 */
1787 	mutex_enter(&ipnetif->if_addr_lock);
1788 	ret = ipnet_addrs_in_zone(&ipnetif->if_ip4addr_list, zoneid) ||
1789 	    ipnet_addrs_in_zone(&ipnetif->if_ip6addr_list, zoneid);
1790 	mutex_exit(&ipnetif->if_addr_lock);
1791 	return (ret);
1792 }
1793 
1794 /*
1795  * Verify that any ipnet_t that has a reference to the supplied ipnetif should
1796  * still be allowed to have it open.  A given ipnet_t may no longer be allowed
1797  * to have an ipnetif open if there are no longer any addresses that belong to
1798  * the ipnetif in the ipnet_t's non-global shared-stack zoneid.  If that's the
1799  * case, send the ipnet_t an M_HANGUP.
1800  */
1801 static void
ipnetif_zonecheck(ipnetif_t * ipnetif,ipnet_stack_t * ips)1802 ipnetif_zonecheck(ipnetif_t *ipnetif, ipnet_stack_t *ips)
1803 {
1804 	list_t	*strlist = &ips->ips_str_list;
1805 	ipnet_t	*ipnet;
1806 
1807 	ipnet_walkers_inc(ips);
1808 	for (ipnet = list_head(strlist); ipnet != NULL;
1809 	    ipnet = list_next(strlist, ipnet)) {
1810 		if (ipnet->ipnet_if != ipnetif)
1811 			continue;
1812 		if (!ipnetif_in_zone(ipnetif, ipnet->ipnet_zoneid, ips))
1813 			(void) putnextctl(ipnet->ipnet_rq, M_HANGUP);
1814 	}
1815 	ipnet_walkers_dec(ips);
1816 }
1817 
1818 void
ipnet_walk_if(ipnet_walkfunc_t * cb,void * arg,zoneid_t zoneid)1819 ipnet_walk_if(ipnet_walkfunc_t *cb, void *arg, zoneid_t zoneid)
1820 {
1821 	ipnetif_t		*ipnetif;
1822 	list_t			cbdata;
1823 	ipnetif_cbdata_t	*cbnode;
1824 	netstack_t		*ns;
1825 	ipnet_stack_t		*ips;
1826 
1827 	/*
1828 	 * On labeled systems, non-global zones shouldn't see anything
1829 	 * in /dev/ipnet.
1830 	 */
1831 	if (is_system_labeled() && zoneid != GLOBAL_ZONEID)
1832 		return;
1833 
1834 	if ((ns = netstack_find_by_zoneid(zoneid)) == NULL)
1835 		return;
1836 
1837 	ips = ns->netstack_ipnet;
1838 	list_create(&cbdata, sizeof (ipnetif_cbdata_t),
1839 	    offsetof(ipnetif_cbdata_t, ic_next));
1840 
1841 	mutex_enter(&ips->ips_avl_lock);
1842 	for (ipnetif = avl_first(&ips->ips_avl_by_index); ipnetif != NULL;
1843 	    ipnetif = avl_walk(&ips->ips_avl_by_index, ipnetif, AVL_AFTER)) {
1844 		if (!ipnetif_in_zone(ipnetif, zoneid, ips))
1845 			continue;
1846 		cbnode = kmem_zalloc(sizeof (ipnetif_cbdata_t), KM_SLEEP);
1847 		(void) strlcpy(cbnode->ic_ifname, ipnetif->if_name, LIFNAMSIZ);
1848 		cbnode->ic_dev = ipnetif->if_dev;
1849 		list_insert_head(&cbdata, cbnode);
1850 	}
1851 	mutex_exit(&ips->ips_avl_lock);
1852 
1853 	while ((cbnode = list_head(&cbdata)) != NULL) {
1854 		cb(cbnode->ic_ifname, arg, cbnode->ic_dev);
1855 		list_remove(&cbdata, cbnode);
1856 		kmem_free(cbnode, sizeof (ipnetif_cbdata_t));
1857 	}
1858 	list_destroy(&cbdata);
1859 	netstack_rele(ns);
1860 }
1861 
1862 static int
ipnetif_compare_index(const void * index_ptr,const void * ipnetifp)1863 ipnetif_compare_index(const void *index_ptr, const void *ipnetifp)
1864 {
1865 	int64_t	index1 = *((int64_t *)index_ptr);
1866 	int64_t	index2 = (int64_t)((ipnetif_t *)ipnetifp)->if_index;
1867 
1868 	return (SIGNOF(index2 - index1));
1869 }
1870 
1871 static int
ipnetif_compare_name(const void * name_ptr,const void * ipnetifp)1872 ipnetif_compare_name(const void *name_ptr, const void *ipnetifp)
1873 {
1874 	int	res;
1875 
1876 	res = strcmp(((ipnetif_t *)ipnetifp)->if_name, name_ptr);
1877 	return (SIGNOF(res));
1878 }
1879 
1880 static int
ipnetif_compare_name_zone(const void * key_ptr,const void * ipnetifp)1881 ipnetif_compare_name_zone(const void *key_ptr, const void *ipnetifp)
1882 {
1883 	const uintptr_t	*ptr = key_ptr;
1884 	const ipnetif_t	*ifp;
1885 	int		res;
1886 
1887 	ifp = ipnetifp;
1888 	res = ifp->if_zoneid - ptr[0];
1889 	if (res != 0)
1890 		return (SIGNOF(res));
1891 	res = strcmp(ifp->if_name, (char *)ptr[1]);
1892 	return (SIGNOF(res));
1893 }
1894 
1895 static void
ipnetif_refhold(ipnetif_t * ipnetif)1896 ipnetif_refhold(ipnetif_t *ipnetif)
1897 {
1898 	mutex_enter(&ipnetif->if_reflock);
1899 	ipnetif->if_refcnt++;
1900 	mutex_exit(&ipnetif->if_reflock);
1901 }
1902 
1903 static void
ipnetif_refrele(ipnetif_t * ipnetif)1904 ipnetif_refrele(ipnetif_t *ipnetif)
1905 {
1906 	mutex_enter(&ipnetif->if_reflock);
1907 	ASSERT(ipnetif->if_refcnt > 0);
1908 	if (--ipnetif->if_refcnt == 0)
1909 		ipnetif_free(ipnetif);
1910 	else
1911 		mutex_exit(&ipnetif->if_reflock);
1912 }
1913 
1914 static void
ipnet_walkers_inc(ipnet_stack_t * ips)1915 ipnet_walkers_inc(ipnet_stack_t *ips)
1916 {
1917 	mutex_enter(&ips->ips_walkers_lock);
1918 	ips->ips_walkers_cnt++;
1919 	mutex_exit(&ips->ips_walkers_lock);
1920 }
1921 
1922 static void
ipnet_walkers_dec(ipnet_stack_t * ips)1923 ipnet_walkers_dec(ipnet_stack_t *ips)
1924 {
1925 	mutex_enter(&ips->ips_walkers_lock);
1926 	ASSERT(ips->ips_walkers_cnt != 0);
1927 	if (--ips->ips_walkers_cnt == 0)
1928 		cv_broadcast(&ips->ips_walkers_cv);
1929 	mutex_exit(&ips->ips_walkers_lock);
1930 }
1931 
1932 /*ARGSUSED*/
1933 static int
ipobs_bounce_func(hook_event_token_t token,hook_data_t info,void * arg)1934 ipobs_bounce_func(hook_event_token_t token, hook_data_t info, void *arg)
1935 {
1936 	hook_pkt_observe_t	*hdr;
1937 	pfv_t			func = (pfv_t)arg;
1938 	mblk_t			*mp;
1939 
1940 	hdr = (hook_pkt_observe_t *)info;
1941 	/*
1942 	 * Code in ip_input() expects that it is the only one accessing the
1943 	 * packet.
1944 	 */
1945 	mp = copymsg(hdr->hpo_pkt);
1946 	if (mp == NULL)  {
1947 		netstack_t *ns = hdr->hpo_ctx;
1948 		ipnet_stack_t *ips = ns->netstack_ipnet;
1949 
1950 		IPSK_BUMP(ips, ik_dispatchDupDrop);
1951 		return (0);
1952 	}
1953 
1954 	hdr = (hook_pkt_observe_t *)mp->b_rptr;
1955 	hdr->hpo_pkt = mp;
1956 
1957 	func(mp);
1958 
1959 	return (0);
1960 }
1961 
1962 hook_t *
ipobs_register_hook(netstack_t * ns,pfv_t func)1963 ipobs_register_hook(netstack_t *ns, pfv_t func)
1964 {
1965 	ip_stack_t	*ipst = ns->netstack_ip;
1966 	char		name[32];
1967 	hook_t		*hook;
1968 
1969 	HOOK_INIT(hook, ipobs_bounce_func, "", (void *)func);
1970 	VERIFY(hook != NULL);
1971 
1972 	/*
1973 	 * To register multiple hooks with he same callback function,
1974 	 * a unique name is needed.
1975 	 */
1976 	(void) snprintf(name, sizeof (name), "ipobserve_%p", (void *)hook);
1977 	hook->h_name = strdup(name);
1978 
1979 	(void) net_hook_register(ipst->ips_ip4_observe_pr, NH_OBSERVE, hook);
1980 	(void) net_hook_register(ipst->ips_ip6_observe_pr, NH_OBSERVE, hook);
1981 
1982 	return (hook);
1983 }
1984 
1985 void
ipobs_unregister_hook(netstack_t * ns,hook_t * hook)1986 ipobs_unregister_hook(netstack_t *ns, hook_t *hook)
1987 {
1988 	ip_stack_t	*ipst = ns->netstack_ip;
1989 
1990 	(void) net_hook_unregister(ipst->ips_ip4_observe_pr, NH_OBSERVE, hook);
1991 
1992 	(void) net_hook_unregister(ipst->ips_ip6_observe_pr, NH_OBSERVE, hook);
1993 
1994 	strfree(hook->h_name);
1995 
1996 	hook_free(hook);
1997 }
1998 
1999 /* ******************************************************************** */
2000 /* BPF Functions below							*/
2001 /* ******************************************************************** */
2002 
2003 /*
2004  * Convenience function to make mapping a zoneid to an ipnet_stack_t easy.
2005  */
2006 ipnet_stack_t *
ipnet_find_by_zoneid(zoneid_t zoneid)2007 ipnet_find_by_zoneid(zoneid_t zoneid)
2008 {
2009 	netstack_t	*ns;
2010 
2011 	VERIFY((ns = netstack_find_by_zoneid(zoneid)) != NULL);
2012 	return (ns->netstack_ipnet);
2013 }
2014 
2015 /*
2016  * Functions, such as the above ipnet_find_by_zoneid(), will return a
2017  * pointer to ipnet_stack_t by calling a netstack lookup function.
2018  * The netstack_find_*() functions return a pointer after doing a "hold"
2019  * on the data structure and thereby require a "release" when the caller
2020  * is finished with it. We need to mirror that API here and thus a caller
2021  * of ipnet_find_by_zoneid() is required to call ipnet_rele().
2022  */
2023 void
ipnet_rele(ipnet_stack_t * ips)2024 ipnet_rele(ipnet_stack_t *ips)
2025 {
2026 	netstack_rele(ips->ips_netstack);
2027 }
2028 
2029 /*
2030  */
2031 void
ipnet_set_itap(bpf_itap_fn_t tapfunc)2032 ipnet_set_itap(bpf_itap_fn_t tapfunc)
2033 {
2034 	ipnet_itap = tapfunc;
2035 }
2036 
2037 /*
2038  * The list of interfaces available via ipnet is private for each zone,
2039  * so the AVL tree of each zone must be searched for a given name, even
2040  * if all names are unique.
2041  */
2042 int
ipnet_open_byname(const char * name,ipnetif_t ** ptr,zoneid_t zoneid)2043 ipnet_open_byname(const char *name, ipnetif_t **ptr, zoneid_t zoneid)
2044 {
2045 	ipnet_stack_t	*ips;
2046 	ipnetif_t	*ipnetif;
2047 
2048 	ASSERT(ptr != NULL);
2049 	VERIFY((ips = ipnet_find_by_zoneid(zoneid)) != NULL);
2050 
2051 	mutex_enter(&ips->ips_avl_lock);
2052 
2053 	/*
2054 	 * Shared instance zone?
2055 	 */
2056 	if (netstackid_to_zoneid(zoneid_to_netstackid(zoneid)) != zoneid) {
2057 		uintptr_t key[2] = { zoneid, (uintptr_t)name };
2058 
2059 		ipnetif = avl_find(&ips->ips_avl_by_shared, (void *)key, NULL);
2060 	} else {
2061 		ipnetif = avl_find(&ips->ips_avl_by_name, (void *)name, NULL);
2062 	}
2063 	if (ipnetif != NULL)
2064 		ipnetif_refhold(ipnetif);
2065 	mutex_exit(&ips->ips_avl_lock);
2066 
2067 	*ptr = ipnetif;
2068 	ipnet_rele(ips);
2069 
2070 	if (ipnetif == NULL)
2071 		return (ESRCH);
2072 	return (0);
2073 }
2074 
2075 void
ipnet_close_byhandle(ipnetif_t * ifp)2076 ipnet_close_byhandle(ipnetif_t *ifp)
2077 {
2078 	ASSERT(ifp != NULL);
2079 	ipnetif_refrele(ifp);
2080 }
2081 
2082 const char *
ipnet_name(ipnetif_t * ifp)2083 ipnet_name(ipnetif_t *ifp)
2084 {
2085 	ASSERT(ifp != NULL);
2086 	return (ifp->if_name);
2087 }
2088 
2089 /*
2090  * To find the linkid for a given name, it is necessary to know which zone
2091  * the interface name belongs to and to search the avl tree for that zone
2092  * as there is no master list of all interfaces and which zone they belong
2093  * to. It is assumed that the caller of this function is somehow already
2094  * working with the ipnet interfaces and hence the ips_event_lock is held.
2095  * When BPF calls into this function, it is doing so because of an event
2096  * in ipnet, and thus ipnet holds the ips_event_lock. Thus the datalink id
2097  * value returned has meaning without the need for grabbing a hold on the
2098  * owning structure.
2099  */
2100 int
ipnet_get_linkid_byname(const char * name,uint_t * idp,zoneid_t zoneid)2101 ipnet_get_linkid_byname(const char *name, uint_t *idp, zoneid_t zoneid)
2102 {
2103 	ipnet_stack_t	*ips;
2104 	ipnetif_t	*ifp;
2105 
2106 	VERIFY((ips = ipnet_find_by_zoneid(zoneid)) != NULL);
2107 	ASSERT(mutex_owned(&ips->ips_event_lock));
2108 
2109 	mutex_enter(&ips->ips_avl_lock);
2110 	ifp = avl_find(&ips->ips_avl_by_name, (void *)name, NULL);
2111 	if (ifp != NULL)
2112 		*idp = (uint_t)ifp->if_index;
2113 
2114 	/*
2115 	 * Shared instance zone?
2116 	 */
2117 	if (netstackid_to_zoneid(zoneid_to_netstackid(zoneid)) != zoneid) {
2118 		uintptr_t key[2] = { zoneid, (uintptr_t)name };
2119 
2120 		ifp = avl_find(&ips->ips_avl_by_shared, (void *)key, NULL);
2121 		if (ifp != NULL)
2122 			*idp = (uint_t)ifp->if_index;
2123 	}
2124 
2125 	mutex_exit(&ips->ips_avl_lock);
2126 	ipnet_rele(ips);
2127 
2128 	if (ifp == NULL)
2129 		return (ESRCH);
2130 	return (0);
2131 }
2132 
2133 /*
2134  * Strictly speaking, there is no such thing as a "client" in ipnet, like
2135  * there is in mac. BPF only needs to have this because it is required as
2136  * part of interfacing correctly with mac. The reuse of the original
2137  * ipnetif_t as a client poses no danger, so long as it is done with its
2138  * own ref-count'd hold that is given up on close.
2139  */
2140 int
ipnet_client_open(ipnetif_t * ptr,ipnetif_t ** result)2141 ipnet_client_open(ipnetif_t *ptr, ipnetif_t **result)
2142 {
2143 	ASSERT(ptr != NULL);
2144 	ASSERT(result != NULL);
2145 	ipnetif_refhold(ptr);
2146 	*result = ptr;
2147 
2148 	return (0);
2149 }
2150 
2151 void
ipnet_client_close(ipnetif_t * ptr)2152 ipnet_client_close(ipnetif_t *ptr)
2153 {
2154 	ASSERT(ptr != NULL);
2155 	ipnetif_refrele(ptr);
2156 }
2157 
2158 /*
2159  * This is called from BPF when it needs to start receiving packets
2160  * from ipnet.
2161  *
2162  * The use of the ipnet_t structure here is somewhat lightweight when
2163  * compared to how it is used elsewhere but it already has all of the
2164  * right fields in it, so reuse here doesn't seem out of order. Its
2165  * primary purpose here is to provide the means to store pointers for
2166  * use when ipnet_promisc_remove() needs to be called.
2167  *
2168  * This should never be called for the IPNET_MINOR_LO device as it is
2169  * never created via ipnetif_create.
2170  */
2171 /*ARGSUSED*/
2172 int
ipnet_promisc_add(void * handle,uint_t how,void * data,uintptr_t * mhandle,int flags)2173 ipnet_promisc_add(void *handle, uint_t how, void *data, uintptr_t *mhandle,
2174     int flags)
2175 {
2176 	ip_stack_t	*ipst;
2177 	netstack_t	*ns;
2178 	ipnetif_t	*ifp;
2179 	ipnet_t		*ipnet;
2180 	char		name[32];
2181 	int		error;
2182 
2183 	ifp = (ipnetif_t *)handle;
2184 	ns = netstack_find_by_zoneid(ifp->if_zoneid);
2185 
2186 	if ((how == DL_PROMISC_PHYS) || (how == DL_PROMISC_MULTI)) {
2187 		error = ipnet_join_allmulti(ifp, ns->netstack_ipnet);
2188 		if (error != 0)
2189 			return (error);
2190 	} else {
2191 		return (EINVAL);
2192 	}
2193 
2194 	ipnet = kmem_zalloc(sizeof (*ipnet), KM_SLEEP);
2195 	ipnet->ipnet_if = ifp;
2196 	ipnet->ipnet_ns = ns;
2197 	ipnet->ipnet_flags = flags;
2198 
2199 	if ((ifp->if_flags & IPNETIF_LOOPBACK) != 0) {
2200 		ipnet->ipnet_acceptfn = ipnet_loaccept;
2201 	} else {
2202 		ipnet->ipnet_acceptfn = ipnet_accept;
2203 	}
2204 
2205 	/*
2206 	 * To register multiple hooks with the same callback function,
2207 	 * a unique name is needed.
2208 	 */
2209 	HOOK_INIT(ipnet->ipnet_hook, ipnet_bpf_bounce, "", ipnet);
2210 	(void) snprintf(name, sizeof (name), "ipnet_promisc_%p",
2211 	    (void *)ipnet->ipnet_hook);
2212 	ipnet->ipnet_hook->h_name = strdup(name);
2213 	ipnet->ipnet_data = data;
2214 	ipnet->ipnet_zoneid = ifp->if_zoneid;
2215 
2216 	ipst = ns->netstack_ip;
2217 
2218 	error = net_hook_register(ipst->ips_ip4_observe_pr, NH_OBSERVE,
2219 	    ipnet->ipnet_hook);
2220 	if (error != 0)
2221 		goto regfail;
2222 
2223 	error = net_hook_register(ipst->ips_ip6_observe_pr, NH_OBSERVE,
2224 	    ipnet->ipnet_hook);
2225 	if (error != 0) {
2226 		(void) net_hook_unregister(ipst->ips_ip4_observe_pr,
2227 		    NH_OBSERVE, ipnet->ipnet_hook);
2228 		goto regfail;
2229 	}
2230 
2231 	*mhandle = (uintptr_t)ipnet;
2232 	netstack_rele(ns);
2233 
2234 	return (0);
2235 
2236 regfail:
2237 	cmn_err(CE_WARN, "net_hook_register failed: %d", error);
2238 	strfree(ipnet->ipnet_hook->h_name);
2239 	hook_free(ipnet->ipnet_hook);
2240 	netstack_rele(ns);
2241 	return (error);
2242 }
2243 
2244 void
ipnet_promisc_remove(void * data)2245 ipnet_promisc_remove(void *data)
2246 {
2247 	ip_stack_t	*ipst;
2248 	ipnet_t		*ipnet;
2249 	hook_t		*hook;
2250 
2251 	ipnet = data;
2252 	ipst = ipnet->ipnet_ns->netstack_ip;
2253 	hook = ipnet->ipnet_hook;
2254 
2255 	VERIFY(net_hook_unregister(ipst->ips_ip4_observe_pr, NH_OBSERVE,
2256 	    hook) == 0);
2257 
2258 	VERIFY(net_hook_unregister(ipst->ips_ip6_observe_pr, NH_OBSERVE,
2259 	    hook) == 0);
2260 
2261 	strfree(hook->h_name);
2262 
2263 	hook_free(hook);
2264 
2265 	kmem_free(ipnet, sizeof (*ipnet));
2266 }
2267 
2268 /*
2269  * arg here comes from the ipnet_t allocated in ipnet_promisc_add.
2270  * An important field from that structure is "ipnet_data" that
2271  * contains the "data" pointer passed into ipnet_promisc_add: it needs
2272  * to be passed back to bpf when we call into ipnet_itap.
2273  *
2274  * ipnet_itap is set by ipnet_set_bpfattach, which in turn is called
2275  * from BPF.
2276  */
2277 /*ARGSUSED*/
2278 static int
ipnet_bpf_bounce(hook_event_token_t token,hook_data_t info,void * arg)2279 ipnet_bpf_bounce(hook_event_token_t token, hook_data_t info, void *arg)
2280 {
2281 	hook_pkt_observe_t	*hdr;
2282 	ipnet_addrp_t		src;
2283 	ipnet_addrp_t		dst;
2284 	ipnet_stack_t		*ips;
2285 	ipnet_t			*ipnet;
2286 	mblk_t			*netmp;
2287 	mblk_t			*mp;
2288 
2289 	hdr = (hook_pkt_observe_t *)info;
2290 	mp = hdr->hpo_pkt;
2291 	ipnet = (ipnet_t *)arg;
2292 	ips = ((netstack_t *)hdr->hpo_ctx)->netstack_ipnet;
2293 
2294 	netmp = hdr->hpo_pkt->b_cont;
2295 	src.iap_family = hdr->hpo_family;
2296 	dst.iap_family = hdr->hpo_family;
2297 
2298 	if (hdr->hpo_family == AF_INET) {
2299 		src.iap_addr4 = &((ipha_t *)(netmp->b_rptr))->ipha_src;
2300 		dst.iap_addr4 = &((ipha_t *)(netmp->b_rptr))->ipha_dst;
2301 	} else {
2302 		src.iap_addr6 = &((ip6_t *)(netmp->b_rptr))->ip6_src;
2303 		dst.iap_addr6 = &((ip6_t *)(netmp->b_rptr))->ip6_dst;
2304 	}
2305 
2306 	if (!(*ipnet->ipnet_acceptfn)(ipnet, hdr, &src, &dst)) {
2307 		IPSK_BUMP(ips, ik_acceptFail);
2308 		return (0);
2309 	}
2310 	IPSK_BUMP(ips, ik_acceptOk);
2311 
2312 	ipnet_itap(ipnet->ipnet_data, mp,
2313 	    hdr->hpo_htype == htons(IPOBS_HOOK_OUTBOUND),
2314 	    ntohl(hdr->hpo_pktlen) + MBLKL(mp));
2315 
2316 	return (0);
2317 }
2318 
2319 /*
2320  * clone'd ipnetif_t's are created when a shared IP instance zone comes
2321  * to life and configures an IP address. The model that BPF uses is that
2322  * each interface must have a unique pointer and each interface must be
2323  * representative of what it can capture. They are limited to one DLT
2324  * per interface and one zone per interface. Thus every interface that
2325  * can be seen in a zone must be announced via an attach to bpf. For
2326  * shared instance zones, this means the ipnet driver needs to detect
2327  * when an address is added to an interface in a zone for the first
2328  * time (and also when the last address is removed.)
2329  */
2330 static ipnetif_t *
ipnetif_clone_create(ipnetif_t * ifp,zoneid_t zoneid)2331 ipnetif_clone_create(ipnetif_t *ifp, zoneid_t zoneid)
2332 {
2333 	uintptr_t	key[2] = { zoneid, (uintptr_t)ifp->if_name };
2334 	ipnet_stack_t	*ips = ifp->if_stackp;
2335 	avl_index_t	where = 0;
2336 	ipnetif_t	*newif;
2337 
2338 	mutex_enter(&ips->ips_avl_lock);
2339 	newif = avl_find(&ips->ips_avl_by_shared, (void *)key, &where);
2340 	if (newif != NULL) {
2341 		ipnetif_refhold(newif);
2342 		newif->if_sharecnt++;
2343 		mutex_exit(&ips->ips_avl_lock);
2344 		return (newif);
2345 	}
2346 
2347 	newif = ipnet_alloc_if(ips);
2348 	if (newif == NULL) {
2349 		mutex_exit(&ips->ips_avl_lock);
2350 		return (NULL);
2351 	}
2352 
2353 	newif->if_refcnt = 1;
2354 	newif->if_sharecnt = 1;
2355 	newif->if_zoneid = zoneid;
2356 	(void) strlcpy(newif->if_name, ifp->if_name, LIFNAMSIZ);
2357 	newif->if_flags = ifp->if_flags & IPNETIF_LOOPBACK;
2358 	newif->if_index = ifp->if_index;
2359 
2360 	avl_insert(&ips->ips_avl_by_shared, newif, where);
2361 	mutex_exit(&ips->ips_avl_lock);
2362 
2363 	return (newif);
2364 }
2365 
2366 static void
ipnetif_clone_release(ipnetif_t * ipnetif)2367 ipnetif_clone_release(ipnetif_t *ipnetif)
2368 {
2369 	boolean_t	dofree = B_FALSE;
2370 	boolean_t	doremove = B_FALSE;
2371 	ipnet_stack_t	*ips = ipnetif->if_stackp;
2372 
2373 	mutex_enter(&ipnetif->if_reflock);
2374 	ASSERT(ipnetif->if_refcnt > 0);
2375 	if (--ipnetif->if_refcnt == 0)
2376 		dofree = B_TRUE;
2377 	ASSERT(ipnetif->if_sharecnt > 0);
2378 	if (--ipnetif->if_sharecnt == 0)
2379 		doremove = B_TRUE;
2380 	mutex_exit(&ipnetif->if_reflock);
2381 	if (doremove) {
2382 		mutex_enter(&ips->ips_avl_lock);
2383 		avl_remove(&ips->ips_avl_by_shared, ipnetif);
2384 		mutex_exit(&ips->ips_avl_lock);
2385 	}
2386 	if (dofree) {
2387 		ASSERT(ipnetif->if_sharecnt == 0);
2388 		ipnetif_free(ipnetif);
2389 	}
2390 }
2391