1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 22 /* 23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 24 * Use is subject to license terms. 25 */ 26 27 /* 28 * The ipnet device defined here provides access to packets at the IP layer. To 29 * provide access to packets at this layer it registers a callback function in 30 * the ip module and when there are open instances of the device ip will pass 31 * packets into the device. Packets from ip are passed on the input, output and 32 * loopback paths. Internally the module returns to ip as soon as possible by 33 * deferring processing using a taskq. 34 * 35 * Management of the devices in /dev/ipnet/ is handled by the devname 36 * filesystem and use of the neti interfaces. This module registers for NIC 37 * events using the neti framework so that when IP interfaces are bought up, 38 * taken down etc. the ipnet module is notified and its view of the interfaces 39 * configured on the system adjusted. On attach, the module gets an initial 40 * view of the system again using the neti framework but as it has already 41 * registered for IP interface events, it is still up-to-date with any changes. 42 */ 43 44 #include <sys/types.h> 45 #include <sys/conf.h> 46 #include <sys/cred.h> 47 #include <sys/stat.h> 48 #include <sys/ddi.h> 49 #include <sys/sunddi.h> 50 #include <sys/modctl.h> 51 #include <sys/dlpi.h> 52 #include <sys/strsun.h> 53 #include <sys/id_space.h> 54 #include <sys/kmem.h> 55 #include <sys/mkdev.h> 56 #include <sys/neti.h> 57 #include <net/if.h> 58 #include <sys/errno.h> 59 #include <sys/list.h> 60 #include <sys/ksynch.h> 61 #include <sys/hook_event.h> 62 #include <sys/sdt.h> 63 #include <sys/stropts.h> 64 #include <sys/sysmacros.h> 65 #include <inet/ip.h> 66 #include <inet/ip_if.h> 67 #include <inet/ip_multi.h> 68 #include <inet/ip6.h> 69 #include <inet/ipnet.h> 70 #include <net/bpf.h> 71 #include <net/bpfdesc.h> 72 #include <net/dlt.h> 73 74 static struct module_info ipnet_minfo = { 75 1, /* mi_idnum */ 76 "ipnet", /* mi_idname */ 77 0, /* mi_minpsz */ 78 INFPSZ, /* mi_maxpsz */ 79 2048, /* mi_hiwat */ 80 0 /* mi_lowat */ 81 }; 82 83 /* 84 * List to hold static view of ipnetif_t's on the system. This is needed to 85 * avoid holding the lock protecting the avl tree of ipnetif's over the 86 * callback into the dev filesystem. 87 */ 88 typedef struct ipnetif_cbdata { 89 char ic_ifname[LIFNAMSIZ]; 90 dev_t ic_dev; 91 list_node_t ic_next; 92 } ipnetif_cbdata_t; 93 94 /* 95 * Convenience enumerated type for ipnet_accept(). It describes the 96 * properties of a given ipnet_addrp_t relative to a single ipnet_t 97 * client stream. The values represent whether the address is ... 
98 */ 99 typedef enum { 100 IPNETADDR_MYADDR, /* an address on my ipnetif_t. */ 101 IPNETADDR_MBCAST, /* a multicast or broadcast address. */ 102 IPNETADDR_UNKNOWN /* none of the above. */ 103 } ipnet_addrtype_t; 104 105 /* Argument used for the ipnet_nicevent_taskq callback. */ 106 typedef struct ipnet_nicevent_s { 107 nic_event_t ipne_event; 108 net_handle_t ipne_protocol; 109 netstackid_t ipne_stackid; 110 uint64_t ipne_ifindex; 111 uint64_t ipne_lifindex; 112 char ipne_ifname[LIFNAMSIZ]; 113 } ipnet_nicevent_t; 114 115 static dev_info_t *ipnet_dip; 116 static major_t ipnet_major; 117 static ddi_taskq_t *ipnet_taskq; /* taskq for packets */ 118 static ddi_taskq_t *ipnet_nicevent_taskq; /* taskq for NIC events */ 119 static id_space_t *ipnet_minor_space; 120 static const int IPNET_MINOR_LO = 1; /* minor number for /dev/lo0 */ 121 static const int IPNET_MINOR_MIN = 2; /* start of dynamic minors */ 122 static dl_info_ack_t ipnet_infoack = IPNET_INFO_ACK_INIT; 123 static ipnet_acceptfn_t ipnet_accept, ipnet_loaccept; 124 static bpf_itap_fn_t ipnet_itap; 125 126 static void ipnet_input(mblk_t *); 127 static int ipnet_wput(queue_t *, mblk_t *); 128 static int ipnet_rsrv(queue_t *); 129 static int ipnet_open(queue_t *, dev_t *, int, int, cred_t *); 130 static int ipnet_close(queue_t *); 131 static void ipnet_ioctl(queue_t *, mblk_t *); 132 static void ipnet_iocdata(queue_t *, mblk_t *); 133 static void ipnet_wputnondata(queue_t *, mblk_t *); 134 static int ipnet_attach(dev_info_t *, ddi_attach_cmd_t); 135 static int ipnet_detach(dev_info_t *, ddi_detach_cmd_t); 136 static int ipnet_devinfo(dev_info_t *, ddi_info_cmd_t, void *, void **); 137 static void ipnet_inforeq(queue_t *q, mblk_t *mp); 138 static void ipnet_bindreq(queue_t *q, mblk_t *mp); 139 static void ipnet_unbindreq(queue_t *q, mblk_t *mp); 140 static void ipnet_dlpromisconreq(queue_t *q, mblk_t *mp); 141 static void ipnet_dlpromiscoffreq(queue_t *q, mblk_t *mp); 142 static int ipnet_join_allmulti(ipnetif_t *, ipnet_stack_t *); 143 static void ipnet_leave_allmulti(ipnetif_t *, ipnet_stack_t *); 144 static int ipnet_nicevent_cb(hook_event_token_t, hook_data_t, void *); 145 static void ipnet_nicevent_task(void *); 146 static ipnetif_t *ipnetif_create(const char *, uint64_t, ipnet_stack_t *, 147 uint64_t); 148 static void ipnetif_remove(ipnetif_t *, ipnet_stack_t *); 149 static ipnetif_addr_t *ipnet_match_lif(ipnetif_t *, lif_if_t, boolean_t); 150 static ipnetif_t *ipnetif_getby_index(uint64_t, ipnet_stack_t *); 151 static ipnetif_t *ipnetif_getby_dev(dev_t, ipnet_stack_t *); 152 static boolean_t ipnetif_in_zone(ipnetif_t *, zoneid_t, ipnet_stack_t *); 153 static void ipnetif_zonecheck(ipnetif_t *, ipnet_stack_t *); 154 static int ipnet_populate_if(net_handle_t, ipnet_stack_t *, boolean_t); 155 static int ipnetif_compare_name(const void *, const void *); 156 static int ipnetif_compare_name_zone(const void *, const void *); 157 static int ipnetif_compare_index(const void *, const void *); 158 static void ipnet_add_ifaddr(uint64_t, ipnetif_t *, net_handle_t); 159 static void ipnet_delete_ifaddr(ipnetif_addr_t *, ipnetif_t *, boolean_t); 160 static void ipnetif_refhold(ipnetif_t *); 161 static void ipnetif_refrele(ipnetif_t *); 162 static void ipnet_walkers_inc(ipnet_stack_t *); 163 static void ipnet_walkers_dec(ipnet_stack_t *); 164 static void ipnet_register_netihook(ipnet_stack_t *); 165 static void *ipnet_stack_init(netstackid_t, netstack_t *); 166 static void ipnet_stack_fini(netstackid_t, void *); 167 static void ipnet_dispatch(void 
*); 168 static int ipobs_bounce_func(hook_event_token_t, hook_data_t, void *); 169 static void ipnet_bpfattach(ipnetif_t *); 170 static void ipnet_bpfdetach(ipnetif_t *); 171 static int ipnet_bpf_bounce(hook_event_token_t, hook_data_t, void *); 172 static void ipnet_bpf_probe_shared(ipnet_stack_t *); 173 static void ipnet_bpf_release_shared(ipnet_stack_t *); 174 static ipnetif_t *ipnetif_clone_create(ipnetif_t *, zoneid_t); 175 static void ipnetif_clone_release(ipnetif_t *); 176 177 static struct qinit ipnet_rinit = { 178 NULL, /* qi_putp */ 179 ipnet_rsrv, /* qi_srvp */ 180 ipnet_open, /* qi_qopen */ 181 ipnet_close, /* qi_qclose */ 182 NULL, /* qi_qadmin */ 183 &ipnet_minfo, /* qi_minfo */ 184 }; 185 186 static struct qinit ipnet_winit = { 187 ipnet_wput, /* qi_putp */ 188 NULL, /* qi_srvp */ 189 NULL, /* qi_qopen */ 190 NULL, /* qi_qclose */ 191 NULL, /* qi_qadmin */ 192 &ipnet_minfo, /* qi_minfo */ 193 }; 194 195 static struct streamtab ipnet_info = { 196 &ipnet_rinit, &ipnet_winit 197 }; 198 199 DDI_DEFINE_STREAM_OPS(ipnet_ops, nulldev, nulldev, ipnet_attach, 200 ipnet_detach, nodev, ipnet_devinfo, D_MP | D_MTPERMOD, &ipnet_info, 201 ddi_quiesce_not_supported); 202 203 static struct modldrv modldrv = { 204 &mod_driverops, 205 "STREAMS ipnet driver", 206 &ipnet_ops 207 }; 208 209 static struct modlinkage modlinkage = { 210 MODREV_1, &modldrv, NULL 211 }; 212 213 /* 214 * This structure contains the template data (names and type) that is 215 * copied, in bulk, into the new kstats structure created by net_kstat_create. 216 * No actual statistical information is stored in this instance of the 217 * ipnet_kstats_t structure. 218 */ 219 static ipnet_kstats_t stats_template = { 220 { "duplicationFail", KSTAT_DATA_UINT64 }, 221 { "dispatchOk", KSTAT_DATA_UINT64 }, 222 { "dispatchFail", KSTAT_DATA_UINT64 }, 223 { "dispatchHeaderDrop", KSTAT_DATA_UINT64 }, 224 { "dispatchDupDrop", KSTAT_DATA_UINT64 }, 225 { "dispatchDeliver", KSTAT_DATA_UINT64 }, 226 { "acceptOk", KSTAT_DATA_UINT64 }, 227 { "acceptFail", KSTAT_DATA_UINT64 } 228 }; 229 230 /* 231 * Walk the list of physical interfaces on the machine, for each 232 * interface create a new ipnetif_t and add any addresses to it. We 233 * need to do the walk twice, once for IPv4 and once for IPv6. 234 * 235 * The interfaces are destroyed as part of ipnet_stack_fini() for each 236 * stack. Note that we cannot do this initialization in 237 * ipnet_stack_init(), since ipnet_stack_init() cannot fail. 238 */ 239 static int 240 ipnetif_init(void) 241 { 242 netstack_handle_t nh; 243 netstack_t *ns; 244 ipnet_stack_t *ips; 245 int ret = 0; 246 247 netstack_next_init(&nh); 248 while ((ns = netstack_next(&nh)) != NULL) { 249 ips = ns->netstack_ipnet; 250 if ((ret = ipnet_populate_if(ips->ips_ndv4, ips, B_FALSE)) == 0) 251 ret = ipnet_populate_if(ips->ips_ndv6, ips, B_TRUE); 252 netstack_rele(ns); 253 if (ret != 0) 254 break; 255 } 256 netstack_next_fini(&nh); 257 return (ret); 258 } 259 260 /* 261 * Standard module entry points. 262 */ 263 int 264 _init(void) 265 { 266 int ret; 267 boolean_t netstack_registered = B_FALSE; 268 269 if ((ipnet_major = ddi_name_to_major("ipnet")) == (major_t)-1) 270 return (ENODEV); 271 ipnet_minor_space = id_space_create("ipnet_minor_space", 272 IPNET_MINOR_MIN, MAXMIN32); 273 274 /* 275 * We call ddi_taskq_create() with nthread == 1 to ensure in-order 276 * delivery of packets to clients. 
Note that we need to create the 277 * taskqs before calling netstack_register() since ipnet_stack_init() 278 * registers callbacks that use 'em. 279 */ 280 ipnet_taskq = ddi_taskq_create(NULL, "ipnet", 1, TASKQ_DEFAULTPRI, 0); 281 ipnet_nicevent_taskq = ddi_taskq_create(NULL, "ipnet_nic_event_queue", 282 1, TASKQ_DEFAULTPRI, 0); 283 if (ipnet_taskq == NULL || ipnet_nicevent_taskq == NULL) { 284 ret = ENOMEM; 285 goto done; 286 } 287 288 netstack_register(NS_IPNET, ipnet_stack_init, NULL, ipnet_stack_fini); 289 netstack_registered = B_TRUE; 290 291 if ((ret = ipnetif_init()) == 0) 292 ret = mod_install(&modlinkage); 293 done: 294 if (ret != 0) { 295 if (ipnet_taskq != NULL) 296 ddi_taskq_destroy(ipnet_taskq); 297 if (ipnet_nicevent_taskq != NULL) 298 ddi_taskq_destroy(ipnet_nicevent_taskq); 299 if (netstack_registered) 300 netstack_unregister(NS_IPNET); 301 id_space_destroy(ipnet_minor_space); 302 } 303 return (ret); 304 } 305 306 int 307 _fini(void) 308 { 309 int err; 310 311 if ((err = mod_remove(&modlinkage)) != 0) 312 return (err); 313 314 netstack_unregister(NS_IPNET); 315 ddi_taskq_destroy(ipnet_nicevent_taskq); 316 ddi_taskq_destroy(ipnet_taskq); 317 id_space_destroy(ipnet_minor_space); 318 return (0); 319 } 320 321 int 322 _info(struct modinfo *modinfop) 323 { 324 return (mod_info(&modlinkage, modinfop)); 325 } 326 327 static void 328 ipnet_register_netihook(ipnet_stack_t *ips) 329 { 330 int ret; 331 zoneid_t zoneid; 332 netid_t netid; 333 334 HOOK_INIT(ips->ips_nicevents, ipnet_nicevent_cb, "ipnet_nicevents", 335 ips); 336 337 /* 338 * It is possible for an exclusive stack to be in the process of 339 * shutting down here, and the netid and protocol lookups could fail 340 * in that case. 341 */ 342 zoneid = netstackid_to_zoneid(ips->ips_netstack->netstack_stackid); 343 if ((netid = net_zoneidtonetid(zoneid)) == -1) 344 return; 345 346 if ((ips->ips_ndv4 = net_protocol_lookup(netid, NHF_INET)) != NULL) { 347 if ((ret = net_hook_register(ips->ips_ndv4, NH_NIC_EVENTS, 348 ips->ips_nicevents)) != 0) { 349 VERIFY(net_protocol_release(ips->ips_ndv4) == 0); 350 ips->ips_ndv4 = NULL; 351 cmn_err(CE_WARN, "unable to register IPv4 netinfo hooks" 352 " in zone %d: %d", zoneid, ret); 353 } 354 } 355 if ((ips->ips_ndv6 = net_protocol_lookup(netid, NHF_INET6)) != NULL) { 356 if ((ret = net_hook_register(ips->ips_ndv6, NH_NIC_EVENTS, 357 ips->ips_nicevents)) != 0) { 358 VERIFY(net_protocol_release(ips->ips_ndv6) == 0); 359 ips->ips_ndv6 = NULL; 360 cmn_err(CE_WARN, "unable to register IPv6 netinfo hooks" 361 " in zone %d: %d", zoneid, ret); 362 } 363 } 364 365 /* 366 * Create a local set of kstats for each zone. 367 */ 368 ips->ips_kstatp = net_kstat_create(netid, "ipnet", 0, "ipnet_stats", 369 "misc", KSTAT_TYPE_NAMED, 370 sizeof (ipnet_kstats_t) / sizeof (kstat_named_t), 0); 371 if (ips->ips_kstatp != NULL) { 372 bcopy(&stats_template, &ips->ips_stats, 373 sizeof (ips->ips_stats)); 374 ips->ips_kstatp->ks_data = &ips->ips_stats; 375 ips->ips_kstatp->ks_private = 376 (void *)(uintptr_t)ips->ips_netstack->netstack_stackid; 377 kstat_install(ips->ips_kstatp); 378 } else { 379 cmn_err(CE_WARN, "net_kstat_create(%s,%s,%s) failed", 380 "ipnet", "ipnet_stats", "misc"); 381 } 382 } 383 384 /* 385 * This function is called on attach to build an initial view of the 386 * interfaces on the system. It will be called once for IPv4 and once 387 * for IPv6, although there is only one ipnet interface for both IPv4 388 * and IPv6 there are separate address lists. 
389 */ 390 static int 391 ipnet_populate_if(net_handle_t nd, ipnet_stack_t *ips, boolean_t isv6) 392 { 393 phy_if_t phyif; 394 lif_if_t lif; 395 ipnetif_t *ipnetif; 396 char name[LIFNAMSIZ]; 397 boolean_t new_if = B_FALSE; 398 uint64_t ifflags; 399 int ret = 0; 400 401 /* 402 * If ipnet_register_netihook() was unable to initialize this 403 * stack's net_handle_t, then we cannot populate any interface 404 * information. This usually happens when we attempted to 405 * grab a net_handle_t as a stack was shutting down. We don't 406 * want to fail the entire _init() operation because of a 407 * stack shutdown (other stacks will continue to work just 408 * fine), so we silently return success here. 409 */ 410 if (nd == NULL) 411 return (0); 412 413 /* 414 * Make sure we're not processing NIC events during the 415 * population of our interfaces and address lists. 416 */ 417 mutex_enter(&ips->ips_event_lock); 418 419 for (phyif = net_phygetnext(nd, 0); phyif != 0; 420 phyif = net_phygetnext(nd, phyif)) { 421 if (net_getifname(nd, phyif, name, LIFNAMSIZ) != 0) 422 continue; 423 ifflags = 0; 424 (void) net_getlifflags(nd, phyif, 0, &ifflags); 425 if ((ipnetif = ipnetif_getby_index(phyif, ips)) == NULL) { 426 ipnetif = ipnetif_create(name, phyif, ips, ifflags); 427 if (ipnetif == NULL) { 428 ret = ENOMEM; 429 goto done; 430 } 431 new_if = B_TRUE; 432 } 433 ipnetif->if_flags |= 434 isv6 ? IPNETIF_IPV6PLUMBED : IPNETIF_IPV4PLUMBED; 435 436 for (lif = net_lifgetnext(nd, phyif, 0); lif != 0; 437 lif = net_lifgetnext(nd, phyif, lif)) { 438 /* 439 * Skip addresses that aren't up. We'll add 440 * them when we receive an NE_LIF_UP event. 441 */ 442 if (net_getlifflags(nd, phyif, lif, &ifflags) != 0 || 443 !(ifflags & IFF_UP)) 444 continue; 445 /* Don't add it if we already have it. */ 446 if (ipnet_match_lif(ipnetif, lif, isv6) != NULL) 447 continue; 448 ipnet_add_ifaddr(lif, ipnetif, nd); 449 } 450 if (!new_if) 451 ipnetif_refrele(ipnetif); 452 } 453 454 done: 455 mutex_exit(&ips->ips_event_lock); 456 return (ret); 457 } 458 459 static int 460 ipnet_attach(dev_info_t *dip, ddi_attach_cmd_t cmd) 461 { 462 if (cmd != DDI_ATTACH) 463 return (DDI_FAILURE); 464 465 if (ddi_create_minor_node(dip, "lo0", S_IFCHR, IPNET_MINOR_LO, 466 DDI_PSEUDO, 0) == DDI_FAILURE) 467 return (DDI_FAILURE); 468 469 ipnet_dip = dip; 470 return (DDI_SUCCESS); 471 } 472 473 static int 474 ipnet_detach(dev_info_t *dip, ddi_detach_cmd_t cmd) 475 { 476 if (cmd != DDI_DETACH) 477 return (DDI_FAILURE); 478 479 ASSERT(dip == ipnet_dip); 480 ddi_remove_minor_node(ipnet_dip, NULL); 481 ipnet_dip = NULL; 482 return (DDI_SUCCESS); 483 } 484 485 /* ARGSUSED */ 486 static int 487 ipnet_devinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result) 488 { 489 int error = DDI_FAILURE; 490 491 switch (infocmd) { 492 case DDI_INFO_DEVT2INSTANCE: 493 *result = (void *)0; 494 error = DDI_SUCCESS; 495 break; 496 case DDI_INFO_DEVT2DEVINFO: 497 if (ipnet_dip != NULL) { 498 *result = ipnet_dip; 499 error = DDI_SUCCESS; 500 } 501 break; 502 } 503 return (error); 504 } 505 506 /* ARGSUSED */ 507 static int 508 ipnet_open(queue_t *rq, dev_t *dev, int oflag, int sflag, cred_t *crp) 509 { 510 ipnet_t *ipnet; 511 netstack_t *ns = NULL; 512 ipnet_stack_t *ips; 513 int err = 0; 514 zoneid_t zoneid = crgetzoneid(crp); 515 516 /* 517 * If the system is labeled, only the global zone is allowed to open 518 * IP observability nodes. 
519 */ 520 if (is_system_labeled() && zoneid != GLOBAL_ZONEID) 521 return (EACCES); 522 523 /* We don't support open as a module */ 524 if (sflag & MODOPEN) 525 return (ENOTSUP); 526 527 /* This driver is self-cloning, we don't support re-open. */ 528 if (rq->q_ptr != NULL) 529 return (EBUSY); 530 531 if ((ipnet = kmem_zalloc(sizeof (*ipnet), KM_NOSLEEP)) == NULL) 532 return (ENOMEM); 533 534 VERIFY((ns = netstack_find_by_cred(crp)) != NULL); 535 ips = ns->netstack_ipnet; 536 537 rq->q_ptr = WR(rq)->q_ptr = ipnet; 538 ipnet->ipnet_rq = rq; 539 ipnet->ipnet_minor = (minor_t)id_alloc(ipnet_minor_space); 540 ipnet->ipnet_zoneid = zoneid; 541 ipnet->ipnet_dlstate = DL_UNBOUND; 542 ipnet->ipnet_ns = ns; 543 544 /* 545 * We need to hold ips_event_lock here as any NE_LIF_DOWN events need 546 * to be processed after ipnet_if is set and the ipnet_t has been 547 * inserted in the ips_str_list. 548 */ 549 mutex_enter(&ips->ips_event_lock); 550 if (getminor(*dev) == IPNET_MINOR_LO) { 551 ipnet->ipnet_flags |= IPNET_LOMODE; 552 ipnet->ipnet_acceptfn = ipnet_loaccept; 553 } else { 554 ipnet->ipnet_acceptfn = ipnet_accept; 555 ipnet->ipnet_if = ipnetif_getby_dev(*dev, ips); 556 if (ipnet->ipnet_if == NULL || 557 !ipnetif_in_zone(ipnet->ipnet_if, zoneid, ips)) { 558 err = ENODEV; 559 goto done; 560 } 561 } 562 563 mutex_enter(&ips->ips_walkers_lock); 564 while (ips->ips_walkers_cnt != 0) 565 cv_wait(&ips->ips_walkers_cv, &ips->ips_walkers_lock); 566 list_insert_head(&ips->ips_str_list, ipnet); 567 *dev = makedevice(getmajor(*dev), ipnet->ipnet_minor); 568 qprocson(rq); 569 570 /* 571 * Only register our callback if we're the first open client; we call 572 * unregister in close() for the last open client. 573 */ 574 if (list_head(&ips->ips_str_list) == list_tail(&ips->ips_str_list)) 575 ips->ips_hook = ipobs_register_hook(ns, ipnet_input); 576 mutex_exit(&ips->ips_walkers_lock); 577 578 done: 579 mutex_exit(&ips->ips_event_lock); 580 if (err != 0) { 581 netstack_rele(ns); 582 id_free(ipnet_minor_space, ipnet->ipnet_minor); 583 if (ipnet->ipnet_if != NULL) 584 ipnetif_refrele(ipnet->ipnet_if); 585 kmem_free(ipnet, sizeof (*ipnet)); 586 } 587 return (err); 588 } 589 590 static int 591 ipnet_close(queue_t *rq) 592 { 593 ipnet_t *ipnet = rq->q_ptr; 594 ipnet_stack_t *ips = ipnet->ipnet_ns->netstack_ipnet; 595 596 if (ipnet->ipnet_flags & IPNET_PROMISC_PHYS) 597 ipnet_leave_allmulti(ipnet->ipnet_if, ips); 598 if (ipnet->ipnet_flags & IPNET_PROMISC_MULTI) 599 ipnet_leave_allmulti(ipnet->ipnet_if, ips); 600 601 mutex_enter(&ips->ips_walkers_lock); 602 while (ips->ips_walkers_cnt != 0) 603 cv_wait(&ips->ips_walkers_cv, &ips->ips_walkers_lock); 604 605 qprocsoff(rq); 606 607 list_remove(&ips->ips_str_list, ipnet); 608 if (ipnet->ipnet_if != NULL) 609 ipnetif_refrele(ipnet->ipnet_if); 610 id_free(ipnet_minor_space, ipnet->ipnet_minor); 611 612 if (list_is_empty(&ips->ips_str_list)) { 613 ipobs_unregister_hook(ips->ips_netstack, ips->ips_hook); 614 ips->ips_hook = NULL; 615 } 616 617 kmem_free(ipnet, sizeof (*ipnet)); 618 619 mutex_exit(&ips->ips_walkers_lock); 620 netstack_rele(ips->ips_netstack); 621 return (0); 622 } 623 624 static int 625 ipnet_wput(queue_t *q, mblk_t *mp) 626 { 627 switch (mp->b_datap->db_type) { 628 case M_FLUSH: 629 if (*mp->b_rptr & FLUSHW) { 630 flushq(q, FLUSHDATA); 631 *mp->b_rptr &= ~FLUSHW; 632 } 633 if (*mp->b_rptr & FLUSHR) 634 qreply(q, mp); 635 else 636 freemsg(mp); 637 break; 638 case M_PROTO: 639 case M_PCPROTO: 640 ipnet_wputnondata(q, mp); 641 break; 642 case M_IOCTL: 643 
ipnet_ioctl(q, mp); 644 break; 645 case M_IOCDATA: 646 ipnet_iocdata(q, mp); 647 break; 648 default: 649 freemsg(mp); 650 break; 651 } 652 return (0); 653 } 654 655 static int 656 ipnet_rsrv(queue_t *q) 657 { 658 mblk_t *mp; 659 660 while ((mp = getq(q)) != NULL) { 661 ASSERT(DB_TYPE(mp) == M_DATA); 662 if (canputnext(q)) { 663 putnext(q, mp); 664 } else { 665 (void) putbq(q, mp); 666 break; 667 } 668 } 669 return (0); 670 } 671 672 static void 673 ipnet_ioctl(queue_t *q, mblk_t *mp) 674 { 675 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 676 677 switch (iocp->ioc_cmd) { 678 case DLIOCRAW: 679 miocack(q, mp, 0, 0); 680 break; 681 case DLIOCIPNETINFO: 682 if (iocp->ioc_count == TRANSPARENT) { 683 mcopyin(mp, NULL, sizeof (uint_t), NULL); 684 qreply(q, mp); 685 break; 686 } 687 /* Fallthrough, we don't support I_STR with DLIOCIPNETINFO. */ 688 default: 689 miocnak(q, mp, 0, EINVAL); 690 break; 691 } 692 } 693 694 static void 695 ipnet_iocdata(queue_t *q, mblk_t *mp) 696 { 697 struct iocblk *iocp = (struct iocblk *)mp->b_rptr; 698 ipnet_t *ipnet = q->q_ptr; 699 700 switch (iocp->ioc_cmd) { 701 case DLIOCIPNETINFO: 702 if (*(int *)mp->b_cont->b_rptr == 1) 703 ipnet->ipnet_flags |= IPNET_INFO; 704 else if (*(int *)mp->b_cont->b_rptr == 0) 705 ipnet->ipnet_flags &= ~IPNET_INFO; 706 else 707 goto iocnak; 708 miocack(q, mp, 0, DL_IPNETINFO_VERSION); 709 break; 710 default: 711 iocnak: 712 miocnak(q, mp, 0, EINVAL); 713 break; 714 } 715 } 716 717 static void 718 ipnet_wputnondata(queue_t *q, mblk_t *mp) 719 { 720 union DL_primitives *dlp = (union DL_primitives *)mp->b_rptr; 721 t_uscalar_t prim = dlp->dl_primitive; 722 723 switch (prim) { 724 case DL_INFO_REQ: 725 ipnet_inforeq(q, mp); 726 break; 727 case DL_UNBIND_REQ: 728 ipnet_unbindreq(q, mp); 729 break; 730 case DL_BIND_REQ: 731 ipnet_bindreq(q, mp); 732 break; 733 case DL_PROMISCON_REQ: 734 ipnet_dlpromisconreq(q, mp); 735 break; 736 case DL_PROMISCOFF_REQ: 737 ipnet_dlpromiscoffreq(q, mp); 738 break; 739 case DL_UNITDATA_REQ: 740 case DL_DETACH_REQ: 741 case DL_PHYS_ADDR_REQ: 742 case DL_SET_PHYS_ADDR_REQ: 743 case DL_ENABMULTI_REQ: 744 case DL_DISABMULTI_REQ: 745 case DL_ATTACH_REQ: 746 dlerrorack(q, mp, prim, DL_UNSUPPORTED, 0); 747 break; 748 default: 749 dlerrorack(q, mp, prim, DL_BADPRIM, 0); 750 break; 751 } 752 } 753 754 static void 755 ipnet_inforeq(queue_t *q, mblk_t *mp) 756 { 757 dl_info_ack_t *dlip; 758 size_t size = sizeof (dl_info_ack_t) + sizeof (ushort_t); 759 760 if (MBLKL(mp) < DL_INFO_REQ_SIZE) { 761 dlerrorack(q, mp, DL_INFO_REQ, DL_BADPRIM, 0); 762 return; 763 } 764 765 if ((mp = mexchange(q, mp, size, M_PCPROTO, DL_INFO_ACK)) == NULL) 766 return; 767 768 dlip = (dl_info_ack_t *)mp->b_rptr; 769 *dlip = ipnet_infoack; 770 qreply(q, mp); 771 } 772 773 static void 774 ipnet_bindreq(queue_t *q, mblk_t *mp) 775 { 776 union DL_primitives *dlp = (union DL_primitives *)mp->b_rptr; 777 ipnet_t *ipnet = q->q_ptr; 778 779 if (MBLKL(mp) < DL_BIND_REQ_SIZE) { 780 dlerrorack(q, mp, DL_BIND_REQ, DL_BADPRIM, 0); 781 return; 782 } 783 784 switch (dlp->bind_req.dl_sap) { 785 case 0 : 786 ipnet->ipnet_family = AF_UNSPEC; 787 break; 788 case IPV4_VERSION : 789 ipnet->ipnet_family = AF_INET; 790 break; 791 case IPV6_VERSION : 792 ipnet->ipnet_family = AF_INET6; 793 break; 794 default : 795 dlerrorack(q, mp, DL_BIND_REQ, DL_BADSAP, 0); 796 return; 797 /*NOTREACHED*/ 798 } 799 800 ipnet->ipnet_dlstate = DL_IDLE; 801 dlbindack(q, mp, dlp->bind_req.dl_sap, 0, 0, 0, 0); 802 } 803 804 static void 805 ipnet_unbindreq(queue_t *q, mblk_t *mp) 
806 { 807 ipnet_t *ipnet = q->q_ptr; 808 809 if (MBLKL(mp) < DL_UNBIND_REQ_SIZE) { 810 dlerrorack(q, mp, DL_UNBIND_REQ, DL_BADPRIM, 0); 811 return; 812 } 813 814 if (ipnet->ipnet_dlstate != DL_IDLE) { 815 dlerrorack(q, mp, DL_UNBIND_REQ, DL_OUTSTATE, 0); 816 } else { 817 ipnet->ipnet_dlstate = DL_UNBOUND; 818 ipnet->ipnet_family = AF_UNSPEC; 819 dlokack(q, mp, DL_UNBIND_REQ); 820 } 821 } 822 823 static void 824 ipnet_dlpromisconreq(queue_t *q, mblk_t *mp) 825 { 826 ipnet_t *ipnet = q->q_ptr; 827 t_uscalar_t level; 828 int err; 829 830 if (MBLKL(mp) < DL_PROMISCON_REQ_SIZE) { 831 dlerrorack(q, mp, DL_PROMISCON_REQ, DL_BADPRIM, 0); 832 return; 833 } 834 835 if (ipnet->ipnet_flags & IPNET_LOMODE) { 836 dlokack(q, mp, DL_PROMISCON_REQ); 837 return; 838 } 839 840 level = ((dl_promiscon_req_t *)mp->b_rptr)->dl_level; 841 if (level == DL_PROMISC_PHYS || level == DL_PROMISC_MULTI) { 842 if ((err = ipnet_join_allmulti(ipnet->ipnet_if, 843 ipnet->ipnet_ns->netstack_ipnet)) != 0) { 844 dlerrorack(q, mp, DL_PROMISCON_REQ, DL_SYSERR, err); 845 return; 846 } 847 } 848 849 switch (level) { 850 case DL_PROMISC_PHYS: 851 ipnet->ipnet_flags |= IPNET_PROMISC_PHYS; 852 break; 853 case DL_PROMISC_SAP: 854 ipnet->ipnet_flags |= IPNET_PROMISC_SAP; 855 break; 856 case DL_PROMISC_MULTI: 857 ipnet->ipnet_flags |= IPNET_PROMISC_MULTI; 858 break; 859 default: 860 dlerrorack(q, mp, DL_PROMISCON_REQ, DL_BADPRIM, 0); 861 return; 862 } 863 864 dlokack(q, mp, DL_PROMISCON_REQ); 865 } 866 867 static void 868 ipnet_dlpromiscoffreq(queue_t *q, mblk_t *mp) 869 { 870 ipnet_t *ipnet = q->q_ptr; 871 t_uscalar_t level; 872 uint16_t orig_ipnet_flags = ipnet->ipnet_flags; 873 874 if (MBLKL(mp) < DL_PROMISCOFF_REQ_SIZE) { 875 dlerrorack(q, mp, DL_PROMISCOFF_REQ, DL_BADPRIM, 0); 876 return; 877 } 878 879 if (ipnet->ipnet_flags & IPNET_LOMODE) { 880 dlokack(q, mp, DL_PROMISCOFF_REQ); 881 return; 882 } 883 884 level = ((dl_promiscon_req_t *)mp->b_rptr)->dl_level; 885 switch (level) { 886 case DL_PROMISC_PHYS: 887 if (ipnet->ipnet_flags & IPNET_PROMISC_PHYS) 888 ipnet->ipnet_flags &= ~IPNET_PROMISC_PHYS; 889 break; 890 case DL_PROMISC_SAP: 891 if (ipnet->ipnet_flags & IPNET_PROMISC_SAP) 892 ipnet->ipnet_flags &= ~IPNET_PROMISC_SAP; 893 break; 894 case DL_PROMISC_MULTI: 895 if (ipnet->ipnet_flags & IPNET_PROMISC_MULTI) 896 ipnet->ipnet_flags &= ~IPNET_PROMISC_MULTI; 897 break; 898 default: 899 dlerrorack(q, mp, DL_PROMISCOFF_REQ, DL_BADPRIM, 0); 900 return; 901 } 902 903 if (orig_ipnet_flags == ipnet->ipnet_flags) { 904 dlerrorack(q, mp, DL_PROMISCOFF_REQ, DL_NOTENAB, 0); 905 return; 906 } 907 908 if (level == DL_PROMISC_PHYS || level == DL_PROMISC_MULTI) { 909 ipnet_leave_allmulti(ipnet->ipnet_if, 910 ipnet->ipnet_ns->netstack_ipnet); 911 } 912 913 dlokack(q, mp, DL_PROMISCOFF_REQ); 914 } 915 916 static int 917 ipnet_join_allmulti(ipnetif_t *ipnetif, ipnet_stack_t *ips) 918 { 919 int err = 0; 920 ip_stack_t *ipst = ips->ips_netstack->netstack_ip; 921 uint64_t index = ipnetif->if_index; 922 923 mutex_enter(&ips->ips_event_lock); 924 if (ipnetif->if_multicnt == 0) { 925 ASSERT((ipnetif->if_flags & 926 (IPNETIF_IPV4ALLMULTI | IPNETIF_IPV6ALLMULTI)) == 0); 927 if (ipnetif->if_flags & IPNETIF_IPV4PLUMBED) { 928 err = ip_join_allmulti(index, B_FALSE, ipst); 929 if (err != 0) 930 goto done; 931 ipnetif->if_flags |= IPNETIF_IPV4ALLMULTI; 932 } 933 if (ipnetif->if_flags & IPNETIF_IPV6PLUMBED) { 934 err = ip_join_allmulti(index, B_TRUE, ipst); 935 if (err != 0 && 936 (ipnetif->if_flags & IPNETIF_IPV4ALLMULTI)) { 937 (void) 
ip_leave_allmulti(index, B_FALSE, ipst); 938 ipnetif->if_flags &= ~IPNETIF_IPV4ALLMULTI; 939 goto done; 940 } 941 ipnetif->if_flags |= IPNETIF_IPV6ALLMULTI; 942 } 943 } 944 ipnetif->if_multicnt++; 945 946 done: 947 mutex_exit(&ips->ips_event_lock); 948 return (err); 949 } 950 951 static void 952 ipnet_leave_allmulti(ipnetif_t *ipnetif, ipnet_stack_t *ips) 953 { 954 int err; 955 ip_stack_t *ipst = ips->ips_netstack->netstack_ip; 956 uint64_t index = ipnetif->if_index; 957 958 mutex_enter(&ips->ips_event_lock); 959 ASSERT(ipnetif->if_multicnt != 0); 960 if (--ipnetif->if_multicnt == 0) { 961 if (ipnetif->if_flags & IPNETIF_IPV4ALLMULTI) { 962 err = ip_leave_allmulti(index, B_FALSE, ipst); 963 ASSERT(err == 0 || err == ENODEV); 964 ipnetif->if_flags &= ~IPNETIF_IPV4ALLMULTI; 965 } 966 if (ipnetif->if_flags & IPNETIF_IPV6ALLMULTI) { 967 err = ip_leave_allmulti(index, B_TRUE, ipst); 968 ASSERT(err == 0 || err == ENODEV); 969 ipnetif->if_flags &= ~IPNETIF_IPV6ALLMULTI; 970 } 971 } 972 mutex_exit(&ips->ips_event_lock); 973 } 974 975 /* 976 * Allocate a new mblk_t and put a dl_ipnetinfo_t in it. 977 * The structure it copies the header information from, 978 * hook_pkt_observe_t, is constructed using network byte 979 * order in ipobs_hook(), so there is no conversion here. 980 */ 981 static mblk_t * 982 ipnet_addheader(hook_pkt_observe_t *hdr, mblk_t *mp) 983 { 984 mblk_t *dlhdr; 985 dl_ipnetinfo_t *dl; 986 987 if ((dlhdr = allocb(sizeof (dl_ipnetinfo_t), BPRI_HI)) == NULL) { 988 freemsg(mp); 989 return (NULL); 990 } 991 dl = (dl_ipnetinfo_t *)dlhdr->b_rptr; 992 dl->dli_version = DL_IPNETINFO_VERSION; 993 dl->dli_family = hdr->hpo_family; 994 dl->dli_htype = hdr->hpo_htype; 995 dl->dli_pktlen = hdr->hpo_pktlen; 996 dl->dli_ifindex = hdr->hpo_ifindex; 997 dl->dli_grifindex = hdr->hpo_grifindex; 998 dl->dli_zsrc = hdr->hpo_zsrc; 999 dl->dli_zdst = hdr->hpo_zdst; 1000 dlhdr->b_wptr += sizeof (*dl); 1001 dlhdr->b_cont = mp; 1002 1003 return (dlhdr); 1004 } 1005 1006 static ipnet_addrtype_t 1007 ipnet_get_addrtype(ipnet_t *ipnet, ipnet_addrp_t *addr) 1008 { 1009 list_t *list; 1010 ipnetif_t *ipnetif = ipnet->ipnet_if; 1011 ipnetif_addr_t *ifaddr; 1012 ipnet_addrtype_t addrtype = IPNETADDR_UNKNOWN; 1013 1014 /* First check if the address is multicast or limited broadcast. */ 1015 switch (addr->iap_family) { 1016 case AF_INET: 1017 if (CLASSD(*(addr->iap_addr4)) || 1018 *(addr->iap_addr4) == INADDR_BROADCAST) 1019 return (IPNETADDR_MBCAST); 1020 break; 1021 case AF_INET6: 1022 if (IN6_IS_ADDR_MULTICAST(addr->iap_addr6)) 1023 return (IPNETADDR_MBCAST); 1024 break; 1025 } 1026 1027 /* 1028 * Walk the address list to see if the address belongs to our 1029 * interface or is one of our subnet broadcast addresses. 1030 */ 1031 mutex_enter(&ipnetif->if_addr_lock); 1032 list = (addr->iap_family == AF_INET) ? 1033 &ipnetif->if_ip4addr_list : &ipnetif->if_ip6addr_list; 1034 for (ifaddr = list_head(list); 1035 ifaddr != NULL && addrtype == IPNETADDR_UNKNOWN; 1036 ifaddr = list_next(list, ifaddr)) { 1037 /* 1038 * If we're not in the global zone, then only look at 1039 * addresses in our zone. 
1040 */ 1041 if (ipnet->ipnet_zoneid != GLOBAL_ZONEID && 1042 ipnet->ipnet_zoneid != ifaddr->ifa_zone) 1043 continue; 1044 switch (addr->iap_family) { 1045 case AF_INET: 1046 if (ifaddr->ifa_ip4addr != INADDR_ANY && 1047 *(addr->iap_addr4) == ifaddr->ifa_ip4addr) 1048 addrtype = IPNETADDR_MYADDR; 1049 else if (ifaddr->ifa_brdaddr != INADDR_ANY && 1050 *(addr->iap_addr4) == ifaddr->ifa_brdaddr) 1051 addrtype = IPNETADDR_MBCAST; 1052 break; 1053 case AF_INET6: 1054 if (IN6_ARE_ADDR_EQUAL(addr->iap_addr6, 1055 &ifaddr->ifa_ip6addr)) 1056 addrtype = IPNETADDR_MYADDR; 1057 break; 1058 } 1059 } 1060 mutex_exit(&ipnetif->if_addr_lock); 1061 1062 return (addrtype); 1063 } 1064 1065 /* 1066 * Verify if the packet contained in hdr should be passed up to the 1067 * ipnet client stream. 1068 */ 1069 static boolean_t 1070 ipnet_accept(ipnet_t *ipnet, hook_pkt_observe_t *hdr, ipnet_addrp_t *src, 1071 ipnet_addrp_t *dst) 1072 { 1073 boolean_t obsif; 1074 uint64_t ifindex = ipnet->ipnet_if->if_index; 1075 ipnet_addrtype_t srctype; 1076 ipnet_addrtype_t dsttype; 1077 1078 srctype = ipnet_get_addrtype(ipnet, src); 1079 dsttype = ipnet_get_addrtype(ipnet, dst); 1080 1081 /* 1082 * If the packet's ifindex matches ours, or the packet's group ifindex 1083 * matches ours, it's on the interface we're observing. (Thus, 1084 * observing on the group ifindex matches all ifindexes in the group.) 1085 */ 1086 obsif = (ntohl(hdr->hpo_ifindex) == ifindex || 1087 ntohl(hdr->hpo_grifindex) == ifindex); 1088 1089 DTRACE_PROBE5(ipnet_accept__addr, 1090 ipnet_addrtype_t, srctype, ipnet_addrp_t *, src, 1091 ipnet_addrtype_t, dsttype, ipnet_addrp_t *, dst, 1092 boolean_t, obsif); 1093 1094 /* 1095 * Do not allow an ipnet stream to see packets that are not from or to 1096 * its zone. The exception is when zones are using the shared stack 1097 * model. In this case, streams in the global zone have visibility 1098 * into other shared-stack zones, and broadcast and multicast traffic 1099 * is visible by all zones in the stack. 1100 */ 1101 if (ipnet->ipnet_zoneid != GLOBAL_ZONEID && 1102 dsttype != IPNETADDR_MBCAST) { 1103 if (ipnet->ipnet_zoneid != ntohl(hdr->hpo_zsrc) && 1104 ipnet->ipnet_zoneid != ntohl(hdr->hpo_zdst)) 1105 return (B_FALSE); 1106 } 1107 1108 /* 1109 * If DL_PROMISC_SAP isn't enabled, then the bound SAP must match the 1110 * packet's IP version. 1111 */ 1112 if (!(ipnet->ipnet_flags & IPNET_PROMISC_SAP) && 1113 ipnet->ipnet_family != hdr->hpo_family) 1114 return (B_FALSE); 1115 1116 /* If the destination address is ours, then accept the packet. */ 1117 if (dsttype == IPNETADDR_MYADDR) 1118 return (B_TRUE); 1119 1120 /* 1121 * If DL_PROMISC_PHYS is enabled, then we can see all packets that are 1122 * sent or received on the interface we're observing, or packets that 1123 * have our source address (this allows us to see packets we send). 1124 */ 1125 if (ipnet->ipnet_flags & IPNET_PROMISC_PHYS) { 1126 if (srctype == IPNETADDR_MYADDR || obsif) 1127 return (B_TRUE); 1128 } 1129 1130 /* 1131 * We accept multicast and broadcast packets transmitted or received 1132 * on the interface we're observing. 1133 */ 1134 if (dsttype == IPNETADDR_MBCAST && obsif) 1135 return (B_TRUE); 1136 1137 return (B_FALSE); 1138 } 1139 1140 /* 1141 * Verify if the packet contained in hdr should be passed up to the ipnet 1142 * client stream that's in IPNET_LOMODE. 
1143 */ 1144 /* ARGSUSED */ 1145 static boolean_t 1146 ipnet_loaccept(ipnet_t *ipnet, hook_pkt_observe_t *hdr, ipnet_addrp_t *src, 1147 ipnet_addrp_t *dst) 1148 { 1149 if (hdr->hpo_htype != IPOBS_HOOK_LOCAL) { 1150 /* 1151 * ipnet_if is only NULL for IPNET_MINOR_LO devices. 1152 */ 1153 if (ipnet->ipnet_if == NULL) 1154 return (B_FALSE); 1155 } 1156 1157 /* 1158 * An ipnet stream must not see packets that are not from/to its zone. 1159 */ 1160 if (ipnet->ipnet_zoneid != GLOBAL_ZONEID) { 1161 if (ipnet->ipnet_zoneid != ntohl(hdr->hpo_zsrc) && 1162 ipnet->ipnet_zoneid != ntohl(hdr->hpo_zdst)) 1163 return (B_FALSE); 1164 } 1165 1166 return (ipnet->ipnet_family == AF_UNSPEC || 1167 ipnet->ipnet_family == hdr->hpo_family); 1168 } 1169 1170 static void 1171 ipnet_dispatch(void *arg) 1172 { 1173 mblk_t *mp = arg; 1174 hook_pkt_observe_t *hdr = (hook_pkt_observe_t *)mp->b_rptr; 1175 ipnet_t *ipnet; 1176 mblk_t *netmp; 1177 list_t *list; 1178 ipnet_stack_t *ips; 1179 ipnet_addrp_t src; 1180 ipnet_addrp_t dst; 1181 1182 ips = ((netstack_t *)hdr->hpo_ctx)->netstack_ipnet; 1183 1184 netmp = hdr->hpo_pkt->b_cont; 1185 src.iap_family = hdr->hpo_family; 1186 dst.iap_family = hdr->hpo_family; 1187 1188 if (hdr->hpo_family == AF_INET) { 1189 src.iap_addr4 = &((ipha_t *)(netmp->b_rptr))->ipha_src; 1190 dst.iap_addr4 = &((ipha_t *)(netmp->b_rptr))->ipha_dst; 1191 } else { 1192 src.iap_addr6 = &((ip6_t *)(netmp->b_rptr))->ip6_src; 1193 dst.iap_addr6 = &((ip6_t *)(netmp->b_rptr))->ip6_dst; 1194 } 1195 1196 ipnet_walkers_inc(ips); 1197 1198 list = &ips->ips_str_list; 1199 for (ipnet = list_head(list); ipnet != NULL; 1200 ipnet = list_next(list, ipnet)) { 1201 if (!(*ipnet->ipnet_acceptfn)(ipnet, hdr, &src, &dst)) { 1202 IPSK_BUMP(ips, ik_acceptFail); 1203 continue; 1204 } 1205 IPSK_BUMP(ips, ik_acceptOk); 1206 1207 if (list_next(list, ipnet) == NULL) { 1208 netmp = hdr->hpo_pkt->b_cont; 1209 hdr->hpo_pkt->b_cont = NULL; 1210 } else { 1211 if ((netmp = dupmsg(hdr->hpo_pkt->b_cont)) == NULL && 1212 (netmp = copymsg(hdr->hpo_pkt->b_cont)) == NULL) { 1213 IPSK_BUMP(ips, ik_duplicationFail); 1214 continue; 1215 } 1216 } 1217 1218 if (ipnet->ipnet_flags & IPNET_INFO) { 1219 if ((netmp = ipnet_addheader(hdr, netmp)) == NULL) { 1220 IPSK_BUMP(ips, ik_dispatchHeaderDrop); 1221 continue; 1222 } 1223 } 1224 1225 if (ipnet->ipnet_rq->q_first == NULL && 1226 canputnext(ipnet->ipnet_rq)) { 1227 putnext(ipnet->ipnet_rq, netmp); 1228 IPSK_BUMP(ips, ik_dispatchDeliver); 1229 } else if (canput(ipnet->ipnet_rq)) { 1230 (void) putq(ipnet->ipnet_rq, netmp); 1231 IPSK_BUMP(ips, ik_dispatchDeliver); 1232 } else { 1233 freemsg(netmp); 1234 IPSK_BUMP(ips, ik_dispatchPutDrop); 1235 } 1236 } 1237 1238 ipnet_walkers_dec(ips); 1239 1240 freemsg(mp); 1241 } 1242 1243 static void 1244 ipnet_input(mblk_t *mp) 1245 { 1246 hook_pkt_observe_t *hdr = (hook_pkt_observe_t *)mp->b_rptr; 1247 ipnet_stack_t *ips; 1248 1249 ips = ((netstack_t *)hdr->hpo_ctx)->netstack_ipnet; 1250 1251 if (ddi_taskq_dispatch(ipnet_taskq, ipnet_dispatch, mp, DDI_NOSLEEP) != 1252 DDI_SUCCESS) { 1253 IPSK_BUMP(ips, ik_dispatchFail); 1254 freemsg(mp); 1255 } else { 1256 IPSK_BUMP(ips, ik_dispatchOk); 1257 } 1258 } 1259 1260 static ipnetif_t * 1261 ipnet_alloc_if(ipnet_stack_t *ips) 1262 { 1263 ipnetif_t *ipnetif; 1264 1265 if ((ipnetif = kmem_zalloc(sizeof (*ipnetif), KM_NOSLEEP)) == NULL) 1266 return (NULL); 1267 1268 mutex_init(&ipnetif->if_addr_lock, NULL, MUTEX_DEFAULT, 0); 1269 list_create(&ipnetif->if_ip4addr_list, sizeof (ipnetif_addr_t), 1270 
offsetof(ipnetif_addr_t, ifa_link)); 1271 list_create(&ipnetif->if_ip6addr_list, sizeof (ipnetif_addr_t), 1272 offsetof(ipnetif_addr_t, ifa_link)); 1273 mutex_init(&ipnetif->if_reflock, NULL, MUTEX_DEFAULT, 0); 1274 1275 ipnetif->if_stackp = ips; 1276 1277 return (ipnetif); 1278 } 1279 1280 /* 1281 * Create a new ipnetif_t and new minor node for it. If creation is 1282 * successful the new ipnetif_t is inserted into an avl_tree 1283 * containing ipnetif's for this stack instance. 1284 */ 1285 static ipnetif_t * 1286 ipnetif_create(const char *name, uint64_t index, ipnet_stack_t *ips, 1287 uint64_t ifflags) 1288 { 1289 ipnetif_t *ipnetif; 1290 avl_index_t where = 0; 1291 minor_t ifminor; 1292 1293 /* 1294 * Because ipnetif_create() can be called from a NIC event 1295 * callback, it should not block. 1296 */ 1297 ifminor = (minor_t)id_alloc_nosleep(ipnet_minor_space); 1298 if (ifminor == (minor_t)-1) 1299 return (NULL); 1300 if ((ipnetif = ipnet_alloc_if(ips)) == NULL) { 1301 id_free(ipnet_minor_space, ifminor); 1302 return (NULL); 1303 } 1304 1305 (void) strlcpy(ipnetif->if_name, name, LIFNAMSIZ); 1306 ipnetif->if_index = (uint_t)index; 1307 ipnetif->if_zoneid = netstack_get_zoneid(ips->ips_netstack); 1308 ipnetif->if_dev = makedevice(ipnet_major, ifminor); 1309 1310 ipnetif->if_refcnt = 1; 1311 if ((ifflags & IFF_LOOPBACK) != 0) 1312 ipnetif->if_flags = IPNETIF_LOOPBACK; 1313 1314 mutex_enter(&ips->ips_avl_lock); 1315 VERIFY(avl_find(&ips->ips_avl_by_index, &index, &where) == NULL); 1316 avl_insert(&ips->ips_avl_by_index, ipnetif, where); 1317 VERIFY(avl_find(&ips->ips_avl_by_name, (void *)name, &where) == NULL); 1318 avl_insert(&ips->ips_avl_by_name, ipnetif, where); 1319 mutex_exit(&ips->ips_avl_lock); 1320 /* 1321 * Now that the interface can be found by lookups back into ipnet, 1322 * allowing for sanity checking, call the BPF attach. 1323 */ 1324 ipnet_bpfattach(ipnetif); 1325 1326 return (ipnetif); 1327 } 1328 1329 static void 1330 ipnetif_remove(ipnetif_t *ipnetif, ipnet_stack_t *ips) 1331 { 1332 ipnet_t *ipnet; 1333 1334 ipnet_walkers_inc(ips); 1335 /* Send a SIGHUP to all open streams associated with this ipnetif. */ 1336 for (ipnet = list_head(&ips->ips_str_list); ipnet != NULL; 1337 ipnet = list_next(&ips->ips_str_list, ipnet)) { 1338 if (ipnet->ipnet_if == ipnetif) 1339 (void) putnextctl(ipnet->ipnet_rq, M_HANGUP); 1340 } 1341 ipnet_walkers_dec(ips); 1342 mutex_enter(&ips->ips_avl_lock); 1343 avl_remove(&ips->ips_avl_by_index, ipnetif); 1344 avl_remove(&ips->ips_avl_by_name, ipnetif); 1345 mutex_exit(&ips->ips_avl_lock); 1346 /* 1347 * Now that the interface can't be found, do a BPF detach 1348 */ 1349 ipnet_bpfdetach(ipnetif); 1350 /* 1351 * Release the reference we implicitly held in ipnetif_create(). 
1352 */ 1353 ipnetif_refrele(ipnetif); 1354 } 1355 1356 static void 1357 ipnet_purge_addrlist(list_t *addrlist) 1358 { 1359 ipnetif_addr_t *ifa; 1360 1361 while ((ifa = list_head(addrlist)) != NULL) { 1362 list_remove(addrlist, ifa); 1363 if (ifa->ifa_shared != NULL) 1364 ipnetif_clone_release(ifa->ifa_shared); 1365 kmem_free(ifa, sizeof (*ifa)); 1366 } 1367 } 1368 1369 static void 1370 ipnetif_free(ipnetif_t *ipnetif) 1371 { 1372 ASSERT(ipnetif->if_refcnt == 0); 1373 ASSERT(ipnetif->if_sharecnt == 0); 1374 1375 /* Remove IPv4/v6 address lists from the ipnetif */ 1376 ipnet_purge_addrlist(&ipnetif->if_ip4addr_list); 1377 list_destroy(&ipnetif->if_ip4addr_list); 1378 ipnet_purge_addrlist(&ipnetif->if_ip6addr_list); 1379 list_destroy(&ipnetif->if_ip6addr_list); 1380 mutex_destroy(&ipnetif->if_addr_lock); 1381 mutex_destroy(&ipnetif->if_reflock); 1382 if (ipnetif->if_dev != 0) 1383 id_free(ipnet_minor_space, getminor(ipnetif->if_dev)); 1384 kmem_free(ipnetif, sizeof (*ipnetif)); 1385 } 1386 1387 /* 1388 * Create an ipnetif_addr_t with the given logical interface id (lif) 1389 * and add it to the supplied ipnetif. The lif is the netinfo 1390 * representation of logical interface id, and we use this id to match 1391 * incoming netinfo events against our lists of addresses. 1392 */ 1393 static void 1394 ipnet_add_ifaddr(uint64_t lif, ipnetif_t *ipnetif, net_handle_t nd) 1395 { 1396 ipnetif_addr_t *ifaddr; 1397 zoneid_t zoneid; 1398 struct sockaddr_in bcast; 1399 struct sockaddr_storage addr; 1400 net_ifaddr_t type = NA_ADDRESS; 1401 uint64_t phyif = ipnetif->if_index; 1402 1403 if (net_getlifaddr(nd, phyif, lif, 1, &type, &addr) != 0 || 1404 net_getlifzone(nd, phyif, lif, &zoneid) != 0) 1405 return; 1406 1407 if ((ifaddr = kmem_alloc(sizeof (*ifaddr), KM_NOSLEEP)) == NULL) 1408 return; 1409 ifaddr->ifa_zone = zoneid; 1410 ifaddr->ifa_id = lif; 1411 ifaddr->ifa_shared = NULL; 1412 1413 switch (addr.ss_family) { 1414 case AF_INET: 1415 ifaddr->ifa_ip4addr = 1416 ((struct sockaddr_in *)&addr)->sin_addr.s_addr; 1417 /* 1418 * Try and get the broadcast address. Note that it's okay for 1419 * an interface to not have a broadcast address, so we don't 1420 * fail the entire operation if net_getlifaddr() fails here. 1421 */ 1422 type = NA_BROADCAST; 1423 if (net_getlifaddr(nd, phyif, lif, 1, &type, &bcast) == 0) 1424 ifaddr->ifa_brdaddr = bcast.sin_addr.s_addr; 1425 break; 1426 case AF_INET6: 1427 ifaddr->ifa_ip6addr = ((struct sockaddr_in6 *)&addr)->sin6_addr; 1428 break; 1429 } 1430 1431 /* 1432 * The zoneid stored in ipnetif_t needs to correspond to the actual 1433 * zone the address is being used in. This facilitates finding the 1434 * correct netstack_t pointer, amongst other things, later. 1435 */ 1436 if (zoneid == ALL_ZONES) 1437 zoneid = GLOBAL_ZONEID; 1438 1439 mutex_enter(&ipnetif->if_addr_lock); 1440 if (zoneid != ipnetif->if_zoneid) { 1441 ipnetif_t *ifp2; 1442 1443 ifp2 = ipnetif_clone_create(ipnetif, zoneid); 1444 ifaddr->ifa_shared = ifp2; 1445 } 1446 list_insert_tail(addr.ss_family == AF_INET ? 1447 &ipnetif->if_ip4addr_list : &ipnetif->if_ip6addr_list, ifaddr); 1448 mutex_exit(&ipnetif->if_addr_lock); 1449 } 1450 1451 static void 1452 ipnet_delete_ifaddr(ipnetif_addr_t *ifaddr, ipnetif_t *ipnetif, boolean_t isv6) 1453 { 1454 mutex_enter(&ipnetif->if_addr_lock); 1455 if (ifaddr->ifa_shared != NULL) 1456 ipnetif_clone_release(ifaddr->ifa_shared); 1457 1458 list_remove(isv6 ? 
1459 &ipnetif->if_ip6addr_list : &ipnetif->if_ip4addr_list, ifaddr); 1460 mutex_exit(&ipnetif->if_addr_lock); 1461 kmem_free(ifaddr, sizeof (*ifaddr)); 1462 } 1463 1464 static void 1465 ipnet_plumb_ev(ipnet_nicevent_t *ipne, ipnet_stack_t *ips, boolean_t isv6) 1466 { 1467 ipnetif_t *ipnetif; 1468 boolean_t refrele_needed = B_TRUE; 1469 uint64_t ifflags; 1470 uint64_t ifindex; 1471 char *ifname; 1472 1473 ifflags = 0; 1474 ifname = ipne->ipne_ifname; 1475 ifindex = ipne->ipne_ifindex; 1476 1477 (void) net_getlifflags(ipne->ipne_protocol, ifindex, 0, &ifflags); 1478 1479 if ((ipnetif = ipnetif_getby_index(ifindex, ips)) == NULL) { 1480 ipnetif = ipnetif_create(ifname, ifindex, ips, ifflags); 1481 refrele_needed = B_FALSE; 1482 } 1483 if (ipnetif != NULL) { 1484 ipnetif->if_flags |= 1485 isv6 ? IPNETIF_IPV6PLUMBED : IPNETIF_IPV4PLUMBED; 1486 } 1487 1488 if (ipnetif->if_multicnt != 0) { 1489 if (ip_join_allmulti(ifindex, isv6, 1490 ips->ips_netstack->netstack_ip) == 0) { 1491 ipnetif->if_flags |= 1492 isv6 ? IPNETIF_IPV6ALLMULTI : IPNETIF_IPV4ALLMULTI; 1493 } 1494 } 1495 1496 if (refrele_needed) 1497 ipnetif_refrele(ipnetif); 1498 } 1499 1500 static void 1501 ipnet_unplumb_ev(uint64_t ifindex, ipnet_stack_t *ips, boolean_t isv6) 1502 { 1503 ipnetif_t *ipnetif; 1504 1505 if ((ipnetif = ipnetif_getby_index(ifindex, ips)) == NULL) 1506 return; 1507 1508 mutex_enter(&ipnetif->if_addr_lock); 1509 ipnet_purge_addrlist(isv6 ? 1510 &ipnetif->if_ip6addr_list : &ipnetif->if_ip4addr_list); 1511 mutex_exit(&ipnetif->if_addr_lock); 1512 1513 /* 1514 * Note that we have one ipnetif for both IPv4 and IPv6, but we receive 1515 * separate NE_UNPLUMB events for IPv4 and IPv6. We remove the ipnetif 1516 * if both IPv4 and IPv6 interfaces have been unplumbed. 1517 */ 1518 ipnetif->if_flags &= isv6 ? ~IPNETIF_IPV6PLUMBED : ~IPNETIF_IPV4PLUMBED; 1519 if (!(ipnetif->if_flags & (IPNETIF_IPV4PLUMBED | IPNETIF_IPV6PLUMBED))) 1520 ipnetif_remove(ipnetif, ips); 1521 ipnetif_refrele(ipnetif); 1522 } 1523 1524 static void 1525 ipnet_lifup_ev(uint64_t ifindex, uint64_t lifindex, net_handle_t nd, 1526 ipnet_stack_t *ips, boolean_t isv6) 1527 { 1528 ipnetif_t *ipnetif; 1529 ipnetif_addr_t *ifaddr; 1530 1531 if ((ipnetif = ipnetif_getby_index(ifindex, ips)) == NULL) 1532 return; 1533 if ((ifaddr = ipnet_match_lif(ipnetif, lifindex, isv6)) != NULL) { 1534 /* 1535 * We must have missed a NE_LIF_DOWN event. Delete this 1536 * ifaddr and re-create it. 1537 */ 1538 ipnet_delete_ifaddr(ifaddr, ipnetif, isv6); 1539 } 1540 1541 ipnet_add_ifaddr(lifindex, ipnetif, nd); 1542 ipnetif_refrele(ipnetif); 1543 } 1544 1545 static void 1546 ipnet_lifdown_ev(uint64_t ifindex, uint64_t lifindex, ipnet_stack_t *ips, 1547 boolean_t isv6) 1548 { 1549 ipnetif_t *ipnetif; 1550 ipnetif_addr_t *ifaddr; 1551 1552 if ((ipnetif = ipnetif_getby_index(ifindex, ips)) == NULL) 1553 return; 1554 if ((ifaddr = ipnet_match_lif(ipnetif, lifindex, isv6)) != NULL) 1555 ipnet_delete_ifaddr(ifaddr, ipnetif, isv6); 1556 ipnetif_refrele(ipnetif); 1557 /* 1558 * Make sure that open streams on this ipnetif are still allowed to 1559 * have it open. 1560 */ 1561 ipnetif_zonecheck(ipnetif, ips); 1562 } 1563 1564 /* 1565 * This callback from the NIC event framework dispatches a taskq as the event 1566 * handlers may block. 
1567 */ 1568 /* ARGSUSED */ 1569 static int 1570 ipnet_nicevent_cb(hook_event_token_t token, hook_data_t info, void *arg) 1571 { 1572 ipnet_stack_t *ips = arg; 1573 hook_nic_event_t *hn = (hook_nic_event_t *)info; 1574 ipnet_nicevent_t *ipne; 1575 1576 if ((ipne = kmem_alloc(sizeof (ipnet_nicevent_t), KM_NOSLEEP)) == NULL) 1577 return (0); 1578 ipne->ipne_event = hn->hne_event; 1579 ipne->ipne_protocol = hn->hne_protocol; 1580 ipne->ipne_stackid = ips->ips_netstack->netstack_stackid; 1581 ipne->ipne_ifindex = hn->hne_nic; 1582 ipne->ipne_lifindex = hn->hne_lif; 1583 if (hn->hne_datalen != 0) { 1584 (void) strlcpy(ipne->ipne_ifname, hn->hne_data, 1585 sizeof (ipne->ipne_ifname)); 1586 } 1587 (void) ddi_taskq_dispatch(ipnet_nicevent_taskq, ipnet_nicevent_task, 1588 ipne, DDI_NOSLEEP); 1589 return (0); 1590 } 1591 1592 static void 1593 ipnet_nicevent_task(void *arg) 1594 { 1595 ipnet_nicevent_t *ipne = arg; 1596 netstack_t *ns; 1597 ipnet_stack_t *ips; 1598 boolean_t isv6; 1599 1600 if ((ns = netstack_find_by_stackid(ipne->ipne_stackid)) == NULL) 1601 goto done; 1602 ips = ns->netstack_ipnet; 1603 isv6 = (ipne->ipne_protocol == ips->ips_ndv6); 1604 1605 mutex_enter(&ips->ips_event_lock); 1606 switch (ipne->ipne_event) { 1607 case NE_PLUMB: 1608 ipnet_plumb_ev(ipne, ips, isv6); 1609 break; 1610 case NE_UNPLUMB: 1611 ipnet_unplumb_ev(ipne->ipne_ifindex, ips, isv6); 1612 break; 1613 case NE_LIF_UP: 1614 ipnet_lifup_ev(ipne->ipne_ifindex, ipne->ipne_lifindex, 1615 ipne->ipne_protocol, ips, isv6); 1616 break; 1617 case NE_LIF_DOWN: 1618 ipnet_lifdown_ev(ipne->ipne_ifindex, ipne->ipne_lifindex, ips, 1619 isv6); 1620 break; 1621 default: 1622 break; 1623 } 1624 mutex_exit(&ips->ips_event_lock); 1625 done: 1626 if (ns != NULL) 1627 netstack_rele(ns); 1628 kmem_free(ipne, sizeof (ipnet_nicevent_t)); 1629 } 1630 1631 dev_t 1632 ipnet_if_getdev(char *name, zoneid_t zoneid) 1633 { 1634 netstack_t *ns; 1635 ipnet_stack_t *ips; 1636 ipnetif_t *ipnetif; 1637 dev_t dev = (dev_t)-1; 1638 1639 if (is_system_labeled() && zoneid != GLOBAL_ZONEID) 1640 return (dev); 1641 if ((ns = netstack_find_by_zoneid(zoneid)) == NULL) 1642 return (dev); 1643 1644 ips = ns->netstack_ipnet; 1645 mutex_enter(&ips->ips_avl_lock); 1646 if ((ipnetif = avl_find(&ips->ips_avl_by_name, name, NULL)) != NULL) { 1647 if (ipnetif_in_zone(ipnetif, zoneid, ips)) 1648 dev = ipnetif->if_dev; 1649 } 1650 mutex_exit(&ips->ips_avl_lock); 1651 netstack_rele(ns); 1652 1653 return (dev); 1654 } 1655 1656 static ipnetif_t * 1657 ipnetif_getby_index(uint64_t id, ipnet_stack_t *ips) 1658 { 1659 ipnetif_t *ipnetif; 1660 1661 mutex_enter(&ips->ips_avl_lock); 1662 if ((ipnetif = avl_find(&ips->ips_avl_by_index, &id, NULL)) != NULL) 1663 ipnetif_refhold(ipnetif); 1664 mutex_exit(&ips->ips_avl_lock); 1665 return (ipnetif); 1666 } 1667 1668 static ipnetif_t * 1669 ipnetif_getby_dev(dev_t dev, ipnet_stack_t *ips) 1670 { 1671 ipnetif_t *ipnetif; 1672 avl_tree_t *tree; 1673 1674 mutex_enter(&ips->ips_avl_lock); 1675 tree = &ips->ips_avl_by_index; 1676 for (ipnetif = avl_first(tree); ipnetif != NULL; 1677 ipnetif = avl_walk(tree, ipnetif, AVL_AFTER)) { 1678 if (ipnetif->if_dev == dev) { 1679 ipnetif_refhold(ipnetif); 1680 break; 1681 } 1682 } 1683 mutex_exit(&ips->ips_avl_lock); 1684 return (ipnetif); 1685 } 1686 1687 static ipnetif_addr_t * 1688 ipnet_match_lif(ipnetif_t *ipnetif, lif_if_t lid, boolean_t isv6) 1689 { 1690 ipnetif_addr_t *ifaddr; 1691 list_t *list; 1692 1693 mutex_enter(&ipnetif->if_addr_lock); 1694 list = isv6 ? 
&ipnetif->if_ip6addr_list : &ipnetif->if_ip4addr_list; 1695 for (ifaddr = list_head(list); ifaddr != NULL; 1696 ifaddr = list_next(list, ifaddr)) { 1697 if (lid == ifaddr->ifa_id) 1698 break; 1699 } 1700 mutex_exit(&ipnetif->if_addr_lock); 1701 return (ifaddr); 1702 } 1703 1704 /* ARGSUSED */ 1705 static void * 1706 ipnet_stack_init(netstackid_t stackid, netstack_t *ns) 1707 { 1708 ipnet_stack_t *ips; 1709 1710 ips = kmem_zalloc(sizeof (*ips), KM_SLEEP); 1711 ips->ips_netstack = ns; 1712 mutex_init(&ips->ips_avl_lock, NULL, MUTEX_DEFAULT, 0); 1713 avl_create(&ips->ips_avl_by_index, ipnetif_compare_index, 1714 sizeof (ipnetif_t), offsetof(ipnetif_t, if_avl_by_index)); 1715 avl_create(&ips->ips_avl_by_name, ipnetif_compare_name, 1716 sizeof (ipnetif_t), offsetof(ipnetif_t, if_avl_by_name)); 1717 avl_create(&ips->ips_avl_by_shared, ipnetif_compare_name_zone, 1718 sizeof (ipnetif_t), offsetof(ipnetif_t, if_avl_by_shared)); 1719 mutex_init(&ips->ips_walkers_lock, NULL, MUTEX_DEFAULT, NULL); 1720 cv_init(&ips->ips_walkers_cv, NULL, CV_DRIVER, NULL); 1721 list_create(&ips->ips_str_list, sizeof (ipnet_t), 1722 offsetof(ipnet_t, ipnet_next)); 1723 ipnet_register_netihook(ips); 1724 return (ips); 1725 } 1726 1727 /* ARGSUSED */ 1728 static void 1729 ipnet_stack_fini(netstackid_t stackid, void *arg) 1730 { 1731 ipnet_stack_t *ips = arg; 1732 ipnetif_t *ipnetif, *nipnetif; 1733 1734 if (ips->ips_kstatp != NULL) { 1735 zoneid_t zoneid; 1736 1737 zoneid = netstackid_to_zoneid(stackid); 1738 net_kstat_delete(net_zoneidtonetid(zoneid), ips->ips_kstatp); 1739 } 1740 if (ips->ips_ndv4 != NULL) { 1741 VERIFY(net_hook_unregister(ips->ips_ndv4, NH_NIC_EVENTS, 1742 ips->ips_nicevents) == 0); 1743 VERIFY(net_protocol_release(ips->ips_ndv4) == 0); 1744 } 1745 if (ips->ips_ndv6 != NULL) { 1746 VERIFY(net_hook_unregister(ips->ips_ndv6, NH_NIC_EVENTS, 1747 ips->ips_nicevents) == 0); 1748 VERIFY(net_protocol_release(ips->ips_ndv6) == 0); 1749 } 1750 hook_free(ips->ips_nicevents); 1751 1752 for (ipnetif = avl_first(&ips->ips_avl_by_index); ipnetif != NULL; 1753 ipnetif = nipnetif) { 1754 nipnetif = AVL_NEXT(&ips->ips_avl_by_index, ipnetif); 1755 ipnetif_remove(ipnetif, ips); 1756 } 1757 avl_destroy(&ips->ips_avl_by_shared); 1758 avl_destroy(&ips->ips_avl_by_index); 1759 avl_destroy(&ips->ips_avl_by_name); 1760 mutex_destroy(&ips->ips_avl_lock); 1761 mutex_destroy(&ips->ips_walkers_lock); 1762 cv_destroy(&ips->ips_walkers_cv); 1763 list_destroy(&ips->ips_str_list); 1764 kmem_free(ips, sizeof (*ips)); 1765 } 1766 1767 /* Do any of the addresses in addrlist belong the supplied zoneid? */ 1768 static boolean_t 1769 ipnet_addrs_in_zone(list_t *addrlist, zoneid_t zoneid) 1770 { 1771 ipnetif_addr_t *ifa; 1772 1773 for (ifa = list_head(addrlist); ifa != NULL; 1774 ifa = list_next(addrlist, ifa)) { 1775 if (ifa->ifa_zone == zoneid) 1776 return (B_TRUE); 1777 } 1778 return (B_FALSE); 1779 } 1780 1781 /* Should the supplied ipnetif be visible from the supplied zoneid? */ 1782 static boolean_t 1783 ipnetif_in_zone(ipnetif_t *ipnetif, zoneid_t zoneid, ipnet_stack_t *ips) 1784 { 1785 int ret; 1786 1787 /* 1788 * The global zone has visibility into all interfaces in the global 1789 * stack, and exclusive stack zones have visibility into all 1790 * interfaces in their stack. 1791 */ 1792 if (zoneid == GLOBAL_ZONEID || 1793 ips->ips_netstack->netstack_stackid != GLOBAL_NETSTACKID) 1794 return (B_TRUE); 1795 1796 /* 1797 * Shared-stack zones only have visibility for interfaces that have 1798 * addresses in their zone. 
1799 */ 1800 mutex_enter(&ipnetif->if_addr_lock); 1801 ret = ipnet_addrs_in_zone(&ipnetif->if_ip4addr_list, zoneid) || 1802 ipnet_addrs_in_zone(&ipnetif->if_ip6addr_list, zoneid); 1803 mutex_exit(&ipnetif->if_addr_lock); 1804 return (ret); 1805 } 1806 1807 /* 1808 * Verify that any ipnet_t that has a reference to the supplied ipnetif should 1809 * still be allowed to have it open. A given ipnet_t may no longer be allowed 1810 * to have an ipnetif open if there are no longer any addresses that belong to 1811 * the ipnetif in the ipnet_t's non-global shared-stack zoneid. If that's the 1812 * case, send the ipnet_t an M_HANGUP. 1813 */ 1814 static void 1815 ipnetif_zonecheck(ipnetif_t *ipnetif, ipnet_stack_t *ips) 1816 { 1817 list_t *strlist = &ips->ips_str_list; 1818 ipnet_t *ipnet; 1819 1820 ipnet_walkers_inc(ips); 1821 for (ipnet = list_head(strlist); ipnet != NULL; 1822 ipnet = list_next(strlist, ipnet)) { 1823 if (ipnet->ipnet_if != ipnetif) 1824 continue; 1825 if (!ipnetif_in_zone(ipnetif, ipnet->ipnet_zoneid, ips)) 1826 (void) putnextctl(ipnet->ipnet_rq, M_HANGUP); 1827 } 1828 ipnet_walkers_dec(ips); 1829 } 1830 1831 void 1832 ipnet_walk_if(ipnet_walkfunc_t *cb, void *arg, zoneid_t zoneid) 1833 { 1834 ipnetif_t *ipnetif; 1835 list_t cbdata; 1836 ipnetif_cbdata_t *cbnode; 1837 netstack_t *ns; 1838 ipnet_stack_t *ips; 1839 1840 /* 1841 * On labeled systems, non-global zones shouldn't see anything 1842 * in /dev/ipnet. 1843 */ 1844 if (is_system_labeled() && zoneid != GLOBAL_ZONEID) 1845 return; 1846 1847 if ((ns = netstack_find_by_zoneid(zoneid)) == NULL) 1848 return; 1849 1850 ips = ns->netstack_ipnet; 1851 list_create(&cbdata, sizeof (ipnetif_cbdata_t), 1852 offsetof(ipnetif_cbdata_t, ic_next)); 1853 1854 mutex_enter(&ips->ips_avl_lock); 1855 for (ipnetif = avl_first(&ips->ips_avl_by_index); ipnetif != NULL; 1856 ipnetif = avl_walk(&ips->ips_avl_by_index, ipnetif, AVL_AFTER)) { 1857 if (!ipnetif_in_zone(ipnetif, zoneid, ips)) 1858 continue; 1859 cbnode = kmem_zalloc(sizeof (ipnetif_cbdata_t), KM_SLEEP); 1860 (void) strlcpy(cbnode->ic_ifname, ipnetif->if_name, LIFNAMSIZ); 1861 cbnode->ic_dev = ipnetif->if_dev; 1862 list_insert_head(&cbdata, cbnode); 1863 } 1864 mutex_exit(&ips->ips_avl_lock); 1865 1866 while ((cbnode = list_head(&cbdata)) != NULL) { 1867 cb(cbnode->ic_ifname, arg, cbnode->ic_dev); 1868 list_remove(&cbdata, cbnode); 1869 kmem_free(cbnode, sizeof (ipnetif_cbdata_t)); 1870 } 1871 list_destroy(&cbdata); 1872 netstack_rele(ns); 1873 } 1874 1875 static int 1876 ipnetif_compare_index(const void *index_ptr, const void *ipnetifp) 1877 { 1878 int64_t index1 = *((int64_t *)index_ptr); 1879 int64_t index2 = (int64_t)((ipnetif_t *)ipnetifp)->if_index; 1880 1881 return (SIGNOF(index2 - index1)); 1882 } 1883 1884 static int 1885 ipnetif_compare_name(const void *name_ptr, const void *ipnetifp) 1886 { 1887 int res; 1888 1889 res = strcmp(((ipnetif_t *)ipnetifp)->if_name, name_ptr); 1890 return (SIGNOF(res)); 1891 } 1892 1893 static int 1894 ipnetif_compare_name_zone(const void *key_ptr, const void *ipnetifp) 1895 { 1896 const uintptr_t *ptr = key_ptr; 1897 const ipnetif_t *ifp; 1898 int res; 1899 1900 ifp = ipnetifp; 1901 res = ifp->if_zoneid - ptr[0]; 1902 if (res != 0) 1903 return (SIGNOF(res)); 1904 res = strcmp(ifp->if_name, (char *)ptr[1]); 1905 return (SIGNOF(res)); 1906 } 1907 1908 static void 1909 ipnetif_refhold(ipnetif_t *ipnetif) 1910 { 1911 mutex_enter(&ipnetif->if_reflock); 1912 ipnetif->if_refcnt++; 1913 mutex_exit(&ipnetif->if_reflock); 1914 } 1915 1916 static void 1917 
static int
ipnetif_compare_index(const void *index_ptr, const void *ipnetifp)
{
	int64_t index1 = *((int64_t *)index_ptr);
	int64_t index2 = (int64_t)((ipnetif_t *)ipnetifp)->if_index;

	return (SIGNOF(index2 - index1));
}

static int
ipnetif_compare_name(const void *name_ptr, const void *ipnetifp)
{
	int res;

	res = strcmp(((ipnetif_t *)ipnetifp)->if_name, name_ptr);
	return (SIGNOF(res));
}

static int
ipnetif_compare_name_zone(const void *key_ptr, const void *ipnetifp)
{
	const uintptr_t *ptr = key_ptr;
	const ipnetif_t *ifp;
	int res;

	ifp = ipnetifp;
	res = ifp->if_zoneid - ptr[0];
	if (res != 0)
		return (SIGNOF(res));
	res = strcmp(ifp->if_name, (char *)ptr[1]);
	return (SIGNOF(res));
}

static void
ipnetif_refhold(ipnetif_t *ipnetif)
{
	mutex_enter(&ipnetif->if_reflock);
	ipnetif->if_refcnt++;
	mutex_exit(&ipnetif->if_reflock);
}

static void
ipnetif_refrele(ipnetif_t *ipnetif)
{
	mutex_enter(&ipnetif->if_reflock);
	ASSERT(ipnetif->if_refcnt > 0);
	if (--ipnetif->if_refcnt == 0)
		ipnetif_free(ipnetif);
	else
		mutex_exit(&ipnetif->if_reflock);
}

static void
ipnet_walkers_inc(ipnet_stack_t *ips)
{
	mutex_enter(&ips->ips_walkers_lock);
	ips->ips_walkers_cnt++;
	mutex_exit(&ips->ips_walkers_lock);
}

static void
ipnet_walkers_dec(ipnet_stack_t *ips)
{
	mutex_enter(&ips->ips_walkers_lock);
	ASSERT(ips->ips_walkers_cnt != 0);
	if (--ips->ips_walkers_cnt == 0)
		cv_broadcast(&ips->ips_walkers_cv);
	mutex_exit(&ips->ips_walkers_lock);
}

/*ARGSUSED*/
static int
ipobs_bounce_func(hook_event_token_t token, hook_data_t info, void *arg)
{
	hook_pkt_observe_t *hdr;
	pfv_t func = (pfv_t)arg;
	mblk_t *mp;

	hdr = (hook_pkt_observe_t *)info;
	mp = dupmsg(hdr->hpo_pkt);
	if (mp == NULL) {
		mp = copymsg(hdr->hpo_pkt);
		if (mp == NULL) {
			netstack_t *ns = hdr->hpo_ctx;
			ipnet_stack_t *ips = ns->netstack_ipnet;

			IPSK_BUMP(ips, ik_dispatchDupDrop);
			return (0);
		}
	}

	hdr = (hook_pkt_observe_t *)mp->b_rptr;
	hdr->hpo_pkt = mp;

	func(mp);

	return (0);
}

hook_t *
ipobs_register_hook(netstack_t *ns, pfv_t func)
{
	ip_stack_t *ipst = ns->netstack_ip;
	char name[32];
	hook_t *hook;

	HOOK_INIT(hook, ipobs_bounce_func, "", (void *)func);
	VERIFY(hook != NULL);

	/*
	 * To register multiple hooks with the same callback function,
	 * a unique name is needed.
	 */
	(void) snprintf(name, sizeof (name), "ipobserve_%p", hook);
	hook->h_name = strdup(name);

	(void) net_hook_register(ipst->ips_ip4_observe_pr, NH_OBSERVE, hook);
	(void) net_hook_register(ipst->ips_ip6_observe_pr, NH_OBSERVE, hook);

	return (hook);
}

void
ipobs_unregister_hook(netstack_t *ns, hook_t *hook)
{
	ip_stack_t *ipst = ns->netstack_ip;

	(void) net_hook_unregister(ipst->ips_ip4_observe_pr, NH_OBSERVE, hook);

	(void) net_hook_unregister(ipst->ips_ip6_observe_pr, NH_OBSERVE, hook);

	strfree(hook->h_name);

	hook_free(hook);
}

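/*
 * Illustrative sketch (hypothetical consumer, not part of the driver):
 * ipobs_register_hook() bounces every observed IPv4/IPv6 packet to the
 * supplied function as a private mblk_t copy whose b_rptr points at the
 * hook_pkt_observe_t header.  The consumer owns, and must free, that copy,
 * and must later tear the hook down with ipobs_unregister_hook().
 */
static void
example_observer(mblk_t *mp)
{
	hook_pkt_observe_t *hdr = (hook_pkt_observe_t *)mp->b_rptr;

	/* ... inspect hdr (hpo_family, hpo_htype, ...) and mp->b_cont ... */
	freemsg(mp);
}

static hook_t *
example_start_observing(netstack_t *ns)
{
	return (ipobs_register_hook(ns, (pfv_t)example_observer));
}
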
/* ******************************************************************** */
/* BPF Functions below							*/
/* ******************************************************************** */

/*
 * Convenience function to make mapping a zoneid to an ipnet_stack_t easy.
 */
static ipnet_stack_t *
ipnet_find_by_zoneid(zoneid_t zoneid)
{
	netstack_t *ns;

	VERIFY((ns = netstack_find_by_zoneid(zoneid)) != NULL);
	return (ns->netstack_ipnet);
}

/*
 * Rather than weave the complexity of what needs to be done for a BPF
 * device attach or detach into the code paths where they're used, it is
 * presented here in a couple of simple functions, along with other
 * similar code.
 *
 * The refrele/refhold here provide the means by which it is known
 * when the clone structures can be freed.
 */
static void
ipnet_bpfdetach(ipnetif_t *ifp)
{
	if (ifp->if_stackp->ips_bpfdetach_fn != NULL) {
		ifp->if_stackp->ips_bpfdetach_fn((uintptr_t)ifp);
		ipnetif_refrele(ifp);
	}
}

static void
ipnet_bpfattach(ipnetif_t *ifp)
{
	if (ifp->if_stackp->ips_bpfattach_fn != NULL) {
		ipnetif_refhold(ifp);
		ifp->if_stackp->ips_bpfattach_fn((uintptr_t)ifp, DL_IPNET,
		    ifp->if_zoneid, BPR_IPNET);
	}
}

/*
 * Set the functions to call back to when adding or removing an interface so
 * that BPF can keep its internal list of these up to date.
 */
void
ipnet_set_bpfattach(bpf_attach_fn_t attach, bpf_detach_fn_t detach,
    zoneid_t zoneid, bpf_itap_fn_t tapfunc, bpf_provider_reg_fn_t provider)
{
	ipnet_stack_t *ips;
	ipnetif_t *ipnetif;
	avl_tree_t *tree;
	ipnetif_t *next;

	if (zoneid == GLOBAL_ZONEID) {
		ipnet_itap = tapfunc;
	}

	VERIFY((ips = ipnet_find_by_zoneid(zoneid)) != NULL);

	/*
	 * If we're setting a new attach function, call it for every
	 * mac that has already been attached.
	 */
	if (attach != NULL && ips->ips_bpfattach_fn == NULL) {
		ASSERT(detach != NULL);
		if (provider != NULL) {
			(void) provider(&bpf_ipnet);
		}
		/*
		 * The call to ipnet_bpfattach() calls into bpf`bpfattach,
		 * which then wants to resolve the link name into a link id.
		 * For ipnet, this results in a call back to
		 * ipnet_get_linkid_byname, which also needs to lock and walk
		 * the AVL tree. Thus the call to ipnet_bpfattach() needs to
		 * be made without the avl_lock held.
		 */
		mutex_enter(&ips->ips_event_lock);
		ips->ips_bpfattach_fn = attach;
		ips->ips_bpfdetach_fn = detach;
		mutex_enter(&ips->ips_avl_lock);
		tree = &ips->ips_avl_by_index;
		for (ipnetif = avl_first(tree); ipnetif != NULL;
		    ipnetif = next) {
			ipnetif_refhold(ipnetif);
			mutex_exit(&ips->ips_avl_lock);
			ipnet_bpfattach(ipnetif);
			mutex_enter(&ips->ips_avl_lock);
			next = avl_walk(tree, ipnetif, AVL_AFTER);
			ipnetif_refrele(ipnetif);
		}
		mutex_exit(&ips->ips_avl_lock);
		ipnet_bpf_probe_shared(ips);
		mutex_exit(&ips->ips_event_lock);

	} else if (attach == NULL && ips->ips_bpfattach_fn != NULL) {
		ASSERT(ips->ips_bpfdetach_fn != NULL);
		mutex_enter(&ips->ips_event_lock);
		ips->ips_bpfattach_fn = NULL;
		mutex_enter(&ips->ips_avl_lock);
		tree = &ips->ips_avl_by_index;
		for (ipnetif = avl_first(tree); ipnetif != NULL;
		    ipnetif = next) {
			ipnetif_refhold(ipnetif);
			mutex_exit(&ips->ips_avl_lock);
			ipnet_bpfdetach(ipnetif);
			mutex_enter(&ips->ips_avl_lock);
			next = avl_walk(tree, ipnetif, AVL_AFTER);
			ipnetif_refrele(ipnetif);
		}
		mutex_exit(&ips->ips_avl_lock);
		ipnet_bpf_release_shared(ips);
		ips->ips_bpfdetach_fn = NULL;
		mutex_exit(&ips->ips_event_lock);

		if (provider != NULL) {
			(void) provider(&bpf_ipnet);
		}
	}
}

/*
 * The list of interfaces available via ipnet is private to each zone,
 * so the AVL tree of each zone must be searched for a given name, even
 * if all names are unique.
 */
int
ipnet_open_byname(const char *name, ipnetif_t **ptr, zoneid_t zoneid)
{
	ipnet_stack_t *ips;
	ipnetif_t *ipnetif;

	ASSERT(ptr != NULL);
	VERIFY((ips = ipnet_find_by_zoneid(zoneid)) != NULL);

	mutex_enter(&ips->ips_avl_lock);
	ipnetif = avl_find(&ips->ips_avl_by_name, (char *)name, NULL);
	if (ipnetif != NULL) {
		ipnetif_refhold(ipnetif);
	}
	mutex_exit(&ips->ips_avl_lock);

	*ptr = ipnetif;

	if (ipnetif == NULL)
		return (ESRCH);
	return (0);
}

void
ipnet_close_byhandle(ipnetif_t *ifp)
{
	ASSERT(ifp != NULL);
	ipnetif_refrele(ifp);
}

const char *
ipnet_name(ipnetif_t *ifp)
{
	ASSERT(ifp != NULL);
	return (ifp->if_name);
}

/*
 * To find the linkid for a given name, it is necessary to know which zone
 * the interface name belongs to and to search that zone's avl tree, as
 * there is no master list of all interfaces and the zones they belong to.
 * It is assumed that the caller of this function is already working with
 * the ipnet interfaces and hence holds the ips_event_lock. When BPF calls
 * into this function, it is doing so because of an event in ipnet, and thus
 * ipnet holds the ips_event_lock. Thus the datalink id value returned has
 * meaning without the need for grabbing a hold on the owning structure.
 */
int
ipnet_get_linkid_byname(const char *name, uint_t *idp, zoneid_t zoneid)
{
	ipnet_stack_t *ips;
	ipnetif_t *ifp;

	VERIFY((ips = ipnet_find_by_zoneid(zoneid)) != NULL);
	ASSERT(mutex_owned(&ips->ips_event_lock));

	mutex_enter(&ips->ips_avl_lock);
	ifp = avl_find(&ips->ips_avl_by_name, (void *)name, NULL);
	if (ifp != NULL)
		*idp = (uint_t)ifp->if_index;

	/*
	 * Shared instance zone?
	 */
	if (netstackid_to_zoneid(zoneid_to_netstackid(zoneid)) != zoneid) {
		uintptr_t key[2] = { zoneid, (uintptr_t)name };

		ifp = avl_find(&ips->ips_avl_by_shared, (void *)key, NULL);
		if (ifp != NULL)
			*idp = (uint_t)ifp->if_index;
	}

	mutex_exit(&ips->ips_avl_lock);

	if (ifp == NULL)
		return (ESRCH);
	return (0);
}

/*
 * Strictly speaking, there is no such thing as a "client" in ipnet, like
 * there is in mac. BPF only needs to have this because it is required as
 * part of interfacing correctly with mac. The reuse of the original
 * ipnetif_t as a client poses no danger, so long as it is done with its
 * own ref-counted hold that is given up on close.
 */
int
ipnet_client_open(ipnetif_t *ptr, ipnetif_t **result)
{
	ASSERT(ptr != NULL);
	ASSERT(result != NULL);
	ipnetif_refhold(ptr);
	*result = ptr;

	return (0);
}

void
ipnet_client_close(ipnetif_t *ptr)
{
	ASSERT(ptr != NULL);
	ipnetif_refrele(ptr);
}

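/*
 * Illustrative sketch (hypothetical consumer, not part of the driver):
 * the lookup/hold/release interfaces above would typically be used
 * together, e.g. by a BPF-style consumer resolving a name within a zone
 * and then dropping the hold it was given:
 */
static int
example_lookup(const char *name, zoneid_t zoneid)
{
	ipnetif_t *ifp;
	int error;

	if ((error = ipnet_open_byname(name, &ifp, zoneid)) != 0)
		return (error);		/* ESRCH: no such interface */

	cmn_err(CE_CONT, "found %s\n", ipnet_name(ifp));

	ipnet_close_byhandle(ifp);	/* drop the hold taken by open */
	return (0);
}
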
/*
 * This is called from BPF when it needs to start receiving packets
 * from ipnet.
 *
 * The use of the ipnet_t structure here is somewhat lightweight when
 * compared to how it is used elsewhere, but it already has all of the
 * right fields in it, so reuse here doesn't seem out of order. Its
 * primary purpose here is to provide the means to store pointers for
 * use when ipnet_promisc_remove() needs to be called.
 *
 * This should never be called for the IPNET_MINOR_LO device, as it is
 * never created via ipnetif_create.
 */
/*ARGSUSED*/
int
ipnet_promisc_add(void *handle, uint_t how, void *data, uintptr_t *mhandle,
    int flags)
{
	ip_stack_t *ipst;
	netstack_t *ns;
	ipnetif_t *ifp;
	ipnet_t *ipnet;
	char name[32];
	int error;

	ifp = (ipnetif_t *)handle;
	ns = netstack_find_by_zoneid(ifp->if_zoneid);

	if ((how == DL_PROMISC_PHYS) || (how == DL_PROMISC_MULTI)) {
		error = ipnet_join_allmulti(ifp, ns->netstack_ipnet);
		if (error != 0)
			return (error);
	} else {
		return (EINVAL);
	}

	ipnet = kmem_zalloc(sizeof (*ipnet), KM_SLEEP);
	ipnet->ipnet_if = ifp;
	ipnet->ipnet_ns = ns;
	ipnet->ipnet_flags = flags;

	if ((ifp->if_flags & IPNETIF_LOOPBACK) != 0) {
		ipnet->ipnet_acceptfn = ipnet_loaccept;
	} else {
		ipnet->ipnet_acceptfn = ipnet_accept;
	}

	/*
	 * To register multiple hooks with the same callback function,
	 * a unique name is needed.
	 */
	HOOK_INIT(ipnet->ipnet_hook, ipnet_bpf_bounce, "", ipnet);
	(void) snprintf(name, sizeof (name), "ipnet_promisc_%p",
	    ipnet->ipnet_hook);
	ipnet->ipnet_hook->h_name = strdup(name);
	ipnet->ipnet_data = data;
	ipnet->ipnet_zoneid = ifp->if_zoneid;

	ipst = ns->netstack_ip;

	error = net_hook_register(ipst->ips_ip4_observe_pr, NH_OBSERVE,
	    ipnet->ipnet_hook);
	if (error != 0)
		goto regfail;

	error = net_hook_register(ipst->ips_ip6_observe_pr, NH_OBSERVE,
	    ipnet->ipnet_hook);
	if (error != 0) {
		(void) net_hook_unregister(ipst->ips_ip4_observe_pr,
		    NH_OBSERVE, ipnet->ipnet_hook);
		goto regfail;
	}

	*mhandle = (uintptr_t)ipnet;

	return (0);

regfail:
	cmn_err(CE_WARN, "net_hook_register failed: %d", error);
	strfree(ipnet->ipnet_hook->h_name);
	hook_free(ipnet->ipnet_hook);
	return (error);
}

void
ipnet_promisc_remove(void *data)
{
	ip_stack_t *ipst;
	ipnet_t *ipnet;
	hook_t *hook;

	ipnet = data;
	ipst = ipnet->ipnet_ns->netstack_ip;
	hook = ipnet->ipnet_hook;

	VERIFY(net_hook_unregister(ipst->ips_ip4_observe_pr, NH_OBSERVE,
	    hook) == 0);

	VERIFY(net_hook_unregister(ipst->ips_ip6_observe_pr, NH_OBSERVE,
	    hook) == 0);

	strfree(hook->h_name);

	hook_free(hook);

	kmem_free(ipnet, sizeof (*ipnet));
}

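/*
 * Illustrative sketch (hypothetical consumer, not part of the driver):
 * BPF turns capture on and off through the pair of functions above.  The
 * opaque cookie returned via *mhandle is simply handed back on teardown:
 */
static int
example_capture_start(ipnetif_t *ifp, void *bpf_state, uintptr_t *cookiep)
{
	/* bpf_state is whatever the consumer wants echoed back per packet. */
	return (ipnet_promisc_add(ifp, DL_PROMISC_PHYS, bpf_state,
	    cookiep, 0));
}

static void
example_capture_stop(uintptr_t cookie)
{
	ipnet_promisc_remove((void *)cookie);
}
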
/*
 * arg here comes from the ipnet_t allocated in ipnet_promisc_add().
 * An important field from that structure is ipnet_data, which holds the
 * "data" pointer passed into ipnet_promisc_add(): it needs to be passed
 * back to bpf when we call into ipnet_itap.
 *
 * ipnet_itap is set by ipnet_set_bpfattach, which in turn is called
 * from BPF.
 */
/*ARGSUSED*/
static int
ipnet_bpf_bounce(hook_event_token_t token, hook_data_t info, void *arg)
{
	hook_pkt_observe_t *hdr;
	ipnet_addrp_t src;
	ipnet_addrp_t dst;
	ipnet_stack_t *ips;
	ipnet_t *ipnet;
	mblk_t *netmp;
	mblk_t *mp;

	hdr = (hook_pkt_observe_t *)info;
	mp = hdr->hpo_pkt;
	ipnet = (ipnet_t *)arg;
	ips = ((netstack_t *)hdr->hpo_ctx)->netstack_ipnet;

	netmp = hdr->hpo_pkt->b_cont;
	src.iap_family = hdr->hpo_family;
	dst.iap_family = hdr->hpo_family;

	if (hdr->hpo_family == AF_INET) {
		src.iap_addr4 = &((ipha_t *)(netmp->b_rptr))->ipha_src;
		dst.iap_addr4 = &((ipha_t *)(netmp->b_rptr))->ipha_dst;
	} else {
		src.iap_addr6 = &((ip6_t *)(netmp->b_rptr))->ip6_src;
		dst.iap_addr6 = &((ip6_t *)(netmp->b_rptr))->ip6_dst;
	}

	if (!(*ipnet->ipnet_acceptfn)(ipnet, hdr, &src, &dst)) {
		IPSK_BUMP(ips, ik_acceptFail);
		return (0);
	}
	IPSK_BUMP(ips, ik_acceptOk);

	ipnet_itap(ipnet->ipnet_data, mp,
	    hdr->hpo_htype == IPOBS_HOOK_OUTBOUND,
	    ntohs(hdr->hpo_pktlen) + (mp->b_wptr - mp->b_rptr));

	return (0);
}

/*
 * Cloned ipnetif_t's are created when a shared IP instance zone comes
 * to life and configures an IP address. The model that BPF uses is that
 * each interface must have a unique pointer and each interface must be
 * representative of what it can capture. They are limited to one DLT
 * per interface and one zone per interface. Thus every interface that
 * can be seen in a zone must be announced via an attach to bpf. For
 * shared instance zones, this means the ipnet driver needs to detect
 * when an address is added to an interface in a zone for the first
 * time (and also when the last address is removed).
 */
static ipnetif_t *
ipnetif_clone_create(ipnetif_t *ifp, zoneid_t zoneid)
{
	uintptr_t key[2] = { zoneid, (uintptr_t)ifp->if_name };
	ipnet_stack_t *ips = ifp->if_stackp;
	avl_index_t where = 0;
	ipnetif_t *newif;

	mutex_enter(&ips->ips_avl_lock);
	newif = avl_find(&ips->ips_avl_by_shared, (void *)key, &where);
	if (newif != NULL) {
		ipnetif_refhold(newif);
		newif->if_sharecnt++;
		mutex_exit(&ips->ips_avl_lock);
		return (newif);
	}

	newif = ipnet_alloc_if(ips);
	if (newif == NULL) {
		mutex_exit(&ips->ips_avl_lock);
		return (NULL);
	}

	newif->if_refcnt = 1;
	newif->if_sharecnt = 1;
	newif->if_zoneid = zoneid;
	(void) strlcpy(newif->if_name, ifp->if_name, LIFNAMSIZ);
	newif->if_flags = ifp->if_flags & IPNETIF_LOOPBACK;
	newif->if_index = ifp->if_index;

	avl_insert(&ips->ips_avl_by_shared, newif, where);
	mutex_exit(&ips->ips_avl_lock);

	ipnet_bpfattach(newif);

	return (newif);
}

static void
ipnetif_clone_release(ipnetif_t *ipnetif)
{
	boolean_t dofree = B_FALSE;
	boolean_t doremove = B_FALSE;
	ipnet_stack_t *ips = ipnetif->if_stackp;

	mutex_enter(&ipnetif->if_reflock);
	ASSERT(ipnetif->if_refcnt > 0);
	if (--ipnetif->if_refcnt == 0)
		dofree = B_TRUE;
	ASSERT(ipnetif->if_sharecnt > 0);
	if (--ipnetif->if_sharecnt == 0)
		doremove = B_TRUE;
	mutex_exit(&ipnetif->if_reflock);
	if (doremove) {
		mutex_enter(&ips->ips_avl_lock);
		avl_remove(&ips->ips_avl_by_shared, ipnetif);
		mutex_exit(&ips->ips_avl_lock);
		ipnet_bpfdetach(ipnetif);
	}
	if (dofree) {
		ASSERT(ipnetif->if_sharecnt == 0);
		ipnetif_free(ipnetif);
	}
}

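/*
 * Illustrative sketch (hypothetical, not part of the driver): the address
 * add/delete paths elsewhere in this file are expected to pair the two
 * functions above, creating a per-zone clone when a shared-stack zone gains
 * its first address on an interface and releasing that clone when the
 * zone's last address on the interface goes away.
 */
static ipnetif_t *
example_zone_first_addr(ipnetif_t *ifp, zoneid_t zoneid)
{
	/* Announce a per-zone view of ifp to BPF; may return NULL. */
	return (ipnetif_clone_create(ifp, zoneid));
}

static void
example_zone_last_addr(ipnetif_t *zoneif)
{
	/* Withdraw the per-zone view once its last address is gone. */
	ipnetif_clone_release(zoneif);
}
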
/*
 * Called when BPF loads; the goal is to tell BPF about all of the interfaces
 * in use by zones that have a shared IP stack. These interfaces are stored
 * in the ips_avl_by_shared tree. Note that if there are 1000 logical
 * interfaces in use as bge0:1 through bge0:1000, they are represented by a
 * single bge0 node on that AVL tree.
 */
static void
ipnet_bpf_probe_shared(ipnet_stack_t *ips)
{
	ipnetif_t *next;
	ipnetif_t *ifp;

	mutex_enter(&ips->ips_avl_lock);

	for (ifp = avl_first(&ips->ips_avl_by_shared); ifp != NULL;
	    ifp = next) {
		ipnetif_refhold(ifp);
		mutex_exit(&ips->ips_avl_lock);
		ipnet_bpfattach(ifp);
		mutex_enter(&ips->ips_avl_lock);
		next = avl_walk(&ips->ips_avl_by_shared, ifp, AVL_AFTER);
		ipnetif_refrele(ifp);
	}
	mutex_exit(&ips->ips_avl_lock);
}

static void
ipnet_bpf_release_shared(ipnet_stack_t *ips)
{
	ipnetif_t *next;
	ipnetif_t *ifp;

	mutex_enter(&ips->ips_avl_lock);

	for (ifp = avl_first(&ips->ips_avl_by_shared); ifp != NULL;
	    ifp = next) {
		ipnetif_refhold(ifp);
		mutex_exit(&ips->ips_avl_lock);
		ipnet_bpfdetach(ifp);
		mutex_enter(&ips->ips_avl_lock);
		next = avl_walk(&ips->ips_avl_by_shared, ifp, AVL_AFTER);
		ipnetif_refrele(ifp);
	}
	mutex_exit(&ips->ips_avl_lock);
}