/*
 * This file and its contents are supplied under the terms of the
 * Common Development and Distribution License ("CDDL"), version 1.0.
 * You may only use this file in accordance with the terms of version
 * 1.0 of the CDDL.
 *
 * A full copy of the text of the CDDL should have accompanied this
 * source.  A copy of the CDDL is also available via the Internet at
 * http://www.illumos.org/license/CDDL.
 */

/*
 * Copyright (c) 2012, Joyent, Inc. All rights reserved.
 */

/*
 * ipd: Internet packet disturber
 *
 * The purpose of ipd is to simulate congested and lossy networks when they
 * don't actually exist. The features of these congested and lossy networks
 * are events that end up leading to retransmits and thus kicking us out of
 * the TCP/IP fastpath. Since normally this would require us to have an
 * actually congested network, which can be problematic, we instead simulate
 * this behavior.
 *
 * 1. ipd's operations and restrictions
 *
 * ipd currently has facilities to cause IP traffic to be:
 *
 *   - Corrupted with some probability.
 *   - Delayed for a set number of microseconds.
 *   - Dropped with some probability.
 *
 * Each of these features is enabled on a per-zone basis. The current
 * implementation restricts this specifically to exclusive stack zones.
 * Enabling ipd on a given zone causes pfhooks to be installed for that
 * zone's netstack. Because of the nature of ipd, it currently only supports
 * exclusive stack zones and, as a further restriction, it only allows the
 * global zone administrative access. ipd can be enabled for the global zone,
 * but doing so will cause all shared-stack zones to also be affected.
 *
 * 2. General architecture and Locking
 *
 * ipd consists of a few components. There is a per netstack data structure
 * that is created and destroyed with the creation and destruction of each
 * exclusive stack zone. Each of these netstacks is stored in a global list
 * which is accessed for control of ipd via ioctls. The following diagram
 * touches on the data structures that are used throughout ipd.
 *
 *      ADMINISTRATIVE                         DATA PATH
 *
 *     +--------+                   +------+       +------+
 *     | ipdadm |                   |  ip  |       | nics |
 *     +--------+                   +------+       +------+
 *         |  ^                         |              |
 *         |  | ioctl(2)                |              |
 *         V  |                         V              V
 *     +----------+            +-------------------------+
 *     | /dev/ipd |            | pfhooks packet callback |  == ipd_hook()
 *     +----------+            +-------------------------+
 *          |                               |
 *          |                               |
 *          V                               |
 *     +----------------+                   |
 *     | list_t ipd_nsl |------+            |
 *     +----------------+      |            |
 *                             |            |
 *                             V  per netstack  V
 *                      +----------------------------+
 *                      |       ipd_netstack_t       |
 *                      +----------------------------+
 *
 * ipd has two different entry points, one is administrative, the other is
 * the data path. The administrative path is accessed by a userland component
 * called ipdadm(1M). It communicates to the kernel component via ioctls to
 * /dev/ipd. If the administrative path enables a specific zone, then the
 * data path will become active for that zone. Any packet that leaves that
 * zone's IP stack, or is going to enter it, comes through the callback
 * specified in the hook_t(9S) structure. This will cause each packet to go
 * through ipd_hook().
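 *
 * To make the administrative path concrete, the following sketch shows
 * roughly how a sufficiently privileged userland consumer (ipdadm(1M) is the
 * supported one) would enable a 25% corruption rate for its own zone. It is
 * illustrative only: it assumes the IPDIOC_* commands and ipd_ioc_perturb_t
 * from <sys/ipd.h> are usable from userland, and it omits all error
 * handling. The requested ipip_zoneid is overridden with the caller's zone
 * unless the caller is the global zone.
 *
 *	int fd = open("/dev/ipd", O_RDWR);
 *	ipd_ioc_perturb_t ipip;
 *
 *	ipip.ipip_zoneid = getzoneid();
 *	ipip.ipip_arg = 25;
 *	(void) ioctl(fd, IPDIOC_CORRUPT, &ipip);
 *	(void) close(fd);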
 *
 * While the locking inside of ipd should be straightforward, unfortunately,
 * the pfhooks subsystem necessarily complicates this a little bit. There are
 * currently three different sets of locks in ipd.
 *
 *   - Global lock N on the netstack list.
 *   - Global lock A on the active count.
 *   - Per-netstack data structure lock Z.
 *
 * # Locking rules
 *
 * L.1a N must always be acquired first and released last
 *
 * If you need to acquire the netstack list lock, either for reading or
 * writing, then N must be acquired first and before any other locks. It may
 * not be dropped before any other lock.
 *
 * L.1b N must only be acquired from the administrative path and from the
 * zone creation, shutdown, and destroy callbacks.
 *
 * The data path, e.g. receiving the per-packet callbacks, should never be
 * grabbing the list lock. If it is, then the architecture here needs to be
 * reconsidered.
 *
 * L.2 Z cannot be held across calls to the pfhooks subsystem if packet hooks
 * are active.
 *
 * The way the pfhooks subsystem is designed is that a reference count is
 * present on the hook_t while it is active. As long as that reference count
 * is non-zero, a call to net_hook_unregister will block until it is lowered.
 * Because the callbacks want the same lock for the netstack that is held by
 * the administrative path calling into net_hook_unregister, we deadlock.
 *
 *  ioctl from ipdadm remove      hook_t cb (from nic)        hook_t cb (from IP)
 *  ------------------------      --------------------        -------------------
 *          |                             |                            |
 *          |                     bump hook_t refcount                 |
 *  mutex_enter(ipd_nsl_lock);    enter ipd_hook()             bump hook_t refcount
 *  mutex acquired                mutex_enter(ins->ipdn_lock);         |
 *          |                     mutex acquired               enter ipd_hook()
 *  mutex_enter(ins->ipdn_lock);          |                    mutex_enter(ins->ipdn_lock);
 *          |                             |                            |
 *          |                             |                            |
 *          |                     mutex_exit(ins->ipdn_lock);          |
 *          |                             |                            |
 *  mutex acquired                leave ipd_hook()                     |
 *          |                     decrement hook_t refcount            |
 *          |                             |                            |
 *  ipd_teardown_hooks()                  |                            |
 *  net_hook_unregister()                 |                            |
 *  cv_wait() if refcount                 |                            |
 *          |                             |                            |
 * ------------------------------------------------------------------------------
 *
 * At this point, we can see that the second hook callback still doesn't have
 * the mutex, but it has bumped the hook_t refcount. However, it will never
 * acquire the mutex that it needs to finish its operation and decrement the
 * refcount.
 *
 * Obviously, deadlocking is not acceptable, thus the following corollary to
 * the second locking rule:
 *
 * L.2 Corollary: If Z is being released across a call to the pfhooks
 * subsystem, N must be held.
 *
 * There is currently only one path where we have to worry about this: when
 * we are removing a hook while the zone is not being shut down, i.e. while
 * hooks are still active. The only place where this currently happens is in
 * ipd_check_hooks().
 */

#include <sys/types.h>
#include <sys/file.h>
#include <sys/errno.h>
#include <sys/open.h>
#include <sys/cred.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/kmem.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <sys/devops.h>
#include <sys/modctl.h>
#include <sys/mutex.h>
#include <sys/list.h>
#include <sys/atomic.h>
#include <sys/sysmacros.h>
#include <sys/model.h>
#include <sys/policy.h>
#include <sys/zone.h>
#include <sys/time.h>
#include <sys/stream.h>
#include <sys/strsun.h>
#include <sys/kstat.h>
#include <sys/hook.h>
#include <sys/hook_event.h>
#include <sys/netstack.h>
#include <sys/neti.h>
#include <sys/ipd.h>

#define IPDN_STATUS_DISABLED    0x1
#define IPDN_STATUS_ENABLED     0x2
#define IPDN_STATUS_CONDEMNED   0x4

/*
 * These flags are used to determine whether or not the hooks are registered.
 */
#define IPDN_HOOK_NONE          0x0
#define IPDN_HOOK_V4IN          0x1
#define IPDN_HOOK_V4OUT         0x2
#define IPDN_HOOK_V6IN          0x4
#define IPDN_HOOK_V6OUT         0x8
#define IPDN_HOOK_ALL           0xf

/*
 * Per-netstack kstats.
 */
typedef struct ipd_nskstat {
        kstat_named_t   ink_ndrops;
        kstat_named_t   ink_ncorrupts;
        kstat_named_t   ink_ndelays;
} ipd_nskstat_t;
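
/*
 * The counters above surface as a named kstat, created in ipd_nin_create()
 * with module and name "ipd", class "net", and the zone id as the instance.
 * A userland consumer could read them with libkstat roughly as follows; this
 * is only a sketch (no error handling, instance wildcarded) and is not part
 * of the driver itself:
 *
 *	kstat_ctl_t *kc = kstat_open();
 *	kstat_t *ksp = kstat_lookup(kc, "ipd", -1, "ipd");
 *	kstat_named_t *kn;
 *
 *	(void) kstat_read(kc, ksp, NULL);
 *	kn = kstat_data_lookup(ksp, "drops");
 *	(void) printf("drops: %llu\n", (u_longlong_t)kn->value.ui64);
 *	(void) kstat_close(kc);
 */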

/*
 * Different parts of this structure have different locking semantics. The
 * list node is not normally referenced; if it is, one has to hold the
 * ipd_nsl_lock. The following members are read only: ipdn_netid and
 * ipdn_zoneid. The members of the kstat structure are always accessible in
 * the data path, but the counters must be bumped with atomic operations. The
 * ipdn_lock protects every other aspect of this structure. Please see the
 * big theory statement on the requirements for lock ordering.
 */
typedef struct ipd_netstack {
        list_node_t     ipdn_link;      /* link on ipd_nsl */
        netid_t         ipdn_netid;     /* netstack id */
        zoneid_t        ipdn_zoneid;    /* zone id */
        kstat_t         *ipdn_kstat;    /* kstat_t ptr */
        ipd_nskstat_t   ipdn_ksdata;    /* kstat data */
        kmutex_t        ipdn_lock;      /* protects following members */
        int             ipdn_status;    /* status flags */
        net_handle_t    ipdn_v4hdl;     /* IPv4 net handle */
        net_handle_t    ipdn_v6hdl;     /* IPv6 net handle */
        int             ipdn_hooked;    /* are hooks registered */
        hook_t          *ipdn_v4in;     /* IPv4 traffic in hook */
        hook_t          *ipdn_v4out;    /* IPv4 traffic out hook */
        hook_t          *ipdn_v6in;     /* IPv6 traffic in hook */
        hook_t          *ipdn_v6out;    /* IPv6 traffic out hook */
        int             ipdn_enabled;   /* which perturbs are on */
        int             ipdn_corrupt;   /* corrupt percentage */
        int             ipdn_drop;      /* drop percentage */
        uint_t          ipdn_delay;     /* delay us */
        long            ipdn_rand;      /* random seed */
} ipd_netstack_t;

/*
 * ipd internal variables
 */
static dev_info_t       *ipd_devi;              /* device info */
static net_instance_t   *ipd_neti;              /* net_instance for hooks */
static unsigned int     ipd_max_delay = IPD_MAX_DELAY;  /* max delay in us */
static kmutex_t         ipd_nsl_lock;           /* lock for the netstack list */
static list_t           ipd_nsl;                /* list of netstacks */
static kmutex_t         ipd_nactive_lock;       /* lock for nactive */
static unsigned int     ipd_nactive;            /* number of active netstacks */
static int              ipd_nactive_fudge = 4;  /* amount to fudge by in list */

/*
 * Note that this random number implementation is based upon the old BSD 4.1
 * rand. It's good enough for us!
 */
static int
ipd_nextrand(ipd_netstack_t *ins)
{
        ins->ipdn_rand = ins->ipdn_rand * 1103515245L + 12345;
        return (ins->ipdn_rand & 0x7fffffff);
}

static void
ipd_ksbump(kstat_named_t *nkp)
{
        atomic_inc_64(&nkp->value.ui64);
}

/*
 * This is where all the magic actually happens. The way that this works is
 * we grab the ins lock to basically get a copy of all the data that we need
 * to do our job and then let it go to minimize contention. In terms of
 * actual work on the packet we do it in the following order:
 *
 *   - drop
 *   - delay
 *   - corrupt
 */
/*ARGSUSED*/
static int
ipd_hook(hook_event_token_t event, hook_data_t data, void *arg)
{
        unsigned char *crp;
        int dwait, corrupt, drop, rand, off, status;
        mblk_t *mbp;
        ipd_netstack_t *ins = arg;
        hook_pkt_event_t *pkt = (hook_pkt_event_t *)data;

        mutex_enter(&ins->ipdn_lock);
        status = ins->ipdn_status;
        dwait = ins->ipdn_delay;
        corrupt = ins->ipdn_corrupt;
        drop = ins->ipdn_drop;
        rand = ipd_nextrand(ins);
        mutex_exit(&ins->ipdn_lock);

        /*
         * This probably cannot happen, but we'll do an extra guard just in
         * case.
         */
        if (status & IPDN_STATUS_CONDEMNED)
                return (0);

        if (drop != 0 && rand % 100 < drop) {
                freemsg(*pkt->hpe_mp);
                *pkt->hpe_mp = NULL;
                pkt->hpe_mb = NULL;
                pkt->hpe_hdr = NULL;
                ipd_ksbump(&ins->ipdn_ksdata.ink_ndrops);

                return (1);
        }

        if (dwait != 0) {
                /*
                 * Busy-wait for sub-tick delays; otherwise sleep for the
                 * requested number of microseconds.
                 */
                if (dwait < TICK_TO_USEC(1))
                        drv_usecwait(dwait);
                else
                        delay(drv_usectohz(dwait));
                ipd_ksbump(&ins->ipdn_ksdata.ink_ndelays);
        }

        if (corrupt != 0 && rand % 100 < corrupt) {
                /*
                 * Since we're corrupting the mblk, just corrupt everything
                 * in the chain. While we could corrupt the entire packet,
                 * that's a little strong. Instead we're going to just change
                 * one of the bytes in each mblock.
                 */
                mbp = *pkt->hpe_mp;
                while (mbp != NULL) {
                        if (mbp->b_wptr == mbp->b_rptr) {
                                mbp = mbp->b_cont;
                                continue;
                        }

                        /*
                         * While pfhooks probably won't send us anything
                         * else, let's just be extra careful. The stack
                         * probably isn't as resilient to corruption of
                         * control messages.
                         */
                        if (DB_TYPE(mbp) != M_DATA) {
                                mbp = mbp->b_cont;
                                continue;
                        }

                        off = rand % ((uintptr_t)mbp->b_wptr -
                            (uintptr_t)mbp->b_rptr);
                        crp = mbp->b_rptr + off;
                        off = rand % 8;
                        *crp = *crp ^ (1 << off);

                        mbp = mbp->b_cont;
                }
                ipd_ksbump(&ins->ipdn_ksdata.ink_ncorrupts);
        }

        return (0);
}
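
/*
 * A note on the delay branch above: whether we spin or sleep depends on the
 * clock tick length. For example, with the common hz value of 100 (an
 * assumption; the actual value is a tunable), TICK_TO_USEC(1) is 10000, so a
 * requested delay below 10 ms is implemented as a drv_usecwait() busy-wait,
 * while anything longer sleeps in delay(drv_usectohz(dwait)) and is
 * effectively rounded to a multiple of the clock tick.
 */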

/*
 * Sets up and registers all the proper hooks needed for the netstack to
 * capture packets. Callers are assumed to already be holding the
 * ipd_netstack_t's lock. If there is a failure in setting something up, it
 * is the responsibility of this function to clean it up. Once this function
 * has been called, it should not be called again until a corresponding call
 * to tear down the hooks has been done.
 */
static int
ipd_setup_hooks(ipd_netstack_t *ins)
{
        ASSERT(MUTEX_HELD(&ins->ipdn_lock));
        ins->ipdn_v4hdl = net_protocol_lookup(ins->ipdn_netid, NHF_INET);
        if (ins->ipdn_v4hdl == NULL)
                goto cleanup;

        ins->ipdn_v6hdl = net_protocol_lookup(ins->ipdn_netid, NHF_INET6);
        if (ins->ipdn_v6hdl == NULL)
                goto cleanup;

        ins->ipdn_v4in = hook_alloc(HOOK_VERSION);
        if (ins->ipdn_v4in == NULL)
                goto cleanup;

        ins->ipdn_v4in->h_flags = 0;
        ins->ipdn_v4in->h_hint = HH_NONE;
        ins->ipdn_v4in->h_hintvalue = 0;
        ins->ipdn_v4in->h_func = ipd_hook;
        ins->ipdn_v4in->h_arg = ins;
        ins->ipdn_v4in->h_name = "ipd IPv4 in";

        if (net_hook_register(ins->ipdn_v4hdl, NH_PHYSICAL_IN,
            ins->ipdn_v4in) != 0)
                goto cleanup;
        ins->ipdn_hooked |= IPDN_HOOK_V4IN;

        ins->ipdn_v4out = hook_alloc(HOOK_VERSION);
        if (ins->ipdn_v4out == NULL)
                goto cleanup;
        ins->ipdn_v4out->h_flags = 0;
        ins->ipdn_v4out->h_hint = HH_NONE;
        ins->ipdn_v4out->h_hintvalue = 0;
        ins->ipdn_v4out->h_func = ipd_hook;
        ins->ipdn_v4out->h_arg = ins;
        ins->ipdn_v4out->h_name = "ipd IPv4 out";

        if (net_hook_register(ins->ipdn_v4hdl, NH_PHYSICAL_OUT,
            ins->ipdn_v4out) != 0)
                goto cleanup;
        ins->ipdn_hooked |= IPDN_HOOK_V4OUT;

        ins->ipdn_v6in = hook_alloc(HOOK_VERSION);
        if (ins->ipdn_v6in == NULL)
                goto cleanup;
        ins->ipdn_v6in->h_flags = 0;
        ins->ipdn_v6in->h_hint = HH_NONE;
        ins->ipdn_v6in->h_hintvalue = 0;
        ins->ipdn_v6in->h_func = ipd_hook;
        ins->ipdn_v6in->h_arg = ins;
        ins->ipdn_v6in->h_name = "ipd IPv6 in";

        if (net_hook_register(ins->ipdn_v6hdl, NH_PHYSICAL_IN,
            ins->ipdn_v6in) != 0)
                goto cleanup;
        ins->ipdn_hooked |= IPDN_HOOK_V6IN;

        ins->ipdn_v6out = hook_alloc(HOOK_VERSION);
        if (ins->ipdn_v6out == NULL)
                goto cleanup;
        ins->ipdn_v6out->h_flags = 0;
        ins->ipdn_v6out->h_hint = HH_NONE;
        ins->ipdn_v6out->h_hintvalue = 0;
        ins->ipdn_v6out->h_func = ipd_hook;
        ins->ipdn_v6out->h_arg = ins;
        ins->ipdn_v6out->h_name = "ipd IPv6 out";

        if (net_hook_register(ins->ipdn_v6hdl, NH_PHYSICAL_OUT,
            ins->ipdn_v6out) != 0)
                goto cleanup;
        ins->ipdn_hooked |= IPDN_HOOK_V6OUT;
        mutex_enter(&ipd_nactive_lock);
        ipd_nactive++;
        mutex_exit(&ipd_nactive_lock);

        return (0);

cleanup:
        if (ins->ipdn_hooked & IPDN_HOOK_V6OUT)
                (void) net_hook_unregister(ins->ipdn_v6hdl, NH_PHYSICAL_OUT,
                    ins->ipdn_v6out);

        if (ins->ipdn_hooked & IPDN_HOOK_V6IN)
                (void) net_hook_unregister(ins->ipdn_v6hdl, NH_PHYSICAL_IN,
                    ins->ipdn_v6in);

        if (ins->ipdn_hooked & IPDN_HOOK_V4OUT)
                (void) net_hook_unregister(ins->ipdn_v4hdl, NH_PHYSICAL_OUT,
                    ins->ipdn_v4out);

        if (ins->ipdn_hooked & IPDN_HOOK_V4IN)
                (void) net_hook_unregister(ins->ipdn_v4hdl, NH_PHYSICAL_IN,
                    ins->ipdn_v4in);

        ins->ipdn_hooked = IPDN_HOOK_NONE;

        if (ins->ipdn_v6out != NULL)
                hook_free(ins->ipdn_v6out);

        if (ins->ipdn_v6in != NULL)
                hook_free(ins->ipdn_v6in);

        if (ins->ipdn_v4out != NULL)
                hook_free(ins->ipdn_v4out);

        if (ins->ipdn_v4in != NULL)
                hook_free(ins->ipdn_v4in);

        if (ins->ipdn_v6hdl != NULL)
                (void) net_protocol_release(ins->ipdn_v6hdl);

        if (ins->ipdn_v4hdl != NULL)
                (void) net_protocol_release(ins->ipdn_v4hdl);

        return (1);
}

static void
ipd_teardown_hooks(ipd_netstack_t *ins)
{
        ASSERT(ins->ipdn_hooked == IPDN_HOOK_ALL);
        VERIFY(net_hook_unregister(ins->ipdn_v6hdl, NH_PHYSICAL_OUT,
            ins->ipdn_v6out) == 0);
        VERIFY(net_hook_unregister(ins->ipdn_v6hdl, NH_PHYSICAL_IN,
            ins->ipdn_v6in) == 0);
        VERIFY(net_hook_unregister(ins->ipdn_v4hdl, NH_PHYSICAL_OUT,
            ins->ipdn_v4out) == 0);
        VERIFY(net_hook_unregister(ins->ipdn_v4hdl, NH_PHYSICAL_IN,
            ins->ipdn_v4in) == 0);
        ins->ipdn_hooked = IPDN_HOOK_NONE;
        hook_free(ins->ipdn_v6out);
        hook_free(ins->ipdn_v6in);
        hook_free(ins->ipdn_v4out);
        hook_free(ins->ipdn_v4in);
        VERIFY(net_protocol_release(ins->ipdn_v6hdl) == 0);
        VERIFY(net_protocol_release(ins->ipdn_v4hdl) == 0);
        mutex_enter(&ipd_nactive_lock);
        ipd_nactive--;
        mutex_exit(&ipd_nactive_lock);
}

static int
ipd_check_hooks(ipd_netstack_t *ins, int type, boolean_t enable)
{
        int olden, rval;
        olden = ins->ipdn_enabled;

        if (enable)
                ins->ipdn_enabled |= type;
        else
                ins->ipdn_enabled &= ~type;

        /*
         * If no perturbation was previously enabled and one is now, we need
         * to register the hooks.
         */
        if (olden == 0 && ins->ipdn_enabled != 0) {
                rval = ipd_setup_hooks(ins);
                if (rval != 0) {
                        ins->ipdn_enabled &= ~type;
                        ASSERT(ins->ipdn_enabled == 0);
                        return (rval);
                }

                return (0);
        }

        /*
         * If the last enabled perturbation was just disabled, tear the hooks
         * down.
         */
        if (olden != 0 && ins->ipdn_enabled == 0) {
                ASSERT(olden != 0);

                /*
                 * We have to drop the lock here, lest we cause a deadlock.
                 * Unfortunately, there may be hooks that are running and are
                 * actively in flight and we have to call the unregister
                 * function. Due to the hooks framework, if there is an
                 * inflight hook (most likely right now), and we are holding
                 * the netstack's lock, those hooks will never return. This
                 * is unfortunate.
                 *
                 * Because we only come into this path holding the list lock,
                 * we know that the only way that someone else can come in
                 * and get to this structure is via the hook callbacks, which
                 * are going to only be doing reads. They'll also see that
                 * everything has been disabled and return. So while this is
                 * unfortunate, it should be relatively safe.
                 */
                mutex_exit(&ins->ipdn_lock);
                ipd_teardown_hooks(ins);
                mutex_enter(&ins->ipdn_lock);
                return (0);
        }

        /*
         * Otherwise, nothing should have changed here.
         */
        ASSERT((olden == 0) == (ins->ipdn_enabled == 0));
        return (0);
}
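
/*
 * To make rule L.2 and its corollary concrete, the teardown path above is
 * only ever reached through a call chain like the following (a sketch, not
 * additional code):
 *
 *	ipd_ioctl_perturb() / ipd_ioctl_remove()
 *		mutex_enter(&ipd_nsl_lock);		N held
 *		mutex_enter(&ins->ipdn_lock);		Z held
 *		ipd_toggle_*() -> ipd_check_hooks()
 *			mutex_exit(&ins->ipdn_lock);	Z dropped, N still held
 *			ipd_teardown_hooks()		may cv_wait() on hooks
 *			mutex_enter(&ins->ipdn_lock);	Z reacquired
 *
 * Holding N across the window where Z is dropped is what keeps the
 * ipd_netstack_t from being removed and freed underneath us.
 */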

static int
ipd_toggle_corrupt(ipd_netstack_t *ins, int percent)
{
        int rval;

        ASSERT(MUTEX_HELD(&ins->ipdn_lock));

        if (percent < 0 || percent > 100)
                return (ERANGE);

        /*
         * If we've been asked to set the value to a value that we already
         * have, great, then we're done.
         */
        if (percent == ins->ipdn_corrupt)
                return (0);

        ins->ipdn_corrupt = percent;
        rval = ipd_check_hooks(ins, IPD_CORRUPT, percent != 0);

        /*
         * If ipd_check_hooks failed, that must mean that we failed to set up
         * the hooks, so we are going to effectively zero out and fail the
         * request to enable corruption.
         */
        if (rval != 0)
                ins->ipdn_corrupt = 0;

        return (rval);
}

static int
ipd_toggle_delay(ipd_netstack_t *ins, uint32_t delay)
{
        int rval;

        ASSERT(MUTEX_HELD(&ins->ipdn_lock));

        if (delay > ipd_max_delay)
                return (ERANGE);

        /*
         * If we've been asked to set the value to a value that we already
         * have, great, then we're done.
         */
        if (delay == ins->ipdn_delay)
                return (0);

        ins->ipdn_delay = delay;
        rval = ipd_check_hooks(ins, IPD_DELAY, delay != 0);

        /*
         * If ipd_check_hooks failed, that must mean that we failed to set up
         * the hooks, so we are going to effectively zero out and fail the
         * request to enable the delay.
         */
        if (rval != 0)
                ins->ipdn_delay = 0;

        return (rval);
}

static int
ipd_toggle_drop(ipd_netstack_t *ins, int percent)
{
        int rval;

        ASSERT(MUTEX_HELD(&ins->ipdn_lock));

        if (percent < 0 || percent > 100)
                return (ERANGE);

        /*
         * If we've been asked to set the value to a value that we already
         * have, great, then we're done.
         */
        if (percent == ins->ipdn_drop)
                return (0);

        ins->ipdn_drop = percent;
        rval = ipd_check_hooks(ins, IPD_DROP, percent != 0);

        /*
         * If ipd_check_hooks failed, that must mean that we failed to set up
         * the hooks, so we are going to effectively zero out and fail the
         * request to enable dropping.
         */
        if (rval != 0)
                ins->ipdn_drop = 0;

        return (rval);
}

static int
ipd_ioctl_perturb(ipd_ioc_perturb_t *ipi, cred_t *cr, intptr_t cmd)
{
        zoneid_t zid;
        ipd_netstack_t *ins;
        int rval = 0;

        /*
         * If the zone that we're coming from is not the GZ, then we ignore
         * the requested zone id completely and instead just set it to that
         * of the caller. If the caller is the GZ, then we don't touch this
         * value.
         */
        zid = crgetzoneid(cr);
        if (zid != GLOBAL_ZONEID)
                ipi->ipip_zoneid = zid;

        if (zoneid_to_netstackid(ipi->ipip_zoneid) == GLOBAL_NETSTACKID &&
            zid != GLOBAL_ZONEID)
                return (EPERM);

        /*
         * We need to hold the ipd_nsl_lock throughout the entire operation,
         * otherwise someone else could come in and remove us from the list
         * and free us, e.g. the netstack destroy handler. By holding the
         * lock, we stop it from being able to do anything wrong.
         */
        mutex_enter(&ipd_nsl_lock);
        for (ins = list_head(&ipd_nsl); ins != NULL;
            ins = list_next(&ipd_nsl, ins)) {
                if (ins->ipdn_zoneid == ipi->ipip_zoneid)
                        break;
        }

        if (ins == NULL) {
                mutex_exit(&ipd_nsl_lock);
                return (EINVAL);
        }

        mutex_enter(&ins->ipdn_lock);

        if (ins->ipdn_status & IPDN_STATUS_CONDEMNED) {
                rval = ESHUTDOWN;
                goto cleanup;
        }

        switch (cmd) {
        case IPDIOC_CORRUPT:
                rval = ipd_toggle_corrupt(ins, ipi->ipip_arg);
                break;
        case IPDIOC_DELAY:
                rval = ipd_toggle_delay(ins, ipi->ipip_arg);
                break;
        case IPDIOC_DROP:
                rval = ipd_toggle_drop(ins, ipi->ipip_arg);
                break;
        }

cleanup:
        mutex_exit(&ins->ipdn_lock);
        mutex_exit(&ipd_nsl_lock);
        return (rval);
}

static int
ipd_ioctl_remove(ipd_ioc_perturb_t *ipi, cred_t *cr)
{
        zoneid_t zid;
        ipd_netstack_t *ins;
        int rval = 0;

        /*
         * See ipd_ioctl_perturb for the rationale here.
         */
        zid = crgetzoneid(cr);
        if (zid != GLOBAL_ZONEID)
                ipi->ipip_zoneid = zid;

        if (zoneid_to_netstackid(ipi->ipip_zoneid) == GLOBAL_NETSTACKID &&
            zid != GLOBAL_ZONEID)
                return (EPERM);

        mutex_enter(&ipd_nsl_lock);
        for (ins = list_head(&ipd_nsl); ins != NULL;
            ins = list_next(&ipd_nsl, ins)) {
                if (ins->ipdn_zoneid == ipi->ipip_zoneid)
                        break;
        }

        if (ins == NULL) {
                mutex_exit(&ipd_nsl_lock);
                return (EINVAL);
        }

        mutex_enter(&ins->ipdn_lock);

        /*
         * If this is condemned, that means it's very shortly going to be
         * torn down. In that case, there's no reason to actually do anything
         * here, as it will all be done rather shortly in the destroy
         * function. Furthermore, because condemned corresponds with it
         * having hit shutdown, we know that no more packets can be received
         * by this netstack. All this translates to a no-op.
         */
        if (ins->ipdn_status & IPDN_STATUS_CONDEMNED) {
                rval = 0;
                goto cleanup;
        }

        rval = EINVAL;
        /*
         * Go through and disable the requested pieces. We can safely ignore
         * the return value of ipd_check_hooks because the removal case
         * should never fail; we verify that in the hook teardown case.
         */
        if (ipi->ipip_arg & IPD_CORRUPT) {
                ins->ipdn_corrupt = 0;
                (void) ipd_check_hooks(ins, IPD_CORRUPT, B_FALSE);
                rval = 0;
        }

        if (ipi->ipip_arg & IPD_DELAY) {
                ins->ipdn_delay = 0;
                (void) ipd_check_hooks(ins, IPD_DELAY, B_FALSE);
                rval = 0;
        }

        if (ipi->ipip_arg & IPD_DROP) {
                ins->ipdn_drop = 0;
                (void) ipd_check_hooks(ins, IPD_DROP, B_FALSE);
                rval = 0;
        }

cleanup:
        mutex_exit(&ins->ipdn_lock);
        mutex_exit(&ipd_nsl_lock);
        return (rval);
}
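
/*
 * For illustration, a userland caller (again, ipdadm(1M) is the supported
 * interface) would undo the perturbations for its zone with IPDIOC_REMOVE,
 * passing the facilities to disable as a bitmask. As with the earlier
 * sketch, this assumes the definitions in <sys/ipd.h> are usable from
 * userland and omits error handling:
 *
 *	ipd_ioc_perturb_t ipip;
 *
 *	ipip.ipip_zoneid = getzoneid();
 *	ipip.ipip_arg = IPD_CORRUPT | IPD_DELAY | IPD_DROP;
 *	(void) ioctl(fd, IPDIOC_REMOVE, &ipip);
 */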

/*
 * When this function is called, the value of the ipil_nzones argument
 * controls how this function works. When called with a value of zero, we
 * treat that as the caller asking us what's a reasonable number of entries
 * for it to allocate memory for. If the zone is the global zone, then we
 * tell it how many netstacks are currently active and add a fudge factor.
 * Otherwise the answer is always one.
 *
 * In the non-zero case, we give the caller that number of zone ids. While
 * this isn't quite ideal as it might mean that someone misses something,
 * this generally won't be an issue, as it involves a rather tight race
 * condition in the current ipdadm implementation.
 */
static int
ipd_ioctl_list(intptr_t arg, cred_t *cr)
{
        zoneid_t zid;
        ipd_ioc_info_t *configs;
        ipd_netstack_t *ins;
        uint_t azones, rzones, nzones, cur;
        int rval = 0;
        STRUCT_DECL(ipd_ioc_list, h);

        STRUCT_INIT(h, get_udatamodel());
        if (ddi_copyin((void *)arg, STRUCT_BUF(h), STRUCT_SIZE(h), 0) != 0)
                return (EFAULT);

        zid = crgetzoneid(cr);

        rzones = STRUCT_FGET(h, ipil_nzones);
        if (rzones == 0) {
                if (zid == GLOBAL_ZONEID) {
                        mutex_enter(&ipd_nactive_lock);
                        rzones = ipd_nactive + ipd_nactive_fudge;
                        mutex_exit(&ipd_nactive_lock);
                } else {
                        rzones = 1;
                }
                STRUCT_FSET(h, ipil_nzones, rzones);
                if (ddi_copyout(STRUCT_BUF(h), (void *)arg,
                    STRUCT_SIZE(h), 0) != 0)
                        return (EFAULT);

                return (0);
        }

        mutex_enter(&ipd_nsl_lock);
        if (zid == GLOBAL_ZONEID) {
                azones = ipd_nactive;
        } else {
                azones = 1;
        }

        configs = kmem_alloc(sizeof (ipd_ioc_info_t) * azones, KM_SLEEP);
        cur = 0;
        for (ins = list_head(&ipd_nsl); ins != NULL;
            ins = list_next(&ipd_nsl, ins)) {
                if (ins->ipdn_enabled == 0)
                        continue;
                ASSERT(cur < azones);

                if (zid == GLOBAL_ZONEID || zid == ins->ipdn_zoneid) {
                        configs[cur].ipii_zoneid = ins->ipdn_zoneid;

                        mutex_enter(&ins->ipdn_lock);
                        configs[cur].ipii_corrupt = ins->ipdn_corrupt;
                        configs[cur].ipii_delay = ins->ipdn_delay;
                        configs[cur].ipii_drop = ins->ipdn_drop;
                        mutex_exit(&ins->ipdn_lock);

                        ++cur;
                }

                if (zid != GLOBAL_ZONEID && zid == ins->ipdn_zoneid)
                        break;
        }
        mutex_exit(&ipd_nsl_lock);

        ASSERT(zid != GLOBAL_ZONEID || cur == azones);

        if (cur == 0)
                STRUCT_FSET(h, ipil_nzones, 0);
        else
                STRUCT_FSET(h, ipil_nzones, cur);

        nzones = MIN(cur, rzones);
        if (nzones > 0) {
                if (ddi_copyout(configs, STRUCT_FGETP(h, ipil_info),
                    nzones * sizeof (ipd_ioc_info_t), 0) != 0)
                        rval = EFAULT;
        }

        kmem_free(configs, sizeof (ipd_ioc_info_t) * azones);
        if (ddi_copyout(STRUCT_BUF(h), (void *)arg, STRUCT_SIZE(h), 0) != 0)
                return (EFAULT);

        return (rval);
}
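
/*
 * The expected userland pattern for IPDIOC_LIST is therefore a two-step
 * sizing dance, sketched below: first ask the kernel how many entries to
 * allocate (ipil_nzones == 0), then fetch the per-zone configurations into
 * an ipd_ioc_info_t array pointed to by ipil_info. As with the other
 * sketches, this assumes the structure definitions in <sys/ipd.h> are usable
 * from userland and skips error handling:
 *
 *	struct ipd_ioc_list ipil;
 *	ipd_ioc_info_t *info;
 *
 *	ipil.ipil_nzones = 0;
 *	(void) ioctl(fd, IPDIOC_LIST, &ipil);
 *	info = calloc(ipil.ipil_nzones, sizeof (*info));
 *	ipil.ipil_info = info;
 *	(void) ioctl(fd, IPDIOC_LIST, &ipil);
 */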

static void *
ipd_nin_create(const netid_t id)
{
        ipd_netstack_t *ins;
        ipd_nskstat_t *ink;

        ins = kmem_zalloc(sizeof (ipd_netstack_t), KM_SLEEP);
        ins->ipdn_status = IPDN_STATUS_DISABLED;
        ins->ipdn_netid = id;
        ins->ipdn_zoneid = netstackid_to_zoneid(id);
        ins->ipdn_rand = gethrtime();
        mutex_init(&ins->ipdn_lock, NULL, MUTEX_DRIVER, NULL);

        ins->ipdn_kstat = net_kstat_create(id, "ipd", ins->ipdn_zoneid,
            "ipd", "net", KSTAT_TYPE_NAMED,
            sizeof (ipd_nskstat_t) / sizeof (kstat_named_t),
            KSTAT_FLAG_VIRTUAL);

        if (ins->ipdn_kstat != NULL) {
                if (ins->ipdn_zoneid != GLOBAL_ZONEID)
                        kstat_zone_add(ins->ipdn_kstat, GLOBAL_ZONEID);

                ink = &ins->ipdn_ksdata;
                ins->ipdn_kstat->ks_data = ink;
                kstat_named_init(&ink->ink_ncorrupts, "corrupts",
                    KSTAT_DATA_UINT64);
                kstat_named_init(&ink->ink_ndrops, "drops",
                    KSTAT_DATA_UINT64);
                kstat_named_init(&ink->ink_ndelays, "delays",
                    KSTAT_DATA_UINT64);
                kstat_install(ins->ipdn_kstat);
        }

        mutex_enter(&ipd_nsl_lock);
        list_insert_tail(&ipd_nsl, ins);
        mutex_exit(&ipd_nsl_lock);

        return (ins);
}

static void
ipd_nin_shutdown(const netid_t id, void *arg)
{
        ipd_netstack_t *ins = arg;

        VERIFY(id == ins->ipdn_netid);
        mutex_enter(&ins->ipdn_lock);
        ASSERT(ins->ipdn_status == IPDN_STATUS_DISABLED ||
            ins->ipdn_status == IPDN_STATUS_ENABLED);
        ins->ipdn_status |= IPDN_STATUS_CONDEMNED;
        if (ins->ipdn_kstat != NULL)
                net_kstat_delete(id, ins->ipdn_kstat);
        mutex_exit(&ins->ipdn_lock);
}

/*ARGSUSED*/
static void
ipd_nin_destroy(const netid_t id, void *arg)
{
        ipd_netstack_t *ins = arg;

        /*
         * At this point none of the hooks should be able to fire because the
         * zone has been shutdown and we are in the process of destroying it.
         * Thus it should not be possible for someone else to come in and
         * grab our ipd_netstack_t for this zone. Because of that, we know
         * that we are the only ones who could be running here.
         */
        mutex_enter(&ipd_nsl_lock);
        list_remove(&ipd_nsl, ins);
        mutex_exit(&ipd_nsl_lock);

        if (ins->ipdn_hooked)
                ipd_teardown_hooks(ins);
        mutex_destroy(&ins->ipdn_lock);
        kmem_free(ins, sizeof (ipd_netstack_t));
}

/*ARGSUSED*/
static int
ipd_open(dev_t *devp, int flag, int otype, cred_t *credp)
{
        if (flag & FEXCL || flag & FNDELAY)
                return (EINVAL);

        if (otype != OTYP_CHR)
                return (EINVAL);

        if (!(flag & FREAD && flag & FWRITE))
                return (EINVAL);

        if (secpolicy_ip_config(credp, B_FALSE) != 0)
                return (EPERM);

        return (0);
}

/*ARGSUSED*/
static int
ipd_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv)
{
        int rval;
        ipd_ioc_perturb_t ipip;

        switch (cmd) {
        case IPDIOC_CORRUPT:
        case IPDIOC_DELAY:
        case IPDIOC_DROP:
                if (ddi_copyin((void *)arg, &ipip, sizeof (ipd_ioc_perturb_t),
                    0) != 0)
                        return (EFAULT);
                rval = ipd_ioctl_perturb(&ipip, cr, cmd);
                return (rval);
        case IPDIOC_REMOVE:
                if (ddi_copyin((void *)arg, &ipip, sizeof (ipd_ioc_perturb_t),
                    0) != 0)
                        return (EFAULT);
                rval = ipd_ioctl_remove(&ipip, cr);
                return (rval);
        case IPDIOC_LIST:
                /*
                 * Because the list ioctl doesn't have a fixed-size struct
                 * due to needing to pass around a pointer, we instead
                 * delegate the copyin logic to the list code.
                 */
                return (ipd_ioctl_list(arg, cr));
        default:
                break;
        }

        return (ENOTTY);
}

/*ARGSUSED*/
static int
ipd_close(dev_t dev, int flag, int otype, cred_t *credp)
{
        return (0);
}

static int
ipd_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
        minor_t instance;

        if (cmd != DDI_ATTACH)
                return (DDI_FAILURE);

        if (ipd_devi != NULL)
                return (DDI_FAILURE);

        instance = ddi_get_instance(dip);
        if (ddi_create_minor_node(dip, "ipd", S_IFCHR, instance,
            DDI_PSEUDO, 0) == DDI_FAILURE)
                return (DDI_FAILURE);

        ipd_neti = net_instance_alloc(NETINFO_VERSION);
        if (ipd_neti == NULL) {
                ddi_remove_minor_node(dip, NULL);
                return (DDI_FAILURE);
        }

        /*
         * Note that these global structures MUST be initialized before we
         * call net_instance_register, as that will instantly cause us to
         * drive into the ipd_nin_create callbacks.
         */
        list_create(&ipd_nsl, sizeof (ipd_netstack_t),
            offsetof(ipd_netstack_t, ipdn_link));
        mutex_init(&ipd_nsl_lock, NULL, MUTEX_DRIVER, NULL);
        mutex_init(&ipd_nactive_lock, NULL, MUTEX_DRIVER, NULL);

        /* Note, net_instance_alloc sets the version. */
        ipd_neti->nin_name = "ipd";
        ipd_neti->nin_create = ipd_nin_create;
        ipd_neti->nin_destroy = ipd_nin_destroy;
        ipd_neti->nin_shutdown = ipd_nin_shutdown;
        if (net_instance_register(ipd_neti) == DDI_FAILURE) {
                net_instance_free(ipd_neti);
                ipd_neti = NULL;
                ddi_remove_minor_node(dip, NULL);
                return (DDI_FAILURE);
        }

        ddi_report_dev(dip);
        ipd_devi = dip;

        return (DDI_SUCCESS);
}

/*ARGSUSED*/
static int
ipd_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd, void *arg, void **result)
{
        int error;

        switch (infocmd) {
        case DDI_INFO_DEVT2DEVINFO:
                *result = ipd_devi;
                error = DDI_SUCCESS;
                break;
        case DDI_INFO_DEVT2INSTANCE:
                *result = (void *)(uintptr_t)getminor((dev_t)arg);
                error = DDI_SUCCESS;
                break;
        default:
                error = DDI_FAILURE;
                break;
        }

        return (error);
}

static int
ipd_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
        if (cmd != DDI_DETACH)
                return (DDI_FAILURE);

        mutex_enter(&ipd_nactive_lock);
        if (ipd_nactive > 0) {
                mutex_exit(&ipd_nactive_lock);
                return (DDI_FAILURE);
        }
        mutex_exit(&ipd_nactive_lock);
        ASSERT(dip == ipd_devi);
        ddi_remove_minor_node(dip, NULL);
        ipd_devi = NULL;

        if (ipd_neti != NULL) {
                VERIFY(net_instance_unregister(ipd_neti) == 0);
                net_instance_free(ipd_neti);
        }

        mutex_destroy(&ipd_nsl_lock);
        mutex_destroy(&ipd_nactive_lock);
        list_destroy(&ipd_nsl);

        return (DDI_SUCCESS);
}

static struct cb_ops ipd_cb_ops = {
        ipd_open,               /* open */
        ipd_close,              /* close */
        nodev,                  /* strategy */
        nodev,                  /* print */
        nodev,                  /* dump */
        nodev,                  /* read */
        nodev,                  /* write */
        ipd_ioctl,              /* ioctl */
        nodev,                  /* devmap */
        nodev,                  /* mmap */
        nodev,                  /* segmap */
        nochpoll,               /* poll */
        ddi_prop_op,            /* cb_prop_op */
        NULL,                   /* streamtab */
        D_NEW | D_MP,           /* Driver compatibility flag */
        CB_REV,                 /* rev */
        nodev,                  /* aread */
        nodev                   /* awrite */
};

static struct dev_ops ipd_ops = {
        DEVO_REV,               /* devo_rev */
        0,                      /* refcnt */
        ipd_getinfo,            /* get_dev_info */
        nulldev,                /* identify */
        nulldev,                /* probe */
        ipd_attach,             /* attach */
        ipd_detach,             /* detach */
        nodev,                  /* reset */
        &ipd_cb_ops,            /* driver operations */
        NULL,                   /* bus operations */
        nodev,                  /* dev power */
        ddi_quiesce_not_needed  /* quiesce */
};

static struct modldrv modldrv = {
        &mod_driverops,
        "Internet packet disturber",
        &ipd_ops
};

static struct modlinkage modlinkage = {
        MODREV_1,
        { (void *)&modldrv, NULL }
};

int
_init(void)
{
        return (mod_install(&modlinkage));
}

int
_info(struct modinfo *modinfop)
{
        return (mod_info(&modlinkage, modinfop));
}

int
_fini(void)
{
        return (mod_remove(&modlinkage));
}