/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 *
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 */

#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

/*
 * Convenience macros for getting the ip_stack_t associated with an
 * ipmp_illgrp_t or ipmp_grp_t.
 */
#define	IPMP_GRP_TO_IPST(grp)		PHYINT_TO_IPST((grp)->gr_phyint)
#define	IPMP_ILLGRP_TO_IPST(illg)	((illg)->ig_ipmp_ill->ill_ipst)

/*
 * Assorted constants that aren't important enough to be tunable.
 */
#define	IPMP_GRP_HASH_SIZE		64
#define	IPMP_ILL_REFRESH_TIMEOUT	120	/* seconds */

/*
 * Templates for IPMP ARP messages.
 */
static const arie_t ipmp_aract_template = {
	AR_IPMP_ACTIVATE,
	sizeof (arie_t),		/* Name offset */
	sizeof (arie_t)			/* Name length (set by ill_arp_alloc) */
};

static const arie_t ipmp_ardeact_template = {
	AR_IPMP_DEACTIVATE,
	sizeof (arie_t),		/* Name offset */
	sizeof (arie_t)			/* Name length (set by ill_arp_alloc) */
};

/*
 * IPMP meta-interface kstats (based on those in PSARC/1997/198).
 */
static const kstat_named_t ipmp_kstats[IPMP_KSTAT_MAX] = {
	{ "obytes",	KSTAT_DATA_UINT32 },
	{ "obytes64",	KSTAT_DATA_UINT64 },
	{ "rbytes",	KSTAT_DATA_UINT32 },
	{ "rbytes64",	KSTAT_DATA_UINT64 },
	{ "opackets",	KSTAT_DATA_UINT32 },
	{ "opackets64",	KSTAT_DATA_UINT64 },
	{ "oerrors",	KSTAT_DATA_UINT32 },
	{ "ipackets",	KSTAT_DATA_UINT32 },
	{ "ipackets64",	KSTAT_DATA_UINT64 },
	{ "ierrors",	KSTAT_DATA_UINT32 },
	{ "multircv",	KSTAT_DATA_UINT32 },
	{ "multixmt",	KSTAT_DATA_UINT32 },
	{ "brdcstrcv",	KSTAT_DATA_UINT32 },
	{ "brdcstxmt",	KSTAT_DATA_UINT32 },
	{ "link_up",	KSTAT_DATA_UINT32 }
};

static void	ipmp_grp_insert(ipmp_grp_t *, mod_hash_hndl_t);
static int	ipmp_grp_create_kstats(ipmp_grp_t *);
static int	ipmp_grp_update_kstats(kstat_t *, int);
static void	ipmp_grp_destroy_kstats(ipmp_grp_t *);
static ill_t	*ipmp_illgrp_min_ill(ipmp_illgrp_t *);
static ill_t	*ipmp_illgrp_max_ill(ipmp_illgrp_t *);
static void	ipmp_illgrp_set_cast(ipmp_illgrp_t *, ill_t *);
static void	ipmp_illgrp_set_mtu(ipmp_illgrp_t *, uint_t);
static boolean_t ipmp_ill_activate(ill_t *);
static void	ipmp_ill_deactivate(ill_t *);
static void	ipmp_ill_ire_mark_testhidden(ire_t *, char *);
static void	ipmp_ill_ire_clear_testhidden(ire_t *, char *);
static void	ipmp_ill_refresh_active_timer_start(ill_t *);
static void	ipmp_ill_rtsaddrmsg(ill_t *, int);
static void	ipmp_ill_bind_ipif(ill_t *, ipif_t *, enum ip_resolver_action);
static ipif_t	*ipmp_ill_unbind_ipif(ill_t *, ipif_t *, boolean_t);
static void	ipmp_phyint_get_kstats(phyint_t *, uint64_t *);
static boolean_t ipmp_ipif_is_up_dataaddr(const ipif_t *);

/*
 * Initialize IPMP state for IP stack `ipst'; called from ip_stack_init().
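 *
 * A rough sketch of the expected lifecycle, mirroring this comment and the
 * one on ipmp_destroy() below (hypothetical callers, shown for illustration):
 *
 *	ip_stack_init()  -> ipmp_init(ipst)	creates ips_ipmp_grp_hash and
 *						initializes ips_ipmp_lock
 *	ip_stack_fini()  -> ipmp_destroy(ipst)	tears both back down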
*/ void ipmp_init(ip_stack_t *ipst) { ipst->ips_ipmp_grp_hash = mod_hash_create_extended("ipmp_grp_hash", IPMP_GRP_HASH_SIZE, mod_hash_null_keydtor, mod_hash_null_valdtor, mod_hash_bystr, NULL, mod_hash_strkey_cmp, KM_SLEEP); rw_init(&ipst->ips_ipmp_lock, NULL, RW_DEFAULT, 0); } /* * Destroy IPMP state for IP stack `ipst'; called from ip_stack_fini(). */ void ipmp_destroy(ip_stack_t *ipst) { mod_hash_destroy_hash(ipst->ips_ipmp_grp_hash); rw_destroy(&ipst->ips_ipmp_lock); } /* * Create an IPMP group named `grname', associate it with IPMP phyint `phyi', * and add it to the hash. On success, return a pointer to the created group. * Caller must ensure `grname' is not yet in the hash. Assumes that the IPMP * meta-interface associated with the group also has the same name (but they * may differ later via ipmp_grp_rename()). */ ipmp_grp_t * ipmp_grp_create(const char *grname, phyint_t *phyi) { ipmp_grp_t *grp; ip_stack_t *ipst = PHYINT_TO_IPST(phyi); mod_hash_hndl_t mh; ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock)); if ((grp = kmem_zalloc(sizeof (ipmp_grp_t), KM_NOSLEEP)) == NULL) return (NULL); (void) strlcpy(grp->gr_name, grname, sizeof (grp->gr_name)); (void) strlcpy(grp->gr_ifname, grname, sizeof (grp->gr_ifname)); /* * Cache the group's phyint. This is safe since a phyint_t will * outlive its ipmp_grp_t. */ grp->gr_phyint = phyi; /* * Create IPMP group kstats. */ if (ipmp_grp_create_kstats(grp) != 0) { kmem_free(grp, sizeof (ipmp_grp_t)); return (NULL); } /* * Insert the group into the hash. */ if (mod_hash_reserve_nosleep(ipst->ips_ipmp_grp_hash, &mh) != 0) { ipmp_grp_destroy_kstats(grp); kmem_free(grp, sizeof (ipmp_grp_t)); return (NULL); } ipmp_grp_insert(grp, mh); return (grp); } /* * Create IPMP kstat structures for `grp'. Return an errno upon failure. */ static int ipmp_grp_create_kstats(ipmp_grp_t *grp) { kstat_t *ksp; netstackid_t id = IPMP_GRP_TO_IPST(grp)->ips_netstack->netstack_stackid; ksp = kstat_create_netstack("ipmp", 0, grp->gr_ifname, "net", KSTAT_TYPE_NAMED, IPMP_KSTAT_MAX, 0, id); if (ksp == NULL) return (ENOMEM); ksp->ks_update = ipmp_grp_update_kstats; ksp->ks_private = grp; bcopy(ipmp_kstats, ksp->ks_data, sizeof (ipmp_kstats)); kstat_install(ksp); grp->gr_ksp = ksp; return (0); } /* * Update the IPMP kstats tracked by `ksp'; called by the kstats framework. */ static int ipmp_grp_update_kstats(kstat_t *ksp, int rw) { uint_t i; kstat_named_t *kn = KSTAT_NAMED_PTR(ksp); ipmp_grp_t *grp = ksp->ks_private; ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); ipsq_t *ipsq, *grp_ipsq = grp->gr_phyint->phyint_ipsq; phyint_t *phyi; uint64_t phyi_kstats[IPMP_KSTAT_MAX]; if (rw == KSTAT_WRITE) return (EACCES); /* * Start with the group's baseline values. */ for (i = 0; i < IPMP_KSTAT_MAX; i++) { if (kn[i].data_type == KSTAT_DATA_UINT32) { kn[i].value.ui32 = grp->gr_kstats0[i]; } else { ASSERT(kn[i].data_type == KSTAT_DATA_UINT64); kn[i].value.ui64 = grp->gr_kstats0[i]; } } /* * Add in the stats of each phyint currently in the group. Since we * don't directly track the phyints in a group, we cheat by walking * the IPSQ set under ill_g_lock. (The IPSQ list cannot change while * ill_g_lock is held.) 
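 *
 * Put differently, for each kstat `i' the value reported below works out to
 * (a sketch of the arithmetic, not additional code):
 *
 *	kn[i] = gr_kstats0[i] +
 *	    SUM over in-group phyints of (current kstat[i] - phyint_kstats0[i])
 *
 * where gr_kstats0[] holds the final deltas of phyints that have since left
 * the group (see ipmp_phyint_leave_grp()).  IPMP_KSTAT_LINK_UP is then
 * overridden from PHYI_RUNNING.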
*/ rw_enter(&ipst->ips_ill_g_lock, RW_READER); ipsq = grp_ipsq->ipsq_next; for (; ipsq != grp_ipsq; ipsq = ipsq->ipsq_next) { phyi = ipsq->ipsq_phyint; /* * If a phyint in a group is being unplumbed, it's possible * that ill_glist_delete() -> phyint_free() already freed the * phyint (and set ipsq_phyint to NULL), but the unplumb * operation has yet to complete (and thus ipsq_dq() has yet * to remove the phyint's IPSQ from the group IPSQ's phyint * list). We skip those phyints here (note that their kstats * have already been added to gr_kstats0[]). */ if (phyi == NULL) continue; ipmp_phyint_get_kstats(phyi, phyi_kstats); for (i = 0; i < IPMP_KSTAT_MAX; i++) { phyi_kstats[i] -= phyi->phyint_kstats0[i]; if (kn[i].data_type == KSTAT_DATA_UINT32) kn[i].value.ui32 += phyi_kstats[i]; else kn[i].value.ui64 += phyi_kstats[i]; } } kn[IPMP_KSTAT_LINK_UP].value.ui32 = (grp->gr_phyint->phyint_flags & PHYI_RUNNING) != 0; rw_exit(&ipst->ips_ill_g_lock); return (0); } /* * Destroy IPMP kstat structures for `grp'. */ static void ipmp_grp_destroy_kstats(ipmp_grp_t *grp) { netstackid_t id = IPMP_GRP_TO_IPST(grp)->ips_netstack->netstack_stackid; kstat_delete_netstack(grp->gr_ksp, id); bzero(grp->gr_kstats0, sizeof (grp->gr_kstats0)); grp->gr_ksp = NULL; } /* * Look up an IPMP group named `grname' on IP stack `ipst'. Return NULL if it * does not exist. */ ipmp_grp_t * ipmp_grp_lookup(const char *grname, ip_stack_t *ipst) { ipmp_grp_t *grp; ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock)); if (mod_hash_find(ipst->ips_ipmp_grp_hash, (mod_hash_key_t)grname, (mod_hash_val_t *)&grp) == 0) return (grp); return (NULL); } /* * Place information about group `grp' into `lifgr'. */ void ipmp_grp_info(const ipmp_grp_t *grp, lifgroupinfo_t *lifgr) { ill_t *ill; ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock)); lifgr->gi_v4 = (grp->gr_v4 != NULL); lifgr->gi_v6 = (grp->gr_v6 != NULL); lifgr->gi_nv4 = grp->gr_nv4 + grp->gr_pendv4; lifgr->gi_nv6 = grp->gr_nv6 + grp->gr_pendv6; lifgr->gi_mactype = grp->gr_nif > 0 ? grp->gr_mactype : SUNW_DL_IPMP; (void) strlcpy(lifgr->gi_grifname, grp->gr_ifname, LIFNAMSIZ); lifgr->gi_m4ifname[0] = '\0'; lifgr->gi_m6ifname[0] = '\0'; lifgr->gi_bcifname[0] = '\0'; if (grp->gr_v4 != NULL && (ill = grp->gr_v4->ig_cast_ill) != NULL) { (void) strlcpy(lifgr->gi_m4ifname, ill->ill_name, LIFNAMSIZ); (void) strlcpy(lifgr->gi_bcifname, ill->ill_name, LIFNAMSIZ); } if (grp->gr_v6 != NULL && (ill = grp->gr_v6->ig_cast_ill) != NULL) (void) strlcpy(lifgr->gi_m6ifname, ill->ill_name, LIFNAMSIZ); } /* * Insert `grp' into the hash using the reserved hash entry `mh'. * Caller must ensure `grp' is not yet in the hash. */ static void ipmp_grp_insert(ipmp_grp_t *grp, mod_hash_hndl_t mh) { int err; ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock)); /* * Since grp->gr_name will exist at least as long as `grp' is in the * hash, we use it directly as the key. */ err = mod_hash_insert_reserve(ipst->ips_ipmp_grp_hash, (mod_hash_key_t)grp->gr_name, (mod_hash_val_t)grp, mh); if (err != 0) { /* * This should never happen since `mh' was preallocated. */ panic("cannot insert IPMP group \"%s\" (err %d)", grp->gr_name, err); } } /* * Remove `grp' from the hash. Caller must ensure `grp' is in it. 
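 *
 * For example, ipmp_grp_rename() below pairs this with ipmp_grp_insert(),
 * reserving a hash handle up front so the re-insertion cannot fail:
 *
 *	if (mod_hash_reserve_nosleep(ipst->ips_ipmp_grp_hash, &mh) != 0)
 *		return (ENOMEM);
 *	ipmp_grp_remove(grp);
 *	(void) strlcpy(grp->gr_name, grname, sizeof (grp->gr_name));
 *	ipmp_grp_insert(grp, mh);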
*/ static void ipmp_grp_remove(ipmp_grp_t *grp) { int err; mod_hash_val_t val; mod_hash_key_t key = (mod_hash_key_t)grp->gr_name; ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock)); err = mod_hash_remove(ipst->ips_ipmp_grp_hash, key, &val); if (err != 0 || val != grp) { panic("cannot remove IPMP group \"%s\" (err %d)", grp->gr_name, err); } } /* * Attempt to rename `grp' to new name `grname'. Return an errno if the new * group name already exists or is invalid, or if there isn't enough memory. */ int ipmp_grp_rename(ipmp_grp_t *grp, const char *grname) { mod_hash_hndl_t mh; ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock)); if (grname[0] == '\0') return (EINVAL); if (mod_hash_find(ipst->ips_ipmp_grp_hash, (mod_hash_key_t)grname, (mod_hash_val_t *)&grp) != MH_ERR_NOTFOUND) return (EEXIST); /* * Before we remove the group from the hash, ensure we'll be able to * re-insert it by reserving space. */ if (mod_hash_reserve_nosleep(ipst->ips_ipmp_grp_hash, &mh) != 0) return (ENOMEM); ipmp_grp_remove(grp); (void) strlcpy(grp->gr_name, grname, sizeof (grp->gr_name)); ipmp_grp_insert(grp, mh); return (0); } /* * Destroy `grp' and remove it from the hash. Caller must ensure `grp' is in * the hash, and that there are no interfaces on it. */ void ipmp_grp_destroy(ipmp_grp_t *grp) { ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock)); /* * If there are still interfaces using this group, panic before things * go really off the rails. */ if (grp->gr_nif != 0) panic("cannot destroy IPMP group \"%s\": in use", grp->gr_name); ipmp_grp_remove(grp); ipmp_grp_destroy_kstats(grp); ASSERT(grp->gr_v4 == NULL); ASSERT(grp->gr_v6 == NULL); ASSERT(grp->gr_nv4 == 0); ASSERT(grp->gr_nv6 == 0); ASSERT(grp->gr_nactif == 0); ASSERT(grp->gr_linkdownmp == NULL); grp->gr_phyint = NULL; kmem_free(grp, sizeof (ipmp_grp_t)); } /* * Check whether `ill' is suitable for inclusion into `grp', and return an * errno describing the problem (if any). NOTE: many of these errno values * are interpreted by ifconfig, which will take corrective action and retry * the SIOCSLIFGROUPNAME, so please exercise care when changing them. */ static int ipmp_grp_vet_ill(ipmp_grp_t *grp, ill_t *ill) { ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); ASSERT(IAM_WRITER_ILL(ill)); ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock)); /* * To sidestep complicated address migration logic in the kernel and * to force the kernel's all-hosts multicast memberships to be blown * away, all addresses that had been brought up must be brought back * down prior to adding an interface to a group. (This includes * addresses currently down due to DAD.) Once the interface has been * added to the group, its addresses can then be brought back up, at * which point they will be moved to the IPMP meta-interface. * NOTE: we do this before ill_appaddr_cnt() since bringing down the * link-local causes in.ndpd to remove its ADDRCONF'd addresses. */ if (ill->ill_ipif_up_count + ill->ill_ipif_dup_count > 0) return (EADDRINUSE); /* * To avoid confusing applications by changing addresses that are * under their control, all such control must be removed prior to * adding an interface into a group. */ if (ill_appaddr_cnt(ill) != 0) return (EADDRNOTAVAIL); /* * Since PTP addresses do not share the same broadcast domain, they * are not allowed to be in an IPMP group. */ if (ill_ptpaddr_cnt(ill) != 0) return (EINVAL); /* * An ill must support multicast to be allowed into a group. 
*/ if (!(ill->ill_flags & ILLF_MULTICAST)) return (ENOTSUP); /* * An ill must strictly be using ARP and/or ND for address * resolution for it to be allowed into a group. */ if (ill->ill_flags & (ILLF_NONUD | ILLF_NOARP | ILLF_XRESOLV)) return (ENOTSUP); /* * An ill cannot also be using usesrc groups. (Although usesrc uses * ill_g_usesrc_lock, we don't need to grab it since usesrc also does * all its modifications as writer.) */ if (IS_USESRC_ILL(ill) || IS_USESRC_CLI_ILL(ill)) return (ENOTSUP); /* * All ills in a group must be the same mactype. */ if (grp->gr_nif > 0 && grp->gr_mactype != ill->ill_mactype) return (EINVAL); return (0); } /* * Check whether `phyi' is suitable for inclusion into `grp', and return an * errno describing the problem (if any). See comment above ipmp_grp_vet_ill() * regarding errno values. */ int ipmp_grp_vet_phyint(ipmp_grp_t *grp, phyint_t *phyi) { int err = 0; ip_stack_t *ipst = IPMP_GRP_TO_IPST(grp); ASSERT(IAM_WRITER_IPSQ(phyi->phyint_ipsq)); ASSERT(RW_LOCK_HELD(&ipst->ips_ipmp_lock)); /* * An interface cannot have address families plumbed that are not * configured in the group. */ if (phyi->phyint_illv4 != NULL && grp->gr_v4 == NULL || phyi->phyint_illv6 != NULL && grp->gr_v6 == NULL) return (EAFNOSUPPORT); if (phyi->phyint_illv4 != NULL) err = ipmp_grp_vet_ill(grp, phyi->phyint_illv4); if (err == 0 && phyi->phyint_illv6 != NULL) err = ipmp_grp_vet_ill(grp, phyi->phyint_illv6); return (err); } /* * Create a new illgrp on IPMP meta-interface `ill'. */ ipmp_illgrp_t * ipmp_illgrp_create(ill_t *ill) { uint_t mtu = ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU; ipmp_illgrp_t *illg; ASSERT(IAM_WRITER_ILL(ill)); ASSERT(IS_IPMP(ill)); ASSERT(ill->ill_grp == NULL); if ((illg = kmem_zalloc(sizeof (ipmp_illgrp_t), KM_NOSLEEP)) == NULL) return (NULL); list_create(&illg->ig_if, sizeof (ill_t), offsetof(ill_t, ill_grpnode)); list_create(&illg->ig_actif, sizeof (ill_t), offsetof(ill_t, ill_actnode)); list_create(&illg->ig_arpent, sizeof (ipmp_arpent_t), offsetof(ipmp_arpent_t, ia_node)); illg->ig_ipmp_ill = ill; ill->ill_grp = illg; ipmp_illgrp_set_mtu(illg, mtu); return (illg); } /* * Destroy illgrp `illg', and disconnect it from its IPMP meta-interface. */ void ipmp_illgrp_destroy(ipmp_illgrp_t *illg) { ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); ASSERT(IS_IPMP(illg->ig_ipmp_ill)); /* * Verify `illg' is empty. */ ASSERT(illg->ig_next_ill == NULL); ASSERT(illg->ig_cast_ill == NULL); ASSERT(list_is_empty(&illg->ig_arpent)); ASSERT(list_is_empty(&illg->ig_if)); ASSERT(list_is_empty(&illg->ig_actif)); ASSERT(illg->ig_nactif == 0); /* * Destroy `illg'. */ illg->ig_ipmp_ill->ill_grp = NULL; illg->ig_ipmp_ill = NULL; list_destroy(&illg->ig_if); list_destroy(&illg->ig_actif); list_destroy(&illg->ig_arpent); kmem_free(illg, sizeof (ipmp_illgrp_t)); } /* * Add `ipif' to the pool of usable data addresses on `illg' and attempt to * bind it to an underlying ill, while keeping an even address distribution. * If the bind is successful, return a pointer to the bound ill. */ ill_t * ipmp_illgrp_add_ipif(ipmp_illgrp_t *illg, ipif_t *ipif) { ill_t *minill; ipmp_arpent_t *entp; ASSERT(IAM_WRITER_IPIF(ipif)); ASSERT(ipmp_ipif_is_dataaddr(ipif)); /* * IPMP data address mappings are internally managed by IP itself, so * delete any existing ARP entries associated with the address. 
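 *
 * To illustrate the rebalancing goal (made-up counts): with two active ills
 * currently bound to 2 and 1 data addresses, ipmp_illgrp_min_ill() returns
 * the ill with 1, so after this call both ills are bound to 2 addresses.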
*/ if (!ipif->ipif_isv6) { entp = ipmp_illgrp_lookup_arpent(illg, &ipif->ipif_lcl_addr); if (entp != NULL) ipmp_illgrp_destroy_arpent(illg, entp); } if ((minill = ipmp_illgrp_min_ill(illg)) != NULL) ipmp_ill_bind_ipif(minill, ipif, Res_act_none); return (ipif->ipif_bound ? ipif->ipif_bound_ill : NULL); } /* * Delete `ipif' from the pool of usable data addresses on `illg'. If it's * bound, unbind it from the underlying ill while keeping an even address * distribution. */ void ipmp_illgrp_del_ipif(ipmp_illgrp_t *illg, ipif_t *ipif) { ill_t *maxill, *boundill = ipif->ipif_bound_ill; ASSERT(IAM_WRITER_IPIF(ipif)); if (boundill != NULL) { (void) ipmp_ill_unbind_ipif(boundill, ipif, B_FALSE); maxill = ipmp_illgrp_max_ill(illg); if (maxill->ill_bound_cnt > boundill->ill_bound_cnt + 1) { ipif = ipmp_ill_unbind_ipif(maxill, NULL, B_TRUE); ipmp_ill_bind_ipif(boundill, ipif, Res_act_rebind); } } } /* * Return the active ill with the greatest number of data addresses in `illg'. */ static ill_t * ipmp_illgrp_max_ill(ipmp_illgrp_t *illg) { ill_t *ill, *bestill = NULL; ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); ill = list_head(&illg->ig_actif); for (; ill != NULL; ill = list_next(&illg->ig_actif, ill)) { if (bestill == NULL || ill->ill_bound_cnt > bestill->ill_bound_cnt) { bestill = ill; } } return (bestill); } /* * Return the active ill with the fewest number of data addresses in `illg'. */ static ill_t * ipmp_illgrp_min_ill(ipmp_illgrp_t *illg) { ill_t *ill, *bestill = NULL; ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); ill = list_head(&illg->ig_actif); for (; ill != NULL; ill = list_next(&illg->ig_actif, ill)) { if (bestill == NULL || ill->ill_bound_cnt < bestill->ill_bound_cnt) { if (ill->ill_bound_cnt == 0) return (ill); /* can't get better */ bestill = ill; } } return (bestill); } /* * Return a pointer to IPMP meta-interface for `illg' (which must exist). * Since ig_ipmp_ill never changes for a given illg, no locks are needed. */ ill_t * ipmp_illgrp_ipmp_ill(ipmp_illgrp_t *illg) { return (illg->ig_ipmp_ill); } /* * Return a pointer to the next available underlying ill in `illg', or NULL if * one doesn't exist. Caller must be inside the IPSQ. */ ill_t * ipmp_illgrp_next_ill(ipmp_illgrp_t *illg) { ill_t *ill; ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); if ((ill = illg->ig_next_ill) != NULL) { illg->ig_next_ill = list_next(&illg->ig_actif, ill); if (illg->ig_next_ill == NULL) illg->ig_next_ill = list_head(&illg->ig_actif); } rw_exit(&ipst->ips_ipmp_lock); return (ill); } /* * Return a held pointer to the next available underlying ill in `illg', or * NULL if one doesn't exist. Caller need not be inside the IPSQ. */ ill_t * ipmp_illgrp_hold_next_ill(ipmp_illgrp_t *illg) { ill_t *ill; uint_t i; ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); for (i = 0; i < illg->ig_nactif; i++) { ill = illg->ig_next_ill; illg->ig_next_ill = list_next(&illg->ig_actif, ill); if (illg->ig_next_ill == NULL) illg->ig_next_ill = list_head(&illg->ig_actif); if (ill_check_and_refhold(ill) == 0) { rw_exit(&ipst->ips_ipmp_lock); return (ill); } } rw_exit(&ipst->ips_ipmp_lock); return (NULL); } /* * Return a pointer to the nominated multicast ill in `illg', or NULL if one * doesn't exist. Caller must be inside the IPSQ. 
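 *
 * Callers that are not inside the IPSQ should use
 * ipmp_illgrp_hold_cast_ill() below instead; a minimal sketch of such a
 * caller (the ill_refrele() pairing is the caller's responsibility):
 *
 *	if ((ill = ipmp_illgrp_hold_cast_ill(illg)) != NULL) {
 *		... transmit the multicast or broadcast packet on ill ...
 *		ill_refrele(ill);
 *	}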
*/ ill_t * ipmp_illgrp_cast_ill(ipmp_illgrp_t *illg) { ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); return (illg->ig_cast_ill); } /* * Return a held pointer to the nominated multicast ill in `illg', or NULL if * one doesn't exist. Caller need not be inside the IPSQ. */ ill_t * ipmp_illgrp_hold_cast_ill(ipmp_illgrp_t *illg) { ill_t *castill; ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); rw_enter(&ipst->ips_ipmp_lock, RW_READER); castill = illg->ig_cast_ill; if (castill != NULL && ill_check_and_refhold(castill) == 0) { rw_exit(&ipst->ips_ipmp_lock); return (castill); } rw_exit(&ipst->ips_ipmp_lock); return (NULL); } /* * Set the nominated cast ill on `illg' to `castill'. If `castill' is NULL, * any existing nomination is removed. Caller must be inside the IPSQ. */ static void ipmp_illgrp_set_cast(ipmp_illgrp_t *illg, ill_t *castill) { ill_t *ocastill = illg->ig_cast_ill; ill_t *ipmp_ill = illg->ig_ipmp_ill; ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); ASSERT(IAM_WRITER_ILL(ipmp_ill)); /* * Disable old nominated ill (if any). */ if (ocastill != NULL) { DTRACE_PROBE2(ipmp__illgrp__cast__disable, ipmp_illgrp_t *, illg, ill_t *, ocastill); ASSERT(ocastill->ill_nom_cast); ocastill->ill_nom_cast = B_FALSE; /* * If the IPMP meta-interface is down, we never did the join, * so we must not try to leave. */ if (ipmp_ill->ill_dl_up) ill_leave_multicast(ipmp_ill); } /* * Set new nomination. */ rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); illg->ig_cast_ill = castill; rw_exit(&ipst->ips_ipmp_lock); if (ocastill != NULL) { /* * Delete any IREs tied to the old nomination. We must do * this after the new castill is set and has reached global * visibility since the datapath has not been quiesced. */ ire_walk_ill(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE, ill_stq_cache_delete, ocastill, ocastill); } /* * Enable new nominated ill (if any). */ if (castill != NULL) { DTRACE_PROBE2(ipmp__illgrp__cast__enable, ipmp_illgrp_t *, illg, ill_t *, castill); ASSERT(!castill->ill_nom_cast); castill->ill_nom_cast = B_TRUE; /* * If the IPMP meta-interface is down, the attempt to recover * will silently fail but ill_need_recover_multicast will be * erroneously cleared -- so check first. */ if (ipmp_ill->ill_dl_up) ill_recover_multicast(ipmp_ill); } /* * For IPv4, refresh our broadcast IREs. This needs to be done even * if there's no new nomination since ill_refresh_bcast() still must * update the IPMP meta-interface's broadcast IREs to point back at * the IPMP meta-interface itself. */ if (!ipmp_ill->ill_isv6) ill_refresh_bcast(ipmp_ill); } /* * Create an IPMP ARP entry and add it to the set tracked on `illg'. If an * entry for the same IP address already exists, destroy it first. Return the * created IPMP ARP entry, or NULL on failure. 
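 *
 * For reference, the life of an entry created here (a summary of the
 * routines below, not additional code):
 *
 *	ipmp_illgrp_create_arpent()	entry added to the ig_arpent list
 *	ipmp_illgrp_refresh_arpent()	a copy of ia_area_mp is sent to ARP
 *					via the IPMP ill's read queue
 *	ipmp_illgrp_mark_arpent()	ia_notified set; unmarked entries are
 *					retried by a later refresh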
*/ ipmp_arpent_t * ipmp_illgrp_create_arpent(ipmp_illgrp_t *illg, mblk_t *mp, boolean_t proxyarp) { uchar_t *addrp; area_t *area = (area_t *)mp->b_rptr; ipmp_arpent_t *entp, *oentp; ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); ASSERT(area->area_proto_addr_length == sizeof (ipaddr_t)); if ((entp = kmem_zalloc(sizeof (ipmp_arpent_t), KM_NOSLEEP)) == NULL) return (NULL); if ((mp = copyb(mp)) == NULL) { kmem_free(entp, sizeof (ipmp_arpent_t)); return (NULL); } DB_TYPE(mp) = M_PROTO; entp->ia_area_mp = mp; entp->ia_proxyarp = proxyarp; addrp = mi_offset_paramc(mp, area->area_proto_addr_offset, sizeof (ipaddr_t)); bcopy(addrp, &entp->ia_ipaddr, sizeof (ipaddr_t)); if ((oentp = ipmp_illgrp_lookup_arpent(illg, &entp->ia_ipaddr)) != NULL) ipmp_illgrp_destroy_arpent(illg, oentp); list_insert_head(&illg->ig_arpent, entp); return (entp); } /* * Remove IPMP ARP entry `entp' from the set tracked on `illg' and destroy it. */ void ipmp_illgrp_destroy_arpent(ipmp_illgrp_t *illg, ipmp_arpent_t *entp) { ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); list_remove(&illg->ig_arpent, entp); freeb(entp->ia_area_mp); kmem_free(entp, sizeof (ipmp_arpent_t)); } /* * Mark that ARP has been notified about the IP address on `entp'; `illg' is * taken as a debugging aid for DTrace FBT probes. */ /* ARGSUSED */ void ipmp_illgrp_mark_arpent(ipmp_illgrp_t *illg, ipmp_arpent_t *entp) { entp->ia_notified = B_TRUE; } /* * Look up the IPMP ARP entry for IP address `addrp' on `illg'; if `addrp' is * NULL, any IPMP ARP entry is requested. Return NULL if it does not exist. */ ipmp_arpent_t * ipmp_illgrp_lookup_arpent(ipmp_illgrp_t *illg, ipaddr_t *addrp) { ipmp_arpent_t *entp = list_head(&illg->ig_arpent); ASSERT(IAM_WRITER_ILL(illg->ig_ipmp_ill)); if (addrp == NULL) return (entp); for (; entp != NULL; entp = list_next(&illg->ig_arpent, entp)) if (entp->ia_ipaddr == *addrp) break; return (entp); } /* * Refresh ARP entries on `illg' to be distributed across its active * interfaces. Entries that cannot be refreshed (e.g., because there are no * active interfaces) are marked so that subsequent calls can try again. */ void ipmp_illgrp_refresh_arpent(ipmp_illgrp_t *illg) { ill_t *ill, *ipmp_ill = illg->ig_ipmp_ill; uint_t paddrlen = ipmp_ill->ill_phys_addr_length; area_t *area; mblk_t *area_mp; uchar_t *physaddr; ipmp_arpent_t *entp; ASSERT(IAM_WRITER_ILL(ipmp_ill)); ASSERT(!ipmp_ill->ill_isv6); ill = list_head(&illg->ig_actif); entp = list_head(&illg->ig_arpent); for (; entp != NULL; entp = list_next(&illg->ig_arpent, entp)) { if (ill == NULL || ipmp_ill->ill_ipif_up_count == 0) { entp->ia_notified = B_FALSE; continue; } area = (area_t *)entp->ia_area_mp->b_rptr; ASSERT(paddrlen == ill->ill_phys_addr_length); ASSERT(paddrlen == area->area_hw_addr_length); physaddr = mi_offset_paramc(entp->ia_area_mp, area->area_hw_addr_offset, paddrlen); /* * If this is a proxy ARP entry, we can skip notifying ARP if * the entry is already up-to-date. If it has changed, we * update the entry's hardware address before notifying ARP. */ if (entp->ia_proxyarp) { if (bcmp(ill->ill_phys_addr, physaddr, paddrlen) == 0 && entp->ia_notified) continue; bcopy(ill->ill_phys_addr, physaddr, paddrlen); } if ((area_mp = copyb(entp->ia_area_mp)) == NULL) { entp->ia_notified = B_FALSE; continue; } putnext(ipmp_ill->ill_rq, area_mp); ipmp_illgrp_mark_arpent(illg, entp); if ((ill = list_next(&illg->ig_actif, ill)) == NULL) ill = list_head(&illg->ig_actif); } } /* * Return an interface in `illg' with the specified `physaddr', or NULL if one * doesn't exist. 
Caller must hold ill_g_lock if it's not inside the IPSQ. */ ill_t * ipmp_illgrp_find_ill(ipmp_illgrp_t *illg, uchar_t *physaddr, uint_t paddrlen) { ill_t *ill; ill_t *ipmp_ill = illg->ig_ipmp_ill; ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); ASSERT(IAM_WRITER_ILL(ipmp_ill) || RW_LOCK_HELD(&ipst->ips_ill_g_lock)); ill = list_head(&illg->ig_if); for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) { if (ill->ill_phys_addr_length == paddrlen && bcmp(ill->ill_phys_addr, physaddr, paddrlen) == 0) return (ill); } return (NULL); } /* * Asynchronously update the MTU for an IPMP ill by injecting a DL_NOTIFY_IND. * Caller must be inside the IPSQ unless this is initialization. */ static void ipmp_illgrp_set_mtu(ipmp_illgrp_t *illg, uint_t mtu) { ill_t *ill = illg->ig_ipmp_ill; mblk_t *mp; ASSERT(illg->ig_mtu == 0 || IAM_WRITER_ILL(ill)); /* * If allocation fails, we have bigger problems than MTU. */ if ((mp = ip_dlnotify_alloc(DL_NOTE_SDU_SIZE, mtu)) != NULL) { illg->ig_mtu = mtu; put(ill->ill_rq, mp); } } /* * Recalculate the IPMP group MTU for `illg', and update its associated IPMP * ill MTU if necessary. */ void ipmp_illgrp_refresh_mtu(ipmp_illgrp_t *illg) { ill_t *ill; ill_t *ipmp_ill = illg->ig_ipmp_ill; uint_t mtu = 0; ASSERT(IAM_WRITER_ILL(ipmp_ill)); /* * Since ill_max_mtu can only change under ill_lock, we hold ill_lock * for each ill as we iterate through the list. Any changes to the * ill_max_mtu will also trigger an update, so even if we missed it * this time around, the update will catch it. */ ill = list_head(&illg->ig_if); for (; ill != NULL; ill = list_next(&illg->ig_if, ill)) { mutex_enter(&ill->ill_lock); if (mtu == 0 || ill->ill_max_mtu < mtu) mtu = ill->ill_max_mtu; mutex_exit(&ill->ill_lock); } /* * MTU must be at least the minimum MTU. */ mtu = MAX(mtu, ipmp_ill->ill_isv6 ? IPV6_MIN_MTU : IP_MIN_MTU); if (illg->ig_mtu != mtu) ipmp_illgrp_set_mtu(illg, mtu); } /* * Link illgrp `illg' to IPMP group `grp'. To simplify the caller, silently * allow the same link to be established more than once. */ void ipmp_illgrp_link_grp(ipmp_illgrp_t *illg, ipmp_grp_t *grp) { ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock)); if (illg->ig_ipmp_ill->ill_isv6) { ASSERT(grp->gr_v6 == NULL || grp->gr_v6 == illg); grp->gr_v6 = illg; } else { ASSERT(grp->gr_v4 == NULL || grp->gr_v4 == illg); grp->gr_v4 = illg; } } /* * Unlink illgrp `illg' from its IPMP group. Return an errno if the illgrp * cannot be unlinked (e.g., because there are still interfaces using it). */ int ipmp_illgrp_unlink_grp(ipmp_illgrp_t *illg) { ipmp_grp_t *grp = illg->ig_ipmp_ill->ill_phyint->phyint_grp; ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); ASSERT(RW_WRITE_HELD(&ipst->ips_ipmp_lock)); if (illg->ig_ipmp_ill->ill_isv6) { if (grp->gr_nv6 + grp->gr_pendv6 != 0) return (EBUSY); grp->gr_v6 = NULL; } else { if (grp->gr_nv4 + grp->gr_pendv4 != 0) return (EBUSY); grp->gr_v4 = NULL; } return (0); } /* * Place `ill' into `illg', and rebalance the data addresses on `illg' * to be spread evenly across the ills now in it. Also, adjust the IPMP * ill as necessary to account for `ill' (e.g., MTU). */ void ipmp_ill_join_illgrp(ill_t *ill, ipmp_illgrp_t *illg) { ill_t *ipmp_ill; ipif_t *ipif; ip_stack_t *ipst = ill->ill_ipst; /* IS_UNDER_IPMP() requires ill_grp to be non-NULL */ ASSERT(!IS_IPMP(ill) && ill->ill_phyint->phyint_grp != NULL); ASSERT(IAM_WRITER_ILL(ill)); ASSERT(ill->ill_grp == NULL); ipmp_ill = illg->ig_ipmp_ill; /* * Account for `ill' joining the illgrp. 
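 *
 * From here the join proceeds roughly as follows (a summary of the code
 * below): bump gr_nv4/gr_nv6, mirror ILLF_ROUTER from the IPMP ill, blow
 * away `ill''s multicast state, reconcile the IPMP ill's physical address
 * length, type, CoS capability and MTU with `ill', insert `ill' on ig_if,
 * hide its IREs from data traffic, and finally ipmp_ill_refresh_active().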
*/ rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); if (ill->ill_isv6) ill->ill_phyint->phyint_grp->gr_nv6++; else ill->ill_phyint->phyint_grp->gr_nv4++; rw_exit(&ipst->ips_ipmp_lock); /* * Ensure the ILLF_ROUTER flag remains consistent across the group. */ mutex_enter(&ill->ill_lock); if (ipmp_ill->ill_flags & ILLF_ROUTER) ill->ill_flags |= ILLF_ROUTER; else ill->ill_flags &= ~ILLF_ROUTER; mutex_exit(&ill->ill_lock); /* * Blow away all multicast memberships that currently exist on `ill'. * This may seem odd, but it's consistent with the application view * that `ill' no longer exists (e.g., due to ipmp_ill_rtsaddrmsg()). */ if (ill->ill_isv6) { reset_conn_ill(ill); reset_mrt_ill(ill); } else { ipif = ill->ill_ipif; for (; ipif != NULL; ipif = ipif->ipif_next) { reset_conn_ipif(ipif); reset_mrt_vif_ipif(ipif); } } ip_purge_allmulti(ill); /* * Borrow the first ill's ill_phys_addr_length value for the illgrp's * physical address length. All other ills must have the same value, * since they are required to all be the same mactype. Also update * the IPMP ill's MTU and CoS marking, if necessary. */ if (list_is_empty(&illg->ig_if)) { ASSERT(ipmp_ill->ill_phys_addr_length == 0); /* * NOTE: we leave ill_phys_addr NULL since the IPMP group * doesn't have a physical address. This means that code must * not assume that ill_phys_addr is non-NULL just because * ill_phys_addr_length is non-zero. Likewise for ill_nd_lla. */ ipmp_ill->ill_phys_addr_length = ill->ill_phys_addr_length; ipmp_ill->ill_nd_lla_len = ill->ill_phys_addr_length; ipmp_ill->ill_type = ill->ill_type; if (ill->ill_flags & ILLF_COS_ENABLED) { mutex_enter(&ipmp_ill->ill_lock); ipmp_ill->ill_flags |= ILLF_COS_ENABLED; mutex_exit(&ipmp_ill->ill_lock); } ipmp_illgrp_set_mtu(illg, ill->ill_max_mtu); } else { ASSERT(ipmp_ill->ill_phys_addr_length == ill->ill_phys_addr_length); ASSERT(ipmp_ill->ill_type == ill->ill_type); if (!(ill->ill_flags & ILLF_COS_ENABLED)) { mutex_enter(&ipmp_ill->ill_lock); ipmp_ill->ill_flags &= ~ILLF_COS_ENABLED; mutex_exit(&ipmp_ill->ill_lock); } if (illg->ig_mtu > ill->ill_max_mtu) ipmp_illgrp_set_mtu(illg, ill->ill_max_mtu); } rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); list_insert_tail(&illg->ig_if, ill); ill->ill_grp = illg; rw_exit(&ipst->ips_ill_g_lock); /* * Hide the IREs on `ill' so that we don't accidentally find them when * sending data traffic. */ ire_walk_ill(MATCH_IRE_ILL, 0, ipmp_ill_ire_mark_testhidden, ill, ill); /* * Merge any broadcast IREs, if need be. */ if (!ill->ill_isv6) ill_refresh_bcast(ill); ipmp_ill_refresh_active(ill); } /* * Remove `ill' from its illgrp, and rebalance the data addresses in that * illgrp to be spread evenly across the remaining ills. Also, adjust the * IPMP ill as necessary now that `ill' is removed (e.g., MTU). */ void ipmp_ill_leave_illgrp(ill_t *ill) { ill_t *ipmp_ill; ipif_t *ipif; ipmp_arpent_t *entp; ipmp_illgrp_t *illg = ill->ill_grp; ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); ASSERT(IS_UNDER_IPMP(ill)); ASSERT(IAM_WRITER_ILL(ill)); ASSERT(illg != NULL); ipmp_ill = illg->ig_ipmp_ill; /* * Cancel IPMP-specific ill timeouts. */ (void) untimeout(ill->ill_refresh_tid); /* * Expose any previously-hidden IREs on `ill'. */ ire_walk_ill(MATCH_IRE_ILL, 0, ipmp_ill_ire_clear_testhidden, ill, ill); /* * Ensure the multicast state for each ipif on `ill' is down so that * our ipif_multicast_up() (once `ill' leaves the group) will rejoin * all eligible groups. 
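 *
 * (For IPv4 this includes, e.g., the all-hosts group membership that
 * ipif_multicast_up() establishes.)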
*/ for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) if (ipif->ipif_flags & IPIF_UP) ipif_multicast_down(ipif); /* * Account for `ill' leaving the illgrp. */ rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); if (ill->ill_isv6) ill->ill_phyint->phyint_grp->gr_nv6--; else ill->ill_phyint->phyint_grp->gr_nv4--; rw_exit(&ipst->ips_ipmp_lock); /* * Pull `ill' out of the interface lists. */ if (list_link_active(&ill->ill_actnode)) ipmp_ill_deactivate(ill); rw_enter(&ipst->ips_ill_g_lock, RW_WRITER); list_remove(&illg->ig_if, ill); ill->ill_grp = NULL; rw_exit(&ipst->ips_ill_g_lock); /* * Recreate any broadcast IREs that had been shared, if need be. */ if (!ill->ill_isv6) ill_refresh_bcast(ill); /* * Re-establish multicast memberships that were previously being * handled by the IPMP meta-interface. */ for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) if (ipif->ipif_flags & IPIF_UP) ipif_multicast_up(ipif); /* * Refresh the group MTU based on the new interface list. */ ipmp_illgrp_refresh_mtu(illg); if (list_is_empty(&illg->ig_if)) { /* * No ills left in the illgrp; we no longer have a physical * address length, nor can we support ARP, CoS, or anything * else that depends on knowing the link layer type. */ while ((entp = ipmp_illgrp_lookup_arpent(illg, NULL)) != NULL) ipmp_illgrp_destroy_arpent(illg, entp); ipmp_ill->ill_phys_addr_length = 0; ipmp_ill->ill_nd_lla_len = 0; ipmp_ill->ill_type = IFT_OTHER; mutex_enter(&ipmp_ill->ill_lock); ipmp_ill->ill_flags &= ~ILLF_COS_ENABLED; mutex_exit(&ipmp_ill->ill_lock); } else { /* * If `ill' didn't support CoS, see if it can now be enabled. */ if (!(ill->ill_flags & ILLF_COS_ENABLED)) { ASSERT(!(ipmp_ill->ill_flags & ILLF_COS_ENABLED)); ill = list_head(&illg->ig_if); do { if (!(ill->ill_flags & ILLF_COS_ENABLED)) break; } while ((ill = list_next(&illg->ig_if, ill)) != NULL); if (ill == NULL) { mutex_enter(&ipmp_ill->ill_lock); ipmp_ill->ill_flags |= ILLF_COS_ENABLED; mutex_exit(&ipmp_ill->ill_lock); } } } } /* * Check if `ill' should be active, and activate or deactivate if need be. * Return B_FALSE if a refresh was necessary but could not be performed. */ static boolean_t ipmp_ill_try_refresh_active(ill_t *ill) { boolean_t refreshed = B_TRUE; ASSERT(IAM_WRITER_ILL(ill)); ASSERT(IS_UNDER_IPMP(ill)); if (ipmp_ill_is_active(ill)) { if (!list_link_active(&ill->ill_actnode)) refreshed = ipmp_ill_activate(ill); } else { if (list_link_active(&ill->ill_actnode)) ipmp_ill_deactivate(ill); } return (refreshed); } /* * Check if `ill' should be active, and activate or deactivate if need be. * If the refresh fails, schedule a timer to try again later. */ void ipmp_ill_refresh_active(ill_t *ill) { if (!ipmp_ill_try_refresh_active(ill)) ipmp_ill_refresh_active_timer_start(ill); } /* * Retry ipmp_ill_try_refresh_active() on the ill named by `ill_arg'. */ static void ipmp_ill_refresh_active_timer(void *ill_arg) { ill_t *ill = ill_arg; boolean_t refreshed = B_FALSE; /* * Clear ill_refresh_tid to indicate that no timeout is pending * (another thread could schedule a new timeout while we're still * running, but that's harmless). If the ill is going away, bail. */ mutex_enter(&ill->ill_lock); ill->ill_refresh_tid = 0; if (ill->ill_state_flags & ILL_CONDEMNED) { mutex_exit(&ill->ill_lock); return; } mutex_exit(&ill->ill_lock); if (ipsq_try_enter(NULL, ill, NULL, NULL, NULL, NEW_OP, B_FALSE)) { refreshed = ipmp_ill_try_refresh_active(ill); ipsq_exit(ill->ill_phyint->phyint_ipsq); } /* * If the refresh failed, schedule another attempt. 
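 *
 * The retry fires IPMP_ILL_REFRESH_TIMEOUT (120) seconds later and keeps
 * rescheduling itself until a refresh succeeds or the ill is condemned;
 * see ipmp_ill_refresh_active_timer_start() below.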
*/ if (!refreshed) ipmp_ill_refresh_active_timer_start(ill); } /* * Retry an ipmp_ill_try_refresh_active() on the ill named by `arg'. */ static void ipmp_ill_refresh_active_timer_start(ill_t *ill) { mutex_enter(&ill->ill_lock); /* * If the ill is going away or a refresh is already scheduled, bail. */ if (ill->ill_refresh_tid != 0 || (ill->ill_state_flags & ILL_CONDEMNED)) { mutex_exit(&ill->ill_lock); return; } ill->ill_refresh_tid = timeout(ipmp_ill_refresh_active_timer, ill, SEC_TO_TICK(IPMP_ILL_REFRESH_TIMEOUT)); mutex_exit(&ill->ill_lock); } /* * Activate `ill' so it will be used to send and receive data traffic. Return * B_FALSE if `ill' cannot be activated. Note that we allocate any messages * needed to deactivate `ill' here as well so that deactivation cannot fail. */ static boolean_t ipmp_ill_activate(ill_t *ill) { ipif_t *ipif; mblk_t *actmp = NULL, *deactmp = NULL; mblk_t *linkupmp = NULL, *linkdownmp = NULL; ipmp_grp_t *grp = ill->ill_phyint->phyint_grp; const char *grifname = grp->gr_ifname; ipmp_illgrp_t *illg = ill->ill_grp; ill_t *maxill; ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); ASSERT(IAM_WRITER_ILL(ill)); ASSERT(IS_UNDER_IPMP(ill)); /* * If this will be the first active interface in the group, allocate * the link-up and link-down messages. */ if (grp->gr_nactif == 0) { linkupmp = ip_dlnotify_alloc(DL_NOTE_LINK_UP, 0); linkdownmp = ip_dlnotify_alloc(DL_NOTE_LINK_DOWN, 0); if (linkupmp == NULL || linkdownmp == NULL) goto fail; } /* * For IPv4, allocate the activate/deactivate messages, and tell ARP. */ if (!ill->ill_isv6) { actmp = ill_arie_alloc(ill, grifname, &ipmp_aract_template); deactmp = ill_arie_alloc(ill, grifname, &ipmp_ardeact_template); if (actmp == NULL || deactmp == NULL) goto fail; ASSERT(ill->ill_ardeact_mp == NULL); ill->ill_ardeact_mp = deactmp; putnext(illg->ig_ipmp_ill->ill_rq, actmp); } if (list_is_empty(&illg->ig_actif)) { /* * Now that we have an active ill, nominate it for multicast * and broadcast duties. Do this before ipmp_ill_bind_ipif() * since that may need to send multicast packets (e.g., IPv6 * neighbor discovery probes). */ ipmp_illgrp_set_cast(illg, ill); /* * This is the first active ill in the illgrp -- add 'em all. * We can access/walk ig_ipmp_ill's ipif list since we're * writer on its IPSQ as well. */ ipif = illg->ig_ipmp_ill->ill_ipif; for (; ipif != NULL; ipif = ipif->ipif_next) if (ipmp_ipif_is_up_dataaddr(ipif)) ipmp_ill_bind_ipif(ill, ipif, Res_act_initial); } else { /* * Redistribute the addresses by moving them from the ill with * the most addresses until the ill being activated is at the * same level as the rest of the ills. */ for (;;) { maxill = ipmp_illgrp_max_ill(illg); ASSERT(maxill != NULL); if (ill->ill_bound_cnt + 1 >= maxill->ill_bound_cnt) break; ipif = ipmp_ill_unbind_ipif(maxill, NULL, B_TRUE); ipmp_ill_bind_ipif(ill, ipif, Res_act_rebind); } /* * TODO: explore whether it's advantageous to flush IRE_CACHE * bindings to force existing connections to be redistributed * to the new ill. */ } /* * Put the interface in the active list. */ rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); list_insert_tail(&illg->ig_actif, ill); illg->ig_nactif++; illg->ig_next_ill = ill; rw_exit(&ipst->ips_ipmp_lock); /* * Refresh ARP entries to use `ill', if need be. */ if (!ill->ill_isv6) ipmp_illgrp_refresh_arpent(illg); /* * Finally, mark the group link up, if necessary. 
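 *
 * That is, when gr_nactif goes from 0 to 1, the DL_NOTE_LINK_UP message
 * allocated above is sent up the IPMP ill and the matching
 * DL_NOTE_LINK_DOWN is stashed in gr_linkdownmp for ipmp_ill_deactivate()
 * to send once the last active ill leaves.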
*/ if (grp->gr_nactif++ == 0) { ASSERT(grp->gr_linkdownmp == NULL); grp->gr_linkdownmp = linkdownmp; put(illg->ig_ipmp_ill->ill_rq, linkupmp); } return (B_TRUE); fail: freemsg(actmp); freemsg(deactmp); freemsg(linkupmp); freemsg(linkdownmp); return (B_FALSE); } /* * Deactivate `ill' so it will not be used to send or receive data traffic. */ static void ipmp_ill_deactivate(ill_t *ill) { ill_t *minill; ipif_t *ipif, *ubnextipif, *ubheadipif = NULL; mblk_t *mp; ipmp_grp_t *grp = ill->ill_phyint->phyint_grp; ipmp_illgrp_t *illg = ill->ill_grp; ip_stack_t *ipst = IPMP_ILLGRP_TO_IPST(illg); ASSERT(IAM_WRITER_ILL(ill)); ASSERT(IS_UNDER_IPMP(ill)); /* * Delete IRE_CACHE entries tied to this ill before they become stale. */ ire_walk_ill(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE, ill_stq_cache_delete, ill, ill); /* * Pull the interface out of the active list. */ rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); list_remove(&illg->ig_actif, ill); illg->ig_nactif--; illg->ig_next_ill = list_head(&illg->ig_actif); rw_exit(&ipst->ips_ipmp_lock); /* * If the ill that's being deactivated had been nominated for * multicast/broadcast, nominate a new one. */ if (ill == illg->ig_cast_ill) ipmp_illgrp_set_cast(illg, list_head(&illg->ig_actif)); /* * Unbind all of the ipifs bound to this ill, and save 'em in a list; * we'll rebind them after we tell the resolver the ill is no longer * active. We must do things in this order or the resolver could * accidentally rebind to the ill we're trying to remove if multiple * ills in the group have the same hardware address (which is * unsupported, but shouldn't lead to a wedged machine). */ while ((ipif = ipmp_ill_unbind_ipif(ill, NULL, B_TRUE)) != NULL) { ipif->ipif_bound_next = ubheadipif; ubheadipif = ipif; } if (!ill->ill_isv6) { /* * Tell ARP `ill' is no longer active in the group. */ mp = ill->ill_ardeact_mp; ill->ill_ardeact_mp = NULL; ASSERT(mp != NULL); putnext(illg->ig_ipmp_ill->ill_rq, mp); /* * Refresh any ARP entries that had been using `ill'. */ ipmp_illgrp_refresh_arpent(illg); } /* * Rebind each ipif from the deactivated ill to the active ill with * the fewest ipifs. If there are no active ills, the ipifs will * remain unbound. */ for (ipif = ubheadipif; ipif != NULL; ipif = ubnextipif) { ubnextipif = ipif->ipif_bound_next; ipif->ipif_bound_next = NULL; if ((minill = ipmp_illgrp_min_ill(illg)) != NULL) ipmp_ill_bind_ipif(minill, ipif, Res_act_rebind); } /* * Finally, mark the group link down, if necessary. */ if (--grp->gr_nactif == 0) { mp = grp->gr_linkdownmp; grp->gr_linkdownmp = NULL; ASSERT(mp != NULL); put(illg->ig_ipmp_ill->ill_rq, mp); } } /* * Send the routing socket messages needed to make `ill' "appear" (RTM_ADD) * or "disappear" (RTM_DELETE) to non-IPMP-aware routing socket listeners. */ static void ipmp_ill_rtsaddrmsg(ill_t *ill, int cmd) { ipif_t *ipif; ASSERT(IAM_WRITER_ILL(ill)); ASSERT(cmd == RTM_ADD || cmd == RTM_DELETE); /* * If `ill' is truly down, there are no messages to generate since: * * 1. If cmd == RTM_DELETE, then we're supposed to hide the interface * and its addresses by bringing them down. But that's already * true, so there's nothing to hide. * * 2. If cmd == RTM_ADD, then we're supposed to generate messages * indicating that any previously-hidden up addresses are again * back up (along with the interface). But they aren't, so * there's nothing to expose. 
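 *
 * Otherwise, the generated sequence is (a sketch of the code below):
 *
 *	RTM_ADD:	ip_rts_xifmsg(..., IPIF_UP, 0, ...), then one
 *			ip_rts_newaddrmsg() per IPIF_UP address
 *	RTM_DELETE:	one ip_rts_newaddrmsg() per IPIF_UP address, then
 *			ip_rts_xifmsg(..., 0, IPIF_UP, ...)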
*/ if (ill->ill_ipif_up_count == 0) return; if (cmd == RTM_ADD) ip_rts_xifmsg(ill->ill_ipif, IPIF_UP, 0, RTSQ_NORMAL); for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) if (ipif->ipif_flags & IPIF_UP) ip_rts_newaddrmsg(cmd, 0, ipif, RTSQ_NORMAL); if (cmd == RTM_DELETE) ip_rts_xifmsg(ill->ill_ipif, 0, IPIF_UP, RTSQ_NORMAL); } /* * Bind the address named by `ipif' to the underlying ill named by `ill'. * If `act' is Res_act_none, don't notify the resolver. Otherwise, `act' * will indicate to the resolver whether this is an initial bringup of * `ipif', or just a rebind to another ill. */ static void ipmp_ill_bind_ipif(ill_t *ill, ipif_t *ipif, enum ip_resolver_action act) { int err = 0; ip_stack_t *ipst = ill->ill_ipst; ASSERT(IAM_WRITER_ILL(ill) && IAM_WRITER_IPIF(ipif)); ASSERT(IS_UNDER_IPMP(ill) && IS_IPMP(ipif->ipif_ill)); ASSERT(act == Res_act_none || ipmp_ipif_is_up_dataaddr(ipif)); ASSERT(ipif->ipif_bound_ill == NULL); ASSERT(ipif->ipif_bound_next == NULL); ipif->ipif_bound_next = ill->ill_bound_ipif; ill->ill_bound_ipif = ipif; ill->ill_bound_cnt++; rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); ipif->ipif_bound_ill = ill; rw_exit(&ipst->ips_ipmp_lock); /* * If necessary, tell ARP/NDP about the new mapping. Note that * ipif_resolver_up() cannot fail for non-XRESOLV IPv6 ills. */ if (act != Res_act_none) { if (ill->ill_isv6) { VERIFY(ipif_resolver_up(ipif, act) == 0); err = ipif_ndp_up(ipif, act == Res_act_initial); } else { err = ipif_resolver_up(ipif, act); } /* * Since ipif_ndp_up() never returns EINPROGRESS and * ipif_resolver_up() only returns EINPROGRESS when the * associated ill is not up, we should never be here with * EINPROGRESS. We rely on this to simplify the design. */ ASSERT(err != EINPROGRESS); } /* TODO: retry binding on failure? when? */ ipif->ipif_bound = (err == 0); } /* * Unbind the address named by `ipif' from the underlying ill named by `ill'. * If `ipif' is NULL, then an arbitrary ipif on `ill' is unbound and returned. * If no ipifs are bound to `ill', NULL is returned. If `notifyres' is * B_TRUE, notify the resolver about the change. */ static ipif_t * ipmp_ill_unbind_ipif(ill_t *ill, ipif_t *ipif, boolean_t notifyres) { ill_t *ipmp_ill; ipif_t *previpif; ip_stack_t *ipst = ill->ill_ipst; ASSERT(IAM_WRITER_ILL(ill)); ASSERT(IS_UNDER_IPMP(ill)); ipmp_ill = ill->ill_grp->ig_ipmp_ill; /* * If necessary, find an ipif to unbind. */ if (ipif == NULL) { if ((ipif = ill->ill_bound_ipif) == NULL) { ASSERT(ill->ill_bound_cnt == 0); return (NULL); } } ASSERT(IAM_WRITER_IPIF(ipif)); ASSERT(IS_IPMP(ipif->ipif_ill)); ASSERT(ipif->ipif_bound_ill == ill); ASSERT(ill->ill_bound_cnt > 0); /* * Unbind it. */ rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); ipif->ipif_bound_ill = NULL; rw_exit(&ipst->ips_ipmp_lock); ill->ill_bound_cnt--; if (ill->ill_bound_ipif == ipif) { ill->ill_bound_ipif = ipif->ipif_bound_next; } else { previpif = ill->ill_bound_ipif; while (previpif->ipif_bound_next != ipif) previpif = previpif->ipif_bound_next; previpif->ipif_bound_next = ipif->ipif_bound_next; } ipif->ipif_bound_next = NULL; /* * If requested, notify the resolvers (provided we're bound). */ if (notifyres && ipif->ipif_bound) { if (ill->ill_isv6) { ipif_ndp_down(ipif); } else { ASSERT(ipif->ipif_arp_del_mp != NULL); putnext(ipmp_ill->ill_rq, ipif->ipif_arp_del_mp); ipif->ipif_arp_del_mp = NULL; } } ipif->ipif_bound = B_FALSE; return (ipif); } /* * Check if `ill' is active. Caller must hold ill_lock and phyint_lock if * it's not inside the IPSQ. 
Since ipmp_ill_try_refresh_active() calls this * to determine whether an ill should be considered active, other consumers * may race and learn about an ill that should be deactivated/activated before * IPMP has performed the activation/deactivation. This should be safe though * since at worst e.g. ire_atomic_start() will prematurely delete an IRE that * would've been cleaned up by ipmp_ill_deactivate(). */ boolean_t ipmp_ill_is_active(ill_t *ill) { phyint_t *phyi = ill->ill_phyint; ASSERT(IS_UNDER_IPMP(ill)); ASSERT(IAM_WRITER_ILL(ill) || (MUTEX_HELD(&ill->ill_lock) && MUTEX_HELD(&phyi->phyint_lock))); /* * Note that PHYI_RUNNING isn't checked since we rely on in.mpathd to * set PHYI_FAILED whenever PHYI_RUNNING is cleared. This allows the * link flapping logic to be just in in.mpathd and allows us to ignore * changes to PHYI_RUNNING. */ return (!(ill->ill_ipif_up_count == 0 || (phyi->phyint_flags & (PHYI_OFFLINE|PHYI_INACTIVE|PHYI_FAILED)))); } /* * IRE walker callback: set IRE_MARK_TESTHIDDEN on cache/interface/offsubnet * IREs with a source address on `ill_arg'. */ static void ipmp_ill_ire_mark_testhidden(ire_t *ire, char *ill_arg) { ill_t *ill = (ill_t *)ill_arg; ASSERT(IAM_WRITER_ILL(ill)); ASSERT(!IS_IPMP(ill)); if (ire->ire_ipif->ipif_ill != ill) return; switch (ire->ire_type) { case IRE_HOST: case IRE_PREFIX: case IRE_DEFAULT: case IRE_CACHE: case IRE_IF_RESOLVER: case IRE_IF_NORESOLVER: DTRACE_PROBE1(ipmp__mark__testhidden, ire_t *, ire); ire->ire_marks |= IRE_MARK_TESTHIDDEN; break; default: break; } } /* * IRE walker callback: clear IRE_MARK_TESTHIDDEN if the IRE has a source * address on `ill_arg'. */ static void ipmp_ill_ire_clear_testhidden(ire_t *ire, char *ill_arg) { ill_t *ill = (ill_t *)ill_arg; ASSERT(IAM_WRITER_ILL(ill)); ASSERT(!IS_IPMP(ill)); if (ire->ire_ipif->ipif_ill == ill) { DTRACE_PROBE1(ipmp__clear__testhidden, ire_t *, ire); ire->ire_marks &= ~IRE_MARK_TESTHIDDEN; } } /* * Return a held pointer to the IPMP ill for underlying interface `ill', or * NULL if one doesn't exist. (Unfortunately, this function needs to take an * underlying ill rather than an ipmp_illgrp_t because an underlying ill's * ill_grp pointer may become stale when not inside an IPSQ and not holding * ipmp_lock.) Caller need not be inside the IPSQ. */ ill_t * ipmp_ill_hold_ipmp_ill(ill_t *ill) { ip_stack_t *ipst = ill->ill_ipst; ipmp_illgrp_t *illg; ASSERT(!IS_IPMP(ill)); rw_enter(&ipst->ips_ipmp_lock, RW_READER); illg = ill->ill_grp; if (illg != NULL && ill_check_and_refhold(illg->ig_ipmp_ill) == 0) { rw_exit(&ipst->ips_ipmp_lock); return (illg->ig_ipmp_ill); } /* * Assume `ill' was removed from the illgrp in the meantime. */ rw_exit(&ill->ill_ipst->ips_ipmp_lock); return (NULL); } /* * Return the interface index for the IPMP ill tied to underlying interface * `ill', or zero if one doesn't exist. Caller need not be inside the IPSQ. */ uint_t ipmp_ill_get_ipmp_ifindex(const ill_t *ill) { uint_t ifindex = 0; ip_stack_t *ipst = ill->ill_ipst; ipmp_grp_t *grp; ASSERT(!IS_IPMP(ill)); rw_enter(&ipst->ips_ipmp_lock, RW_READER); if ((grp = ill->ill_phyint->phyint_grp) != NULL) ifindex = grp->gr_phyint->phyint_ifindex; rw_exit(&ipst->ips_ipmp_lock); return (ifindex); } /* * Place phyint `phyi' into IPMP group `grp'. 
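 *
 * This is driven from the SIOCSLIFGROUPNAME path (the same path that uses
 * ipmp_grp_vet_phyint() above to validate the request).  Note that, unlike
 * the vet routine, this function acquires ips_ipmp_lock itself, so the
 * caller must not already hold it.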
*/ void ipmp_phyint_join_grp(phyint_t *phyi, ipmp_grp_t *grp) { ill_t *ill; ipsq_t *ipsq = phyi->phyint_ipsq; ipsq_t *grp_ipsq = grp->gr_phyint->phyint_ipsq; ip_stack_t *ipst = PHYINT_TO_IPST(phyi); ASSERT(IAM_WRITER_IPSQ(ipsq)); ASSERT(phyi->phyint_illv4 != NULL || phyi->phyint_illv6 != NULL); /* * Send routing socket messages indicating that the phyint's ills * and ipifs vanished. */ if (phyi->phyint_illv4 != NULL) { ill = phyi->phyint_illv4; ipmp_ill_rtsaddrmsg(ill, RTM_DELETE); } if (phyi->phyint_illv6 != NULL) { ill = phyi->phyint_illv6; ipmp_ill_rtsaddrmsg(ill, RTM_DELETE); } /* * Snapshot the phyint's initial kstats as a baseline. */ ipmp_phyint_get_kstats(phyi, phyi->phyint_kstats0); rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); phyi->phyint_grp = grp; if (++grp->gr_nif == 1) grp->gr_mactype = ill->ill_mactype; else ASSERT(grp->gr_mactype == ill->ill_mactype); /* * Now that we're in the group, request a switch to the group's xop * when we ipsq_exit(). All future operations will be exclusive on * the group xop until ipmp_phyint_leave_grp() is called. */ ASSERT(ipsq->ipsq_swxop == NULL); ASSERT(grp_ipsq->ipsq_xop == &grp_ipsq->ipsq_ownxop); ipsq->ipsq_swxop = &grp_ipsq->ipsq_ownxop; rw_exit(&ipst->ips_ipmp_lock); } /* * Remove phyint `phyi' from its current IPMP group. */ void ipmp_phyint_leave_grp(phyint_t *phyi) { uint_t i; ipsq_t *ipsq = phyi->phyint_ipsq; ip_stack_t *ipst = PHYINT_TO_IPST(phyi); uint64_t phyi_kstats[IPMP_KSTAT_MAX]; ASSERT(IAM_WRITER_IPSQ(ipsq)); /* * If any of the phyint's ills are still in an illgrp, kick 'em out. */ if (phyi->phyint_illv4 != NULL && IS_UNDER_IPMP(phyi->phyint_illv4)) ipmp_ill_leave_illgrp(phyi->phyint_illv4); if (phyi->phyint_illv6 != NULL && IS_UNDER_IPMP(phyi->phyint_illv6)) ipmp_ill_leave_illgrp(phyi->phyint_illv6); /* * Send routing socket messages indicating that the phyint's ills * and ipifs have reappeared. */ if (phyi->phyint_illv4 != NULL) ipmp_ill_rtsaddrmsg(phyi->phyint_illv4, RTM_ADD); if (phyi->phyint_illv6 != NULL) ipmp_ill_rtsaddrmsg(phyi->phyint_illv6, RTM_ADD); /* * Calculate the phyint's cumulative kstats while it was in the group, * and add that to the group's baseline. */ ipmp_phyint_get_kstats(phyi, phyi_kstats); for (i = 0; i < IPMP_KSTAT_MAX; i++) { phyi_kstats[i] -= phyi->phyint_kstats0[i]; atomic_add_64(&phyi->phyint_grp->gr_kstats0[i], phyi_kstats[i]); } rw_enter(&ipst->ips_ipmp_lock, RW_WRITER); phyi->phyint_grp->gr_nif--; phyi->phyint_grp = NULL; /* * As our final act in leaving the group, request a switch back to our * IPSQ's own xop when we ipsq_exit(). */ ASSERT(ipsq->ipsq_swxop == NULL); ipsq->ipsq_swxop = &ipsq->ipsq_ownxop; rw_exit(&ipst->ips_ipmp_lock); } /* * Store the IPMP-related kstats for `phyi' into the array named by `kstats'. * Assumes that `kstats' has at least IPMP_KSTAT_MAX elements. */ static void ipmp_phyint_get_kstats(phyint_t *phyi, uint64_t kstats[]) { uint_t i, j; const char *name; kstat_t *ksp; kstat_named_t *kn; bzero(kstats, sizeof (kstats[0]) * IPMP_KSTAT_MAX); /* * NOTE: ALL_ZONES here assumes that there's at most one link * with a given name on a given system (safe for now). */ ksp = kstat_hold_byname("link", 0, phyi->phyint_name, ALL_ZONES); if (ksp == NULL) return; KSTAT_ENTER(ksp); if (ksp->ks_data != NULL && ksp->ks_type == KSTAT_TYPE_NAMED) { /* * Bring kstats up-to-date before recording. 
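 *
 * (KSTAT_UPDATE() invokes the kstat's ks_update routine.  The raw values
 * captured here are later converted to deltas by subtracting the
 * phyint_kstats0[] baseline in ipmp_grp_update_kstats() and
 * ipmp_phyint_leave_grp().)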
*/ (void) KSTAT_UPDATE(ksp, KSTAT_READ); kn = KSTAT_NAMED_PTR(ksp); for (i = 0; i < IPMP_KSTAT_MAX; i++) { name = ipmp_kstats[i].name; kstats[i] = 0; for (j = 0; j < ksp->ks_ndata; j++) { if (strcmp(kn[j].name, name) != 0) continue; switch (kn[j].data_type) { case KSTAT_DATA_INT32: case KSTAT_DATA_UINT32: kstats[i] = kn[j].value.ui32; break; #ifdef _LP64 case KSTAT_DATA_LONG: case KSTAT_DATA_ULONG: kstats[i] = kn[j].value.ul; break; #endif case KSTAT_DATA_INT64: case KSTAT_DATA_UINT64: kstats[i] = kn[j].value.ui64; break; } break; } } } KSTAT_EXIT(ksp); kstat_rele(ksp); } /* * Refresh the active state of all ills on `phyi'. */ void ipmp_phyint_refresh_active(phyint_t *phyi) { if (phyi->phyint_illv4 != NULL) ipmp_ill_refresh_active(phyi->phyint_illv4); if (phyi->phyint_illv6 != NULL) ipmp_ill_refresh_active(phyi->phyint_illv6); } /* * Return a held pointer to the underlying ill bound to `ipif', or NULL if one * doesn't exist. Caller need not be inside the IPSQ. */ ill_t * ipmp_ipif_hold_bound_ill(const ipif_t *ipif) { ill_t *boundill; ip_stack_t *ipst = ipif->ipif_ill->ill_ipst; ASSERT(IS_IPMP(ipif->ipif_ill)); rw_enter(&ipst->ips_ipmp_lock, RW_READER); boundill = ipif->ipif_bound_ill; if (boundill != NULL && ill_check_and_refhold(boundill) == 0) { rw_exit(&ipst->ips_ipmp_lock); return (boundill); } rw_exit(&ipst->ips_ipmp_lock); return (NULL); } /* * Return a pointer to the underlying ill bound to `ipif', or NULL if one * doesn't exist. Caller must be inside the IPSQ. */ ill_t * ipmp_ipif_bound_ill(const ipif_t *ipif) { ASSERT(IAM_WRITER_ILL(ipif->ipif_ill)); ASSERT(IS_IPMP(ipif->ipif_ill)); return (ipif->ipif_bound_ill); } /* * Check if `ipif' is a "stub" (placeholder address not being used). */ boolean_t ipmp_ipif_is_stubaddr(const ipif_t *ipif) { if (ipif->ipif_flags & IPIF_UP) return (B_FALSE); if (ipif->ipif_ill->ill_isv6) return (IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)); else return (ipif->ipif_lcl_addr == INADDR_ANY); } /* * Check if `ipif' is an IPMP data address. */ boolean_t ipmp_ipif_is_dataaddr(const ipif_t *ipif) { if (ipif->ipif_flags & IPIF_NOFAILOVER) return (B_FALSE); if (ipif->ipif_ill->ill_isv6) return (!IN6_IS_ADDR_UNSPECIFIED(&ipif->ipif_v6lcl_addr)); else return (ipif->ipif_lcl_addr != INADDR_ANY); } /* * Check if `ipif' is an IPIF_UP IPMP data address. */ static boolean_t ipmp_ipif_is_up_dataaddr(const ipif_t *ipif) { return (ipmp_ipif_is_dataaddr(ipif) && (ipif->ipif_flags & IPIF_UP)); }
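
/*
 * For reference, the address classification implemented by the predicates
 * above works out as follows (illustrative summary):
 *
 *	IPIF_NOFAILOVER set			-> test address, never data
 *	unspecified address and not IPIF_UP	-> stub (placeholder)
 *	non-unspecified, no IPIF_NOFAILOVER	-> data address
 *	data address that is also IPIF_UP	-> bindable to an underlying
 *						   ill via ipmp_ill_bind_ipif()
 */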