/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #pragma ident "%Z%%M% %I% %E% SMI" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include /* * Function names with nce_ prefix are static while function * names with ndp_ prefix are used by rest of the IP. * * Lock ordering: * * ndp_g_lock -> ill_lock -> nce_lock * * The ndp_g_lock protects the NCE hash (nce_hash_tbl, NCE_HASH_PTR) and * nce_next. Nce_lock protects the contents of the NCE (particularly * nce_refcnt). */ static boolean_t nce_cmp_ll_addr(const nce_t *nce, const uchar_t *new_ll_addr, uint32_t ll_addr_len); static void nce_ire_delete(nce_t *nce); static void nce_ire_delete1(ire_t *ire, char *nce_arg); static void nce_set_ll(nce_t *nce, uchar_t *ll_addr); static nce_t *nce_lookup_addr(ill_t *, const in6_addr_t *, nce_t *); static nce_t *nce_lookup_mapping(ill_t *ill, const in6_addr_t *addr); static void nce_make_mapping(nce_t *nce, uchar_t *addrpos, uchar_t *addr); static int nce_set_multicast(ill_t *ill, const in6_addr_t *addr); static void nce_queue_mp(nce_t *nce, mblk_t *mp); static mblk_t *nce_udreq_alloc(ill_t *ill); static void nce_update(nce_t *nce, uint16_t new_state, uchar_t *new_ll_addr); static uint32_t nce_solicit(nce_t *nce, mblk_t *mp); static boolean_t nce_xmit(ill_t *ill, uint32_t operation, ill_t *hwaddr_ill, boolean_t use_lla_addr, const in6_addr_t *sender, const in6_addr_t *target, int flag); static int ndp_add_v4(ill_t *, const in_addr_t *, uint16_t, nce_t **, nce_t *); #ifdef DEBUG static void nce_trace_cleanup(const nce_t *); #endif #define NCE_HASH_PTR_V4(ipst, addr) \ (&((ipst)->ips_ndp4->nce_hash_tbl[IRE_ADDR_HASH(addr, NCE_TABLE_SIZE)])) #define NCE_HASH_PTR_V6(ipst, addr) \ (&((ipst)->ips_ndp6->nce_hash_tbl[NCE_ADDR_HASH_V6(addr, \ NCE_TABLE_SIZE)])) /* * Compute default flags to use for an advertisement of this nce's address. */ static int nce_advert_flags(const nce_t *nce) { int flag = 0; if (nce->nce_flags & NCE_F_ISROUTER) flag |= NDP_ISROUTER; if (!(nce->nce_flags & NCE_F_ANYCAST)) flag |= NDP_ORIDE; return (flag); } /* Non-tunable probe interval, based on link capabilities */ #define ILL_PROBE_INTERVAL(ill) ((ill)->ill_note_link ? 150 : 1500) /* * NDP Cache Entry creation routine. * Mapped entries will never do NUD . * This routine must always be called with ndp6->ndp_g_lock held. * Prior to return, nce_refcnt is incremented. */ int ndp_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr, const in6_addr_t *mask, const in6_addr_t *extract_mask, uint32_t hw_extract_start, uint16_t flags, uint16_t state, nce_t **newnce) { static nce_t nce_nil; nce_t *nce; mblk_t *mp; mblk_t *template; nce_t **ncep; int err; boolean_t dropped = B_FALSE; ip_stack_t *ipst = ill->ill_ipst; ASSERT(MUTEX_HELD(&ipst->ips_ndp6->ndp_g_lock)); ASSERT(ill != NULL && ill->ill_isv6); if (IN6_IS_ADDR_UNSPECIFIED(addr)) { ip0dbg(("ndp_add_v6: no addr\n")); return (EINVAL); } if ((flags & ~NCE_EXTERNAL_FLAGS_MASK)) { ip0dbg(("ndp_add_v6: flags = %x\n", (int)flags)); return (EINVAL); } if (IN6_IS_ADDR_UNSPECIFIED(extract_mask) && (flags & NCE_F_MAPPING)) { ip0dbg(("ndp_add_v6: extract mask zero for mapping")); return (EINVAL); } /* * Allocate the mblk to hold the nce. * * XXX This can come out of a separate cache - nce_cache. * We don't need the mp anymore as there are no more * "qwriter"s */ mp = allocb(sizeof (nce_t), BPRI_MED); if (mp == NULL) return (ENOMEM); nce = (nce_t *)mp->b_rptr; mp->b_wptr = (uchar_t *)&nce[1]; *nce = nce_nil; /* * This one holds link layer address */ if (ill->ill_net_type == IRE_IF_RESOLVER) { template = nce_udreq_alloc(ill); } else { if (ill->ill_resolver_mp == NULL) { freeb(mp); return (EINVAL); } ASSERT((ill->ill_net_type == IRE_IF_NORESOLVER)); template = copyb(ill->ill_resolver_mp); } if (template == NULL) { freeb(mp); return (ENOMEM); } nce->nce_ill = ill; nce->nce_ipversion = IPV6_VERSION; nce->nce_flags = flags; nce->nce_state = state; nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT; nce->nce_rcnt = ill->ill_xmit_count; nce->nce_addr = *addr; nce->nce_mask = *mask; nce->nce_extract_mask = *extract_mask; nce->nce_ll_extract_start = hw_extract_start; nce->nce_fp_mp = NULL; nce->nce_res_mp = template; if (state == ND_REACHABLE) nce->nce_last = TICK_TO_MSEC(lbolt64); else nce->nce_last = 0; nce->nce_qd_mp = NULL; nce->nce_mp = mp; if (hw_addr != NULL) nce_set_ll(nce, hw_addr); /* This one is for nce getting created */ nce->nce_refcnt = 1; mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL); if (nce->nce_flags & NCE_F_MAPPING) { ASSERT(IN6_IS_ADDR_MULTICAST(addr)); ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_mask)); ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_extract_mask)); ncep = &ipst->ips_ndp6->nce_mask_entries; } else { ncep = ((nce_t **)NCE_HASH_PTR_V6(ipst, *addr)); } nce->nce_trace_disable = B_FALSE; /* * Atomically ensure that the ill is not CONDEMNED, before * adding the NCE. */ mutex_enter(&ill->ill_lock); if (ill->ill_state_flags & ILL_CONDEMNED) { mutex_exit(&ill->ill_lock); freeb(mp); freeb(template); return (EINVAL); } if ((nce->nce_next = *ncep) != NULL) nce->nce_next->nce_ptpn = &nce->nce_next; *ncep = nce; nce->nce_ptpn = ncep; *newnce = nce; /* This one is for nce being used by an active thread */ NCE_REFHOLD(*newnce); /* Bump up the number of nce's referencing this ill */ DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill, (char *), "nce", (void *), nce); ill->ill_nce_cnt++; mutex_exit(&ill->ill_lock); err = 0; if ((flags & NCE_F_PERMANENT) && state == ND_PROBE) { mutex_enter(&nce->nce_lock); mutex_exit(&ipst->ips_ndp6->ndp_g_lock); nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT; mutex_exit(&nce->nce_lock); dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, NULL, B_FALSE, &ipv6_all_zeros, addr, NDP_PROBE); if (dropped) { mutex_enter(&nce->nce_lock); nce->nce_pcnt++; mutex_exit(&nce->nce_lock); } NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(ill)); mutex_enter(&ipst->ips_ndp6->ndp_g_lock); err = EINPROGRESS; } else if (flags & NCE_F_UNSOL_ADV) { /* * We account for the transmit below by assigning one * less than the ndd variable. Subsequent decrements * are done in ndp_timer. */ mutex_enter(&nce->nce_lock); mutex_exit(&ipst->ips_ndp6->ndp_g_lock); nce->nce_unsolicit_count = ipst->ips_ip_ndp_unsolicit_count - 1; mutex_exit(&nce->nce_lock); dropped = nce_xmit(ill, ND_NEIGHBOR_ADVERT, ill, /* ill to be used for extracting ill_nd_lla */ B_TRUE, /* use ill_nd_lla */ addr, /* Source and target of the advertisement pkt */ &ipv6_all_hosts_mcast, /* Destination of the packet */ nce_advert_flags(nce)); mutex_enter(&nce->nce_lock); if (dropped) nce->nce_unsolicit_count++; if (nce->nce_unsolicit_count != 0) { nce->nce_timeout_id = timeout(ndp_timer, nce, MSEC_TO_TICK(ipst->ips_ip_ndp_unsolicit_interval)); } mutex_exit(&nce->nce_lock); mutex_enter(&ipst->ips_ndp6->ndp_g_lock); } /* * If the hw_addr is NULL, typically for ND_INCOMPLETE nces, then * we call nce_fastpath as soon as the nce is resolved in ndp_process. * We call nce_fastpath from nce_update if the link layer address of * the peer changes from nce_update */ if (hw_addr != NULL || ill->ill_net_type == IRE_IF_NORESOLVER) nce_fastpath(nce); return (err); } int ndp_lookup_then_add_v6(ill_t *ill, uchar_t *hw_addr, const in6_addr_t *addr, const in6_addr_t *mask, const in6_addr_t *extract_mask, uint32_t hw_extract_start, uint16_t flags, uint16_t state, nce_t **newnce) { int err = 0; nce_t *nce; ip_stack_t *ipst = ill->ill_ipst; ASSERT(ill->ill_isv6); mutex_enter(&ipst->ips_ndp6->ndp_g_lock); /* Get head of v6 hash table */ nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr)); nce = nce_lookup_addr(ill, addr, nce); if (nce == NULL) { err = ndp_add_v6(ill, hw_addr, addr, mask, extract_mask, hw_extract_start, flags, state, newnce); } else { *newnce = nce; err = EEXIST; } mutex_exit(&ipst->ips_ndp6->ndp_g_lock); return (err); } /* * Remove all the CONDEMNED nces from the appropriate hash table. * We create a private list of NCEs, these may have ires pointing * to them, so the list will be passed through to clean up dependent * ires and only then we can do NCE_REFRELE which can make NCE inactive. */ static void nce_remove(ndp_g_t *ndp, nce_t *nce, nce_t **free_nce_list) { nce_t *nce1; nce_t **ptpn; ASSERT(MUTEX_HELD(&ndp->ndp_g_lock)); ASSERT(ndp->ndp_g_walker == 0); for (; nce; nce = nce1) { nce1 = nce->nce_next; mutex_enter(&nce->nce_lock); if (nce->nce_flags & NCE_F_CONDEMNED) { ptpn = nce->nce_ptpn; nce1 = nce->nce_next; if (nce1 != NULL) nce1->nce_ptpn = ptpn; *ptpn = nce1; nce->nce_ptpn = NULL; nce->nce_next = NULL; nce->nce_next = *free_nce_list; *free_nce_list = nce; } mutex_exit(&nce->nce_lock); } } /* * 1. Mark the nce CONDEMNED. This ensures that no new nce_lookup() * will return this NCE. Also no new IREs will be created that * point to this NCE (See ire_add_v6). Also no new timeouts will * be started (See NDP_RESTART_TIMER). * 2. Cancel any currently running timeouts. * 3. If there is an ndp walker, return. The walker will do the cleanup. * This ensures that walkers see a consistent list of NCEs while walking. * 4. Otherwise remove the NCE from the list of NCEs * 5. Delete all IREs pointing to this NCE. */ void ndp_delete(nce_t *nce) { nce_t **ptpn; nce_t *nce1; int ipversion = nce->nce_ipversion; ndp_g_t *ndp; ip_stack_t *ipst = nce->nce_ill->ill_ipst; if (ipversion == IPV4_VERSION) ndp = ipst->ips_ndp4; else ndp = ipst->ips_ndp6; /* Serialize deletes */ mutex_enter(&nce->nce_lock); if (nce->nce_flags & NCE_F_CONDEMNED) { /* Some other thread is doing the delete */ mutex_exit(&nce->nce_lock); return; } /* * Caller has a refhold. Also 1 ref for being in the list. Thus * refcnt has to be >= 2 */ ASSERT(nce->nce_refcnt >= 2); nce->nce_flags |= NCE_F_CONDEMNED; mutex_exit(&nce->nce_lock); nce_fastpath_list_delete(nce); /* * Cancel any running timer. Timeout can't be restarted * since CONDEMNED is set. Can't hold nce_lock across untimeout. * Passing invalid timeout id is fine. */ if (nce->nce_timeout_id != 0) { (void) untimeout(nce->nce_timeout_id); nce->nce_timeout_id = 0; } mutex_enter(&ndp->ndp_g_lock); if (nce->nce_ptpn == NULL) { /* * The last ndp walker has already removed this nce from * the list after we marked the nce CONDEMNED and before * we grabbed the global lock. */ mutex_exit(&ndp->ndp_g_lock); return; } if (ndp->ndp_g_walker > 0) { /* * Can't unlink. The walker will clean up */ ndp->ndp_g_walker_cleanup = B_TRUE; mutex_exit(&ndp->ndp_g_lock); return; } /* * Now remove the nce from the list. NDP_RESTART_TIMER won't restart * the timer since it is marked CONDEMNED. */ ptpn = nce->nce_ptpn; nce1 = nce->nce_next; if (nce1 != NULL) nce1->nce_ptpn = ptpn; *ptpn = nce1; nce->nce_ptpn = NULL; nce->nce_next = NULL; mutex_exit(&ndp->ndp_g_lock); nce_ire_delete(nce); } void ndp_inactive(nce_t *nce) { mblk_t **mpp; ill_t *ill; ASSERT(nce->nce_refcnt == 0); ASSERT(MUTEX_HELD(&nce->nce_lock)); ASSERT(nce->nce_fastpath == NULL); /* Free all nce allocated messages */ mpp = &nce->nce_first_mp_to_free; do { while (*mpp != NULL) { mblk_t *mp; mp = *mpp; *mpp = mp->b_next; inet_freemsg(mp); } } while (mpp++ != &nce->nce_last_mp_to_free); #ifdef DEBUG nce_trace_cleanup(nce); #endif ill = nce->nce_ill; mutex_enter(&ill->ill_lock); DTRACE_PROBE3(ill__decr__cnt, (ill_t *), ill, (char *), "nce", (void *), nce); ill->ill_nce_cnt--; /* * If the number of nce's associated with this ill have dropped * to zero, check whether we need to restart any operation that * is waiting for this to happen. */ if (ILL_DOWN_OK(ill)) { /* ipif_ill_refrele_tail drops the ill_lock */ ipif_ill_refrele_tail(ill); } else { mutex_exit(&ill->ill_lock); } mutex_destroy(&nce->nce_lock); if (nce->nce_mp != NULL) inet_freemsg(nce->nce_mp); } /* * ndp_walk routine. Delete the nce if it is associated with the ill * that is going away. Always called as a writer. */ void ndp_delete_per_ill(nce_t *nce, uchar_t *arg) { if ((nce != NULL) && nce->nce_ill == (ill_t *)arg) { ndp_delete(nce); } } /* * Walk a list of to be inactive NCEs and blow away all the ires. */ static void nce_ire_delete_list(nce_t *nce) { nce_t *nce_next; ASSERT(nce != NULL); while (nce != NULL) { nce_next = nce->nce_next; nce->nce_next = NULL; /* * It is possible for the last ndp walker (this thread) * to come here after ndp_delete has marked the nce CONDEMNED * and before it has removed the nce from the fastpath list * or called untimeout. So we need to do it here. It is safe * for both ndp_delete and this thread to do it twice or * even simultaneously since each of the threads has a * reference on the nce. */ nce_fastpath_list_delete(nce); /* * Cancel any running timer. Timeout can't be restarted * since CONDEMNED is set. Can't hold nce_lock across untimeout. * Passing invalid timeout id is fine. */ if (nce->nce_timeout_id != 0) { (void) untimeout(nce->nce_timeout_id); nce->nce_timeout_id = 0; } /* * We might hit this func thus in the v4 case: * ipif_down->ipif_ndp_down->ndp_walk */ if (nce->nce_ipversion == IPV4_VERSION) { ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE, nce_ire_delete1, (char *)nce, nce->nce_ill); } else { ASSERT(nce->nce_ipversion == IPV6_VERSION); ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE, nce_ire_delete1, (char *)nce, nce->nce_ill); } NCE_REFRELE_NOTR(nce); nce = nce_next; } } /* * Delete an ire when the nce goes away. */ /* ARGSUSED */ static void nce_ire_delete(nce_t *nce) { if (nce->nce_ipversion == IPV6_VERSION) { ire_walk_ill_v6(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE, nce_ire_delete1, (char *)nce, nce->nce_ill); NCE_REFRELE_NOTR(nce); } else { ire_walk_ill_v4(MATCH_IRE_ILL | MATCH_IRE_TYPE, IRE_CACHE, nce_ire_delete1, (char *)nce, nce->nce_ill); NCE_REFRELE_NOTR(nce); } } /* * ire_walk routine used to delete every IRE that shares this nce */ static void nce_ire_delete1(ire_t *ire, char *nce_arg) { nce_t *nce = (nce_t *)nce_arg; ASSERT(ire->ire_type == IRE_CACHE); if (ire->ire_nce == nce) { ASSERT(ire->ire_ipversion == nce->nce_ipversion); ire_delete(ire); } } /* * Restart DAD on given NCE. Returns B_TRUE if DAD has been restarted. */ boolean_t ndp_restart_dad(nce_t *nce) { boolean_t started; boolean_t dropped; if (nce == NULL) return (B_FALSE); mutex_enter(&nce->nce_lock); if (nce->nce_state == ND_PROBE) { mutex_exit(&nce->nce_lock); started = B_TRUE; } else if (nce->nce_state == ND_REACHABLE) { nce->nce_state = ND_PROBE; nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT - 1; mutex_exit(&nce->nce_lock); dropped = nce_xmit(nce->nce_ill, ND_NEIGHBOR_SOLICIT, NULL, B_FALSE, &ipv6_all_zeros, &nce->nce_addr, NDP_PROBE); if (dropped) { mutex_enter(&nce->nce_lock); nce->nce_pcnt++; mutex_exit(&nce->nce_lock); } NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(nce->nce_ill)); started = B_TRUE; } else { mutex_exit(&nce->nce_lock); started = B_FALSE; } return (started); } /* * IPv6 Cache entry lookup. Try to find an nce matching the parameters passed. * If one is found, the refcnt on the nce will be incremented. */ nce_t * ndp_lookup_v6(ill_t *ill, const in6_addr_t *addr, boolean_t caller_holds_lock) { nce_t *nce; ip_stack_t *ipst; ASSERT(ill != NULL); ipst = ill->ill_ipst; ASSERT(ill != NULL && ill->ill_isv6); if (!caller_holds_lock) { mutex_enter(&ipst->ips_ndp6->ndp_g_lock); } /* Get head of v6 hash table */ nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr)); nce = nce_lookup_addr(ill, addr, nce); if (nce == NULL) nce = nce_lookup_mapping(ill, addr); if (!caller_holds_lock) mutex_exit(&ipst->ips_ndp6->ndp_g_lock); return (nce); } /* * IPv4 Cache entry lookup. Try to find an nce matching the parameters passed. * If one is found, the refcnt on the nce will be incremented. * Since multicast mappings are handled in arp, there are no nce_mcast_entries * so we skip the nce_lookup_mapping call. * XXX TODO: if the nce is found to be ND_STALE, ndp_delete it and return NULL */ nce_t * ndp_lookup_v4(ill_t *ill, const in_addr_t *addr, boolean_t caller_holds_lock) { nce_t *nce; in6_addr_t addr6; ip_stack_t *ipst = ill->ill_ipst; if (!caller_holds_lock) { mutex_enter(&ipst->ips_ndp4->ndp_g_lock); } /* Get head of v4 hash table */ nce = *((nce_t **)NCE_HASH_PTR_V4(ipst, *addr)); IN6_IPADDR_TO_V4MAPPED(*addr, &addr6); nce = nce_lookup_addr(ill, &addr6, nce); if (!caller_holds_lock) mutex_exit(&ipst->ips_ndp4->ndp_g_lock); return (nce); } /* * Cache entry lookup. Try to find an nce matching the parameters passed. * Look only for exact entries (no mappings). If an nce is found, increment * the hold count on that nce. The caller passes in the start of the * appropriate hash table, and must be holding the appropriate global * lock (ndp_g_lock). */ static nce_t * nce_lookup_addr(ill_t *ill, const in6_addr_t *addr, nce_t *nce) { ndp_g_t *ndp; ip_stack_t *ipst = ill->ill_ipst; if (ill->ill_isv6) ndp = ipst->ips_ndp6; else ndp = ipst->ips_ndp4; ASSERT(ill != NULL); ASSERT(MUTEX_HELD(&ndp->ndp_g_lock)); if (IN6_IS_ADDR_UNSPECIFIED(addr)) return (NULL); for (; nce != NULL; nce = nce->nce_next) { if (nce->nce_ill == ill) { if (IN6_ARE_ADDR_EQUAL(&nce->nce_addr, addr) && IN6_ARE_ADDR_EQUAL(&nce->nce_mask, &ipv6_all_ones)) { mutex_enter(&nce->nce_lock); if (!(nce->nce_flags & NCE_F_CONDEMNED)) { NCE_REFHOLD_LOCKED(nce); mutex_exit(&nce->nce_lock); break; } mutex_exit(&nce->nce_lock); } } } return (nce); } /* * Cache entry lookup. Try to find an nce matching the parameters passed. * Look only for mappings. */ static nce_t * nce_lookup_mapping(ill_t *ill, const in6_addr_t *addr) { nce_t *nce; ip_stack_t *ipst = ill->ill_ipst; ASSERT(ill != NULL && ill->ill_isv6); ASSERT(MUTEX_HELD(&ipst->ips_ndp6->ndp_g_lock)); if (!IN6_IS_ADDR_MULTICAST(addr)) return (NULL); nce = ipst->ips_ndp6->nce_mask_entries; for (; nce != NULL; nce = nce->nce_next) if (nce->nce_ill == ill && (V6_MASK_EQ(*addr, nce->nce_mask, nce->nce_addr))) { mutex_enter(&nce->nce_lock); if (!(nce->nce_flags & NCE_F_CONDEMNED)) { NCE_REFHOLD_LOCKED(nce); mutex_exit(&nce->nce_lock); break; } mutex_exit(&nce->nce_lock); } return (nce); } /* * Process passed in parameters either from an incoming packet or via * user ioctl. */ void ndp_process(nce_t *nce, uchar_t *hw_addr, uint32_t flag, boolean_t is_adv) { ill_t *ill = nce->nce_ill; uint32_t hw_addr_len = ill->ill_nd_lla_len; mblk_t *mp; boolean_t ll_updated = B_FALSE; boolean_t ll_changed; ip_stack_t *ipst = ill->ill_ipst; ASSERT(nce->nce_ipversion == IPV6_VERSION); /* * No updates of link layer address or the neighbor state is * allowed, when the cache is in NONUD state. This still * allows for responding to reachability solicitation. */ mutex_enter(&nce->nce_lock); if (nce->nce_state == ND_INCOMPLETE) { if (hw_addr == NULL) { mutex_exit(&nce->nce_lock); return; } nce_set_ll(nce, hw_addr); /* * Update nce state and send the queued packets * back to ip this time ire will be added. */ if (flag & ND_NA_FLAG_SOLICITED) { nce_update(nce, ND_REACHABLE, NULL); } else { nce_update(nce, ND_STALE, NULL); } mutex_exit(&nce->nce_lock); nce_fastpath(nce); mutex_enter(&nce->nce_lock); mp = nce->nce_qd_mp; nce->nce_qd_mp = NULL; mutex_exit(&nce->nce_lock); while (mp != NULL) { mblk_t *nxt_mp, *data_mp; nxt_mp = mp->b_next; mp->b_next = NULL; if (mp->b_datap->db_type == M_CTL) data_mp = mp->b_cont; else data_mp = mp; if (data_mp->b_prev != NULL) { ill_t *inbound_ill; queue_t *fwdq = NULL; uint_t ifindex; ifindex = (uint_t)(uintptr_t)data_mp->b_prev; inbound_ill = ill_lookup_on_ifindex(ifindex, B_TRUE, NULL, NULL, NULL, NULL, ipst); if (inbound_ill == NULL) { data_mp->b_prev = NULL; freemsg(mp); return; } else { fwdq = inbound_ill->ill_rq; } data_mp->b_prev = NULL; /* * Send a forwarded packet back into ip_rput_v6 * just as in ire_send_v6(). * Extract the queue from b_prev (set in * ip_rput_data_v6). */ if (fwdq != NULL) { /* * Forwarded packets hop count will * get decremented in ip_rput_data_v6 */ if (data_mp != mp) freeb(mp); put(fwdq, data_mp); } else { /* * Send locally originated packets back * into * ip_wput_v6. */ put(ill->ill_wq, mp); } ill_refrele(inbound_ill); } else { put(ill->ill_wq, mp); } mp = nxt_mp; } return; } ll_changed = nce_cmp_ll_addr(nce, hw_addr, hw_addr_len); if (!is_adv) { /* If this is a SOLICITATION request only */ if (ll_changed) nce_update(nce, ND_STALE, hw_addr); mutex_exit(&nce->nce_lock); return; } if (!(flag & ND_NA_FLAG_OVERRIDE) && ll_changed) { /* If in any other state than REACHABLE, ignore */ if (nce->nce_state == ND_REACHABLE) { nce_update(nce, ND_STALE, NULL); } mutex_exit(&nce->nce_lock); return; } else { if (ll_changed) { nce_update(nce, ND_UNCHANGED, hw_addr); ll_updated = B_TRUE; } if (flag & ND_NA_FLAG_SOLICITED) { nce_update(nce, ND_REACHABLE, NULL); } else { if (ll_updated) { nce_update(nce, ND_STALE, NULL); } } mutex_exit(&nce->nce_lock); if (!(flag & ND_NA_FLAG_ROUTER) && (nce->nce_flags & NCE_F_ISROUTER)) { ire_t *ire; /* * Router turned to host. We need to remove the * entry as well as any default route that may be * using this as a next hop. This is required by * section 7.2.5 of RFC 2461. */ ire = ire_ftable_lookup_v6(&ipv6_all_zeros, &ipv6_all_zeros, &nce->nce_addr, IRE_DEFAULT, nce->nce_ill->ill_ipif, NULL, ALL_ZONES, 0, NULL, MATCH_IRE_ILL | MATCH_IRE_TYPE | MATCH_IRE_GW | MATCH_IRE_DEFAULT, ipst); if (ire != NULL) { ip_rts_rtmsg(RTM_DELETE, ire, 0, ipst); ire_delete(ire); ire_refrele(ire); } ndp_delete(nce); } } } /* * Pass arg1 to the pfi supplied, along with each nce in existence. * ndp_walk() places a REFHOLD on the nce and drops the lock when * walking the hash list. */ void ndp_walk_common(ndp_g_t *ndp, ill_t *ill, pfi_t pfi, void *arg1, boolean_t trace) { nce_t *nce; nce_t *nce1; nce_t **ncep; nce_t *free_nce_list = NULL; mutex_enter(&ndp->ndp_g_lock); /* Prevent ndp_delete from unlink and free of NCE */ ndp->ndp_g_walker++; mutex_exit(&ndp->ndp_g_lock); for (ncep = ndp->nce_hash_tbl; ncep < A_END(ndp->nce_hash_tbl); ncep++) { for (nce = *ncep; nce != NULL; nce = nce1) { nce1 = nce->nce_next; if (ill == NULL || nce->nce_ill == ill) { if (trace) { NCE_REFHOLD(nce); (*pfi)(nce, arg1); NCE_REFRELE(nce); } else { NCE_REFHOLD_NOTR(nce); (*pfi)(nce, arg1); NCE_REFRELE_NOTR(nce); } } } } for (nce = ndp->nce_mask_entries; nce != NULL; nce = nce1) { nce1 = nce->nce_next; if (ill == NULL || nce->nce_ill == ill) { if (trace) { NCE_REFHOLD(nce); (*pfi)(nce, arg1); NCE_REFRELE(nce); } else { NCE_REFHOLD_NOTR(nce); (*pfi)(nce, arg1); NCE_REFRELE_NOTR(nce); } } } mutex_enter(&ndp->ndp_g_lock); ndp->ndp_g_walker--; /* * While NCE's are removed from global list they are placed * in a private list, to be passed to nce_ire_delete_list(). * The reason is, there may be ires pointing to this nce * which needs to cleaned up. */ if (ndp->ndp_g_walker_cleanup && ndp->ndp_g_walker == 0) { /* Time to delete condemned entries */ for (ncep = ndp->nce_hash_tbl; ncep < A_END(ndp->nce_hash_tbl); ncep++) { nce = *ncep; if (nce != NULL) { nce_remove(ndp, nce, &free_nce_list); } } nce = ndp->nce_mask_entries; if (nce != NULL) { nce_remove(ndp, nce, &free_nce_list); } ndp->ndp_g_walker_cleanup = B_FALSE; } mutex_exit(&ndp->ndp_g_lock); if (free_nce_list != NULL) { nce_ire_delete_list(free_nce_list); } } /* * Walk everything. * Note that ill can be NULL hence can't derive the ipst from it. */ void ndp_walk(ill_t *ill, pfi_t pfi, void *arg1, ip_stack_t *ipst) { ndp_walk_common(ipst->ips_ndp4, ill, pfi, arg1, B_TRUE); ndp_walk_common(ipst->ips_ndp6, ill, pfi, arg1, B_TRUE); } /* * Process resolve requests. Handles both mapped entries * as well as cases that needs to be send out on the wire. * Lookup a NCE for a given IRE. Regardless of whether one exists * or one is created, we defer making ire point to nce until the * ire is actually added at which point the nce_refcnt on the nce is * incremented. This is done primarily to have symmetry between ire_add() * and ire_delete() which decrements the nce_refcnt, when an ire is deleted. */ int ndp_resolver(ill_t *ill, const in6_addr_t *dst, mblk_t *mp, zoneid_t zoneid) { nce_t *nce; int err = 0; uint32_t ms; mblk_t *mp_nce = NULL; ip_stack_t *ipst = ill->ill_ipst; ASSERT(ill->ill_isv6); if (IN6_IS_ADDR_MULTICAST(dst)) { err = nce_set_multicast(ill, dst); return (err); } err = ndp_lookup_then_add_v6(ill, NULL, /* No hardware address */ dst, &ipv6_all_ones, &ipv6_all_zeros, 0, (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0, ND_INCOMPLETE, &nce); switch (err) { case 0: /* * New cache entry was created. Make sure that the state * is not ND_INCOMPLETE. It can be in some other state * even before we send out the solicitation as we could * get un-solicited advertisements. * * If this is an XRESOLV interface, simply return 0, * since we don't want to solicit just yet. */ if (ill->ill_flags & ILLF_XRESOLV) { NCE_REFRELE(nce); return (0); } rw_enter(&ipst->ips_ill_g_lock, RW_READER); mutex_enter(&nce->nce_lock); if (nce->nce_state != ND_INCOMPLETE) { mutex_exit(&nce->nce_lock); rw_exit(&ipst->ips_ill_g_lock); NCE_REFRELE(nce); return (0); } mp_nce = ip_prepend_zoneid(mp, zoneid, ipst); if (mp_nce == NULL) { /* The caller will free mp */ mutex_exit(&nce->nce_lock); rw_exit(&ipst->ips_ill_g_lock); ndp_delete(nce); NCE_REFRELE(nce); return (ENOMEM); } ms = nce_solicit(nce, mp_nce); rw_exit(&ipst->ips_ill_g_lock); if (ms == 0) { /* The caller will free mp */ if (mp_nce != mp) freeb(mp_nce); mutex_exit(&nce->nce_lock); ndp_delete(nce); NCE_REFRELE(nce); return (EBUSY); } mutex_exit(&nce->nce_lock); NDP_RESTART_TIMER(nce, (clock_t)ms); NCE_REFRELE(nce); return (EINPROGRESS); case EEXIST: /* Resolution in progress just queue the packet */ mutex_enter(&nce->nce_lock); if (nce->nce_state == ND_INCOMPLETE) { mp_nce = ip_prepend_zoneid(mp, zoneid, ipst); if (mp_nce == NULL) { err = ENOMEM; } else { nce_queue_mp(nce, mp_nce); err = EINPROGRESS; } } else { /* * Any other state implies we have * a nce but IRE needs to be added ... * ire_add_v6() will take care of the * the case when the nce becomes CONDEMNED * before the ire is added to the table. */ err = 0; } mutex_exit(&nce->nce_lock); NCE_REFRELE(nce); break; default: ip1dbg(("ndp_resolver: Can't create NCE %d\n", err)); break; } return (err); } /* * When there is no resolver, the link layer template is passed in * the IRE. * Lookup a NCE for a given IRE. Regardless of whether one exists * or one is created, we defer making ire point to nce until the * ire is actually added at which point the nce_refcnt on the nce is * incremented. This is done primarily to have symmetry between ire_add() * and ire_delete() which decrements the nce_refcnt, when an ire is deleted. */ int ndp_noresolver(ill_t *ill, const in6_addr_t *dst) { nce_t *nce; int err = 0; ASSERT(ill != NULL); ASSERT(ill->ill_isv6); if (IN6_IS_ADDR_MULTICAST(dst)) { err = nce_set_multicast(ill, dst); return (err); } err = ndp_lookup_then_add_v6(ill, NULL, /* hardware address */ dst, &ipv6_all_ones, &ipv6_all_zeros, 0, (ill->ill_flags & ILLF_NONUD) ? NCE_F_NONUD : 0, ND_REACHABLE, &nce); switch (err) { case 0: /* * Cache entry with a proper resolver cookie was * created. */ NCE_REFRELE(nce); break; case EEXIST: err = 0; NCE_REFRELE(nce); break; default: ip1dbg(("ndp_noresolver: Can't create NCE %d\n", err)); break; } return (err); } /* * For each interface an entry is added for the unspecified multicast group. * Here that mapping is used to form the multicast cache entry for a particular * multicast destination. */ static int nce_set_multicast(ill_t *ill, const in6_addr_t *dst) { nce_t *mnce; /* Multicast mapping entry */ nce_t *nce; uchar_t *hw_addr = NULL; int err = 0; ip_stack_t *ipst = ill->ill_ipst; ASSERT(ill != NULL); ASSERT(ill->ill_isv6); ASSERT(!(IN6_IS_ADDR_UNSPECIFIED(dst))); mutex_enter(&ipst->ips_ndp6->ndp_g_lock); nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *dst)); nce = nce_lookup_addr(ill, dst, nce); if (nce != NULL) { mutex_exit(&ipst->ips_ndp6->ndp_g_lock); NCE_REFRELE(nce); return (0); } /* No entry, now lookup for a mapping this should never fail */ mnce = nce_lookup_mapping(ill, dst); if (mnce == NULL) { /* Something broken for the interface. */ mutex_exit(&ipst->ips_ndp6->ndp_g_lock); return (ESRCH); } ASSERT(mnce->nce_flags & NCE_F_MAPPING); if (ill->ill_net_type == IRE_IF_RESOLVER) { /* * For IRE_IF_RESOLVER a hardware mapping can be * generated, for IRE_IF_NORESOLVER, resolution cookie * in the ill is copied in ndp_add_v6(). */ hw_addr = kmem_alloc(ill->ill_nd_lla_len, KM_NOSLEEP); if (hw_addr == NULL) { mutex_exit(&ipst->ips_ndp6->ndp_g_lock); NCE_REFRELE(mnce); return (ENOMEM); } nce_make_mapping(mnce, hw_addr, (uchar_t *)dst); } NCE_REFRELE(mnce); /* * IRE_IF_NORESOLVER type simply copies the resolution * cookie passed in. So no hw_addr is needed. */ err = ndp_add_v6(ill, hw_addr, dst, &ipv6_all_ones, &ipv6_all_zeros, 0, NCE_F_NONUD, ND_REACHABLE, &nce); mutex_exit(&ipst->ips_ndp6->ndp_g_lock); if (hw_addr != NULL) kmem_free(hw_addr, ill->ill_nd_lla_len); if (err != 0) { ip1dbg(("nce_set_multicast: create failed" "%d\n", err)); return (err); } NCE_REFRELE(nce); return (0); } /* * Return the link layer address, and any flags of a nce. */ int ndp_query(ill_t *ill, struct lif_nd_req *lnr) { nce_t *nce; in6_addr_t *addr; sin6_t *sin6; dl_unitdata_req_t *dl; ASSERT(ill != NULL && ill->ill_isv6); sin6 = (sin6_t *)&lnr->lnr_addr; addr = &sin6->sin6_addr; nce = ndp_lookup_v6(ill, addr, B_FALSE); if (nce == NULL) return (ESRCH); /* If in INCOMPLETE state, no link layer address is available yet */ if (nce->nce_state == ND_INCOMPLETE) goto done; dl = (dl_unitdata_req_t *)nce->nce_res_mp->b_rptr; if (ill->ill_flags & ILLF_XRESOLV) lnr->lnr_hdw_len = dl->dl_dest_addr_length; else lnr->lnr_hdw_len = ill->ill_nd_lla_len; ASSERT(NCE_LL_ADDR_OFFSET(ill) + lnr->lnr_hdw_len <= sizeof (lnr->lnr_hdw_addr)); bcopy(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill), (uchar_t *)&lnr->lnr_hdw_addr, lnr->lnr_hdw_len); if (nce->nce_flags & NCE_F_ISROUTER) lnr->lnr_flags = NDF_ISROUTER_ON; if (nce->nce_flags & NCE_F_ANYCAST) lnr->lnr_flags |= NDF_ANYCAST_ON; done: NCE_REFRELE(nce); return (0); } /* * Send Enable/Disable multicast reqs to driver. */ int ndp_mcastreq(ill_t *ill, const in6_addr_t *addr, uint32_t hw_addr_len, uint32_t hw_addr_offset, mblk_t *mp) { nce_t *nce; uchar_t *hw_addr; ip_stack_t *ipst = ill->ill_ipst; ASSERT(ill != NULL && ill->ill_isv6); ASSERT(ill->ill_net_type == IRE_IF_RESOLVER); hw_addr = mi_offset_paramc(mp, hw_addr_offset, hw_addr_len); if (hw_addr == NULL || !IN6_IS_ADDR_MULTICAST(addr)) { freemsg(mp); return (EINVAL); } mutex_enter(&ipst->ips_ndp6->ndp_g_lock); nce = nce_lookup_mapping(ill, addr); if (nce == NULL) { mutex_exit(&ipst->ips_ndp6->ndp_g_lock); freemsg(mp); return (ESRCH); } mutex_exit(&ipst->ips_ndp6->ndp_g_lock); /* * Update dl_addr_length and dl_addr_offset for primitives that * have physical addresses as opposed to full saps */ switch (((union DL_primitives *)mp->b_rptr)->dl_primitive) { case DL_ENABMULTI_REQ: /* Track the state if this is the first enabmulti */ if (ill->ill_dlpi_multicast_state == IDS_UNKNOWN) ill->ill_dlpi_multicast_state = IDS_INPROGRESS; ip1dbg(("ndp_mcastreq: ENABMULTI\n")); break; case DL_DISABMULTI_REQ: ip1dbg(("ndp_mcastreq: DISABMULTI\n")); break; default: NCE_REFRELE(nce); ip1dbg(("ndp_mcastreq: default\n")); return (EINVAL); } nce_make_mapping(nce, hw_addr, (uchar_t *)addr); NCE_REFRELE(nce); ill_dlpi_send(ill, mp); return (0); } /* * Send a neighbor solicitation. * Returns number of milliseconds after which we should either rexmit or abort. * Return of zero means we should abort. * The caller holds the nce_lock to protect nce_qd_mp and nce_rcnt. * * NOTE: This routine drops nce_lock (and later reacquires it) when sending * the packet. * NOTE: This routine does not consume mp. */ uint32_t nce_solicit(nce_t *nce, mblk_t *mp) { ill_t *ill; ill_t *src_ill; ip6_t *ip6h; in6_addr_t src; in6_addr_t dst; ipif_t *ipif; ip6i_t *ip6i; boolean_t dropped = B_FALSE; ip_stack_t *ipst = nce->nce_ill->ill_ipst; ASSERT(RW_READ_HELD(&ipst->ips_ill_g_lock)); ASSERT(MUTEX_HELD(&nce->nce_lock)); ill = nce->nce_ill; ASSERT(ill != NULL); if (nce->nce_rcnt == 0) { return (0); } if (mp == NULL) { ASSERT(nce->nce_qd_mp != NULL); mp = nce->nce_qd_mp; } else { nce_queue_mp(nce, mp); } /* Handle ip_newroute_v6 giving us IPSEC packets */ if (mp->b_datap->db_type == M_CTL) mp = mp->b_cont; ip6h = (ip6_t *)mp->b_rptr; if (ip6h->ip6_nxt == IPPROTO_RAW) { /* * This message should have been pulled up already in * ip_wput_v6. We can't do pullups here because the message * could be from the nce_qd_mp which could have b_next/b_prev * non-NULL. */ ip6i = (ip6i_t *)ip6h; ASSERT((mp->b_wptr - (uchar_t *)ip6i) >= sizeof (ip6i_t) + IPV6_HDR_LEN); ip6h = (ip6_t *)(mp->b_rptr + sizeof (ip6i_t)); } src = ip6h->ip6_src; /* * If the src of outgoing packet is one of the assigned interface * addresses use it, otherwise we will pick the source address below. */ src_ill = ill; if (!IN6_IS_ADDR_UNSPECIFIED(&src)) { if (ill->ill_group != NULL) src_ill = ill->ill_group->illgrp_ill; for (; src_ill != NULL; src_ill = src_ill->ill_group_next) { for (ipif = src_ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { if (IN6_ARE_ADDR_EQUAL(&src, &ipif->ipif_v6lcl_addr)) { break; } } if (ipif != NULL) break; } /* * If no relevant ipif can be found, then it's not one of our * addresses. Reset to :: and let nce_xmit. If an ipif can be * found, but it's not yet done with DAD verification, then * just postpone this transmission until later. */ if (src_ill == NULL) src = ipv6_all_zeros; else if (!ipif->ipif_addr_ready) return (ill->ill_reachable_retrans_time); } dst = nce->nce_addr; /* * If source address is unspecified, nce_xmit will choose * one for us and initialize the hardware address also * appropriately. */ if (IN6_IS_ADDR_UNSPECIFIED(&src)) src_ill = NULL; nce->nce_rcnt--; mutex_exit(&nce->nce_lock); rw_exit(&ipst->ips_ill_g_lock); dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, src_ill, B_TRUE, &src, &dst, 0); rw_enter(&ipst->ips_ill_g_lock, RW_READER); mutex_enter(&nce->nce_lock); if (dropped) nce->nce_rcnt++; return (ill->ill_reachable_retrans_time); } /* * Attempt to recover an address on an interface that's been marked as a * duplicate. Because NCEs are destroyed when the interface goes down, there's * no easy way to just probe the address and have the right thing happen if * it's no longer in use. Instead, we just bring it up normally and allow the * regular interface start-up logic to probe for a remaining duplicate and take * us back down if necessary. * Neither DHCP nor temporary addresses arrive here; they're excluded by * ip_ndp_excl. */ /* ARGSUSED */ static void ip_ndp_recover(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg) { ill_t *ill = rq->q_ptr; ipif_t *ipif; in6_addr_t *addr = (in6_addr_t *)mp->b_rptr; for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { /* * We do not support recovery of proxy ARP'd interfaces, * because the system lacks a complete proxy ARP mechanism. */ if ((ipif->ipif_flags & IPIF_POINTOPOINT) || !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, addr)) { continue; } /* * If we have already recovered or if the interface is going * away, then ignore. */ mutex_enter(&ill->ill_lock); if (!(ipif->ipif_flags & IPIF_DUPLICATE) || (ipif->ipif_flags & (IPIF_MOVING | IPIF_CONDEMNED))) { mutex_exit(&ill->ill_lock); continue; } ipif->ipif_flags &= ~IPIF_DUPLICATE; ill->ill_ipif_dup_count--; mutex_exit(&ill->ill_lock); ipif->ipif_was_dup = B_TRUE; if (ipif_ndp_up(ipif) != EINPROGRESS) (void) ipif_up_done_v6(ipif); } freeb(mp); } /* * Attempt to recover an IPv6 interface that's been shut down as a duplicate. * As long as someone else holds the address, the interface will stay down. * When that conflict goes away, the interface is brought back up. This is * done so that accidental shutdowns of addresses aren't made permanent. Your * server will recover from a failure. * * For DHCP and temporary addresses, recovery is not done in the kernel. * Instead, it's handled by user space processes (dhcpagent and in.ndpd). * * This function is entered on a timer expiry; the ID is in ipif_recovery_id. */ static void ipif6_dup_recovery(void *arg) { ipif_t *ipif = arg; ipif->ipif_recovery_id = 0; if (!(ipif->ipif_flags & IPIF_DUPLICATE)) return; /* * No lock, because this is just an optimization. */ if (ipif->ipif_state_flags & (IPIF_MOVING | IPIF_CONDEMNED)) return; /* If the link is down, we'll retry this later */ if (!(ipif->ipif_ill->ill_phyint->phyint_flags & PHYI_RUNNING)) return; ndp_do_recovery(ipif); } /* * Perform interface recovery by forcing the duplicate interfaces up and * allowing the system to determine which ones should stay up. * * Called both by recovery timer expiry and link-up notification. */ void ndp_do_recovery(ipif_t *ipif) { ill_t *ill = ipif->ipif_ill; mblk_t *mp; ip_stack_t *ipst = ill->ill_ipst; mp = allocb(sizeof (ipif->ipif_v6lcl_addr), BPRI_MED); if (mp == NULL) { mutex_enter(&ill->ill_lock); if (ipif->ipif_recovery_id == 0 && !(ipif->ipif_state_flags & (IPIF_MOVING | IPIF_CONDEMNED))) { ipif->ipif_recovery_id = timeout(ipif6_dup_recovery, ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery)); } mutex_exit(&ill->ill_lock); } else { bcopy(&ipif->ipif_v6lcl_addr, mp->b_rptr, sizeof (ipif->ipif_v6lcl_addr)); ill_refhold(ill); qwriter_ip(ill, ill->ill_rq, mp, ip_ndp_recover, NEW_OP, B_FALSE); } } /* * Find the solicitation in the given message, and extract printable details * (MAC and IP addresses) from it. */ static nd_neighbor_solicit_t * ip_ndp_find_solicitation(mblk_t *mp, mblk_t *dl_mp, ill_t *ill, char *hbuf, size_t hlen, char *sbuf, size_t slen, uchar_t **haddr) { nd_neighbor_solicit_t *ns; ip6_t *ip6h; uchar_t *addr; int alen; alen = 0; ip6h = (ip6_t *)mp->b_rptr; if (dl_mp == NULL) { nd_opt_hdr_t *opt; int nslen; /* * If it's from the fast-path, then it can't be a probe * message, and thus must include the source linkaddr option. * Extract that here. */ ns = (nd_neighbor_solicit_t *)((char *)ip6h + IPV6_HDR_LEN); nslen = mp->b_wptr - (uchar_t *)ns; if ((nslen -= sizeof (*ns)) > 0) { opt = ndp_get_option((nd_opt_hdr_t *)(ns + 1), nslen, ND_OPT_SOURCE_LINKADDR); if (opt != NULL && opt->nd_opt_len * 8 - sizeof (*opt) >= ill->ill_nd_lla_len) { addr = (uchar_t *)(opt + 1); alen = ill->ill_nd_lla_len; } } /* * We cheat a bit here for the sake of printing usable log * messages in the rare case where the reply we got was unicast * without a source linkaddr option, and the interface is in * fastpath mode. (Sigh.) */ if (alen == 0 && ill->ill_type == IFT_ETHER && MBLKHEAD(mp) >= sizeof (struct ether_header)) { struct ether_header *pether; pether = (struct ether_header *)((char *)ip6h - sizeof (*pether)); addr = pether->ether_shost.ether_addr_octet; alen = ETHERADDRL; } } else { dl_unitdata_ind_t *dlu; dlu = (dl_unitdata_ind_t *)dl_mp->b_rptr; alen = dlu->dl_src_addr_length; if (alen > 0 && dlu->dl_src_addr_offset >= sizeof (*dlu) && dlu->dl_src_addr_offset + alen <= MBLKL(dl_mp)) { addr = dl_mp->b_rptr + dlu->dl_src_addr_offset; if (ill->ill_sap_length < 0) { alen += ill->ill_sap_length; } else { addr += ill->ill_sap_length; alen -= ill->ill_sap_length; } } } if (alen > 0) { *haddr = addr; (void) mac_colon_addr(addr, alen, hbuf, hlen); } else { *haddr = NULL; (void) strcpy(hbuf, "?"); } ns = (nd_neighbor_solicit_t *)((char *)ip6h + IPV6_HDR_LEN); (void) inet_ntop(AF_INET6, &ns->nd_ns_target, sbuf, slen); return (ns); } /* * This is for exclusive changes due to NDP duplicate address detection * failure. */ /* ARGSUSED */ static void ip_ndp_excl(ipsq_t *ipsq, queue_t *rq, mblk_t *mp, void *dummy_arg) { ill_t *ill = rq->q_ptr; ipif_t *ipif; char ibuf[LIFNAMSIZ + 10]; /* 10 digits for logical i/f number */ char hbuf[MAC_STR_LEN]; char sbuf[INET6_ADDRSTRLEN]; nd_neighbor_solicit_t *ns; mblk_t *dl_mp = NULL; uchar_t *haddr; ip_stack_t *ipst = ill->ill_ipst; if (DB_TYPE(mp) != M_DATA) { dl_mp = mp; mp = mp->b_cont; } ns = ip_ndp_find_solicitation(mp, dl_mp, ill, hbuf, sizeof (hbuf), sbuf, sizeof (sbuf), &haddr); if (haddr != NULL && bcmp(haddr, ill->ill_phys_addr, ill->ill_phys_addr_length) == 0) { /* * Ignore conflicts generated by misbehaving switches that just * reflect our own messages back to us. */ goto ignore_conflict; } for (ipif = ill->ill_ipif; ipif != NULL; ipif = ipif->ipif_next) { if ((ipif->ipif_flags & IPIF_POINTOPOINT) || !IN6_ARE_ADDR_EQUAL(&ipif->ipif_v6lcl_addr, &ns->nd_ns_target)) { continue; } /* If it's already marked, then don't do anything. */ if (ipif->ipif_flags & IPIF_DUPLICATE) continue; /* * If this is a failure during duplicate recovery, then don't * complain. It may take a long time to recover. */ if (!ipif->ipif_was_dup) { ipif_get_name(ipif, ibuf, sizeof (ibuf)); cmn_err(CE_WARN, "%s has duplicate address %s (in " "use by %s); disabled", ibuf, sbuf, hbuf); } mutex_enter(&ill->ill_lock); ASSERT(!(ipif->ipif_flags & IPIF_DUPLICATE)); ipif->ipif_flags |= IPIF_DUPLICATE; ill->ill_ipif_dup_count++; mutex_exit(&ill->ill_lock); (void) ipif_down(ipif, NULL, NULL); ipif_down_tail(ipif); mutex_enter(&ill->ill_lock); if (!(ipif->ipif_flags & (IPIF_DHCPRUNNING|IPIF_TEMPORARY)) && ill->ill_net_type == IRE_IF_RESOLVER && !(ipif->ipif_state_flags & (IPIF_MOVING | IPIF_CONDEMNED)) && ipst->ips_ip_dup_recovery > 0) { ipif->ipif_recovery_id = timeout(ipif6_dup_recovery, ipif, MSEC_TO_TICK(ipst->ips_ip_dup_recovery)); } mutex_exit(&ill->ill_lock); } ignore_conflict: if (dl_mp != NULL) freeb(dl_mp); freemsg(mp); } /* * Handle failure by tearing down the ipifs with the specified address. Note * that tearing down the ipif also means deleting the nce through ipif_down, so * it's not possible to do recovery by just restarting the nce timer. Instead, * we start a timer on the ipif. */ static void ip_ndp_failure(ill_t *ill, mblk_t *mp, mblk_t *dl_mp, nce_t *nce) { if ((mp = copymsg(mp)) != NULL) { if (dl_mp == NULL) dl_mp = mp; else if ((dl_mp = copyb(dl_mp)) != NULL) dl_mp->b_cont = mp; if (dl_mp == NULL) { freemsg(mp); } else { ill_refhold(ill); qwriter_ip(ill, ill->ill_rq, dl_mp, ip_ndp_excl, NEW_OP, B_FALSE); } } ndp_delete(nce); } /* * Handle a discovered conflict: some other system is advertising that it owns * one of our IP addresses. We need to defend ourselves, or just shut down the * interface. */ static void ip_ndp_conflict(ill_t *ill, mblk_t *mp, mblk_t *dl_mp, nce_t *nce) { ipif_t *ipif; uint32_t now; uint_t maxdefense; uint_t defs; ip_stack_t *ipst = ill->ill_ipst; ipif = ipif_lookup_addr_v6(&nce->nce_addr, ill, ALL_ZONES, NULL, NULL, NULL, NULL, ipst); if (ipif == NULL) return; /* * First, figure out if this address is disposable. */ if (ipif->ipif_flags & (IPIF_DHCPRUNNING | IPIF_TEMPORARY)) maxdefense = ipst->ips_ip_max_temp_defend; else maxdefense = ipst->ips_ip_max_defend; /* * Now figure out how many times we've defended ourselves. Ignore * defenses that happened long in the past. */ now = gethrestime_sec(); mutex_enter(&nce->nce_lock); if ((defs = nce->nce_defense_count) > 0 && now - nce->nce_defense_time > ipst->ips_ip_defend_interval) { nce->nce_defense_count = defs = 0; } nce->nce_defense_count++; nce->nce_defense_time = now; mutex_exit(&nce->nce_lock); ipif_refrele(ipif); /* * If we've defended ourselves too many times already, then give up and * tear down the interface(s) using this address. Otherwise, defend by * sending out an unsolicited Neighbor Advertisement. */ if (defs >= maxdefense) { ip_ndp_failure(ill, mp, dl_mp, nce); } else { char hbuf[MAC_STR_LEN]; char sbuf[INET6_ADDRSTRLEN]; uchar_t *haddr; (void) ip_ndp_find_solicitation(mp, dl_mp, ill, hbuf, sizeof (hbuf), sbuf, sizeof (sbuf), &haddr); cmn_err(CE_WARN, "node %s is using our IP address %s on %s", hbuf, sbuf, ill->ill_name); (void) nce_xmit(ill, ND_NEIGHBOR_ADVERT, ill, B_FALSE, &nce->nce_addr, &ipv6_all_hosts_mcast, nce_advert_flags(nce)); } } static void ndp_input_solicit(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) { nd_neighbor_solicit_t *ns; uint32_t hlen = ill->ill_nd_lla_len; uchar_t *haddr = NULL; icmp6_t *icmp_nd; ip6_t *ip6h; nce_t *our_nce = NULL; in6_addr_t target; in6_addr_t src; int len; int flag = 0; nd_opt_hdr_t *opt = NULL; boolean_t bad_solicit = B_FALSE; mib2_ipv6IfIcmpEntry_t *mib = ill->ill_icmp6_mib; ip6h = (ip6_t *)mp->b_rptr; icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN); len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN; src = ip6h->ip6_src; ns = (nd_neighbor_solicit_t *)icmp_nd; target = ns->nd_ns_target; if (IN6_IS_ADDR_MULTICAST(&target)) { if (ip_debug > 2) { /* ip1dbg */ pr_addr_dbg("ndp_input_solicit: Target is" " multicast! %s\n", AF_INET6, &target); } bad_solicit = B_TRUE; goto done; } if (len > sizeof (nd_neighbor_solicit_t)) { /* Options present */ opt = (nd_opt_hdr_t *)&ns[1]; len -= sizeof (nd_neighbor_solicit_t); if (!ndp_verify_optlen(opt, len)) { ip1dbg(("ndp_input_solicit: Bad opt len\n")); bad_solicit = B_TRUE; goto done; } } if (IN6_IS_ADDR_UNSPECIFIED(&src)) { /* Check to see if this is a valid DAD solicitation */ if (!IN6_IS_ADDR_MC_SOLICITEDNODE(&ip6h->ip6_dst)) { if (ip_debug > 2) { /* ip1dbg */ pr_addr_dbg("ndp_input_solicit: IPv6 " "Destination is not solicited node " "multicast %s\n", AF_INET6, &ip6h->ip6_dst); } bad_solicit = B_TRUE; goto done; } } our_nce = ndp_lookup_v6(ill, &target, B_FALSE); /* * If this is a valid Solicitation, a permanent * entry should exist in the cache */ if (our_nce == NULL || !(our_nce->nce_flags & NCE_F_PERMANENT)) { ip1dbg(("ndp_input_solicit: Wrong target in NS?!" "ifname=%s ", ill->ill_name)); if (ip_debug > 2) { /* ip1dbg */ pr_addr_dbg(" dst %s\n", AF_INET6, &target); } bad_solicit = B_TRUE; goto done; } /* At this point we should have a verified NS per spec */ if (opt != NULL) { opt = ndp_get_option(opt, len, ND_OPT_SOURCE_LINKADDR); if (opt != NULL) { haddr = (uchar_t *)&opt[1]; if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) || hlen == 0) { ip1dbg(("ndp_input_advert: bad SLLA\n")); bad_solicit = B_TRUE; goto done; } } } /* If sending directly to peer, set the unicast flag */ if (!IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) flag |= NDP_UNICAST; /* * Create/update the entry for the soliciting node. * or respond to outstanding queries, don't if * the source is unspecified address. */ if (!IN6_IS_ADDR_UNSPECIFIED(&src)) { int err; nce_t *nnce; ASSERT(ill->ill_isv6); /* * Regular solicitations *must* include the Source Link-Layer * Address option. Ignore messages that do not. */ if (haddr == NULL && IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst)) { ip1dbg(("ndp_input_solicit: source link-layer address " "option missing with a specified source.\n")); bad_solicit = B_TRUE; goto done; } /* * This is a regular solicitation. If we're still in the * process of verifying the address, then don't respond at all * and don't keep track of the sender. */ if (our_nce->nce_state == ND_PROBE) goto done; /* * If the solicitation doesn't have sender hardware address * (legal for unicast solicitation), then process without * installing the return NCE. Either we already know it, or * we'll be forced to look it up when (and if) we reply to the * packet. */ if (haddr == NULL) goto no_source; err = ndp_lookup_then_add_v6(ill, haddr, &src, /* Soliciting nodes address */ &ipv6_all_ones, &ipv6_all_zeros, 0, 0, ND_STALE, &nnce); switch (err) { case 0: /* done with this entry */ NCE_REFRELE(nnce); break; case EEXIST: /* * B_FALSE indicates this is not an * an advertisement. */ ndp_process(nnce, haddr, 0, B_FALSE); NCE_REFRELE(nnce); break; default: ip1dbg(("ndp_input_solicit: Can't create NCE %d\n", err)); goto done; } no_source: flag |= NDP_SOLICITED; } else { /* * No source link layer address option should be present in a * valid DAD request. */ if (haddr != NULL) { ip1dbg(("ndp_input_solicit: source link-layer address " "option present with an unspecified source.\n")); bad_solicit = B_TRUE; goto done; } if (our_nce->nce_state == ND_PROBE) { /* * Internally looped-back probes won't have DLPI * attached to them. External ones (which are sent by * multicast) always will. Just ignore our own * transmissions. */ if (dl_mp != NULL) { /* * If someone else is probing our address, then * we've crossed wires. Declare failure. */ ip_ndp_failure(ill, mp, dl_mp, our_nce); } goto done; } /* * This is a DAD probe. Multicast the advertisement to the * all-nodes address. */ src = ipv6_all_hosts_mcast; } flag |= nce_advert_flags(our_nce); /* Response to a solicitation */ (void) nce_xmit(ill, ND_NEIGHBOR_ADVERT, ill, /* ill to be used for extracting ill_nd_lla */ B_TRUE, /* use ill_nd_lla */ &target, /* Source and target of the advertisement pkt */ &src, /* IP Destination (source of original pkt) */ flag); done: if (bad_solicit) BUMP_MIB(mib, ipv6IfIcmpInBadNeighborSolicitations); if (our_nce != NULL) NCE_REFRELE(our_nce); } void ndp_input_advert(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) { nd_neighbor_advert_t *na; uint32_t hlen = ill->ill_nd_lla_len; uchar_t *haddr = NULL; icmp6_t *icmp_nd; ip6_t *ip6h; nce_t *dst_nce = NULL; in6_addr_t target; nd_opt_hdr_t *opt = NULL; int len; mib2_ipv6IfIcmpEntry_t *mib = ill->ill_icmp6_mib; ip_stack_t *ipst = ill->ill_ipst; ip6h = (ip6_t *)mp->b_rptr; icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN); len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN; na = (nd_neighbor_advert_t *)icmp_nd; if (IN6_IS_ADDR_MULTICAST(&ip6h->ip6_dst) && (na->nd_na_flags_reserved & ND_NA_FLAG_SOLICITED)) { ip1dbg(("ndp_input_advert: Target is multicast but the " "solicited flag is not zero\n")); BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements); return; } target = na->nd_na_target; if (IN6_IS_ADDR_MULTICAST(&target)) { ip1dbg(("ndp_input_advert: Target is multicast!\n")); BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements); return; } if (len > sizeof (nd_neighbor_advert_t)) { opt = (nd_opt_hdr_t *)&na[1]; if (!ndp_verify_optlen(opt, len - sizeof (nd_neighbor_advert_t))) { ip1dbg(("ndp_input_advert: cannot verify SLLA\n")); BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements); return; } /* At this point we have a verified NA per spec */ len -= sizeof (nd_neighbor_advert_t); opt = ndp_get_option(opt, len, ND_OPT_TARGET_LINKADDR); if (opt != NULL) { haddr = (uchar_t *)&opt[1]; if (hlen > opt->nd_opt_len * 8 - sizeof (*opt) || hlen == 0) { ip1dbg(("ndp_input_advert: bad SLLA\n")); BUMP_MIB(mib, ipv6IfIcmpInBadNeighborAdvertisements); return; } } } /* * If this interface is part of the group look at all the * ills in the group. */ rw_enter(&ipst->ips_ill_g_lock, RW_READER); if (ill->ill_group != NULL) ill = ill->ill_group->illgrp_ill; for (; ill != NULL; ill = ill->ill_group_next) { mutex_enter(&ill->ill_lock); if (!ILL_CAN_LOOKUP(ill)) { mutex_exit(&ill->ill_lock); continue; } ill_refhold_locked(ill); mutex_exit(&ill->ill_lock); dst_nce = ndp_lookup_v6(ill, &target, B_FALSE); /* We have to drop the lock since ndp_process calls put* */ rw_exit(&ipst->ips_ill_g_lock); if (dst_nce != NULL) { if ((dst_nce->nce_flags & NCE_F_PERMANENT) && dst_nce->nce_state == ND_PROBE) { /* * Someone else sent an advertisement for an * address that we're trying to configure. * Tear it down. Note that dl_mp might be NULL * if we're getting a unicast reply. This * isn't typically done (multicast is the norm * in response to a probe), but ip_ndp_failure * will handle the dl_mp == NULL case as well. */ ip_ndp_failure(ill, mp, dl_mp, dst_nce); } else if (dst_nce->nce_flags & NCE_F_PERMANENT) { /* * Someone just announced one of our local * addresses. If it wasn't us, then this is a * conflict. Defend the address or shut it * down. */ if (dl_mp != NULL && (haddr == NULL || nce_cmp_ll_addr(dst_nce, haddr, ill->ill_nd_lla_len))) { ip_ndp_conflict(ill, mp, dl_mp, dst_nce); } } else { if (na->nd_na_flags_reserved & ND_NA_FLAG_ROUTER) { dst_nce->nce_flags |= NCE_F_ISROUTER; } /* B_TRUE indicates this an advertisement */ ndp_process(dst_nce, haddr, na->nd_na_flags_reserved, B_TRUE); } NCE_REFRELE(dst_nce); } rw_enter(&ipst->ips_ill_g_lock, RW_READER); ill_refrele(ill); } rw_exit(&ipst->ips_ill_g_lock); } /* * Process NDP neighbor solicitation/advertisement messages. * The checksum has already checked o.k before reaching here. */ void ndp_input(ill_t *ill, mblk_t *mp, mblk_t *dl_mp) { icmp6_t *icmp_nd; ip6_t *ip6h; int len; mib2_ipv6IfIcmpEntry_t *mib = ill->ill_icmp6_mib; if (!pullupmsg(mp, -1)) { ip1dbg(("ndp_input: pullupmsg failed\n")); BUMP_MIB(ill->ill_ip_mib, ipIfStatsInDiscards); goto done; } ip6h = (ip6_t *)mp->b_rptr; if (ip6h->ip6_hops != IPV6_MAX_HOPS) { ip1dbg(("ndp_input: hoplimit != IPV6_MAX_HOPS\n")); BUMP_MIB(mib, ipv6IfIcmpBadHoplimit); goto done; } /* * NDP does not accept any extension headers between the * IP header and the ICMP header since e.g. a routing * header could be dangerous. * This assumes that any AH or ESP headers are removed * by ip prior to passing the packet to ndp_input. */ if (ip6h->ip6_nxt != IPPROTO_ICMPV6) { ip1dbg(("ndp_input: Wrong next header 0x%x\n", ip6h->ip6_nxt)); BUMP_MIB(mib, ipv6IfIcmpInErrors); goto done; } icmp_nd = (icmp6_t *)(mp->b_rptr + IPV6_HDR_LEN); ASSERT(icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT || icmp_nd->icmp6_type == ND_NEIGHBOR_ADVERT); if (icmp_nd->icmp6_code != 0) { ip1dbg(("ndp_input: icmp6 code != 0 \n")); BUMP_MIB(mib, ipv6IfIcmpInErrors); goto done; } len = mp->b_wptr - mp->b_rptr - IPV6_HDR_LEN; /* * Make sure packet length is large enough for either * a NS or a NA icmp packet. */ if (len < sizeof (struct icmp6_hdr) + sizeof (struct in6_addr)) { ip1dbg(("ndp_input: packet too short\n")); BUMP_MIB(mib, ipv6IfIcmpInErrors); goto done; } if (icmp_nd->icmp6_type == ND_NEIGHBOR_SOLICIT) { ndp_input_solicit(ill, mp, dl_mp); } else { ndp_input_advert(ill, mp, dl_mp); } done: freemsg(mp); } /* * nce_xmit is called to form and transmit a ND solicitation or * advertisement ICMP packet. * * If the source address is unspecified and this isn't a probe (used for * duplicate address detection), an appropriate source address and link layer * address will be chosen here. The link layer address option is included if * the source is specified (i.e., all non-probe packets), and omitted (per the * specification) otherwise. * * It returns B_FALSE only if it does a successful put() to the * corresponding ill's ill_wq otherwise returns B_TRUE. */ static boolean_t nce_xmit(ill_t *ill, uint32_t operation, ill_t *hwaddr_ill, boolean_t use_nd_lla, const in6_addr_t *sender, const in6_addr_t *target, int flag) { uint32_t len; icmp6_t *icmp6; mblk_t *mp; ip6_t *ip6h; nd_opt_hdr_t *opt; uint_t plen; ip6i_t *ip6i; ipif_t *src_ipif = NULL; uint8_t *hw_addr; zoneid_t zoneid = GLOBAL_ZONEID; /* * If we have a unspecified source(sender) address, select a * proper source address for the solicitation here itself so * that we can initialize the h/w address correctly. This is * needed for interface groups as source address can come from * the whole group and the h/w address initialized from ill will * be wrong if the source address comes from a different ill. * * If the sender is specified then we use this address in order * to lookup the zoneid before calling ip_output_v6(). This is to * enable unicast ND_NEIGHBOR_ADVERT packets to be routed correctly * by IP (we cannot guarantee that the global zone has an interface * route to the destination). * * Note that the NA never comes here with the unspecified source * address. The following asserts that whenever the source * address is specified, the haddr also should be specified. */ ASSERT(IN6_IS_ADDR_UNSPECIFIED(sender) || (hwaddr_ill != NULL)); if (IN6_IS_ADDR_UNSPECIFIED(sender) && !(flag & NDP_PROBE)) { ASSERT(operation != ND_NEIGHBOR_ADVERT); /* * Pick a source address for this solicitation, but * restrict the selection to addresses assigned to the * output interface (or interface group). We do this * because the destination will create a neighbor cache * entry for the source address of this packet, so the * source address had better be a valid neighbor. */ src_ipif = ipif_select_source_v6(ill, target, RESTRICT_TO_ILL, IPV6_PREFER_SRC_DEFAULT, ALL_ZONES); if (src_ipif == NULL) { char buf[INET6_ADDRSTRLEN]; ip1dbg(("nce_xmit: No source ipif for dst %s\n", inet_ntop(AF_INET6, (char *)target, buf, sizeof (buf)))); return (B_TRUE); } sender = &src_ipif->ipif_v6src_addr; hwaddr_ill = src_ipif->ipif_ill; } else if (!(IN6_IS_ADDR_UNSPECIFIED(sender))) { zoneid = ipif_lookup_addr_zoneid_v6(sender, ill, ill->ill_ipst); /* * It's possible for ipif_lookup_addr_zoneid_v6() to return * ALL_ZONES if it cannot find a matching ipif for the address * we are trying to use. In this case we err on the side of * trying to send the packet by defaulting to the GLOBAL_ZONEID. */ if (zoneid == ALL_ZONES) zoneid = GLOBAL_ZONEID; } /* * Always make sure that the NS/NA packets don't get load * spread. This is needed so that the probe packets sent * by the in.mpathd daemon can really go out on the desired * interface. Probe packets are made to go out on a desired * interface by including a ip6i with ATTACH_IF flag. As these * packets indirectly end up sending/receiving NS/NA packets * (neighbor doing NUD), we have to make sure that NA * also go out on the same interface. */ plen = (sizeof (nd_opt_hdr_t) + ill->ill_nd_lla_len + 7) / 8; len = IPV6_HDR_LEN + sizeof (ip6i_t) + sizeof (nd_neighbor_advert_t) + plen * 8; mp = allocb(len, BPRI_LO); if (mp == NULL) { if (src_ipif != NULL) ipif_refrele(src_ipif); return (B_TRUE); } bzero((char *)mp->b_rptr, len); mp->b_wptr = mp->b_rptr + len; ip6i = (ip6i_t *)mp->b_rptr; ip6i->ip6i_vcf = IPV6_DEFAULT_VERS_AND_FLOW; ip6i->ip6i_nxt = IPPROTO_RAW; ip6i->ip6i_flags = IP6I_ATTACH_IF | IP6I_HOPLIMIT; if (flag & NDP_PROBE) ip6i->ip6i_flags |= IP6I_UNSPEC_SRC; ip6i->ip6i_ifindex = ill->ill_phyint->phyint_ifindex; ip6h = (ip6_t *)(mp->b_rptr + sizeof (ip6i_t)); ip6h->ip6_vcf = IPV6_DEFAULT_VERS_AND_FLOW; ip6h->ip6_plen = htons(len - IPV6_HDR_LEN - sizeof (ip6i_t)); ip6h->ip6_nxt = IPPROTO_ICMPV6; ip6h->ip6_hops = IPV6_MAX_HOPS; ip6h->ip6_dst = *target; icmp6 = (icmp6_t *)&ip6h[1]; opt = (nd_opt_hdr_t *)((uint8_t *)ip6h + IPV6_HDR_LEN + sizeof (nd_neighbor_advert_t)); if (operation == ND_NEIGHBOR_SOLICIT) { nd_neighbor_solicit_t *ns = (nd_neighbor_solicit_t *)icmp6; if (!(flag & NDP_PROBE)) opt->nd_opt_type = ND_OPT_SOURCE_LINKADDR; ip6h->ip6_src = *sender; ns->nd_ns_target = *target; if (!(flag & NDP_UNICAST)) { /* Form multicast address of the target */ ip6h->ip6_dst = ipv6_solicited_node_mcast; ip6h->ip6_dst.s6_addr32[3] |= ns->nd_ns_target.s6_addr32[3]; } } else { nd_neighbor_advert_t *na = (nd_neighbor_advert_t *)icmp6; ASSERT(!(flag & NDP_PROBE)); opt->nd_opt_type = ND_OPT_TARGET_LINKADDR; ip6h->ip6_src = *sender; na->nd_na_target = *sender; if (flag & NDP_ISROUTER) na->nd_na_flags_reserved |= ND_NA_FLAG_ROUTER; if (flag & NDP_SOLICITED) na->nd_na_flags_reserved |= ND_NA_FLAG_SOLICITED; if (flag & NDP_ORIDE) na->nd_na_flags_reserved |= ND_NA_FLAG_OVERRIDE; } hw_addr = NULL; if (!(flag & NDP_PROBE)) { hw_addr = use_nd_lla ? hwaddr_ill->ill_nd_lla : hwaddr_ill->ill_phys_addr; if (hw_addr != NULL) { /* Fill in link layer address and option len */ opt->nd_opt_len = (uint8_t)plen; bcopy(hw_addr, &opt[1], hwaddr_ill->ill_nd_lla_len); } } if (hw_addr == NULL) { /* If there's no link layer address option, then strip it. */ len -= plen * 8; mp->b_wptr = mp->b_rptr + len; ip6h->ip6_plen = htons(len - IPV6_HDR_LEN - sizeof (ip6i_t)); } icmp6->icmp6_type = (uint8_t)operation; icmp6->icmp6_code = 0; /* * Prepare for checksum by putting icmp length in the icmp * checksum field. The checksum is calculated in ip_wput_v6. */ icmp6->icmp6_cksum = ip6h->ip6_plen; if (src_ipif != NULL) ipif_refrele(src_ipif); ip_output_v6((void *)(uintptr_t)zoneid, mp, ill->ill_wq, IP_WPUT); return (B_FALSE); } /* * Make a link layer address (does not include the SAP) from an nce. * To form the link layer address, use the last four bytes of ipv6 * address passed in and the fixed offset stored in nce. */ static void nce_make_mapping(nce_t *nce, uchar_t *addrpos, uchar_t *addr) { uchar_t *mask, *to; ill_t *ill = nce->nce_ill; int len; if (ill->ill_net_type == IRE_IF_NORESOLVER) return; ASSERT(nce->nce_res_mp != NULL); ASSERT(ill->ill_net_type == IRE_IF_RESOLVER); ASSERT(nce->nce_flags & NCE_F_MAPPING); ASSERT(!IN6_IS_ADDR_UNSPECIFIED(&nce->nce_extract_mask)); ASSERT(addr != NULL); bcopy(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill), addrpos, ill->ill_nd_lla_len); len = MIN((int)ill->ill_nd_lla_len - nce->nce_ll_extract_start, IPV6_ADDR_LEN); mask = (uchar_t *)&nce->nce_extract_mask; mask += (IPV6_ADDR_LEN - len); addr += (IPV6_ADDR_LEN - len); to = addrpos + nce->nce_ll_extract_start; while (len-- > 0) *to++ |= *mask++ & *addr++; } mblk_t * nce_udreq_alloc(ill_t *ill) { mblk_t *template_mp = NULL; dl_unitdata_req_t *dlur; int sap_length; ASSERT(ill->ill_isv6); sap_length = ill->ill_sap_length; template_mp = ip_dlpi_alloc(sizeof (dl_unitdata_req_t) + ill->ill_nd_lla_len + ABS(sap_length), DL_UNITDATA_REQ); if (template_mp == NULL) return (NULL); dlur = (dl_unitdata_req_t *)template_mp->b_rptr; dlur->dl_priority.dl_min = 0; dlur->dl_priority.dl_max = 0; dlur->dl_dest_addr_length = ABS(sap_length) + ill->ill_nd_lla_len; dlur->dl_dest_addr_offset = sizeof (dl_unitdata_req_t); /* Copy in the SAP value. */ NCE_LL_SAP_COPY(ill, template_mp); return (template_mp); } /* * NDP retransmit timer. * This timer goes off when: * a. It is time to retransmit NS for resolver. * b. It is time to send reachability probes. */ void ndp_timer(void *arg) { nce_t *nce = arg; ill_t *ill = nce->nce_ill; uint32_t ms; char addrbuf[INET6_ADDRSTRLEN]; mblk_t *mp; boolean_t dropped = B_FALSE; ip_stack_t *ipst = ill->ill_ipst; /* * The timer has to be cancelled by ndp_delete before doing the final * refrele. So the NCE is guaranteed to exist when the timer runs * until it clears the timeout_id. Before clearing the timeout_id * bump up the refcnt so that we can continue to use the nce */ ASSERT(nce != NULL); /* * Grab the ill_g_lock now itself to avoid lock order problems. * nce_solicit needs ill_g_lock to be able to traverse ills */ rw_enter(&ipst->ips_ill_g_lock, RW_READER); mutex_enter(&nce->nce_lock); NCE_REFHOLD_LOCKED(nce); nce->nce_timeout_id = 0; /* * Check the reachability state first. */ switch (nce->nce_state) { case ND_DELAY: rw_exit(&ipst->ips_ill_g_lock); nce->nce_state = ND_PROBE; mutex_exit(&nce->nce_lock); (void) nce_xmit(ill, ND_NEIGHBOR_SOLICIT, NULL, B_FALSE, &ipv6_all_zeros, &nce->nce_addr, NDP_UNICAST); if (ip_debug > 3) { /* ip2dbg */ pr_addr_dbg("ndp_timer: state for %s changed " "to PROBE\n", AF_INET6, &nce->nce_addr); } NDP_RESTART_TIMER(nce, ill->ill_reachable_retrans_time); NCE_REFRELE(nce); return; case ND_PROBE: /* must be retransmit timer */ rw_exit(&ipst->ips_ill_g_lock); nce->nce_pcnt--; ASSERT(nce->nce_pcnt < ND_MAX_UNICAST_SOLICIT && nce->nce_pcnt >= -1); if (nce->nce_pcnt > 0) { /* * As per RFC2461, the nce gets deleted after * MAX_UNICAST_SOLICIT unsuccessful re-transmissions. * Note that the first unicast solicitation is sent * during the DELAY state. */ ip2dbg(("ndp_timer: pcount=%x dst %s\n", nce->nce_pcnt, inet_ntop(AF_INET6, &nce->nce_addr, addrbuf, sizeof (addrbuf)))); mutex_exit(&nce->nce_lock); dropped = nce_xmit(ill, ND_NEIGHBOR_SOLICIT, NULL, B_FALSE, &ipv6_all_zeros, &nce->nce_addr, (nce->nce_flags & NCE_F_PERMANENT) ? NDP_PROBE : NDP_UNICAST); if (dropped) { mutex_enter(&nce->nce_lock); nce->nce_pcnt++; mutex_exit(&nce->nce_lock); } NDP_RESTART_TIMER(nce, ILL_PROBE_INTERVAL(ill)); } else if (nce->nce_pcnt < 0) { /* No hope, delete the nce */ nce->nce_state = ND_UNREACHABLE; mutex_exit(&nce->nce_lock); if (ip_debug > 2) { /* ip1dbg */ pr_addr_dbg("ndp_timer: Delete IRE for" " dst %s\n", AF_INET6, &nce->nce_addr); } ndp_delete(nce); } else if (!(nce->nce_flags & NCE_F_PERMANENT)) { /* Wait RetransTimer, before deleting the entry */ ip2dbg(("ndp_timer: pcount=%x dst %s\n", nce->nce_pcnt, inet_ntop(AF_INET6, &nce->nce_addr, addrbuf, sizeof (addrbuf)))); mutex_exit(&nce->nce_lock); /* Wait one interval before killing */ NDP_RESTART_TIMER(nce, ill->ill_reachable_retrans_time); } else if (ill->ill_phyint->phyint_flags & PHYI_RUNNING) { ipif_t *ipif; /* * We're done probing, and we can now declare this * address to be usable. Let IP know that it's ok to * use. */ nce->nce_state = ND_REACHABLE; mutex_exit(&nce->nce_lock); ipif = ipif_lookup_addr_v6(&nce->nce_addr, ill, ALL_ZONES, NULL, NULL, NULL, NULL, ipst); if (ipif != NULL) { if (ipif->ipif_was_dup) { char ibuf[LIFNAMSIZ + 10]; char sbuf[INET6_ADDRSTRLEN]; ipif->ipif_was_dup = B_FALSE; (void) inet_ntop(AF_INET6, &ipif->ipif_v6lcl_addr, sbuf, sizeof (sbuf)); ipif_get_name(ipif, ibuf, sizeof (ibuf)); cmn_err(CE_NOTE, "recovered address " "%s on %s", sbuf, ibuf); } if ((ipif->ipif_flags & IPIF_UP) && !ipif->ipif_addr_ready) { ip_rts_ifmsg(ipif); ip_rts_newaddrmsg(RTM_ADD, 0, ipif); sctp_update_ipif(ipif, SCTP_IPIF_UP); } ipif->ipif_addr_ready = 1; ipif_refrele(ipif); } /* Begin defending our new address */ nce->nce_unsolicit_count = 0; dropped = nce_xmit(ill, ND_NEIGHBOR_ADVERT, ill, B_FALSE, &nce->nce_addr, &ipv6_all_hosts_mcast, nce_advert_flags(nce)); if (dropped) { nce->nce_unsolicit_count = 1; NDP_RESTART_TIMER(nce, ipst->ips_ip_ndp_unsolicit_interval); } else if (ipst->ips_ip_ndp_defense_interval != 0) { NDP_RESTART_TIMER(nce, ipst->ips_ip_ndp_defense_interval); } } else { /* * This is an address we're probing to be our own, but * the ill is down. Wait until it comes back before * doing anything, but switch to reachable state so * that the restart will work. */ nce->nce_state = ND_REACHABLE; mutex_exit(&nce->nce_lock); } NCE_REFRELE(nce); return; case ND_INCOMPLETE: /* * Must be resolvers retransmit timer. */ for (mp = nce->nce_qd_mp; mp != NULL; mp = mp->b_next) { ip6i_t *ip6i; ip6_t *ip6h; mblk_t *data_mp; /* * Walk the list of packets queued, and see if there * are any multipathing probe packets. Such packets * are always queued at the head. Since this is a * retransmit timer firing, mark such packets as * delayed in ND resolution. This info will be used * in ip_wput_v6(). Multipathing probe packets will * always have an ip6i_t. Once we hit a packet without * it, we can break out of this loop. */ if (mp->b_datap->db_type == M_CTL) data_mp = mp->b_cont; else data_mp = mp; ip6h = (ip6_t *)data_mp->b_rptr; if (ip6h->ip6_nxt != IPPROTO_RAW) break; /* * This message should have been pulled up already in * ip_wput_v6. We can't do pullups here because the * b_next/b_prev is non-NULL. */ ip6i = (ip6i_t *)ip6h; ASSERT((data_mp->b_wptr - (uchar_t *)ip6i) >= sizeof (ip6i_t) + IPV6_HDR_LEN); /* Mark this packet as delayed due to ND resolution */ if (ip6i->ip6i_flags & IP6I_DROP_IFDELAYED) ip6i->ip6i_flags |= IP6I_ND_DELAYED; } if (nce->nce_qd_mp != NULL) { ms = nce_solicit(nce, NULL); rw_exit(&ipst->ips_ill_g_lock); if (ms == 0) { if (nce->nce_state != ND_REACHABLE) { mutex_exit(&nce->nce_lock); nce_resolv_failed(nce); ndp_delete(nce); } else { mutex_exit(&nce->nce_lock); } } else { mutex_exit(&nce->nce_lock); NDP_RESTART_TIMER(nce, (clock_t)ms); } NCE_REFRELE(nce); return; } mutex_exit(&nce->nce_lock); rw_exit(&ipst->ips_ill_g_lock); NCE_REFRELE(nce); break; case ND_REACHABLE : rw_exit(&ipst->ips_ill_g_lock); if (((nce->nce_flags & NCE_F_UNSOL_ADV) && nce->nce_unsolicit_count != 0) || ((nce->nce_flags & NCE_F_PERMANENT) && ipst->ips_ip_ndp_defense_interval != 0)) { if (nce->nce_unsolicit_count > 0) nce->nce_unsolicit_count--; mutex_exit(&nce->nce_lock); dropped = nce_xmit(ill, ND_NEIGHBOR_ADVERT, ill, /* ill to be used for hw addr */ B_FALSE, /* use ill_phys_addr */ &nce->nce_addr, &ipv6_all_hosts_mcast, nce_advert_flags(nce)); if (dropped) { mutex_enter(&nce->nce_lock); nce->nce_unsolicit_count++; mutex_exit(&nce->nce_lock); } if (nce->nce_unsolicit_count != 0) { NDP_RESTART_TIMER(nce, ipst->ips_ip_ndp_unsolicit_interval); } else { NDP_RESTART_TIMER(nce, ipst->ips_ip_ndp_defense_interval); } } else { mutex_exit(&nce->nce_lock); } NCE_REFRELE(nce); break; default: rw_exit(&ipst->ips_ill_g_lock); mutex_exit(&nce->nce_lock); NCE_REFRELE(nce); break; } } /* * Set a link layer address from the ll_addr passed in. * Copy SAP from ill. */ static void nce_set_ll(nce_t *nce, uchar_t *ll_addr) { ill_t *ill = nce->nce_ill; uchar_t *woffset; ASSERT(ll_addr != NULL); /* Always called before fast_path_probe */ ASSERT(nce->nce_fp_mp == NULL); if (ill->ill_sap_length != 0) { /* * Copy the SAP type specified in the * request into the xmit template. */ NCE_LL_SAP_COPY(ill, nce->nce_res_mp); } if (ill->ill_phys_addr_length > 0) { /* * The bcopy() below used to be called for the physical address * length rather than the link layer address length. For * ethernet and many other media, the phys_addr and lla are * identical. * However, with xresolv interfaces being introduced, the * phys_addr and lla are no longer the same, and the physical * address may not have any useful meaning, so we use the lla * for IPv6 address resolution and destination addressing. * * For PPP or other interfaces with a zero length * physical address, don't do anything here. * The bcopy() with a zero phys_addr length was previously * a no-op for interfaces with a zero-length physical address. * Using the lla for them would change the way they operate. * Doing nothing in such cases preserves expected behavior. */ woffset = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill); bcopy(ll_addr, woffset, ill->ill_nd_lla_len); } } static boolean_t nce_cmp_ll_addr(const nce_t *nce, const uchar_t *ll_addr, uint32_t ll_addr_len) { ill_t *ill = nce->nce_ill; uchar_t *ll_offset; ASSERT(nce->nce_res_mp != NULL); if (ll_addr == NULL) return (B_FALSE); ll_offset = nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill); if (bcmp(ll_addr, ll_offset, ll_addr_len) != 0) return (B_TRUE); return (B_FALSE); } /* * Updates the link layer address or the reachability state of * a cache entry. Reset probe counter if needed. */ static void nce_update(nce_t *nce, uint16_t new_state, uchar_t *new_ll_addr) { ill_t *ill = nce->nce_ill; boolean_t need_stop_timer = B_FALSE; boolean_t need_fastpath_update = B_FALSE; ASSERT(MUTEX_HELD(&nce->nce_lock)); ASSERT(nce->nce_ipversion == IPV6_VERSION); /* * If this interface does not do NUD, there is no point * in allowing an update to the cache entry. Although * we will respond to NS. * The only time we accept an update for a resolver when * NUD is turned off is when it has just been created. * Non-Resolvers will always be created as REACHABLE. */ if (new_state != ND_UNCHANGED) { if ((nce->nce_flags & NCE_F_NONUD) && (nce->nce_state != ND_INCOMPLETE)) return; ASSERT((int16_t)new_state >= ND_STATE_VALID_MIN); ASSERT((int16_t)new_state <= ND_STATE_VALID_MAX); need_stop_timer = B_TRUE; if (new_state == ND_REACHABLE) nce->nce_last = TICK_TO_MSEC(lbolt64); else { /* We force NUD in this case */ nce->nce_last = 0; } nce->nce_state = new_state; nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT; } /* * In case of fast path we need to free the the fastpath * M_DATA and do another probe. Otherwise we can just * overwrite the DL_UNITDATA_REQ data, noting we'll lose * whatever packets that happens to be transmitting at the time. */ if (new_ll_addr != NULL) { ASSERT(nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill) + ill->ill_nd_lla_len <= nce->nce_res_mp->b_wptr); bcopy(new_ll_addr, nce->nce_res_mp->b_rptr + NCE_LL_ADDR_OFFSET(ill), ill->ill_nd_lla_len); if (nce->nce_fp_mp != NULL) { freemsg(nce->nce_fp_mp); nce->nce_fp_mp = NULL; } need_fastpath_update = B_TRUE; } mutex_exit(&nce->nce_lock); if (need_stop_timer) { (void) untimeout(nce->nce_timeout_id); nce->nce_timeout_id = 0; } if (need_fastpath_update) nce_fastpath(nce); mutex_enter(&nce->nce_lock); } void nce_queue_mp_common(nce_t *nce, mblk_t *mp, boolean_t head_insert) { uint_t count = 0; mblk_t **mpp; ASSERT(MUTEX_HELD(&nce->nce_lock)); for (mpp = &nce->nce_qd_mp; *mpp != NULL; mpp = &(*mpp)->b_next) { if (++count > nce->nce_ill->ill_max_buf) { mblk_t *tmp = nce->nce_qd_mp->b_next; nce->nce_qd_mp->b_next = NULL; nce->nce_qd_mp->b_prev = NULL; freemsg(nce->nce_qd_mp); nce->nce_qd_mp = tmp; } } /* put this on the list */ if (head_insert) { mp->b_next = nce->nce_qd_mp; nce->nce_qd_mp = mp; } else { *mpp = mp; } } static void nce_queue_mp(nce_t *nce, mblk_t *mp) { boolean_t head_insert = B_FALSE; ip6_t *ip6h; ip6i_t *ip6i; mblk_t *data_mp; ASSERT(MUTEX_HELD(&nce->nce_lock)); if (mp->b_datap->db_type == M_CTL) data_mp = mp->b_cont; else data_mp = mp; ip6h = (ip6_t *)data_mp->b_rptr; if (ip6h->ip6_nxt == IPPROTO_RAW) { /* * This message should have been pulled up already in * ip_wput_v6. We can't do pullups here because the message * could be from the nce_qd_mp which could have b_next/b_prev * non-NULL. */ ip6i = (ip6i_t *)ip6h; ASSERT((data_mp->b_wptr - (uchar_t *)ip6i) >= sizeof (ip6i_t) + IPV6_HDR_LEN); /* * Multipathing probe packets have IP6I_DROP_IFDELAYED set. * This has 2 aspects mentioned below. * 1. Perform head insertion in the nce_qd_mp for these packets. * This ensures that next retransmit of ND solicitation * will use the interface specified by the probe packet, * for both NS and NA. This corresponds to the src address * in the IPv6 packet. If we insert at tail, we will be * depending on the packet at the head for successful * ND resolution. This is not reliable, because the interface * on which the NA arrives could be different from the interface * on which the NS was sent, and if the receiving interface is * failed, it will appear that the sending interface is also * failed, causing in.mpathd to misdiagnose this as link * failure. * 2. Drop the original packet, if the ND resolution did not * succeed in the first attempt. However we will create the * nce and the ire, as soon as the ND resolution succeeds. * We don't gain anything by queueing multiple probe packets * and sending them back-to-back once resolution succeeds. * It is sufficient to send just 1 packet after ND resolution * succeeds. Since mpathd is sending down probe packets at a * constant rate, we don't need to send the queued packet. We * need to queue it only for NDP resolution. The benefit of * dropping the probe packets that were delayed in ND * resolution, is that in.mpathd will not see inflated * RTT. If the ND resolution does not succeed within * in.mpathd's failure detection time, mpathd may detect * a failure, and it does not matter whether the packet * was queued or dropped. */ if (ip6i->ip6i_flags & IP6I_DROP_IFDELAYED) head_insert = B_TRUE; } nce_queue_mp_common(nce, mp, head_insert); } /* * Called when address resolution failed due to a timeout. * Send an ICMP unreachable in response to all queued packets. */ void nce_resolv_failed(nce_t *nce) { mblk_t *mp, *nxt_mp, *first_mp; char buf[INET6_ADDRSTRLEN]; ip6_t *ip6h; zoneid_t zoneid = GLOBAL_ZONEID; ip_stack_t *ipst = nce->nce_ill->ill_ipst; ip1dbg(("nce_resolv_failed: dst %s\n", inet_ntop(AF_INET6, (char *)&nce->nce_addr, buf, sizeof (buf)))); mutex_enter(&nce->nce_lock); mp = nce->nce_qd_mp; nce->nce_qd_mp = NULL; mutex_exit(&nce->nce_lock); while (mp != NULL) { nxt_mp = mp->b_next; mp->b_next = NULL; mp->b_prev = NULL; first_mp = mp; if (mp->b_datap->db_type == M_CTL) { ipsec_out_t *io = (ipsec_out_t *)mp->b_rptr; ASSERT(io->ipsec_out_type == IPSEC_OUT); zoneid = io->ipsec_out_zoneid; ASSERT(zoneid != ALL_ZONES); mp = mp->b_cont; } ip6h = (ip6_t *)mp->b_rptr; if (ip6h->ip6_nxt == IPPROTO_RAW) { ip6i_t *ip6i; /* * This message should have been pulled up already * in ip_wput_v6. ip_hdr_complete_v6 assumes that * the header is pulled up. */ ip6i = (ip6i_t *)ip6h; ASSERT((mp->b_wptr - (uchar_t *)ip6i) >= sizeof (ip6i_t) + IPV6_HDR_LEN); mp->b_rptr += sizeof (ip6i_t); } /* * Ignore failure since icmp_unreachable_v6 will silently * drop packets with an unspecified source address. */ (void) ip_hdr_complete_v6((ip6_t *)mp->b_rptr, zoneid, ipst); icmp_unreachable_v6(nce->nce_ill->ill_wq, first_mp, ICMP6_DST_UNREACH_ADDR, B_FALSE, B_FALSE, zoneid, ipst); mp = nxt_mp; } } /* * Called by SIOCSNDP* ioctl to add/change an nce entry * and the corresponding attributes. * Disallow states other than ND_REACHABLE or ND_STALE. */ int ndp_sioc_update(ill_t *ill, lif_nd_req_t *lnr) { sin6_t *sin6; in6_addr_t *addr; nce_t *nce; int err; uint16_t new_flags = 0; uint16_t old_flags = 0; int inflags = lnr->lnr_flags; ip_stack_t *ipst = ill->ill_ipst; ASSERT(ill->ill_isv6); if ((lnr->lnr_state_create != ND_REACHABLE) && (lnr->lnr_state_create != ND_STALE)) return (EINVAL); sin6 = (sin6_t *)&lnr->lnr_addr; addr = &sin6->sin6_addr; mutex_enter(&ipst->ips_ndp6->ndp_g_lock); /* We know it can not be mapping so just look in the hash table */ nce = *((nce_t **)NCE_HASH_PTR_V6(ipst, *addr)); nce = nce_lookup_addr(ill, addr, nce); if (nce != NULL) new_flags = nce->nce_flags; switch (inflags & (NDF_ISROUTER_ON|NDF_ISROUTER_OFF)) { case NDF_ISROUTER_ON: new_flags |= NCE_F_ISROUTER; break; case NDF_ISROUTER_OFF: new_flags &= ~NCE_F_ISROUTER; break; case (NDF_ISROUTER_OFF|NDF_ISROUTER_ON): mutex_exit(&ipst->ips_ndp6->ndp_g_lock); if (nce != NULL) NCE_REFRELE(nce); return (EINVAL); } switch (inflags & (NDF_ANYCAST_ON|NDF_ANYCAST_OFF)) { case NDF_ANYCAST_ON: new_flags |= NCE_F_ANYCAST; break; case NDF_ANYCAST_OFF: new_flags &= ~NCE_F_ANYCAST; break; case (NDF_ANYCAST_OFF|NDF_ANYCAST_ON): mutex_exit(&ipst->ips_ndp6->ndp_g_lock); if (nce != NULL) NCE_REFRELE(nce); return (EINVAL); } if (nce == NULL) { err = ndp_add_v6(ill, (uchar_t *)lnr->lnr_hdw_addr, addr, &ipv6_all_ones, &ipv6_all_zeros, 0, new_flags, lnr->lnr_state_create, &nce); if (err != 0) { mutex_exit(&ipst->ips_ndp6->ndp_g_lock); ip1dbg(("ndp_sioc_update: Can't create NCE %d\n", err)); return (err); } } old_flags = nce->nce_flags; if (old_flags & NCE_F_ISROUTER && !(new_flags & NCE_F_ISROUTER)) { /* * Router turned to host, delete all ires. * XXX Just delete the entry, but we need to add too. */ nce->nce_flags &= ~NCE_F_ISROUTER; mutex_exit(&ipst->ips_ndp6->ndp_g_lock); ndp_delete(nce); NCE_REFRELE(nce); return (0); } mutex_exit(&ipst->ips_ndp6->ndp_g_lock); mutex_enter(&nce->nce_lock); nce->nce_flags = new_flags; mutex_exit(&nce->nce_lock); /* * Note that we ignore the state at this point, which * should be either STALE or REACHABLE. Instead we let * the link layer address passed in to determine the state * much like incoming packets. */ ndp_process(nce, (uchar_t *)lnr->lnr_hdw_addr, 0, B_FALSE); NCE_REFRELE(nce); return (0); } /* * If the device driver supports it, we make nce_fp_mp to have * an M_DATA prepend. Otherwise nce_fp_mp will be null. * The caller ensures there is hold on nce for this function. * Note that since ill_fastpath_probe() copies the mblk there is * no need for the hold beyond this function. */ void nce_fastpath(nce_t *nce) { ill_t *ill = nce->nce_ill; int res; ASSERT(ill != NULL); ASSERT(nce->nce_state != ND_INITIAL && nce->nce_state != ND_INCOMPLETE); if (nce->nce_fp_mp != NULL) { /* Already contains fastpath info */ return; } if (nce->nce_res_mp != NULL) { nce_fastpath_list_add(nce); res = ill_fastpath_probe(ill, nce->nce_res_mp); /* * EAGAIN is an indication of a transient error * i.e. allocation failure etc. leave the nce in the list it * will be updated when another probe happens for another ire * if not it will be taken out of the list when the ire is * deleted. */ if (res != 0 && res != EAGAIN) nce_fastpath_list_delete(nce); } } /* * Drain the list of nce's waiting for fastpath response. */ void nce_fastpath_list_dispatch(ill_t *ill, boolean_t (*func)(nce_t *, void *), void *arg) { nce_t *next_nce; nce_t *current_nce; nce_t *first_nce; nce_t *prev_nce = NULL; mutex_enter(&ill->ill_lock); first_nce = current_nce = (nce_t *)ill->ill_fastpath_list; while (current_nce != (nce_t *)&ill->ill_fastpath_list) { next_nce = current_nce->nce_fastpath; /* * Take it off the list if we're flushing, or if the callback * routine tells us to do so. Otherwise, leave the nce in the * fastpath list to handle any pending response from the lower * layer. We can't drain the list when the callback routine * comparison failed, because the response is asynchronous in * nature, and may not arrive in the same order as the list * insertion. */ if (func == NULL || func(current_nce, arg)) { current_nce->nce_fastpath = NULL; if (current_nce == first_nce) ill->ill_fastpath_list = first_nce = next_nce; else prev_nce->nce_fastpath = next_nce; } else { /* previous element that is still in the list */ prev_nce = current_nce; } current_nce = next_nce; } mutex_exit(&ill->ill_lock); } /* * Add nce to the nce fastpath list. */ void nce_fastpath_list_add(nce_t *nce) { ill_t *ill; ill = nce->nce_ill; mutex_enter(&ill->ill_lock); mutex_enter(&nce->nce_lock); /* * if nce has not been deleted and * is not already in the list add it. */ if (!(nce->nce_flags & NCE_F_CONDEMNED) && (nce->nce_fastpath == NULL)) { nce->nce_fastpath = (nce_t *)ill->ill_fastpath_list; ill->ill_fastpath_list = nce; } mutex_exit(&nce->nce_lock); mutex_exit(&ill->ill_lock); } /* * remove nce from the nce fastpath list. */ void nce_fastpath_list_delete(nce_t *nce) { nce_t *nce_ptr; ill_t *ill; ill = nce->nce_ill; ASSERT(ill != NULL); mutex_enter(&ill->ill_lock); if (nce->nce_fastpath == NULL) goto done; ASSERT(ill->ill_fastpath_list != &ill->ill_fastpath_list); if (ill->ill_fastpath_list == nce) { ill->ill_fastpath_list = nce->nce_fastpath; } else { nce_ptr = ill->ill_fastpath_list; while (nce_ptr != (nce_t *)&ill->ill_fastpath_list) { if (nce_ptr->nce_fastpath == nce) { nce_ptr->nce_fastpath = nce->nce_fastpath; break; } nce_ptr = nce_ptr->nce_fastpath; } } nce->nce_fastpath = NULL; done: mutex_exit(&ill->ill_lock); } /* * Update all NCE's that are not in fastpath mode and * have an nce_fp_mp that matches mp. mp->b_cont contains * the fastpath header. * * Returns TRUE if entry should be dequeued, or FALSE otherwise. */ boolean_t ndp_fastpath_update(nce_t *nce, void *arg) { mblk_t *mp, *fp_mp; uchar_t *mp_rptr, *ud_mp_rptr; mblk_t *ud_mp = nce->nce_res_mp; ptrdiff_t cmplen; if (nce->nce_flags & NCE_F_MAPPING) return (B_TRUE); if ((nce->nce_fp_mp != NULL) || (ud_mp == NULL)) return (B_TRUE); ip2dbg(("ndp_fastpath_update: trying\n")); mp = (mblk_t *)arg; mp_rptr = mp->b_rptr; cmplen = mp->b_wptr - mp_rptr; ASSERT(cmplen >= 0); ud_mp_rptr = ud_mp->b_rptr; /* * The nce is locked here to prevent any other threads * from accessing and changing nce_res_mp when the IPv6 address * becomes resolved to an lla while we're in the middle * of looking at and comparing the hardware address (lla). * It is also locked to prevent multiple threads in nce_fastpath_update * from examining nce_res_mp atthe same time. */ mutex_enter(&nce->nce_lock); if (ud_mp->b_wptr - ud_mp_rptr != cmplen || bcmp((char *)mp_rptr, (char *)ud_mp_rptr, cmplen) != 0) { mutex_exit(&nce->nce_lock); /* * Don't take the ire off the fastpath list yet, * since the response may come later. */ return (B_FALSE); } /* Matched - install mp as the fastpath mp */ ip1dbg(("ndp_fastpath_update: match\n")); fp_mp = dupb(mp->b_cont); if (fp_mp != NULL) { nce->nce_fp_mp = fp_mp; } mutex_exit(&nce->nce_lock); return (B_TRUE); } /* * This function handles the DL_NOTE_FASTPATH_FLUSH notification from * driver. Note that it assumes IP is exclusive... */ /* ARGSUSED */ void ndp_fastpath_flush(nce_t *nce, char *arg) { if (nce->nce_flags & NCE_F_MAPPING) return; /* No fastpath info? */ if (nce->nce_fp_mp == NULL || nce->nce_res_mp == NULL) return; if (nce->nce_ipversion == IPV4_VERSION && nce->nce_flags & NCE_F_BCAST) { /* * IPv4 BROADCAST entries: * We can't delete the nce since it is difficult to * recreate these without going through the * ipif down/up dance. * * All access to nce->nce_fp_mp in the case of these * is protected by nce_lock. */ mutex_enter(&nce->nce_lock); if (nce->nce_fp_mp != NULL) { freeb(nce->nce_fp_mp); nce->nce_fp_mp = NULL; mutex_exit(&nce->nce_lock); nce_fastpath(nce); } else { mutex_exit(&nce->nce_lock); } } else { /* Just delete the NCE... */ ndp_delete(nce); } } /* * Return a pointer to a given option in the packet. * Assumes that option part of the packet have already been validated. */ nd_opt_hdr_t * ndp_get_option(nd_opt_hdr_t *opt, int optlen, int opt_type) { while (optlen > 0) { if (opt->nd_opt_type == opt_type) return (opt); optlen -= 8 * opt->nd_opt_len; opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len); } return (NULL); } /* * Verify all option lengths present are > 0, also check to see * if the option lengths and packet length are consistent. */ boolean_t ndp_verify_optlen(nd_opt_hdr_t *opt, int optlen) { ASSERT(opt != NULL); while (optlen > 0) { if (opt->nd_opt_len == 0) return (B_FALSE); optlen -= 8 * opt->nd_opt_len; if (optlen < 0) return (B_FALSE); opt = (struct nd_opt_hdr *)((char *)opt + 8 * opt->nd_opt_len); } return (B_TRUE); } /* * ndp_walk function. * Free a fraction of the NCE cache entries. * A fraction of zero means to not free any in that category. */ void ndp_cache_reclaim(nce_t *nce, char *arg) { nce_cache_reclaim_t *ncr = (nce_cache_reclaim_t *)arg; uint_t rand; if (nce->nce_flags & NCE_F_PERMANENT) return; rand = (uint_t)lbolt + NCE_ADDR_HASH_V6(nce->nce_addr, NCE_TABLE_SIZE); if (ncr->ncr_host != 0 && (rand/ncr->ncr_host)*ncr->ncr_host == rand) { ndp_delete(nce); return; } } /* * ndp_walk function. * Count the number of NCEs that can be deleted. * These would be hosts but not routers. */ void ndp_cache_count(nce_t *nce, char *arg) { ncc_cache_count_t *ncc = (ncc_cache_count_t *)arg; if (nce->nce_flags & NCE_F_PERMANENT) return; ncc->ncc_total++; if (!(nce->nce_flags & NCE_F_ISROUTER)) ncc->ncc_host++; } #ifdef DEBUG void nce_trace_ref(nce_t *nce) { ASSERT(MUTEX_HELD(&nce->nce_lock)); if (nce->nce_trace_disable) return; if (!th_trace_ref(nce, nce->nce_ill->ill_ipst)) { nce->nce_trace_disable = B_TRUE; nce_trace_cleanup(nce); } } void nce_untrace_ref(nce_t *nce) { ASSERT(MUTEX_HELD(&nce->nce_lock)); if (!nce->nce_trace_disable) th_trace_unref(nce); } static void nce_trace_cleanup(const nce_t *nce) { th_trace_cleanup(nce, nce->nce_trace_disable); } #endif /* * Called when address resolution fails due to a timeout. * Send an ICMP unreachable in response to all queued packets. */ void arp_resolv_failed(nce_t *nce) { mblk_t *mp, *nxt_mp, *first_mp; char buf[INET6_ADDRSTRLEN]; zoneid_t zoneid = GLOBAL_ZONEID; struct in_addr ipv4addr; ip_stack_t *ipst = nce->nce_ill->ill_ipst; IN6_V4MAPPED_TO_INADDR(&nce->nce_addr, &ipv4addr); ip3dbg(("arp_resolv_failed: dst %s\n", inet_ntop(AF_INET, &ipv4addr, buf, sizeof (buf)))); mutex_enter(&nce->nce_lock); mp = nce->nce_qd_mp; nce->nce_qd_mp = NULL; mutex_exit(&nce->nce_lock); while (mp != NULL) { nxt_mp = mp->b_next; mp->b_next = NULL; mp->b_prev = NULL; first_mp = mp; /* * Send icmp unreachable messages * to the hosts. */ (void) ip_hdr_complete((ipha_t *)mp->b_rptr, zoneid, ipst); ip3dbg(("arp_resolv_failed: Calling icmp_unreachable\n")); icmp_unreachable(nce->nce_ill->ill_wq, first_mp, ICMP_HOST_UNREACHABLE, zoneid, ipst); mp = nxt_mp; } } int ndp_lookup_then_add_v4(ill_t *ill, const in_addr_t *addr, uint16_t flags, nce_t **newnce, nce_t *src_nce) { int err; nce_t *nce; in6_addr_t addr6; ip_stack_t *ipst = ill->ill_ipst; mutex_enter(&ipst->ips_ndp4->ndp_g_lock); nce = *((nce_t **)NCE_HASH_PTR_V4(ipst, *addr)); IN6_IPADDR_TO_V4MAPPED(*addr, &addr6); nce = nce_lookup_addr(ill, &addr6, nce); if (nce == NULL) { err = ndp_add_v4(ill, addr, flags, newnce, src_nce); } else { *newnce = nce; err = EEXIST; } mutex_exit(&ipst->ips_ndp4->ndp_g_lock); return (err); } /* * NDP Cache Entry creation routine for IPv4. * Mapped entries are handled in arp. * This routine must always be called with ndp4->ndp_g_lock held. * Prior to return, nce_refcnt is incremented. */ static int ndp_add_v4(ill_t *ill, const in_addr_t *addr, uint16_t flags, nce_t **newnce, nce_t *src_nce) { static nce_t nce_nil; nce_t *nce; mblk_t *mp; mblk_t *template = NULL; nce_t **ncep; ip_stack_t *ipst = ill->ill_ipst; uint16_t state = ND_INITIAL; int err; ASSERT(MUTEX_HELD(&ipst->ips_ndp4->ndp_g_lock)); ASSERT(!ill->ill_isv6); ASSERT((flags & NCE_F_MAPPING) == 0); if (ill->ill_resolver_mp == NULL) return (EINVAL); /* * Allocate the mblk to hold the nce. */ mp = allocb(sizeof (nce_t), BPRI_MED); if (mp == NULL) return (ENOMEM); nce = (nce_t *)mp->b_rptr; mp->b_wptr = (uchar_t *)&nce[1]; *nce = nce_nil; nce->nce_ill = ill; nce->nce_ipversion = IPV4_VERSION; nce->nce_flags = flags; nce->nce_pcnt = ND_MAX_UNICAST_SOLICIT; nce->nce_rcnt = ill->ill_xmit_count; IN6_IPADDR_TO_V4MAPPED(*addr, &nce->nce_addr); nce->nce_mask = ipv6_all_ones; nce->nce_extract_mask = ipv6_all_zeros; nce->nce_ll_extract_start = 0; nce->nce_qd_mp = NULL; nce->nce_mp = mp; /* This one is for nce getting created */ nce->nce_refcnt = 1; mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL); ncep = ((nce_t **)NCE_HASH_PTR_V4(ipst, *addr)); nce->nce_trace_disable = B_FALSE; if (src_nce != NULL) { /* * src_nce has been provided by the caller. The only * caller who provides a non-null, non-broadcast * src_nce is from ip_newroute() which must pass in * a ND_REACHABLE src_nce (this condition is verified * via an ASSERT for the save_ire->ire_nce in ip_newroute()) */ mutex_enter(&src_nce->nce_lock); state = src_nce->nce_state; if ((src_nce->nce_flags & NCE_F_CONDEMNED) || (ipst->ips_ndp4->ndp_g_hw_change > 0)) { /* * src_nce has been deleted, or * ip_arp_news is in the middle of * flushing entries in the the nce. * Fail the add, since we don't know * if it is safe to copy the contents of * src_nce */ DTRACE_PROBE2(nce__bad__src__nce, nce_t *, src_nce, ill_t *, ill); mutex_exit(&src_nce->nce_lock); err = EINVAL; goto err_ret; } template = copyb(src_nce->nce_res_mp); mutex_exit(&src_nce->nce_lock); if (template == NULL) { err = ENOMEM; goto err_ret; } } else if (flags & NCE_F_BCAST) { /* * broadcast nce. */ template = copyb(ill->ill_bcast_mp); if (template == NULL) { err = ENOMEM; goto err_ret; } state = ND_REACHABLE; } else if (ill->ill_net_type == IRE_IF_NORESOLVER) { /* * NORESOLVER entries are always created in the REACHABLE * state. We create a nce_res_mp with the IP nexthop address * in the destination address in the DLPI hdr if the * physical length is exactly 4 bytes. * * XXX not clear which drivers set ill_phys_addr_length to * IP_ADDR_LEN. */ if (ill->ill_phys_addr_length == IP_ADDR_LEN) { template = ill_dlur_gen((uchar_t *)addr, ill->ill_phys_addr_length, ill->ill_sap, ill->ill_sap_length); } else { template = copyb(ill->ill_resolver_mp); } if (template == NULL) { err = ENOMEM; goto err_ret; } state = ND_REACHABLE; } nce->nce_fp_mp = NULL; nce->nce_res_mp = template; nce->nce_state = state; if (state == ND_REACHABLE) { nce->nce_last = TICK_TO_MSEC(lbolt64); nce->nce_init_time = TICK_TO_MSEC(lbolt64); } else { nce->nce_last = 0; if (state == ND_INITIAL) nce->nce_init_time = TICK_TO_MSEC(lbolt64); } ASSERT((nce->nce_res_mp == NULL && nce->nce_state == ND_INITIAL) || (nce->nce_res_mp != NULL && nce->nce_state == ND_REACHABLE)); /* * Atomically ensure that the ill is not CONDEMNED, before * adding the NCE. */ mutex_enter(&ill->ill_lock); if (ill->ill_state_flags & ILL_CONDEMNED) { mutex_exit(&ill->ill_lock); err = EINVAL; goto err_ret; } if ((nce->nce_next = *ncep) != NULL) nce->nce_next->nce_ptpn = &nce->nce_next; *ncep = nce; nce->nce_ptpn = ncep; *newnce = nce; /* This one is for nce being used by an active thread */ NCE_REFHOLD(*newnce); /* Bump up the number of nce's referencing this ill */ DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill, (char *), "nce", (void *), nce); ill->ill_nce_cnt++; mutex_exit(&ill->ill_lock); DTRACE_PROBE1(ndp__add__v4, nce_t *, nce); return (0); err_ret: freeb(mp); freemsg(template); return (err); } void ndp_flush_qd_mp(nce_t *nce) { mblk_t *qd_mp, *qd_next; ASSERT(MUTEX_HELD(&nce->nce_lock)); qd_mp = nce->nce_qd_mp; nce->nce_qd_mp = NULL; while (qd_mp != NULL) { qd_next = qd_mp->b_next; qd_mp->b_next = NULL; qd_mp->b_prev = NULL; freemsg(qd_mp); qd_mp = qd_next; } } /* * ndp_walk routine to delete all entries that have a given destination or * gateway address and cached link layer (MAC) address. This is used when ARP * informs us that a network-to-link-layer mapping may have changed. */ void nce_delete_hw_changed(nce_t *nce, void *arg) { nce_hw_map_t *hwm = arg; mblk_t *mp; dl_unitdata_req_t *dlu; uchar_t *macaddr; ill_t *ill; int saplen; ipaddr_t nce_addr; if (nce->nce_state != ND_REACHABLE) return; IN6_V4MAPPED_TO_IPADDR(&nce->nce_addr, nce_addr); if (nce_addr != hwm->hwm_addr) return; mutex_enter(&nce->nce_lock); if ((mp = nce->nce_res_mp) == NULL) { mutex_exit(&nce->nce_lock); return; } dlu = (dl_unitdata_req_t *)mp->b_rptr; macaddr = (uchar_t *)(dlu + 1); ill = nce->nce_ill; if ((saplen = ill->ill_sap_length) > 0) macaddr += saplen; else saplen = -saplen; /* * If the hardware address is unchanged, then leave this one alone. * Note that saplen == abs(saplen) now. */ if (hwm->hwm_hwlen == dlu->dl_dest_addr_length - saplen && bcmp(hwm->hwm_hwaddr, macaddr, hwm->hwm_hwlen) == 0) { mutex_exit(&nce->nce_lock); return; } mutex_exit(&nce->nce_lock); DTRACE_PROBE1(nce__hw__deleted, nce_t *, nce); ndp_delete(nce); } /* * This function verifies whether a given IPv4 address is potentially known to * the NCE subsystem. If so, then ARP must not delete the corresponding ace_t, * so that it can continue to look for hardware changes on that address. */ boolean_t ndp_lookup_ipaddr(in_addr_t addr, netstack_t *ns) { nce_t *nce; struct in_addr nceaddr; ip_stack_t *ipst = ns->netstack_ip; if (addr == INADDR_ANY) return (B_FALSE); mutex_enter(&ipst->ips_ndp4->ndp_g_lock); nce = *(nce_t **)NCE_HASH_PTR_V4(ipst, addr); for (; nce != NULL; nce = nce->nce_next) { /* Note that only v4 mapped entries are in the table. */ IN6_V4MAPPED_TO_INADDR(&nce->nce_addr, &nceaddr); if (addr == nceaddr.s_addr && IN6_ARE_ADDR_EQUAL(&nce->nce_mask, &ipv6_all_ones)) { /* Single flag check; no lock needed */ if (!(nce->nce_flags & NCE_F_CONDEMNED)) break; } } mutex_exit(&ipst->ips_ndp4->ndp_g_lock); return (nce != NULL); }