/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * This file contains consumer routines of the IPv4 forwarding engine */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define IS_DEFAULT_ROUTE(ire) \ (((ire)->ire_type & IRE_DEFAULT) || \ (((ire)->ire_type & IRE_INTERFACE) && ((ire)->ire_addr == 0))) static ire_t *route_to_dst(const struct sockaddr *, zoneid_t, ip_stack_t *); static void ire_del_host_redir(ire_t *, char *); static boolean_t ire_find_best_route(struct radix_node *, void *); /* * Lookup a route in forwarding table. A specific lookup is indicated by * passing the required parameters and indicating the match required in the * flag field. * * Supports IP_BOUND_IF by following the ipif/ill when recursing. 
*/ ire_t * ire_ftable_lookup_v4(ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway, int type, const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl, int flags, uint32_t xmit_hint, ip_stack_t *ipst, uint_t *generationp) { ire_t *ire; struct rt_sockaddr rdst, rmask; struct rt_entry *rt; ire_ftable_args_t margs; ASSERT(ill == NULL || !ill->ill_isv6); /* * ire_match_args() will dereference ill if MATCH_IRE_ILL * is set. */ if ((flags & MATCH_IRE_ILL) && (ill == NULL)) return (NULL); (void) memset(&rdst, 0, sizeof (rdst)); rdst.rt_sin_len = sizeof (rdst); rdst.rt_sin_family = AF_INET; rdst.rt_sin_addr.s_addr = addr; (void) memset(&rmask, 0, sizeof (rmask)); rmask.rt_sin_len = sizeof (rmask); rmask.rt_sin_family = AF_INET; rmask.rt_sin_addr.s_addr = mask; (void) memset(&margs, 0, sizeof (margs)); margs.ift_addr = addr; margs.ift_mask = mask; margs.ift_gateway = gateway; margs.ift_type = type; margs.ift_ill = ill; margs.ift_zoneid = zoneid; margs.ift_tsl = tsl; margs.ift_flags = flags; /* * The flags argument passed to ire_ftable_lookup may cause the * search to return, not the longest matching prefix, but the * "best matching prefix", i.e., the longest prefix that also * satisfies constraints imposed via the permutation of flags * passed in. To achieve this, we invoke ire_match_args() on * each matching leaf in the radix tree. ire_match_args is * invoked by the callback function ire_find_best_route() * We hold the global tree lock in read mode when calling * rn_match_args. Before dropping the global tree lock, ensure * that the radix node can't be deleted by incrementing ire_refcnt. 
*/ RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst, ipst->ips_ip_ftable, ire_find_best_route, &margs); ire = margs.ift_best_ire; if (rt == NULL) { RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); return (NULL); } ASSERT(ire != NULL); DTRACE_PROBE2(ire__found, ire_ftable_args_t *, &margs, ire_t *, ire); /* * round-robin only if we have more than one route in the bucket. * ips_ip_ecmp_behavior controls when we do ECMP * 2: always * 1: for IRE_DEFAULT and /0 IRE_INTERFACE * 0: never */ if (ire->ire_bucket->irb_ire_cnt > 1 && !(flags & MATCH_IRE_GW)) { if (ipst->ips_ip_ecmp_behavior == 2 || (ipst->ips_ip_ecmp_behavior == 1 && IS_DEFAULT_ROUTE(ire))) { ire_t *next_ire; margs.ift_best_ire = NULL; next_ire = ire_round_robin(ire->ire_bucket, &margs, xmit_hint, ire, ipst); if (next_ire == NULL) { /* keep ire if next_ire is null */ goto done; } ire_refrele(ire); ire = next_ire; } } done: /* Return generation before dropping lock */ if (generationp != NULL) *generationp = ire->ire_generation; RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); /* * For shared-IP zones we need additional checks to what was * done in ire_match_args to make sure IRE_LOCALs are handled. * * When ip_restrict_interzone_loopback is set, then * we ensure that IRE_LOCAL are only used for loopback * between zones when the logical "Ethernet" would * have looped them back. That is, if in the absense of * the IRE_LOCAL we would have sent to packet out the * same ill. */ if ((ire->ire_type & IRE_LOCAL) && zoneid != ALL_ZONES && ire->ire_zoneid != zoneid && ire->ire_zoneid != ALL_ZONES && ipst->ips_ip_restrict_interzone_loopback) { ire = ire_alt_local(ire, zoneid, tsl, ill, generationp); ASSERT(ire != NULL); } return (ire); } /* * This function is called by * ip_input/ire_route_recursive when doing a route lookup on only the * destination address. 
* * The optimizations of this function over ire_ftable_lookup are: * o removing unnecessary flag matching * o doing longest prefix match instead of overloading it further * with the unnecessary "best_prefix_match" * * If no route is found we return IRE_NOROUTE. */ ire_t * ire_ftable_lookup_simple_v4(ipaddr_t addr, uint32_t xmit_hint, ip_stack_t *ipst, uint_t *generationp) { ire_t *ire; struct rt_sockaddr rdst; struct rt_entry *rt; irb_t *irb; rdst.rt_sin_len = sizeof (rdst); rdst.rt_sin_family = AF_INET; rdst.rt_sin_addr.s_addr = addr; /* * This is basically inlining a simpler version of ire_match_args */ RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst, ipst->ips_ip_ftable, NULL, NULL); if (rt == NULL) goto bad; irb = &rt->rt_irb; if (irb->irb_ire_cnt == 0) goto bad; rw_enter(&irb->irb_lock, RW_READER); ire = irb->irb_ire; if (ire == NULL) { rw_exit(&irb->irb_lock); goto bad; } while (IRE_IS_CONDEMNED(ire)) { ire = ire->ire_next; if (ire == NULL) { rw_exit(&irb->irb_lock); goto bad; } } /* we have a ire that matches */ ire_refhold(ire); rw_exit(&irb->irb_lock); /* * round-robin only if we have more than one route in the bucket. * ips_ip_ecmp_behavior controls when we do ECMP * 2: always * 1: for IRE_DEFAULT and /0 IRE_INTERFACE * 0: never * * Note: if we found an IRE_IF_CLONE we won't look at the bucket with * other ECMP IRE_INTERFACEs since the IRE_IF_CLONE is a /128 match * and the IRE_INTERFACESs are likely to be shorter matches. 
*/ if (ire->ire_bucket->irb_ire_cnt > 1) { if (ipst->ips_ip_ecmp_behavior == 2 || (ipst->ips_ip_ecmp_behavior == 1 && IS_DEFAULT_ROUTE(ire))) { ire_t *next_ire; ire_ftable_args_t margs; (void) memset(&margs, 0, sizeof (margs)); margs.ift_addr = addr; margs.ift_zoneid = ALL_ZONES; next_ire = ire_round_robin(ire->ire_bucket, &margs, xmit_hint, ire, ipst); if (next_ire == NULL) { /* keep ire if next_ire is null */ if (generationp != NULL) *generationp = ire->ire_generation; RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); return (ire); } ire_refrele(ire); ire = next_ire; } } /* Return generation before dropping lock */ if (generationp != NULL) *generationp = ire->ire_generation; RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); /* * Since we only did ALL_ZONES matches there is no special handling * of IRE_LOCALs needed here. ire_ftable_lookup_v4 has to handle that. */ return (ire); bad: if (generationp != NULL) *generationp = IRE_GENERATION_VERIFY; RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); return (ire_reject(ipst, B_FALSE)); } /* * Find the ill matching a multicast group. * Allows different routes for multicast addresses * in the unicast routing table (akin to 224.0.0.0 but could be more specific) * which point at different interfaces. This is used when IP_MULTICAST_IF * isn't specified (when sending) and when IP_ADD_MEMBERSHIP doesn't * specify the interface to join on. * * Supports link-local addresses by using ire_route_recursive which follows * the ill when recursing. * * To handle CGTP, since we don't have a separate IRE_MULTICAST for each group * and the MULTIRT property can be different for different groups, we * extract RTF_MULTIRT from the special unicast route added for a group * with CGTP and pass that back in the multirtp argument. * This is used in ip_set_destination etc to set ixa_postfragfn for multicast. * We have a setsrcp argument for the same reason. 
*/ ill_t * ire_lookup_multi_ill_v4(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst, boolean_t *multirtp, ipaddr_t *setsrcp) { ire_t *ire; ill_t *ill; ire = ire_route_recursive_v4(group, 0, NULL, zoneid, NULL, MATCH_IRE_DSTONLY, B_FALSE, 0, ipst, setsrcp, NULL, NULL); ASSERT(ire != NULL); if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { ire_refrele(ire); return (NULL); } if (multirtp != NULL) *multirtp = (ire->ire_flags & RTF_MULTIRT) != 0; ill = ire_nexthop_ill(ire); ire_refrele(ire); return (ill); } /* * Delete the passed in ire if the gateway addr matches */ void ire_del_host_redir(ire_t *ire, char *gateway) { if ((ire->ire_flags & RTF_DYNAMIC) && (ire->ire_gateway_addr == *(ipaddr_t *)gateway)) ire_delete(ire); } /* * Search for all IRE_HOST RTF_DYNAMIC (aka redirect) routes that are * pointing at the specified gateway and * delete them. This routine is called only * when a default gateway is going away. */ void ire_delete_host_redirects(ipaddr_t gateway, ip_stack_t *ipst) { struct rtfuncarg rtfarg; (void) memset(&rtfarg, 0, sizeof (rtfarg)); rtfarg.rt_func = ire_del_host_redir; rtfarg.rt_arg = (void *)&gateway; (void) ipst->ips_ip_ftable->rnh_walktree_mt(ipst->ips_ip_ftable, rtfunc, &rtfarg, irb_refhold_rn, irb_refrele_rn); } /* * Obtain the rt_entry and rt_irb for the route to be added to * the ips_ip_ftable. * First attempt to add a node to the radix tree via rn_addroute. If the * route already exists, return the bucket for the existing route. * * Locking notes: Need to hold the global radix tree lock in write mode to * add a radix node. To prevent the node from being deleted, ire_get_bucket() * returns with a ref'ed irb_t. The ire itself is added in ire_add_v4() * while holding the irb_lock, but not the radix tree lock. 
*/ irb_t * ire_get_bucket(ire_t *ire) { struct radix_node *rn; struct rt_entry *rt; struct rt_sockaddr rmask, rdst; irb_t *irb = NULL; ip_stack_t *ipst = ire->ire_ipst; ASSERT(ipst->ips_ip_ftable != NULL); /* first try to see if route exists (based on rtalloc1) */ (void) memset(&rdst, 0, sizeof (rdst)); rdst.rt_sin_len = sizeof (rdst); rdst.rt_sin_family = AF_INET; rdst.rt_sin_addr.s_addr = ire->ire_addr; (void) memset(&rmask, 0, sizeof (rmask)); rmask.rt_sin_len = sizeof (rmask); rmask.rt_sin_family = AF_INET; rmask.rt_sin_addr.s_addr = ire->ire_mask; /* * add the route. based on BSD's rtrequest1(RTM_ADD) */ R_Malloc(rt, rt_entry_cache, sizeof (*rt)); /* kmem_alloc failed */ if (rt == NULL) return (NULL); (void) memset(rt, 0, sizeof (*rt)); rt->rt_nodes->rn_key = (char *)&rt->rt_dst; rt->rt_dst = rdst; irb = &rt->rt_irb; irb->irb_marks |= IRB_MARK_DYNAMIC; /* dynamically allocated/freed */ irb->irb_ipst = ipst; rw_init(&irb->irb_lock, NULL, RW_DEFAULT, NULL); RADIX_NODE_HEAD_WLOCK(ipst->ips_ip_ftable); rn = ipst->ips_ip_ftable->rnh_addaddr(&rt->rt_dst, &rmask, ipst->ips_ip_ftable, (struct radix_node *)rt); if (rn == NULL) { RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); Free(rt, rt_entry_cache); rt = NULL; irb = NULL; RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); rn = ipst->ips_ip_ftable->rnh_lookup(&rdst, &rmask, ipst->ips_ip_ftable); if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { /* found a non-root match */ rt = (struct rt_entry *)rn; } } if (rt != NULL) { irb = &rt->rt_irb; irb_refhold(irb); } RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); return (irb); } /* * This function is used when the caller wants to know the outbound * interface for a packet given only the address. 
* If this is a offlink IP address and there are multiple * routes to this destination, this routine will utilise the * first route it finds to IP address * Return values: * 0 - FAILURE * nonzero - ifindex */ uint_t ifindex_lookup(const struct sockaddr *ipaddr, zoneid_t zoneid) { uint_t ifindex = 0; ire_t *ire; ill_t *ill; netstack_t *ns; ip_stack_t *ipst; if (zoneid == ALL_ZONES) ns = netstack_find_by_zoneid(GLOBAL_ZONEID); else ns = netstack_find_by_zoneid(zoneid); ASSERT(ns != NULL); /* * For exclusive stacks we set the zoneid to zero * since IP uses the global zoneid in the exclusive stacks. */ if (ns->netstack_stackid != GLOBAL_NETSTACKID) zoneid = GLOBAL_ZONEID; ipst = ns->netstack_ip; ASSERT(ipaddr->sa_family == AF_INET || ipaddr->sa_family == AF_INET6); if ((ire = route_to_dst(ipaddr, zoneid, ipst)) != NULL) { ill = ire_nexthop_ill(ire); if (ill != NULL) { ifindex = ill->ill_phyint->phyint_ifindex; ill_refrele(ill); } ire_refrele(ire); } netstack_rele(ns); return (ifindex); } /* * Routine to find the route to a destination. If a ifindex is supplied * it tries to match the route to the corresponding ipif for the ifindex */ static ire_t * route_to_dst(const struct sockaddr *dst_addr, zoneid_t zoneid, ip_stack_t *ipst) { ire_t *ire = NULL; int match_flags; match_flags = MATCH_IRE_DSTONLY; /* XXX pass NULL tsl for now */ if (dst_addr->sa_family == AF_INET) { ire = ire_route_recursive_v4( ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr, 0, NULL, zoneid, NULL, match_flags, B_TRUE, 0, ipst, NULL, NULL, NULL); } else { ire = ire_route_recursive_v6( &((struct sockaddr_in6 *)dst_addr)->sin6_addr, 0, NULL, zoneid, NULL, match_flags, B_TRUE, 0, ipst, NULL, NULL, NULL); } ASSERT(ire != NULL); if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { ire_refrele(ire); return (NULL); } return (ire); } /* * This routine is called by IP Filter to send a packet out on the wire * to a specified dstination (which may be onlink or offlink). The ifindex may * or may not be 0. 
A non-null ifindex indicates IP Filter has stipulated * an outgoing interface and requires the nexthop to be on that interface. * IP WILL NOT DO the following to the data packet before sending it out: * a. manipulate ttl * b. ipsec work * c. fragmentation * * If the packet has been prepared for hardware checksum then it will be * passed off to ip_send_align_cksum() to check that the flags set on the * packet are in alignment with the capabilities of the new outgoing NIC. * * Return values: * 0: IP was able to send of the data pkt * ECOMM: Could not send packet * ENONET No route to dst. It is up to the caller * to send icmp unreachable error message, * EINPROGRESS The macaddr of the onlink dst or that * of the offlink dst's nexthop needs to get * resolved before packet can be sent to dst. * Thus transmission is not guaranteed. * Note: No longer have visibility to the ARP queue * hence no EINPROGRESS. */ int ipfil_sendpkt(const struct sockaddr *dst_addr, mblk_t *mp, uint_t ifindex, zoneid_t zoneid) { ipaddr_t nexthop; netstack_t *ns; ip_stack_t *ipst; ip_xmit_attr_t ixas; int error; ASSERT(mp != NULL); if (zoneid == ALL_ZONES) ns = netstack_find_by_zoneid(GLOBAL_ZONEID); else ns = netstack_find_by_zoneid(zoneid); ASSERT(ns != NULL); /* * For exclusive stacks we set the zoneid to zero * since IP uses the global zoneid in the exclusive stacks. */ if (ns->netstack_stackid != GLOBAL_NETSTACKID) zoneid = GLOBAL_ZONEID; ipst = ns->netstack_ip; ASSERT(dst_addr->sa_family == AF_INET || dst_addr->sa_family == AF_INET6); bzero(&ixas, sizeof (ixas)); /* * No IPsec, no fragmentation, and don't let any hooks see * the packet. 
*/ ixas.ixa_flags = IXAF_NO_IPSEC | IXAF_DONTFRAG | IXAF_NO_PFHOOK; ixas.ixa_cred = kcred; ixas.ixa_cpid = NOPID; ixas.ixa_tsl = NULL; ixas.ixa_ipst = ipst; ixas.ixa_ifindex = ifindex; if (dst_addr->sa_family == AF_INET) { ipha_t *ipha = (ipha_t *)mp->b_rptr; ixas.ixa_flags |= IXAF_IS_IPV4; nexthop = ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr; if (nexthop != ipha->ipha_dst) { ixas.ixa_flags |= IXAF_NEXTHOP_SET; ixas.ixa_nexthop_v4 = nexthop; } ixas.ixa_multicast_ttl = ipha->ipha_ttl; } else { ip6_t *ip6h = (ip6_t *)mp->b_rptr; in6_addr_t *nexthop6; nexthop6 = &((struct sockaddr_in6 *)dst_addr)->sin6_addr; if (!IN6_ARE_ADDR_EQUAL(nexthop6, &ip6h->ip6_dst)) { ixas.ixa_flags |= IXAF_NEXTHOP_SET; ixas.ixa_nexthop_v6 = *nexthop6; } ixas.ixa_multicast_ttl = ip6h->ip6_hops; } error = ip_output_simple(mp, &ixas); ixa_cleanup(&ixas); netstack_rele(ns); switch (error) { case 0: break; case EHOSTUNREACH: case ENETUNREACH: error = ENONET; break; default: error = ECOMM; break; } return (error); } /* * callback function provided by ire_ftable_lookup when calling * rn_match_args(). Invoke ire_match_args on each matching leaf node in * the radix tree. 
 */
/*
 * Arguments:
 *	rn	radix leaf being examined; treated as a struct rt_entry.
 *	arg	the ire_ftable_args_t describing the match; on success its
 *		ift_best_ire is set to the matching ire.
 *
 * Returns B_TRUE, with a hold placed on the ire, for the first
 * non-condemned ire in the bucket accepted by ire_match_args().
 * Returns B_FALSE if the bucket is empty or nothing matches.
 */
boolean_t
ire_find_best_route(struct radix_node *rn, void *arg)
{
	struct rt_entry *rt = (struct rt_entry *)rn;
	irb_t *irb_ptr;
	ire_t *ire;
	ire_ftable_args_t *margs = arg;
	ipaddr_t match_mask;

	ASSERT(rt != NULL);

	irb_ptr = &rt->rt_irb;

	/* Unlocked early exit for a bucket with no usable IREs. */
	if (irb_ptr->irb_ire_cnt == 0)
		return (B_FALSE);

	rw_enter(&irb_ptr->irb_lock, RW_READER);
	for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) {
		if (IRE_IS_CONDEMNED(ire))
			continue;
		/*
		 * Use the caller's mask only when a mask match was asked
		 * for; otherwise compare using the ire's own mask.
		 */
		if (margs->ift_flags & (MATCH_IRE_MASK|MATCH_IRE_SHORTERMASK))
			match_mask = margs->ift_mask;
		else
			match_mask = ire->ire_mask;

		if (ire_match_args(ire, margs->ift_addr, match_mask,
		    margs->ift_gateway, margs->ift_type, margs->ift_ill,
		    margs->ift_zoneid, margs->ift_tsl, margs->ift_flags)) {
			/* Hold the ire before the bucket lock is dropped. */
			ire_refhold(ire);
			rw_exit(&irb_ptr->irb_lock);
			margs->ift_best_ire = ire;
			return (B_TRUE);
		}
	}
	rw_exit(&irb_ptr->irb_lock);
	return (B_FALSE);
}

/*
 * ftable irb_t structures are dynamically allocated, and we need to
 * check if the irb_t (and associated ftable tree attachment) needs to
 * be cleaned up when the irb_refcnt goes to 0. The conditions that need
 * be verified are:
 * - no other walkers of the irebucket, i.e., quiescent irb_refcnt,
 * - no other threads holding references to ire's in the bucket,
 *   i.e., irb_nire == 0
 * - no active ire's in the bucket, i.e., irb_ire_cnt == 0
 * - need to hold the global tree lock and irb_lock in write mode.
 *
 * The function loops: each pass re-takes irb_lock as writer and
 * re-evaluates the state, because the lock is dropped while condemned
 * ires are cleaned up and before irb_inactive() is called.
 */
void
irb_refrele_ftable(irb_t *irb)
{
	for (;;) {
		rw_enter(&irb->irb_lock, RW_WRITER);
		ASSERT(irb->irb_refcnt != 0);
		if (irb->irb_refcnt != 1) {
			/*
			 * Someone has a reference to this radix node
			 * or there is some bucket walker.
			 */
			irb->irb_refcnt--;
			rw_exit(&irb->irb_lock);
			return;
		} else {
			/*
			 * There is no other walker, nor is there any
			 * other thread that holds a direct ref to this
			 * radix node. Do the clean up if needed. Call
			 * to ire_unlink will clear the IRB_MARK_CONDEMNED flag
			 */
			if (irb->irb_marks & IRB_MARK_CONDEMNED) {
				ire_t *ire_list;

				ire_list = ire_unlink(irb);
				rw_exit(&irb->irb_lock);

				if (ire_list != NULL)
					ire_cleanup(ire_list);
				/*
				 * more CONDEMNED entries could have
				 * been added while we dropped the lock,
				 * so we have to re-check.
				 */
				continue;
			}

			/*
			 * Now check if there are still any ires
			 * associated with this radix node.
			 */
			if (irb->irb_nire != 0) {
				/*
				 * someone is still holding on
				 * to ires in this bucket
				 */
				irb->irb_refcnt--;
				rw_exit(&irb->irb_lock);
				return;
			} else {
				/*
				 * Everything is clear. Zero walkers,
				 * Zero threads with a ref to this
				 * radix node, Zero ires associated with
				 * this radix node. Due to lock order,
				 * check the above conditions again
				 * after grabbing all locks in the right order
				 */
				rw_exit(&irb->irb_lock);
				if (irb_inactive(irb))
					return;
				/*
				 * irb_inactive could not free the irb.
				 * See if there are any walkers, if not
				 * try to clean up again.
				 */
			}
		}
	}
}

/*
 * IRE iterator used by ire_ftable_lookup to process multiple equal
 * routes. Given a starting point in the hash list (hash), walk the IREs
 * in the bucket skipping deleted entries. We treat the bucket as a circular
 * list for the purposes of walking it.
 * Returns the IRE (held) that corresponds to the hash value. If that IRE is
 * not applicable (ire_match_args failed) then it returns a subsequent one.
 * If we fail to find an IRE we return NULL.
 *
 * Assumes that the caller holds a reference on the IRE bucket and a read lock
 * on the radix_node_head (for IPv4) or the ip6_ire_head (for IPv6).
 *
 * Applies to IPv4 and IPv6.
 *
 * For CGTP, where an IRE_BROADCAST and IRE_HOST can exist for the same
 * address and bucket, we compare against ire_type for the orig_ire. We also
 * have IRE_BROADCASTs with and without RTF_MULTIRT, with the former being
 * first in the bucket. Thus we compare that ire_flags match the orig_ire.
* * Due to shared-IP zones we check that an IRE_OFFLINK has a gateway that is * reachable from the zone i.e., that the ire_gateway_addr is in a subnet * in which the zone has an IP address. We check this for the global zone * even if no shared-IP zones are configured. */ ire_t * ire_round_robin(irb_t *irb_ptr, ire_ftable_args_t *margs, uint_t hash, ire_t *orig_ire, ip_stack_t *ipst) { ire_t *ire, *maybe_ire = NULL; uint_t maybe_badcnt; uint_t maxwalk; /* Fold in more bits from the hint/hash */ hash = hash ^ (hash >> 8) ^ (hash >> 16); rw_enter(&irb_ptr->irb_lock, RW_WRITER); maxwalk = irb_ptr->irb_ire_cnt; /* Excludes condemned */ hash %= maxwalk; irb_refhold_locked(irb_ptr); rw_exit(&irb_ptr->irb_lock); /* * Round-robin the routers list looking for a route that * matches the passed in parameters. * First we skip "hash" number of non-condemned IREs. * Then we match the IRE. * If we find an ire which has a non-zero ire_badcnt then we remember * it and keep on looking for a lower ire_badcnt. * If we come to the end of the list we continue (treat the * bucket list as a circular list) but we match less than "max" * entries. */ ire = irb_ptr->irb_ire; while (maxwalk > 0) { if (IRE_IS_CONDEMNED(ire)) goto next_ire_skip; /* Skip the first "hash" entries to do ECMP */ if (hash != 0) { hash--; goto next_ire_skip; } /* See CGTP comment above */ if (ire->ire_type != orig_ire->ire_type || ire->ire_flags != orig_ire->ire_flags) goto next_ire; /* * Note: Since IPv6 has hash buckets instead of radix * buckers we need to explicitly compare the addresses. * That makes this less efficient since we will be called * even if there is no alternatives just because the * bucket has multiple IREs for different addresses. */ if (ire->ire_ipversion == IPV6_VERSION) { if (!IN6_ARE_ADDR_EQUAL(&orig_ire->ire_addr_v6, &ire->ire_addr_v6)) goto next_ire; } /* * For some reason find_best_route uses ire_mask. We do * the same. */ if (ire->ire_ipversion == IPV4_VERSION ? 
!ire_match_args(ire, margs->ift_addr, ire->ire_mask, margs->ift_gateway, margs->ift_type, margs->ift_ill, margs->ift_zoneid, margs->ift_tsl, margs->ift_flags) : !ire_match_args_v6(ire, &margs->ift_addr_v6, &ire->ire_mask_v6, &margs->ift_gateway_v6, margs->ift_type, margs->ift_ill, margs->ift_zoneid, margs->ift_tsl, margs->ift_flags)) goto next_ire; if (margs->ift_zoneid != ALL_ZONES && (ire->ire_type & IRE_OFFLINK)) { /* * When we're in a zone, we're only * interested in routers that are * reachable through ipifs within our zone. */ if (ire->ire_ipversion == IPV4_VERSION) { if (!ire_gateway_ok_zone_v4( ire->ire_gateway_addr, margs->ift_zoneid, ire->ire_ill, margs->ift_tsl, ipst, B_TRUE)) goto next_ire; } else { if (!ire_gateway_ok_zone_v6( &ire->ire_gateway_addr_v6, margs->ift_zoneid, ire->ire_ill, margs->ift_tsl, ipst, B_TRUE)) goto next_ire; } } mutex_enter(&ire->ire_lock); /* Look for stale ire_badcnt and clear */ if (ire->ire_badcnt != 0 && (TICK_TO_SEC(ddi_get_lbolt64()) - ire->ire_last_badcnt > ipst->ips_ip_ire_badcnt_lifetime)) ire->ire_badcnt = 0; mutex_exit(&ire->ire_lock); if (ire->ire_badcnt == 0) { /* We found one with a zero badcnt; done */ ire_refhold(ire); /* * Care needed since irb_refrele grabs WLOCK to free * the irb_t. */ if (ire->ire_ipversion == IPV4_VERSION) { RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); irb_refrele(irb_ptr); RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); } else { rw_exit(&ipst->ips_ip6_ire_head_lock); irb_refrele(irb_ptr); rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER); } return (ire); } /* * keep looking to see if there is a better (lower * badcnt) matching IRE, but save this one as a last resort. * If we find a lower badcnt pick that one as the last* resort. 
*/ if (maybe_ire == NULL) { maybe_ire = ire; maybe_badcnt = ire->ire_badcnt; } else if (ire->ire_badcnt < maybe_badcnt) { maybe_ire = ire; maybe_badcnt = ire->ire_badcnt; } next_ire: maxwalk--; next_ire_skip: ire = ire->ire_next; if (ire == NULL) ire = irb_ptr->irb_ire; } if (maybe_ire != NULL) ire_refhold(maybe_ire); /* Care needed since irb_refrele grabs WLOCK to free the irb_t. */ if (ire->ire_ipversion == IPV4_VERSION) { RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); irb_refrele(irb_ptr); RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); } else { rw_exit(&ipst->ips_ip6_ire_head_lock); irb_refrele(irb_ptr); rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER); } return (maybe_ire); } void irb_refhold_rn(struct radix_node *rn) { if ((rn->rn_flags & RNF_ROOT) == 0) irb_refhold(&((rt_t *)(rn))->rt_irb); } void irb_refrele_rn(struct radix_node *rn) { if ((rn->rn_flags & RNF_ROOT) == 0) irb_refrele_ftable(&((rt_t *)(rn))->rt_irb); } /* * Select a route for IPv4 and IPv6. Except for multicast, loopback and reject * routes this routine sets up a ire_nce_cache as well. The caller needs to * lookup an nce for the multicast case. */ ire_t * ip_select_route(const in6_addr_t *v6dst, ip_xmit_attr_t *ixa, uint_t *generationp, in6_addr_t *setsrcp, int *errorp, boolean_t *multirtp) { uint_t match_args; uint_t ire_type; ill_t *ill; ire_t *ire; ip_stack_t *ipst = ixa->ixa_ipst; ipaddr_t v4dst; in6_addr_t v6nexthop; iaflags_t ixaflags = ixa->ixa_flags; nce_t *nce; match_args = MATCH_IRE_SECATTR; IN6_V4MAPPED_TO_IPADDR(v6dst, v4dst); if (setsrcp != NULL) ASSERT(IN6_IS_ADDR_UNSPECIFIED(setsrcp)); if (errorp != NULL) ASSERT(*errorp == 0); /* * The content of the ixa will be different if IP_NEXTHOP, * SO_DONTROUTE, IP_BOUND_IF, IP_PKTINFO etc are set */ if ((ixaflags & IXAF_IS_IPV4) ? 
CLASSD(v4dst) : IN6_IS_ADDR_MULTICAST(v6dst)) { /* Pick up the IRE_MULTICAST for the ill */ if (ixa->ixa_multicast_ifindex != 0) { ill = ill_lookup_on_ifindex(ixa->ixa_multicast_ifindex, !(ixaflags & IXAF_IS_IPV4), ipst); } else if (ixaflags & IXAF_SCOPEID_SET) { /* sin6_scope_id takes precedence over ixa_ifindex */ ASSERT(ixa->ixa_scopeid != 0); ill = ill_lookup_on_ifindex(ixa->ixa_scopeid, !(ixaflags & IXAF_IS_IPV4), ipst); } else if (ixa->ixa_ifindex != 0) { /* * In the ipmp case, the ixa_ifindex is set to * point at an under_ill and we would return the * ire_multicast() corresponding to that under_ill. */ ill = ill_lookup_on_ifindex(ixa->ixa_ifindex, !(ixaflags & IXAF_IS_IPV4), ipst); } else if (ixaflags & IXAF_IS_IPV4) { ipaddr_t v4setsrc = INADDR_ANY; ill = ill_lookup_group_v4(v4dst, ixa->ixa_zoneid, ipst, multirtp, &v4setsrc); if (setsrcp != NULL) IN6_IPADDR_TO_V4MAPPED(v4setsrc, setsrcp); } else { ill = ill_lookup_group_v6(v6dst, ixa->ixa_zoneid, ipst, multirtp, setsrcp); } if (ill != NULL && IS_VNI(ill)) { ill_refrele(ill); ill = NULL; } if (ill == NULL) { if (errorp != NULL) *errorp = ENXIO; /* Get a hold on the IRE_NOROUTE */ ire = ire_reject(ipst, !(ixaflags & IXAF_IS_IPV4)); return (ire); } if (!(ill->ill_flags & ILLF_MULTICAST)) { ill_refrele(ill); if (errorp != NULL) *errorp = EHOSTUNREACH; /* Get a hold on the IRE_NOROUTE */ ire = ire_reject(ipst, !(ixaflags & IXAF_IS_IPV4)); return (ire); } /* Get a refcnt on the single IRE_MULTICAST per ill */ ire = ire_multicast(ill); ill_refrele(ill); if (generationp != NULL) *generationp = ire->ire_generation; if (errorp != NULL && (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) { *errorp = EHOSTUNREACH; } return (ire); } if (ixa->ixa_ifindex != 0 || (ixaflags & IXAF_SCOPEID_SET)) { if (ixaflags & IXAF_SCOPEID_SET) { /* sin6_scope_id takes precedence over ixa_ifindex */ ASSERT(ixa->ixa_scopeid != 0); ill = ill_lookup_on_ifindex(ixa->ixa_scopeid, !(ixaflags & IXAF_IS_IPV4), ipst); } else { ASSERT(ixa->ixa_ifindex 
!= 0); ill = ill_lookup_on_ifindex(ixa->ixa_ifindex, !(ixaflags & IXAF_IS_IPV4), ipst); } if (ill != NULL && IS_VNI(ill)) { ill_refrele(ill); ill = NULL; } if (ill == NULL) { if (errorp != NULL) *errorp = ENXIO; /* Get a hold on the IRE_NOROUTE */ ire = ire_reject(ipst, !(ixaflags & IXAF_IS_IPV4)); return (ire); } /* * icmp_send_reply_v6 uses scopeid, and mpathd sets IP*_BOUND_IF * so for both of them we need to be able look for an under * interface. */ if (IS_UNDER_IPMP(ill)) match_args |= MATCH_IRE_TESTHIDDEN; } else { ill = NULL; } if (ixaflags & IXAF_NEXTHOP_SET) { /* IP_NEXTHOP was set */ v6nexthop = ixa->ixa_nexthop_v6; } else { v6nexthop = *v6dst; } ire_type = 0; /* If ill is null then ire_route_recursive will set MATCH_IRE_ILL */ /* * If SO_DONTROUTE is set or if IP_NEXTHOP is set, then * we only look for an onlink IRE. */ if (ixaflags & (IXAF_DONTROUTE|IXAF_NEXTHOP_SET)) { match_args |= MATCH_IRE_TYPE; ire_type = IRE_ONLINK; } if (ixaflags & IXAF_IS_IPV4) { ipaddr_t v4nexthop; ipaddr_t v4setsrc = INADDR_ANY; IN6_V4MAPPED_TO_IPADDR(&v6nexthop, v4nexthop); ire = ire_route_recursive_v4(v4nexthop, ire_type, ill, ixa->ixa_zoneid, ixa->ixa_tsl, match_args, B_TRUE, ixa->ixa_xmit_hint, ipst, &v4setsrc, NULL, generationp); if (setsrcp != NULL) IN6_IPADDR_TO_V4MAPPED(v4setsrc, setsrcp); } else { ire = ire_route_recursive_v6(&v6nexthop, ire_type, ill, ixa->ixa_zoneid, ixa->ixa_tsl, match_args, B_TRUE, ixa->ixa_xmit_hint, ipst, setsrcp, NULL, generationp); } #ifdef DEBUG if (match_args & MATCH_IRE_TESTHIDDEN) { ip3dbg(("looking for hidden; dst %x ire %p\n", v4dst, (void *)ire)); } #endif if (ill != NULL) ill_refrele(ill); if ((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) || (ire->ire_type & IRE_MULTICAST)) { /* No ire_nce_cache */ return (ire); } /* Setup ire_nce_cache if it doesn't exist or is condemned. 
*/ mutex_enter(&ire->ire_lock); nce = ire->ire_nce_cache; if (nce == NULL || nce->nce_is_condemned) { mutex_exit(&ire->ire_lock); (void) ire_revalidate_nce(ire); } else { mutex_exit(&ire->ire_lock); } return (ire); } /* * Find a route given some xmit attributes and a packet. * Generic for IPv4 and IPv6 * * This never returns NULL. But when it returns the IRE_NOROUTE * it might set errorp. */ ire_t * ip_select_route_pkt(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp, int *errorp, boolean_t *multirtp) { if (ixa->ixa_flags & IXAF_IS_IPV4) { ipha_t *ipha = (ipha_t *)mp->b_rptr; in6_addr_t v6dst; IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst); return (ip_select_route(&v6dst, ixa, generationp, NULL, errorp, multirtp)); } else { ip6_t *ip6h = (ip6_t *)mp->b_rptr; return (ip_select_route(&ip6h->ip6_dst, ixa, generationp, NULL, errorp, multirtp)); } } ire_t * ip_select_route_v4(ipaddr_t dst, ip_xmit_attr_t *ixa, uint_t *generationp, ipaddr_t *v4setsrcp, int *errorp, boolean_t *multirtp) { in6_addr_t v6dst; ire_t *ire; in6_addr_t setsrc; ASSERT(ixa->ixa_flags & IXAF_IS_IPV4); IN6_IPADDR_TO_V4MAPPED(dst, &v6dst); setsrc = ipv6_all_zeros; ire = ip_select_route(&v6dst, ixa, generationp, &setsrc, errorp, multirtp); if (v4setsrcp != NULL) IN6_V4MAPPED_TO_IPADDR(&setsrc, *v4setsrcp); return (ire); } /* * Recursively look for a route to the destination. Can also match on * the zoneid, ill, and label. Used for the data paths. See also * ire_route_recursive. * * If ill is set this means we will match it by adding MATCH_IRE_ILL. * * Note that this function never returns NULL. It returns an IRE_NOROUTE * instead. * * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it * is an error. * Allow at most one RTF_INDIRECT. 
*/
/*
 * Arguments, as used by the code below:
 *
 *	ire		Optional starting IRE. When non-NULL it is used for
 *			the first iteration instead of a table lookup; an
 *			extra hold is taken on it here, so the caller keeps
 *			its own reference.
 *	nexthop		Address to look up. Replaced by the found IRE's
 *			ire_gateway_addr on every further iteration.
 *	ire_type, zoneid, tsl, xmit_hint, ipst
 *			Passed through to ire_ftable_lookup_v4().
 *	ill_arg		When non-NULL, MATCH_IRE_ILL is added to match_args
 *			and ill_arg is preferred over the ill picked up
 *			while recursing.
 *	match_args	Lookup flags for the first iteration. Later
 *			iterations keep only MATCH_IRE_TYPE (plus
 *			MATCH_IRE_ILL once an ill has been picked up).
 *	allocate	When false no IRE_IF_CLONE is created; the chain ends
 *			at the IRE_INTERFACE and the generations are
 *			invalidated.
 *	setsrcp		Optional; must point at INADDR_ANY on entry. Receives
 *			the ire_setsrc_addr of the first RTF_SETSRC IRE.
 *	gwattrp		Optional; must point at NULL on entry. Receives the
 *			first non-NULL ire_gw_secattr.
 *	generationp	Optional. Read as the generation of a caller-supplied
 *			ire; written with the resulting generation, or with
 *			IRE_GENERATION_VERIFY on the error paths.
 *
 * On success the first IRE of the chain (ires[0]) is returned with a hold.
 * On failure the IRE returned carries RTF_REJECT or RTF_BLACKHOLE.
 */
ire_t *
ire_route_recursive_impl_v4(ire_t *ire,
    ipaddr_t nexthop, uint_t ire_type, const ill_t *ill_arg,
    zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args,
    boolean_t allocate, uint32_t xmit_hint, ip_stack_t *ipst,
    ipaddr_t *setsrcp, tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp)
{
	int		i, j;
	ire_t		*ires[MAX_IRE_RECURSION];
	uint_t		generation;
	uint_t		generations[MAX_IRE_RECURSION];
	boolean_t	need_refrele = B_FALSE;
	boolean_t	invalidate = B_FALSE;
	int		prefs[MAX_IRE_RECURSION];
	ill_t		*ill = NULL;

	/* The out-parameters must arrive "empty"; only the first hit is kept */
	if (setsrcp != NULL)
		ASSERT(*setsrcp == INADDR_ANY);
	if (gwattrp != NULL)
		ASSERT(*gwattrp == NULL);

	if (ill_arg != NULL)
		match_args |= MATCH_IRE_ILL;

	/*
	 * We iterate up to three times to resolve a route, even though
	 * we have four slots in the array. The extra slot is for an
	 * IRE_IF_CLONE we might need to create.
	 */
	i = 0;
	while (i < MAX_IRE_RECURSION - 1) {
		/* ire_ftable_lookup handles round-robin/ECMP */
		if (ire == NULL) {
			ire = ire_ftable_lookup_v4(nexthop, 0, 0, ire_type,
			    (ill_arg != NULL ? ill_arg : ill), zoneid, tsl,
			    match_args, xmit_hint, ipst, &generation);
		} else {
			/* Caller passed it; extra hold since we will rele */
			ire_refhold(ire);
			if (generationp != NULL)
				generation = *generationp;
			else
				generation = IRE_GENERATION_VERIFY;
		}
		/* Nothing found: substitute the reject IRE */
		if (ire == NULL)
			ire = ire_reject(ipst, B_FALSE);

		/* Need to return the ire with RTF_REJECT|BLACKHOLE */
		if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))
			goto error;

		ASSERT(!(ire->ire_type & IRE_MULTICAST)); /* Not in ftable */

		/*
		 * Record ire_pref() for this level; below, each level past
		 * the first must have a strictly larger value than the level
		 * before it or the lookup is rejected.
		 */
		prefs[i] = ire_pref(ire);
		if (i != 0) {
			/*
			 * Don't allow anything unusual past the first
			 * iteration.
			 */
			if ((ire->ire_type &
			    (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST)) ||
			    prefs[i] <= prefs[i-1]) {
				ire_refrele(ire);
				ire = ire_reject(ipst, B_FALSE);
				goto error;
			}
		}
		/* We have a usable IRE */
		ires[i] = ire;
		generations[i] = generation;
		i++;

		/* The first RTF_SETSRC address is passed back if setsrcp */
		if ((ire->ire_flags & RTF_SETSRC) &&
		    setsrcp != NULL && *setsrcp == INADDR_ANY) {
			ASSERT(ire->ire_setsrc_addr != INADDR_ANY);
			*setsrcp = ire->ire_setsrc_addr;
		}

		/* The first ire_gw_secattr is passed back if gwattrp */
		if (ire->ire_gw_secattr != NULL &&
		    gwattrp != NULL && *gwattrp == NULL)
			*gwattrp = ire->ire_gw_secattr;

		/*
		 * Check if we have a short-cut pointer to an IRE for this
		 * destination, and that the cached dependency isn't stale.
		 * In that case we've rejoined an existing tree towards a
		 * parent, thus we don't need to continue the loop to
		 * discover the rest of the tree.
		 */
		mutex_enter(&ire->ire_lock);
		if (ire->ire_dep_parent != NULL &&
		    ire->ire_dep_parent->ire_generation ==
		    ire->ire_dep_parent_generation) {
			mutex_exit(&ire->ire_lock);
			/* The hold now lives in ires[]; "done" expects NULL */
			ire = NULL;
			goto done;
		}
		mutex_exit(&ire->ire_lock);

		/*
		 * If this type should have an ire_nce_cache (even if it
		 * doesn't yet have one) then we are done. Includes
		 * IRE_INTERFACE with a full 32 bit mask.
		 */
		if (ire->ire_nce_capable) {
			ire = NULL;
			goto done;
		}
		ASSERT(!(ire->ire_type & IRE_IF_CLONE));
		/*
		 * For an IRE_INTERFACE we create an IRE_IF_CLONE for this
		 * particular destination
		 */
		if (ire->ire_type & IRE_INTERFACE) {
			in6_addr_t	v6nexthop;
			ire_t		*clone;

			ASSERT(ire->ire_masklen != IPV4_ABITS);

			/*
			 * In the case of ip_input and ILLF_FORWARDING not
			 * being set, and in the case of RTM_GET,
			 * there is no point in allocating
			 * an IRE_IF_CLONE. We return the IRE_INTERFACE.
			 * Note that !allocate can result in a ire_dep_parent
			 * which is IRE_IF_* without an IRE_IF_CLONE.
			 * We recover from that when we need to send packets
			 * by ensuring that the generations become
			 * IRE_GENERATION_VERIFY in this case.
			 */
			if (!allocate) {
				invalidate = B_TRUE;
				ire = NULL;
				goto done;
			}

			IN6_IPADDR_TO_V4MAPPED(nexthop, &v6nexthop);

			clone = ire_create_if_clone(ire, &v6nexthop,
			    &generation);
			if (clone == NULL) {
				/*
				 * Temporary failure - no memory.
				 * Don't want caller to cache IRE_NOROUTE.
				 *
				 * NOTE(review): "invalidate" is only read on
				 * the "done" path; the "error" path always
				 * reports IRE_GENERATION_VERIFY regardless.
				 */
				invalidate = B_TRUE;
				ire = ire_blackhole(ipst, B_FALSE);
				goto error;
			}
			/*
			 * Make clone next to last entry and the
			 * IRE_INTERFACE the last in the dependency
			 * chain since the clone depends on the
			 * IRE_INTERFACE.
			 */
			ASSERT(i >= 1);
			ASSERT(i < MAX_IRE_RECURSION);

			ires[i] = ires[i-1];
			generations[i] = generations[i-1];
			ires[i-1] = clone;
			generations[i-1] = generation;
			i++;

			ire = NULL;
			goto done;
		}

		/*
		 * We only match on the type and optionally ILL when
		 * recursing. The type match is used by some callers
		 * to exclude certain types (such as IRE_IF_CLONE or
		 * IRE_LOCAL|IRE_LOOPBACK).
		 */
		match_args &= MATCH_IRE_TYPE;
		nexthop = ire->ire_gateway_addr;
		/*
		 * Remember (and hold) the first ill seen along the chain so
		 * that later lookups can be restricted to it.
		 */
		if (ill == NULL && ire->ire_ill != NULL) {
			ill = ire->ire_ill;
			need_refrele = B_TRUE;
			ill_refhold(ill);
			match_args |= MATCH_IRE_ILL;
		}
		/* Force a table lookup on the next iteration */
		ire = NULL;
	}
	/* Ran out of iterations without reaching the end of the chain */
	ASSERT(ire == NULL);
	ire = ire_reject(ipst, B_FALSE);

error:
	/* "ire" is the reject/blackhole IRE that will be returned */
	ASSERT(ire != NULL);
	if (need_refrele)
		ill_refrele(ill);

	/*
	 * In the case of MULTIRT we want to try a different IRE the next
	 * time. We let the next packet retry in that case.
	 */
	if (i > 0 && (ires[0]->ire_flags & RTF_MULTIRT))
		(void) ire_no_good(ires[0]);

cleanup:
	/* Unbuild dependencies and drop the holds on ires[0 .. i-1] */
	ire_dep_unbuild(ires, i);
	for (j = 0; j < i; j++)
		ire_refrele(ires[j]);

	ASSERT(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE));
	/*
	 * Use IRE_GENERATION_VERIFY to ensure that ip_output will redo the
	 * ip_select_route since the reject or lack of memory might be gone.
	 */
	if (generationp != NULL)
		*generationp = IRE_GENERATION_VERIFY;
	return (ire);

done:
	/* Success path: every IRE found is held in ires[0 .. i-1] */
	ASSERT(ire == NULL);
	if (need_refrele) {
		ill_refrele(ill);
		ill = NULL;
	}

	/* Build dependencies */
	if (!ire_dep_build(ires, generations, i)) {
		/* Something in chain was condemned; tear it apart */
		ire = ire_reject(ipst, B_FALSE);
		goto cleanup;
	}

	/*
	 * Release all refholds except the one for ires[0] that we
	 * will return to the caller.
	 */
	for (j = 1; j < i; j++)
		ire_refrele(ires[j]);

	if (invalidate) {
		/*
		 * Since we needed to allocate but couldn't we need to make
		 * sure that the dependency chain is rebuilt the next time.
		 */
		ire_dep_invalidate_generations(ires[0]);
		generation = IRE_GENERATION_VERIFY;
	} else {
		/*
		 * IREs can have been added or deleted while we did the
		 * recursive lookup and we can't catch those until we've built
		 * the dependencies. We verify the stored
		 * ire_dep_parent_generation to catch any such changes and
		 * return IRE_GENERATION_VERIFY (which will cause
		 * ip_select_route to be called again so we can redo the
		 * recursive lookup next time we send a packet).
		 */
		generation = ire_dep_validate_generations(ires[0]);
		if (generations[0] != ires[0]->ire_generation) {
			/* Something changed at the top */
			generation = IRE_GENERATION_VERIFY;
		}
	}
	if (generationp != NULL)
		*generationp = generation;

	return (ires[0]);
}

/*
 * Entry point for a recursive lookup that has no IRE to start from: calls
 * ire_route_recursive_impl_v4() with a NULL starting IRE, so the first
 * iteration does a table lookup for nexthop. All other arguments are passed
 * through unchanged.
 */
ire_t *
ire_route_recursive_v4(ipaddr_t nexthop, uint_t ire_type, const ill_t *ill,
    zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args,
    boolean_t allocate, uint32_t xmit_hint, ip_stack_t *ipst,
    ipaddr_t *setsrcp, tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp)
{
	return (ire_route_recursive_impl_v4(NULL, nexthop, ire_type, ill,
	    zoneid, tsl, match_args, allocate, xmit_hint, ipst, setsrcp,
	    gwattrp, generationp));
}

/*
 * Recursively look for a route to the destination.
 * We only handle a destination match here, yet we have the same arguments
 * as the full match to allow function pointers to select between the two.
 *
 * Note that this function never returns NULL.
It returns an IRE_NOROUTE
 * instead.
 *
 * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it
 * is an error.
 * Allow at most one RTF_INDIRECT.
 *
 * Two short cuts avoid the general recursive code: the IRE found by the
 * simple lookup is returned as is when it is ire_nce_capable, or when its
 * ire_dep_parent still has the generation that was recorded in
 * ire_dep_parent_generation. Otherwise ire_route_recursive_impl_v4() is
 * run starting from that IRE.
 */
ire_t *
ire_route_recursive_dstonly_v4(ipaddr_t nexthop, boolean_t allocate,
    uint32_t xmit_hint, ip_stack_t *ipst)
{
	ire_t		*first;
	ire_t		*resolved;
	uint_t		gen;
	boolean_t	parent_current;

	/* ire_ftable_lookup handles round-robin/ECMP */
	first = ire_ftable_lookup_simple_v4(nexthop, xmit_hint, ipst, &gen);
	ASSERT(first != NULL);

	/*
	 * An IRE of a type that should have an ire_nce_cache (whether or
	 * not one is attached yet) ends the search. This includes an
	 * IRE_INTERFACE with a full 32 bit mask.
	 */
	if (first->ire_nce_capable)
		return (first);

	/*
	 * A cached parent whose generation is still current means the whole
	 * parent chain is current, so there are no dependencies left to
	 * discover. The comparison is made under ire_lock.
	 */
	mutex_enter(&first->ire_lock);
	parent_current = (first->ire_dep_parent != NULL &&
	    first->ire_dep_parent->ire_generation ==
	    first->ire_dep_parent_generation);
	mutex_exit(&first->ire_lock);
	if (parent_current)
		return (first);

	/*
	 * Fall back to the general loop, seeded with the IRE found above.
	 * Normally this hands back that same IRE. The callee takes its own
	 * hold, so the lookup's hold is dropped afterwards.
	 */
	resolved = ire_route_recursive_impl_v4(first, nexthop, 0, NULL,
	    ALL_ZONES, NULL, MATCH_IRE_DSTONLY, allocate, xmit_hint, ipst,
	    NULL, NULL, &gen);
	ire_refrele(first);
	return (resolved);
}