/* * CDDL HEADER START * * The contents of this file are subject to the terms of the * Common Development and Distribution License (the "License"). * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. * See the License for the specific language governing permissions * and limitations under the License. * * When distributing Covered Code, include this CDDL HEADER in each * file and include the License file at usr/src/OPENSOLARIS.LICENSE. * If applicable, add the following below this CDDL HEADER, with the * fields enclosed by brackets "[]" replaced with your own identifying * information: Portions Copyright [yyyy] [name of copyright owner] * * CDDL HEADER END */ /* * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * Copyright (c) 1990 Mentat Inc. */ /* * This file contains routines that manipulate Internet Routing Entries (IREs). */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define IS_DEFAULT_ROUTE_V6(ire) \ (((ire)->ire_type & IRE_DEFAULT) || \ (((ire)->ire_type & IRE_INTERFACE) && \ (IN6_IS_ADDR_UNSPECIFIED(&(ire)->ire_addr_v6)))) static ire_t ire_null; static ire_t * ire_ftable_lookup_impl_v6(const in6_addr_t *addr, const in6_addr_t *mask, const in6_addr_t *gateway, int type, const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl, int flags, ip_stack_t *ipst); /* * Initialize the ire that is specific to IPv6 part and call * ire_init_common to finish it. * Returns zero or errno. */ int ire_init_v6(ire_t *ire, const in6_addr_t *v6addr, const in6_addr_t *v6mask, const in6_addr_t *v6gateway, ushort_t type, ill_t *ill, zoneid_t zoneid, uint_t flags, tsol_gc_t *gc, ip_stack_t *ipst) { int error; /* * Reject IRE security attmakeribute creation/initialization * if system is not running in Trusted mode. */ if (gc != NULL && !is_system_labeled()) return (EINVAL); BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_alloced); if (v6addr != NULL) ire->ire_addr_v6 = *v6addr; if (v6gateway != NULL) ire->ire_gateway_addr_v6 = *v6gateway; /* Make sure we don't have stray values in some fields */ switch (type) { case IRE_LOOPBACK: ire->ire_gateway_addr_v6 = ire->ire_addr_v6; /* FALLTHRU */ case IRE_HOST: case IRE_LOCAL: case IRE_IF_CLONE: ire->ire_mask_v6 = ipv6_all_ones; ire->ire_masklen = IPV6_ABITS; break; case IRE_PREFIX: case IRE_DEFAULT: case IRE_IF_RESOLVER: case IRE_IF_NORESOLVER: if (v6mask != NULL) { ire->ire_mask_v6 = *v6mask; ire->ire_masklen = ip_mask_to_plen_v6(&ire->ire_mask_v6); } break; case IRE_MULTICAST: case IRE_NOROUTE: ASSERT(v6mask == NULL); break; default: ASSERT(0); return (EINVAL); } error = ire_init_common(ire, type, ill, zoneid, flags, IPV6_VERSION, gc, ipst); if (error != NULL) return (error); /* Determine which function pointers to use */ ire->ire_postfragfn = ip_xmit; /* Common case */ switch (ire->ire_type) { case IRE_LOCAL: ire->ire_sendfn = ire_send_local_v6; ire->ire_recvfn = ire_recv_local_v6; #ifdef SO_VRRP ASSERT(ire->ire_ill != NULL); if (ire->ire_ill->ill_flags & ILLF_NOACCEPT) { ire->ire_noaccept = B_TRUE; ire->ire_recvfn = ire_recv_noaccept_v6; } #endif break; case IRE_LOOPBACK: ire->ire_sendfn = ire_send_local_v6; ire->ire_recvfn = ire_recv_loopback_v6; break; case IRE_MULTICAST: ire->ire_postfragfn = ip_postfrag_loopcheck; ire->ire_sendfn = ire_send_multicast_v6; ire->ire_recvfn = ire_recv_multicast_v6; break; default: /* * For IRE_IF_ALL and IRE_OFFLINK we forward received * packets by default. */ ire->ire_sendfn = ire_send_wire_v6; ire->ire_recvfn = ire_recv_forward_v6; break; } if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { ire->ire_sendfn = ire_send_noroute_v6; ire->ire_recvfn = ire_recv_noroute_v6; } else if (ire->ire_flags & RTF_MULTIRT) { ire->ire_postfragfn = ip_postfrag_multirt_v6; ire->ire_sendfn = ire_send_multirt_v6; ire->ire_recvfn = ire_recv_multirt_v6; } ire->ire_nce_capable = ire_determine_nce_capable(ire); return (0); } /* * ire_create_v6 is called to allocate and initialize a new IRE. * * NOTE : This is called as writer sometimes though not required * by this function. */ /* ARGSUSED */ ire_t * ire_create_v6(const in6_addr_t *v6addr, const in6_addr_t *v6mask, const in6_addr_t *v6gateway, ushort_t type, ill_t *ill, zoneid_t zoneid, uint_t flags, tsol_gc_t *gc, ip_stack_t *ipst) { ire_t *ire; int error; ASSERT(!IN6_IS_ADDR_V4MAPPED(v6addr)); ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP); if (ire == NULL) { DTRACE_PROBE(kmem__cache__alloc); return (NULL); } *ire = ire_null; error = ire_init_v6(ire, v6addr, v6mask, v6gateway, type, ill, zoneid, flags, gc, ipst); if (error != 0) { DTRACE_PROBE2(ire__init__v6, ire_t *, ire, int, error); kmem_cache_free(ire_cache, ire); return (NULL); } return (ire); } /* * Find the ill matching a multicast group. * Allows different routes for multicast addresses * in the unicast routing table (akin to FF::0/8 but could be more specific) * which point at different interfaces. This is used when IPV6_MULTICAST_IF * isn't specified (when sending) and when IPV6_JOIN_GROUP doesn't * specify the interface to join on. * * Supports link-local addresses by using ire_route_recursive which follows * the ill when recursing. * * To handle CGTP, since we don't have a separate IRE_MULTICAST for each group * and the MULTIRT property can be different for different groups, we * extract RTF_MULTIRT from the special unicast route added for a group * with CGTP and pass that back in the multirtp argument. * This is used in ip_set_destination etc to set ixa_postfragfn for multicast. * We have a setsrcp argument for the same reason. */ ill_t * ire_lookup_multi_ill_v6(const in6_addr_t *group, zoneid_t zoneid, ip_stack_t *ipst, boolean_t *multirtp, in6_addr_t *setsrcp) { ire_t *ire; ill_t *ill; ire = ire_route_recursive_v6(group, 0, NULL, zoneid, NULL, MATCH_IRE_DSTONLY, B_FALSE, 0, ipst, setsrcp, NULL, NULL); ASSERT(ire != NULL); if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { ire_refrele(ire); return (NULL); } if (multirtp != NULL) *multirtp = (ire->ire_flags & RTF_MULTIRT) != 0; ill = ire_nexthop_ill(ire); ire_refrele(ire); return (ill); } /* * This function takes a mask and returns number of bits set in the * mask (the represented prefix length). Assumes a contiguous mask. */ int ip_mask_to_plen_v6(const in6_addr_t *v6mask) { int bits; int plen = IPV6_ABITS; int i; for (i = 3; i >= 0; i--) { if (v6mask->s6_addr32[i] == 0) { plen -= 32; continue; } bits = ffs(ntohl(v6mask->s6_addr32[i])) - 1; if (bits == 0) break; plen -= bits; } return (plen); } /* * Convert a prefix length to the mask for that prefix. * Returns the argument bitmask. */ in6_addr_t * ip_plen_to_mask_v6(uint_t plen, in6_addr_t *bitmask) { uint32_t *ptr; if (plen < 0 || plen > IPV6_ABITS) return (NULL); *bitmask = ipv6_all_zeros; if (plen == 0) return (bitmask); ptr = (uint32_t *)bitmask; while (plen > 32) { *ptr++ = 0xffffffffU; plen -= 32; } *ptr = htonl(0xffffffffU << (32 - plen)); return (bitmask); } /* * Add a fully initialized IPv6 IRE to the forwarding table. * This returns NULL on failure, or a held IRE on success. * Normally the returned IRE is the same as the argument. But a different * IRE will be returned if the added IRE is deemed identical to an existing * one. In that case ire_identical_ref will be increased. * The caller always needs to do an ire_refrele() on the returned IRE. */ ire_t * ire_add_v6(ire_t *ire) { ire_t *ire1; int mask_table_index; irb_t *irb_ptr; ire_t **irep; int match_flags; int error; ip_stack_t *ipst = ire->ire_ipst; ASSERT(ire->ire_ipversion == IPV6_VERSION); /* Make sure the address is properly masked. */ V6_MASK_COPY(ire->ire_addr_v6, ire->ire_mask_v6, ire->ire_addr_v6); mask_table_index = ip_mask_to_plen_v6(&ire->ire_mask_v6); if ((ipst->ips_ip_forwarding_table_v6[mask_table_index]) == NULL) { irb_t *ptr; int i; ptr = (irb_t *)mi_zalloc((ipst->ips_ip6_ftable_hash_size * sizeof (irb_t))); if (ptr == NULL) { ire_delete(ire); return (NULL); } for (i = 0; i < ipst->ips_ip6_ftable_hash_size; i++) { rw_init(&ptr[i].irb_lock, NULL, RW_DEFAULT, NULL); } mutex_enter(&ipst->ips_ire_ft_init_lock); if (ipst->ips_ip_forwarding_table_v6[mask_table_index] == NULL) { ipst->ips_ip_forwarding_table_v6[mask_table_index] = ptr; mutex_exit(&ipst->ips_ire_ft_init_lock); } else { /* * Some other thread won the race in * initializing the forwarding table at the * same index. */ mutex_exit(&ipst->ips_ire_ft_init_lock); for (i = 0; i < ipst->ips_ip6_ftable_hash_size; i++) { rw_destroy(&ptr[i].irb_lock); } mi_free(ptr); } } irb_ptr = &(ipst->ips_ip_forwarding_table_v6[mask_table_index][ IRE_ADDR_MASK_HASH_V6(ire->ire_addr_v6, ire->ire_mask_v6, ipst->ips_ip6_ftable_hash_size)]); match_flags = (MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_GW); if (ire->ire_ill != NULL) match_flags |= MATCH_IRE_ILL; /* * Start the atomic add of the ire. Grab the bucket lock and the * ill lock. Check for condemned. */ error = ire_atomic_start(irb_ptr, ire); if (error != 0) { ire_delete(ire); return (NULL); } /* * If we are creating a hidden IRE, make sure we search for * hidden IREs when searching for duplicates below. * Otherwise, we might find an IRE on some other interface * that's not marked hidden. */ if (ire->ire_testhidden) match_flags |= MATCH_IRE_TESTHIDDEN; /* * Atomically check for duplicate and insert in the table. */ for (ire1 = irb_ptr->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) { if (IRE_IS_CONDEMNED(ire1)) continue; /* * Here we need an exact match on zoneid, i.e., * ire_match_args doesn't fit. */ if (ire1->ire_zoneid != ire->ire_zoneid) continue; if (ire1->ire_type != ire->ire_type) continue; /* * Note: We do not allow multiple routes that differ only * in the gateway security attributes; such routes are * considered duplicates. * To change that we explicitly have to treat them as * different here. */ if (ire_match_args_v6(ire1, &ire->ire_addr_v6, &ire->ire_mask_v6, &ire->ire_gateway_addr_v6, ire->ire_type, ire->ire_ill, ire->ire_zoneid, NULL, match_flags)) { /* * Return the old ire after doing a REFHOLD. * As most of the callers continue to use the IRE * after adding, we return a held ire. This will * avoid a lookup in the caller again. If the callers * don't want to use it, they need to do a REFRELE. */ ip1dbg(("found dup ire existing %p new %p", (void *)ire1, (void *)ire)); ire_refhold(ire1); atomic_add_32(&ire1->ire_identical_ref, 1); ire_atomic_end(irb_ptr, ire); ire_delete(ire); return (ire1); } } /* * Normally we do head insertion since most things do not care about * the order of the IREs in the bucket. * However, due to shared-IP zones (and restrict_interzone_loopback) * we can have an IRE_LOCAL as well as IRE_IF_CLONE for the same * address. For that reason we do tail insertion for IRE_IF_CLONE. */ irep = (ire_t **)irb_ptr; if (ire->ire_type & IRE_IF_CLONE) { while ((ire1 = *irep) != NULL) irep = &ire1->ire_next; } /* Insert at *irep */ ire1 = *irep; if (ire1 != NULL) ire1->ire_ptpn = &ire->ire_next; ire->ire_next = ire1; /* Link the new one in. */ ire->ire_ptpn = irep; /* * ire_walk routines de-reference ire_next without holding * a lock. Before we point to the new ire, we want to make * sure the store that sets the ire_next of the new ire * reaches global visibility, so that ire_walk routines * don't see a truncated list of ires i.e if the ire_next * of the new ire gets set after we do "*irep = ire" due * to re-ordering, the ire_walk thread will see a NULL * once it accesses the ire_next of the new ire. * membar_producer() makes sure that the following store * happens *after* all of the above stores. */ membar_producer(); *irep = ire; ire->ire_bucket = irb_ptr; /* * We return a bumped up IRE above. Keep it symmetrical * so that the callers will always have to release. This * helps the callers of this function because they continue * to use the IRE after adding and hence they don't have to * lookup again after we return the IRE. * * NOTE : We don't have to use atomics as this is appearing * in the list for the first time and no one else can bump * up the reference count on this yet. */ ire_refhold_locked(ire); BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_inserted); irb_ptr->irb_ire_cnt++; if (ire->ire_ill != NULL) { DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ire->ire_ill, (char *), "ire", (void *), ire); ire->ire_ill->ill_ire_cnt++; ASSERT(ire->ire_ill->ill_ire_cnt != 0); /* Wraparound */ } ire_atomic_end(irb_ptr, ire); /* Make any caching of the IREs be notified or updated */ ire_flush_cache_v6(ire, IRE_FLUSH_ADD); return (ire); } /* * Search for all HOST REDIRECT routes that are * pointing at the specified gateway and * delete them. This routine is called only * when a default gateway is going away. */ static void ire_delete_host_redirects_v6(const in6_addr_t *gateway, ip_stack_t *ipst) { irb_t *irb_ptr; irb_t *irb; ire_t *ire; in6_addr_t gw_addr_v6; int i; /* get the hash table for HOST routes */ irb_ptr = ipst->ips_ip_forwarding_table_v6[(IP6_MASK_TABLE_SIZE - 1)]; if (irb_ptr == NULL) return; for (i = 0; (i < ipst->ips_ip6_ftable_hash_size); i++) { irb = &irb_ptr[i]; irb_refhold(irb); for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) { if (!(ire->ire_flags & RTF_DYNAMIC)) continue; mutex_enter(&ire->ire_lock); gw_addr_v6 = ire->ire_gateway_addr_v6; mutex_exit(&ire->ire_lock); if (IN6_ARE_ADDR_EQUAL(&gw_addr_v6, gateway)) ire_delete(ire); } irb_refrele(irb); } } /* * Delete the specified IRE. * All calls should use ire_delete(). * Sometimes called as writer though not required by this function. * * NOTE : This function is called only if the ire was added * in the list. */ void ire_delete_v6(ire_t *ire) { in6_addr_t gw_addr_v6; ip_stack_t *ipst = ire->ire_ipst; /* * Make sure ire_generation increases from ire_flush_cache happen * after any lookup/reader has read ire_generation. * Since the rw_enter makes us wait until any lookup/reader has * completed we can exit the lock immediately. */ rw_enter(&ipst->ips_ip6_ire_head_lock, RW_WRITER); rw_exit(&ipst->ips_ip6_ire_head_lock); ASSERT(ire->ire_refcnt >= 1); ASSERT(ire->ire_ipversion == IPV6_VERSION); ire_flush_cache_v6(ire, IRE_FLUSH_DELETE); if (ire->ire_type == IRE_DEFAULT) { /* * when a default gateway is going away * delete all the host redirects pointing at that * gateway. */ mutex_enter(&ire->ire_lock); gw_addr_v6 = ire->ire_gateway_addr_v6; mutex_exit(&ire->ire_lock); ire_delete_host_redirects_v6(&gw_addr_v6, ipst); } /* * If we are deleting an IRE_INTERFACE then we make sure we also * delete any IRE_IF_CLONE that has been created from it. * Those are always in ire_dep_children. */ if ((ire->ire_type & IRE_INTERFACE) && ire->ire_dep_children != 0) ire_dep_delete_if_clone(ire); /* Remove from parent dependencies and child */ rw_enter(&ipst->ips_ire_dep_lock, RW_WRITER); if (ire->ire_dep_parent != NULL) { ire_dep_remove(ire); } while (ire->ire_dep_children != NULL) ire_dep_remove(ire->ire_dep_children); rw_exit(&ipst->ips_ire_dep_lock); } /* * When an IRE is added or deleted this routine is called to make sure * any caching of IRE information is notified or updated. * * The flag argument indicates if the flush request is due to addition * of new route (IRE_FLUSH_ADD), deletion of old route (IRE_FLUSH_DELETE), * or a change to ire_gateway_addr (IRE_FLUSH_GWCHANGE). */ void ire_flush_cache_v6(ire_t *ire, int flag) { ip_stack_t *ipst = ire->ire_ipst; /* * IRE_IF_CLONE ire's don't provide any new information * than the parent from which they are cloned, so don't * perturb the generation numbers. */ if (ire->ire_type & IRE_IF_CLONE) return; /* * Ensure that an ire_add during a lookup serializes the updates of * the generation numbers under ire_head_lock so that the lookup gets * either the old ire and old generation number, or a new ire and new * generation number. */ rw_enter(&ipst->ips_ip6_ire_head_lock, RW_WRITER); /* * If a route was just added, we need to notify everybody that * has cached an IRE_NOROUTE since there might now be a better * route for them. */ if (flag == IRE_FLUSH_ADD) { ire_increment_generation(ipst->ips_ire_reject_v6); ire_increment_generation(ipst->ips_ire_blackhole_v6); } /* Adding a default can't otherwise provide a better route */ if (ire->ire_type == IRE_DEFAULT && flag == IRE_FLUSH_ADD) { rw_exit(&ipst->ips_ip6_ire_head_lock); return; } switch (flag) { case IRE_FLUSH_DELETE: case IRE_FLUSH_GWCHANGE: /* * Update ire_generation for all ire_dep_children chains * starting with this IRE */ ire_dep_incr_generation(ire); break; case IRE_FLUSH_ADD: { in6_addr_t addr; in6_addr_t mask; ip_stack_t *ipst = ire->ire_ipst; uint_t masklen; /* * Find an IRE which is a shorter match than the ire to be added * For any such IRE (which we repeat) we update the * ire_generation the same way as in the delete case. */ addr = ire->ire_addr_v6; mask = ire->ire_mask_v6; masklen = ip_mask_to_plen_v6(&mask); ire = ire_ftable_lookup_impl_v6(&addr, &mask, NULL, 0, NULL, ALL_ZONES, NULL, MATCH_IRE_SHORTERMASK, ipst); while (ire != NULL) { /* We need to handle all in the same bucket */ irb_increment_generation(ire->ire_bucket); mask = ire->ire_mask_v6; ASSERT(masklen > ip_mask_to_plen_v6(&mask)); masklen = ip_mask_to_plen_v6(&mask); ire_refrele(ire); ire = ire_ftable_lookup_impl_v6(&addr, &mask, NULL, 0, NULL, ALL_ZONES, NULL, MATCH_IRE_SHORTERMASK, ipst); } } break; } rw_exit(&ipst->ips_ip6_ire_head_lock); } /* * Matches the arguments passed with the values in the ire. * * Note: for match types that match using "ill" passed in, ill * must be checked for non-NULL before calling this routine. */ boolean_t ire_match_args_v6(ire_t *ire, const in6_addr_t *addr, const in6_addr_t *mask, const in6_addr_t *gateway, int type, const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl, int match_flags) { in6_addr_t masked_addr; in6_addr_t gw_addr_v6; ill_t *ire_ill = NULL, *dst_ill; ip_stack_t *ipst = ire->ire_ipst; ASSERT(ire->ire_ipversion == IPV6_VERSION); ASSERT(addr != NULL); ASSERT(mask != NULL); ASSERT((!(match_flags & MATCH_IRE_GW)) || gateway != NULL); ASSERT((!(match_flags & MATCH_IRE_ILL)) || (ill != NULL && ill->ill_isv6)); /* * If MATCH_IRE_TESTHIDDEN is set, then only return the IRE if it * is in fact hidden, to ensure the caller gets the right one. */ if (ire->ire_testhidden) { if (!(match_flags & MATCH_IRE_TESTHIDDEN)) return (B_FALSE); } if (zoneid != ALL_ZONES && zoneid != ire->ire_zoneid && ire->ire_zoneid != ALL_ZONES) { /* * If MATCH_IRE_ZONEONLY has been set and the supplied zoneid * does not match that of ire_zoneid, a failure to * match is reported at this point. Otherwise, since some IREs * that are available in the global zone can be used in local * zones, additional checks need to be performed: * * IRE_LOOPBACK * entries should never be matched in this situation. * Each zone has its own IRE_LOOPBACK. * * IRE_LOCAL * We allow them for any zoneid. ire_route_recursive * does additional checks when * ip_restrict_interzone_loopback is set. * * If ill_usesrc_ifindex is set * Then we check if the zone has a valid source address * on the usesrc ill. * * If ire_ill is set, then check that the zone has an ipif * on that ill. * * Outside of this function (in ire_round_robin) we check * that any IRE_OFFLINK has a gateway that reachable from the * zone when we have multiple choices (ECMP). */ if (match_flags & MATCH_IRE_ZONEONLY) return (B_FALSE); if (ire->ire_type & IRE_LOOPBACK) return (B_FALSE); if (ire->ire_type & IRE_LOCAL) goto matchit; /* * The normal case of IRE_ONLINK has a matching zoneid. * Here we handle the case when shared-IP zones have been * configured with IP addresses on vniN. In that case it * is ok for traffic from a zone to use IRE_ONLINK routes * if the ill has a usesrc pointing at vniN * Applies to IRE_INTERFACE. */ dst_ill = ire->ire_ill; if (ire->ire_type & IRE_ONLINK) { uint_t ifindex; /* * Note there is no IRE_INTERFACE on vniN thus * can't do an IRE lookup for a matching route. */ ifindex = dst_ill->ill_usesrc_ifindex; if (ifindex == 0) return (B_FALSE); /* * If there is a usable source address in the * zone, then it's ok to return this IRE_INTERFACE */ if (!ipif_zone_avail(ifindex, dst_ill->ill_isv6, zoneid, ipst)) { ip3dbg(("ire_match_args: no usrsrc for zone" " dst_ill %p\n", (void *)dst_ill)); return (B_FALSE); } } /* * For exampe, with * route add 11.0.0.0 gw1 -ifp bge0 * route add 11.0.0.0 gw2 -ifp bge1 * this code would differentiate based on * where the sending zone has addresses. * Only if the zone has an address on bge0 can it use the first * route. It isn't clear if this behavior is documented * anywhere. */ if (dst_ill != NULL && (ire->ire_type & IRE_OFFLINK)) { ipif_t *tipif; mutex_enter(&dst_ill->ill_lock); for (tipif = dst_ill->ill_ipif; tipif != NULL; tipif = tipif->ipif_next) { if (!IPIF_IS_CONDEMNED(tipif) && (tipif->ipif_flags & IPIF_UP) && (tipif->ipif_zoneid == zoneid || tipif->ipif_zoneid == ALL_ZONES)) break; } mutex_exit(&dst_ill->ill_lock); if (tipif == NULL) return (B_FALSE); } } matchit: if (match_flags & MATCH_IRE_GW) { mutex_enter(&ire->ire_lock); gw_addr_v6 = ire->ire_gateway_addr_v6; mutex_exit(&ire->ire_lock); } if (match_flags & MATCH_IRE_ILL) { ire_ill = ire->ire_ill; /* * If asked to match an ill, we *must* match * on the ire_ill for ipmp test addresses, or * any of the ill in the group for data addresses. * If we don't, we may as well fail. * However, we need an exception for IRE_LOCALs to ensure * we loopback packets even sent to test addresses on different * interfaces in the group. */ if ((match_flags & MATCH_IRE_TESTHIDDEN) && !(ire->ire_type & IRE_LOCAL)) { if (ire->ire_ill != ill) return (B_FALSE); } else { match_flags &= ~MATCH_IRE_TESTHIDDEN; /* * We know that ill is not NULL, but ire_ill could be * NULL */ if (ire_ill == NULL || !IS_ON_SAME_LAN(ill, ire_ill)) return (B_FALSE); } } /* No ire_addr_v6 bits set past the mask */ ASSERT(V6_MASK_EQ(ire->ire_addr_v6, ire->ire_mask_v6, ire->ire_addr_v6)); V6_MASK_COPY(*addr, *mask, masked_addr); if (V6_MASK_EQ(*addr, *mask, ire->ire_addr_v6) && ((!(match_flags & MATCH_IRE_GW)) || IN6_ARE_ADDR_EQUAL(&gw_addr_v6, gateway)) && ((!(match_flags & MATCH_IRE_TYPE)) || (ire->ire_type & type)) && ((!(match_flags & MATCH_IRE_TESTHIDDEN)) || ire->ire_testhidden) && ((!(match_flags & MATCH_IRE_MASK)) || (IN6_ARE_ADDR_EQUAL(&ire->ire_mask_v6, mask))) && ((!(match_flags & MATCH_IRE_SECATTR)) || (!is_system_labeled()) || (tsol_ire_match_gwattr(ire, tsl) == 0))) { /* We found the matched IRE */ return (B_TRUE); } return (B_FALSE); } /* * Check if the zoneid (not ALL_ZONES) has an IRE_INTERFACE for the specified * gateway address. If ill is non-NULL we also match on it. * The caller must hold a read lock on RADIX_NODE_HEAD if lock_held is set. */ boolean_t ire_gateway_ok_zone_v6(const in6_addr_t *gateway, zoneid_t zoneid, ill_t *ill, const ts_label_t *tsl, ip_stack_t *ipst, boolean_t lock_held) { ire_t *ire; uint_t match_flags; if (lock_held) ASSERT(RW_READ_HELD(&ipst->ips_ip6_ire_head_lock)); else rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER); match_flags = MATCH_IRE_TYPE | MATCH_IRE_SECATTR; if (ill != NULL) match_flags |= MATCH_IRE_ILL; ire = ire_ftable_lookup_impl_v6(gateway, &ipv6_all_zeros, &ipv6_all_zeros, IRE_INTERFACE, ill, zoneid, tsl, match_flags, ipst); if (!lock_held) rw_exit(&ipst->ips_ip6_ire_head_lock); if (ire != NULL) { ire_refrele(ire); return (B_TRUE); } else { return (B_FALSE); } } /* * Lookup a route in forwarding table. * specific lookup is indicated by passing the * required parameters and indicating the * match required in flag field. * * Supports link-local addresses by following the ipif/ill when recursing. */ ire_t * ire_ftable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask, const in6_addr_t *gateway, int type, const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl, int flags, uint32_t xmit_hint, ip_stack_t *ipst, uint_t *generationp) { ire_t *ire = NULL; ASSERT(addr != NULL); ASSERT((!(flags & MATCH_IRE_MASK)) || mask != NULL); ASSERT((!(flags & MATCH_IRE_GW)) || gateway != NULL); ASSERT(ill == NULL || ill->ill_isv6); ASSERT(!IN6_IS_ADDR_V4MAPPED(addr)); /* * ire_match_args_v6() will dereference ill if MATCH_IRE_ILL * is set. */ if ((flags & (MATCH_IRE_ILL)) && (ill == NULL)) return (NULL); rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER); ire = ire_ftable_lookup_impl_v6(addr, mask, gateway, type, ill, zoneid, tsl, flags, ipst); if (ire == NULL) { rw_exit(&ipst->ips_ip6_ire_head_lock); return (NULL); } /* * round-robin only if we have more than one route in the bucket. * ips_ip_ecmp_behavior controls when we do ECMP * 2: always * 1: for IRE_DEFAULT and /0 IRE_INTERFACE * 0: never * * Note: if we found an IRE_IF_CLONE we won't look at the bucket with * other ECMP IRE_INTERFACEs since the IRE_IF_CLONE is a /128 match * and the IRE_INTERFACESs are likely to be shorter matches. */ if (ire->ire_bucket->irb_ire_cnt > 1 && !(flags & MATCH_IRE_GW)) { if (ipst->ips_ip_ecmp_behavior == 2 || (ipst->ips_ip_ecmp_behavior == 1 && IS_DEFAULT_ROUTE_V6(ire))) { ire_t *next_ire; ire_ftable_args_t margs; (void) memset(&margs, 0, sizeof (margs)); margs.ift_addr_v6 = *addr; if (mask != NULL) margs.ift_mask_v6 = *mask; if (gateway != NULL) margs.ift_gateway_v6 = *gateway; margs.ift_type = type; margs.ift_ill = ill; margs.ift_zoneid = zoneid; margs.ift_tsl = tsl; margs.ift_flags = flags; next_ire = ire_round_robin(ire->ire_bucket, &margs, xmit_hint, ire, ipst); if (next_ire == NULL) { /* keep ire if next_ire is null */ goto done; } ire_refrele(ire); ire = next_ire; } } done: /* Return generation before dropping lock */ if (generationp != NULL) *generationp = ire->ire_generation; rw_exit(&ipst->ips_ip6_ire_head_lock); /* * For shared-IP zones we need additional checks to what was * done in ire_match_args to make sure IRE_LOCALs are handled. * * When ip_restrict_interzone_loopback is set, then * we ensure that IRE_LOCAL are only used for loopback * between zones when the logical "Ethernet" would * have looped them back. That is, if in the absense of * the IRE_LOCAL we would have sent to packet out the * same ill. */ if ((ire->ire_type & IRE_LOCAL) && zoneid != ALL_ZONES && ire->ire_zoneid != zoneid && ire->ire_zoneid != ALL_ZONES && ipst->ips_ip_restrict_interzone_loopback) { ire = ire_alt_local(ire, zoneid, tsl, ill, generationp); ASSERT(ire != NULL); } return (ire); } /* * Look up a single ire. The caller holds either the read or write lock. */ ire_t * ire_ftable_lookup_impl_v6(const in6_addr_t *addr, const in6_addr_t *mask, const in6_addr_t *gateway, int type, const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl, int flags, ip_stack_t *ipst) { irb_t *irb_ptr; ire_t *ire = NULL; int i; ASSERT(RW_LOCK_HELD(&ipst->ips_ip6_ire_head_lock)); /* * If the mask is known, the lookup * is simple, if the mask is not known * we need to search. */ if (flags & MATCH_IRE_MASK) { uint_t masklen; masklen = ip_mask_to_plen_v6(mask); if (ipst->ips_ip_forwarding_table_v6[masklen] == NULL) { return (NULL); } irb_ptr = &(ipst->ips_ip_forwarding_table_v6[masklen][ IRE_ADDR_MASK_HASH_V6(*addr, *mask, ipst->ips_ip6_ftable_hash_size)]); rw_enter(&irb_ptr->irb_lock, RW_READER); for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) { if (IRE_IS_CONDEMNED(ire)) continue; if (ire_match_args_v6(ire, addr, mask, gateway, type, ill, zoneid, tsl, flags)) goto found_ire; } rw_exit(&irb_ptr->irb_lock); } else { uint_t masklen; /* * In this case we don't know the mask, we need to * search the table assuming different mask sizes. */ if (flags & MATCH_IRE_SHORTERMASK) { masklen = ip_mask_to_plen_v6(mask); if (masklen == 0) { /* Nothing shorter than zero */ return (NULL); } masklen--; } else { masklen = IP6_MASK_TABLE_SIZE - 1; } for (i = masklen; i >= 0; i--) { in6_addr_t tmpmask; if ((ipst->ips_ip_forwarding_table_v6[i]) == NULL) continue; (void) ip_plen_to_mask_v6(i, &tmpmask); irb_ptr = &ipst->ips_ip_forwarding_table_v6[i][ IRE_ADDR_MASK_HASH_V6(*addr, tmpmask, ipst->ips_ip6_ftable_hash_size)]; rw_enter(&irb_ptr->irb_lock, RW_READER); for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) { if (IRE_IS_CONDEMNED(ire)) continue; if (ire_match_args_v6(ire, addr, &ire->ire_mask_v6, gateway, type, ill, zoneid, tsl, flags)) goto found_ire; } rw_exit(&irb_ptr->irb_lock); } } ASSERT(ire == NULL); ip1dbg(("ire_ftable_lookup_v6: returning NULL ire")); return (NULL); found_ire: ire_refhold(ire); rw_exit(&irb_ptr->irb_lock); return (ire); } /* * This function is called by * ip_input/ire_route_recursive when doing a route lookup on only the * destination address. * * The optimizations of this function over ire_ftable_lookup are: * o removing unnecessary flag matching * o doing longest prefix match instead of overloading it further * with the unnecessary "best_prefix_match" * * If no route is found we return IRE_NOROUTE. */ ire_t * ire_ftable_lookup_simple_v6(const in6_addr_t *addr, uint32_t xmit_hint, ip_stack_t *ipst, uint_t *generationp) { ire_t *ire; ire = ire_ftable_lookup_v6(addr, NULL, NULL, 0, NULL, ALL_ZONES, NULL, MATCH_IRE_DSTONLY, xmit_hint, ipst, generationp); if (ire == NULL) { ire = ire_reject(ipst, B_TRUE); if (generationp != NULL) *generationp = IRE_GENERATION_VERIFY; } /* ftable_lookup did round robin */ return (ire); } ire_t * ip_select_route_v6(const in6_addr_t *dst, ip_xmit_attr_t *ixa, uint_t *generationp, in6_addr_t *setsrcp, int *errorp, boolean_t *multirtp) { ASSERT(!(ixa->ixa_flags & IXAF_IS_IPV4)); return (ip_select_route(dst, ixa, generationp, setsrcp, errorp, multirtp)); } /* * Recursively look for a route to the destination. Can also match on * the zoneid, ill, and label. Used for the data paths. See also * ire_route_recursive_dstonly. * * If ill is set this means we will match it by adding MATCH_IRE_ILL. * * If allocate is not set then we will only inspect the existing IREs; never * create an IRE_IF_CLONE. This is used on the receive side when we are not * forwarding. * * Note that this function never returns NULL. It returns an IRE_NOROUTE * instead. * * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it * is an error. * Allow at most one RTF_INDIRECT. */ ire_t * ire_route_recursive_impl_v6(ire_t *ire, const in6_addr_t *nexthop, uint_t ire_type, const ill_t *ill_arg, zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args, boolean_t allocate, uint32_t xmit_hint, ip_stack_t *ipst, in6_addr_t *setsrcp, tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp) { int i, j; in6_addr_t v6nexthop = *nexthop; ire_t *ires[MAX_IRE_RECURSION]; uint_t generation; uint_t generations[MAX_IRE_RECURSION]; boolean_t need_refrele = B_FALSE; boolean_t invalidate = B_FALSE; int prefs[MAX_IRE_RECURSION]; ill_t *ill = NULL; if (setsrcp != NULL) ASSERT(IN6_IS_ADDR_UNSPECIFIED(setsrcp)); if (gwattrp != NULL) ASSERT(*gwattrp == NULL); if (ill_arg != NULL) match_args |= MATCH_IRE_ILL; /* * We iterate up to three times to resolve a route, even though * we have four slots in the array. The extra slot is for an * IRE_IF_CLONE we might need to create. */ i = 0; while (i < MAX_IRE_RECURSION - 1) { /* ire_ftable_lookup handles round-robin/ECMP */ if (ire == NULL) { ire = ire_ftable_lookup_v6(&v6nexthop, 0, 0, ire_type, (ill_arg != NULL ? ill_arg : ill), zoneid, tsl, match_args, xmit_hint, ipst, &generation); } else { /* Caller passed it; extra hold since we will rele */ ire_refhold(ire); if (generationp != NULL) generation = *generationp; else generation = IRE_GENERATION_VERIFY; } if (ire == NULL) ire = ire_reject(ipst, B_TRUE); /* Need to return the ire with RTF_REJECT|BLACKHOLE */ if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) goto error; ASSERT(!(ire->ire_type & IRE_MULTICAST)); /* Not in ftable */ prefs[i] = ire_pref(ire); if (i != 0) { /* * Don't allow anything unusual past the first * iteration. */ if ((ire->ire_type & (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST)) || prefs[i] <= prefs[i-1]) { ire_refrele(ire); ire = ire_reject(ipst, B_TRUE); goto error; } } /* We have a usable IRE */ ires[i] = ire; generations[i] = generation; i++; /* The first RTF_SETSRC address is passed back if setsrcp */ if ((ire->ire_flags & RTF_SETSRC) && setsrcp != NULL && IN6_IS_ADDR_UNSPECIFIED(setsrcp)) { ASSERT(!IN6_IS_ADDR_UNSPECIFIED( &ire->ire_setsrc_addr_v6)); *setsrcp = ire->ire_setsrc_addr_v6; } /* The first ire_gw_secattr is passed back if gwattrp */ if (ire->ire_gw_secattr != NULL && gwattrp != NULL && *gwattrp == NULL) *gwattrp = ire->ire_gw_secattr; /* * Check if we have a short-cut pointer to an IRE for this * destination, and that the cached dependency isn't stale. * In that case we've rejoined an existing tree towards a * parent, thus we don't need to continue the loop to * discover the rest of the tree. */ mutex_enter(&ire->ire_lock); if (ire->ire_dep_parent != NULL && ire->ire_dep_parent->ire_generation == ire->ire_dep_parent_generation) { mutex_exit(&ire->ire_lock); ire = NULL; goto done; } mutex_exit(&ire->ire_lock); /* * If this type should have an ire_nce_cache (even if it * doesn't yet have one) then we are done. Includes * IRE_INTERFACE with a full 128 bit mask. */ if (ire->ire_nce_capable) { ire = NULL; goto done; } ASSERT(!(ire->ire_type & IRE_IF_CLONE)); /* * For an IRE_INTERFACE we create an IRE_IF_CLONE for this * particular destination */ if (ire->ire_type & IRE_INTERFACE) { ire_t *clone; ASSERT(ire->ire_masklen != IPV6_ABITS); /* * In the case of ip_input and ILLF_FORWARDING not * being set, and in the case of RTM_GET, * there is no point in allocating * an IRE_IF_CLONE. We return the IRE_INTERFACE. * Note that !allocate can result in a ire_dep_parent * which is IRE_IF_* without an IRE_IF_CLONE. * We recover from that when we need to send packets * by ensuring that the generations become * IRE_GENERATION_VERIFY in this case. */ if (!allocate) { invalidate = B_TRUE; ire = NULL; goto done; } clone = ire_create_if_clone(ire, &v6nexthop, &generation); if (clone == NULL) { /* * Temporary failure - no memory. * Don't want caller to cache IRE_NOROUTE. */ invalidate = B_TRUE; ire = ire_blackhole(ipst, B_TRUE); goto error; } /* * Make clone next to last entry and the * IRE_INTERFACE the last in the dependency * chain since the clone depends on the * IRE_INTERFACE. */ ASSERT(i >= 1); ASSERT(i < MAX_IRE_RECURSION); ires[i] = ires[i-1]; generations[i] = generations[i-1]; ires[i-1] = clone; generations[i-1] = generation; i++; ire = NULL; goto done; } /* * We only match on the type and optionally ILL when * recursing. The type match is used by some callers * to exclude certain types (such as IRE_IF_CLONE or * IRE_LOCAL|IRE_LOOPBACK). */ match_args &= MATCH_IRE_TYPE; v6nexthop = ire->ire_gateway_addr_v6; if (ill == NULL && ire->ire_ill != NULL) { ill = ire->ire_ill; need_refrele = B_TRUE; ill_refhold(ill); match_args |= MATCH_IRE_ILL; } ire = NULL; } ASSERT(ire == NULL); ire = ire_reject(ipst, B_TRUE); error: ASSERT(ire != NULL); if (need_refrele) ill_refrele(ill); /* * In the case of MULTIRT we want to try a different IRE the next * time. We let the next packet retry in that case. */ if (i > 0 && (ires[0]->ire_flags & RTF_MULTIRT)) (void) ire_no_good(ires[0]); cleanup: /* cleanup ires[i] */ ire_dep_unbuild(ires, i); for (j = 0; j < i; j++) ire_refrele(ires[j]); ASSERT(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)); /* * Use IRE_GENERATION_VERIFY to ensure that ip_output will redo the * ip_select_route since the reject or lack of memory might be gone. */ if (generationp != NULL) *generationp = IRE_GENERATION_VERIFY; return (ire); done: ASSERT(ire == NULL); if (need_refrele) ill_refrele(ill); /* Build dependencies */ if (!ire_dep_build(ires, generations, i)) { /* Something in chain was condemned; tear it apart */ ire = ire_blackhole(ipst, B_TRUE); goto cleanup; } /* * Release all refholds except the one for ires[0] that we * will return to the caller. */ for (j = 1; j < i; j++) ire_refrele(ires[j]); if (invalidate) { /* * Since we needed to allocate but couldn't we need to make * sure that the dependency chain is rebuilt the next time. */ ire_dep_invalidate_generations(ires[0]); generation = IRE_GENERATION_VERIFY; } else { /* * IREs can have been added or deleted while we did the * recursive lookup and we can't catch those until we've built * the dependencies. We verify the stored * ire_dep_parent_generation to catch any such changes and * return IRE_GENERATION_VERIFY (which will cause * ip_select_route to be called again so we can redo the * recursive lookup next time we send a packet. */ generation = ire_dep_validate_generations(ires[0]); if (generations[0] != ires[0]->ire_generation) { /* Something changed at the top */ generation = IRE_GENERATION_VERIFY; } } if (generationp != NULL) *generationp = generation; return (ires[0]); } ire_t * ire_route_recursive_v6(const in6_addr_t *nexthop, uint_t ire_type, const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args, boolean_t allocate, uint32_t xmit_hint, ip_stack_t *ipst, in6_addr_t *setsrcp, tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp) { return (ire_route_recursive_impl_v6(NULL, nexthop, ire_type, ill, zoneid, tsl, match_args, allocate, xmit_hint, ipst, setsrcp, gwattrp, generationp)); } /* * Recursively look for a route to the destination. * We only handle a destination match here, yet we have the same arguments * as the full match to allow function pointers to select between the two. * * Note that this function never returns NULL. It returns an IRE_NOROUTE * instead. * * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it * is an error. * Allow at most one RTF_INDIRECT. */ ire_t * ire_route_recursive_dstonly_v6(const in6_addr_t *nexthop, boolean_t allocate, uint32_t xmit_hint, ip_stack_t *ipst) { ire_t *ire; ire_t *ire1; uint_t generation; /* ire_ftable_lookup handles round-robin/ECMP */ ire = ire_ftable_lookup_simple_v6(nexthop, xmit_hint, ipst, &generation); ASSERT(ire != NULL); /* * If this type should have an ire_nce_cache (even if it * doesn't yet have one) then we are done. Includes * IRE_INTERFACE with a full 128 bit mask. */ if (ire->ire_nce_capable) return (ire); /* * If the IRE has a current cached parent we know that the whole * parent chain is current, hence we don't need to discover and * build any dependencies by doing a recursive lookup. */ mutex_enter(&ire->ire_lock); if (ire->ire_dep_parent != NULL && ire->ire_dep_parent->ire_generation == ire->ire_dep_parent_generation) { mutex_exit(&ire->ire_lock); return (ire); } mutex_exit(&ire->ire_lock); /* * Fallback to loop in the normal code starting with the ire * we found. Normally this would return the same ire. */ ire1 = ire_route_recursive_impl_v6(ire, nexthop, 0, NULL, ALL_ZONES, NULL, MATCH_IRE_DSTONLY, allocate, xmit_hint, ipst, NULL, NULL, &generation); ire_refrele(ire); return (ire1); }