1c793af95Ssangeeta /* 2c793af95Ssangeeta * CDDL HEADER START 3c793af95Ssangeeta * 4c793af95Ssangeeta * The contents of this file are subject to the terms of the 5c793af95Ssangeeta * Common Development and Distribution License (the "License"). 6c793af95Ssangeeta * You may not use this file except in compliance with the License. 7c793af95Ssangeeta * 8c793af95Ssangeeta * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9c793af95Ssangeeta * or http://www.opensolaris.org/os/licensing. 10c793af95Ssangeeta * See the License for the specific language governing permissions 11c793af95Ssangeeta * and limitations under the License. 12c793af95Ssangeeta * 13c793af95Ssangeeta * When distributing Covered Code, include this CDDL HEADER in each 14c793af95Ssangeeta * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15c793af95Ssangeeta * If applicable, add the following below this CDDL HEADER, with the 16c793af95Ssangeeta * fields enclosed by brackets "[]" replaced with your own identifying 17c793af95Ssangeeta * information: Portions Copyright [yyyy] [name of copyright owner] 18c793af95Ssangeeta * 19c793af95Ssangeeta * CDDL HEADER END 20c793af95Ssangeeta */ 21c793af95Ssangeeta /* 22e11c3f44Smeem * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23c793af95Ssangeeta * Use is subject to license terms. 
24c793af95Ssangeeta */ 25c793af95Ssangeeta 26c793af95Ssangeeta /* 27c793af95Ssangeeta * This file contains consumer routines of the IPv4 forwarding engine 28c793af95Ssangeeta */ 29c793af95Ssangeeta 30c793af95Ssangeeta #include <sys/types.h> 31c793af95Ssangeeta #include <sys/stream.h> 32c793af95Ssangeeta #include <sys/stropts.h> 33c793af95Ssangeeta #include <sys/strlog.h> 34c793af95Ssangeeta #include <sys/dlpi.h> 35c793af95Ssangeeta #include <sys/ddi.h> 36c793af95Ssangeeta #include <sys/cmn_err.h> 37c793af95Ssangeeta #include <sys/policy.h> 38c793af95Ssangeeta 39c793af95Ssangeeta #include <sys/systm.h> 40c793af95Ssangeeta #include <sys/strsun.h> 41c793af95Ssangeeta #include <sys/kmem.h> 42c793af95Ssangeeta #include <sys/param.h> 43c793af95Ssangeeta #include <sys/socket.h> 44edd26dc5Sdr146992 #include <sys/strsubr.h> 45c793af95Ssangeeta #include <net/if.h> 46c793af95Ssangeeta #include <net/route.h> 47c793af95Ssangeeta #include <netinet/in.h> 48c793af95Ssangeeta #include <net/if_dl.h> 49c793af95Ssangeeta #include <netinet/ip6.h> 50c793af95Ssangeeta #include <netinet/icmp6.h> 51c793af95Ssangeeta 52*bd670b35SErik Nordmark #include <inet/ipsec_impl.h> 53c793af95Ssangeeta #include <inet/common.h> 54c793af95Ssangeeta #include <inet/mi.h> 55c793af95Ssangeeta #include <inet/mib2.h> 56c793af95Ssangeeta #include <inet/ip.h> 57edd26dc5Sdr146992 #include <inet/ip_impl.h> 58c793af95Ssangeeta #include <inet/ip6.h> 59c793af95Ssangeeta #include <inet/ip_ndp.h> 60c793af95Ssangeeta #include <inet/arp.h> 61c793af95Ssangeeta #include <inet/ip_if.h> 62c793af95Ssangeeta #include <inet/ip_ire.h> 63c793af95Ssangeeta #include <inet/ip_ftable.h> 64c793af95Ssangeeta #include <inet/ip_rts.h> 65c793af95Ssangeeta #include <inet/nd.h> 66c793af95Ssangeeta 67c793af95Ssangeeta #include <net/pfkeyv2.h> 68c793af95Ssangeeta #include <inet/sadb.h> 69c793af95Ssangeeta #include <inet/tcp.h> 70c793af95Ssangeeta #include <inet/ipclassifier.h> 71c793af95Ssangeeta #include <sys/zone.h> 72c793af95Ssangeeta 
#include <net/radix.h>
#include <sys/tsol/label.h>
#include <sys/tsol/tnet.h>

#define	IS_DEFAULT_ROUTE(ire)	\
	(((ire)->ire_type & IRE_DEFAULT) || \
	    (((ire)->ire_type & IRE_INTERFACE) && ((ire)->ire_addr == 0)))

static ire_t	*route_to_dst(const struct sockaddr *, zoneid_t, ip_stack_t *);
static void	ire_del_host_redir(ire_t *, char *);
static boolean_t ire_find_best_route(struct radix_node *, void *);

/*
 * Lookup a route in forwarding table. A specific lookup is indicated by
 * passing the required parameters and indicating the match required in the
 * flag field.
 *
 * Supports IP_BOUND_IF by following the ipif/ill when recursing.
 *
 * Returns NULL if MATCH_IRE_ILL is requested without an ill, or if the
 * radix tree has no entry satisfying the match arguments. Otherwise the
 * returned ire carries the reference taken by ire_find_best_route() (or
 * whatever ire_round_robin()/ire_alt_local() hand back in its place).
 * If generationp is non-NULL it receives the ire_generation of the chosen
 * ire, sampled while the radix tree lock is still held.
 */
ire_t *
ire_ftable_lookup_v4(ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway,
    int type, const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl,
    int flags, uint32_t xmit_hint, ip_stack_t *ipst, uint_t *generationp)
{
	ire_t *ire;
	struct rt_sockaddr rdst;
	struct rt_entry *rt;
	ire_ftable_args_t margs;

	ASSERT(ill == NULL || !ill->ill_isv6);

	/*
	 * ire_match_args() will dereference ill if MATCH_IRE_ILL
	 * is set.
	 */
	if ((flags & MATCH_IRE_ILL) && (ill == NULL))
		return (NULL);

	/* Only the destination is used as the radix search key. */
	(void) memset(&rdst, 0, sizeof (rdst));
	rdst.rt_sin_len = sizeof (rdst);
	rdst.rt_sin_family = AF_INET;
	rdst.rt_sin_addr.s_addr = addr;

	/* Everything else is handed to the per-leaf callback via margs. */
	(void) memset(&margs, 0, sizeof (margs));
	margs.ift_addr = addr;
	margs.ift_mask = mask;
	margs.ift_gateway = gateway;
	margs.ift_type = type;
	margs.ift_ill = ill;
	margs.ift_zoneid = zoneid;
	margs.ift_tsl = tsl;
	margs.ift_flags = flags;

	/*
	 * The flags argument passed to ire_ftable_lookup may cause the
	 * search to return, not the longest matching prefix, but the
	 * "best matching prefix", i.e., the longest prefix that also
	 * satisfies constraints imposed via the permutation of flags
	 * passed in. To achieve this, we invoke ire_match_args() on
	 * each matching leaf in the radix tree. ire_match_args is
	 * invoked by the callback function ire_find_best_route()
	 * We hold the global tree lock in read mode when calling
	 * rn_match_args. Before dropping the global tree lock, ensure
	 * that the radix node can't be deleted by incrementing ire_refcnt.
	 */
	RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
	rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst,
	    ipst->ips_ip_ftable, ire_find_best_route, &margs);
	ire = margs.ift_best_ire;
	if (rt == NULL) {
		RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
		return (NULL);
	}
	ASSERT(ire != NULL);

	DTRACE_PROBE2(ire__found, ire_ftable_args_t *, &margs, ire_t *, ire);

	/*
	 * round-robin only if we have more than one route in the bucket.
	 * ips_ip_ecmp_behavior controls when we do ECMP
	 *	2:	always
	 *	1:	for IRE_DEFAULT and /0 IRE_INTERFACE
	 *	0:	never
	 */
	if (ire->ire_bucket->irb_ire_cnt > 1 && !(flags & MATCH_IRE_GW)) {
		if (ipst->ips_ip_ecmp_behavior == 2 ||
		    (ipst->ips_ip_ecmp_behavior == 1 &&
		    IS_DEFAULT_ROUTE(ire))) {
			ire_t	*next_ire;

			margs.ift_best_ire = NULL;
			next_ire = ire_round_robin(ire->ire_bucket, &margs,
			    xmit_hint, ire, ipst);
			if (next_ire == NULL) {
				/* keep ire if next_ire is null */
				goto done;
			}
			ire_refrele(ire);
			ire = next_ire;
		}
	}

done:
	/* Return generation before dropping lock */
	if (generationp != NULL)
		*generationp = ire->ire_generation;

	RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);

	/*
	 * For shared-IP zones we need additional checks to what was
	 * done in ire_match_args to make sure IRE_LOCALs are handled.
	 *
	 * When ip_restrict_interzone_loopback is set, then
	 * we ensure that IRE_LOCAL are only used for loopback
	 * between zones when the logical "Ethernet" would
	 * have looped them back. That is, if in the absence of
	 * the IRE_LOCAL we would have sent the packet out the
	 * same ill.
	 */
	if ((ire->ire_type & IRE_LOCAL) && zoneid != ALL_ZONES &&
	    ire->ire_zoneid != zoneid && ire->ire_zoneid != ALL_ZONES &&
	    ipst->ips_ip_restrict_interzone_loopback) {
		ire = ire_alt_local(ire, zoneid, tsl, ill, generationp);
		ASSERT(ire != NULL);
	}
	return (ire);
}

/*
 * This function is called by
 * ip_input/ire_route_recursive when doing a route lookup on only the
 * destination address.
 *
 * The optimizations of this function over ire_ftable_lookup are:
 *	o removing unnecessary flag matching
 *	o doing longest prefix match instead of overloading it further
 *	  with the unnecessary "best_prefix_match"
 *
 * If no route is found we return the ire supplied by ire_reject() (the
 * "no route" ire) and, when generationp is non-NULL, set *generationp to
 * IRE_GENERATION_VERIFY. On success the first non-condemned ire of the
 * matching bucket (or its ECMP replacement) is returned with a reference
 * held and *generationp set to its ire_generation.
 */
ire_t *
ire_ftable_lookup_simple_v4(ipaddr_t addr, uint32_t xmit_hint, ip_stack_t *ipst,
    uint_t *generationp)
{
	ire_t *ire;
	struct rt_sockaddr rdst;
	struct rt_entry *rt;
	irb_t *irb;

	rdst.rt_sin_len = sizeof (rdst);
	rdst.rt_sin_family = AF_INET;
	rdst.rt_sin_addr.s_addr = addr;

	/*
	 * This is basically inlining a simpler version of ire_match_args
	 */
	RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);

	rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst,
	    ipst->ips_ip_ftable, NULL, NULL);

	if (rt == NULL)
		goto bad;

	irb = &rt->rt_irb;
	if (irb->irb_ire_cnt == 0)
		goto bad;

	/* Pick the first ire in the bucket that is not being deleted. */
	rw_enter(&irb->irb_lock, RW_READER);
	ire = irb->irb_ire;
	if (ire == NULL) {
		rw_exit(&irb->irb_lock);
		goto bad;
	}
	while (IRE_IS_CONDEMNED(ire)) {
		ire = ire->ire_next;
		if (ire == NULL) {
			rw_exit(&irb->irb_lock);
			goto bad;
		}
	}

	/* we have a ire that matches */
	ire_refhold(ire);
	rw_exit(&irb->irb_lock);

	/*
	 * round-robin only if we have more than one route in the bucket.
	 * ips_ip_ecmp_behavior controls when we do ECMP
	 *	2:	always
	 *	1:	for IRE_DEFAULT and /0 IRE_INTERFACE
	 *	0:	never
	 *
	 * Note: if we found an IRE_IF_CLONE we won't look at the bucket with
	 * other ECMP IRE_INTERFACEs since the IRE_IF_CLONE is a host match
	 * and the IRE_INTERFACEs are likely to be shorter matches.
	 */
	if (ire->ire_bucket->irb_ire_cnt > 1 &&
	    (ipst->ips_ip_ecmp_behavior == 2 ||
	    (ipst->ips_ip_ecmp_behavior == 1 && IS_DEFAULT_ROUTE(ire)))) {
		ire_t	*next_ire;
		ire_ftable_args_t margs;

		(void) memset(&margs, 0, sizeof (margs));
		margs.ift_addr = addr;
		margs.ift_zoneid = ALL_ZONES;

		next_ire = ire_round_robin(ire->ire_bucket, &margs,
		    xmit_hint, ire, ipst);
		if (next_ire != NULL) {
			ire_refrele(ire);
			ire = next_ire;
		}
		/* else: keep ire if next_ire is null */
	}

	/* Return generation before dropping lock */
	if (generationp != NULL)
		*generationp = ire->ire_generation;

	RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);

	/*
	 * Since we only did ALL_ZONES matches there is no special handling
	 * of IRE_LOCALs needed here. ire_ftable_lookup_v4 has to handle that.
	 */
	return (ire);

bad:
	if (generationp != NULL)
		*generationp = IRE_GENERATION_VERIFY;

	RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
	return (ire_reject(ipst, B_FALSE));
}

/*
 * Find the ill matching a multicast group.
 * Allows different routes for multicast addresses
 * in the unicast routing table (akin to 224.0.0.0 but could be more specific)
 * which point at different interfaces. This is used when IP_MULTICAST_IF
 * isn't specified (when sending) and when IP_ADD_MEMBERSHIP doesn't
 * specify the interface to join on.
 *
 * Supports link-local addresses by using ire_route_recursive which follows
 * the ill when recursing.
 *
 * To handle CGTP, since we don't have a separate IRE_MULTICAST for each group
 * and the MULTIRT property can be different for different groups, we
 * extract RTF_MULTIRT from the special unicast route added for a group
 * with CGTP and pass that back in the multirtp argument.
 * This is used in ip_set_destination etc to set ixa_postfragfn for multicast.
 * We have a setsrcp argument for the same reason.
 *
 * Returns NULL when the route found is RTF_REJECT or RTF_BLACKHOLE; in that
 * case *multirtp is left untouched. Otherwise returns whatever
 * ire_nexthop_ill() yields for the route (presumably a held ill that the
 * caller must ill_refrele() - confirm against ire_nexthop_ill()).
 */
ill_t *
ire_lookup_multi_ill_v4(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst,
    boolean_t *multirtp, ipaddr_t *setsrcp)
{
	ire_t	*ire;
	ill_t	*ill;

	ire = ire_route_recursive_v4(group, 0, NULL, zoneid, NULL,
	    MATCH_IRE_DSTONLY, B_FALSE, 0, ipst, setsrcp, NULL, NULL);
	ASSERT(ire != NULL);
	if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
		ire_refrele(ire);
		return (NULL);
	}

	if (multirtp != NULL)
		*multirtp = (ire->ire_flags & RTF_MULTIRT) != 0;

	ill = ire_nexthop_ill(ire);
	ire_refrele(ire);
	return (ill);
}

/*
 * Delete the passed in ire if the gateway addr matches.
 * gateway points at an ipaddr_t (see ire_delete_host_redirects()); only
 * RTF_DYNAMIC ires are considered.
 */
static void
ire_del_host_redir(ire_t *ire, char *gateway)
{
	if ((ire->ire_flags & RTF_DYNAMIC) &&
	    (ire->ire_gateway_addr == *(ipaddr_t *)gateway))
		ire_delete(ire);
}

/*
 * Search for all IRE_HOST RTF_DYNAMIC (aka redirect) routes that are
 * pointing at the specified gateway and
 * delete them. This routine is called only
 * when a default gateway is going away.
 */
void
ire_delete_host_redirects(ipaddr_t gateway, ip_stack_t *ipst)
{
	struct rtfuncarg rtfarg;

	(void) memset(&rtfarg, 0, sizeof (rtfarg));
	rtfarg.rt_func = ire_del_host_redir;
	rtfarg.rt_arg = (void *)&gateway;
	(void) ipst->ips_ip_ftable->rnh_walktree_mt(ipst->ips_ip_ftable,
	    rtfunc, &rtfarg, irb_refhold_rn, irb_refrele_rn);
}

/*
 * Obtain the rt_entry and rt_irb for the route to be added to
 * the ips_ip_ftable.
 * First attempt to add a node to the radix tree via rn_addroute. If the
 * route already exists, return the bucket for the existing route.
 *
 * Locking notes: Need to hold the global radix tree lock in write mode to
 * add a radix node. To prevent the node from being deleted, ire_get_bucket()
 * returns with a ref'ed irb_t. The ire itself is added in ire_add_v4()
 * while holding the irb_lock, but not the radix tree lock.
 *
 * Returns NULL if the rt_entry cannot be allocated, or if the add failed
 * and the subsequent lookup finds no non-root node for the same dst/mask.
 */
irb_t *
ire_get_bucket(ire_t *ire)
{
	struct radix_node *rn;
	struct rt_entry *rt;
	struct rt_sockaddr rmask, rdst;
	irb_t *irb = NULL;
	ip_stack_t *ipst = ire->ire_ipst;

	ASSERT(ipst->ips_ip_ftable != NULL);

	/* first try to see if route exists (based on rtalloc1) */
	(void) memset(&rdst, 0, sizeof (rdst));
	rdst.rt_sin_len = sizeof (rdst);
	rdst.rt_sin_family = AF_INET;
	rdst.rt_sin_addr.s_addr = ire->ire_addr;

	(void) memset(&rmask, 0, sizeof (rmask));
	rmask.rt_sin_len = sizeof (rmask);
	rmask.rt_sin_family = AF_INET;
	rmask.rt_sin_addr.s_addr = ire->ire_mask;

	/*
	 * add the route. based on BSD's rtrequest1(RTM_ADD)
	 */
	R_Malloc(rt, rt_entry_cache, sizeof (*rt));
	/* kmem_alloc failed */
	if (rt == NULL)
		return (NULL);

	(void) memset(rt, 0, sizeof (*rt));
	rt->rt_nodes->rn_key = (char *)&rt->rt_dst;
	rt->rt_dst = rdst;
	irb = &rt->rt_irb;
	irb->irb_marks |= IRB_MARK_DYNAMIC; /* dynamically allocated/freed */
	irb->irb_ipst = ipst;
	rw_init(&irb->irb_lock, NULL, RW_DEFAULT, NULL);
	RADIX_NODE_HEAD_WLOCK(ipst->ips_ip_ftable);
	rn = ipst->ips_ip_ftable->rnh_addaddr(&rt->rt_dst, &rmask,
	    ipst->ips_ip_ftable, (struct radix_node *)rt);
	if (rn == NULL) {
		/*
		 * The add failed, so our freshly built entry is not in the
		 * tree. Tear down the lock initialized above before the
		 * memory holding it is freed, then fall back to looking up
		 * the entry that is already there.
		 */
		RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
		rw_destroy(&irb->irb_lock);
		Free(rt, rt_entry_cache);
		rt = NULL;
		irb = NULL;
		RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
		rn = ipst->ips_ip_ftable->rnh_lookup(&rdst, &rmask,
		    ipst->ips_ip_ftable);
		if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
			/* found a non-root match */
			rt = (struct rt_entry *)rn;
		}
	}
	if (rt != NULL) {
		/* Hold the bucket while the tree lock still protects it. */
		irb = &rt->rt_irb;
		irb_refhold(irb);
	}
	RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
	return (irb);
}

/*
 * This function is used when the caller wants to know the outbound
 * interface for a packet given only the address.
 * If this is an offlink IP address and there are multiple
 * routes to this destination, this routine will utilise the
 * first route it finds to IP address
 * Return values:
 *	0	- FAILURE (no netstack for the zone, no usable route, or no
 *		  nexthop ill)
 *	nonzero	- ifindex
 */
uint_t
ifindex_lookup(const struct sockaddr *ipaddr, zoneid_t zoneid)
{
	uint_t ifindex = 0;
	ire_t *ire;
	ill_t *ill;
	netstack_t *ns;
	ip_stack_t *ipst;

	if (zoneid == ALL_ZONES)
		ns = netstack_find_by_zoneid(GLOBAL_ZONEID);
	else
		ns = netstack_find_by_zoneid(zoneid);
	/*
	 * Without a netstack there is nothing to look up (and nothing to
	 * release); report failure instead of dereferencing NULL on
	 * non-DEBUG kernels.
	 */
	if (ns == NULL)
		return (ifindex);

	/*
	 * For exclusive stacks we set the zoneid to zero
	 * since IP uses the global zoneid in the exclusive stacks.
	 */
	if (ns->netstack_stackid != GLOBAL_NETSTACKID)
		zoneid = GLOBAL_ZONEID;
	ipst = ns->netstack_ip;

	ASSERT(ipaddr->sa_family == AF_INET || ipaddr->sa_family == AF_INET6);

	if ((ire = route_to_dst(ipaddr, zoneid, ipst)) != NULL) {
		ill = ire_nexthop_ill(ire);
		if (ill != NULL) {
			ifindex = ill->ill_phyint->phyint_ifindex;
			ill_refrele(ill);
		}
		ire_refrele(ire);
	}
	netstack_rele(ns);
	return (ifindex);
}

/*
 * Routine to find the route to a destination, matching on the destination
 * address only (MATCH_IRE_DSTONLY) for either AF_INET or AF_INET6.
 * Returns NULL if the resulting route is RTF_REJECT or RTF_BLACKHOLE;
 * otherwise returns the ire, which the caller must ire_refrele().
 */
static ire_t *
route_to_dst(const struct sockaddr *dst_addr, zoneid_t zoneid, ip_stack_t *ipst)
{
	ire_t *ire = NULL;
	int match_flags;

	match_flags = MATCH_IRE_DSTONLY;

	/* XXX pass NULL tsl for now */

	if (dst_addr->sa_family == AF_INET) {
		ire = ire_route_recursive_v4(
		    ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr, 0, NULL,
		    zoneid, NULL, match_flags, B_TRUE, 0, ipst, NULL, NULL,
		    NULL);
	} else {
		ire = ire_route_recursive_v6(
		    &((struct sockaddr_in6 *)dst_addr)->sin6_addr, 0, NULL,
		    zoneid, NULL, match_flags, B_TRUE, 0, ipst, NULL, NULL,
		    NULL);
	}
	ASSERT(ire != NULL);
	if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
		ire_refrele(ire);
		return (NULL);
	}
	return (ire);
}

/*
 * This routine is called by IP Filter to send a packet out on the wire
 * to a specified destination (which may be onlink or offlink). The ifindex may
 * or may not be 0. A non-null ifindex indicates IP Filter has stipulated
 * an outgoing interface and requires the nexthop to be on that interface.
 * IP WILL NOT DO the following to the data packet before sending it out:
 *	a. manipulate ttl
 *	b. ipsec work
 *	c. fragmentation
 *
 * If the packet has been prepared for hardware checksum then it will be
 * passed off to ip_send_align_cksum() to check that the flags set on the
 * packet are in alignment with the capabilities of the new outgoing NIC.
 *
 * Return values:
 *	0:		IP was able to send off the data pkt
 *	ECOMM:		Could not send packet
 *	ENONET		No route to dst. It is up to the caller
 *			to send icmp unreachable error message,
 *	EINPROGRESS	The macaddr of the onlink dst or that
 *			of the offlink dst's nexthop needs to get
 *			resolved before packet can be sent to dst.
 *			Thus transmission is not guaranteed.
 *			Note: No longer have visibility to the ARP queue
 *			hence no EINPROGRESS.
 */
int
ipfil_sendpkt(const struct sockaddr *dst_addr, mblk_t *mp, uint_t ifindex,
    zoneid_t zoneid)
{
	ipaddr_t nexthop;
	netstack_t *ns;
	ip_stack_t *ipst;
	ip_xmit_attr_t ixas;
	int error;

	ASSERT(mp != NULL);

	if (zoneid == ALL_ZONES)
		ns = netstack_find_by_zoneid(GLOBAL_ZONEID);
	else
		ns = netstack_find_by_zoneid(zoneid);
	/*
	 * NOTE(review): a NULL netstack is only caught by this ASSERT, so
	 * a non-DEBUG kernel would dereference NULL below. Handling it
	 * here requires knowing who owns mp on that path - confirm with
	 * the caller before adding an early return.
	 */
	ASSERT(ns != NULL);

	/*
	 * For exclusive stacks we set the zoneid to zero
	 * since IP uses the global zoneid in the exclusive stacks.
	 */
	if (ns->netstack_stackid != GLOBAL_NETSTACKID)
		zoneid = GLOBAL_ZONEID;
	ipst = ns->netstack_ip;

	ASSERT(dst_addr->sa_family == AF_INET ||
	    dst_addr->sa_family == AF_INET6);

	bzero(&ixas, sizeof (ixas));
	/*
	 * No IPsec, no fragmentation, and don't let any hooks see
	 * the packet.
	 */
	ixas.ixa_flags = IXAF_NO_IPSEC | IXAF_DONTFRAG | IXAF_NO_PFHOOK;
	ixas.ixa_cred = kcred;
	ixas.ixa_cpid = NOPID;
	ixas.ixa_tsl = NULL;
	ixas.ixa_ipst = ipst;
	ixas.ixa_ifindex = ifindex;

	/*
	 * Only force a nexthop when the requested destination differs from
	 * the destination already in the IP header.
	 */
	if (dst_addr->sa_family == AF_INET) {
		ipha_t *ipha = (ipha_t *)mp->b_rptr;

		ixas.ixa_flags |= IXAF_IS_IPV4;
		nexthop = ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr;
		if (nexthop != ipha->ipha_dst) {
			ixas.ixa_flags |= IXAF_NEXTHOP_SET;
			ixas.ixa_nexthop_v4 = nexthop;
		}
		ixas.ixa_multicast_ttl = ipha->ipha_ttl;
	} else {
		ip6_t *ip6h = (ip6_t *)mp->b_rptr;
		in6_addr_t *nexthop6;

		nexthop6 = &((struct sockaddr_in6 *)dst_addr)->sin6_addr;
		if (!IN6_ARE_ADDR_EQUAL(nexthop6, &ip6h->ip6_dst)) {
			ixas.ixa_flags |= IXAF_NEXTHOP_SET;
			ixas.ixa_nexthop_v6 = *nexthop6;
		}
		ixas.ixa_multicast_ttl = ip6h->ip6_hops;
	}
	error = ip_output_simple(mp, &ixas);
	ixa_cleanup(&ixas);

	netstack_rele(ns);

	/* Collapse IP's errno values into the set documented above. */
	switch (error) {
	case 0:
		break;

	case EHOSTUNREACH:
	case ENETUNREACH:
		error = ENONET;
		break;

	default:
		error = ECOMM;
		break;
	}
	return (error);
}

/*
 * callback function provided by ire_ftable_lookup when calling
 * rn_match_args(). Invoke ire_match_args on each matching leaf node in
 * the radix tree.
 *
 * Returns B_TRUE and stores a held ire in margs->ift_best_ire for the first
 * non-condemned ire in the leaf's bucket that satisfies the match arguments;
 * returns B_FALSE if the bucket is empty or nothing matches.
 */
static boolean_t
ire_find_best_route(struct radix_node *rn, void *arg)
{
	struct rt_entry *rt = (struct rt_entry *)rn;
	irb_t *irb_ptr;
	ire_t *ire;
	ire_ftable_args_t *margs = arg;
	ipaddr_t match_mask;

	ASSERT(rt != NULL);

	irb_ptr = &rt->rt_irb;

	if (irb_ptr->irb_ire_cnt == 0)
		return (B_FALSE);

	rw_enter(&irb_ptr->irb_lock, RW_READER);
	for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) {
		if (IRE_IS_CONDEMNED(ire))
			continue;
		/*
		 * Use the caller's mask only when a mask match was
		 * requested; otherwise match against the ire's own mask.
		 */
		if (margs->ift_flags & (MATCH_IRE_MASK|MATCH_IRE_SHORTERMASK))
			match_mask = margs->ift_mask;
		else
			match_mask = ire->ire_mask;

		if (ire_match_args(ire, margs->ift_addr, match_mask,
		    margs->ift_gateway, margs->ift_type, margs->ift_ill,
		    margs->ift_zoneid, margs->ift_tsl,
		    margs->ift_flags)) {
			ire_refhold(ire);
			rw_exit(&irb_ptr->irb_lock);
			margs->ift_best_ire = ire;
			return (B_TRUE);
		}
	}
	rw_exit(&irb_ptr->irb_lock);
	return (B_FALSE);
}

/*
 * ftable irb_t structures are dynamically allocated, and we need to
 * check if the irb_t (and associated ftable tree attachment) needs to
 * be cleaned up when the irb_refcnt goes to 0. The conditions that need
 * be verified are:
 * - no other walkers of the irebucket, i.e., quiescent irb_refcnt,
 * - no other threads holding references to ire's in the bucket,
 *   i.e., irb_nire == 0
 * - no active ire's in the bucket, i.e., irb_ire_cnt == 0
 * - need to hold the global tree lock and irb_lock in write mode.
 */
void
irb_refrele_ftable(irb_t *irb)
{
	for (;;) {
		rw_enter(&irb->irb_lock, RW_WRITER);
		ASSERT(irb->irb_refcnt != 0);
		if (irb->irb_refcnt != 1) {
			/*
			 * Someone has a reference to this radix node
			 * or there is some bucket walker.
			 */
			irb->irb_refcnt--;
			rw_exit(&irb->irb_lock);
			return;
		} else {
			/*
			 * There is no other walker, nor is there any
			 * other thread that holds a direct ref to this
			 * radix node. Do the clean up if needed.
Call 723c793af95Ssangeeta * to ire_unlink will clear the IRB_MARK_CONDEMNED flag 724c793af95Ssangeeta */ 725c793af95Ssangeeta if (irb->irb_marks & IRB_MARK_CONDEMNED) { 726c793af95Ssangeeta ire_t *ire_list; 727c793af95Ssangeeta 728c793af95Ssangeeta ire_list = ire_unlink(irb); 729c793af95Ssangeeta rw_exit(&irb->irb_lock); 730c793af95Ssangeeta 731c793af95Ssangeeta if (ire_list != NULL) 732c793af95Ssangeeta ire_cleanup(ire_list); 733c793af95Ssangeeta /* 734c793af95Ssangeeta * more CONDEMNED entries could have 735c793af95Ssangeeta * been added while we dropped the lock, 736c793af95Ssangeeta * so we have to re-check. 737c793af95Ssangeeta */ 738c793af95Ssangeeta continue; 739c793af95Ssangeeta } 740c793af95Ssangeeta 741c793af95Ssangeeta /* 742c793af95Ssangeeta * Now check if there are still any ires 743c793af95Ssangeeta * associated with this radix node. 744c793af95Ssangeeta */ 745c793af95Ssangeeta if (irb->irb_nire != 0) { 746c793af95Ssangeeta /* 747c793af95Ssangeeta * someone is still holding on 748c793af95Ssangeeta * to ires in this bucket 749c793af95Ssangeeta */ 750c793af95Ssangeeta irb->irb_refcnt--; 751c793af95Ssangeeta rw_exit(&irb->irb_lock); 752c793af95Ssangeeta return; 753c793af95Ssangeeta } else { 754c793af95Ssangeeta /* 755c793af95Ssangeeta * Everything is clear. Zero walkers, 756c793af95Ssangeeta * Zero threads with a ref to this 757c793af95Ssangeeta * radix node, Zero ires associated with 758c793af95Ssangeeta * this radix node. Due to lock order, 759c793af95Ssangeeta * check the above conditions again 760c793af95Ssangeeta * after grabbing all locks in the right order 761c793af95Ssangeeta */ 762c793af95Ssangeeta rw_exit(&irb->irb_lock); 763c793af95Ssangeeta if (irb_inactive(irb)) 764c793af95Ssangeeta return; 765c793af95Ssangeeta /* 766c793af95Ssangeeta * irb_inactive could not free the irb. 767c793af95Ssangeeta * See if there are any walkers, if not 768c793af95Ssangeeta * try to clean up again. 
769c793af95Ssangeeta */ 770c793af95Ssangeeta } 771c793af95Ssangeeta } 772c793af95Ssangeeta } 773c793af95Ssangeeta } 774c793af95Ssangeeta 775c793af95Ssangeeta /* 776*bd670b35SErik Nordmark * IRE iterator used by ire_ftable_lookup to process multiple equal 777*bd670b35SErik Nordmark * routes. Given a starting point in the hash list (hash), walk the IREs 778*bd670b35SErik Nordmark * in the bucket skipping deleted entries. We treat the bucket as a circular 779*bd670b35SErik Nordmark * list for the purposes of walking it. 780*bd670b35SErik Nordmark * Returns the IRE (held) that corresponds to the hash value. If that IRE is 781*bd670b35SErik Nordmark * not applicable (ire_match_args failed) then it returns a subsequent one. 782*bd670b35SErik Nordmark * If we fail to find an IRE we return NULL. 783c793af95Ssangeeta * 784*bd670b35SErik Nordmark * Assumes that the caller holds a reference on the IRE bucket and a read lock 785*bd670b35SErik Nordmark * on the radix_node_head (for IPv4) or the ip6_ire_head (for IPv6). 786*bd670b35SErik Nordmark * 787*bd670b35SErik Nordmark * Applies to IPv4 and IPv6. 788*bd670b35SErik Nordmark * 789*bd670b35SErik Nordmark * For CGTP, where an IRE_BROADCAST and IRE_HOST can exist for the same 790*bd670b35SErik Nordmark * address and bucket, we compare against ire_type for the orig_ire. We also 791*bd670b35SErik Nordmark * have IRE_BROADCASTs with and without RTF_MULTIRT, with the former being 792*bd670b35SErik Nordmark * first in the bucket. Thus we compare that ire_flags match the orig_ire. 793*bd670b35SErik Nordmark * 794*bd670b35SErik Nordmark * Due to shared-IP zones we check that an IRE_OFFLINK has a gateway that is 795*bd670b35SErik Nordmark * reachable from the zone i.e., that the ire_gateway_addr is in a subnet 796*bd670b35SErik Nordmark * in which the zone has an IP address. We check this for the global zone 797*bd670b35SErik Nordmark * even if no shared-IP zones are configured. 
798c793af95Ssangeeta */ 799c793af95Ssangeeta ire_t * 800*bd670b35SErik Nordmark ire_round_robin(irb_t *irb_ptr, ire_ftable_args_t *margs, uint_t hash, 801*bd670b35SErik Nordmark ire_t *orig_ire, ip_stack_t *ipst) 802c793af95Ssangeeta { 803c793af95Ssangeeta ire_t *ire, *maybe_ire = NULL; 804*bd670b35SErik Nordmark uint_t maybe_badcnt; 805*bd670b35SErik Nordmark uint_t maxwalk; 806*bd670b35SErik Nordmark 807*bd670b35SErik Nordmark /* Fold in more bits from the hint/hash */ 808*bd670b35SErik Nordmark hash = hash ^ (hash >> 8) ^ (hash >> 16); 809c793af95Ssangeeta 810c793af95Ssangeeta rw_enter(&irb_ptr->irb_lock, RW_WRITER); 811*bd670b35SErik Nordmark maxwalk = irb_ptr->irb_ire_cnt; /* Excludes condemned */ 812*bd670b35SErik Nordmark hash %= maxwalk; 813*bd670b35SErik Nordmark irb_refhold_locked(irb_ptr); 814c793af95Ssangeeta rw_exit(&irb_ptr->irb_lock); 815c793af95Ssangeeta 816c793af95Ssangeeta /* 817c793af95Ssangeeta * Round-robin the routers list looking for a route that 818c793af95Ssangeeta * matches the passed in parameters. 819*bd670b35SErik Nordmark * First we skip "hash" number of non-condemned IREs. 820*bd670b35SErik Nordmark * Then we match the IRE. 821*bd670b35SErik Nordmark * If we find an ire which has a non-zero ire_badcnt then we remember 822*bd670b35SErik Nordmark * it and keep on looking for a lower ire_badcnt. 823*bd670b35SErik Nordmark * If we come to the end of the list we continue (treat the 824*bd670b35SErik Nordmark * bucket list as a circular list) but we match less than "max" 825*bd670b35SErik Nordmark * entries. 
826c793af95Ssangeeta */ 827*bd670b35SErik Nordmark ire = irb_ptr->irb_ire; 828*bd670b35SErik Nordmark while (maxwalk > 0) { 829*bd670b35SErik Nordmark if (IRE_IS_CONDEMNED(ire)) 830*bd670b35SErik Nordmark goto next_ire_skip; 831c793af95Ssangeeta 832*bd670b35SErik Nordmark /* Skip the first "hash" entries to do ECMP */ 833*bd670b35SErik Nordmark if (hash != 0) { 834*bd670b35SErik Nordmark hash--; 835*bd670b35SErik Nordmark goto next_ire_skip; 836*bd670b35SErik Nordmark } 837*bd670b35SErik Nordmark 838*bd670b35SErik Nordmark /* See CGTP comment above */ 839*bd670b35SErik Nordmark if (ire->ire_type != orig_ire->ire_type || 840*bd670b35SErik Nordmark ire->ire_flags != orig_ire->ire_flags) 841c793af95Ssangeeta goto next_ire; 842c793af95Ssangeeta 843c793af95Ssangeeta /* 844*bd670b35SErik Nordmark * Note: Since IPv6 has hash buckets instead of radix 845*bd670b35SErik Nordmark * buckers we need to explicitly compare the addresses. 846*bd670b35SErik Nordmark * That makes this less efficient since we will be called 847*bd670b35SErik Nordmark * even if there is no alternatives just because the 848*bd670b35SErik Nordmark * bucket has multiple IREs for different addresses. 849c793af95Ssangeeta */ 850*bd670b35SErik Nordmark if (ire->ire_ipversion == IPV6_VERSION) { 851*bd670b35SErik Nordmark if (!IN6_ARE_ADDR_EQUAL(&orig_ire->ire_addr_v6, 852*bd670b35SErik Nordmark &ire->ire_addr_v6)) 853c793af95Ssangeeta goto next_ire; 854c793af95Ssangeeta } 855c793af95Ssangeeta 856c793af95Ssangeeta /* 857*bd670b35SErik Nordmark * For some reason find_best_route uses ire_mask. We do 858*bd670b35SErik Nordmark * the same. 859*bd670b35SErik Nordmark */ 860*bd670b35SErik Nordmark if (ire->ire_ipversion == IPV4_VERSION ? 
861*bd670b35SErik Nordmark !ire_match_args(ire, margs->ift_addr, 862*bd670b35SErik Nordmark ire->ire_mask, margs->ift_gateway, 863*bd670b35SErik Nordmark margs->ift_type, margs->ift_ill, margs->ift_zoneid, 864*bd670b35SErik Nordmark margs->ift_tsl, margs->ift_flags) : 865*bd670b35SErik Nordmark !ire_match_args_v6(ire, &margs->ift_addr_v6, 866*bd670b35SErik Nordmark &ire->ire_mask_v6, &margs->ift_gateway_v6, 867*bd670b35SErik Nordmark margs->ift_type, margs->ift_ill, margs->ift_zoneid, 868*bd670b35SErik Nordmark margs->ift_tsl, margs->ift_flags)) 869*bd670b35SErik Nordmark goto next_ire; 870*bd670b35SErik Nordmark 871*bd670b35SErik Nordmark if (margs->ift_zoneid != ALL_ZONES && 872*bd670b35SErik Nordmark (ire->ire_type & IRE_OFFLINK)) { 873*bd670b35SErik Nordmark /* 874*bd670b35SErik Nordmark * When we're in a zone, we're only 875c793af95Ssangeeta * interested in routers that are 876c793af95Ssangeeta * reachable through ipifs within our zone. 877c793af95Ssangeeta */ 878*bd670b35SErik Nordmark if (ire->ire_ipversion == IPV4_VERSION) { 879*bd670b35SErik Nordmark if (!ire_gateway_ok_zone_v4( 880*bd670b35SErik Nordmark ire->ire_gateway_addr, margs->ift_zoneid, 881*bd670b35SErik Nordmark ire->ire_ill, margs->ift_tsl, ipst, 882*bd670b35SErik Nordmark B_TRUE)) 883*bd670b35SErik Nordmark goto next_ire; 884*bd670b35SErik Nordmark } else { 885*bd670b35SErik Nordmark if (!ire_gateway_ok_zone_v6( 886*bd670b35SErik Nordmark &ire->ire_gateway_addr_v6, 887*bd670b35SErik Nordmark margs->ift_zoneid, ire->ire_ill, 888*bd670b35SErik Nordmark margs->ift_tsl, ipst, B_TRUE)) 889*bd670b35SErik Nordmark goto next_ire; 890*bd670b35SErik Nordmark } 891*bd670b35SErik Nordmark } 892*bd670b35SErik Nordmark mutex_enter(&ire->ire_lock); 893*bd670b35SErik Nordmark /* Look for stale ire_badcnt and clear */ 894*bd670b35SErik Nordmark if (ire->ire_badcnt != 0 && 895*bd670b35SErik Nordmark (TICK_TO_SEC(lbolt64) - ire->ire_last_badcnt > 896*bd670b35SErik Nordmark ipst->ips_ip_ire_badcnt_lifetime)) 
897*bd670b35SErik Nordmark ire->ire_badcnt = 0; 898*bd670b35SErik Nordmark mutex_exit(&ire->ire_lock); 899e11c3f44Smeem 900*bd670b35SErik Nordmark if (ire->ire_badcnt == 0) { 901*bd670b35SErik Nordmark /* We found one with a zero badcnt; done */ 902*bd670b35SErik Nordmark ire_refhold(ire); 903*bd670b35SErik Nordmark /* 904*bd670b35SErik Nordmark * Care needed since irb_refrele grabs WLOCK to free 905*bd670b35SErik Nordmark * the irb_t. 906*bd670b35SErik Nordmark */ 907*bd670b35SErik Nordmark if (ire->ire_ipversion == IPV4_VERSION) { 908*bd670b35SErik Nordmark RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 909*bd670b35SErik Nordmark irb_refrele(irb_ptr); 910*bd670b35SErik Nordmark RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); 911*bd670b35SErik Nordmark } else { 912*bd670b35SErik Nordmark rw_exit(&ipst->ips_ip6_ire_head_lock); 913*bd670b35SErik Nordmark irb_refrele(irb_ptr); 914*bd670b35SErik Nordmark rw_enter(&ipst->ips_ip6_ire_head_lock, 915*bd670b35SErik Nordmark RW_READER); 916*bd670b35SErik Nordmark } 917c793af95Ssangeeta return (ire); 918c793af95Ssangeeta } 919*bd670b35SErik Nordmark /* 920*bd670b35SErik Nordmark * keep looking to see if there is a better (lower 921*bd670b35SErik Nordmark * badcnt) matching IRE, but save this one as a last resort. 922*bd670b35SErik Nordmark * If we find a lower badcnt pick that one as the last* resort. 
923*bd670b35SErik Nordmark */ 924*bd670b35SErik Nordmark if (maybe_ire == NULL) { 925*bd670b35SErik Nordmark maybe_ire = ire; 926*bd670b35SErik Nordmark maybe_badcnt = ire->ire_badcnt; 927*bd670b35SErik Nordmark } else if (ire->ire_badcnt < maybe_badcnt) { 928*bd670b35SErik Nordmark maybe_ire = ire; 929*bd670b35SErik Nordmark maybe_badcnt = ire->ire_badcnt; 930*bd670b35SErik Nordmark } 931*bd670b35SErik Nordmark 932c793af95Ssangeeta next_ire: 933*bd670b35SErik Nordmark maxwalk--; 934*bd670b35SErik Nordmark next_ire_skip: 935*bd670b35SErik Nordmark ire = ire->ire_next; 936*bd670b35SErik Nordmark if (ire == NULL) 937*bd670b35SErik Nordmark ire = irb_ptr->irb_ire; 938c793af95Ssangeeta } 939c793af95Ssangeeta if (maybe_ire != NULL) 940*bd670b35SErik Nordmark ire_refhold(maybe_ire); 941*bd670b35SErik Nordmark 942*bd670b35SErik Nordmark /* Care needed since irb_refrele grabs WLOCK to free the irb_t. */ 943*bd670b35SErik Nordmark if (ire->ire_ipversion == IPV4_VERSION) { 944*bd670b35SErik Nordmark RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 945*bd670b35SErik Nordmark irb_refrele(irb_ptr); 946*bd670b35SErik Nordmark RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); 947*bd670b35SErik Nordmark } else { 948*bd670b35SErik Nordmark rw_exit(&ipst->ips_ip6_ire_head_lock); 949*bd670b35SErik Nordmark irb_refrele(irb_ptr); 950*bd670b35SErik Nordmark rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER); 951*bd670b35SErik Nordmark } 952c793af95Ssangeeta return (maybe_ire); 953c793af95Ssangeeta } 9542679e103Ssowmini 9552679e103Ssowmini void 9562679e103Ssowmini irb_refhold_rn(struct radix_node *rn) 9572679e103Ssowmini { 9582679e103Ssowmini if ((rn->rn_flags & RNF_ROOT) == 0) 959*bd670b35SErik Nordmark irb_refhold(&((rt_t *)(rn))->rt_irb); 9602679e103Ssowmini } 9612679e103Ssowmini 9622679e103Ssowmini void 9632679e103Ssowmini irb_refrele_rn(struct radix_node *rn) 9642679e103Ssowmini { 9652679e103Ssowmini if ((rn->rn_flags & RNF_ROOT) == 0) 9662679e103Ssowmini irb_refrele_ftable(&((rt_t 
*)(rn))->rt_irb); 9672679e103Ssowmini } 968*bd670b35SErik Nordmark 969*bd670b35SErik Nordmark /* 970*bd670b35SErik Nordmark * Select a route for IPv4 and IPv6. Except for multicast, loopback and reject 971*bd670b35SErik Nordmark * routes this routine sets up a ire_nce_cache as well. The caller needs to 972*bd670b35SErik Nordmark * lookup an nce for the multicast case. 973*bd670b35SErik Nordmark */ 974*bd670b35SErik Nordmark ire_t * 975*bd670b35SErik Nordmark ip_select_route(const in6_addr_t *v6dst, ip_xmit_attr_t *ixa, 976*bd670b35SErik Nordmark uint_t *generationp, in6_addr_t *setsrcp, int *errorp, boolean_t *multirtp) 977*bd670b35SErik Nordmark { 978*bd670b35SErik Nordmark uint_t match_args; 979*bd670b35SErik Nordmark uint_t ire_type; 980*bd670b35SErik Nordmark ill_t *ill; 981*bd670b35SErik Nordmark ire_t *ire; 982*bd670b35SErik Nordmark ip_stack_t *ipst = ixa->ixa_ipst; 983*bd670b35SErik Nordmark ipaddr_t v4dst; 984*bd670b35SErik Nordmark in6_addr_t v6nexthop; 985*bd670b35SErik Nordmark iaflags_t ixaflags = ixa->ixa_flags; 986*bd670b35SErik Nordmark nce_t *nce; 987*bd670b35SErik Nordmark 988*bd670b35SErik Nordmark match_args = MATCH_IRE_SECATTR; 989*bd670b35SErik Nordmark IN6_V4MAPPED_TO_IPADDR(v6dst, v4dst); 990*bd670b35SErik Nordmark if (setsrcp != NULL) 991*bd670b35SErik Nordmark ASSERT(IN6_IS_ADDR_UNSPECIFIED(setsrcp)); 992*bd670b35SErik Nordmark if (errorp != NULL) 993*bd670b35SErik Nordmark ASSERT(*errorp == 0); 994*bd670b35SErik Nordmark 995*bd670b35SErik Nordmark /* 996*bd670b35SErik Nordmark * The content of the ixa will be different if IP_NEXTHOP, 997*bd670b35SErik Nordmark * SO_DONTROUTE, IP_BOUND_IF, IP_PKTINFO etc are set 998*bd670b35SErik Nordmark */ 999*bd670b35SErik Nordmark 1000*bd670b35SErik Nordmark if ((ixaflags & IXAF_IS_IPV4) ? 
CLASSD(v4dst) : 1001*bd670b35SErik Nordmark IN6_IS_ADDR_MULTICAST(v6dst)) { 1002*bd670b35SErik Nordmark /* Pick up the IRE_MULTICAST for the ill */ 1003*bd670b35SErik Nordmark if (ixa->ixa_multicast_ifindex != 0) { 1004*bd670b35SErik Nordmark ill = ill_lookup_on_ifindex(ixa->ixa_multicast_ifindex, 1005*bd670b35SErik Nordmark !(ixaflags & IXAF_IS_IPV4), ipst); 1006*bd670b35SErik Nordmark } else if (ixaflags & IXAF_SCOPEID_SET) { 1007*bd670b35SErik Nordmark /* sin6_scope_id takes precedence over ixa_ifindex */ 1008*bd670b35SErik Nordmark ASSERT(ixa->ixa_scopeid != 0); 1009*bd670b35SErik Nordmark ill = ill_lookup_on_ifindex(ixa->ixa_scopeid, 1010*bd670b35SErik Nordmark !(ixaflags & IXAF_IS_IPV4), ipst); 1011*bd670b35SErik Nordmark } else if (ixa->ixa_ifindex != 0) { 1012*bd670b35SErik Nordmark /* 1013*bd670b35SErik Nordmark * In the ipmp case, the ixa_ifindex is set to 1014*bd670b35SErik Nordmark * point at an under_ill and we would return the 1015*bd670b35SErik Nordmark * ire_multicast() corresponding to that under_ill. 
1016*bd670b35SErik Nordmark */ 1017*bd670b35SErik Nordmark ill = ill_lookup_on_ifindex(ixa->ixa_ifindex, 1018*bd670b35SErik Nordmark !(ixaflags & IXAF_IS_IPV4), ipst); 1019*bd670b35SErik Nordmark } else if (ixaflags & IXAF_IS_IPV4) { 1020*bd670b35SErik Nordmark ipaddr_t v4setsrc = INADDR_ANY; 1021*bd670b35SErik Nordmark 1022*bd670b35SErik Nordmark ill = ill_lookup_group_v4(v4dst, ixa->ixa_zoneid, ipst, 1023*bd670b35SErik Nordmark multirtp, &v4setsrc); 1024*bd670b35SErik Nordmark if (setsrcp != NULL) 1025*bd670b35SErik Nordmark IN6_IPADDR_TO_V4MAPPED(v4setsrc, setsrcp); 1026*bd670b35SErik Nordmark } else { 1027*bd670b35SErik Nordmark ill = ill_lookup_group_v6(v6dst, ixa->ixa_zoneid, ipst, 1028*bd670b35SErik Nordmark multirtp, setsrcp); 1029*bd670b35SErik Nordmark } 1030*bd670b35SErik Nordmark if (ill != NULL && IS_VNI(ill)) { 1031*bd670b35SErik Nordmark ill_refrele(ill); 1032*bd670b35SErik Nordmark ill = NULL; 1033*bd670b35SErik Nordmark } 1034*bd670b35SErik Nordmark if (ill == NULL) { 1035*bd670b35SErik Nordmark if (errorp != NULL) 1036*bd670b35SErik Nordmark *errorp = ENXIO; 1037*bd670b35SErik Nordmark /* Get a hold on the IRE_NOROUTE */ 1038*bd670b35SErik Nordmark ire = ire_reject(ipst, !(ixaflags & IXAF_IS_IPV4)); 1039*bd670b35SErik Nordmark return (ire); 1040*bd670b35SErik Nordmark } 1041*bd670b35SErik Nordmark if (!(ill->ill_flags & ILLF_MULTICAST)) { 1042*bd670b35SErik Nordmark ill_refrele(ill); 1043*bd670b35SErik Nordmark if (errorp != NULL) 1044*bd670b35SErik Nordmark *errorp = EHOSTUNREACH; 1045*bd670b35SErik Nordmark /* Get a hold on the IRE_NOROUTE */ 1046*bd670b35SErik Nordmark ire = ire_reject(ipst, !(ixaflags & IXAF_IS_IPV4)); 1047*bd670b35SErik Nordmark return (ire); 1048*bd670b35SErik Nordmark } 1049*bd670b35SErik Nordmark /* Get a refcnt on the single IRE_MULTICAST per ill */ 1050*bd670b35SErik Nordmark ire = ire_multicast(ill); 1051*bd670b35SErik Nordmark ill_refrele(ill); 1052*bd670b35SErik Nordmark if (generationp != NULL) 1053*bd670b35SErik 
Nordmark *generationp = ire->ire_generation; 1054*bd670b35SErik Nordmark if (errorp != NULL && 1055*bd670b35SErik Nordmark (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) { 1056*bd670b35SErik Nordmark *errorp = EHOSTUNREACH; 1057*bd670b35SErik Nordmark } 1058*bd670b35SErik Nordmark return (ire); 1059*bd670b35SErik Nordmark } 1060*bd670b35SErik Nordmark 1061*bd670b35SErik Nordmark if (ixa->ixa_ifindex != 0 || (ixaflags & IXAF_SCOPEID_SET)) { 1062*bd670b35SErik Nordmark if (ixaflags & IXAF_SCOPEID_SET) { 1063*bd670b35SErik Nordmark /* sin6_scope_id takes precedence over ixa_ifindex */ 1064*bd670b35SErik Nordmark ASSERT(ixa->ixa_scopeid != 0); 1065*bd670b35SErik Nordmark ill = ill_lookup_on_ifindex(ixa->ixa_scopeid, 1066*bd670b35SErik Nordmark !(ixaflags & IXAF_IS_IPV4), ipst); 1067*bd670b35SErik Nordmark } else { 1068*bd670b35SErik Nordmark ASSERT(ixa->ixa_ifindex != 0); 1069*bd670b35SErik Nordmark ill = ill_lookup_on_ifindex(ixa->ixa_ifindex, 1070*bd670b35SErik Nordmark !(ixaflags & IXAF_IS_IPV4), ipst); 1071*bd670b35SErik Nordmark } 1072*bd670b35SErik Nordmark if (ill != NULL && IS_VNI(ill)) { 1073*bd670b35SErik Nordmark ill_refrele(ill); 1074*bd670b35SErik Nordmark ill = NULL; 1075*bd670b35SErik Nordmark } 1076*bd670b35SErik Nordmark if (ill == NULL) { 1077*bd670b35SErik Nordmark if (errorp != NULL) 1078*bd670b35SErik Nordmark *errorp = ENXIO; 1079*bd670b35SErik Nordmark /* Get a hold on the IRE_NOROUTE */ 1080*bd670b35SErik Nordmark ire = ire_reject(ipst, !(ixaflags & IXAF_IS_IPV4)); 1081*bd670b35SErik Nordmark return (ire); 1082*bd670b35SErik Nordmark } 1083*bd670b35SErik Nordmark /* 1084*bd670b35SErik Nordmark * icmp_send_reply_v6 uses scopeid, and mpathd sets IP*_BOUND_IF 1085*bd670b35SErik Nordmark * so for both of them we need to be able look for an under 1086*bd670b35SErik Nordmark * interface. 
1087*bd670b35SErik Nordmark */ 1088*bd670b35SErik Nordmark if (IS_UNDER_IPMP(ill)) 1089*bd670b35SErik Nordmark match_args |= MATCH_IRE_TESTHIDDEN; 1090*bd670b35SErik Nordmark } else { 1091*bd670b35SErik Nordmark ill = NULL; 1092*bd670b35SErik Nordmark } 1093*bd670b35SErik Nordmark 1094*bd670b35SErik Nordmark if (ixaflags & IXAF_NEXTHOP_SET) { 1095*bd670b35SErik Nordmark /* IP_NEXTHOP was set */ 1096*bd670b35SErik Nordmark v6nexthop = ixa->ixa_nexthop_v6; 1097*bd670b35SErik Nordmark } else { 1098*bd670b35SErik Nordmark v6nexthop = *v6dst; 1099*bd670b35SErik Nordmark } 1100*bd670b35SErik Nordmark 1101*bd670b35SErik Nordmark ire_type = 0; 1102*bd670b35SErik Nordmark /* If ill is null then ire_route_recursive will set MATCH_IRE_ILL */ 1103*bd670b35SErik Nordmark 1104*bd670b35SErik Nordmark /* 1105*bd670b35SErik Nordmark * If SO_DONTROUTE is set or if IP_NEXTHOP is set, then 1106*bd670b35SErik Nordmark * we only look for an onlink IRE. 1107*bd670b35SErik Nordmark */ 1108*bd670b35SErik Nordmark if (ixaflags & (IXAF_DONTROUTE|IXAF_NEXTHOP_SET)) { 1109*bd670b35SErik Nordmark match_args |= MATCH_IRE_TYPE; 1110*bd670b35SErik Nordmark ire_type = IRE_ONLINK; 1111*bd670b35SErik Nordmark } 1112*bd670b35SErik Nordmark 1113*bd670b35SErik Nordmark if (ixaflags & IXAF_IS_IPV4) { 1114*bd670b35SErik Nordmark ipaddr_t v4nexthop; 1115*bd670b35SErik Nordmark ipaddr_t v4setsrc = INADDR_ANY; 1116*bd670b35SErik Nordmark 1117*bd670b35SErik Nordmark IN6_V4MAPPED_TO_IPADDR(&v6nexthop, v4nexthop); 1118*bd670b35SErik Nordmark ire = ire_route_recursive_v4(v4nexthop, ire_type, ill, 1119*bd670b35SErik Nordmark ixa->ixa_zoneid, ixa->ixa_tsl, match_args, B_TRUE, 1120*bd670b35SErik Nordmark ixa->ixa_xmit_hint, ipst, &v4setsrc, NULL, generationp); 1121*bd670b35SErik Nordmark if (setsrcp != NULL) 1122*bd670b35SErik Nordmark IN6_IPADDR_TO_V4MAPPED(v4setsrc, setsrcp); 1123*bd670b35SErik Nordmark } else { 1124*bd670b35SErik Nordmark ire = ire_route_recursive_v6(&v6nexthop, ire_type, ill, 1125*bd670b35SErik 
Nordmark ixa->ixa_zoneid, ixa->ixa_tsl, match_args, B_TRUE, 1126*bd670b35SErik Nordmark ixa->ixa_xmit_hint, ipst, setsrcp, NULL, generationp); 1127*bd670b35SErik Nordmark } 1128*bd670b35SErik Nordmark 1129*bd670b35SErik Nordmark #ifdef DEBUG 1130*bd670b35SErik Nordmark if (match_args & MATCH_IRE_TESTHIDDEN) { 1131*bd670b35SErik Nordmark ip3dbg(("looking for hidden; dst %x ire %p\n", 1132*bd670b35SErik Nordmark v4dst, (void *)ire)); 1133*bd670b35SErik Nordmark } 1134*bd670b35SErik Nordmark #endif 1135*bd670b35SErik Nordmark 1136*bd670b35SErik Nordmark if (ill != NULL) 1137*bd670b35SErik Nordmark ill_refrele(ill); 1138*bd670b35SErik Nordmark 1139*bd670b35SErik Nordmark if ((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) || 1140*bd670b35SErik Nordmark (ire->ire_type & IRE_MULTICAST)) { 1141*bd670b35SErik Nordmark /* No ire_nce_cache */ 1142*bd670b35SErik Nordmark return (ire); 1143*bd670b35SErik Nordmark } 1144*bd670b35SErik Nordmark 1145*bd670b35SErik Nordmark /* Setup ire_nce_cache if it doesn't exist or is condemned. */ 1146*bd670b35SErik Nordmark mutex_enter(&ire->ire_lock); 1147*bd670b35SErik Nordmark nce = ire->ire_nce_cache; 1148*bd670b35SErik Nordmark if (nce == NULL || nce->nce_is_condemned) { 1149*bd670b35SErik Nordmark mutex_exit(&ire->ire_lock); 1150*bd670b35SErik Nordmark (void) ire_revalidate_nce(ire); 1151*bd670b35SErik Nordmark } else { 1152*bd670b35SErik Nordmark mutex_exit(&ire->ire_lock); 1153*bd670b35SErik Nordmark } 1154*bd670b35SErik Nordmark return (ire); 1155*bd670b35SErik Nordmark } 1156*bd670b35SErik Nordmark 1157*bd670b35SErik Nordmark /* 1158*bd670b35SErik Nordmark * Find a route given some xmit attributes and a packet. 1159*bd670b35SErik Nordmark * Generic for IPv4 and IPv6 1160*bd670b35SErik Nordmark * 1161*bd670b35SErik Nordmark * This never returns NULL. But when it returns the IRE_NOROUTE 1162*bd670b35SErik Nordmark * it might set errorp. 
1163*bd670b35SErik Nordmark */ 1164*bd670b35SErik Nordmark ire_t * 1165*bd670b35SErik Nordmark ip_select_route_pkt(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp, 1166*bd670b35SErik Nordmark int *errorp, boolean_t *multirtp) 1167*bd670b35SErik Nordmark { 1168*bd670b35SErik Nordmark if (ixa->ixa_flags & IXAF_IS_IPV4) { 1169*bd670b35SErik Nordmark ipha_t *ipha = (ipha_t *)mp->b_rptr; 1170*bd670b35SErik Nordmark in6_addr_t v6dst; 1171*bd670b35SErik Nordmark 1172*bd670b35SErik Nordmark IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst); 1173*bd670b35SErik Nordmark 1174*bd670b35SErik Nordmark return (ip_select_route(&v6dst, ixa, generationp, 1175*bd670b35SErik Nordmark NULL, errorp, multirtp)); 1176*bd670b35SErik Nordmark } else { 1177*bd670b35SErik Nordmark ip6_t *ip6h = (ip6_t *)mp->b_rptr; 1178*bd670b35SErik Nordmark 1179*bd670b35SErik Nordmark return (ip_select_route(&ip6h->ip6_dst, ixa, generationp, 1180*bd670b35SErik Nordmark NULL, errorp, multirtp)); 1181*bd670b35SErik Nordmark } 1182*bd670b35SErik Nordmark } 1183*bd670b35SErik Nordmark 1184*bd670b35SErik Nordmark ire_t * 1185*bd670b35SErik Nordmark ip_select_route_v4(ipaddr_t dst, ip_xmit_attr_t *ixa, uint_t *generationp, 1186*bd670b35SErik Nordmark ipaddr_t *v4setsrcp, int *errorp, boolean_t *multirtp) 1187*bd670b35SErik Nordmark { 1188*bd670b35SErik Nordmark in6_addr_t v6dst; 1189*bd670b35SErik Nordmark ire_t *ire; 1190*bd670b35SErik Nordmark in6_addr_t setsrc; 1191*bd670b35SErik Nordmark 1192*bd670b35SErik Nordmark ASSERT(ixa->ixa_flags & IXAF_IS_IPV4); 1193*bd670b35SErik Nordmark 1194*bd670b35SErik Nordmark IN6_IPADDR_TO_V4MAPPED(dst, &v6dst); 1195*bd670b35SErik Nordmark 1196*bd670b35SErik Nordmark setsrc = ipv6_all_zeros; 1197*bd670b35SErik Nordmark ire = ip_select_route(&v6dst, ixa, generationp, &setsrc, errorp, 1198*bd670b35SErik Nordmark multirtp); 1199*bd670b35SErik Nordmark if (v4setsrcp != NULL) 1200*bd670b35SErik Nordmark IN6_V4MAPPED_TO_IPADDR(&setsrc, *v4setsrcp); 1201*bd670b35SErik Nordmark 
return (ire); 1202*bd670b35SErik Nordmark } 1203*bd670b35SErik Nordmark 1204*bd670b35SErik Nordmark /* 1205*bd670b35SErik Nordmark * Recursively look for a route to the destination. Can also match on 1206*bd670b35SErik Nordmark * the zoneid, ill, and label. Used for the data paths. See also 1207*bd670b35SErik Nordmark * ire_route_recursive. 1208*bd670b35SErik Nordmark * 1209*bd670b35SErik Nordmark * If ill is set this means we will match it by adding MATCH_IRE_ILL. 1210*bd670b35SErik Nordmark * 1211*bd670b35SErik Nordmark * Note that this function never returns NULL. It returns an IRE_NOROUTE 1212*bd670b35SErik Nordmark * instead. 1213*bd670b35SErik Nordmark * 1214*bd670b35SErik Nordmark * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it 1215*bd670b35SErik Nordmark * is an error. 1216*bd670b35SErik Nordmark * Allow at most one RTF_INDIRECT. 1217*bd670b35SErik Nordmark */ 1218*bd670b35SErik Nordmark ire_t * 1219*bd670b35SErik Nordmark ire_route_recursive_impl_v4(ire_t *ire, 1220*bd670b35SErik Nordmark ipaddr_t nexthop, uint_t ire_type, const ill_t *ill_arg, 1221*bd670b35SErik Nordmark zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args, 1222*bd670b35SErik Nordmark boolean_t allocate, uint32_t xmit_hint, ip_stack_t *ipst, ipaddr_t *setsrcp, 1223*bd670b35SErik Nordmark tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp) 1224*bd670b35SErik Nordmark { 1225*bd670b35SErik Nordmark int i, j; 1226*bd670b35SErik Nordmark ire_t *ires[MAX_IRE_RECURSION]; 1227*bd670b35SErik Nordmark uint_t generation; 1228*bd670b35SErik Nordmark uint_t generations[MAX_IRE_RECURSION]; 1229*bd670b35SErik Nordmark boolean_t need_refrele = B_FALSE; 1230*bd670b35SErik Nordmark boolean_t invalidate = B_FALSE; 1231*bd670b35SErik Nordmark int prefs[MAX_IRE_RECURSION]; 1232*bd670b35SErik Nordmark ill_t *ill = NULL; 1233*bd670b35SErik Nordmark 1234*bd670b35SErik Nordmark if (setsrcp != NULL) 1235*bd670b35SErik Nordmark ASSERT(*setsrcp == INADDR_ANY); 1236*bd670b35SErik Nordmark 
if (gwattrp != NULL) 1237*bd670b35SErik Nordmark ASSERT(*gwattrp == NULL); 1238*bd670b35SErik Nordmark 1239*bd670b35SErik Nordmark if (ill_arg != NULL) 1240*bd670b35SErik Nordmark match_args |= MATCH_IRE_ILL; 1241*bd670b35SErik Nordmark 1242*bd670b35SErik Nordmark /* 1243*bd670b35SErik Nordmark * We iterate up to three times to resolve a route, even though 1244*bd670b35SErik Nordmark * we have four slots in the array. The extra slot is for an 1245*bd670b35SErik Nordmark * IRE_IF_CLONE we might need to create. 1246*bd670b35SErik Nordmark */ 1247*bd670b35SErik Nordmark i = 0; 1248*bd670b35SErik Nordmark while (i < MAX_IRE_RECURSION - 1) { 1249*bd670b35SErik Nordmark /* ire_ftable_lookup handles round-robin/ECMP */ 1250*bd670b35SErik Nordmark if (ire == NULL) { 1251*bd670b35SErik Nordmark ire = ire_ftable_lookup_v4(nexthop, 0, 0, ire_type, 1252*bd670b35SErik Nordmark (ill_arg != NULL ? ill_arg : ill), zoneid, tsl, 1253*bd670b35SErik Nordmark match_args, xmit_hint, ipst, &generation); 1254*bd670b35SErik Nordmark } else { 1255*bd670b35SErik Nordmark /* Caller passed it; extra hold since we will rele */ 1256*bd670b35SErik Nordmark ire_refhold(ire); 1257*bd670b35SErik Nordmark if (generationp != NULL) 1258*bd670b35SErik Nordmark generation = *generationp; 1259*bd670b35SErik Nordmark else 1260*bd670b35SErik Nordmark generation = IRE_GENERATION_VERIFY; 1261*bd670b35SErik Nordmark } 1262*bd670b35SErik Nordmark if (ire == NULL) 1263*bd670b35SErik Nordmark ire = ire_reject(ipst, B_FALSE); 1264*bd670b35SErik Nordmark 1265*bd670b35SErik Nordmark /* Need to return the ire with RTF_REJECT|BLACKHOLE */ 1266*bd670b35SErik Nordmark if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) 1267*bd670b35SErik Nordmark goto error; 1268*bd670b35SErik Nordmark 1269*bd670b35SErik Nordmark ASSERT(!(ire->ire_type & IRE_MULTICAST)); /* Not in ftable */ 1270*bd670b35SErik Nordmark 1271*bd670b35SErik Nordmark prefs[i] = ire_pref(ire); 1272*bd670b35SErik Nordmark if (i != 0) { 1273*bd670b35SErik Nordmark 
/* 1274*bd670b35SErik Nordmark * Don't allow anything unusual past the first 1275*bd670b35SErik Nordmark * iteration. 1276*bd670b35SErik Nordmark */ 1277*bd670b35SErik Nordmark if ((ire->ire_type & 1278*bd670b35SErik Nordmark (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST)) || 1279*bd670b35SErik Nordmark prefs[i] <= prefs[i-1]) { 1280*bd670b35SErik Nordmark ire_refrele(ire); 1281*bd670b35SErik Nordmark ire = ire_reject(ipst, B_FALSE); 1282*bd670b35SErik Nordmark goto error; 1283*bd670b35SErik Nordmark } 1284*bd670b35SErik Nordmark } 1285*bd670b35SErik Nordmark /* We have a usable IRE */ 1286*bd670b35SErik Nordmark ires[i] = ire; 1287*bd670b35SErik Nordmark generations[i] = generation; 1288*bd670b35SErik Nordmark i++; 1289*bd670b35SErik Nordmark 1290*bd670b35SErik Nordmark /* The first RTF_SETSRC address is passed back if setsrcp */ 1291*bd670b35SErik Nordmark if ((ire->ire_flags & RTF_SETSRC) && 1292*bd670b35SErik Nordmark setsrcp != NULL && *setsrcp == INADDR_ANY) { 1293*bd670b35SErik Nordmark ASSERT(ire->ire_setsrc_addr != INADDR_ANY); 1294*bd670b35SErik Nordmark *setsrcp = ire->ire_setsrc_addr; 1295*bd670b35SErik Nordmark } 1296*bd670b35SErik Nordmark 1297*bd670b35SErik Nordmark /* The first ire_gw_secattr is passed back if gwattrp */ 1298*bd670b35SErik Nordmark if (ire->ire_gw_secattr != NULL && 1299*bd670b35SErik Nordmark gwattrp != NULL && *gwattrp == NULL) 1300*bd670b35SErik Nordmark *gwattrp = ire->ire_gw_secattr; 1301*bd670b35SErik Nordmark 1302*bd670b35SErik Nordmark /* 1303*bd670b35SErik Nordmark * Check if we have a short-cut pointer to an IRE for this 1304*bd670b35SErik Nordmark * destination, and that the cached dependency isn't stale. 1305*bd670b35SErik Nordmark * In that case we've rejoined an existing tree towards a 1306*bd670b35SErik Nordmark * parent, thus we don't need to continue the loop to 1307*bd670b35SErik Nordmark * discover the rest of the tree. 
1308*bd670b35SErik Nordmark */ 1309*bd670b35SErik Nordmark mutex_enter(&ire->ire_lock); 1310*bd670b35SErik Nordmark if (ire->ire_dep_parent != NULL && 1311*bd670b35SErik Nordmark ire->ire_dep_parent->ire_generation == 1312*bd670b35SErik Nordmark ire->ire_dep_parent_generation) { 1313*bd670b35SErik Nordmark mutex_exit(&ire->ire_lock); 1314*bd670b35SErik Nordmark ire = NULL; 1315*bd670b35SErik Nordmark goto done; 1316*bd670b35SErik Nordmark } 1317*bd670b35SErik Nordmark mutex_exit(&ire->ire_lock); 1318*bd670b35SErik Nordmark 1319*bd670b35SErik Nordmark /* 1320*bd670b35SErik Nordmark * If this type should have an ire_nce_cache (even if it 1321*bd670b35SErik Nordmark * doesn't yet have one) then we are done. Includes 1322*bd670b35SErik Nordmark * IRE_INTERFACE with a full 32 bit mask. 1323*bd670b35SErik Nordmark */ 1324*bd670b35SErik Nordmark if (ire->ire_nce_capable) { 1325*bd670b35SErik Nordmark ire = NULL; 1326*bd670b35SErik Nordmark goto done; 1327*bd670b35SErik Nordmark } 1328*bd670b35SErik Nordmark ASSERT(!(ire->ire_type & IRE_IF_CLONE)); 1329*bd670b35SErik Nordmark /* 1330*bd670b35SErik Nordmark * For an IRE_INTERFACE we create an IRE_IF_CLONE for this 1331*bd670b35SErik Nordmark * particular destination 1332*bd670b35SErik Nordmark */ 1333*bd670b35SErik Nordmark if (ire->ire_type & IRE_INTERFACE) { 1334*bd670b35SErik Nordmark in6_addr_t v6nexthop; 1335*bd670b35SErik Nordmark ire_t *clone; 1336*bd670b35SErik Nordmark 1337*bd670b35SErik Nordmark ASSERT(ire->ire_masklen != IPV4_ABITS); 1338*bd670b35SErik Nordmark 1339*bd670b35SErik Nordmark /* 1340*bd670b35SErik Nordmark * In the case of ip_input and ILLF_FORWARDING not 1341*bd670b35SErik Nordmark * being set, and in the case of RTM_GET, 1342*bd670b35SErik Nordmark * there is no point in allocating 1343*bd670b35SErik Nordmark * an IRE_IF_CLONE. We return the IRE_INTERFACE. 
1344*bd670b35SErik Nordmark * Note that !allocate can result in a ire_dep_parent 1345*bd670b35SErik Nordmark * which is IRE_IF_* without an IRE_IF_CLONE. 1346*bd670b35SErik Nordmark * We recover from that when we need to send packets 1347*bd670b35SErik Nordmark * by ensuring that the generations become 1348*bd670b35SErik Nordmark * IRE_GENERATION_VERIFY in this case. 1349*bd670b35SErik Nordmark */ 1350*bd670b35SErik Nordmark if (!allocate) { 1351*bd670b35SErik Nordmark invalidate = B_TRUE; 1352*bd670b35SErik Nordmark ire = NULL; 1353*bd670b35SErik Nordmark goto done; 1354*bd670b35SErik Nordmark } 1355*bd670b35SErik Nordmark 1356*bd670b35SErik Nordmark IN6_IPADDR_TO_V4MAPPED(nexthop, &v6nexthop); 1357*bd670b35SErik Nordmark 1358*bd670b35SErik Nordmark clone = ire_create_if_clone(ire, &v6nexthop, 1359*bd670b35SErik Nordmark &generation); 1360*bd670b35SErik Nordmark if (clone == NULL) { 1361*bd670b35SErik Nordmark /* 1362*bd670b35SErik Nordmark * Temporary failure - no memory. 1363*bd670b35SErik Nordmark * Don't want caller to cache IRE_NOROUTE. 1364*bd670b35SErik Nordmark */ 1365*bd670b35SErik Nordmark invalidate = B_TRUE; 1366*bd670b35SErik Nordmark ire = ire_blackhole(ipst, B_FALSE); 1367*bd670b35SErik Nordmark goto error; 1368*bd670b35SErik Nordmark } 1369*bd670b35SErik Nordmark /* 1370*bd670b35SErik Nordmark * Make clone next to last entry and the 1371*bd670b35SErik Nordmark * IRE_INTERFACE the last in the dependency 1372*bd670b35SErik Nordmark * chain since the clone depends on the 1373*bd670b35SErik Nordmark * IRE_INTERFACE. 
1374*bd670b35SErik Nordmark */ 1375*bd670b35SErik Nordmark ASSERT(i >= 1); 1376*bd670b35SErik Nordmark ASSERT(i < MAX_IRE_RECURSION); 1377*bd670b35SErik Nordmark 1378*bd670b35SErik Nordmark ires[i] = ires[i-1]; 1379*bd670b35SErik Nordmark generations[i] = generations[i-1]; 1380*bd670b35SErik Nordmark ires[i-1] = clone; 1381*bd670b35SErik Nordmark generations[i-1] = generation; 1382*bd670b35SErik Nordmark i++; 1383*bd670b35SErik Nordmark 1384*bd670b35SErik Nordmark ire = NULL; 1385*bd670b35SErik Nordmark goto done; 1386*bd670b35SErik Nordmark } 1387*bd670b35SErik Nordmark 1388*bd670b35SErik Nordmark /* 1389*bd670b35SErik Nordmark * We only match on the type and optionally ILL when 1390*bd670b35SErik Nordmark * recursing. The type match is used by some callers 1391*bd670b35SErik Nordmark * to exclude certain types (such as IRE_IF_CLONE or 1392*bd670b35SErik Nordmark * IRE_LOCAL|IRE_LOOPBACK). 1393*bd670b35SErik Nordmark */ 1394*bd670b35SErik Nordmark match_args &= MATCH_IRE_TYPE; 1395*bd670b35SErik Nordmark nexthop = ire->ire_gateway_addr; 1396*bd670b35SErik Nordmark if (ill == NULL && ire->ire_ill != NULL) { 1397*bd670b35SErik Nordmark ill = ire->ire_ill; 1398*bd670b35SErik Nordmark need_refrele = B_TRUE; 1399*bd670b35SErik Nordmark ill_refhold(ill); 1400*bd670b35SErik Nordmark match_args |= MATCH_IRE_ILL; 1401*bd670b35SErik Nordmark } 1402*bd670b35SErik Nordmark ire = NULL; 1403*bd670b35SErik Nordmark } 1404*bd670b35SErik Nordmark ASSERT(ire == NULL); 1405*bd670b35SErik Nordmark ire = ire_reject(ipst, B_FALSE); 1406*bd670b35SErik Nordmark 1407*bd670b35SErik Nordmark error: 1408*bd670b35SErik Nordmark ASSERT(ire != NULL); 1409*bd670b35SErik Nordmark if (need_refrele) 1410*bd670b35SErik Nordmark ill_refrele(ill); 1411*bd670b35SErik Nordmark 1412*bd670b35SErik Nordmark /* 1413*bd670b35SErik Nordmark * In the case of MULTIRT we want to try a different IRE the next 1414*bd670b35SErik Nordmark * time. We let the next packet retry in that case. 
1415*bd670b35SErik Nordmark */ 1416*bd670b35SErik Nordmark if (i > 0 && (ires[0]->ire_flags & RTF_MULTIRT)) 1417*bd670b35SErik Nordmark (void) ire_no_good(ires[0]); 1418*bd670b35SErik Nordmark 1419*bd670b35SErik Nordmark cleanup: 1420*bd670b35SErik Nordmark /* cleanup ires[i] */ 1421*bd670b35SErik Nordmark ire_dep_unbuild(ires, i); 1422*bd670b35SErik Nordmark for (j = 0; j < i; j++) 1423*bd670b35SErik Nordmark ire_refrele(ires[j]); 1424*bd670b35SErik Nordmark 1425*bd670b35SErik Nordmark ASSERT(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)); 1426*bd670b35SErik Nordmark /* 1427*bd670b35SErik Nordmark * Use IRE_GENERATION_VERIFY to ensure that ip_output will redo the 1428*bd670b35SErik Nordmark * ip_select_route since the reject or lack of memory might be gone. 1429*bd670b35SErik Nordmark */ 1430*bd670b35SErik Nordmark if (generationp != NULL) 1431*bd670b35SErik Nordmark *generationp = IRE_GENERATION_VERIFY; 1432*bd670b35SErik Nordmark return (ire); 1433*bd670b35SErik Nordmark 1434*bd670b35SErik Nordmark done: 1435*bd670b35SErik Nordmark ASSERT(ire == NULL); 1436*bd670b35SErik Nordmark if (need_refrele) { 1437*bd670b35SErik Nordmark ill_refrele(ill); 1438*bd670b35SErik Nordmark ill = NULL; 1439*bd670b35SErik Nordmark } 1440*bd670b35SErik Nordmark 1441*bd670b35SErik Nordmark /* Build dependencies */ 1442*bd670b35SErik Nordmark if (!ire_dep_build(ires, generations, i)) { 1443*bd670b35SErik Nordmark /* Something in chain was condemned; tear it apart */ 1444*bd670b35SErik Nordmark ire = ire_reject(ipst, B_FALSE); 1445*bd670b35SErik Nordmark goto cleanup; 1446*bd670b35SErik Nordmark } 1447*bd670b35SErik Nordmark 1448*bd670b35SErik Nordmark /* 1449*bd670b35SErik Nordmark * Release all refholds except the one for ires[0] that we 1450*bd670b35SErik Nordmark * will return to the caller. 
1451*bd670b35SErik Nordmark */ 1452*bd670b35SErik Nordmark for (j = 1; j < i; j++) 1453*bd670b35SErik Nordmark ire_refrele(ires[j]); 1454*bd670b35SErik Nordmark 1455*bd670b35SErik Nordmark if (invalidate) { 1456*bd670b35SErik Nordmark /* 1457*bd670b35SErik Nordmark * Since we needed to allocate but couldn't we need to make 1458*bd670b35SErik Nordmark * sure that the dependency chain is rebuilt the next time. 1459*bd670b35SErik Nordmark */ 1460*bd670b35SErik Nordmark ire_dep_invalidate_generations(ires[0]); 1461*bd670b35SErik Nordmark generation = IRE_GENERATION_VERIFY; 1462*bd670b35SErik Nordmark } else { 1463*bd670b35SErik Nordmark /* 1464*bd670b35SErik Nordmark * IREs can have been added or deleted while we did the 1465*bd670b35SErik Nordmark * recursive lookup and we can't catch those until we've built 1466*bd670b35SErik Nordmark * the dependencies. We verify the stored 1467*bd670b35SErik Nordmark * ire_dep_parent_generation to catch any such changes and 1468*bd670b35SErik Nordmark * return IRE_GENERATION_VERIFY (which will cause 1469*bd670b35SErik Nordmark * ip_select_route to be called again so we can redo the 1470*bd670b35SErik Nordmark * recursive lookup next time we send a packet. 
1471*bd670b35SErik Nordmark */ 1472*bd670b35SErik Nordmark generation = ire_dep_validate_generations(ires[0]); 1473*bd670b35SErik Nordmark if (generations[0] != ires[0]->ire_generation) { 1474*bd670b35SErik Nordmark /* Something changed at the top */ 1475*bd670b35SErik Nordmark generation = IRE_GENERATION_VERIFY; 1476*bd670b35SErik Nordmark } 1477*bd670b35SErik Nordmark } 1478*bd670b35SErik Nordmark if (generationp != NULL) 1479*bd670b35SErik Nordmark *generationp = generation; 1480*bd670b35SErik Nordmark 1481*bd670b35SErik Nordmark return (ires[0]); 1482*bd670b35SErik Nordmark } 1483*bd670b35SErik Nordmark 1484*bd670b35SErik Nordmark ire_t * 1485*bd670b35SErik Nordmark ire_route_recursive_v4(ipaddr_t nexthop, uint_t ire_type, const ill_t *ill, 1486*bd670b35SErik Nordmark zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args, 1487*bd670b35SErik Nordmark boolean_t allocate, uint32_t xmit_hint, ip_stack_t *ipst, ipaddr_t *setsrcp, 1488*bd670b35SErik Nordmark tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp) 1489*bd670b35SErik Nordmark { 1490*bd670b35SErik Nordmark return (ire_route_recursive_impl_v4(NULL, nexthop, ire_type, ill, 1491*bd670b35SErik Nordmark zoneid, tsl, match_args, allocate, xmit_hint, ipst, setsrcp, 1492*bd670b35SErik Nordmark gwattrp, generationp)); 1493*bd670b35SErik Nordmark } 1494*bd670b35SErik Nordmark 1495*bd670b35SErik Nordmark /* 1496*bd670b35SErik Nordmark * Recursively look for a route to the destination. 1497*bd670b35SErik Nordmark * We only handle a destination match here, yet we have the same arguments 1498*bd670b35SErik Nordmark * as the full match to allow function pointers to select between the two. 1499*bd670b35SErik Nordmark * 1500*bd670b35SErik Nordmark * Note that this function never returns NULL. It returns an IRE_NOROUTE 1501*bd670b35SErik Nordmark * instead. 
1502*bd670b35SErik Nordmark * 1503*bd670b35SErik Nordmark * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it 1504*bd670b35SErik Nordmark * is an error. 1505*bd670b35SErik Nordmark * Allow at most one RTF_INDIRECT. 1506*bd670b35SErik Nordmark */ 1507*bd670b35SErik Nordmark ire_t * 1508*bd670b35SErik Nordmark ire_route_recursive_dstonly_v4(ipaddr_t nexthop, boolean_t allocate, 1509*bd670b35SErik Nordmark uint32_t xmit_hint, ip_stack_t *ipst) 1510*bd670b35SErik Nordmark { 1511*bd670b35SErik Nordmark ire_t *ire; 1512*bd670b35SErik Nordmark ire_t *ire1; 1513*bd670b35SErik Nordmark uint_t generation; 1514*bd670b35SErik Nordmark 1515*bd670b35SErik Nordmark /* ire_ftable_lookup handles round-robin/ECMP */ 1516*bd670b35SErik Nordmark ire = ire_ftable_lookup_simple_v4(nexthop, xmit_hint, ipst, 1517*bd670b35SErik Nordmark &generation); 1518*bd670b35SErik Nordmark ASSERT(ire != NULL); 1519*bd670b35SErik Nordmark 1520*bd670b35SErik Nordmark /* 1521*bd670b35SErik Nordmark * If this type should have an ire_nce_cache (even if it 1522*bd670b35SErik Nordmark * doesn't yet have one) then we are done. Includes 1523*bd670b35SErik Nordmark * IRE_INTERFACE with a full 32 bit mask. 1524*bd670b35SErik Nordmark */ 1525*bd670b35SErik Nordmark if (ire->ire_nce_capable) 1526*bd670b35SErik Nordmark return (ire); 1527*bd670b35SErik Nordmark 1528*bd670b35SErik Nordmark /* 1529*bd670b35SErik Nordmark * If the IRE has a current cached parent we know that the whole 1530*bd670b35SErik Nordmark * parent chain is current, hence we don't need to discover and 1531*bd670b35SErik Nordmark * build any dependencies by doing a recursive lookup. 
1532*bd670b35SErik Nordmark */ 1533*bd670b35SErik Nordmark mutex_enter(&ire->ire_lock); 1534*bd670b35SErik Nordmark if (ire->ire_dep_parent != NULL && 1535*bd670b35SErik Nordmark ire->ire_dep_parent->ire_generation == 1536*bd670b35SErik Nordmark ire->ire_dep_parent_generation) { 1537*bd670b35SErik Nordmark mutex_exit(&ire->ire_lock); 1538*bd670b35SErik Nordmark return (ire); 1539*bd670b35SErik Nordmark } 1540*bd670b35SErik Nordmark mutex_exit(&ire->ire_lock); 1541*bd670b35SErik Nordmark 1542*bd670b35SErik Nordmark /* 1543*bd670b35SErik Nordmark * Fallback to loop in the normal code starting with the ire 1544*bd670b35SErik Nordmark * we found. Normally this would return the same ire. 1545*bd670b35SErik Nordmark */ 1546*bd670b35SErik Nordmark ire1 = ire_route_recursive_impl_v4(ire, nexthop, 0, NULL, ALL_ZONES, 1547*bd670b35SErik Nordmark NULL, MATCH_IRE_DSTONLY, allocate, xmit_hint, ipst, NULL, NULL, 1548*bd670b35SErik Nordmark &generation); 1549*bd670b35SErik Nordmark ire_refrele(ire); 1550*bd670b35SErik Nordmark return (ire1); 1551*bd670b35SErik Nordmark } 1552