/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
24c793af95Ssangeeta */ 25c793af95Ssangeeta 26c793af95Ssangeeta /* 27c793af95Ssangeeta * This file contains consumer routines of the IPv4 forwarding engine 28c793af95Ssangeeta */ 29c793af95Ssangeeta 30c793af95Ssangeeta #include <sys/types.h> 31c793af95Ssangeeta #include <sys/stream.h> 32c793af95Ssangeeta #include <sys/stropts.h> 33c793af95Ssangeeta #include <sys/strlog.h> 34c793af95Ssangeeta #include <sys/dlpi.h> 35c793af95Ssangeeta #include <sys/ddi.h> 36c793af95Ssangeeta #include <sys/cmn_err.h> 37c793af95Ssangeeta #include <sys/policy.h> 38c793af95Ssangeeta 39c793af95Ssangeeta #include <sys/systm.h> 40c793af95Ssangeeta #include <sys/strsun.h> 41c793af95Ssangeeta #include <sys/kmem.h> 42c793af95Ssangeeta #include <sys/param.h> 43c793af95Ssangeeta #include <sys/socket.h> 44edd26dc5Sdr146992 #include <sys/strsubr.h> 45c793af95Ssangeeta #include <net/if.h> 46c793af95Ssangeeta #include <net/route.h> 47c793af95Ssangeeta #include <netinet/in.h> 48c793af95Ssangeeta #include <net/if_dl.h> 49c793af95Ssangeeta #include <netinet/ip6.h> 50c793af95Ssangeeta #include <netinet/icmp6.h> 51c793af95Ssangeeta 52bd670b35SErik Nordmark #include <inet/ipsec_impl.h> 53c793af95Ssangeeta #include <inet/common.h> 54c793af95Ssangeeta #include <inet/mi.h> 55c793af95Ssangeeta #include <inet/mib2.h> 56c793af95Ssangeeta #include <inet/ip.h> 57edd26dc5Sdr146992 #include <inet/ip_impl.h> 58c793af95Ssangeeta #include <inet/ip6.h> 59c793af95Ssangeeta #include <inet/ip_ndp.h> 60c793af95Ssangeeta #include <inet/arp.h> 61c793af95Ssangeeta #include <inet/ip_if.h> 62c793af95Ssangeeta #include <inet/ip_ire.h> 63c793af95Ssangeeta #include <inet/ip_ftable.h> 64c793af95Ssangeeta #include <inet/ip_rts.h> 65c793af95Ssangeeta #include <inet/nd.h> 66c793af95Ssangeeta 67c793af95Ssangeeta #include <net/pfkeyv2.h> 68c793af95Ssangeeta #include <inet/sadb.h> 69c793af95Ssangeeta #include <inet/tcp.h> 70c793af95Ssangeeta #include <inet/ipclassifier.h> 71c793af95Ssangeeta #include <sys/zone.h> 72c793af95Ssangeeta 
#include <net/radix.h>
#include <sys/tsol/label.h>
#include <sys/tsol/tnet.h>

/*
 * True for an IRE_DEFAULT, or for an IRE_INTERFACE whose destination
 * address is 0.
 */
#define	IS_DEFAULT_ROUTE(ire)	\
	(((ire)->ire_type & IRE_DEFAULT) || \
	    (((ire)->ire_type & IRE_INTERFACE) && ((ire)->ire_addr == 0)))

static ire_t	*route_to_dst(const struct sockaddr *, zoneid_t, ip_stack_t *);
static void	ire_del_host_redir(ire_t *, char *);
static boolean_t ire_find_best_route(struct radix_node *, void *);

/*
 * Lookup a route in forwarding table. A specific lookup is indicated by
 * passing the required parameters and indicating the match required in the
 * flag field.
 *
 * Supports IP_BOUND_IF by following the ipif/ill when recursing.
 *
 * Arguments:
 *	addr, mask, gateway, type, ill, zoneid, tsl, flags
 *			copied into an ire_ftable_args_t and handed to
 *			ire_match_args() through ire_find_best_route().
 *	xmit_hint	passed to ire_round_robin() when ECMP applies.
 *	ipst		IP stack whose ips_ip_ftable is searched.
 *	generationp	if non-NULL, receives the ire_generation of the
 *			selected ire (ire_alt_local() may update it again).
 *
 * Returns NULL when MATCH_IRE_ILL is set without an ill, or when the radix
 * lookup finds nothing.  Otherwise returns an ire on which the match
 * callback has taken a reference (ire_refhold); the caller owns that
 * reference.
 */
ire_t *
ire_ftable_lookup_v4(ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway,
    int type, const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl,
    int flags, uint32_t xmit_hint, ip_stack_t *ipst, uint_t *generationp)
{
	ire_t *ire;
	struct rt_sockaddr rdst;
	struct rt_entry *rt;
	ire_ftable_args_t margs;

	ASSERT(ill == NULL || !ill->ill_isv6);

	/*
	 * ire_match_args() will dereference ill if MATCH_IRE_ILL
	 * is set.
	 */
	if ((flags & MATCH_IRE_ILL) && (ill == NULL))
		return (NULL);

	/*
	 * Only the destination key is handed to the radix code; the mask
	 * travels in margs and is applied by ire_find_best_route().
	 */
	(void) memset(&rdst, 0, sizeof (rdst));
	rdst.rt_sin_len = sizeof (rdst);
	rdst.rt_sin_family = AF_INET;
	rdst.rt_sin_addr.s_addr = addr;

	(void) memset(&margs, 0, sizeof (margs));
	margs.ift_addr = addr;
	margs.ift_mask = mask;
	margs.ift_gateway = gateway;
	margs.ift_type = type;
	margs.ift_ill = ill;
	margs.ift_zoneid = zoneid;
	margs.ift_tsl = tsl;
	margs.ift_flags = flags;

	/*
	 * The flags argument passed to ire_ftable_lookup may cause the
	 * search to return, not the longest matching prefix, but the
	 * "best matching prefix", i.e., the longest prefix that also
	 * satisfies constraints imposed via the permutation of flags
	 * passed in. To achieve this, we invoke ire_match_args() on
	 * each matching leaf in the radix tree. ire_match_args is
	 * invoked by the callback function ire_find_best_route()
	 * We hold the global tree lock in read mode when calling
	 * rn_match_args. Before dropping the global tree lock, ensure
	 * that the radix node can't be deleted by incrementing ire_refcnt.
	 */
	RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
	rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst,
	    ipst->ips_ip_ftable, ire_find_best_route, &margs);
	ire = margs.ift_best_ire;
	if (rt == NULL) {
		RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
		return (NULL);
	}
	ASSERT(ire != NULL);

	DTRACE_PROBE2(ire__found, ire_ftable_args_t *, &margs, ire_t *, ire);

	/*
	 * round-robin only if we have more than one route in the bucket.
	 * ips_ip_ecmp_behavior controls when we do ECMP
	 *	2:	always
	 *	1:	for IRE_DEFAULT and /0 IRE_INTERFACE
	 *	0:	never
	 */
	if (ire->ire_bucket->irb_ire_cnt > 1 && !(flags & MATCH_IRE_GW)) {
		if (ipst->ips_ip_ecmp_behavior == 2 ||
		    (ipst->ips_ip_ecmp_behavior == 1 &&
		    IS_DEFAULT_ROUTE(ire))) {
			ire_t	*next_ire;

			margs.ift_best_ire = NULL;
			next_ire = ire_round_robin(ire->ire_bucket, &margs,
			    xmit_hint, ire, ipst);
			/* keep ire if next_ire is null */
			if (next_ire != NULL) {
				ire_refrele(ire);
				ire = next_ire;
			}
		}
	}

	/* Return generation before dropping lock */
	if (generationp != NULL)
		*generationp = ire->ire_generation;

	RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);

	/*
	 * For shared-IP zones we need additional checks to what was
	 * done in ire_match_args to make sure IRE_LOCALs are handled.
	 *
	 * When ip_restrict_interzone_loopback is set, then
	 * we ensure that IRE_LOCAL are only used for loopback
	 * between zones when the logical "Ethernet" would
	 * have looped them back. That is, if in the absence of
	 * the IRE_LOCAL we would have sent the packet out the
	 * same ill.
	 */
	if ((ire->ire_type & IRE_LOCAL) && zoneid != ALL_ZONES &&
	    ire->ire_zoneid != zoneid && ire->ire_zoneid != ALL_ZONES &&
	    ipst->ips_ip_restrict_interzone_loopback) {
		ire = ire_alt_local(ire, zoneid, tsl, ill, generationp);
		ASSERT(ire != NULL);
	}
	return (ire);
}

/*
 * This function is called by
 * ip_input/ire_route_recursive when doing a route lookup on only the
 * destination address.
 *
 * The optimizations of this function over ire_ftable_lookup are:
 *	o removing unnecessary flag matching
 *	o doing longest prefix match instead of overloading it further
 *	  with the unnecessary "best_prefix_match"
 *
 * If no route is found we return IRE_NOROUTE.
217da14cebeSEric Cheng */ 218bd670b35SErik Nordmark ire_t * 219bd670b35SErik Nordmark ire_ftable_lookup_simple_v4(ipaddr_t addr, uint32_t xmit_hint, ip_stack_t *ipst, 220bd670b35SErik Nordmark uint_t *generationp) 221da14cebeSEric Cheng { 222bd670b35SErik Nordmark ire_t *ire; 223da14cebeSEric Cheng struct rt_sockaddr rdst; 224da14cebeSEric Cheng struct rt_entry *rt; 225bd670b35SErik Nordmark irb_t *irb; 226da14cebeSEric Cheng 227da14cebeSEric Cheng rdst.rt_sin_len = sizeof (rdst); 228da14cebeSEric Cheng rdst.rt_sin_family = AF_INET; 229da14cebeSEric Cheng rdst.rt_sin_addr.s_addr = addr; 230da14cebeSEric Cheng 231da14cebeSEric Cheng /* 232da14cebeSEric Cheng * This is basically inlining a simpler version of ire_match_args 233da14cebeSEric Cheng */ 234da14cebeSEric Cheng RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); 235da14cebeSEric Cheng 236da14cebeSEric Cheng rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst, 237da14cebeSEric Cheng ipst->ips_ip_ftable, NULL, NULL); 238da14cebeSEric Cheng 239bd670b35SErik Nordmark if (rt == NULL) 240bd670b35SErik Nordmark goto bad; 241bd670b35SErik Nordmark 242bd670b35SErik Nordmark irb = &rt->rt_irb; 243bd670b35SErik Nordmark if (irb->irb_ire_cnt == 0) 244bd670b35SErik Nordmark goto bad; 245bd670b35SErik Nordmark 246bd670b35SErik Nordmark rw_enter(&irb->irb_lock, RW_READER); 247bd670b35SErik Nordmark ire = irb->irb_ire; 248bd670b35SErik Nordmark if (ire == NULL) { 249bd670b35SErik Nordmark rw_exit(&irb->irb_lock); 250bd670b35SErik Nordmark goto bad; 251da14cebeSEric Cheng } 252bd670b35SErik Nordmark while (IRE_IS_CONDEMNED(ire)) { 253bd670b35SErik Nordmark ire = ire->ire_next; 254bd670b35SErik Nordmark if (ire == NULL) { 255bd670b35SErik Nordmark rw_exit(&irb->irb_lock); 256bd670b35SErik Nordmark goto bad; 257bd670b35SErik Nordmark } 258da14cebeSEric Cheng } 259da14cebeSEric Cheng 260da14cebeSEric Cheng /* we have a ire that matches */ 261bd670b35SErik Nordmark ire_refhold(ire); 262bd670b35SErik Nordmark 
rw_exit(&irb->irb_lock); 263bd670b35SErik Nordmark 264bd670b35SErik Nordmark /* 265bd670b35SErik Nordmark * round-robin only if we have more than one route in the bucket. 266bd670b35SErik Nordmark * ips_ip_ecmp_behavior controls when we do ECMP 267bd670b35SErik Nordmark * 2: always 268bd670b35SErik Nordmark * 1: for IRE_DEFAULT and /0 IRE_INTERFACE 269bd670b35SErik Nordmark * 0: never 270bd670b35SErik Nordmark * 271bd670b35SErik Nordmark * Note: if we found an IRE_IF_CLONE we won't look at the bucket with 272bd670b35SErik Nordmark * other ECMP IRE_INTERFACEs since the IRE_IF_CLONE is a /128 match 273bd670b35SErik Nordmark * and the IRE_INTERFACESs are likely to be shorter matches. 274bd670b35SErik Nordmark */ 275bd670b35SErik Nordmark if (ire->ire_bucket->irb_ire_cnt > 1) { 276bd670b35SErik Nordmark if (ipst->ips_ip_ecmp_behavior == 2 || 277bd670b35SErik Nordmark (ipst->ips_ip_ecmp_behavior == 1 && 278bd670b35SErik Nordmark IS_DEFAULT_ROUTE(ire))) { 279bd670b35SErik Nordmark ire_t *next_ire; 280bd670b35SErik Nordmark ire_ftable_args_t margs; 281bd670b35SErik Nordmark 282bd670b35SErik Nordmark (void) memset(&margs, 0, sizeof (margs)); 283bd670b35SErik Nordmark margs.ift_addr = addr; 284bd670b35SErik Nordmark margs.ift_zoneid = ALL_ZONES; 285bd670b35SErik Nordmark 286bd670b35SErik Nordmark next_ire = ire_round_robin(ire->ire_bucket, &margs, 287bd670b35SErik Nordmark xmit_hint, ire, ipst); 288bd670b35SErik Nordmark if (next_ire == NULL) { 289bd670b35SErik Nordmark /* keep ire if next_ire is null */ 290bd670b35SErik Nordmark if (generationp != NULL) 291bd670b35SErik Nordmark *generationp = ire->ire_generation; 292bd670b35SErik Nordmark RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 293bd670b35SErik Nordmark return (ire); 294bd670b35SErik Nordmark } 295bd670b35SErik Nordmark ire_refrele(ire); 296bd670b35SErik Nordmark ire = next_ire; 297bd670b35SErik Nordmark } 298bd670b35SErik Nordmark } 299bd670b35SErik Nordmark /* Return generation before dropping lock */ 
300bd670b35SErik Nordmark if (generationp != NULL) 301bd670b35SErik Nordmark *generationp = ire->ire_generation; 302bd670b35SErik Nordmark 303da14cebeSEric Cheng RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 304da14cebeSEric Cheng 305bd670b35SErik Nordmark /* 306bd670b35SErik Nordmark * Since we only did ALL_ZONES matches there is no special handling 307bd670b35SErik Nordmark * of IRE_LOCALs needed here. ire_ftable_lookup_v4 has to handle that. 308bd670b35SErik Nordmark */ 309da14cebeSEric Cheng return (ire); 310da14cebeSEric Cheng 311bd670b35SErik Nordmark bad: 312bd670b35SErik Nordmark if (generationp != NULL) 313bd670b35SErik Nordmark *generationp = IRE_GENERATION_VERIFY; 314da14cebeSEric Cheng 315bd670b35SErik Nordmark RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 316bd670b35SErik Nordmark return (ire_reject(ipst, B_FALSE)); 317da14cebeSEric Cheng } 318c793af95Ssangeeta 319c793af95Ssangeeta /* 320bd670b35SErik Nordmark * Find the ill matching a multicast group. 321c793af95Ssangeeta * Allows different routes for multicast addresses 322c793af95Ssangeeta * in the unicast routing table (akin to 224.0.0.0 but could be more specific) 323c793af95Ssangeeta * which point at different interfaces. This is used when IP_MULTICAST_IF 324c793af95Ssangeeta * isn't specified (when sending) and when IP_ADD_MEMBERSHIP doesn't 325c793af95Ssangeeta * specify the interface to join on. 326c793af95Ssangeeta * 327bd670b35SErik Nordmark * Supports link-local addresses by using ire_route_recursive which follows 328bd670b35SErik Nordmark * the ill when recursing. 329bd670b35SErik Nordmark * 330bd670b35SErik Nordmark * To handle CGTP, since we don't have a separate IRE_MULTICAST for each group 331bd670b35SErik Nordmark * and the MULTIRT property can be different for different groups, we 332bd670b35SErik Nordmark * extract RTF_MULTIRT from the special unicast route added for a group 333bd670b35SErik Nordmark * with CGTP and pass that back in the multirtp argument. 
334bd670b35SErik Nordmark * This is used in ip_set_destination etc to set ixa_postfragfn for multicast. 335bd670b35SErik Nordmark * We have a setsrcp argument for the same reason. 336c793af95Ssangeeta */ 337bd670b35SErik Nordmark ill_t * 338bd670b35SErik Nordmark ire_lookup_multi_ill_v4(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst, 339bd670b35SErik Nordmark boolean_t *multirtp, ipaddr_t *setsrcp) 340c793af95Ssangeeta { 341c793af95Ssangeeta ire_t *ire; 342bd670b35SErik Nordmark ill_t *ill; 343c793af95Ssangeeta 344bd670b35SErik Nordmark ire = ire_route_recursive_v4(group, 0, NULL, zoneid, NULL, 345bd670b35SErik Nordmark MATCH_IRE_DSTONLY, B_FALSE, 0, ipst, setsrcp, NULL, NULL); 346bd670b35SErik Nordmark ASSERT(ire != NULL); 347bd670b35SErik Nordmark if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 348c793af95Ssangeeta ire_refrele(ire); 349c793af95Ssangeeta return (NULL); 350c793af95Ssangeeta } 351bd670b35SErik Nordmark 352bd670b35SErik Nordmark if (multirtp != NULL) 353bd670b35SErik Nordmark *multirtp = (ire->ire_flags & RTF_MULTIRT) != 0; 354bd670b35SErik Nordmark 355bd670b35SErik Nordmark ill = ire_nexthop_ill(ire); 356bd670b35SErik Nordmark ire_refrele(ire); 357bd670b35SErik Nordmark return (ill); 358c793af95Ssangeeta } 359c793af95Ssangeeta 360c793af95Ssangeeta /* 361c793af95Ssangeeta * Delete the passed in ire if the gateway addr matches 362c793af95Ssangeeta */ 363c793af95Ssangeeta void 364c793af95Ssangeeta ire_del_host_redir(ire_t *ire, char *gateway) 365c793af95Ssangeeta { 3666bdb8e66Sdd193516 if ((ire->ire_flags & RTF_DYNAMIC) && 367c793af95Ssangeeta (ire->ire_gateway_addr == *(ipaddr_t *)gateway)) 368c793af95Ssangeeta ire_delete(ire); 369c793af95Ssangeeta } 370c793af95Ssangeeta 371c793af95Ssangeeta /* 372bd670b35SErik Nordmark * Search for all IRE_HOST RTF_DYNAMIC (aka redirect) routes that are 373c793af95Ssangeeta * pointing at the specified gateway and 374c793af95Ssangeeta * delete them. 
This routine is called only 375c793af95Ssangeeta * when a default gateway is going away. 376c793af95Ssangeeta */ 377c793af95Ssangeeta void 378f4b3ec61Sdh155122 ire_delete_host_redirects(ipaddr_t gateway, ip_stack_t *ipst) 379c793af95Ssangeeta { 380c793af95Ssangeeta struct rtfuncarg rtfarg; 381c793af95Ssangeeta 382c793af95Ssangeeta (void) memset(&rtfarg, 0, sizeof (rtfarg)); 383c793af95Ssangeeta rtfarg.rt_func = ire_del_host_redir; 384c793af95Ssangeeta rtfarg.rt_arg = (void *)&gateway; 385f4b3ec61Sdh155122 (void) ipst->ips_ip_ftable->rnh_walktree_mt(ipst->ips_ip_ftable, 386f4b3ec61Sdh155122 rtfunc, &rtfarg, irb_refhold_rn, irb_refrele_rn); 387c793af95Ssangeeta } 388c793af95Ssangeeta 389c793af95Ssangeeta /* 390f4b3ec61Sdh155122 * Obtain the rt_entry and rt_irb for the route to be added to 391f4b3ec61Sdh155122 * the ips_ip_ftable. 392c793af95Ssangeeta * First attempt to add a node to the radix tree via rn_addroute. If the 393c793af95Ssangeeta * route already exists, return the bucket for the existing route. 394c793af95Ssangeeta * 395c793af95Ssangeeta * Locking notes: Need to hold the global radix tree lock in write mode to 396c793af95Ssangeeta * add a radix node. To prevent the node from being deleted, ire_get_bucket() 397c793af95Ssangeeta * returns with a ref'ed irb_t. The ire itself is added in ire_add_v4() 398c793af95Ssangeeta * while holding the irb_lock, but not the radix tree lock. 
399c793af95Ssangeeta */ 400c793af95Ssangeeta irb_t * 401c793af95Ssangeeta ire_get_bucket(ire_t *ire) 402c793af95Ssangeeta { 403c793af95Ssangeeta struct radix_node *rn; 404c793af95Ssangeeta struct rt_entry *rt; 405c793af95Ssangeeta struct rt_sockaddr rmask, rdst; 406c793af95Ssangeeta irb_t *irb = NULL; 407f4b3ec61Sdh155122 ip_stack_t *ipst = ire->ire_ipst; 408c793af95Ssangeeta 409f4b3ec61Sdh155122 ASSERT(ipst->ips_ip_ftable != NULL); 410c793af95Ssangeeta 411c793af95Ssangeeta /* first try to see if route exists (based on rtalloc1) */ 412c793af95Ssangeeta (void) memset(&rdst, 0, sizeof (rdst)); 413c793af95Ssangeeta rdst.rt_sin_len = sizeof (rdst); 414c793af95Ssangeeta rdst.rt_sin_family = AF_INET; 415c793af95Ssangeeta rdst.rt_sin_addr.s_addr = ire->ire_addr; 416c793af95Ssangeeta 417c793af95Ssangeeta (void) memset(&rmask, 0, sizeof (rmask)); 418c793af95Ssangeeta rmask.rt_sin_len = sizeof (rmask); 419c793af95Ssangeeta rmask.rt_sin_family = AF_INET; 420c793af95Ssangeeta rmask.rt_sin_addr.s_addr = ire->ire_mask; 421c793af95Ssangeeta 422c793af95Ssangeeta /* 423c793af95Ssangeeta * add the route. 
based on BSD's rtrequest1(RTM_ADD) 424c793af95Ssangeeta */ 425c793af95Ssangeeta R_Malloc(rt, rt_entry_cache, sizeof (*rt)); 42629bc4795Ssangeeta /* kmem_alloc failed */ 42729bc4795Ssangeeta if (rt == NULL) 42829bc4795Ssangeeta return (NULL); 42929bc4795Ssangeeta 430c793af95Ssangeeta (void) memset(rt, 0, sizeof (*rt)); 431c793af95Ssangeeta rt->rt_nodes->rn_key = (char *)&rt->rt_dst; 432c793af95Ssangeeta rt->rt_dst = rdst; 433c793af95Ssangeeta irb = &rt->rt_irb; 434bd670b35SErik Nordmark irb->irb_marks |= IRB_MARK_DYNAMIC; /* dynamically allocated/freed */ 435f4b3ec61Sdh155122 irb->irb_ipst = ipst; 436c793af95Ssangeeta rw_init(&irb->irb_lock, NULL, RW_DEFAULT, NULL); 437f4b3ec61Sdh155122 RADIX_NODE_HEAD_WLOCK(ipst->ips_ip_ftable); 438f4b3ec61Sdh155122 rn = ipst->ips_ip_ftable->rnh_addaddr(&rt->rt_dst, &rmask, 439f4b3ec61Sdh155122 ipst->ips_ip_ftable, (struct radix_node *)rt); 440c793af95Ssangeeta if (rn == NULL) { 441f4b3ec61Sdh155122 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 442c793af95Ssangeeta Free(rt, rt_entry_cache); 443c793af95Ssangeeta rt = NULL; 444c793af95Ssangeeta irb = NULL; 445f4b3ec61Sdh155122 RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); 446f4b3ec61Sdh155122 rn = ipst->ips_ip_ftable->rnh_lookup(&rdst, &rmask, 447f4b3ec61Sdh155122 ipst->ips_ip_ftable); 448f4b3ec61Sdh155122 if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { 449c793af95Ssangeeta /* found a non-root match */ 450c793af95Ssangeeta rt = (struct rt_entry *)rn; 451c793af95Ssangeeta } 452c793af95Ssangeeta } 453c793af95Ssangeeta if (rt != NULL) { 454c793af95Ssangeeta irb = &rt->rt_irb; 455bd670b35SErik Nordmark irb_refhold(irb); 456c793af95Ssangeeta } 457f4b3ec61Sdh155122 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 458c793af95Ssangeeta return (irb); 459c793af95Ssangeeta } 460c793af95Ssangeeta 461c793af95Ssangeeta /* 462c793af95Ssangeeta * This function is used when the caller wants to know the outbound 463c793af95Ssangeeta * interface for a packet given only the address. 
464c793af95Ssangeeta * If this is a offlink IP address and there are multiple 465c793af95Ssangeeta * routes to this destination, this routine will utilise the 466c793af95Ssangeeta * first route it finds to IP address 467c793af95Ssangeeta * Return values: 468c793af95Ssangeeta * 0 - FAILURE 469c793af95Ssangeeta * nonzero - ifindex 470c793af95Ssangeeta */ 471c793af95Ssangeeta uint_t 472c793af95Ssangeeta ifindex_lookup(const struct sockaddr *ipaddr, zoneid_t zoneid) 473c793af95Ssangeeta { 474c793af95Ssangeeta uint_t ifindex = 0; 475c793af95Ssangeeta ire_t *ire; 476c793af95Ssangeeta ill_t *ill; 477f4b3ec61Sdh155122 netstack_t *ns; 478f4b3ec61Sdh155122 ip_stack_t *ipst; 479c793af95Ssangeeta 480f4b3ec61Sdh155122 if (zoneid == ALL_ZONES) 481f4b3ec61Sdh155122 ns = netstack_find_by_zoneid(GLOBAL_ZONEID); 482f4b3ec61Sdh155122 else 483f4b3ec61Sdh155122 ns = netstack_find_by_zoneid(zoneid); 484f4b3ec61Sdh155122 ASSERT(ns != NULL); 485f4b3ec61Sdh155122 486f4b3ec61Sdh155122 /* 487f4b3ec61Sdh155122 * For exclusive stacks we set the zoneid to zero 488f4b3ec61Sdh155122 * since IP uses the global zoneid in the exclusive stacks. 
489f4b3ec61Sdh155122 */ 490f4b3ec61Sdh155122 if (ns->netstack_stackid != GLOBAL_NETSTACKID) 491f4b3ec61Sdh155122 zoneid = GLOBAL_ZONEID; 492f4b3ec61Sdh155122 ipst = ns->netstack_ip; 493c793af95Ssangeeta 494c793af95Ssangeeta ASSERT(ipaddr->sa_family == AF_INET || ipaddr->sa_family == AF_INET6); 495c793af95Ssangeeta 496f4b3ec61Sdh155122 if ((ire = route_to_dst(ipaddr, zoneid, ipst)) != NULL) { 497bd670b35SErik Nordmark ill = ire_nexthop_ill(ire); 498bd670b35SErik Nordmark if (ill != NULL) { 499c793af95Ssangeeta ifindex = ill->ill_phyint->phyint_ifindex; 500bd670b35SErik Nordmark ill_refrele(ill); 501bd670b35SErik Nordmark } 502c793af95Ssangeeta ire_refrele(ire); 503c793af95Ssangeeta } 504f4b3ec61Sdh155122 netstack_rele(ns); 505c793af95Ssangeeta return (ifindex); 506c793af95Ssangeeta } 507c793af95Ssangeeta 508c793af95Ssangeeta /* 509c793af95Ssangeeta * Routine to find the route to a destination. If a ifindex is supplied 510bd670b35SErik Nordmark * it tries to match the route to the corresponding ipif for the ifindex 511c793af95Ssangeeta */ 512c793af95Ssangeeta static ire_t * 513f4b3ec61Sdh155122 route_to_dst(const struct sockaddr *dst_addr, zoneid_t zoneid, ip_stack_t *ipst) 514c793af95Ssangeeta { 515c793af95Ssangeeta ire_t *ire = NULL; 516c793af95Ssangeeta int match_flags; 517c793af95Ssangeeta 518bd670b35SErik Nordmark match_flags = MATCH_IRE_DSTONLY; 519c793af95Ssangeeta 520c793af95Ssangeeta /* XXX pass NULL tsl for now */ 521c793af95Ssangeeta 522c793af95Ssangeeta if (dst_addr->sa_family == AF_INET) { 523bd670b35SErik Nordmark ire = ire_route_recursive_v4( 524bd670b35SErik Nordmark ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr, 0, NULL, 525bd670b35SErik Nordmark zoneid, NULL, match_flags, B_TRUE, 0, ipst, NULL, NULL, 526bd670b35SErik Nordmark NULL); 527c793af95Ssangeeta } else { 528bd670b35SErik Nordmark ire = ire_route_recursive_v6( 529bd670b35SErik Nordmark &((struct sockaddr_in6 *)dst_addr)->sin6_addr, 0, NULL, 530bd670b35SErik Nordmark zoneid, NULL, 
match_flags, B_TRUE, 0, ipst, NULL, NULL, 531bd670b35SErik Nordmark NULL); 532bd670b35SErik Nordmark } 533bd670b35SErik Nordmark ASSERT(ire != NULL); 534bd670b35SErik Nordmark if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 535bd670b35SErik Nordmark ire_refrele(ire); 536bd670b35SErik Nordmark return (NULL); 537c793af95Ssangeeta } 538c793af95Ssangeeta return (ire); 539c793af95Ssangeeta } 540c793af95Ssangeeta 541c793af95Ssangeeta /* 542c793af95Ssangeeta * This routine is called by IP Filter to send a packet out on the wire 543bd670b35SErik Nordmark * to a specified dstination (which may be onlink or offlink). The ifindex may 544bd670b35SErik Nordmark * or may not be 0. A non-null ifindex indicates IP Filter has stipulated 545c793af95Ssangeeta * an outgoing interface and requires the nexthop to be on that interface. 546c793af95Ssangeeta * IP WILL NOT DO the following to the data packet before sending it out: 547c793af95Ssangeeta * a. manipulate ttl 548edd26dc5Sdr146992 * b. ipsec work 549edd26dc5Sdr146992 * c. fragmentation 550edd26dc5Sdr146992 * 551edd26dc5Sdr146992 * If the packet has been prepared for hardware checksum then it will be 552edd26dc5Sdr146992 * passed off to ip_send_align_cksum() to check that the flags set on the 553edd26dc5Sdr146992 * packet are in alignment with the capabilities of the new outgoing NIC. 554c793af95Ssangeeta * 555c793af95Ssangeeta * Return values: 556c793af95Ssangeeta * 0: IP was able to send of the data pkt 557c793af95Ssangeeta * ECOMM: Could not send packet 558c793af95Ssangeeta * ENONET No route to dst. It is up to the caller 559c793af95Ssangeeta * to send icmp unreachable error message, 560c793af95Ssangeeta * EINPROGRESS The macaddr of the onlink dst or that 561c793af95Ssangeeta * of the offlink dst's nexthop needs to get 562c793af95Ssangeeta * resolved before packet can be sent to dst. 563c793af95Ssangeeta * Thus transmission is not guaranteed. 
564bd670b35SErik Nordmark * Note: No longer have visibility to the ARP queue 565bd670b35SErik Nordmark * hence no EINPROGRESS. 566c793af95Ssangeeta */ 567c793af95Ssangeeta int 568c793af95Ssangeeta ipfil_sendpkt(const struct sockaddr *dst_addr, mblk_t *mp, uint_t ifindex, 569c793af95Ssangeeta zoneid_t zoneid) 570c793af95Ssangeeta { 571bd670b35SErik Nordmark ipaddr_t nexthop; 572f4b3ec61Sdh155122 netstack_t *ns; 573f4b3ec61Sdh155122 ip_stack_t *ipst; 574bd670b35SErik Nordmark ip_xmit_attr_t ixas; 575bd670b35SErik Nordmark int error; 576c793af95Ssangeeta 577c793af95Ssangeeta ASSERT(mp != NULL); 578c793af95Ssangeeta 579f4b3ec61Sdh155122 if (zoneid == ALL_ZONES) 580f4b3ec61Sdh155122 ns = netstack_find_by_zoneid(GLOBAL_ZONEID); 581f4b3ec61Sdh155122 else 582f4b3ec61Sdh155122 ns = netstack_find_by_zoneid(zoneid); 583f4b3ec61Sdh155122 ASSERT(ns != NULL); 584f4b3ec61Sdh155122 585f4b3ec61Sdh155122 /* 586f4b3ec61Sdh155122 * For exclusive stacks we set the zoneid to zero 587f4b3ec61Sdh155122 * since IP uses the global zoneid in the exclusive stacks. 588f4b3ec61Sdh155122 */ 589f4b3ec61Sdh155122 if (ns->netstack_stackid != GLOBAL_NETSTACKID) 590f4b3ec61Sdh155122 zoneid = GLOBAL_ZONEID; 591f4b3ec61Sdh155122 ipst = ns->netstack_ip; 592f4b3ec61Sdh155122 593c793af95Ssangeeta ASSERT(dst_addr->sa_family == AF_INET || 594c793af95Ssangeeta dst_addr->sa_family == AF_INET6); 595c793af95Ssangeeta 596bd670b35SErik Nordmark bzero(&ixas, sizeof (ixas)); 597bd670b35SErik Nordmark /* 598bd670b35SErik Nordmark * No IPsec, no fragmentation, and don't let any hooks see 599bd670b35SErik Nordmark * the packet. 
600bd670b35SErik Nordmark */ 601bd670b35SErik Nordmark ixas.ixa_flags = IXAF_NO_IPSEC | IXAF_DONTFRAG | IXAF_NO_PFHOOK; 602bd670b35SErik Nordmark ixas.ixa_cred = kcred; 603bd670b35SErik Nordmark ixas.ixa_cpid = NOPID; 604bd670b35SErik Nordmark ixas.ixa_tsl = NULL; 605bd670b35SErik Nordmark ixas.ixa_ipst = ipst; 606bd670b35SErik Nordmark ixas.ixa_ifindex = ifindex; 607bd670b35SErik Nordmark 608c793af95Ssangeeta if (dst_addr->sa_family == AF_INET) { 609bd670b35SErik Nordmark ipha_t *ipha = (ipha_t *)mp->b_rptr; 610bd670b35SErik Nordmark 611bd670b35SErik Nordmark ixas.ixa_flags |= IXAF_IS_IPV4; 612bd670b35SErik Nordmark nexthop = ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr; 613bd670b35SErik Nordmark if (nexthop != ipha->ipha_dst) { 614bd670b35SErik Nordmark ixas.ixa_flags |= IXAF_NEXTHOP_SET; 615bd670b35SErik Nordmark ixas.ixa_nexthop_v4 = nexthop; 616bd670b35SErik Nordmark } 617bd670b35SErik Nordmark ixas.ixa_multicast_ttl = ipha->ipha_ttl; 618c793af95Ssangeeta } else { 619bd670b35SErik Nordmark ip6_t *ip6h = (ip6_t *)mp->b_rptr; 620bd670b35SErik Nordmark in6_addr_t *nexthop6; 621bd670b35SErik Nordmark 622bd670b35SErik Nordmark nexthop6 = &((struct sockaddr_in6 *)dst_addr)->sin6_addr; 623bd670b35SErik Nordmark if (!IN6_ARE_ADDR_EQUAL(nexthop6, &ip6h->ip6_dst)) { 624bd670b35SErik Nordmark ixas.ixa_flags |= IXAF_NEXTHOP_SET; 625bd670b35SErik Nordmark ixas.ixa_nexthop_v6 = *nexthop6; 626c793af95Ssangeeta } 627bd670b35SErik Nordmark ixas.ixa_multicast_ttl = ip6h->ip6_hops; 628c793af95Ssangeeta } 629bd670b35SErik Nordmark error = ip_output_simple(mp, &ixas); 630bd670b35SErik Nordmark ixa_cleanup(&ixas); 631c793af95Ssangeeta 632f4b3ec61Sdh155122 netstack_rele(ns); 633bd670b35SErik Nordmark switch (error) { 634bd670b35SErik Nordmark case 0: 635bd670b35SErik Nordmark break; 636bd670b35SErik Nordmark 637bd670b35SErik Nordmark case EHOSTUNREACH: 638bd670b35SErik Nordmark case ENETUNREACH: 639bd670b35SErik Nordmark error = ENONET; 640bd670b35SErik Nordmark break; 
641bd670b35SErik Nordmark 642bd670b35SErik Nordmark default: 643bd670b35SErik Nordmark error = ECOMM; 644bd670b35SErik Nordmark break; 645c793af95Ssangeeta } 646bd670b35SErik Nordmark return (error); 647edd26dc5Sdr146992 } 648edd26dc5Sdr146992 649c793af95Ssangeeta /* 650c793af95Ssangeeta * callback function provided by ire_ftable_lookup when calling 651c793af95Ssangeeta * rn_match_args(). Invoke ire_match_args on each matching leaf node in 652c793af95Ssangeeta * the radix tree. 653c793af95Ssangeeta */ 654c793af95Ssangeeta boolean_t 655c793af95Ssangeeta ire_find_best_route(struct radix_node *rn, void *arg) 656c793af95Ssangeeta { 657c793af95Ssangeeta struct rt_entry *rt = (struct rt_entry *)rn; 658c793af95Ssangeeta irb_t *irb_ptr; 659c793af95Ssangeeta ire_t *ire; 660c793af95Ssangeeta ire_ftable_args_t *margs = arg; 661c793af95Ssangeeta ipaddr_t match_mask; 662c793af95Ssangeeta 663c793af95Ssangeeta ASSERT(rt != NULL); 664c793af95Ssangeeta 665c793af95Ssangeeta irb_ptr = &rt->rt_irb; 666c793af95Ssangeeta 667c793af95Ssangeeta if (irb_ptr->irb_ire_cnt == 0) 668c793af95Ssangeeta return (B_FALSE); 669c793af95Ssangeeta 670c793af95Ssangeeta rw_enter(&irb_ptr->irb_lock, RW_READER); 671c793af95Ssangeeta for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) { 672bd670b35SErik Nordmark if (IRE_IS_CONDEMNED(ire)) 673c793af95Ssangeeta continue; 674bd670b35SErik Nordmark if (margs->ift_flags & (MATCH_IRE_MASK|MATCH_IRE_SHORTERMASK)) 675c793af95Ssangeeta match_mask = margs->ift_mask; 676c793af95Ssangeeta else 677c793af95Ssangeeta match_mask = ire->ire_mask; 678c793af95Ssangeeta 679c793af95Ssangeeta if (ire_match_args(ire, margs->ift_addr, match_mask, 680bd670b35SErik Nordmark margs->ift_gateway, margs->ift_type, margs->ift_ill, 681bd670b35SErik Nordmark margs->ift_zoneid, margs->ift_tsl, 682bd670b35SErik Nordmark margs->ift_flags)) { 683bd670b35SErik Nordmark ire_refhold(ire); 684c793af95Ssangeeta rw_exit(&irb_ptr->irb_lock); 685c793af95Ssangeeta margs->ift_best_ire = ire; 
686c793af95Ssangeeta return (B_TRUE); 687c793af95Ssangeeta } 688c793af95Ssangeeta } 689c793af95Ssangeeta rw_exit(&irb_ptr->irb_lock); 690c793af95Ssangeeta return (B_FALSE); 691c793af95Ssangeeta } 692c793af95Ssangeeta 693c793af95Ssangeeta /* 694c793af95Ssangeeta * ftable irb_t structures are dynamically allocated, and we need to 695c793af95Ssangeeta * check if the irb_t (and associated ftable tree attachment) needs to 696c793af95Ssangeeta * be cleaned up when the irb_refcnt goes to 0. The conditions that need 697c793af95Ssangeeta * be verified are: 698c793af95Ssangeeta * - no other walkers of the irebucket, i.e., quiescent irb_refcnt, 699c793af95Ssangeeta * - no other threads holding references to ire's in the bucket, 700c793af95Ssangeeta * i.e., irb_nire == 0 701c793af95Ssangeeta * - no active ire's in the bucket, i.e., irb_ire_cnt == 0 702c793af95Ssangeeta * - need to hold the global tree lock and irb_lock in write mode. 703c793af95Ssangeeta */ 704c793af95Ssangeeta void 705c793af95Ssangeeta irb_refrele_ftable(irb_t *irb) 706c793af95Ssangeeta { 707c793af95Ssangeeta for (;;) { 708c793af95Ssangeeta rw_enter(&irb->irb_lock, RW_WRITER); 709c793af95Ssangeeta ASSERT(irb->irb_refcnt != 0); 710c793af95Ssangeeta if (irb->irb_refcnt != 1) { 711c793af95Ssangeeta /* 712c793af95Ssangeeta * Someone has a reference to this radix node 713c793af95Ssangeeta * or there is some bucket walker. 714c793af95Ssangeeta */ 715c793af95Ssangeeta irb->irb_refcnt--; 716c793af95Ssangeeta rw_exit(&irb->irb_lock); 717c793af95Ssangeeta return; 718c793af95Ssangeeta } else { 719c793af95Ssangeeta /* 720c793af95Ssangeeta * There is no other walker, nor is there any 721c793af95Ssangeeta * other thread that holds a direct ref to this 722c793af95Ssangeeta * radix node. Do the clean up if needed. 
Call 723c793af95Ssangeeta * to ire_unlink will clear the IRB_MARK_CONDEMNED flag 724c793af95Ssangeeta */ 725c793af95Ssangeeta if (irb->irb_marks & IRB_MARK_CONDEMNED) { 726c793af95Ssangeeta ire_t *ire_list; 727c793af95Ssangeeta 728c793af95Ssangeeta ire_list = ire_unlink(irb); 729c793af95Ssangeeta rw_exit(&irb->irb_lock); 730c793af95Ssangeeta 731c793af95Ssangeeta if (ire_list != NULL) 732c793af95Ssangeeta ire_cleanup(ire_list); 733c793af95Ssangeeta /* 734c793af95Ssangeeta * more CONDEMNED entries could have 735c793af95Ssangeeta * been added while we dropped the lock, 736c793af95Ssangeeta * so we have to re-check. 737c793af95Ssangeeta */ 738c793af95Ssangeeta continue; 739c793af95Ssangeeta } 740c793af95Ssangeeta 741c793af95Ssangeeta /* 742c793af95Ssangeeta * Now check if there are still any ires 743c793af95Ssangeeta * associated with this radix node. 744c793af95Ssangeeta */ 745c793af95Ssangeeta if (irb->irb_nire != 0) { 746c793af95Ssangeeta /* 747c793af95Ssangeeta * someone is still holding on 748c793af95Ssangeeta * to ires in this bucket 749c793af95Ssangeeta */ 750c793af95Ssangeeta irb->irb_refcnt--; 751c793af95Ssangeeta rw_exit(&irb->irb_lock); 752c793af95Ssangeeta return; 753c793af95Ssangeeta } else { 754c793af95Ssangeeta /* 755c793af95Ssangeeta * Everything is clear. Zero walkers, 756c793af95Ssangeeta * Zero threads with a ref to this 757c793af95Ssangeeta * radix node, Zero ires associated with 758c793af95Ssangeeta * this radix node. Due to lock order, 759c793af95Ssangeeta * check the above conditions again 760c793af95Ssangeeta * after grabbing all locks in the right order 761c793af95Ssangeeta */ 762c793af95Ssangeeta rw_exit(&irb->irb_lock); 763c793af95Ssangeeta if (irb_inactive(irb)) 764c793af95Ssangeeta return; 765c793af95Ssangeeta /* 766c793af95Ssangeeta * irb_inactive could not free the irb. 767c793af95Ssangeeta * See if there are any walkers, if not 768c793af95Ssangeeta * try to clean up again. 
769c793af95Ssangeeta */ 770c793af95Ssangeeta } 771c793af95Ssangeeta } 772c793af95Ssangeeta } 773c793af95Ssangeeta } 774c793af95Ssangeeta 775c793af95Ssangeeta /* 776bd670b35SErik Nordmark * IRE iterator used by ire_ftable_lookup to process multiple equal 777bd670b35SErik Nordmark * routes. Given a starting point in the hash list (hash), walk the IREs 778bd670b35SErik Nordmark * in the bucket skipping deleted entries. We treat the bucket as a circular 779bd670b35SErik Nordmark * list for the purposes of walking it. 780bd670b35SErik Nordmark * Returns the IRE (held) that corresponds to the hash value. If that IRE is 781bd670b35SErik Nordmark * not applicable (ire_match_args failed) then it returns a subsequent one. 782bd670b35SErik Nordmark * If we fail to find an IRE we return NULL. 783c793af95Ssangeeta * 784bd670b35SErik Nordmark * Assumes that the caller holds a reference on the IRE bucket and a read lock 785bd670b35SErik Nordmark * on the radix_node_head (for IPv4) or the ip6_ire_head (for IPv6). 786bd670b35SErik Nordmark * 787bd670b35SErik Nordmark * Applies to IPv4 and IPv6. 788bd670b35SErik Nordmark * 789bd670b35SErik Nordmark * For CGTP, where an IRE_BROADCAST and IRE_HOST can exist for the same 790bd670b35SErik Nordmark * address and bucket, we compare against ire_type for the orig_ire. We also 791bd670b35SErik Nordmark * have IRE_BROADCASTs with and without RTF_MULTIRT, with the former being 792bd670b35SErik Nordmark * first in the bucket. Thus we compare that ire_flags match the orig_ire. 793bd670b35SErik Nordmark * 794bd670b35SErik Nordmark * Due to shared-IP zones we check that an IRE_OFFLINK has a gateway that is 795bd670b35SErik Nordmark * reachable from the zone i.e., that the ire_gateway_addr is in a subnet 796bd670b35SErik Nordmark * in which the zone has an IP address. We check this for the global zone 797bd670b35SErik Nordmark * even if no shared-IP zones are configured. 
798c793af95Ssangeeta */ 799c793af95Ssangeeta ire_t * 800bd670b35SErik Nordmark ire_round_robin(irb_t *irb_ptr, ire_ftable_args_t *margs, uint_t hash, 801bd670b35SErik Nordmark ire_t *orig_ire, ip_stack_t *ipst) 802c793af95Ssangeeta { 803c793af95Ssangeeta ire_t *ire, *maybe_ire = NULL; 804bd670b35SErik Nordmark uint_t maybe_badcnt; 805bd670b35SErik Nordmark uint_t maxwalk; 806bd670b35SErik Nordmark 807bd670b35SErik Nordmark /* Fold in more bits from the hint/hash */ 808bd670b35SErik Nordmark hash = hash ^ (hash >> 8) ^ (hash >> 16); 809c793af95Ssangeeta 810c793af95Ssangeeta rw_enter(&irb_ptr->irb_lock, RW_WRITER); 811bd670b35SErik Nordmark maxwalk = irb_ptr->irb_ire_cnt; /* Excludes condemned */ 812bd670b35SErik Nordmark hash %= maxwalk; 813bd670b35SErik Nordmark irb_refhold_locked(irb_ptr); 814c793af95Ssangeeta rw_exit(&irb_ptr->irb_lock); 815c793af95Ssangeeta 816c793af95Ssangeeta /* 817c793af95Ssangeeta * Round-robin the routers list looking for a route that 818c793af95Ssangeeta * matches the passed in parameters. 819bd670b35SErik Nordmark * First we skip "hash" number of non-condemned IREs. 820bd670b35SErik Nordmark * Then we match the IRE. 821bd670b35SErik Nordmark * If we find an ire which has a non-zero ire_badcnt then we remember 822bd670b35SErik Nordmark * it and keep on looking for a lower ire_badcnt. 823bd670b35SErik Nordmark * If we come to the end of the list we continue (treat the 824bd670b35SErik Nordmark * bucket list as a circular list) but we match less than "max" 825bd670b35SErik Nordmark * entries. 
826c793af95Ssangeeta */ 827bd670b35SErik Nordmark ire = irb_ptr->irb_ire; 828bd670b35SErik Nordmark while (maxwalk > 0) { 829bd670b35SErik Nordmark if (IRE_IS_CONDEMNED(ire)) 830bd670b35SErik Nordmark goto next_ire_skip; 831c793af95Ssangeeta 832bd670b35SErik Nordmark /* Skip the first "hash" entries to do ECMP */ 833bd670b35SErik Nordmark if (hash != 0) { 834bd670b35SErik Nordmark hash--; 835bd670b35SErik Nordmark goto next_ire_skip; 836bd670b35SErik Nordmark } 837bd670b35SErik Nordmark 838bd670b35SErik Nordmark /* See CGTP comment above */ 839bd670b35SErik Nordmark if (ire->ire_type != orig_ire->ire_type || 840bd670b35SErik Nordmark ire->ire_flags != orig_ire->ire_flags) 841c793af95Ssangeeta goto next_ire; 842c793af95Ssangeeta 843c793af95Ssangeeta /* 844bd670b35SErik Nordmark * Note: Since IPv6 has hash buckets instead of radix 845bd670b35SErik Nordmark * buckers we need to explicitly compare the addresses. 846bd670b35SErik Nordmark * That makes this less efficient since we will be called 847bd670b35SErik Nordmark * even if there is no alternatives just because the 848bd670b35SErik Nordmark * bucket has multiple IREs for different addresses. 849c793af95Ssangeeta */ 850bd670b35SErik Nordmark if (ire->ire_ipversion == IPV6_VERSION) { 851bd670b35SErik Nordmark if (!IN6_ARE_ADDR_EQUAL(&orig_ire->ire_addr_v6, 852bd670b35SErik Nordmark &ire->ire_addr_v6)) 853c793af95Ssangeeta goto next_ire; 854c793af95Ssangeeta } 855c793af95Ssangeeta 856c793af95Ssangeeta /* 857bd670b35SErik Nordmark * For some reason find_best_route uses ire_mask. We do 858bd670b35SErik Nordmark * the same. 859bd670b35SErik Nordmark */ 860bd670b35SErik Nordmark if (ire->ire_ipversion == IPV4_VERSION ? 
861bd670b35SErik Nordmark !ire_match_args(ire, margs->ift_addr, 862bd670b35SErik Nordmark ire->ire_mask, margs->ift_gateway, 863bd670b35SErik Nordmark margs->ift_type, margs->ift_ill, margs->ift_zoneid, 864bd670b35SErik Nordmark margs->ift_tsl, margs->ift_flags) : 865bd670b35SErik Nordmark !ire_match_args_v6(ire, &margs->ift_addr_v6, 866bd670b35SErik Nordmark &ire->ire_mask_v6, &margs->ift_gateway_v6, 867bd670b35SErik Nordmark margs->ift_type, margs->ift_ill, margs->ift_zoneid, 868bd670b35SErik Nordmark margs->ift_tsl, margs->ift_flags)) 869bd670b35SErik Nordmark goto next_ire; 870bd670b35SErik Nordmark 871bd670b35SErik Nordmark if (margs->ift_zoneid != ALL_ZONES && 872bd670b35SErik Nordmark (ire->ire_type & IRE_OFFLINK)) { 873bd670b35SErik Nordmark /* 874bd670b35SErik Nordmark * When we're in a zone, we're only 875c793af95Ssangeeta * interested in routers that are 876c793af95Ssangeeta * reachable through ipifs within our zone. 877c793af95Ssangeeta */ 878bd670b35SErik Nordmark if (ire->ire_ipversion == IPV4_VERSION) { 879bd670b35SErik Nordmark if (!ire_gateway_ok_zone_v4( 880bd670b35SErik Nordmark ire->ire_gateway_addr, margs->ift_zoneid, 881bd670b35SErik Nordmark ire->ire_ill, margs->ift_tsl, ipst, 882bd670b35SErik Nordmark B_TRUE)) 883bd670b35SErik Nordmark goto next_ire; 884bd670b35SErik Nordmark } else { 885bd670b35SErik Nordmark if (!ire_gateway_ok_zone_v6( 886bd670b35SErik Nordmark &ire->ire_gateway_addr_v6, 887bd670b35SErik Nordmark margs->ift_zoneid, ire->ire_ill, 888bd670b35SErik Nordmark margs->ift_tsl, ipst, B_TRUE)) 889bd670b35SErik Nordmark goto next_ire; 890bd670b35SErik Nordmark } 891bd670b35SErik Nordmark } 892bd670b35SErik Nordmark mutex_enter(&ire->ire_lock); 893bd670b35SErik Nordmark /* Look for stale ire_badcnt and clear */ 894bd670b35SErik Nordmark if (ire->ire_badcnt != 0 && 895*d3d50737SRafael Vanoni (TICK_TO_SEC(ddi_get_lbolt64()) - ire->ire_last_badcnt > 896bd670b35SErik Nordmark ipst->ips_ip_ire_badcnt_lifetime)) 897bd670b35SErik Nordmark 
ire->ire_badcnt = 0; 898bd670b35SErik Nordmark mutex_exit(&ire->ire_lock); 899e11c3f44Smeem 900bd670b35SErik Nordmark if (ire->ire_badcnt == 0) { 901bd670b35SErik Nordmark /* We found one with a zero badcnt; done */ 902bd670b35SErik Nordmark ire_refhold(ire); 903bd670b35SErik Nordmark /* 904bd670b35SErik Nordmark * Care needed since irb_refrele grabs WLOCK to free 905bd670b35SErik Nordmark * the irb_t. 906bd670b35SErik Nordmark */ 907bd670b35SErik Nordmark if (ire->ire_ipversion == IPV4_VERSION) { 908bd670b35SErik Nordmark RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 909bd670b35SErik Nordmark irb_refrele(irb_ptr); 910bd670b35SErik Nordmark RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); 911bd670b35SErik Nordmark } else { 912bd670b35SErik Nordmark rw_exit(&ipst->ips_ip6_ire_head_lock); 913bd670b35SErik Nordmark irb_refrele(irb_ptr); 914bd670b35SErik Nordmark rw_enter(&ipst->ips_ip6_ire_head_lock, 915bd670b35SErik Nordmark RW_READER); 916bd670b35SErik Nordmark } 917c793af95Ssangeeta return (ire); 918c793af95Ssangeeta } 919bd670b35SErik Nordmark /* 920bd670b35SErik Nordmark * keep looking to see if there is a better (lower 921bd670b35SErik Nordmark * badcnt) matching IRE, but save this one as a last resort. 922bd670b35SErik Nordmark * If we find a lower badcnt pick that one as the last* resort. 
923bd670b35SErik Nordmark */ 924bd670b35SErik Nordmark if (maybe_ire == NULL) { 925bd670b35SErik Nordmark maybe_ire = ire; 926bd670b35SErik Nordmark maybe_badcnt = ire->ire_badcnt; 927bd670b35SErik Nordmark } else if (ire->ire_badcnt < maybe_badcnt) { 928bd670b35SErik Nordmark maybe_ire = ire; 929bd670b35SErik Nordmark maybe_badcnt = ire->ire_badcnt; 930bd670b35SErik Nordmark } 931bd670b35SErik Nordmark 932c793af95Ssangeeta next_ire: 933bd670b35SErik Nordmark maxwalk--; 934bd670b35SErik Nordmark next_ire_skip: 935bd670b35SErik Nordmark ire = ire->ire_next; 936bd670b35SErik Nordmark if (ire == NULL) 937bd670b35SErik Nordmark ire = irb_ptr->irb_ire; 938c793af95Ssangeeta } 939c793af95Ssangeeta if (maybe_ire != NULL) 940bd670b35SErik Nordmark ire_refhold(maybe_ire); 941bd670b35SErik Nordmark 942bd670b35SErik Nordmark /* Care needed since irb_refrele grabs WLOCK to free the irb_t. */ 943bd670b35SErik Nordmark if (ire->ire_ipversion == IPV4_VERSION) { 944bd670b35SErik Nordmark RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 945bd670b35SErik Nordmark irb_refrele(irb_ptr); 946bd670b35SErik Nordmark RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); 947bd670b35SErik Nordmark } else { 948bd670b35SErik Nordmark rw_exit(&ipst->ips_ip6_ire_head_lock); 949bd670b35SErik Nordmark irb_refrele(irb_ptr); 950bd670b35SErik Nordmark rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER); 951bd670b35SErik Nordmark } 952c793af95Ssangeeta return (maybe_ire); 953c793af95Ssangeeta } 9542679e103Ssowmini 9552679e103Ssowmini void 9562679e103Ssowmini irb_refhold_rn(struct radix_node *rn) 9572679e103Ssowmini { 9582679e103Ssowmini if ((rn->rn_flags & RNF_ROOT) == 0) 959bd670b35SErik Nordmark irb_refhold(&((rt_t *)(rn))->rt_irb); 9602679e103Ssowmini } 9612679e103Ssowmini 9622679e103Ssowmini void 9632679e103Ssowmini irb_refrele_rn(struct radix_node *rn) 9642679e103Ssowmini { 9652679e103Ssowmini if ((rn->rn_flags & RNF_ROOT) == 0) 9662679e103Ssowmini irb_refrele_ftable(&((rt_t *)(rn))->rt_irb); 9672679e103Ssowmini 
} 968bd670b35SErik Nordmark 969bd670b35SErik Nordmark /* 970bd670b35SErik Nordmark * Select a route for IPv4 and IPv6. Except for multicast, loopback and reject 971bd670b35SErik Nordmark * routes this routine sets up a ire_nce_cache as well. The caller needs to 972bd670b35SErik Nordmark * lookup an nce for the multicast case. 973bd670b35SErik Nordmark */ 974bd670b35SErik Nordmark ire_t * 975bd670b35SErik Nordmark ip_select_route(const in6_addr_t *v6dst, ip_xmit_attr_t *ixa, 976bd670b35SErik Nordmark uint_t *generationp, in6_addr_t *setsrcp, int *errorp, boolean_t *multirtp) 977bd670b35SErik Nordmark { 978bd670b35SErik Nordmark uint_t match_args; 979bd670b35SErik Nordmark uint_t ire_type; 980bd670b35SErik Nordmark ill_t *ill; 981bd670b35SErik Nordmark ire_t *ire; 982bd670b35SErik Nordmark ip_stack_t *ipst = ixa->ixa_ipst; 983bd670b35SErik Nordmark ipaddr_t v4dst; 984bd670b35SErik Nordmark in6_addr_t v6nexthop; 985bd670b35SErik Nordmark iaflags_t ixaflags = ixa->ixa_flags; 986bd670b35SErik Nordmark nce_t *nce; 987bd670b35SErik Nordmark 988bd670b35SErik Nordmark match_args = MATCH_IRE_SECATTR; 989bd670b35SErik Nordmark IN6_V4MAPPED_TO_IPADDR(v6dst, v4dst); 990bd670b35SErik Nordmark if (setsrcp != NULL) 991bd670b35SErik Nordmark ASSERT(IN6_IS_ADDR_UNSPECIFIED(setsrcp)); 992bd670b35SErik Nordmark if (errorp != NULL) 993bd670b35SErik Nordmark ASSERT(*errorp == 0); 994bd670b35SErik Nordmark 995bd670b35SErik Nordmark /* 996bd670b35SErik Nordmark * The content of the ixa will be different if IP_NEXTHOP, 997bd670b35SErik Nordmark * SO_DONTROUTE, IP_BOUND_IF, IP_PKTINFO etc are set 998bd670b35SErik Nordmark */ 999bd670b35SErik Nordmark 1000bd670b35SErik Nordmark if ((ixaflags & IXAF_IS_IPV4) ? 
CLASSD(v4dst) : 1001bd670b35SErik Nordmark IN6_IS_ADDR_MULTICAST(v6dst)) { 1002bd670b35SErik Nordmark /* Pick up the IRE_MULTICAST for the ill */ 1003bd670b35SErik Nordmark if (ixa->ixa_multicast_ifindex != 0) { 1004bd670b35SErik Nordmark ill = ill_lookup_on_ifindex(ixa->ixa_multicast_ifindex, 1005bd670b35SErik Nordmark !(ixaflags & IXAF_IS_IPV4), ipst); 1006bd670b35SErik Nordmark } else if (ixaflags & IXAF_SCOPEID_SET) { 1007bd670b35SErik Nordmark /* sin6_scope_id takes precedence over ixa_ifindex */ 1008bd670b35SErik Nordmark ASSERT(ixa->ixa_scopeid != 0); 1009bd670b35SErik Nordmark ill = ill_lookup_on_ifindex(ixa->ixa_scopeid, 1010bd670b35SErik Nordmark !(ixaflags & IXAF_IS_IPV4), ipst); 1011bd670b35SErik Nordmark } else if (ixa->ixa_ifindex != 0) { 1012bd670b35SErik Nordmark /* 1013bd670b35SErik Nordmark * In the ipmp case, the ixa_ifindex is set to 1014bd670b35SErik Nordmark * point at an under_ill and we would return the 1015bd670b35SErik Nordmark * ire_multicast() corresponding to that under_ill. 
1016bd670b35SErik Nordmark */ 1017bd670b35SErik Nordmark ill = ill_lookup_on_ifindex(ixa->ixa_ifindex, 1018bd670b35SErik Nordmark !(ixaflags & IXAF_IS_IPV4), ipst); 1019bd670b35SErik Nordmark } else if (ixaflags & IXAF_IS_IPV4) { 1020bd670b35SErik Nordmark ipaddr_t v4setsrc = INADDR_ANY; 1021bd670b35SErik Nordmark 1022bd670b35SErik Nordmark ill = ill_lookup_group_v4(v4dst, ixa->ixa_zoneid, ipst, 1023bd670b35SErik Nordmark multirtp, &v4setsrc); 1024bd670b35SErik Nordmark if (setsrcp != NULL) 1025bd670b35SErik Nordmark IN6_IPADDR_TO_V4MAPPED(v4setsrc, setsrcp); 1026bd670b35SErik Nordmark } else { 1027bd670b35SErik Nordmark ill = ill_lookup_group_v6(v6dst, ixa->ixa_zoneid, ipst, 1028bd670b35SErik Nordmark multirtp, setsrcp); 1029bd670b35SErik Nordmark } 1030bd670b35SErik Nordmark if (ill != NULL && IS_VNI(ill)) { 1031bd670b35SErik Nordmark ill_refrele(ill); 1032bd670b35SErik Nordmark ill = NULL; 1033bd670b35SErik Nordmark } 1034bd670b35SErik Nordmark if (ill == NULL) { 1035bd670b35SErik Nordmark if (errorp != NULL) 1036bd670b35SErik Nordmark *errorp = ENXIO; 1037bd670b35SErik Nordmark /* Get a hold on the IRE_NOROUTE */ 1038bd670b35SErik Nordmark ire = ire_reject(ipst, !(ixaflags & IXAF_IS_IPV4)); 1039bd670b35SErik Nordmark return (ire); 1040bd670b35SErik Nordmark } 1041bd670b35SErik Nordmark if (!(ill->ill_flags & ILLF_MULTICAST)) { 1042bd670b35SErik Nordmark ill_refrele(ill); 1043bd670b35SErik Nordmark if (errorp != NULL) 1044bd670b35SErik Nordmark *errorp = EHOSTUNREACH; 1045bd670b35SErik Nordmark /* Get a hold on the IRE_NOROUTE */ 1046bd670b35SErik Nordmark ire = ire_reject(ipst, !(ixaflags & IXAF_IS_IPV4)); 1047bd670b35SErik Nordmark return (ire); 1048bd670b35SErik Nordmark } 1049bd670b35SErik Nordmark /* Get a refcnt on the single IRE_MULTICAST per ill */ 1050bd670b35SErik Nordmark ire = ire_multicast(ill); 1051bd670b35SErik Nordmark ill_refrele(ill); 1052bd670b35SErik Nordmark if (generationp != NULL) 1053bd670b35SErik Nordmark *generationp = 
ire->ire_generation; 1054bd670b35SErik Nordmark if (errorp != NULL && 1055bd670b35SErik Nordmark (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) { 1056bd670b35SErik Nordmark *errorp = EHOSTUNREACH; 1057bd670b35SErik Nordmark } 1058bd670b35SErik Nordmark return (ire); 1059bd670b35SErik Nordmark } 1060bd670b35SErik Nordmark 1061bd670b35SErik Nordmark if (ixa->ixa_ifindex != 0 || (ixaflags & IXAF_SCOPEID_SET)) { 1062bd670b35SErik Nordmark if (ixaflags & IXAF_SCOPEID_SET) { 1063bd670b35SErik Nordmark /* sin6_scope_id takes precedence over ixa_ifindex */ 1064bd670b35SErik Nordmark ASSERT(ixa->ixa_scopeid != 0); 1065bd670b35SErik Nordmark ill = ill_lookup_on_ifindex(ixa->ixa_scopeid, 1066bd670b35SErik Nordmark !(ixaflags & IXAF_IS_IPV4), ipst); 1067bd670b35SErik Nordmark } else { 1068bd670b35SErik Nordmark ASSERT(ixa->ixa_ifindex != 0); 1069bd670b35SErik Nordmark ill = ill_lookup_on_ifindex(ixa->ixa_ifindex, 1070bd670b35SErik Nordmark !(ixaflags & IXAF_IS_IPV4), ipst); 1071bd670b35SErik Nordmark } 1072bd670b35SErik Nordmark if (ill != NULL && IS_VNI(ill)) { 1073bd670b35SErik Nordmark ill_refrele(ill); 1074bd670b35SErik Nordmark ill = NULL; 1075bd670b35SErik Nordmark } 1076bd670b35SErik Nordmark if (ill == NULL) { 1077bd670b35SErik Nordmark if (errorp != NULL) 1078bd670b35SErik Nordmark *errorp = ENXIO; 1079bd670b35SErik Nordmark /* Get a hold on the IRE_NOROUTE */ 1080bd670b35SErik Nordmark ire = ire_reject(ipst, !(ixaflags & IXAF_IS_IPV4)); 1081bd670b35SErik Nordmark return (ire); 1082bd670b35SErik Nordmark } 1083bd670b35SErik Nordmark /* 1084bd670b35SErik Nordmark * icmp_send_reply_v6 uses scopeid, and mpathd sets IP*_BOUND_IF 1085bd670b35SErik Nordmark * so for both of them we need to be able look for an under 1086bd670b35SErik Nordmark * interface. 
1087bd670b35SErik Nordmark */ 1088bd670b35SErik Nordmark if (IS_UNDER_IPMP(ill)) 1089bd670b35SErik Nordmark match_args |= MATCH_IRE_TESTHIDDEN; 1090bd670b35SErik Nordmark } else { 1091bd670b35SErik Nordmark ill = NULL; 1092bd670b35SErik Nordmark } 1093bd670b35SErik Nordmark 1094bd670b35SErik Nordmark if (ixaflags & IXAF_NEXTHOP_SET) { 1095bd670b35SErik Nordmark /* IP_NEXTHOP was set */ 1096bd670b35SErik Nordmark v6nexthop = ixa->ixa_nexthop_v6; 1097bd670b35SErik Nordmark } else { 1098bd670b35SErik Nordmark v6nexthop = *v6dst; 1099bd670b35SErik Nordmark } 1100bd670b35SErik Nordmark 1101bd670b35SErik Nordmark ire_type = 0; 1102bd670b35SErik Nordmark /* If ill is null then ire_route_recursive will set MATCH_IRE_ILL */ 1103bd670b35SErik Nordmark 1104bd670b35SErik Nordmark /* 1105bd670b35SErik Nordmark * If SO_DONTROUTE is set or if IP_NEXTHOP is set, then 1106bd670b35SErik Nordmark * we only look for an onlink IRE. 1107bd670b35SErik Nordmark */ 1108bd670b35SErik Nordmark if (ixaflags & (IXAF_DONTROUTE|IXAF_NEXTHOP_SET)) { 1109bd670b35SErik Nordmark match_args |= MATCH_IRE_TYPE; 1110bd670b35SErik Nordmark ire_type = IRE_ONLINK; 1111bd670b35SErik Nordmark } 1112bd670b35SErik Nordmark 1113bd670b35SErik Nordmark if (ixaflags & IXAF_IS_IPV4) { 1114bd670b35SErik Nordmark ipaddr_t v4nexthop; 1115bd670b35SErik Nordmark ipaddr_t v4setsrc = INADDR_ANY; 1116bd670b35SErik Nordmark 1117bd670b35SErik Nordmark IN6_V4MAPPED_TO_IPADDR(&v6nexthop, v4nexthop); 1118bd670b35SErik Nordmark ire = ire_route_recursive_v4(v4nexthop, ire_type, ill, 1119bd670b35SErik Nordmark ixa->ixa_zoneid, ixa->ixa_tsl, match_args, B_TRUE, 1120bd670b35SErik Nordmark ixa->ixa_xmit_hint, ipst, &v4setsrc, NULL, generationp); 1121bd670b35SErik Nordmark if (setsrcp != NULL) 1122bd670b35SErik Nordmark IN6_IPADDR_TO_V4MAPPED(v4setsrc, setsrcp); 1123bd670b35SErik Nordmark } else { 1124bd670b35SErik Nordmark ire = ire_route_recursive_v6(&v6nexthop, ire_type, ill, 1125bd670b35SErik Nordmark ixa->ixa_zoneid, 
ixa->ixa_tsl, match_args, B_TRUE, 1126bd670b35SErik Nordmark ixa->ixa_xmit_hint, ipst, setsrcp, NULL, generationp); 1127bd670b35SErik Nordmark } 1128bd670b35SErik Nordmark 1129bd670b35SErik Nordmark #ifdef DEBUG 1130bd670b35SErik Nordmark if (match_args & MATCH_IRE_TESTHIDDEN) { 1131bd670b35SErik Nordmark ip3dbg(("looking for hidden; dst %x ire %p\n", 1132bd670b35SErik Nordmark v4dst, (void *)ire)); 1133bd670b35SErik Nordmark } 1134bd670b35SErik Nordmark #endif 1135bd670b35SErik Nordmark 1136bd670b35SErik Nordmark if (ill != NULL) 1137bd670b35SErik Nordmark ill_refrele(ill); 1138bd670b35SErik Nordmark 1139bd670b35SErik Nordmark if ((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) || 1140bd670b35SErik Nordmark (ire->ire_type & IRE_MULTICAST)) { 1141bd670b35SErik Nordmark /* No ire_nce_cache */ 1142bd670b35SErik Nordmark return (ire); 1143bd670b35SErik Nordmark } 1144bd670b35SErik Nordmark 1145bd670b35SErik Nordmark /* Setup ire_nce_cache if it doesn't exist or is condemned. */ 1146bd670b35SErik Nordmark mutex_enter(&ire->ire_lock); 1147bd670b35SErik Nordmark nce = ire->ire_nce_cache; 1148bd670b35SErik Nordmark if (nce == NULL || nce->nce_is_condemned) { 1149bd670b35SErik Nordmark mutex_exit(&ire->ire_lock); 1150bd670b35SErik Nordmark (void) ire_revalidate_nce(ire); 1151bd670b35SErik Nordmark } else { 1152bd670b35SErik Nordmark mutex_exit(&ire->ire_lock); 1153bd670b35SErik Nordmark } 1154bd670b35SErik Nordmark return (ire); 1155bd670b35SErik Nordmark } 1156bd670b35SErik Nordmark 1157bd670b35SErik Nordmark /* 1158bd670b35SErik Nordmark * Find a route given some xmit attributes and a packet. 1159bd670b35SErik Nordmark * Generic for IPv4 and IPv6 1160bd670b35SErik Nordmark * 1161bd670b35SErik Nordmark * This never returns NULL. But when it returns the IRE_NOROUTE 1162bd670b35SErik Nordmark * it might set errorp. 
1163bd670b35SErik Nordmark */ 1164bd670b35SErik Nordmark ire_t * 1165bd670b35SErik Nordmark ip_select_route_pkt(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp, 1166bd670b35SErik Nordmark int *errorp, boolean_t *multirtp) 1167bd670b35SErik Nordmark { 1168bd670b35SErik Nordmark if (ixa->ixa_flags & IXAF_IS_IPV4) { 1169bd670b35SErik Nordmark ipha_t *ipha = (ipha_t *)mp->b_rptr; 1170bd670b35SErik Nordmark in6_addr_t v6dst; 1171bd670b35SErik Nordmark 1172bd670b35SErik Nordmark IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst); 1173bd670b35SErik Nordmark 1174bd670b35SErik Nordmark return (ip_select_route(&v6dst, ixa, generationp, 1175bd670b35SErik Nordmark NULL, errorp, multirtp)); 1176bd670b35SErik Nordmark } else { 1177bd670b35SErik Nordmark ip6_t *ip6h = (ip6_t *)mp->b_rptr; 1178bd670b35SErik Nordmark 1179bd670b35SErik Nordmark return (ip_select_route(&ip6h->ip6_dst, ixa, generationp, 1180bd670b35SErik Nordmark NULL, errorp, multirtp)); 1181bd670b35SErik Nordmark } 1182bd670b35SErik Nordmark } 1183bd670b35SErik Nordmark 1184bd670b35SErik Nordmark ire_t * 1185bd670b35SErik Nordmark ip_select_route_v4(ipaddr_t dst, ip_xmit_attr_t *ixa, uint_t *generationp, 1186bd670b35SErik Nordmark ipaddr_t *v4setsrcp, int *errorp, boolean_t *multirtp) 1187bd670b35SErik Nordmark { 1188bd670b35SErik Nordmark in6_addr_t v6dst; 1189bd670b35SErik Nordmark ire_t *ire; 1190bd670b35SErik Nordmark in6_addr_t setsrc; 1191bd670b35SErik Nordmark 1192bd670b35SErik Nordmark ASSERT(ixa->ixa_flags & IXAF_IS_IPV4); 1193bd670b35SErik Nordmark 1194bd670b35SErik Nordmark IN6_IPADDR_TO_V4MAPPED(dst, &v6dst); 1195bd670b35SErik Nordmark 1196bd670b35SErik Nordmark setsrc = ipv6_all_zeros; 1197bd670b35SErik Nordmark ire = ip_select_route(&v6dst, ixa, generationp, &setsrc, errorp, 1198bd670b35SErik Nordmark multirtp); 1199bd670b35SErik Nordmark if (v4setsrcp != NULL) 1200bd670b35SErik Nordmark IN6_V4MAPPED_TO_IPADDR(&setsrc, *v4setsrcp); 1201bd670b35SErik Nordmark return (ire); 1202bd670b35SErik Nordmark } 
1203bd670b35SErik Nordmark 1204bd670b35SErik Nordmark /* 1205bd670b35SErik Nordmark * Recursively look for a route to the destination. Can also match on 1206bd670b35SErik Nordmark * the zoneid, ill, and label. Used for the data paths. See also 1207bd670b35SErik Nordmark * ire_route_recursive. 1208bd670b35SErik Nordmark * 1209bd670b35SErik Nordmark * If ill is set this means we will match it by adding MATCH_IRE_ILL. 1210bd670b35SErik Nordmark * 1211bd670b35SErik Nordmark * Note that this function never returns NULL. It returns an IRE_NOROUTE 1212bd670b35SErik Nordmark * instead. 1213bd670b35SErik Nordmark * 1214bd670b35SErik Nordmark * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it 1215bd670b35SErik Nordmark * is an error. 1216bd670b35SErik Nordmark * Allow at most one RTF_INDIRECT. 1217bd670b35SErik Nordmark */ 1218bd670b35SErik Nordmark ire_t * 1219bd670b35SErik Nordmark ire_route_recursive_impl_v4(ire_t *ire, 1220bd670b35SErik Nordmark ipaddr_t nexthop, uint_t ire_type, const ill_t *ill_arg, 1221bd670b35SErik Nordmark zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args, 1222bd670b35SErik Nordmark boolean_t allocate, uint32_t xmit_hint, ip_stack_t *ipst, ipaddr_t *setsrcp, 1223bd670b35SErik Nordmark tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp) 1224bd670b35SErik Nordmark { 1225bd670b35SErik Nordmark int i, j; 1226bd670b35SErik Nordmark ire_t *ires[MAX_IRE_RECURSION]; 1227bd670b35SErik Nordmark uint_t generation; 1228bd670b35SErik Nordmark uint_t generations[MAX_IRE_RECURSION]; 1229bd670b35SErik Nordmark boolean_t need_refrele = B_FALSE; 1230bd670b35SErik Nordmark boolean_t invalidate = B_FALSE; 1231bd670b35SErik Nordmark int prefs[MAX_IRE_RECURSION]; 1232bd670b35SErik Nordmark ill_t *ill = NULL; 1233bd670b35SErik Nordmark 1234bd670b35SErik Nordmark if (setsrcp != NULL) 1235bd670b35SErik Nordmark ASSERT(*setsrcp == INADDR_ANY); 1236bd670b35SErik Nordmark if (gwattrp != NULL) 1237bd670b35SErik Nordmark ASSERT(*gwattrp == NULL); 
1238bd670b35SErik Nordmark 1239bd670b35SErik Nordmark if (ill_arg != NULL) 1240bd670b35SErik Nordmark match_args |= MATCH_IRE_ILL; 1241bd670b35SErik Nordmark 1242bd670b35SErik Nordmark /* 1243bd670b35SErik Nordmark * We iterate up to three times to resolve a route, even though 1244bd670b35SErik Nordmark * we have four slots in the array. The extra slot is for an 1245bd670b35SErik Nordmark * IRE_IF_CLONE we might need to create. 1246bd670b35SErik Nordmark */ 1247bd670b35SErik Nordmark i = 0; 1248bd670b35SErik Nordmark while (i < MAX_IRE_RECURSION - 1) { 1249bd670b35SErik Nordmark /* ire_ftable_lookup handles round-robin/ECMP */ 1250bd670b35SErik Nordmark if (ire == NULL) { 1251bd670b35SErik Nordmark ire = ire_ftable_lookup_v4(nexthop, 0, 0, ire_type, 1252bd670b35SErik Nordmark (ill_arg != NULL ? ill_arg : ill), zoneid, tsl, 1253bd670b35SErik Nordmark match_args, xmit_hint, ipst, &generation); 1254bd670b35SErik Nordmark } else { 1255bd670b35SErik Nordmark /* Caller passed it; extra hold since we will rele */ 1256bd670b35SErik Nordmark ire_refhold(ire); 1257bd670b35SErik Nordmark if (generationp != NULL) 1258bd670b35SErik Nordmark generation = *generationp; 1259bd670b35SErik Nordmark else 1260bd670b35SErik Nordmark generation = IRE_GENERATION_VERIFY; 1261bd670b35SErik Nordmark } 1262bd670b35SErik Nordmark if (ire == NULL) 1263bd670b35SErik Nordmark ire = ire_reject(ipst, B_FALSE); 1264bd670b35SErik Nordmark 1265bd670b35SErik Nordmark /* Need to return the ire with RTF_REJECT|BLACKHOLE */ 1266bd670b35SErik Nordmark if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) 1267bd670b35SErik Nordmark goto error; 1268bd670b35SErik Nordmark 1269bd670b35SErik Nordmark ASSERT(!(ire->ire_type & IRE_MULTICAST)); /* Not in ftable */ 1270bd670b35SErik Nordmark 1271bd670b35SErik Nordmark prefs[i] = ire_pref(ire); 1272bd670b35SErik Nordmark if (i != 0) { 1273bd670b35SErik Nordmark /* 1274bd670b35SErik Nordmark * Don't allow anything unusual past the first 1275bd670b35SErik Nordmark * 
iteration. 1276bd670b35SErik Nordmark */ 1277bd670b35SErik Nordmark if ((ire->ire_type & 1278bd670b35SErik Nordmark (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST)) || 1279bd670b35SErik Nordmark prefs[i] <= prefs[i-1]) { 1280bd670b35SErik Nordmark ire_refrele(ire); 1281bd670b35SErik Nordmark ire = ire_reject(ipst, B_FALSE); 1282bd670b35SErik Nordmark goto error; 1283bd670b35SErik Nordmark } 1284bd670b35SErik Nordmark } 1285bd670b35SErik Nordmark /* We have a usable IRE */ 1286bd670b35SErik Nordmark ires[i] = ire; 1287bd670b35SErik Nordmark generations[i] = generation; 1288bd670b35SErik Nordmark i++; 1289bd670b35SErik Nordmark 1290bd670b35SErik Nordmark /* The first RTF_SETSRC address is passed back if setsrcp */ 1291bd670b35SErik Nordmark if ((ire->ire_flags & RTF_SETSRC) && 1292bd670b35SErik Nordmark setsrcp != NULL && *setsrcp == INADDR_ANY) { 1293bd670b35SErik Nordmark ASSERT(ire->ire_setsrc_addr != INADDR_ANY); 1294bd670b35SErik Nordmark *setsrcp = ire->ire_setsrc_addr; 1295bd670b35SErik Nordmark } 1296bd670b35SErik Nordmark 1297bd670b35SErik Nordmark /* The first ire_gw_secattr is passed back if gwattrp */ 1298bd670b35SErik Nordmark if (ire->ire_gw_secattr != NULL && 1299bd670b35SErik Nordmark gwattrp != NULL && *gwattrp == NULL) 1300bd670b35SErik Nordmark *gwattrp = ire->ire_gw_secattr; 1301bd670b35SErik Nordmark 1302bd670b35SErik Nordmark /* 1303bd670b35SErik Nordmark * Check if we have a short-cut pointer to an IRE for this 1304bd670b35SErik Nordmark * destination, and that the cached dependency isn't stale. 1305bd670b35SErik Nordmark * In that case we've rejoined an existing tree towards a 1306bd670b35SErik Nordmark * parent, thus we don't need to continue the loop to 1307bd670b35SErik Nordmark * discover the rest of the tree. 
1308bd670b35SErik Nordmark */ 1309bd670b35SErik Nordmark mutex_enter(&ire->ire_lock); 1310bd670b35SErik Nordmark if (ire->ire_dep_parent != NULL && 1311bd670b35SErik Nordmark ire->ire_dep_parent->ire_generation == 1312bd670b35SErik Nordmark ire->ire_dep_parent_generation) { 1313bd670b35SErik Nordmark mutex_exit(&ire->ire_lock); 1314bd670b35SErik Nordmark ire = NULL; 1315bd670b35SErik Nordmark goto done; 1316bd670b35SErik Nordmark } 1317bd670b35SErik Nordmark mutex_exit(&ire->ire_lock); 1318bd670b35SErik Nordmark 1319bd670b35SErik Nordmark /* 1320bd670b35SErik Nordmark * If this type should have an ire_nce_cache (even if it 1321bd670b35SErik Nordmark * doesn't yet have one) then we are done. Includes 1322bd670b35SErik Nordmark * IRE_INTERFACE with a full 32 bit mask. 1323bd670b35SErik Nordmark */ 1324bd670b35SErik Nordmark if (ire->ire_nce_capable) { 1325bd670b35SErik Nordmark ire = NULL; 1326bd670b35SErik Nordmark goto done; 1327bd670b35SErik Nordmark } 1328bd670b35SErik Nordmark ASSERT(!(ire->ire_type & IRE_IF_CLONE)); 1329bd670b35SErik Nordmark /* 1330bd670b35SErik Nordmark * For an IRE_INTERFACE we create an IRE_IF_CLONE for this 1331bd670b35SErik Nordmark * particular destination 1332bd670b35SErik Nordmark */ 1333bd670b35SErik Nordmark if (ire->ire_type & IRE_INTERFACE) { 1334bd670b35SErik Nordmark in6_addr_t v6nexthop; 1335bd670b35SErik Nordmark ire_t *clone; 1336bd670b35SErik Nordmark 1337bd670b35SErik Nordmark ASSERT(ire->ire_masklen != IPV4_ABITS); 1338bd670b35SErik Nordmark 1339bd670b35SErik Nordmark /* 1340bd670b35SErik Nordmark * In the case of ip_input and ILLF_FORWARDING not 1341bd670b35SErik Nordmark * being set, and in the case of RTM_GET, 1342bd670b35SErik Nordmark * there is no point in allocating 1343bd670b35SErik Nordmark * an IRE_IF_CLONE. We return the IRE_INTERFACE. 1344bd670b35SErik Nordmark * Note that !allocate can result in a ire_dep_parent 1345bd670b35SErik Nordmark * which is IRE_IF_* without an IRE_IF_CLONE. 
1346bd670b35SErik Nordmark * We recover from that when we need to send packets 1347bd670b35SErik Nordmark * by ensuring that the generations become 1348bd670b35SErik Nordmark * IRE_GENERATION_VERIFY in this case. 1349bd670b35SErik Nordmark */ 1350bd670b35SErik Nordmark if (!allocate) { 1351bd670b35SErik Nordmark invalidate = B_TRUE; 1352bd670b35SErik Nordmark ire = NULL; 1353bd670b35SErik Nordmark goto done; 1354bd670b35SErik Nordmark } 1355bd670b35SErik Nordmark 1356bd670b35SErik Nordmark IN6_IPADDR_TO_V4MAPPED(nexthop, &v6nexthop); 1357bd670b35SErik Nordmark 1358bd670b35SErik Nordmark clone = ire_create_if_clone(ire, &v6nexthop, 1359bd670b35SErik Nordmark &generation); 1360bd670b35SErik Nordmark if (clone == NULL) { 1361bd670b35SErik Nordmark /* 1362bd670b35SErik Nordmark * Temporary failure - no memory. 1363bd670b35SErik Nordmark * Don't want caller to cache IRE_NOROUTE. 1364bd670b35SErik Nordmark */ 1365bd670b35SErik Nordmark invalidate = B_TRUE; 1366bd670b35SErik Nordmark ire = ire_blackhole(ipst, B_FALSE); 1367bd670b35SErik Nordmark goto error; 1368bd670b35SErik Nordmark } 1369bd670b35SErik Nordmark /* 1370bd670b35SErik Nordmark * Make clone next to last entry and the 1371bd670b35SErik Nordmark * IRE_INTERFACE the last in the dependency 1372bd670b35SErik Nordmark * chain since the clone depends on the 1373bd670b35SErik Nordmark * IRE_INTERFACE. 
1374bd670b35SErik Nordmark */ 1375bd670b35SErik Nordmark ASSERT(i >= 1); 1376bd670b35SErik Nordmark ASSERT(i < MAX_IRE_RECURSION); 1377bd670b35SErik Nordmark 1378bd670b35SErik Nordmark ires[i] = ires[i-1]; 1379bd670b35SErik Nordmark generations[i] = generations[i-1]; 1380bd670b35SErik Nordmark ires[i-1] = clone; 1381bd670b35SErik Nordmark generations[i-1] = generation; 1382bd670b35SErik Nordmark i++; 1383bd670b35SErik Nordmark 1384bd670b35SErik Nordmark ire = NULL; 1385bd670b35SErik Nordmark goto done; 1386bd670b35SErik Nordmark } 1387bd670b35SErik Nordmark 1388bd670b35SErik Nordmark /* 1389bd670b35SErik Nordmark * We only match on the type and optionally ILL when 1390bd670b35SErik Nordmark * recursing. The type match is used by some callers 1391bd670b35SErik Nordmark * to exclude certain types (such as IRE_IF_CLONE or 1392bd670b35SErik Nordmark * IRE_LOCAL|IRE_LOOPBACK). 1393bd670b35SErik Nordmark */ 1394bd670b35SErik Nordmark match_args &= MATCH_IRE_TYPE; 1395bd670b35SErik Nordmark nexthop = ire->ire_gateway_addr; 1396bd670b35SErik Nordmark if (ill == NULL && ire->ire_ill != NULL) { 1397bd670b35SErik Nordmark ill = ire->ire_ill; 1398bd670b35SErik Nordmark need_refrele = B_TRUE; 1399bd670b35SErik Nordmark ill_refhold(ill); 1400bd670b35SErik Nordmark match_args |= MATCH_IRE_ILL; 1401bd670b35SErik Nordmark } 1402bd670b35SErik Nordmark ire = NULL; 1403bd670b35SErik Nordmark } 1404bd670b35SErik Nordmark ASSERT(ire == NULL); 1405bd670b35SErik Nordmark ire = ire_reject(ipst, B_FALSE); 1406bd670b35SErik Nordmark 1407bd670b35SErik Nordmark error: 1408bd670b35SErik Nordmark ASSERT(ire != NULL); 1409bd670b35SErik Nordmark if (need_refrele) 1410bd670b35SErik Nordmark ill_refrele(ill); 1411bd670b35SErik Nordmark 1412bd670b35SErik Nordmark /* 1413bd670b35SErik Nordmark * In the case of MULTIRT we want to try a different IRE the next 1414bd670b35SErik Nordmark * time. We let the next packet retry in that case. 
1415bd670b35SErik Nordmark */ 1416bd670b35SErik Nordmark if (i > 0 && (ires[0]->ire_flags & RTF_MULTIRT)) 1417bd670b35SErik Nordmark (void) ire_no_good(ires[0]); 1418bd670b35SErik Nordmark 1419bd670b35SErik Nordmark cleanup: 1420bd670b35SErik Nordmark /* cleanup ires[i] */ 1421bd670b35SErik Nordmark ire_dep_unbuild(ires, i); 1422bd670b35SErik Nordmark for (j = 0; j < i; j++) 1423bd670b35SErik Nordmark ire_refrele(ires[j]); 1424bd670b35SErik Nordmark 1425bd670b35SErik Nordmark ASSERT(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)); 1426bd670b35SErik Nordmark /* 1427bd670b35SErik Nordmark * Use IRE_GENERATION_VERIFY to ensure that ip_output will redo the 1428bd670b35SErik Nordmark * ip_select_route since the reject or lack of memory might be gone. 1429bd670b35SErik Nordmark */ 1430bd670b35SErik Nordmark if (generationp != NULL) 1431bd670b35SErik Nordmark *generationp = IRE_GENERATION_VERIFY; 1432bd670b35SErik Nordmark return (ire); 1433bd670b35SErik Nordmark 1434bd670b35SErik Nordmark done: 1435bd670b35SErik Nordmark ASSERT(ire == NULL); 1436bd670b35SErik Nordmark if (need_refrele) { 1437bd670b35SErik Nordmark ill_refrele(ill); 1438bd670b35SErik Nordmark ill = NULL; 1439bd670b35SErik Nordmark } 1440bd670b35SErik Nordmark 1441bd670b35SErik Nordmark /* Build dependencies */ 1442bd670b35SErik Nordmark if (!ire_dep_build(ires, generations, i)) { 1443bd670b35SErik Nordmark /* Something in chain was condemned; tear it apart */ 1444bd670b35SErik Nordmark ire = ire_reject(ipst, B_FALSE); 1445bd670b35SErik Nordmark goto cleanup; 1446bd670b35SErik Nordmark } 1447bd670b35SErik Nordmark 1448bd670b35SErik Nordmark /* 1449bd670b35SErik Nordmark * Release all refholds except the one for ires[0] that we 1450bd670b35SErik Nordmark * will return to the caller. 
1451bd670b35SErik Nordmark */ 1452bd670b35SErik Nordmark for (j = 1; j < i; j++) 1453bd670b35SErik Nordmark ire_refrele(ires[j]); 1454bd670b35SErik Nordmark 1455bd670b35SErik Nordmark if (invalidate) { 1456bd670b35SErik Nordmark /* 1457bd670b35SErik Nordmark * Since we needed to allocate but couldn't we need to make 1458bd670b35SErik Nordmark * sure that the dependency chain is rebuilt the next time. 1459bd670b35SErik Nordmark */ 1460bd670b35SErik Nordmark ire_dep_invalidate_generations(ires[0]); 1461bd670b35SErik Nordmark generation = IRE_GENERATION_VERIFY; 1462bd670b35SErik Nordmark } else { 1463bd670b35SErik Nordmark /* 1464bd670b35SErik Nordmark * IREs can have been added or deleted while we did the 1465bd670b35SErik Nordmark * recursive lookup and we can't catch those until we've built 1466bd670b35SErik Nordmark * the dependencies. We verify the stored 1467bd670b35SErik Nordmark * ire_dep_parent_generation to catch any such changes and 1468bd670b35SErik Nordmark * return IRE_GENERATION_VERIFY (which will cause 1469bd670b35SErik Nordmark * ip_select_route to be called again so we can redo the 1470bd670b35SErik Nordmark * recursive lookup next time we send a packet. 
1471bd670b35SErik Nordmark */ 1472bd670b35SErik Nordmark generation = ire_dep_validate_generations(ires[0]); 1473bd670b35SErik Nordmark if (generations[0] != ires[0]->ire_generation) { 1474bd670b35SErik Nordmark /* Something changed at the top */ 1475bd670b35SErik Nordmark generation = IRE_GENERATION_VERIFY; 1476bd670b35SErik Nordmark } 1477bd670b35SErik Nordmark } 1478bd670b35SErik Nordmark if (generationp != NULL) 1479bd670b35SErik Nordmark *generationp = generation; 1480bd670b35SErik Nordmark 1481bd670b35SErik Nordmark return (ires[0]); 1482bd670b35SErik Nordmark } 1483bd670b35SErik Nordmark 1484bd670b35SErik Nordmark ire_t * 1485bd670b35SErik Nordmark ire_route_recursive_v4(ipaddr_t nexthop, uint_t ire_type, const ill_t *ill, 1486bd670b35SErik Nordmark zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args, 1487bd670b35SErik Nordmark boolean_t allocate, uint32_t xmit_hint, ip_stack_t *ipst, ipaddr_t *setsrcp, 1488bd670b35SErik Nordmark tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp) 1489bd670b35SErik Nordmark { 1490bd670b35SErik Nordmark return (ire_route_recursive_impl_v4(NULL, nexthop, ire_type, ill, 1491bd670b35SErik Nordmark zoneid, tsl, match_args, allocate, xmit_hint, ipst, setsrcp, 1492bd670b35SErik Nordmark gwattrp, generationp)); 1493bd670b35SErik Nordmark } 1494bd670b35SErik Nordmark 1495bd670b35SErik Nordmark /* 1496bd670b35SErik Nordmark * Recursively look for a route to the destination. 1497bd670b35SErik Nordmark * We only handle a destination match here, yet we have the same arguments 1498bd670b35SErik Nordmark * as the full match to allow function pointers to select between the two. 1499bd670b35SErik Nordmark * 1500bd670b35SErik Nordmark * Note that this function never returns NULL. It returns an IRE_NOROUTE 1501bd670b35SErik Nordmark * instead. 1502bd670b35SErik Nordmark * 1503bd670b35SErik Nordmark * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it 1504bd670b35SErik Nordmark * is an error. 
1505bd670b35SErik Nordmark * Allow at most one RTF_INDIRECT. 1506bd670b35SErik Nordmark */ 1507bd670b35SErik Nordmark ire_t * 1508bd670b35SErik Nordmark ire_route_recursive_dstonly_v4(ipaddr_t nexthop, boolean_t allocate, 1509bd670b35SErik Nordmark uint32_t xmit_hint, ip_stack_t *ipst) 1510bd670b35SErik Nordmark { 1511bd670b35SErik Nordmark ire_t *ire; 1512bd670b35SErik Nordmark ire_t *ire1; 1513bd670b35SErik Nordmark uint_t generation; 1514bd670b35SErik Nordmark 1515bd670b35SErik Nordmark /* ire_ftable_lookup handles round-robin/ECMP */ 1516bd670b35SErik Nordmark ire = ire_ftable_lookup_simple_v4(nexthop, xmit_hint, ipst, 1517bd670b35SErik Nordmark &generation); 1518bd670b35SErik Nordmark ASSERT(ire != NULL); 1519bd670b35SErik Nordmark 1520bd670b35SErik Nordmark /* 1521bd670b35SErik Nordmark * If this type should have an ire_nce_cache (even if it 1522bd670b35SErik Nordmark * doesn't yet have one) then we are done. Includes 1523bd670b35SErik Nordmark * IRE_INTERFACE with a full 32 bit mask. 1524bd670b35SErik Nordmark */ 1525bd670b35SErik Nordmark if (ire->ire_nce_capable) 1526bd670b35SErik Nordmark return (ire); 1527bd670b35SErik Nordmark 1528bd670b35SErik Nordmark /* 1529bd670b35SErik Nordmark * If the IRE has a current cached parent we know that the whole 1530bd670b35SErik Nordmark * parent chain is current, hence we don't need to discover and 1531bd670b35SErik Nordmark * build any dependencies by doing a recursive lookup. 
1532bd670b35SErik Nordmark */ 1533bd670b35SErik Nordmark mutex_enter(&ire->ire_lock); 1534bd670b35SErik Nordmark if (ire->ire_dep_parent != NULL && 1535bd670b35SErik Nordmark ire->ire_dep_parent->ire_generation == 1536bd670b35SErik Nordmark ire->ire_dep_parent_generation) { 1537bd670b35SErik Nordmark mutex_exit(&ire->ire_lock); 1538bd670b35SErik Nordmark return (ire); 1539bd670b35SErik Nordmark } 1540bd670b35SErik Nordmark mutex_exit(&ire->ire_lock); 1541bd670b35SErik Nordmark 1542bd670b35SErik Nordmark /* 1543bd670b35SErik Nordmark * Fallback to loop in the normal code starting with the ire 1544bd670b35SErik Nordmark * we found. Normally this would return the same ire. 1545bd670b35SErik Nordmark */ 1546bd670b35SErik Nordmark ire1 = ire_route_recursive_impl_v4(ire, nexthop, 0, NULL, ALL_ZONES, 1547bd670b35SErik Nordmark NULL, MATCH_IRE_DSTONLY, allocate, xmit_hint, ipst, NULL, NULL, 1548bd670b35SErik Nordmark &generation); 1549bd670b35SErik Nordmark ire_refrele(ire); 1550bd670b35SErik Nordmark return (ire1); 1551bd670b35SErik Nordmark } 1552