1c793af95Ssangeeta /* 2c793af95Ssangeeta * CDDL HEADER START 3c793af95Ssangeeta * 4c793af95Ssangeeta * The contents of this file are subject to the terms of the 5c793af95Ssangeeta * Common Development and Distribution License (the "License"). 6c793af95Ssangeeta * You may not use this file except in compliance with the License. 7c793af95Ssangeeta * 8c793af95Ssangeeta * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9c793af95Ssangeeta * or http://www.opensolaris.org/os/licensing. 10c793af95Ssangeeta * See the License for the specific language governing permissions 11c793af95Ssangeeta * and limitations under the License. 12c793af95Ssangeeta * 13c793af95Ssangeeta * When distributing Covered Code, include this CDDL HEADER in each 14c793af95Ssangeeta * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15c793af95Ssangeeta * If applicable, add the following below this CDDL HEADER, with the 16c793af95Ssangeeta * fields enclosed by brackets "[]" replaced with your own identifying 17c793af95Ssangeeta * information: Portions Copyright [yyyy] [name of copyright owner] 18c793af95Ssangeeta * 19c793af95Ssangeeta * CDDL HEADER END 20c793af95Ssangeeta */ 21c793af95Ssangeeta /* 22fff7ec1dSSowmini Varadhan * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. 
23c793af95Ssangeeta */ 24c793af95Ssangeeta 25c793af95Ssangeeta /* 26c793af95Ssangeeta * This file contains consumer routines of the IPv4 forwarding engine 27c793af95Ssangeeta */ 28c793af95Ssangeeta 29c793af95Ssangeeta #include <sys/types.h> 30c793af95Ssangeeta #include <sys/stream.h> 31c793af95Ssangeeta #include <sys/stropts.h> 32c793af95Ssangeeta #include <sys/strlog.h> 33c793af95Ssangeeta #include <sys/dlpi.h> 34c793af95Ssangeeta #include <sys/ddi.h> 35c793af95Ssangeeta #include <sys/cmn_err.h> 36c793af95Ssangeeta #include <sys/policy.h> 37c793af95Ssangeeta 38c793af95Ssangeeta #include <sys/systm.h> 39c793af95Ssangeeta #include <sys/strsun.h> 40c793af95Ssangeeta #include <sys/kmem.h> 41c793af95Ssangeeta #include <sys/param.h> 42c793af95Ssangeeta #include <sys/socket.h> 43edd26dc5Sdr146992 #include <sys/strsubr.h> 44c793af95Ssangeeta #include <net/if.h> 45c793af95Ssangeeta #include <net/route.h> 46c793af95Ssangeeta #include <netinet/in.h> 47c793af95Ssangeeta #include <net/if_dl.h> 48c793af95Ssangeeta #include <netinet/ip6.h> 49c793af95Ssangeeta #include <netinet/icmp6.h> 50c793af95Ssangeeta 51bd670b35SErik Nordmark #include <inet/ipsec_impl.h> 52c793af95Ssangeeta #include <inet/common.h> 53c793af95Ssangeeta #include <inet/mi.h> 54c793af95Ssangeeta #include <inet/mib2.h> 55c793af95Ssangeeta #include <inet/ip.h> 56edd26dc5Sdr146992 #include <inet/ip_impl.h> 57c793af95Ssangeeta #include <inet/ip6.h> 58c793af95Ssangeeta #include <inet/ip_ndp.h> 59c793af95Ssangeeta #include <inet/arp.h> 60c793af95Ssangeeta #include <inet/ip_if.h> 61c793af95Ssangeeta #include <inet/ip_ire.h> 62c793af95Ssangeeta #include <inet/ip_ftable.h> 63c793af95Ssangeeta #include <inet/ip_rts.h> 64c793af95Ssangeeta #include <inet/nd.h> 65c793af95Ssangeeta 66c793af95Ssangeeta #include <net/pfkeyv2.h> 67c793af95Ssangeeta #include <inet/sadb.h> 68c793af95Ssangeeta #include <inet/tcp.h> 69c793af95Ssangeeta #include <inet/ipclassifier.h> 70c793af95Ssangeeta #include <sys/zone.h> 71c793af95Ssangeeta 
#include <net/radix.h>
#include <sys/tsol/label.h>
#include <sys/tsol/tnet.h>

/*
 * True for an IRE_DEFAULT, or for an IRE_INTERFACE whose destination
 * address is 0.
 */
#define	IS_DEFAULT_ROUTE(ire)	\
	(((ire)->ire_type & IRE_DEFAULT) || \
	(((ire)->ire_type & IRE_INTERFACE) && ((ire)->ire_addr == 0)))

/*
 * Select the IPv6 or IPv4 strict source multihoming setting of the stack.
 * Arguments are parenthesized so that expressions may be passed safely.
 */
#define	IP_SRC_MULTIHOMING(isv6, ipst)				\
	((isv6) ? (ipst)->ips_ipv6_strict_src_multihoming :	\
	(ipst)->ips_ip_strict_src_multihoming)

static ire_t	*route_to_dst(const struct sockaddr *, zoneid_t, ip_stack_t *);
static void	ire_del_host_redir(ire_t *, char *);
static boolean_t ire_find_best_route(struct radix_node *, void *);

/*
 * Lookup a route in forwarding table. A specific lookup is indicated by
 * passing the required parameters and indicating the match required in the
 * flag field.
 *
 * Supports IP_BOUND_IF by following the ipif/ill when recursing.
 *
 * Returns NULL when MATCH_IRE_ILL or MATCH_IRE_SRC_ILL is requested with a
 * NULL ill, or when no entry in the radix tree is accepted by
 * ire_find_best_route(). Otherwise returns the selected ire; the reference
 * is the one taken by ire_find_best_route() or handed back by
 * ire_round_robin()/ire_alt_local().
 *
 * If generationp is non-NULL it receives the ire_generation of the ire
 * chosen from the table (it is also passed on to ire_alt_local()).
 */
ire_t *
ire_ftable_lookup_v4(ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway,
    int type, const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl,
    int flags, uint32_t xmit_hint, ip_stack_t *ipst, uint_t *generationp)
{
	ire_t *ire;
	struct rt_sockaddr rdst;
	struct rt_entry *rt;
	ire_ftable_args_t margs;

	ASSERT(ill == NULL || !ill->ill_isv6);

	/*
	 * ire_match_args() will dereference ill if MATCH_IRE_ILL
	 * is set.
	 */
	if ((flags & (MATCH_IRE_ILL|MATCH_IRE_SRC_ILL)) && (ill == NULL))
		return (NULL);

	/* Radix key for the destination being looked up. */
	bzero(&rdst, sizeof (rdst));
	rdst.rt_sin_len = sizeof (rdst);
	rdst.rt_sin_family = AF_INET;
	rdst.rt_sin_addr.s_addr = addr;

	/*
	 * The mask is not part of the radix key; it reaches the matching
	 * code through margs.ift_mask below.
	 */
	bzero(&margs, sizeof (margs));
	margs.ift_addr = addr;
	margs.ift_mask = mask;
	margs.ift_gateway = gateway;
	margs.ift_type = type;
	margs.ift_ill = ill;
	margs.ift_zoneid = zoneid;
	margs.ift_tsl = tsl;
	margs.ift_flags = flags;

	/*
	 * The flags argument passed to ire_ftable_lookup may cause the
	 * search to return, not the longest matching prefix, but the
	 * "best matching prefix", i.e., the longest prefix that also
	 * satisfies constraints imposed via the permutation of flags
	 * passed in. To achieve this, we invoke ire_match_args() on
	 * each matching leaf in the radix tree. ire_match_args is
	 * invoked by the callback function ire_find_best_route()
	 * We hold the global tree lock in read mode when calling
	 * rn_match_args. Before dropping the global tree lock, ensure
	 * that the radix node can't be deleted by incrementing ire_refcnt.
	 */
	RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
	rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst,
	    ipst->ips_ip_ftable, ire_find_best_route, &margs);
	ire = margs.ift_best_ire;
	if (rt == NULL) {
		RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
		return (NULL);
	}
	ASSERT(ire != NULL);

	DTRACE_PROBE2(ire__found, ire_ftable_args_t *, &margs, ire_t *, ire);

	/*
	 * round-robin only if we have more than one route in the bucket.
	 * ips_ip_ecmp_behavior controls when we do ECMP
	 *	2:	always
	 *	1:	for IRE_DEFAULT and /0 IRE_INTERFACE
	 *	0:	never
	 */
	if (ire->ire_bucket->irb_ire_cnt > 1 && !(flags & MATCH_IRE_GW)) {
		if (ipst->ips_ip_ecmp_behavior == 2 ||
		    (ipst->ips_ip_ecmp_behavior == 1 &&
		    IS_DEFAULT_ROUTE(ire))) {
			ire_t	*next_ire;

			margs.ift_best_ire = NULL;
			next_ire = ire_round_robin(ire->ire_bucket, &margs,
			    xmit_hint, ire, ipst);
			if (next_ire == NULL) {
				/* keep ire if next_ire is null */
				goto done;
			}
			/* Swap our reference over to the round-robin pick. */
			ire_refrele(ire);
			ire = next_ire;
		}
	}

done:
	/* Return generation before dropping lock */
	if (generationp != NULL)
		*generationp = ire->ire_generation;

	RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);

	/*
	 * For shared-IP zones we need additional checks to what was
	 * done in ire_match_args to make sure IRE_LOCALs are handled.
	 *
	 * When ip_restrict_interzone_loopback is set, then
	 * we ensure that IRE_LOCAL are only used for loopback
	 * between zones when the logical "Ethernet" would
	 * have looped them back. That is, if in the absence of
	 * the IRE_LOCAL we would have sent the packet out the
	 * same ill.
	 */
	if ((ire->ire_type & IRE_LOCAL) && zoneid != ALL_ZONES &&
	    ire->ire_zoneid != zoneid && ire->ire_zoneid != ALL_ZONES &&
	    ipst->ips_ip_restrict_interzone_loopback) {
		ire = ire_alt_local(ire, zoneid, tsl, ill, generationp);
		ASSERT(ire != NULL);
	}
	return (ire);
}

/*
 * This function is called by
 * ip_input/ire_route_recursive when doing a route lookup on only the
 * destination address.
 *
 * The optimizations of this function over ire_ftable_lookup are:
 *	o removing unnecessary flag matching
 *	o doing longest prefix match instead of overloading it further
 *	  with the unnecessary "best_prefix_match"
 *
 * If no route is found we return IRE_NOROUTE.
220da14cebeSEric Cheng */ 221bd670b35SErik Nordmark ire_t * 222bd670b35SErik Nordmark ire_ftable_lookup_simple_v4(ipaddr_t addr, uint32_t xmit_hint, ip_stack_t *ipst, 223bd670b35SErik Nordmark uint_t *generationp) 224da14cebeSEric Cheng { 225bd670b35SErik Nordmark ire_t *ire; 226da14cebeSEric Cheng struct rt_sockaddr rdst; 227da14cebeSEric Cheng struct rt_entry *rt; 228bd670b35SErik Nordmark irb_t *irb; 229da14cebeSEric Cheng 230da14cebeSEric Cheng rdst.rt_sin_len = sizeof (rdst); 231da14cebeSEric Cheng rdst.rt_sin_family = AF_INET; 232da14cebeSEric Cheng rdst.rt_sin_addr.s_addr = addr; 233da14cebeSEric Cheng 234da14cebeSEric Cheng /* 235da14cebeSEric Cheng * This is basically inlining a simpler version of ire_match_args 236da14cebeSEric Cheng */ 237da14cebeSEric Cheng RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); 238da14cebeSEric Cheng 239da14cebeSEric Cheng rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst, 240da14cebeSEric Cheng ipst->ips_ip_ftable, NULL, NULL); 241da14cebeSEric Cheng 242bd670b35SErik Nordmark if (rt == NULL) 243bd670b35SErik Nordmark goto bad; 244bd670b35SErik Nordmark 245bd670b35SErik Nordmark irb = &rt->rt_irb; 246bd670b35SErik Nordmark if (irb->irb_ire_cnt == 0) 247bd670b35SErik Nordmark goto bad; 248bd670b35SErik Nordmark 249bd670b35SErik Nordmark rw_enter(&irb->irb_lock, RW_READER); 250bd670b35SErik Nordmark ire = irb->irb_ire; 251bd670b35SErik Nordmark if (ire == NULL) { 252bd670b35SErik Nordmark rw_exit(&irb->irb_lock); 253bd670b35SErik Nordmark goto bad; 254da14cebeSEric Cheng } 255bd670b35SErik Nordmark while (IRE_IS_CONDEMNED(ire)) { 256bd670b35SErik Nordmark ire = ire->ire_next; 257bd670b35SErik Nordmark if (ire == NULL) { 258bd670b35SErik Nordmark rw_exit(&irb->irb_lock); 259bd670b35SErik Nordmark goto bad; 260bd670b35SErik Nordmark } 261da14cebeSEric Cheng } 262da14cebeSEric Cheng 263da14cebeSEric Cheng /* we have a ire that matches */ 264bd670b35SErik Nordmark ire_refhold(ire); 265bd670b35SErik Nordmark 
rw_exit(&irb->irb_lock); 266bd670b35SErik Nordmark 267bd670b35SErik Nordmark /* 268bd670b35SErik Nordmark * round-robin only if we have more than one route in the bucket. 269bd670b35SErik Nordmark * ips_ip_ecmp_behavior controls when we do ECMP 270bd670b35SErik Nordmark * 2: always 271bd670b35SErik Nordmark * 1: for IRE_DEFAULT and /0 IRE_INTERFACE 272bd670b35SErik Nordmark * 0: never 273bd670b35SErik Nordmark * 274bd670b35SErik Nordmark * Note: if we found an IRE_IF_CLONE we won't look at the bucket with 275bd670b35SErik Nordmark * other ECMP IRE_INTERFACEs since the IRE_IF_CLONE is a /128 match 276bd670b35SErik Nordmark * and the IRE_INTERFACESs are likely to be shorter matches. 277bd670b35SErik Nordmark */ 278bd670b35SErik Nordmark if (ire->ire_bucket->irb_ire_cnt > 1) { 279bd670b35SErik Nordmark if (ipst->ips_ip_ecmp_behavior == 2 || 280bd670b35SErik Nordmark (ipst->ips_ip_ecmp_behavior == 1 && 281bd670b35SErik Nordmark IS_DEFAULT_ROUTE(ire))) { 282bd670b35SErik Nordmark ire_t *next_ire; 283bd670b35SErik Nordmark ire_ftable_args_t margs; 284bd670b35SErik Nordmark 285188e1664SErik Nordmark bzero(&margs, sizeof (margs)); 286bd670b35SErik Nordmark margs.ift_addr = addr; 287bd670b35SErik Nordmark margs.ift_zoneid = ALL_ZONES; 288bd670b35SErik Nordmark 289bd670b35SErik Nordmark next_ire = ire_round_robin(ire->ire_bucket, &margs, 290bd670b35SErik Nordmark xmit_hint, ire, ipst); 291bd670b35SErik Nordmark if (next_ire == NULL) { 292bd670b35SErik Nordmark /* keep ire if next_ire is null */ 293bd670b35SErik Nordmark if (generationp != NULL) 294bd670b35SErik Nordmark *generationp = ire->ire_generation; 295bd670b35SErik Nordmark RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 296bd670b35SErik Nordmark return (ire); 297bd670b35SErik Nordmark } 298bd670b35SErik Nordmark ire_refrele(ire); 299bd670b35SErik Nordmark ire = next_ire; 300bd670b35SErik Nordmark } 301bd670b35SErik Nordmark } 302bd670b35SErik Nordmark /* Return generation before dropping lock */ 303bd670b35SErik 
Nordmark if (generationp != NULL) 304bd670b35SErik Nordmark *generationp = ire->ire_generation; 305bd670b35SErik Nordmark 306da14cebeSEric Cheng RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 307da14cebeSEric Cheng 308bd670b35SErik Nordmark /* 309bd670b35SErik Nordmark * Since we only did ALL_ZONES matches there is no special handling 310bd670b35SErik Nordmark * of IRE_LOCALs needed here. ire_ftable_lookup_v4 has to handle that. 311bd670b35SErik Nordmark */ 312da14cebeSEric Cheng return (ire); 313da14cebeSEric Cheng 314bd670b35SErik Nordmark bad: 315bd670b35SErik Nordmark if (generationp != NULL) 316bd670b35SErik Nordmark *generationp = IRE_GENERATION_VERIFY; 317da14cebeSEric Cheng 318bd670b35SErik Nordmark RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 319bd670b35SErik Nordmark return (ire_reject(ipst, B_FALSE)); 320da14cebeSEric Cheng } 321c793af95Ssangeeta 322c793af95Ssangeeta /* 323bd670b35SErik Nordmark * Find the ill matching a multicast group. 324c793af95Ssangeeta * Allows different routes for multicast addresses 325c793af95Ssangeeta * in the unicast routing table (akin to 224.0.0.0 but could be more specific) 326c793af95Ssangeeta * which point at different interfaces. This is used when IP_MULTICAST_IF 327c793af95Ssangeeta * isn't specified (when sending) and when IP_ADD_MEMBERSHIP doesn't 328c793af95Ssangeeta * specify the interface to join on. 329c793af95Ssangeeta * 330bd670b35SErik Nordmark * Supports link-local addresses by using ire_route_recursive which follows 331bd670b35SErik Nordmark * the ill when recursing. 332bd670b35SErik Nordmark * 333bd670b35SErik Nordmark * To handle CGTP, since we don't have a separate IRE_MULTICAST for each group 334bd670b35SErik Nordmark * and the MULTIRT property can be different for different groups, we 335bd670b35SErik Nordmark * extract RTF_MULTIRT from the special unicast route added for a group 336bd670b35SErik Nordmark * with CGTP and pass that back in the multirtp argument. 
337bd670b35SErik Nordmark * This is used in ip_set_destination etc to set ixa_postfragfn for multicast. 338bd670b35SErik Nordmark * We have a setsrcp argument for the same reason. 339c793af95Ssangeeta */ 340bd670b35SErik Nordmark ill_t * 341bd670b35SErik Nordmark ire_lookup_multi_ill_v4(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst, 342bd670b35SErik Nordmark boolean_t *multirtp, ipaddr_t *setsrcp) 343c793af95Ssangeeta { 344c793af95Ssangeeta ire_t *ire; 345bd670b35SErik Nordmark ill_t *ill; 346c793af95Ssangeeta 347bd670b35SErik Nordmark ire = ire_route_recursive_v4(group, 0, NULL, zoneid, NULL, 3489e3469d3SErik Nordmark MATCH_IRE_DSTONLY, IRR_NONE, 0, ipst, setsrcp, NULL, NULL); 349bd670b35SErik Nordmark ASSERT(ire != NULL); 350bd670b35SErik Nordmark if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 351c793af95Ssangeeta ire_refrele(ire); 352c793af95Ssangeeta return (NULL); 353c793af95Ssangeeta } 354bd670b35SErik Nordmark 355bd670b35SErik Nordmark if (multirtp != NULL) 356bd670b35SErik Nordmark *multirtp = (ire->ire_flags & RTF_MULTIRT) != 0; 357bd670b35SErik Nordmark 358bd670b35SErik Nordmark ill = ire_nexthop_ill(ire); 359bd670b35SErik Nordmark ire_refrele(ire); 360bd670b35SErik Nordmark return (ill); 361c793af95Ssangeeta } 362c793af95Ssangeeta 363c793af95Ssangeeta /* 364c793af95Ssangeeta * Delete the passed in ire if the gateway addr matches 365c793af95Ssangeeta */ 366c793af95Ssangeeta void 367c793af95Ssangeeta ire_del_host_redir(ire_t *ire, char *gateway) 368c793af95Ssangeeta { 3696bdb8e66Sdd193516 if ((ire->ire_flags & RTF_DYNAMIC) && 370c793af95Ssangeeta (ire->ire_gateway_addr == *(ipaddr_t *)gateway)) 371c793af95Ssangeeta ire_delete(ire); 372c793af95Ssangeeta } 373c793af95Ssangeeta 374c793af95Ssangeeta /* 375bd670b35SErik Nordmark * Search for all IRE_HOST RTF_DYNAMIC (aka redirect) routes that are 376c793af95Ssangeeta * pointing at the specified gateway and 377c793af95Ssangeeta * delete them. 
This routine is called only 378c793af95Ssangeeta * when a default gateway is going away. 379c793af95Ssangeeta */ 380c793af95Ssangeeta void 381f4b3ec61Sdh155122 ire_delete_host_redirects(ipaddr_t gateway, ip_stack_t *ipst) 382c793af95Ssangeeta { 383c793af95Ssangeeta struct rtfuncarg rtfarg; 384c793af95Ssangeeta 385188e1664SErik Nordmark bzero(&rtfarg, sizeof (rtfarg)); 386c793af95Ssangeeta rtfarg.rt_func = ire_del_host_redir; 387c793af95Ssangeeta rtfarg.rt_arg = (void *)&gateway; 388188e1664SErik Nordmark rtfarg.rt_zoneid = ALL_ZONES; 389188e1664SErik Nordmark rtfarg.rt_ipst = ipst; 390f4b3ec61Sdh155122 (void) ipst->ips_ip_ftable->rnh_walktree_mt(ipst->ips_ip_ftable, 391f4b3ec61Sdh155122 rtfunc, &rtfarg, irb_refhold_rn, irb_refrele_rn); 392c793af95Ssangeeta } 393c793af95Ssangeeta 394c793af95Ssangeeta /* 395f4b3ec61Sdh155122 * Obtain the rt_entry and rt_irb for the route to be added to 396f4b3ec61Sdh155122 * the ips_ip_ftable. 397c793af95Ssangeeta * First attempt to add a node to the radix tree via rn_addroute. If the 398c793af95Ssangeeta * route already exists, return the bucket for the existing route. 399c793af95Ssangeeta * 400c793af95Ssangeeta * Locking notes: Need to hold the global radix tree lock in write mode to 401c793af95Ssangeeta * add a radix node. To prevent the node from being deleted, ire_get_bucket() 402c793af95Ssangeeta * returns with a ref'ed irb_t. The ire itself is added in ire_add_v4() 403c793af95Ssangeeta * while holding the irb_lock, but not the radix tree lock. 
404c793af95Ssangeeta */ 405c793af95Ssangeeta irb_t * 406c793af95Ssangeeta ire_get_bucket(ire_t *ire) 407c793af95Ssangeeta { 408c793af95Ssangeeta struct radix_node *rn; 409c793af95Ssangeeta struct rt_entry *rt; 410c793af95Ssangeeta struct rt_sockaddr rmask, rdst; 411c793af95Ssangeeta irb_t *irb = NULL; 412f4b3ec61Sdh155122 ip_stack_t *ipst = ire->ire_ipst; 413c793af95Ssangeeta 414f4b3ec61Sdh155122 ASSERT(ipst->ips_ip_ftable != NULL); 415c793af95Ssangeeta 416c793af95Ssangeeta /* first try to see if route exists (based on rtalloc1) */ 417188e1664SErik Nordmark bzero(&rdst, sizeof (rdst)); 418c793af95Ssangeeta rdst.rt_sin_len = sizeof (rdst); 419c793af95Ssangeeta rdst.rt_sin_family = AF_INET; 420c793af95Ssangeeta rdst.rt_sin_addr.s_addr = ire->ire_addr; 421c793af95Ssangeeta 422188e1664SErik Nordmark bzero(&rmask, sizeof (rmask)); 423c793af95Ssangeeta rmask.rt_sin_len = sizeof (rmask); 424c793af95Ssangeeta rmask.rt_sin_family = AF_INET; 425c793af95Ssangeeta rmask.rt_sin_addr.s_addr = ire->ire_mask; 426c793af95Ssangeeta 427c793af95Ssangeeta /* 428c793af95Ssangeeta * add the route. 
based on BSD's rtrequest1(RTM_ADD) 429c793af95Ssangeeta */ 430c793af95Ssangeeta R_Malloc(rt, rt_entry_cache, sizeof (*rt)); 43129bc4795Ssangeeta /* kmem_alloc failed */ 43229bc4795Ssangeeta if (rt == NULL) 43329bc4795Ssangeeta return (NULL); 43429bc4795Ssangeeta 435188e1664SErik Nordmark bzero(rt, sizeof (*rt)); 436c793af95Ssangeeta rt->rt_nodes->rn_key = (char *)&rt->rt_dst; 437c793af95Ssangeeta rt->rt_dst = rdst; 438c793af95Ssangeeta irb = &rt->rt_irb; 439bd670b35SErik Nordmark irb->irb_marks |= IRB_MARK_DYNAMIC; /* dynamically allocated/freed */ 440f4b3ec61Sdh155122 irb->irb_ipst = ipst; 441c793af95Ssangeeta rw_init(&irb->irb_lock, NULL, RW_DEFAULT, NULL); 442f4b3ec61Sdh155122 RADIX_NODE_HEAD_WLOCK(ipst->ips_ip_ftable); 443f4b3ec61Sdh155122 rn = ipst->ips_ip_ftable->rnh_addaddr(&rt->rt_dst, &rmask, 444f4b3ec61Sdh155122 ipst->ips_ip_ftable, (struct radix_node *)rt); 445c793af95Ssangeeta if (rn == NULL) { 446f4b3ec61Sdh155122 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 447c793af95Ssangeeta Free(rt, rt_entry_cache); 448c793af95Ssangeeta rt = NULL; 449c793af95Ssangeeta irb = NULL; 450f4b3ec61Sdh155122 RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); 451f4b3ec61Sdh155122 rn = ipst->ips_ip_ftable->rnh_lookup(&rdst, &rmask, 452f4b3ec61Sdh155122 ipst->ips_ip_ftable); 453f4b3ec61Sdh155122 if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { 454c793af95Ssangeeta /* found a non-root match */ 455c793af95Ssangeeta rt = (struct rt_entry *)rn; 456c793af95Ssangeeta } 457c793af95Ssangeeta } 458c793af95Ssangeeta if (rt != NULL) { 459c793af95Ssangeeta irb = &rt->rt_irb; 460bd670b35SErik Nordmark irb_refhold(irb); 461c793af95Ssangeeta } 462f4b3ec61Sdh155122 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 463c793af95Ssangeeta return (irb); 464c793af95Ssangeeta } 465c793af95Ssangeeta 466c793af95Ssangeeta /* 467c793af95Ssangeeta * This function is used when the caller wants to know the outbound 468c793af95Ssangeeta * interface for a packet given only the address. 
469c793af95Ssangeeta * If this is a offlink IP address and there are multiple 470c793af95Ssangeeta * routes to this destination, this routine will utilise the 471c793af95Ssangeeta * first route it finds to IP address 472c793af95Ssangeeta * Return values: 473c793af95Ssangeeta * 0 - FAILURE 474c793af95Ssangeeta * nonzero - ifindex 475c793af95Ssangeeta */ 476c793af95Ssangeeta uint_t 477c793af95Ssangeeta ifindex_lookup(const struct sockaddr *ipaddr, zoneid_t zoneid) 478c793af95Ssangeeta { 479c793af95Ssangeeta uint_t ifindex = 0; 480c793af95Ssangeeta ire_t *ire; 481c793af95Ssangeeta ill_t *ill; 482f4b3ec61Sdh155122 netstack_t *ns; 483f4b3ec61Sdh155122 ip_stack_t *ipst; 484c793af95Ssangeeta 485f4b3ec61Sdh155122 if (zoneid == ALL_ZONES) 486f4b3ec61Sdh155122 ns = netstack_find_by_zoneid(GLOBAL_ZONEID); 487f4b3ec61Sdh155122 else 488f4b3ec61Sdh155122 ns = netstack_find_by_zoneid(zoneid); 489f4b3ec61Sdh155122 ASSERT(ns != NULL); 490f4b3ec61Sdh155122 491f4b3ec61Sdh155122 /* 492f4b3ec61Sdh155122 * For exclusive stacks we set the zoneid to zero 493f4b3ec61Sdh155122 * since IP uses the global zoneid in the exclusive stacks. 
494f4b3ec61Sdh155122 */ 495f4b3ec61Sdh155122 if (ns->netstack_stackid != GLOBAL_NETSTACKID) 496f4b3ec61Sdh155122 zoneid = GLOBAL_ZONEID; 497f4b3ec61Sdh155122 ipst = ns->netstack_ip; 498c793af95Ssangeeta 499c793af95Ssangeeta ASSERT(ipaddr->sa_family == AF_INET || ipaddr->sa_family == AF_INET6); 500c793af95Ssangeeta 501f4b3ec61Sdh155122 if ((ire = route_to_dst(ipaddr, zoneid, ipst)) != NULL) { 502bd670b35SErik Nordmark ill = ire_nexthop_ill(ire); 503bd670b35SErik Nordmark if (ill != NULL) { 504c793af95Ssangeeta ifindex = ill->ill_phyint->phyint_ifindex; 505bd670b35SErik Nordmark ill_refrele(ill); 506bd670b35SErik Nordmark } 507c793af95Ssangeeta ire_refrele(ire); 508c793af95Ssangeeta } 509f4b3ec61Sdh155122 netstack_rele(ns); 510c793af95Ssangeeta return (ifindex); 511c793af95Ssangeeta } 512c793af95Ssangeeta 513c793af95Ssangeeta /* 514c793af95Ssangeeta * Routine to find the route to a destination. If a ifindex is supplied 515bd670b35SErik Nordmark * it tries to match the route to the corresponding ipif for the ifindex 516c793af95Ssangeeta */ 517c793af95Ssangeeta static ire_t * 518f4b3ec61Sdh155122 route_to_dst(const struct sockaddr *dst_addr, zoneid_t zoneid, ip_stack_t *ipst) 519c793af95Ssangeeta { 520c793af95Ssangeeta ire_t *ire = NULL; 521c793af95Ssangeeta int match_flags; 522c793af95Ssangeeta 523bd670b35SErik Nordmark match_flags = MATCH_IRE_DSTONLY; 524c793af95Ssangeeta 525c793af95Ssangeeta /* XXX pass NULL tsl for now */ 526c793af95Ssangeeta 527c793af95Ssangeeta if (dst_addr->sa_family == AF_INET) { 528bd670b35SErik Nordmark ire = ire_route_recursive_v4( 529bd670b35SErik Nordmark ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr, 0, NULL, 5309e3469d3SErik Nordmark zoneid, NULL, match_flags, IRR_ALLOCATE, 0, ipst, NULL, 5319e3469d3SErik Nordmark NULL, NULL); 532c793af95Ssangeeta } else { 533bd670b35SErik Nordmark ire = ire_route_recursive_v6( 534bd670b35SErik Nordmark &((struct sockaddr_in6 *)dst_addr)->sin6_addr, 0, NULL, 5359e3469d3SErik Nordmark zoneid, NULL, 
match_flags, IRR_ALLOCATE, 0, ipst, NULL, 5369e3469d3SErik Nordmark NULL, NULL); 537bd670b35SErik Nordmark } 538bd670b35SErik Nordmark ASSERT(ire != NULL); 539bd670b35SErik Nordmark if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 540bd670b35SErik Nordmark ire_refrele(ire); 541bd670b35SErik Nordmark return (NULL); 542c793af95Ssangeeta } 543c793af95Ssangeeta return (ire); 544c793af95Ssangeeta } 545c793af95Ssangeeta 546c793af95Ssangeeta /* 547c793af95Ssangeeta * This routine is called by IP Filter to send a packet out on the wire 548bd670b35SErik Nordmark * to a specified dstination (which may be onlink or offlink). The ifindex may 549bd670b35SErik Nordmark * or may not be 0. A non-null ifindex indicates IP Filter has stipulated 550c793af95Ssangeeta * an outgoing interface and requires the nexthop to be on that interface. 551c793af95Ssangeeta * IP WILL NOT DO the following to the data packet before sending it out: 552c793af95Ssangeeta * a. manipulate ttl 553edd26dc5Sdr146992 * b. ipsec work 554edd26dc5Sdr146992 * c. fragmentation 555edd26dc5Sdr146992 * 556edd26dc5Sdr146992 * If the packet has been prepared for hardware checksum then it will be 557edd26dc5Sdr146992 * passed off to ip_send_align_cksum() to check that the flags set on the 558edd26dc5Sdr146992 * packet are in alignment with the capabilities of the new outgoing NIC. 559c793af95Ssangeeta * 560c793af95Ssangeeta * Return values: 561c793af95Ssangeeta * 0: IP was able to send of the data pkt 562c793af95Ssangeeta * ECOMM: Could not send packet 563c793af95Ssangeeta * ENONET No route to dst. It is up to the caller 564c793af95Ssangeeta * to send icmp unreachable error message, 565c793af95Ssangeeta * EINPROGRESS The macaddr of the onlink dst or that 566c793af95Ssangeeta * of the offlink dst's nexthop needs to get 567c793af95Ssangeeta * resolved before packet can be sent to dst. 568c793af95Ssangeeta * Thus transmission is not guaranteed. 
569bd670b35SErik Nordmark * Note: No longer have visibility to the ARP queue 570bd670b35SErik Nordmark * hence no EINPROGRESS. 571c793af95Ssangeeta */ 572c793af95Ssangeeta int 573c793af95Ssangeeta ipfil_sendpkt(const struct sockaddr *dst_addr, mblk_t *mp, uint_t ifindex, 574c793af95Ssangeeta zoneid_t zoneid) 575c793af95Ssangeeta { 576bd670b35SErik Nordmark ipaddr_t nexthop; 577f4b3ec61Sdh155122 netstack_t *ns; 578f4b3ec61Sdh155122 ip_stack_t *ipst; 579bd670b35SErik Nordmark ip_xmit_attr_t ixas; 580bd670b35SErik Nordmark int error; 581c793af95Ssangeeta 582c793af95Ssangeeta ASSERT(mp != NULL); 583c793af95Ssangeeta 584f4b3ec61Sdh155122 if (zoneid == ALL_ZONES) 585f4b3ec61Sdh155122 ns = netstack_find_by_zoneid(GLOBAL_ZONEID); 586f4b3ec61Sdh155122 else 587f4b3ec61Sdh155122 ns = netstack_find_by_zoneid(zoneid); 588f4b3ec61Sdh155122 ASSERT(ns != NULL); 589f4b3ec61Sdh155122 590f4b3ec61Sdh155122 /* 591f4b3ec61Sdh155122 * For exclusive stacks we set the zoneid to zero 592f4b3ec61Sdh155122 * since IP uses the global zoneid in the exclusive stacks. 593f4b3ec61Sdh155122 */ 594f4b3ec61Sdh155122 if (ns->netstack_stackid != GLOBAL_NETSTACKID) 595f4b3ec61Sdh155122 zoneid = GLOBAL_ZONEID; 596f4b3ec61Sdh155122 ipst = ns->netstack_ip; 597f4b3ec61Sdh155122 598c793af95Ssangeeta ASSERT(dst_addr->sa_family == AF_INET || 599c793af95Ssangeeta dst_addr->sa_family == AF_INET6); 600c793af95Ssangeeta 601bd670b35SErik Nordmark bzero(&ixas, sizeof (ixas)); 602bd670b35SErik Nordmark /* 603bd670b35SErik Nordmark * No IPsec, no fragmentation, and don't let any hooks see 604bd670b35SErik Nordmark * the packet. 
605bd670b35SErik Nordmark */ 606bd670b35SErik Nordmark ixas.ixa_flags = IXAF_NO_IPSEC | IXAF_DONTFRAG | IXAF_NO_PFHOOK; 607bd670b35SErik Nordmark ixas.ixa_cred = kcred; 608bd670b35SErik Nordmark ixas.ixa_cpid = NOPID; 609bd670b35SErik Nordmark ixas.ixa_tsl = NULL; 610bd670b35SErik Nordmark ixas.ixa_ipst = ipst; 611bd670b35SErik Nordmark ixas.ixa_ifindex = ifindex; 612bd670b35SErik Nordmark 613c793af95Ssangeeta if (dst_addr->sa_family == AF_INET) { 614bd670b35SErik Nordmark ipha_t *ipha = (ipha_t *)mp->b_rptr; 615bd670b35SErik Nordmark 616bd670b35SErik Nordmark ixas.ixa_flags |= IXAF_IS_IPV4; 617bd670b35SErik Nordmark nexthop = ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr; 618bd670b35SErik Nordmark if (nexthop != ipha->ipha_dst) { 619bd670b35SErik Nordmark ixas.ixa_flags |= IXAF_NEXTHOP_SET; 620bd670b35SErik Nordmark ixas.ixa_nexthop_v4 = nexthop; 621bd670b35SErik Nordmark } 622bd670b35SErik Nordmark ixas.ixa_multicast_ttl = ipha->ipha_ttl; 623c793af95Ssangeeta } else { 624bd670b35SErik Nordmark ip6_t *ip6h = (ip6_t *)mp->b_rptr; 625bd670b35SErik Nordmark in6_addr_t *nexthop6; 626bd670b35SErik Nordmark 627bd670b35SErik Nordmark nexthop6 = &((struct sockaddr_in6 *)dst_addr)->sin6_addr; 628bd670b35SErik Nordmark if (!IN6_ARE_ADDR_EQUAL(nexthop6, &ip6h->ip6_dst)) { 629bd670b35SErik Nordmark ixas.ixa_flags |= IXAF_NEXTHOP_SET; 630bd670b35SErik Nordmark ixas.ixa_nexthop_v6 = *nexthop6; 631c793af95Ssangeeta } 632bd670b35SErik Nordmark ixas.ixa_multicast_ttl = ip6h->ip6_hops; 633c793af95Ssangeeta } 634bd670b35SErik Nordmark error = ip_output_simple(mp, &ixas); 635bd670b35SErik Nordmark ixa_cleanup(&ixas); 636c793af95Ssangeeta 637f4b3ec61Sdh155122 netstack_rele(ns); 638bd670b35SErik Nordmark switch (error) { 639bd670b35SErik Nordmark case 0: 640bd670b35SErik Nordmark break; 641bd670b35SErik Nordmark 642bd670b35SErik Nordmark case EHOSTUNREACH: 643bd670b35SErik Nordmark case ENETUNREACH: 644bd670b35SErik Nordmark error = ENONET; 645bd670b35SErik Nordmark break; 
646bd670b35SErik Nordmark 647bd670b35SErik Nordmark default: 648bd670b35SErik Nordmark error = ECOMM; 649bd670b35SErik Nordmark break; 650c793af95Ssangeeta } 651bd670b35SErik Nordmark return (error); 652edd26dc5Sdr146992 } 653edd26dc5Sdr146992 654c793af95Ssangeeta /* 655c793af95Ssangeeta * callback function provided by ire_ftable_lookup when calling 656c793af95Ssangeeta * rn_match_args(). Invoke ire_match_args on each matching leaf node in 657c793af95Ssangeeta * the radix tree. 658c793af95Ssangeeta */ 659c793af95Ssangeeta boolean_t 660c793af95Ssangeeta ire_find_best_route(struct radix_node *rn, void *arg) 661c793af95Ssangeeta { 662c793af95Ssangeeta struct rt_entry *rt = (struct rt_entry *)rn; 663c793af95Ssangeeta irb_t *irb_ptr; 664c793af95Ssangeeta ire_t *ire; 665c793af95Ssangeeta ire_ftable_args_t *margs = arg; 666c793af95Ssangeeta ipaddr_t match_mask; 667c793af95Ssangeeta 668c793af95Ssangeeta ASSERT(rt != NULL); 669c793af95Ssangeeta 670c793af95Ssangeeta irb_ptr = &rt->rt_irb; 671c793af95Ssangeeta 672c793af95Ssangeeta if (irb_ptr->irb_ire_cnt == 0) 673c793af95Ssangeeta return (B_FALSE); 674c793af95Ssangeeta 675c793af95Ssangeeta rw_enter(&irb_ptr->irb_lock, RW_READER); 676c793af95Ssangeeta for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) { 677bd670b35SErik Nordmark if (IRE_IS_CONDEMNED(ire)) 678c793af95Ssangeeta continue; 67944b099c4SSowmini Varadhan ASSERT((margs->ift_flags & MATCH_IRE_SHORTERMASK) == 0); 68044b099c4SSowmini Varadhan if (margs->ift_flags & MATCH_IRE_MASK) 681c793af95Ssangeeta match_mask = margs->ift_mask; 682c793af95Ssangeeta else 683c793af95Ssangeeta match_mask = ire->ire_mask; 684c793af95Ssangeeta 685c793af95Ssangeeta if (ire_match_args(ire, margs->ift_addr, match_mask, 686bd670b35SErik Nordmark margs->ift_gateway, margs->ift_type, margs->ift_ill, 687bd670b35SErik Nordmark margs->ift_zoneid, margs->ift_tsl, 688bd670b35SErik Nordmark margs->ift_flags)) { 689bd670b35SErik Nordmark ire_refhold(ire); 690c793af95Ssangeeta 
rw_exit(&irb_ptr->irb_lock); 691c793af95Ssangeeta margs->ift_best_ire = ire; 692c793af95Ssangeeta return (B_TRUE); 693c793af95Ssangeeta } 694c793af95Ssangeeta } 695c793af95Ssangeeta rw_exit(&irb_ptr->irb_lock); 696c793af95Ssangeeta return (B_FALSE); 697c793af95Ssangeeta } 698c793af95Ssangeeta 699c793af95Ssangeeta /* 700c793af95Ssangeeta * ftable irb_t structures are dynamically allocated, and we need to 701c793af95Ssangeeta * check if the irb_t (and associated ftable tree attachment) needs to 702c793af95Ssangeeta * be cleaned up when the irb_refcnt goes to 0. The conditions that need 703c793af95Ssangeeta * be verified are: 704c793af95Ssangeeta * - no other walkers of the irebucket, i.e., quiescent irb_refcnt, 705c793af95Ssangeeta * - no other threads holding references to ire's in the bucket, 706c793af95Ssangeeta * i.e., irb_nire == 0 707c793af95Ssangeeta * - no active ire's in the bucket, i.e., irb_ire_cnt == 0 708c793af95Ssangeeta * - need to hold the global tree lock and irb_lock in write mode. 709c793af95Ssangeeta */ 710c793af95Ssangeeta void 711c793af95Ssangeeta irb_refrele_ftable(irb_t *irb) 712c793af95Ssangeeta { 713c793af95Ssangeeta for (;;) { 714c793af95Ssangeeta rw_enter(&irb->irb_lock, RW_WRITER); 715c793af95Ssangeeta ASSERT(irb->irb_refcnt != 0); 716c793af95Ssangeeta if (irb->irb_refcnt != 1) { 717c793af95Ssangeeta /* 718c793af95Ssangeeta * Someone has a reference to this radix node 719c793af95Ssangeeta * or there is some bucket walker. 720c793af95Ssangeeta */ 721c793af95Ssangeeta irb->irb_refcnt--; 722c793af95Ssangeeta rw_exit(&irb->irb_lock); 723c793af95Ssangeeta return; 724c793af95Ssangeeta } else { 725c793af95Ssangeeta /* 726c793af95Ssangeeta * There is no other walker, nor is there any 727c793af95Ssangeeta * other thread that holds a direct ref to this 728c793af95Ssangeeta * radix node. Do the clean up if needed. 
Call 729c793af95Ssangeeta * to ire_unlink will clear the IRB_MARK_CONDEMNED flag 730c793af95Ssangeeta */ 731c793af95Ssangeeta if (irb->irb_marks & IRB_MARK_CONDEMNED) { 732c793af95Ssangeeta ire_t *ire_list; 733c793af95Ssangeeta 734c793af95Ssangeeta ire_list = ire_unlink(irb); 735c793af95Ssangeeta rw_exit(&irb->irb_lock); 736c793af95Ssangeeta 737c793af95Ssangeeta if (ire_list != NULL) 738c793af95Ssangeeta ire_cleanup(ire_list); 739c793af95Ssangeeta /* 740c793af95Ssangeeta * more CONDEMNED entries could have 741c793af95Ssangeeta * been added while we dropped the lock, 742c793af95Ssangeeta * so we have to re-check. 743c793af95Ssangeeta */ 744c793af95Ssangeeta continue; 745c793af95Ssangeeta } 746c793af95Ssangeeta 747c793af95Ssangeeta /* 748c793af95Ssangeeta * Now check if there are still any ires 749c793af95Ssangeeta * associated with this radix node. 750c793af95Ssangeeta */ 751c793af95Ssangeeta if (irb->irb_nire != 0) { 752c793af95Ssangeeta /* 753c793af95Ssangeeta * someone is still holding on 754c793af95Ssangeeta * to ires in this bucket 755c793af95Ssangeeta */ 756c793af95Ssangeeta irb->irb_refcnt--; 757c793af95Ssangeeta rw_exit(&irb->irb_lock); 758c793af95Ssangeeta return; 759c793af95Ssangeeta } else { 760c793af95Ssangeeta /* 761c793af95Ssangeeta * Everything is clear. Zero walkers, 762c793af95Ssangeeta * Zero threads with a ref to this 763c793af95Ssangeeta * radix node, Zero ires associated with 764c793af95Ssangeeta * this radix node. Due to lock order, 765c793af95Ssangeeta * check the above conditions again 766c793af95Ssangeeta * after grabbing all locks in the right order 767c793af95Ssangeeta */ 768c793af95Ssangeeta rw_exit(&irb->irb_lock); 769c793af95Ssangeeta if (irb_inactive(irb)) 770c793af95Ssangeeta return; 771c793af95Ssangeeta /* 772c793af95Ssangeeta * irb_inactive could not free the irb. 773c793af95Ssangeeta * See if there are any walkers, if not 774c793af95Ssangeeta * try to clean up again. 
775c793af95Ssangeeta */ 776c793af95Ssangeeta } 777c793af95Ssangeeta } 778c793af95Ssangeeta } 779c793af95Ssangeeta } 780c793af95Ssangeeta 781c793af95Ssangeeta /* 782bd670b35SErik Nordmark * IRE iterator used by ire_ftable_lookup to process multiple equal 783bd670b35SErik Nordmark * routes. Given a starting point in the hash list (hash), walk the IREs 784bd670b35SErik Nordmark * in the bucket skipping deleted entries. We treat the bucket as a circular 785bd670b35SErik Nordmark * list for the purposes of walking it. 786bd670b35SErik Nordmark * Returns the IRE (held) that corresponds to the hash value. If that IRE is 787bd670b35SErik Nordmark * not applicable (ire_match_args failed) then it returns a subsequent one. 788bd670b35SErik Nordmark * If we fail to find an IRE we return NULL. 789c793af95Ssangeeta * 790bd670b35SErik Nordmark * Assumes that the caller holds a reference on the IRE bucket and a read lock 791bd670b35SErik Nordmark * on the radix_node_head (for IPv4) or the ip6_ire_head (for IPv6). 792bd670b35SErik Nordmark * 793bd670b35SErik Nordmark * Applies to IPv4 and IPv6. 794bd670b35SErik Nordmark * 795bd670b35SErik Nordmark * For CGTP, where an IRE_BROADCAST and IRE_HOST can exist for the same 796bd670b35SErik Nordmark * address and bucket, we compare against ire_type for the orig_ire. We also 797bd670b35SErik Nordmark * have IRE_BROADCASTs with and without RTF_MULTIRT, with the former being 798188e1664SErik Nordmark * first in the bucket. Thus we compare that RTF_MULTIRT match the orig_ire. 799bd670b35SErik Nordmark * 800bd670b35SErik Nordmark * Due to shared-IP zones we check that an IRE_OFFLINK has a gateway that is 801bd670b35SErik Nordmark * reachable from the zone i.e., that the ire_gateway_addr is in a subnet 802bd670b35SErik Nordmark * in which the zone has an IP address. We check this for the global zone 803bd670b35SErik Nordmark * even if no shared-IP zones are configured. 
804c793af95Ssangeeta */ 805c793af95Ssangeeta ire_t * 806bd670b35SErik Nordmark ire_round_robin(irb_t *irb_ptr, ire_ftable_args_t *margs, uint_t hash, 807bd670b35SErik Nordmark ire_t *orig_ire, ip_stack_t *ipst) 808c793af95Ssangeeta { 809c793af95Ssangeeta ire_t *ire, *maybe_ire = NULL; 810bd670b35SErik Nordmark uint_t maybe_badcnt; 811bd670b35SErik Nordmark uint_t maxwalk; 812bd670b35SErik Nordmark 813bd670b35SErik Nordmark /* Fold in more bits from the hint/hash */ 814bd670b35SErik Nordmark hash = hash ^ (hash >> 8) ^ (hash >> 16); 815c793af95Ssangeeta 816c793af95Ssangeeta rw_enter(&irb_ptr->irb_lock, RW_WRITER); 817bd670b35SErik Nordmark maxwalk = irb_ptr->irb_ire_cnt; /* Excludes condemned */ 818*82dec0a6SRobert Mustacchi if (maxwalk == 0) { 819*82dec0a6SRobert Mustacchi rw_exit(&irb_ptr->irb_lock); 820*82dec0a6SRobert Mustacchi return (NULL); 821*82dec0a6SRobert Mustacchi } 822*82dec0a6SRobert Mustacchi 823bd670b35SErik Nordmark hash %= maxwalk; 824bd670b35SErik Nordmark irb_refhold_locked(irb_ptr); 825c793af95Ssangeeta rw_exit(&irb_ptr->irb_lock); 826c793af95Ssangeeta 827c793af95Ssangeeta /* 828c793af95Ssangeeta * Round-robin the routers list looking for a route that 829c793af95Ssangeeta * matches the passed in parameters. 830bd670b35SErik Nordmark * First we skip "hash" number of non-condemned IREs. 831bd670b35SErik Nordmark * Then we match the IRE. 832bd670b35SErik Nordmark * If we find an ire which has a non-zero ire_badcnt then we remember 833bd670b35SErik Nordmark * it and keep on looking for a lower ire_badcnt. 834bd670b35SErik Nordmark * If we come to the end of the list we continue (treat the 835bd670b35SErik Nordmark * bucket list as a circular list) but we match less than "max" 836bd670b35SErik Nordmark * entries. 
837c793af95Ssangeeta */ 838bd670b35SErik Nordmark ire = irb_ptr->irb_ire; 839bd670b35SErik Nordmark while (maxwalk > 0) { 840bd670b35SErik Nordmark if (IRE_IS_CONDEMNED(ire)) 841bd670b35SErik Nordmark goto next_ire_skip; 842c793af95Ssangeeta 843bd670b35SErik Nordmark /* Skip the first "hash" entries to do ECMP */ 844bd670b35SErik Nordmark if (hash != 0) { 845bd670b35SErik Nordmark hash--; 846bd670b35SErik Nordmark goto next_ire_skip; 847bd670b35SErik Nordmark } 848bd670b35SErik Nordmark 849bd670b35SErik Nordmark /* See CGTP comment above */ 850bd670b35SErik Nordmark if (ire->ire_type != orig_ire->ire_type || 851188e1664SErik Nordmark ((ire->ire_flags ^ orig_ire->ire_flags) & RTF_MULTIRT) != 0) 852c793af95Ssangeeta goto next_ire; 853c793af95Ssangeeta 854c793af95Ssangeeta /* 855bd670b35SErik Nordmark * Note: Since IPv6 has hash buckets instead of radix 856bd670b35SErik Nordmark * buckers we need to explicitly compare the addresses. 857bd670b35SErik Nordmark * That makes this less efficient since we will be called 858bd670b35SErik Nordmark * even if there is no alternatives just because the 859bd670b35SErik Nordmark * bucket has multiple IREs for different addresses. 860c793af95Ssangeeta */ 861bd670b35SErik Nordmark if (ire->ire_ipversion == IPV6_VERSION) { 862bd670b35SErik Nordmark if (!IN6_ARE_ADDR_EQUAL(&orig_ire->ire_addr_v6, 863bd670b35SErik Nordmark &ire->ire_addr_v6)) 864c793af95Ssangeeta goto next_ire; 865c793af95Ssangeeta } 866c793af95Ssangeeta 867c793af95Ssangeeta /* 868bd670b35SErik Nordmark * For some reason find_best_route uses ire_mask. We do 869bd670b35SErik Nordmark * the same. 870bd670b35SErik Nordmark */ 871bd670b35SErik Nordmark if (ire->ire_ipversion == IPV4_VERSION ? 
872bd670b35SErik Nordmark !ire_match_args(ire, margs->ift_addr, 873bd670b35SErik Nordmark ire->ire_mask, margs->ift_gateway, 874bd670b35SErik Nordmark margs->ift_type, margs->ift_ill, margs->ift_zoneid, 875bd670b35SErik Nordmark margs->ift_tsl, margs->ift_flags) : 876bd670b35SErik Nordmark !ire_match_args_v6(ire, &margs->ift_addr_v6, 877bd670b35SErik Nordmark &ire->ire_mask_v6, &margs->ift_gateway_v6, 878bd670b35SErik Nordmark margs->ift_type, margs->ift_ill, margs->ift_zoneid, 879bd670b35SErik Nordmark margs->ift_tsl, margs->ift_flags)) 880bd670b35SErik Nordmark goto next_ire; 881bd670b35SErik Nordmark 882bd670b35SErik Nordmark if (margs->ift_zoneid != ALL_ZONES && 883bd670b35SErik Nordmark (ire->ire_type & IRE_OFFLINK)) { 884bd670b35SErik Nordmark /* 885bd670b35SErik Nordmark * When we're in a zone, we're only 886c793af95Ssangeeta * interested in routers that are 887c793af95Ssangeeta * reachable through ipifs within our zone. 888c793af95Ssangeeta */ 889bd670b35SErik Nordmark if (ire->ire_ipversion == IPV4_VERSION) { 890bd670b35SErik Nordmark if (!ire_gateway_ok_zone_v4( 891bd670b35SErik Nordmark ire->ire_gateway_addr, margs->ift_zoneid, 892bd670b35SErik Nordmark ire->ire_ill, margs->ift_tsl, ipst, 893bd670b35SErik Nordmark B_TRUE)) 894bd670b35SErik Nordmark goto next_ire; 895bd670b35SErik Nordmark } else { 896bd670b35SErik Nordmark if (!ire_gateway_ok_zone_v6( 897bd670b35SErik Nordmark &ire->ire_gateway_addr_v6, 898bd670b35SErik Nordmark margs->ift_zoneid, ire->ire_ill, 899bd670b35SErik Nordmark margs->ift_tsl, ipst, B_TRUE)) 900bd670b35SErik Nordmark goto next_ire; 901bd670b35SErik Nordmark } 902bd670b35SErik Nordmark } 903bd670b35SErik Nordmark mutex_enter(&ire->ire_lock); 904bd670b35SErik Nordmark /* Look for stale ire_badcnt and clear */ 905bd670b35SErik Nordmark if (ire->ire_badcnt != 0 && 906d3d50737SRafael Vanoni (TICK_TO_SEC(ddi_get_lbolt64()) - ire->ire_last_badcnt > 907bd670b35SErik Nordmark ipst->ips_ip_ire_badcnt_lifetime)) 908bd670b35SErik Nordmark 
ire->ire_badcnt = 0; 909bd670b35SErik Nordmark mutex_exit(&ire->ire_lock); 910e11c3f44Smeem 911bd670b35SErik Nordmark if (ire->ire_badcnt == 0) { 912bd670b35SErik Nordmark /* We found one with a zero badcnt; done */ 913bd670b35SErik Nordmark ire_refhold(ire); 914bd670b35SErik Nordmark /* 915bd670b35SErik Nordmark * Care needed since irb_refrele grabs WLOCK to free 916bd670b35SErik Nordmark * the irb_t. 917bd670b35SErik Nordmark */ 918bd670b35SErik Nordmark if (ire->ire_ipversion == IPV4_VERSION) { 919bd670b35SErik Nordmark RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 920bd670b35SErik Nordmark irb_refrele(irb_ptr); 921bd670b35SErik Nordmark RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); 922bd670b35SErik Nordmark } else { 923bd670b35SErik Nordmark rw_exit(&ipst->ips_ip6_ire_head_lock); 924bd670b35SErik Nordmark irb_refrele(irb_ptr); 925bd670b35SErik Nordmark rw_enter(&ipst->ips_ip6_ire_head_lock, 926bd670b35SErik Nordmark RW_READER); 927bd670b35SErik Nordmark } 928c793af95Ssangeeta return (ire); 929c793af95Ssangeeta } 930bd670b35SErik Nordmark /* 931bd670b35SErik Nordmark * keep looking to see if there is a better (lower 932bd670b35SErik Nordmark * badcnt) matching IRE, but save this one as a last resort. 933bd670b35SErik Nordmark * If we find a lower badcnt pick that one as the last* resort. 
934bd670b35SErik Nordmark */ 935bd670b35SErik Nordmark if (maybe_ire == NULL) { 936bd670b35SErik Nordmark maybe_ire = ire; 937bd670b35SErik Nordmark maybe_badcnt = ire->ire_badcnt; 938bd670b35SErik Nordmark } else if (ire->ire_badcnt < maybe_badcnt) { 939bd670b35SErik Nordmark maybe_ire = ire; 940bd670b35SErik Nordmark maybe_badcnt = ire->ire_badcnt; 941bd670b35SErik Nordmark } 942bd670b35SErik Nordmark 943c793af95Ssangeeta next_ire: 944bd670b35SErik Nordmark maxwalk--; 945bd670b35SErik Nordmark next_ire_skip: 946bd670b35SErik Nordmark ire = ire->ire_next; 947bd670b35SErik Nordmark if (ire == NULL) 948bd670b35SErik Nordmark ire = irb_ptr->irb_ire; 949c793af95Ssangeeta } 950c793af95Ssangeeta if (maybe_ire != NULL) 951bd670b35SErik Nordmark ire_refhold(maybe_ire); 952bd670b35SErik Nordmark 953bd670b35SErik Nordmark /* Care needed since irb_refrele grabs WLOCK to free the irb_t. */ 954bd670b35SErik Nordmark if (ire->ire_ipversion == IPV4_VERSION) { 955bd670b35SErik Nordmark RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 956bd670b35SErik Nordmark irb_refrele(irb_ptr); 957bd670b35SErik Nordmark RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); 958bd670b35SErik Nordmark } else { 959bd670b35SErik Nordmark rw_exit(&ipst->ips_ip6_ire_head_lock); 960bd670b35SErik Nordmark irb_refrele(irb_ptr); 961bd670b35SErik Nordmark rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER); 962bd670b35SErik Nordmark } 963c793af95Ssangeeta return (maybe_ire); 964c793af95Ssangeeta } 9652679e103Ssowmini 9662679e103Ssowmini void 9672679e103Ssowmini irb_refhold_rn(struct radix_node *rn) 9682679e103Ssowmini { 9692679e103Ssowmini if ((rn->rn_flags & RNF_ROOT) == 0) 970bd670b35SErik Nordmark irb_refhold(&((rt_t *)(rn))->rt_irb); 9712679e103Ssowmini } 9722679e103Ssowmini 9732679e103Ssowmini void 9742679e103Ssowmini irb_refrele_rn(struct radix_node *rn) 9752679e103Ssowmini { 9762679e103Ssowmini if ((rn->rn_flags & RNF_ROOT) == 0) 9772679e103Ssowmini irb_refrele_ftable(&((rt_t *)(rn))->rt_irb); 9782679e103Ssowmini 
} 979bd670b35SErik Nordmark 98044b099c4SSowmini Varadhan 98144b099c4SSowmini Varadhan /* 98244b099c4SSowmini Varadhan * ip_select_src_ill() is used by ip_select_route() to find the src_ill 98344b099c4SSowmini Varadhan * to be used for source-aware routing table lookup. This function will 98444b099c4SSowmini Varadhan * ignore IPIF_UNNUMBERED interface addresses, and will only return a 98544b099c4SSowmini Varadhan * numbered interface (ipif_lookup_addr_nondup() will ignore UNNUMBERED 98644b099c4SSowmini Varadhan * interfaces). 98744b099c4SSowmini Varadhan */ 98844b099c4SSowmini Varadhan static ill_t * 98944b099c4SSowmini Varadhan ip_select_src_ill(const in6_addr_t *v6src, zoneid_t zoneid, ip_stack_t *ipst) 99044b099c4SSowmini Varadhan { 99144b099c4SSowmini Varadhan ipif_t *ipif; 99244b099c4SSowmini Varadhan ill_t *ill; 99344b099c4SSowmini Varadhan boolean_t isv6 = !IN6_IS_ADDR_V4MAPPED(v6src); 99444b099c4SSowmini Varadhan ipaddr_t v4src; 99544b099c4SSowmini Varadhan 99644b099c4SSowmini Varadhan if (isv6) { 99744b099c4SSowmini Varadhan ipif = ipif_lookup_addr_nondup_v6(v6src, NULL, zoneid, ipst); 99844b099c4SSowmini Varadhan } else { 99944b099c4SSowmini Varadhan IN6_V4MAPPED_TO_IPADDR(v6src, v4src); 100044b099c4SSowmini Varadhan ipif = ipif_lookup_addr_nondup(v4src, NULL, zoneid, ipst); 100144b099c4SSowmini Varadhan } 100244b099c4SSowmini Varadhan if (ipif == NULL) 100344b099c4SSowmini Varadhan return (NULL); 100444b099c4SSowmini Varadhan ill = ipif->ipif_ill; 100544b099c4SSowmini Varadhan ill_refhold(ill); 100644b099c4SSowmini Varadhan ipif_refrele(ipif); 100744b099c4SSowmini Varadhan return (ill); 100844b099c4SSowmini Varadhan } 100944b099c4SSowmini Varadhan 101044b099c4SSowmini Varadhan /* 101144b099c4SSowmini Varadhan * verify that v6src is configured on ill 101244b099c4SSowmini Varadhan */ 101344b099c4SSowmini Varadhan static boolean_t 101444b099c4SSowmini Varadhan ip_verify_src_on_ill(const in6_addr_t v6src, ill_t *ill, zoneid_t zoneid) 101544b099c4SSowmini 
Varadhan { 101644b099c4SSowmini Varadhan ipif_t *ipif; 101744b099c4SSowmini Varadhan ip_stack_t *ipst; 101844b099c4SSowmini Varadhan ipaddr_t v4src; 101944b099c4SSowmini Varadhan 102044b099c4SSowmini Varadhan if (ill == NULL) 102144b099c4SSowmini Varadhan return (B_FALSE); 102244b099c4SSowmini Varadhan ipst = ill->ill_ipst; 102344b099c4SSowmini Varadhan 102444b099c4SSowmini Varadhan if (ill->ill_isv6) { 102544b099c4SSowmini Varadhan ipif = ipif_lookup_addr_nondup_v6(&v6src, ill, zoneid, ipst); 102644b099c4SSowmini Varadhan } else { 102744b099c4SSowmini Varadhan IN6_V4MAPPED_TO_IPADDR(&v6src, v4src); 102844b099c4SSowmini Varadhan ipif = ipif_lookup_addr_nondup(v4src, ill, zoneid, ipst); 102944b099c4SSowmini Varadhan } 103044b099c4SSowmini Varadhan 103144b099c4SSowmini Varadhan if (ipif != NULL) { 103244b099c4SSowmini Varadhan ipif_refrele(ipif); 103344b099c4SSowmini Varadhan return (B_TRUE); 103444b099c4SSowmini Varadhan } else { 103544b099c4SSowmini Varadhan return (B_FALSE); 103644b099c4SSowmini Varadhan } 103744b099c4SSowmini Varadhan } 103844b099c4SSowmini Varadhan 1039bd670b35SErik Nordmark /* 1040bd670b35SErik Nordmark * Select a route for IPv4 and IPv6. Except for multicast, loopback and reject 1041bd670b35SErik Nordmark * routes this routine sets up a ire_nce_cache as well. The caller needs to 1042bd670b35SErik Nordmark * lookup an nce for the multicast case. 104344b099c4SSowmini Varadhan * 104444b099c4SSowmini Varadhan * When src_multihoming is set to 2 (strict src multihoming) we use the source 104544b099c4SSowmini Varadhan * address to select the interface and route. If IP_BOUND_IF etc are 104644b099c4SSowmini Varadhan * specified, we require that they specify an interface on which the 104744b099c4SSowmini Varadhan * source address is assigned. 
104844b099c4SSowmini Varadhan * 104944b099c4SSowmini Varadhan * When src_multihoming is set to 1 (preferred src aware route 105044b099c4SSowmini Varadhan * selection) the unicast lookup prefers a matching source 105144b099c4SSowmini Varadhan * (i.e., that the route points out an ill on which the source is assigned), but 105244b099c4SSowmini Varadhan * if no such route is found we fallback to not considering the source in the 105344b099c4SSowmini Varadhan * route lookup. 105444b099c4SSowmini Varadhan * 105544b099c4SSowmini Varadhan * We skip the src_multihoming check when the source isn't (yet) set, and 105644b099c4SSowmini Varadhan * when IXAF_VERIFY_SOURCE is not set. The latter allows RAW sockets to send 105744b099c4SSowmini Varadhan * with bogus source addresses as allowed by IP_HDRINCL and IPV6_PKTINFO 105844b099c4SSowmini Varadhan * when secpolicy_net_rawaccess(). 1059bd670b35SErik Nordmark */ 1060bd670b35SErik Nordmark ire_t * 106144b099c4SSowmini Varadhan ip_select_route(const in6_addr_t *v6dst, const in6_addr_t v6src, 106244b099c4SSowmini Varadhan ip_xmit_attr_t *ixa, uint_t *generationp, in6_addr_t *setsrcp, 106344b099c4SSowmini Varadhan int *errorp, boolean_t *multirtp) 1064bd670b35SErik Nordmark { 1065bd670b35SErik Nordmark uint_t match_args; 1066bd670b35SErik Nordmark uint_t ire_type; 106744b099c4SSowmini Varadhan ill_t *ill = NULL; 1068bd670b35SErik Nordmark ire_t *ire; 1069bd670b35SErik Nordmark ip_stack_t *ipst = ixa->ixa_ipst; 1070bd670b35SErik Nordmark ipaddr_t v4dst; 1071bd670b35SErik Nordmark in6_addr_t v6nexthop; 1072bd670b35SErik Nordmark iaflags_t ixaflags = ixa->ixa_flags; 1073bd670b35SErik Nordmark nce_t *nce; 107444b099c4SSowmini Varadhan boolean_t preferred_src_aware = B_FALSE; 107544b099c4SSowmini Varadhan boolean_t verify_src; 107644b099c4SSowmini Varadhan boolean_t isv6 = !(ixa->ixa_flags & IXAF_IS_IPV4); 107744b099c4SSowmini Varadhan int src_multihoming = IP_SRC_MULTIHOMING(isv6, ipst); 107844b099c4SSowmini Varadhan 
107944b099c4SSowmini Varadhan /* 108044b099c4SSowmini Varadhan * We only verify that the src has been configured on a selected 108144b099c4SSowmini Varadhan * interface if the src is not :: or INADDR_ANY, and if the 108244b099c4SSowmini Varadhan * IXAF_VERIFY_SOURCE flag is set. 108344b099c4SSowmini Varadhan */ 108444b099c4SSowmini Varadhan verify_src = (!V6_OR_V4_INADDR_ANY(v6src) && 108544b099c4SSowmini Varadhan (ixa->ixa_flags & IXAF_VERIFY_SOURCE)); 1086bd670b35SErik Nordmark 1087bd670b35SErik Nordmark match_args = MATCH_IRE_SECATTR; 1088bd670b35SErik Nordmark IN6_V4MAPPED_TO_IPADDR(v6dst, v4dst); 1089bd670b35SErik Nordmark if (setsrcp != NULL) 1090bd670b35SErik Nordmark ASSERT(IN6_IS_ADDR_UNSPECIFIED(setsrcp)); 1091bd670b35SErik Nordmark if (errorp != NULL) 1092bd670b35SErik Nordmark ASSERT(*errorp == 0); 1093bd670b35SErik Nordmark 1094bd670b35SErik Nordmark /* 1095bd670b35SErik Nordmark * The content of the ixa will be different if IP_NEXTHOP, 1096bd670b35SErik Nordmark * SO_DONTROUTE, IP_BOUND_IF, IP_PKTINFO etc are set 1097bd670b35SErik Nordmark */ 1098bd670b35SErik Nordmark 109944b099c4SSowmini Varadhan if (isv6 ? 
IN6_IS_ADDR_MULTICAST(v6dst) : CLASSD(v4dst)) { 1100bd670b35SErik Nordmark /* Pick up the IRE_MULTICAST for the ill */ 1101bd670b35SErik Nordmark if (ixa->ixa_multicast_ifindex != 0) { 1102bd670b35SErik Nordmark ill = ill_lookup_on_ifindex(ixa->ixa_multicast_ifindex, 110344b099c4SSowmini Varadhan isv6, ipst); 1104bd670b35SErik Nordmark } else if (ixaflags & IXAF_SCOPEID_SET) { 1105bd670b35SErik Nordmark /* sin6_scope_id takes precedence over ixa_ifindex */ 1106bd670b35SErik Nordmark ASSERT(ixa->ixa_scopeid != 0); 1107bd670b35SErik Nordmark ill = ill_lookup_on_ifindex(ixa->ixa_scopeid, 110844b099c4SSowmini Varadhan isv6, ipst); 1109bd670b35SErik Nordmark } else if (ixa->ixa_ifindex != 0) { 1110bd670b35SErik Nordmark /* 1111bd670b35SErik Nordmark * In the ipmp case, the ixa_ifindex is set to 1112bd670b35SErik Nordmark * point at an under_ill and we would return the 1113bd670b35SErik Nordmark * ire_multicast() corresponding to that under_ill. 1114bd670b35SErik Nordmark */ 1115bd670b35SErik Nordmark ill = ill_lookup_on_ifindex(ixa->ixa_ifindex, 111644b099c4SSowmini Varadhan isv6, ipst); 111744b099c4SSowmini Varadhan } else if (src_multihoming != 0 && verify_src) { 111844b099c4SSowmini Varadhan /* Look up the ill based on the source address */ 111944b099c4SSowmini Varadhan ill = ip_select_src_ill(&v6src, ixa->ixa_zoneid, ipst); 112044b099c4SSowmini Varadhan /* 112144b099c4SSowmini Varadhan * Since we looked up the ill from the source there 112244b099c4SSowmini Varadhan * is no need to verify that the source is on the ill 112344b099c4SSowmini Varadhan * below. 
112444b099c4SSowmini Varadhan */ 112544b099c4SSowmini Varadhan verify_src = B_FALSE; 112644b099c4SSowmini Varadhan if (ill != NULL && IS_VNI(ill)) { 112744b099c4SSowmini Varadhan ill_t *usesrc = ill; 112844b099c4SSowmini Varadhan 112944b099c4SSowmini Varadhan ill = ill_lookup_usesrc(usesrc); 113044b099c4SSowmini Varadhan ill_refrele(usesrc); 113144b099c4SSowmini Varadhan } 113244b099c4SSowmini Varadhan } else if (!isv6) { 1133bd670b35SErik Nordmark ipaddr_t v4setsrc = INADDR_ANY; 1134bd670b35SErik Nordmark 113544b099c4SSowmini Varadhan ill = ill_lookup_group_v4(v4dst, ixa->ixa_zoneid, 113644b099c4SSowmini Varadhan ipst, multirtp, &v4setsrc); 1137bd670b35SErik Nordmark if (setsrcp != NULL) 1138bd670b35SErik Nordmark IN6_IPADDR_TO_V4MAPPED(v4setsrc, setsrcp); 1139bd670b35SErik Nordmark } else { 114044b099c4SSowmini Varadhan ill = ill_lookup_group_v6(v6dst, ixa->ixa_zoneid, 114144b099c4SSowmini Varadhan ipst, multirtp, setsrcp); 1142bd670b35SErik Nordmark } 1143bd670b35SErik Nordmark if (ill != NULL && IS_VNI(ill)) { 1144bd670b35SErik Nordmark ill_refrele(ill); 1145bd670b35SErik Nordmark ill = NULL; 1146bd670b35SErik Nordmark } 1147bd670b35SErik Nordmark if (ill == NULL) { 1148bd670b35SErik Nordmark if (errorp != NULL) 1149bd670b35SErik Nordmark *errorp = ENXIO; 1150bd670b35SErik Nordmark /* Get a hold on the IRE_NOROUTE */ 115144b099c4SSowmini Varadhan ire = ire_reject(ipst, isv6); 1152bd670b35SErik Nordmark return (ire); 1153bd670b35SErik Nordmark } 1154bd670b35SErik Nordmark if (!(ill->ill_flags & ILLF_MULTICAST)) { 1155bd670b35SErik Nordmark ill_refrele(ill); 1156bd670b35SErik Nordmark if (errorp != NULL) 1157bd670b35SErik Nordmark *errorp = EHOSTUNREACH; 1158bd670b35SErik Nordmark /* Get a hold on the IRE_NOROUTE */ 115944b099c4SSowmini Varadhan ire = ire_reject(ipst, isv6); 116044b099c4SSowmini Varadhan return (ire); 116144b099c4SSowmini Varadhan } 116244b099c4SSowmini Varadhan /* 116344b099c4SSowmini Varadhan * If we are doing the strictest src_multihoming, 
then 116444b099c4SSowmini Varadhan * we check that IP_MULTICAST_IF, IP_BOUND_IF, etc specify 116544b099c4SSowmini Varadhan * an interface that is consistent with the source address. 116644b099c4SSowmini Varadhan */ 116744b099c4SSowmini Varadhan if (verify_src && src_multihoming == 2 && 116844b099c4SSowmini Varadhan !ip_verify_src_on_ill(v6src, ill, ixa->ixa_zoneid)) { 116944b099c4SSowmini Varadhan if (errorp != NULL) 117044b099c4SSowmini Varadhan *errorp = EADDRNOTAVAIL; 117144b099c4SSowmini Varadhan ill_refrele(ill); 117244b099c4SSowmini Varadhan /* Get a hold on the IRE_NOROUTE */ 117344b099c4SSowmini Varadhan ire = ire_reject(ipst, isv6); 1174bd670b35SErik Nordmark return (ire); 1175bd670b35SErik Nordmark } 1176bd670b35SErik Nordmark /* Get a refcnt on the single IRE_MULTICAST per ill */ 1177bd670b35SErik Nordmark ire = ire_multicast(ill); 1178bd670b35SErik Nordmark ill_refrele(ill); 1179bd670b35SErik Nordmark if (generationp != NULL) 1180bd670b35SErik Nordmark *generationp = ire->ire_generation; 1181bd670b35SErik Nordmark if (errorp != NULL && 1182bd670b35SErik Nordmark (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) { 1183bd670b35SErik Nordmark *errorp = EHOSTUNREACH; 1184bd670b35SErik Nordmark } 1185bd670b35SErik Nordmark return (ire); 1186bd670b35SErik Nordmark } 1187bd670b35SErik Nordmark 118844b099c4SSowmini Varadhan /* Now for unicast */ 1189bd670b35SErik Nordmark if (ixa->ixa_ifindex != 0 || (ixaflags & IXAF_SCOPEID_SET)) { 1190bd670b35SErik Nordmark if (ixaflags & IXAF_SCOPEID_SET) { 1191bd670b35SErik Nordmark /* sin6_scope_id takes precedence over ixa_ifindex */ 1192bd670b35SErik Nordmark ASSERT(ixa->ixa_scopeid != 0); 1193bd670b35SErik Nordmark ill = ill_lookup_on_ifindex(ixa->ixa_scopeid, 119444b099c4SSowmini Varadhan isv6, ipst); 1195bd670b35SErik Nordmark } else { 1196bd670b35SErik Nordmark ASSERT(ixa->ixa_ifindex != 0); 1197bd670b35SErik Nordmark ill = ill_lookup_on_ifindex(ixa->ixa_ifindex, 119844b099c4SSowmini Varadhan isv6, ipst); 
1199bd670b35SErik Nordmark } 1200bd670b35SErik Nordmark if (ill != NULL && IS_VNI(ill)) { 1201bd670b35SErik Nordmark ill_refrele(ill); 1202bd670b35SErik Nordmark ill = NULL; 1203bd670b35SErik Nordmark } 1204bd670b35SErik Nordmark if (ill == NULL) { 1205bd670b35SErik Nordmark if (errorp != NULL) 1206bd670b35SErik Nordmark *errorp = ENXIO; 1207bd670b35SErik Nordmark /* Get a hold on the IRE_NOROUTE */ 120844b099c4SSowmini Varadhan ire = ire_reject(ipst, isv6); 1209bd670b35SErik Nordmark return (ire); 1210bd670b35SErik Nordmark } 121144b099c4SSowmini Varadhan 121244b099c4SSowmini Varadhan match_args |= MATCH_IRE_ILL; 121344b099c4SSowmini Varadhan 1214bd670b35SErik Nordmark /* 1215bd670b35SErik Nordmark * icmp_send_reply_v6 uses scopeid, and mpathd sets IP*_BOUND_IF 1216bd670b35SErik Nordmark * so for both of them we need to be able look for an under 1217bd670b35SErik Nordmark * interface. 1218bd670b35SErik Nordmark */ 1219bd670b35SErik Nordmark if (IS_UNDER_IPMP(ill)) 1220bd670b35SErik Nordmark match_args |= MATCH_IRE_TESTHIDDEN; 122144b099c4SSowmini Varadhan 122244b099c4SSowmini Varadhan /* 122344b099c4SSowmini Varadhan * If we are doing the strictest src_multihoming, then 122444b099c4SSowmini Varadhan * we check that IP_BOUND_IF, IP_PKTINFO, etc specify 122544b099c4SSowmini Varadhan * an interface that is consistent with the source address. 
122644b099c4SSowmini Varadhan */ 122744b099c4SSowmini Varadhan if (src_multihoming == 2 && 122844b099c4SSowmini Varadhan !ip_verify_src_on_ill(v6src, ill, ixa->ixa_zoneid)) { 122944b099c4SSowmini Varadhan if (errorp != NULL) 123044b099c4SSowmini Varadhan *errorp = EADDRNOTAVAIL; 123144b099c4SSowmini Varadhan ill_refrele(ill); 123244b099c4SSowmini Varadhan /* Get a hold on the IRE_NOROUTE */ 123344b099c4SSowmini Varadhan ire = ire_reject(ipst, isv6); 123444b099c4SSowmini Varadhan return (ire); 123544b099c4SSowmini Varadhan } 123644b099c4SSowmini Varadhan } else if (src_multihoming != 0 && verify_src) { 123744b099c4SSowmini Varadhan /* Look up the ill based on the source address */ 123844b099c4SSowmini Varadhan ill = ip_select_src_ill(&v6src, ixa->ixa_zoneid, ipst); 123944b099c4SSowmini Varadhan if (ill == NULL) { 124044b099c4SSowmini Varadhan char addrbuf[INET6_ADDRSTRLEN]; 124144b099c4SSowmini Varadhan 124244b099c4SSowmini Varadhan ip3dbg(("%s not a valid src for unicast", 124344b099c4SSowmini Varadhan inet_ntop(AF_INET6, &v6src, addrbuf, 124444b099c4SSowmini Varadhan sizeof (addrbuf)))); 124544b099c4SSowmini Varadhan if (errorp != NULL) 124644b099c4SSowmini Varadhan *errorp = EADDRNOTAVAIL; 124744b099c4SSowmini Varadhan /* Get a hold on the IRE_NOROUTE */ 124844b099c4SSowmini Varadhan ire = ire_reject(ipst, isv6); 124944b099c4SSowmini Varadhan return (ire); 125044b099c4SSowmini Varadhan } 125144b099c4SSowmini Varadhan match_args |= MATCH_IRE_SRC_ILL; 125244b099c4SSowmini Varadhan preferred_src_aware = (src_multihoming == 1); 1253bd670b35SErik Nordmark } 1254bd670b35SErik Nordmark 1255bd670b35SErik Nordmark if (ixaflags & IXAF_NEXTHOP_SET) { 1256bd670b35SErik Nordmark /* IP_NEXTHOP was set */ 1257bd670b35SErik Nordmark v6nexthop = ixa->ixa_nexthop_v6; 1258bd670b35SErik Nordmark } else { 1259bd670b35SErik Nordmark v6nexthop = *v6dst; 1260bd670b35SErik Nordmark } 1261bd670b35SErik Nordmark 1262bd670b35SErik Nordmark ire_type = 0; 1263bd670b35SErik Nordmark 
1264bd670b35SErik Nordmark /* 1265bd670b35SErik Nordmark * If SO_DONTROUTE is set or if IP_NEXTHOP is set, then 1266bd670b35SErik Nordmark * we only look for an onlink IRE. 1267bd670b35SErik Nordmark */ 1268bd670b35SErik Nordmark if (ixaflags & (IXAF_DONTROUTE|IXAF_NEXTHOP_SET)) { 1269bd670b35SErik Nordmark match_args |= MATCH_IRE_TYPE; 1270bd670b35SErik Nordmark ire_type = IRE_ONLINK; 1271bd670b35SErik Nordmark } 1272bd670b35SErik Nordmark 127344b099c4SSowmini Varadhan retry: 127444b099c4SSowmini Varadhan if (!isv6) { 1275bd670b35SErik Nordmark ipaddr_t v4nexthop; 1276bd670b35SErik Nordmark ipaddr_t v4setsrc = INADDR_ANY; 1277bd670b35SErik Nordmark 1278bd670b35SErik Nordmark IN6_V4MAPPED_TO_IPADDR(&v6nexthop, v4nexthop); 1279bd670b35SErik Nordmark ire = ire_route_recursive_v4(v4nexthop, ire_type, ill, 12809e3469d3SErik Nordmark ixa->ixa_zoneid, ixa->ixa_tsl, match_args, IRR_ALLOCATE, 1281bd670b35SErik Nordmark ixa->ixa_xmit_hint, ipst, &v4setsrc, NULL, generationp); 1282bd670b35SErik Nordmark if (setsrcp != NULL) 1283bd670b35SErik Nordmark IN6_IPADDR_TO_V4MAPPED(v4setsrc, setsrcp); 1284bd670b35SErik Nordmark } else { 1285bd670b35SErik Nordmark ire = ire_route_recursive_v6(&v6nexthop, ire_type, ill, 12869e3469d3SErik Nordmark ixa->ixa_zoneid, ixa->ixa_tsl, match_args, IRR_ALLOCATE, 1287bd670b35SErik Nordmark ixa->ixa_xmit_hint, ipst, setsrcp, NULL, generationp); 1288bd670b35SErik Nordmark } 1289bd670b35SErik Nordmark 1290bd670b35SErik Nordmark #ifdef DEBUG 1291bd670b35SErik Nordmark if (match_args & MATCH_IRE_TESTHIDDEN) { 1292bd670b35SErik Nordmark ip3dbg(("looking for hidden; dst %x ire %p\n", 1293bd670b35SErik Nordmark v4dst, (void *)ire)); 1294bd670b35SErik Nordmark } 1295bd670b35SErik Nordmark #endif 129644b099c4SSowmini Varadhan if (ill != NULL) { 1297bd670b35SErik Nordmark ill_refrele(ill); 129844b099c4SSowmini Varadhan ill = NULL; 129944b099c4SSowmini Varadhan } 1300bd670b35SErik Nordmark if ((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) || 
1301bd670b35SErik Nordmark (ire->ire_type & IRE_MULTICAST)) { 130244b099c4SSowmini Varadhan if (preferred_src_aware) { 130344b099c4SSowmini Varadhan /* 130444b099c4SSowmini Varadhan * "Preferred Source Aware" send mode. If we cannot 130544b099c4SSowmini Varadhan * find an ire whose ire_ill had the desired source 130644b099c4SSowmini Varadhan * address retry after relaxing the ill matching 130744b099c4SSowmini Varadhan * constraint. 130844b099c4SSowmini Varadhan */ 130944b099c4SSowmini Varadhan ire_refrele(ire); 131044b099c4SSowmini Varadhan preferred_src_aware = B_FALSE; 131144b099c4SSowmini Varadhan match_args &= ~MATCH_IRE_SRC_ILL; 131244b099c4SSowmini Varadhan goto retry; 131344b099c4SSowmini Varadhan } 1314bd670b35SErik Nordmark /* No ire_nce_cache */ 1315bd670b35SErik Nordmark return (ire); 1316bd670b35SErik Nordmark } 1317bd670b35SErik Nordmark 1318bd670b35SErik Nordmark /* Setup ire_nce_cache if it doesn't exist or is condemned. */ 1319bd670b35SErik Nordmark mutex_enter(&ire->ire_lock); 1320bd670b35SErik Nordmark nce = ire->ire_nce_cache; 1321bd670b35SErik Nordmark if (nce == NULL || nce->nce_is_condemned) { 1322bd670b35SErik Nordmark mutex_exit(&ire->ire_lock); 1323bd670b35SErik Nordmark (void) ire_revalidate_nce(ire); 1324bd670b35SErik Nordmark } else { 1325bd670b35SErik Nordmark mutex_exit(&ire->ire_lock); 1326bd670b35SErik Nordmark } 1327bd670b35SErik Nordmark return (ire); 1328bd670b35SErik Nordmark } 1329bd670b35SErik Nordmark 1330bd670b35SErik Nordmark /* 1331bd670b35SErik Nordmark * Find a route given some xmit attributes and a packet. 1332bd670b35SErik Nordmark * Generic for IPv4 and IPv6 1333bd670b35SErik Nordmark * 1334bd670b35SErik Nordmark * This never returns NULL. But when it returns the IRE_NOROUTE 1335bd670b35SErik Nordmark * it might set errorp. 
1336bd670b35SErik Nordmark */ 1337bd670b35SErik Nordmark ire_t * 1338bd670b35SErik Nordmark ip_select_route_pkt(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp, 1339bd670b35SErik Nordmark int *errorp, boolean_t *multirtp) 1340bd670b35SErik Nordmark { 1341bd670b35SErik Nordmark if (ixa->ixa_flags & IXAF_IS_IPV4) { 1342bd670b35SErik Nordmark ipha_t *ipha = (ipha_t *)mp->b_rptr; 134344b099c4SSowmini Varadhan in6_addr_t v6dst, v6src; 1344bd670b35SErik Nordmark 1345bd670b35SErik Nordmark IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst); 134644b099c4SSowmini Varadhan IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6src); 1347bd670b35SErik Nordmark 134844b099c4SSowmini Varadhan return (ip_select_route(&v6dst, v6src, ixa, generationp, 1349bd670b35SErik Nordmark NULL, errorp, multirtp)); 1350bd670b35SErik Nordmark } else { 1351bd670b35SErik Nordmark ip6_t *ip6h = (ip6_t *)mp->b_rptr; 1352bd670b35SErik Nordmark 135344b099c4SSowmini Varadhan return (ip_select_route(&ip6h->ip6_dst, ip6h->ip6_src, 135444b099c4SSowmini Varadhan ixa, generationp, NULL, errorp, multirtp)); 1355bd670b35SErik Nordmark } 1356bd670b35SErik Nordmark } 1357bd670b35SErik Nordmark 1358bd670b35SErik Nordmark ire_t * 135944b099c4SSowmini Varadhan ip_select_route_v4(ipaddr_t dst, ipaddr_t src, ip_xmit_attr_t *ixa, 136044b099c4SSowmini Varadhan uint_t *generationp, ipaddr_t *v4setsrcp, int *errorp, boolean_t *multirtp) 1361bd670b35SErik Nordmark { 136244b099c4SSowmini Varadhan in6_addr_t v6dst, v6src; 1363bd670b35SErik Nordmark ire_t *ire; 1364bd670b35SErik Nordmark in6_addr_t setsrc; 1365bd670b35SErik Nordmark 1366bd670b35SErik Nordmark ASSERT(ixa->ixa_flags & IXAF_IS_IPV4); 1367bd670b35SErik Nordmark 1368bd670b35SErik Nordmark IN6_IPADDR_TO_V4MAPPED(dst, &v6dst); 136944b099c4SSowmini Varadhan IN6_IPADDR_TO_V4MAPPED(src, &v6src); 1370bd670b35SErik Nordmark 1371bd670b35SErik Nordmark setsrc = ipv6_all_zeros; 137244b099c4SSowmini Varadhan ire = ip_select_route(&v6dst, v6src, ixa, generationp, &setsrc, errorp, 
1373bd670b35SErik Nordmark multirtp); 1374bd670b35SErik Nordmark if (v4setsrcp != NULL) 1375bd670b35SErik Nordmark IN6_V4MAPPED_TO_IPADDR(&setsrc, *v4setsrcp); 1376bd670b35SErik Nordmark return (ire); 1377bd670b35SErik Nordmark } 1378bd670b35SErik Nordmark 1379bd670b35SErik Nordmark /* 1380bd670b35SErik Nordmark * Recursively look for a route to the destination. Can also match on 1381bd670b35SErik Nordmark * the zoneid, ill, and label. Used for the data paths. See also 1382bd670b35SErik Nordmark * ire_route_recursive. 1383bd670b35SErik Nordmark * 13849e3469d3SErik Nordmark * If IRR_ALLOCATE is not set then we will only inspect the existing IREs; never 13859e3469d3SErik Nordmark * create an IRE_IF_CLONE. This is used on the receive side when we are not 13869e3469d3SErik Nordmark * forwarding. 13879e3469d3SErik Nordmark * If IRR_INCOMPLETE is set then we return the IRE even if we can't correctly 13889e3469d3SErik Nordmark * resolve the gateway. 13899e3469d3SErik Nordmark * 1390bd670b35SErik Nordmark * Note that this function never returns NULL. It returns an IRE_NOROUTE 1391bd670b35SErik Nordmark * instead. 1392bd670b35SErik Nordmark * 1393bd670b35SErik Nordmark * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it 1394bd670b35SErik Nordmark * is an error. 1395bd670b35SErik Nordmark * Allow at most one RTF_INDIRECT. 
 */
ire_t *
ire_route_recursive_impl_v4(ire_t *ire,
    ipaddr_t nexthop, uint_t ire_type, const ill_t *ill_arg,
    zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args,
    uint_t irr_flags, uint32_t xmit_hint, ip_stack_t *ipst, ipaddr_t *setsrcp,
    tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp)
{
	/*
	 * ires[] collects, in lookup order, every IRE accepted by the loop
	 * below and generations[] the generation recorded for each of them;
	 * i counts the filled slots. need_refrele is set once we have taken
	 * our own hold on ill. invalidate forces IRE_GENERATION_VERIFY to be
	 * reported on the success path.
	 */
	int		i, j;
	ire_t		*ires[MAX_IRE_RECURSION];
	uint_t		generation;
	uint_t		generations[MAX_IRE_RECURSION];
	boolean_t	need_refrele = B_FALSE;
	boolean_t	invalidate = B_FALSE;
	ill_t		*ill = NULL;
	uint_t		maskoff = (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST);

	/* Output arguments, when supplied, must arrive cleared. */
	if (setsrcp != NULL)
		ASSERT(*setsrcp == INADDR_ANY);
	if (gwattrp != NULL)
		ASSERT(*gwattrp == NULL);

	/*
	 * We iterate up to three times to resolve a route, even though
	 * we have four slots in the array. The extra slot is for an
	 * IRE_IF_CLONE we might need to create.
	 */
	i = 0;
	while (i < MAX_IRE_RECURSION - 1) {
		/* ire_ftable_lookup handles round-robin/ECMP */
		if (ire == NULL) {
			ire = ire_ftable_lookup_v4(nexthop, 0, 0, ire_type,
			    (ill != NULL? ill : ill_arg), zoneid, tsl,
			    match_args, xmit_hint, ipst, &generation);
		} else {
			/* Caller passed it; extra hold since we will rele */
			ire_refhold(ire);
			if (generationp != NULL)
				generation = *generationp;
			else
				generation = IRE_GENERATION_VERIFY;
		}
		if (ire == NULL) {
			/*
			 * Nothing found. With IRR_INCOMPLETE and at least one
			 * IRE already collected, hand back ires[0] (with an
			 * extra hold, since cleanup releases the array's
			 * hold); otherwise hand back the reject IRE.
			 */
			if (i > 0 && (irr_flags & IRR_INCOMPLETE)) {
				ire = ires[0];
				ire_refhold(ire);
			} else {
				ire = ire_reject(ipst, B_FALSE);
			}
			goto error;
		}

		/* Need to return the ire with RTF_REJECT|BLACKHOLE */
		if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))
			goto error;

		ASSERT(!(ire->ire_type & IRE_MULTICAST)); /* Not in ftable */
		/*
		 * Verify that the IRE_IF_CLONE has a consistent generation
		 * number.
		 */
		if ((ire->ire_type & IRE_IF_CLONE) && !ire_clone_verify(ire)) {
			/*
			 * Drop our hold and redo this step with a fresh
			 * lookup; i is not advanced.
			 */
			ire_refrele(ire);
			ire = NULL;
			continue;
		}

		/*
		 * Don't allow anything unusual past the first iteration.
		 * After the first lookup, we should no longer look for
		 * (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST) or RTF_INDIRECT
		 * routes.
		 *
		 * In addition, after we have found a direct IRE_OFFLINK,
		 * we should only look for interface or clone routes.
		 */
		match_args |= MATCH_IRE_DIRECT; /* no more RTF_INDIRECTs */

		if ((ire->ire_type & IRE_OFFLINK) &&
		    !(ire->ire_flags & RTF_INDIRECT)) {
			ire_type = IRE_IF_ALL;
		} else {
			/*
			 * no more local, loopback, broadcast routes
			 */
			if (!(match_args & MATCH_IRE_TYPE))
				ire_type = (IRE_OFFLINK|IRE_ONLINK);
			ire_type &= ~maskoff;
		}
		match_args |= MATCH_IRE_TYPE;

		/* We have a usable IRE */
		ires[i] = ire;
		generations[i] = generation;
		i++;

		/* The first RTF_SETSRC address is passed back if setsrcp */
		if ((ire->ire_flags & RTF_SETSRC) &&
		    setsrcp != NULL && *setsrcp == INADDR_ANY) {
			ASSERT(ire->ire_setsrc_addr != INADDR_ANY);
			*setsrcp = ire->ire_setsrc_addr;
		}

		/* The first ire_gw_secattr is passed back if gwattrp */
		if (ire->ire_gw_secattr != NULL &&
		    gwattrp != NULL && *gwattrp == NULL)
			*gwattrp = ire->ire_gw_secattr;

		/*
		 * Check if we have a short-cut pointer to an IRE for this
		 * destination, and that the cached dependency isn't stale.
		 * In that case we've rejoined an existing tree towards a
		 * parent, thus we don't need to continue the loop to
		 * discover the rest of the tree.
		 */
		mutex_enter(&ire->ire_lock);
		if (ire->ire_dep_parent != NULL &&
		    ire->ire_dep_parent->ire_generation ==
		    ire->ire_dep_parent_generation) {
			mutex_exit(&ire->ire_lock);
			ire = NULL;
			goto done;
		}
		mutex_exit(&ire->ire_lock);

		/*
		 * If this type should have an ire_nce_cache (even if it
		 * doesn't yet have one) then we are done. Includes
		 * IRE_INTERFACE with a full 32 bit mask.
		 */
		if (ire->ire_nce_capable) {
			ire = NULL;
			goto done;
		}
		ASSERT(!(ire->ire_type & IRE_IF_CLONE));
		/*
		 * For an IRE_INTERFACE we create an IRE_IF_CLONE for this
		 * particular destination
		 */
		if (ire->ire_type & IRE_INTERFACE) {
			in6_addr_t	v6nexthop;
			ire_t		*clone;

			ASSERT(ire->ire_masklen != IPV4_ABITS);

			/*
			 * In the case of ip_input and ILLF_FORWARDING not
			 * being set, and in the case of RTM_GET, there is
			 * no point in allocating an IRE_IF_CLONE. We return
			 * the IRE_INTERFACE. Note that !IRR_ALLOCATE can
			 * result in a ire_dep_parent which is IRE_IF_*
			 * without an IRE_IF_CLONE.
			 * We recover from that when we need to send packets
			 * by ensuring that the generations become
			 * IRE_GENERATION_VERIFY in this case.
			 */
			if (!(irr_flags & IRR_ALLOCATE)) {
				invalidate = B_TRUE;
				ire = NULL;
				goto done;
			}

			IN6_IPADDR_TO_V4MAPPED(nexthop, &v6nexthop);

			clone = ire_create_if_clone(ire, &v6nexthop,
			    &generation);
			if (clone == NULL) {
				/*
				 * Temporary failure - no memory.
				 * Don't want caller to cache IRE_NOROUTE.
				 */
				invalidate = B_TRUE;
				ire = ire_blackhole(ipst, B_FALSE);
				goto error;
			}
			/*
			 * Make clone next to last entry and the
			 * IRE_INTERFACE the last in the dependency
			 * chain since the clone depends on the
			 * IRE_INTERFACE.
			 */
			ASSERT(i >= 1);
			ASSERT(i < MAX_IRE_RECURSION);

			ires[i] = ires[i-1];
			generations[i] = generations[i-1];
			ires[i-1] = clone;
			generations[i-1] = generation;
			i++;

			ire = NULL;
			goto done;
		}

		/*
		 * We only match on the type and optionally ILL when
		 * recursing. The type match is used by some callers
		 * to exclude certain types (such as IRE_IF_CLONE or
		 * IRE_LOCAL|IRE_LOOPBACK).
		 *
		 * In the MATCH_IRE_SRC_ILL case, ill_arg may be the 'srcof'
		 * ire->ire_ill, and we want to find the IRE_INTERFACE for
		 * ire_ill, so we set ill to the ire_ill;
		 */
		match_args &= (MATCH_IRE_TYPE | MATCH_IRE_DIRECT);
		nexthop = ire->ire_gateway_addr;
		if (ill == NULL && ire->ire_ill != NULL) {
			ill = ire->ire_ill;
			need_refrele = B_TRUE;
			ill_refhold(ill);
			match_args |= MATCH_IRE_ILL;
		}
		/* Force a fresh lookup of the new nexthop on the next pass. */
		ire = NULL;
	}
	/* Ran out of iterations without resolving the route. */
	ASSERT(ire == NULL);
	ire = ire_reject(ipst, B_FALSE);

error:
	/*
	 * Failure exit. ire is what will be returned: the reject or
	 * blackhole IRE, a route flagged RTF_REJECT|RTF_BLACKHOLE, or
	 * ires[0] when IRR_INCOMPLETE applies.
	 */
	ASSERT(ire != NULL);
	if (need_refrele)
		ill_refrele(ill);

	/*
	 * In the case of MULTIRT we want to try a different IRE the next
	 * time. We let the next packet retry in that case.
	 */
	if (i > 0 && (ires[0]->ire_flags & RTF_MULTIRT))
		(void) ire_no_good(ires[0]);

cleanup:
	/*
	 * Run ire_dep_unbuild over the i collected entries and drop the
	 * hold we have on each of ires[0 .. i-1].
	 */
	ire_dep_unbuild(ires, i);
	for (j = 0; j < i; j++)
		ire_refrele(ires[j]);

	ASSERT((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
	    (irr_flags & IRR_INCOMPLETE));
	/*
	 * Use IRE_GENERATION_VERIFY to ensure that ip_output will redo the
	 * ip_select_route since the reject or lack of memory might be gone.
	 */
	if (generationp != NULL)
		*generationp = IRE_GENERATION_VERIFY;
	return (ire);

done:
	/*
	 * Success exit. Every jump here cleared ire first; the result is
	 * ires[0] unless building the dependencies fails below.
	 */
	ASSERT(ire == NULL);
	if (need_refrele) {
		ill_refrele(ill);
		ill = NULL;
	}

	/* Build dependencies */
	if (i > 1 && !ire_dep_build(ires, generations, i)) {
		/* Something in chain was condemned; tear it apart */
		ire = ire_reject(ipst, B_FALSE);
		goto cleanup;
	}

	/*
	 * Release all refholds except the one for ires[0] that we
	 * will return to the caller.
	 */
	for (j = 1; j < i; j++)
		ire_refrele(ires[j]);

	if (invalidate) {
		/*
		 * Since we needed to allocate but couldn't we need to make
		 * sure that the dependency chain is rebuilt the next time.
		 */
		ire_dep_invalidate_generations(ires[0]);
		generation = IRE_GENERATION_VERIFY;
	} else {
		/*
		 * IREs can have been added or deleted while we did the
		 * recursive lookup and we can't catch those until we've built
		 * the dependencies. We verify the stored
		 * ire_dep_parent_generation to catch any such changes and
		 * return IRE_GENERATION_VERIFY (which will cause
		 * ip_select_route to be called again so we can redo the
		 * recursive lookup next time we send a packet).
		 */
		if (ires[0]->ire_dep_parent == NULL)
			generation = ires[0]->ire_generation;
		else
			generation = ire_dep_validate_generations(ires[0]);
		if (generations[0] != ires[0]->ire_generation) {
			/* Something changed at the top */
			generation = IRE_GENERATION_VERIFY;
		}
	}
	if (generationp != NULL)
		*generationp = generation;

	return (ires[0]);
}

/*
 * Entry point for callers that do not already hold a starting IRE: passes
 * NULL as the initial ire so that ire_route_recursive_impl_v4 performs the
 * first lookup itself. All other arguments are forwarded unchanged.
 */
ire_t *
ire_route_recursive_v4(ipaddr_t nexthop, uint_t ire_type, const ill_t *ill,
    zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args,
    uint_t irr_flags, uint32_t xmit_hint, ip_stack_t *ipst, ipaddr_t *setsrcp,
    tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp)
{
	return (ire_route_recursive_impl_v4(NULL, nexthop, ire_type, ill,
	    zoneid, tsl, match_args, irr_flags, xmit_hint, ipst, setsrcp,
	    gwattrp, generationp));
}

/*
 * Recursively look for a route to the destination.
 * We only handle a destination match here, yet we have the same arguments
 * as the full match to allow function pointers to select between the two.
 *
 * Note that this function never returns NULL. It returns an IRE_NOROUTE
 * instead.
1712bd670b35SErik Nordmark * 1713bd670b35SErik Nordmark * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it 1714bd670b35SErik Nordmark * is an error. 1715bd670b35SErik Nordmark * Allow at most one RTF_INDIRECT. 1716bd670b35SErik Nordmark */ 1717bd670b35SErik Nordmark ire_t * 17189e3469d3SErik Nordmark ire_route_recursive_dstonly_v4(ipaddr_t nexthop, uint_t irr_flags, 1719bd670b35SErik Nordmark uint32_t xmit_hint, ip_stack_t *ipst) 1720bd670b35SErik Nordmark { 1721bd670b35SErik Nordmark ire_t *ire; 1722bd670b35SErik Nordmark ire_t *ire1; 1723bd670b35SErik Nordmark uint_t generation; 1724bd670b35SErik Nordmark 1725bd670b35SErik Nordmark /* ire_ftable_lookup handles round-robin/ECMP */ 1726bd670b35SErik Nordmark ire = ire_ftable_lookup_simple_v4(nexthop, xmit_hint, ipst, 1727bd670b35SErik Nordmark &generation); 1728bd670b35SErik Nordmark ASSERT(ire != NULL); 1729fff7ec1dSSowmini Varadhan /* 1730fff7ec1dSSowmini Varadhan * If the IRE has a current cached parent we know that the whole 1731fff7ec1dSSowmini Varadhan * parent chain is current, hence we don't need to discover and 1732fff7ec1dSSowmini Varadhan * build any dependencies by doing a recursive lookup. 1733fff7ec1dSSowmini Varadhan */ 1734fff7ec1dSSowmini Varadhan mutex_enter(&ire->ire_lock); 1735fff7ec1dSSowmini Varadhan if (ire->ire_dep_parent != NULL) { 1736fff7ec1dSSowmini Varadhan if (ire->ire_dep_parent->ire_generation == 1737fff7ec1dSSowmini Varadhan ire->ire_dep_parent_generation) { 1738fff7ec1dSSowmini Varadhan mutex_exit(&ire->ire_lock); 1739fff7ec1dSSowmini Varadhan return (ire); 1740fff7ec1dSSowmini Varadhan } 1741fff7ec1dSSowmini Varadhan mutex_exit(&ire->ire_lock); 1742fff7ec1dSSowmini Varadhan } else { 1743fff7ec1dSSowmini Varadhan mutex_exit(&ire->ire_lock); 1744bd670b35SErik Nordmark /* 1745bd670b35SErik Nordmark * If this type should have an ire_nce_cache (even if it 1746bd670b35SErik Nordmark * doesn't yet have one) then we are done. 
Includes 1747bd670b35SErik Nordmark * IRE_INTERFACE with a full 32 bit mask. 1748bd670b35SErik Nordmark */ 1749bd670b35SErik Nordmark if (ire->ire_nce_capable) 1750bd670b35SErik Nordmark return (ire); 1751bd670b35SErik Nordmark } 1752bd670b35SErik Nordmark 1753bd670b35SErik Nordmark /* 1754bd670b35SErik Nordmark * Fallback to loop in the normal code starting with the ire 1755bd670b35SErik Nordmark * we found. Normally this would return the same ire. 1756bd670b35SErik Nordmark */ 1757bd670b35SErik Nordmark ire1 = ire_route_recursive_impl_v4(ire, nexthop, 0, NULL, ALL_ZONES, 17589e3469d3SErik Nordmark NULL, MATCH_IRE_DSTONLY, irr_flags, xmit_hint, ipst, NULL, NULL, 1759bd670b35SErik Nordmark &generation); 1760bd670b35SErik Nordmark ire_refrele(ire); 1761bd670b35SErik Nordmark return (ire1); 1762bd670b35SErik Nordmark } 1763fff7ec1dSSowmini Varadhan 1764fff7ec1dSSowmini Varadhan /* 1765fff7ec1dSSowmini Varadhan * Verify that the generation numbers in the chain leading to an IRE_IF_CLONE 1766fff7ec1dSSowmini Varadhan * are consistent. Return FALSE (and delete the IRE_IF_CLONE) if they 1767fff7ec1dSSowmini Varadhan * are not consistent, and TRUE otherwise. 1768fff7ec1dSSowmini Varadhan */ 1769fff7ec1dSSowmini Varadhan boolean_t 1770fff7ec1dSSowmini Varadhan ire_clone_verify(ire_t *ire) 1771fff7ec1dSSowmini Varadhan { 1772fff7ec1dSSowmini Varadhan ASSERT((ire->ire_type & IRE_IF_CLONE) != 0); 1773fff7ec1dSSowmini Varadhan mutex_enter(&ire->ire_lock); 1774fff7ec1dSSowmini Varadhan if (ire->ire_dep_parent != NULL && 1775fff7ec1dSSowmini Varadhan ire->ire_dep_parent->ire_generation != 1776fff7ec1dSSowmini Varadhan ire->ire_dep_parent_generation) { 1777fff7ec1dSSowmini Varadhan mutex_exit(&ire->ire_lock); 1778fff7ec1dSSowmini Varadhan ire_delete(ire); 1779fff7ec1dSSowmini Varadhan return (B_FALSE); 1780fff7ec1dSSowmini Varadhan } 1781fff7ec1dSSowmini Varadhan mutex_exit(&ire->ire_lock); 1782fff7ec1dSSowmini Varadhan return (B_TRUE); 1783fff7ec1dSSowmini Varadhan } 1784