1c793af95Ssangeeta /* 2c793af95Ssangeeta * CDDL HEADER START 3c793af95Ssangeeta * 4c793af95Ssangeeta * The contents of this file are subject to the terms of the 5c793af95Ssangeeta * Common Development and Distribution License (the "License"). 6c793af95Ssangeeta * You may not use this file except in compliance with the License. 7c793af95Ssangeeta * 8c793af95Ssangeeta * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9c793af95Ssangeeta * or http://www.opensolaris.org/os/licensing. 10c793af95Ssangeeta * See the License for the specific language governing permissions 11c793af95Ssangeeta * and limitations under the License. 12c793af95Ssangeeta * 13c793af95Ssangeeta * When distributing Covered Code, include this CDDL HEADER in each 14c793af95Ssangeeta * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15c793af95Ssangeeta * If applicable, add the following below this CDDL HEADER, with the 16c793af95Ssangeeta * fields enclosed by brackets "[]" replaced with your own identifying 17c793af95Ssangeeta * information: Portions Copyright [yyyy] [name of copyright owner] 18c793af95Ssangeeta * 19c793af95Ssangeeta * CDDL HEADER END 20c793af95Ssangeeta */ 21c793af95Ssangeeta /* 22e11c3f44Smeem * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23c793af95Ssangeeta * Use is subject to license terms. 
24c793af95Ssangeeta */ 25c793af95Ssangeeta 26c793af95Ssangeeta /* 27c793af95Ssangeeta * This file contains consumer routines of the IPv4 forwarding engine 28c793af95Ssangeeta */ 29c793af95Ssangeeta 30c793af95Ssangeeta #include <sys/types.h> 31c793af95Ssangeeta #include <sys/stream.h> 32c793af95Ssangeeta #include <sys/stropts.h> 33c793af95Ssangeeta #include <sys/strlog.h> 34c793af95Ssangeeta #include <sys/dlpi.h> 35c793af95Ssangeeta #include <sys/ddi.h> 36c793af95Ssangeeta #include <sys/cmn_err.h> 37c793af95Ssangeeta #include <sys/policy.h> 38c793af95Ssangeeta 39c793af95Ssangeeta #include <sys/systm.h> 40c793af95Ssangeeta #include <sys/strsun.h> 41c793af95Ssangeeta #include <sys/kmem.h> 42c793af95Ssangeeta #include <sys/param.h> 43c793af95Ssangeeta #include <sys/socket.h> 44edd26dc5Sdr146992 #include <sys/strsubr.h> 45c793af95Ssangeeta #include <net/if.h> 46c793af95Ssangeeta #include <net/route.h> 47c793af95Ssangeeta #include <netinet/in.h> 48c793af95Ssangeeta #include <net/if_dl.h> 49c793af95Ssangeeta #include <netinet/ip6.h> 50c793af95Ssangeeta #include <netinet/icmp6.h> 51c793af95Ssangeeta 52bd670b35SErik Nordmark #include <inet/ipsec_impl.h> 53c793af95Ssangeeta #include <inet/common.h> 54c793af95Ssangeeta #include <inet/mi.h> 55c793af95Ssangeeta #include <inet/mib2.h> 56c793af95Ssangeeta #include <inet/ip.h> 57edd26dc5Sdr146992 #include <inet/ip_impl.h> 58c793af95Ssangeeta #include <inet/ip6.h> 59c793af95Ssangeeta #include <inet/ip_ndp.h> 60c793af95Ssangeeta #include <inet/arp.h> 61c793af95Ssangeeta #include <inet/ip_if.h> 62c793af95Ssangeeta #include <inet/ip_ire.h> 63c793af95Ssangeeta #include <inet/ip_ftable.h> 64c793af95Ssangeeta #include <inet/ip_rts.h> 65c793af95Ssangeeta #include <inet/nd.h> 66c793af95Ssangeeta 67c793af95Ssangeeta #include <net/pfkeyv2.h> 68c793af95Ssangeeta #include <inet/sadb.h> 69c793af95Ssangeeta #include <inet/tcp.h> 70c793af95Ssangeeta #include <inet/ipclassifier.h> 71c793af95Ssangeeta #include <sys/zone.h> 72c793af95Ssangeeta 
#include <net/radix.h>
#include <sys/tsol/label.h>
#include <sys/tsol/tnet.h>

/*
 * True for an IRE_DEFAULT route, or for an IRE_INTERFACE route whose
 * destination address is 0.
 */
#define	IS_DEFAULT_ROUTE(ire)	\
	(((ire)->ire_type & IRE_DEFAULT) || \
	    (((ire)->ire_type & IRE_INTERFACE) && ((ire)->ire_addr == 0)))

static ire_t	*route_to_dst(const struct sockaddr *, zoneid_t, ip_stack_t *);
static void	ire_del_host_redir(ire_t *, char *);
static boolean_t ire_find_best_route(struct radix_node *, void *);

/*
 * Lookup a route in forwarding table. A specific lookup is indicated by
 * passing the required parameters and indicating the match required in the
 * flag field.
 *
 * Supports IP_BOUND_IF by following the ipif/ill when recursing.
 *
 * Returns NULL when MATCH_IRE_ILL is requested without an ill, or when the
 * radix tree has no entry accepted by ire_find_best_route(). Otherwise the
 * returned ire carries the reference taken by ire_find_best_route() (or the
 * one handed back by ire_round_robin()/ire_alt_local() when they replace it).
 *
 * If generationp is non-NULL it is set to the ire's ire_generation while the
 * radix tree lock is still held; it is also passed on to ire_alt_local().
 */
ire_t *
ire_ftable_lookup_v4(ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway,
    int type, const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl,
    int flags, uint32_t xmit_hint, ip_stack_t *ipst, uint_t *generationp)
{
	ire_t			*ire;
	struct rt_sockaddr	rdst;
	struct rt_entry		*rt;
	ire_ftable_args_t	margs;

	ASSERT(ill == NULL || !ill->ill_isv6);

	/*
	 * ire_match_args() will dereference ill if MATCH_IRE_ILL
	 * is set.
	 */
	if ((flags & MATCH_IRE_ILL) && (ill == NULL))
		return (NULL);

	/* Radix key: only the destination is needed for the tree walk. */
	bzero(&rdst, sizeof (rdst));
	rdst.rt_sin_len = sizeof (rdst);
	rdst.rt_sin_family = AF_INET;
	rdst.rt_sin_addr.s_addr = addr;

	/* Everything else is carried to the per-leaf callback in margs. */
	bzero(&margs, sizeof (margs));
	margs.ift_addr = addr;
	margs.ift_mask = mask;
	margs.ift_gateway = gateway;
	margs.ift_type = type;
	margs.ift_ill = ill;
	margs.ift_zoneid = zoneid;
	margs.ift_tsl = tsl;
	margs.ift_flags = flags;

	/*
	 * The flags argument passed to ire_ftable_lookup may cause the
	 * search to return, not the longest matching prefix, but the
	 * "best matching prefix", i.e., the longest prefix that also
	 * satisfies constraints imposed via the permutation of flags
	 * passed in. To achieve this, we invoke ire_match_args() on
	 * each matching leaf in the radix tree. ire_match_args is
	 * invoked by the callback function ire_find_best_route()
	 * We hold the global tree lock in read mode when calling
	 * rn_match_args. Before dropping the global tree lock, ensure
	 * that the radix node can't be deleted by incrementing ire_refcnt.
	 */
	RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
	rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst,
	    ipst->ips_ip_ftable, ire_find_best_route, &margs);
	ire = margs.ift_best_ire;
	if (rt == NULL) {
		RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
		return (NULL);
	}
	ASSERT(ire != NULL);

	DTRACE_PROBE2(ire__found, ire_ftable_args_t *, &margs, ire_t *, ire);

	/*
	 * round-robin only if we have more than one route in the bucket.
	 * ips_ip_ecmp_behavior controls when we do ECMP
	 *	2:	always
	 *	1:	for IRE_DEFAULT and /0 IRE_INTERFACE
	 *	0:	never
	 */
	if (ire->ire_bucket->irb_ire_cnt > 1 && !(flags & MATCH_IRE_GW)) {
		if (ipst->ips_ip_ecmp_behavior == 2 ||
		    (ipst->ips_ip_ecmp_behavior == 1 &&
		    IS_DEFAULT_ROUTE(ire))) {
			ire_t	*next_ire;

			margs.ift_best_ire = NULL;
			next_ire = ire_round_robin(ire->ire_bucket, &margs,
			    xmit_hint, ire, ipst);
			if (next_ire == NULL) {
				/* keep ire if next_ire is null */
				goto done;
			}
			ire_refrele(ire);
			ire = next_ire;
		}
	}

done:
	/* Return generation before dropping lock */
	if (generationp != NULL)
		*generationp = ire->ire_generation;

	RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);

	/*
	 * For shared-IP zones we need additional checks to what was
	 * done in ire_match_args to make sure IRE_LOCALs are handled.
	 *
	 * When ip_restrict_interzone_loopback is set, then
	 * we ensure that IRE_LOCAL are only used for loopback
	 * between zones when the logical "Ethernet" would
	 * have looped them back. That is, if in the absence of
	 * the IRE_LOCAL we would have sent the packet out the
	 * same ill.
	 */
	if ((ire->ire_type & IRE_LOCAL) && zoneid != ALL_ZONES &&
	    ire->ire_zoneid != zoneid && ire->ire_zoneid != ALL_ZONES &&
	    ipst->ips_ip_restrict_interzone_loopback) {
		ire = ire_alt_local(ire, zoneid, tsl, ill, generationp);
		ASSERT(ire != NULL);
	}
	return (ire);
}

/*
 * This function is called by
 * ip_input/ire_route_recursive when doing a route lookup on only the
 * destination address.
 *
 * The optimizations of this function over ire_ftable_lookup are:
 *	o removing unnecessary flag matching
 *	o doing longest prefix match instead of overloading it further
 *	  with the unnecessary "best_prefix_match"
 *
 * If no route is found we return IRE_NOROUTE.
217da14cebeSEric Cheng */ 218bd670b35SErik Nordmark ire_t * 219bd670b35SErik Nordmark ire_ftable_lookup_simple_v4(ipaddr_t addr, uint32_t xmit_hint, ip_stack_t *ipst, 220bd670b35SErik Nordmark uint_t *generationp) 221da14cebeSEric Cheng { 222bd670b35SErik Nordmark ire_t *ire; 223da14cebeSEric Cheng struct rt_sockaddr rdst; 224da14cebeSEric Cheng struct rt_entry *rt; 225bd670b35SErik Nordmark irb_t *irb; 226da14cebeSEric Cheng 227da14cebeSEric Cheng rdst.rt_sin_len = sizeof (rdst); 228da14cebeSEric Cheng rdst.rt_sin_family = AF_INET; 229da14cebeSEric Cheng rdst.rt_sin_addr.s_addr = addr; 230da14cebeSEric Cheng 231da14cebeSEric Cheng /* 232da14cebeSEric Cheng * This is basically inlining a simpler version of ire_match_args 233da14cebeSEric Cheng */ 234da14cebeSEric Cheng RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); 235da14cebeSEric Cheng 236da14cebeSEric Cheng rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst, 237da14cebeSEric Cheng ipst->ips_ip_ftable, NULL, NULL); 238da14cebeSEric Cheng 239bd670b35SErik Nordmark if (rt == NULL) 240bd670b35SErik Nordmark goto bad; 241bd670b35SErik Nordmark 242bd670b35SErik Nordmark irb = &rt->rt_irb; 243bd670b35SErik Nordmark if (irb->irb_ire_cnt == 0) 244bd670b35SErik Nordmark goto bad; 245bd670b35SErik Nordmark 246bd670b35SErik Nordmark rw_enter(&irb->irb_lock, RW_READER); 247bd670b35SErik Nordmark ire = irb->irb_ire; 248bd670b35SErik Nordmark if (ire == NULL) { 249bd670b35SErik Nordmark rw_exit(&irb->irb_lock); 250bd670b35SErik Nordmark goto bad; 251da14cebeSEric Cheng } 252bd670b35SErik Nordmark while (IRE_IS_CONDEMNED(ire)) { 253bd670b35SErik Nordmark ire = ire->ire_next; 254bd670b35SErik Nordmark if (ire == NULL) { 255bd670b35SErik Nordmark rw_exit(&irb->irb_lock); 256bd670b35SErik Nordmark goto bad; 257bd670b35SErik Nordmark } 258da14cebeSEric Cheng } 259da14cebeSEric Cheng 260da14cebeSEric Cheng /* we have a ire that matches */ 261bd670b35SErik Nordmark ire_refhold(ire); 262bd670b35SErik Nordmark 
rw_exit(&irb->irb_lock); 263bd670b35SErik Nordmark 264bd670b35SErik Nordmark /* 265bd670b35SErik Nordmark * round-robin only if we have more than one route in the bucket. 266bd670b35SErik Nordmark * ips_ip_ecmp_behavior controls when we do ECMP 267bd670b35SErik Nordmark * 2: always 268bd670b35SErik Nordmark * 1: for IRE_DEFAULT and /0 IRE_INTERFACE 269bd670b35SErik Nordmark * 0: never 270bd670b35SErik Nordmark * 271bd670b35SErik Nordmark * Note: if we found an IRE_IF_CLONE we won't look at the bucket with 272bd670b35SErik Nordmark * other ECMP IRE_INTERFACEs since the IRE_IF_CLONE is a /128 match 273bd670b35SErik Nordmark * and the IRE_INTERFACESs are likely to be shorter matches. 274bd670b35SErik Nordmark */ 275bd670b35SErik Nordmark if (ire->ire_bucket->irb_ire_cnt > 1) { 276bd670b35SErik Nordmark if (ipst->ips_ip_ecmp_behavior == 2 || 277bd670b35SErik Nordmark (ipst->ips_ip_ecmp_behavior == 1 && 278bd670b35SErik Nordmark IS_DEFAULT_ROUTE(ire))) { 279bd670b35SErik Nordmark ire_t *next_ire; 280bd670b35SErik Nordmark ire_ftable_args_t margs; 281bd670b35SErik Nordmark 282*188e1664SErik Nordmark bzero(&margs, sizeof (margs)); 283bd670b35SErik Nordmark margs.ift_addr = addr; 284bd670b35SErik Nordmark margs.ift_zoneid = ALL_ZONES; 285bd670b35SErik Nordmark 286bd670b35SErik Nordmark next_ire = ire_round_robin(ire->ire_bucket, &margs, 287bd670b35SErik Nordmark xmit_hint, ire, ipst); 288bd670b35SErik Nordmark if (next_ire == NULL) { 289bd670b35SErik Nordmark /* keep ire if next_ire is null */ 290bd670b35SErik Nordmark if (generationp != NULL) 291bd670b35SErik Nordmark *generationp = ire->ire_generation; 292bd670b35SErik Nordmark RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 293bd670b35SErik Nordmark return (ire); 294bd670b35SErik Nordmark } 295bd670b35SErik Nordmark ire_refrele(ire); 296bd670b35SErik Nordmark ire = next_ire; 297bd670b35SErik Nordmark } 298bd670b35SErik Nordmark } 299bd670b35SErik Nordmark /* Return generation before dropping lock */ 300bd670b35SErik 
Nordmark if (generationp != NULL) 301bd670b35SErik Nordmark *generationp = ire->ire_generation; 302bd670b35SErik Nordmark 303da14cebeSEric Cheng RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 304da14cebeSEric Cheng 305bd670b35SErik Nordmark /* 306bd670b35SErik Nordmark * Since we only did ALL_ZONES matches there is no special handling 307bd670b35SErik Nordmark * of IRE_LOCALs needed here. ire_ftable_lookup_v4 has to handle that. 308bd670b35SErik Nordmark */ 309da14cebeSEric Cheng return (ire); 310da14cebeSEric Cheng 311bd670b35SErik Nordmark bad: 312bd670b35SErik Nordmark if (generationp != NULL) 313bd670b35SErik Nordmark *generationp = IRE_GENERATION_VERIFY; 314da14cebeSEric Cheng 315bd670b35SErik Nordmark RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 316bd670b35SErik Nordmark return (ire_reject(ipst, B_FALSE)); 317da14cebeSEric Cheng } 318c793af95Ssangeeta 319c793af95Ssangeeta /* 320bd670b35SErik Nordmark * Find the ill matching a multicast group. 321c793af95Ssangeeta * Allows different routes for multicast addresses 322c793af95Ssangeeta * in the unicast routing table (akin to 224.0.0.0 but could be more specific) 323c793af95Ssangeeta * which point at different interfaces. This is used when IP_MULTICAST_IF 324c793af95Ssangeeta * isn't specified (when sending) and when IP_ADD_MEMBERSHIP doesn't 325c793af95Ssangeeta * specify the interface to join on. 326c793af95Ssangeeta * 327bd670b35SErik Nordmark * Supports link-local addresses by using ire_route_recursive which follows 328bd670b35SErik Nordmark * the ill when recursing. 329bd670b35SErik Nordmark * 330bd670b35SErik Nordmark * To handle CGTP, since we don't have a separate IRE_MULTICAST for each group 331bd670b35SErik Nordmark * and the MULTIRT property can be different for different groups, we 332bd670b35SErik Nordmark * extract RTF_MULTIRT from the special unicast route added for a group 333bd670b35SErik Nordmark * with CGTP and pass that back in the multirtp argument. 
334bd670b35SErik Nordmark * This is used in ip_set_destination etc to set ixa_postfragfn for multicast. 335bd670b35SErik Nordmark * We have a setsrcp argument for the same reason. 336c793af95Ssangeeta */ 337bd670b35SErik Nordmark ill_t * 338bd670b35SErik Nordmark ire_lookup_multi_ill_v4(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst, 339bd670b35SErik Nordmark boolean_t *multirtp, ipaddr_t *setsrcp) 340c793af95Ssangeeta { 341c793af95Ssangeeta ire_t *ire; 342bd670b35SErik Nordmark ill_t *ill; 343c793af95Ssangeeta 344bd670b35SErik Nordmark ire = ire_route_recursive_v4(group, 0, NULL, zoneid, NULL, 345bd670b35SErik Nordmark MATCH_IRE_DSTONLY, B_FALSE, 0, ipst, setsrcp, NULL, NULL); 346bd670b35SErik Nordmark ASSERT(ire != NULL); 347bd670b35SErik Nordmark if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 348c793af95Ssangeeta ire_refrele(ire); 349c793af95Ssangeeta return (NULL); 350c793af95Ssangeeta } 351bd670b35SErik Nordmark 352bd670b35SErik Nordmark if (multirtp != NULL) 353bd670b35SErik Nordmark *multirtp = (ire->ire_flags & RTF_MULTIRT) != 0; 354bd670b35SErik Nordmark 355bd670b35SErik Nordmark ill = ire_nexthop_ill(ire); 356bd670b35SErik Nordmark ire_refrele(ire); 357bd670b35SErik Nordmark return (ill); 358c793af95Ssangeeta } 359c793af95Ssangeeta 360c793af95Ssangeeta /* 361c793af95Ssangeeta * Delete the passed in ire if the gateway addr matches 362c793af95Ssangeeta */ 363c793af95Ssangeeta void 364c793af95Ssangeeta ire_del_host_redir(ire_t *ire, char *gateway) 365c793af95Ssangeeta { 3666bdb8e66Sdd193516 if ((ire->ire_flags & RTF_DYNAMIC) && 367c793af95Ssangeeta (ire->ire_gateway_addr == *(ipaddr_t *)gateway)) 368c793af95Ssangeeta ire_delete(ire); 369c793af95Ssangeeta } 370c793af95Ssangeeta 371c793af95Ssangeeta /* 372bd670b35SErik Nordmark * Search for all IRE_HOST RTF_DYNAMIC (aka redirect) routes that are 373c793af95Ssangeeta * pointing at the specified gateway and 374c793af95Ssangeeta * delete them. 
This routine is called only 375c793af95Ssangeeta * when a default gateway is going away. 376c793af95Ssangeeta */ 377c793af95Ssangeeta void 378f4b3ec61Sdh155122 ire_delete_host_redirects(ipaddr_t gateway, ip_stack_t *ipst) 379c793af95Ssangeeta { 380c793af95Ssangeeta struct rtfuncarg rtfarg; 381c793af95Ssangeeta 382*188e1664SErik Nordmark bzero(&rtfarg, sizeof (rtfarg)); 383c793af95Ssangeeta rtfarg.rt_func = ire_del_host_redir; 384c793af95Ssangeeta rtfarg.rt_arg = (void *)&gateway; 385*188e1664SErik Nordmark rtfarg.rt_zoneid = ALL_ZONES; 386*188e1664SErik Nordmark rtfarg.rt_ipst = ipst; 387f4b3ec61Sdh155122 (void) ipst->ips_ip_ftable->rnh_walktree_mt(ipst->ips_ip_ftable, 388f4b3ec61Sdh155122 rtfunc, &rtfarg, irb_refhold_rn, irb_refrele_rn); 389c793af95Ssangeeta } 390c793af95Ssangeeta 391c793af95Ssangeeta /* 392f4b3ec61Sdh155122 * Obtain the rt_entry and rt_irb for the route to be added to 393f4b3ec61Sdh155122 * the ips_ip_ftable. 394c793af95Ssangeeta * First attempt to add a node to the radix tree via rn_addroute. If the 395c793af95Ssangeeta * route already exists, return the bucket for the existing route. 396c793af95Ssangeeta * 397c793af95Ssangeeta * Locking notes: Need to hold the global radix tree lock in write mode to 398c793af95Ssangeeta * add a radix node. To prevent the node from being deleted, ire_get_bucket() 399c793af95Ssangeeta * returns with a ref'ed irb_t. The ire itself is added in ire_add_v4() 400c793af95Ssangeeta * while holding the irb_lock, but not the radix tree lock. 
401c793af95Ssangeeta */ 402c793af95Ssangeeta irb_t * 403c793af95Ssangeeta ire_get_bucket(ire_t *ire) 404c793af95Ssangeeta { 405c793af95Ssangeeta struct radix_node *rn; 406c793af95Ssangeeta struct rt_entry *rt; 407c793af95Ssangeeta struct rt_sockaddr rmask, rdst; 408c793af95Ssangeeta irb_t *irb = NULL; 409f4b3ec61Sdh155122 ip_stack_t *ipst = ire->ire_ipst; 410c793af95Ssangeeta 411f4b3ec61Sdh155122 ASSERT(ipst->ips_ip_ftable != NULL); 412c793af95Ssangeeta 413c793af95Ssangeeta /* first try to see if route exists (based on rtalloc1) */ 414*188e1664SErik Nordmark bzero(&rdst, sizeof (rdst)); 415c793af95Ssangeeta rdst.rt_sin_len = sizeof (rdst); 416c793af95Ssangeeta rdst.rt_sin_family = AF_INET; 417c793af95Ssangeeta rdst.rt_sin_addr.s_addr = ire->ire_addr; 418c793af95Ssangeeta 419*188e1664SErik Nordmark bzero(&rmask, sizeof (rmask)); 420c793af95Ssangeeta rmask.rt_sin_len = sizeof (rmask); 421c793af95Ssangeeta rmask.rt_sin_family = AF_INET; 422c793af95Ssangeeta rmask.rt_sin_addr.s_addr = ire->ire_mask; 423c793af95Ssangeeta 424c793af95Ssangeeta /* 425c793af95Ssangeeta * add the route. 
based on BSD's rtrequest1(RTM_ADD) 426c793af95Ssangeeta */ 427c793af95Ssangeeta R_Malloc(rt, rt_entry_cache, sizeof (*rt)); 42829bc4795Ssangeeta /* kmem_alloc failed */ 42929bc4795Ssangeeta if (rt == NULL) 43029bc4795Ssangeeta return (NULL); 43129bc4795Ssangeeta 432*188e1664SErik Nordmark bzero(rt, sizeof (*rt)); 433c793af95Ssangeeta rt->rt_nodes->rn_key = (char *)&rt->rt_dst; 434c793af95Ssangeeta rt->rt_dst = rdst; 435c793af95Ssangeeta irb = &rt->rt_irb; 436bd670b35SErik Nordmark irb->irb_marks |= IRB_MARK_DYNAMIC; /* dynamically allocated/freed */ 437f4b3ec61Sdh155122 irb->irb_ipst = ipst; 438c793af95Ssangeeta rw_init(&irb->irb_lock, NULL, RW_DEFAULT, NULL); 439f4b3ec61Sdh155122 RADIX_NODE_HEAD_WLOCK(ipst->ips_ip_ftable); 440f4b3ec61Sdh155122 rn = ipst->ips_ip_ftable->rnh_addaddr(&rt->rt_dst, &rmask, 441f4b3ec61Sdh155122 ipst->ips_ip_ftable, (struct radix_node *)rt); 442c793af95Ssangeeta if (rn == NULL) { 443f4b3ec61Sdh155122 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 444c793af95Ssangeeta Free(rt, rt_entry_cache); 445c793af95Ssangeeta rt = NULL; 446c793af95Ssangeeta irb = NULL; 447f4b3ec61Sdh155122 RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); 448f4b3ec61Sdh155122 rn = ipst->ips_ip_ftable->rnh_lookup(&rdst, &rmask, 449f4b3ec61Sdh155122 ipst->ips_ip_ftable); 450f4b3ec61Sdh155122 if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { 451c793af95Ssangeeta /* found a non-root match */ 452c793af95Ssangeeta rt = (struct rt_entry *)rn; 453c793af95Ssangeeta } 454c793af95Ssangeeta } 455c793af95Ssangeeta if (rt != NULL) { 456c793af95Ssangeeta irb = &rt->rt_irb; 457bd670b35SErik Nordmark irb_refhold(irb); 458c793af95Ssangeeta } 459f4b3ec61Sdh155122 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 460c793af95Ssangeeta return (irb); 461c793af95Ssangeeta } 462c793af95Ssangeeta 463c793af95Ssangeeta /* 464c793af95Ssangeeta * This function is used when the caller wants to know the outbound 465c793af95Ssangeeta * interface for a packet given only the address. 
466c793af95Ssangeeta * If this is a offlink IP address and there are multiple 467c793af95Ssangeeta * routes to this destination, this routine will utilise the 468c793af95Ssangeeta * first route it finds to IP address 469c793af95Ssangeeta * Return values: 470c793af95Ssangeeta * 0 - FAILURE 471c793af95Ssangeeta * nonzero - ifindex 472c793af95Ssangeeta */ 473c793af95Ssangeeta uint_t 474c793af95Ssangeeta ifindex_lookup(const struct sockaddr *ipaddr, zoneid_t zoneid) 475c793af95Ssangeeta { 476c793af95Ssangeeta uint_t ifindex = 0; 477c793af95Ssangeeta ire_t *ire; 478c793af95Ssangeeta ill_t *ill; 479f4b3ec61Sdh155122 netstack_t *ns; 480f4b3ec61Sdh155122 ip_stack_t *ipst; 481c793af95Ssangeeta 482f4b3ec61Sdh155122 if (zoneid == ALL_ZONES) 483f4b3ec61Sdh155122 ns = netstack_find_by_zoneid(GLOBAL_ZONEID); 484f4b3ec61Sdh155122 else 485f4b3ec61Sdh155122 ns = netstack_find_by_zoneid(zoneid); 486f4b3ec61Sdh155122 ASSERT(ns != NULL); 487f4b3ec61Sdh155122 488f4b3ec61Sdh155122 /* 489f4b3ec61Sdh155122 * For exclusive stacks we set the zoneid to zero 490f4b3ec61Sdh155122 * since IP uses the global zoneid in the exclusive stacks. 
491f4b3ec61Sdh155122 */ 492f4b3ec61Sdh155122 if (ns->netstack_stackid != GLOBAL_NETSTACKID) 493f4b3ec61Sdh155122 zoneid = GLOBAL_ZONEID; 494f4b3ec61Sdh155122 ipst = ns->netstack_ip; 495c793af95Ssangeeta 496c793af95Ssangeeta ASSERT(ipaddr->sa_family == AF_INET || ipaddr->sa_family == AF_INET6); 497c793af95Ssangeeta 498f4b3ec61Sdh155122 if ((ire = route_to_dst(ipaddr, zoneid, ipst)) != NULL) { 499bd670b35SErik Nordmark ill = ire_nexthop_ill(ire); 500bd670b35SErik Nordmark if (ill != NULL) { 501c793af95Ssangeeta ifindex = ill->ill_phyint->phyint_ifindex; 502bd670b35SErik Nordmark ill_refrele(ill); 503bd670b35SErik Nordmark } 504c793af95Ssangeeta ire_refrele(ire); 505c793af95Ssangeeta } 506f4b3ec61Sdh155122 netstack_rele(ns); 507c793af95Ssangeeta return (ifindex); 508c793af95Ssangeeta } 509c793af95Ssangeeta 510c793af95Ssangeeta /* 511c793af95Ssangeeta * Routine to find the route to a destination. If a ifindex is supplied 512bd670b35SErik Nordmark * it tries to match the route to the corresponding ipif for the ifindex 513c793af95Ssangeeta */ 514c793af95Ssangeeta static ire_t * 515f4b3ec61Sdh155122 route_to_dst(const struct sockaddr *dst_addr, zoneid_t zoneid, ip_stack_t *ipst) 516c793af95Ssangeeta { 517c793af95Ssangeeta ire_t *ire = NULL; 518c793af95Ssangeeta int match_flags; 519c793af95Ssangeeta 520bd670b35SErik Nordmark match_flags = MATCH_IRE_DSTONLY; 521c793af95Ssangeeta 522c793af95Ssangeeta /* XXX pass NULL tsl for now */ 523c793af95Ssangeeta 524c793af95Ssangeeta if (dst_addr->sa_family == AF_INET) { 525bd670b35SErik Nordmark ire = ire_route_recursive_v4( 526bd670b35SErik Nordmark ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr, 0, NULL, 527bd670b35SErik Nordmark zoneid, NULL, match_flags, B_TRUE, 0, ipst, NULL, NULL, 528bd670b35SErik Nordmark NULL); 529c793af95Ssangeeta } else { 530bd670b35SErik Nordmark ire = ire_route_recursive_v6( 531bd670b35SErik Nordmark &((struct sockaddr_in6 *)dst_addr)->sin6_addr, 0, NULL, 532bd670b35SErik Nordmark zoneid, NULL, 
match_flags, B_TRUE, 0, ipst, NULL, NULL, 533bd670b35SErik Nordmark NULL); 534bd670b35SErik Nordmark } 535bd670b35SErik Nordmark ASSERT(ire != NULL); 536bd670b35SErik Nordmark if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 537bd670b35SErik Nordmark ire_refrele(ire); 538bd670b35SErik Nordmark return (NULL); 539c793af95Ssangeeta } 540c793af95Ssangeeta return (ire); 541c793af95Ssangeeta } 542c793af95Ssangeeta 543c793af95Ssangeeta /* 544c793af95Ssangeeta * This routine is called by IP Filter to send a packet out on the wire 545bd670b35SErik Nordmark * to a specified dstination (which may be onlink or offlink). The ifindex may 546bd670b35SErik Nordmark * or may not be 0. A non-null ifindex indicates IP Filter has stipulated 547c793af95Ssangeeta * an outgoing interface and requires the nexthop to be on that interface. 548c793af95Ssangeeta * IP WILL NOT DO the following to the data packet before sending it out: 549c793af95Ssangeeta * a. manipulate ttl 550edd26dc5Sdr146992 * b. ipsec work 551edd26dc5Sdr146992 * c. fragmentation 552edd26dc5Sdr146992 * 553edd26dc5Sdr146992 * If the packet has been prepared for hardware checksum then it will be 554edd26dc5Sdr146992 * passed off to ip_send_align_cksum() to check that the flags set on the 555edd26dc5Sdr146992 * packet are in alignment with the capabilities of the new outgoing NIC. 556c793af95Ssangeeta * 557c793af95Ssangeeta * Return values: 558c793af95Ssangeeta * 0: IP was able to send of the data pkt 559c793af95Ssangeeta * ECOMM: Could not send packet 560c793af95Ssangeeta * ENONET No route to dst. It is up to the caller 561c793af95Ssangeeta * to send icmp unreachable error message, 562c793af95Ssangeeta * EINPROGRESS The macaddr of the onlink dst or that 563c793af95Ssangeeta * of the offlink dst's nexthop needs to get 564c793af95Ssangeeta * resolved before packet can be sent to dst. 565c793af95Ssangeeta * Thus transmission is not guaranteed. 
566bd670b35SErik Nordmark * Note: No longer have visibility to the ARP queue 567bd670b35SErik Nordmark * hence no EINPROGRESS. 568c793af95Ssangeeta */ 569c793af95Ssangeeta int 570c793af95Ssangeeta ipfil_sendpkt(const struct sockaddr *dst_addr, mblk_t *mp, uint_t ifindex, 571c793af95Ssangeeta zoneid_t zoneid) 572c793af95Ssangeeta { 573bd670b35SErik Nordmark ipaddr_t nexthop; 574f4b3ec61Sdh155122 netstack_t *ns; 575f4b3ec61Sdh155122 ip_stack_t *ipst; 576bd670b35SErik Nordmark ip_xmit_attr_t ixas; 577bd670b35SErik Nordmark int error; 578c793af95Ssangeeta 579c793af95Ssangeeta ASSERT(mp != NULL); 580c793af95Ssangeeta 581f4b3ec61Sdh155122 if (zoneid == ALL_ZONES) 582f4b3ec61Sdh155122 ns = netstack_find_by_zoneid(GLOBAL_ZONEID); 583f4b3ec61Sdh155122 else 584f4b3ec61Sdh155122 ns = netstack_find_by_zoneid(zoneid); 585f4b3ec61Sdh155122 ASSERT(ns != NULL); 586f4b3ec61Sdh155122 587f4b3ec61Sdh155122 /* 588f4b3ec61Sdh155122 * For exclusive stacks we set the zoneid to zero 589f4b3ec61Sdh155122 * since IP uses the global zoneid in the exclusive stacks. 590f4b3ec61Sdh155122 */ 591f4b3ec61Sdh155122 if (ns->netstack_stackid != GLOBAL_NETSTACKID) 592f4b3ec61Sdh155122 zoneid = GLOBAL_ZONEID; 593f4b3ec61Sdh155122 ipst = ns->netstack_ip; 594f4b3ec61Sdh155122 595c793af95Ssangeeta ASSERT(dst_addr->sa_family == AF_INET || 596c793af95Ssangeeta dst_addr->sa_family == AF_INET6); 597c793af95Ssangeeta 598bd670b35SErik Nordmark bzero(&ixas, sizeof (ixas)); 599bd670b35SErik Nordmark /* 600bd670b35SErik Nordmark * No IPsec, no fragmentation, and don't let any hooks see 601bd670b35SErik Nordmark * the packet. 
602bd670b35SErik Nordmark */ 603bd670b35SErik Nordmark ixas.ixa_flags = IXAF_NO_IPSEC | IXAF_DONTFRAG | IXAF_NO_PFHOOK; 604bd670b35SErik Nordmark ixas.ixa_cred = kcred; 605bd670b35SErik Nordmark ixas.ixa_cpid = NOPID; 606bd670b35SErik Nordmark ixas.ixa_tsl = NULL; 607bd670b35SErik Nordmark ixas.ixa_ipst = ipst; 608bd670b35SErik Nordmark ixas.ixa_ifindex = ifindex; 609bd670b35SErik Nordmark 610c793af95Ssangeeta if (dst_addr->sa_family == AF_INET) { 611bd670b35SErik Nordmark ipha_t *ipha = (ipha_t *)mp->b_rptr; 612bd670b35SErik Nordmark 613bd670b35SErik Nordmark ixas.ixa_flags |= IXAF_IS_IPV4; 614bd670b35SErik Nordmark nexthop = ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr; 615bd670b35SErik Nordmark if (nexthop != ipha->ipha_dst) { 616bd670b35SErik Nordmark ixas.ixa_flags |= IXAF_NEXTHOP_SET; 617bd670b35SErik Nordmark ixas.ixa_nexthop_v4 = nexthop; 618bd670b35SErik Nordmark } 619bd670b35SErik Nordmark ixas.ixa_multicast_ttl = ipha->ipha_ttl; 620c793af95Ssangeeta } else { 621bd670b35SErik Nordmark ip6_t *ip6h = (ip6_t *)mp->b_rptr; 622bd670b35SErik Nordmark in6_addr_t *nexthop6; 623bd670b35SErik Nordmark 624bd670b35SErik Nordmark nexthop6 = &((struct sockaddr_in6 *)dst_addr)->sin6_addr; 625bd670b35SErik Nordmark if (!IN6_ARE_ADDR_EQUAL(nexthop6, &ip6h->ip6_dst)) { 626bd670b35SErik Nordmark ixas.ixa_flags |= IXAF_NEXTHOP_SET; 627bd670b35SErik Nordmark ixas.ixa_nexthop_v6 = *nexthop6; 628c793af95Ssangeeta } 629bd670b35SErik Nordmark ixas.ixa_multicast_ttl = ip6h->ip6_hops; 630c793af95Ssangeeta } 631bd670b35SErik Nordmark error = ip_output_simple(mp, &ixas); 632bd670b35SErik Nordmark ixa_cleanup(&ixas); 633c793af95Ssangeeta 634f4b3ec61Sdh155122 netstack_rele(ns); 635bd670b35SErik Nordmark switch (error) { 636bd670b35SErik Nordmark case 0: 637bd670b35SErik Nordmark break; 638bd670b35SErik Nordmark 639bd670b35SErik Nordmark case EHOSTUNREACH: 640bd670b35SErik Nordmark case ENETUNREACH: 641bd670b35SErik Nordmark error = ENONET; 642bd670b35SErik Nordmark break; 
643bd670b35SErik Nordmark 644bd670b35SErik Nordmark default: 645bd670b35SErik Nordmark error = ECOMM; 646bd670b35SErik Nordmark break; 647c793af95Ssangeeta } 648bd670b35SErik Nordmark return (error); 649edd26dc5Sdr146992 } 650edd26dc5Sdr146992 651c793af95Ssangeeta /* 652c793af95Ssangeeta * callback function provided by ire_ftable_lookup when calling 653c793af95Ssangeeta * rn_match_args(). Invoke ire_match_args on each matching leaf node in 654c793af95Ssangeeta * the radix tree. 655c793af95Ssangeeta */ 656c793af95Ssangeeta boolean_t 657c793af95Ssangeeta ire_find_best_route(struct radix_node *rn, void *arg) 658c793af95Ssangeeta { 659c793af95Ssangeeta struct rt_entry *rt = (struct rt_entry *)rn; 660c793af95Ssangeeta irb_t *irb_ptr; 661c793af95Ssangeeta ire_t *ire; 662c793af95Ssangeeta ire_ftable_args_t *margs = arg; 663c793af95Ssangeeta ipaddr_t match_mask; 664c793af95Ssangeeta 665c793af95Ssangeeta ASSERT(rt != NULL); 666c793af95Ssangeeta 667c793af95Ssangeeta irb_ptr = &rt->rt_irb; 668c793af95Ssangeeta 669c793af95Ssangeeta if (irb_ptr->irb_ire_cnt == 0) 670c793af95Ssangeeta return (B_FALSE); 671c793af95Ssangeeta 672c793af95Ssangeeta rw_enter(&irb_ptr->irb_lock, RW_READER); 673c793af95Ssangeeta for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) { 674bd670b35SErik Nordmark if (IRE_IS_CONDEMNED(ire)) 675c793af95Ssangeeta continue; 676bd670b35SErik Nordmark if (margs->ift_flags & (MATCH_IRE_MASK|MATCH_IRE_SHORTERMASK)) 677c793af95Ssangeeta match_mask = margs->ift_mask; 678c793af95Ssangeeta else 679c793af95Ssangeeta match_mask = ire->ire_mask; 680c793af95Ssangeeta 681c793af95Ssangeeta if (ire_match_args(ire, margs->ift_addr, match_mask, 682bd670b35SErik Nordmark margs->ift_gateway, margs->ift_type, margs->ift_ill, 683bd670b35SErik Nordmark margs->ift_zoneid, margs->ift_tsl, 684bd670b35SErik Nordmark margs->ift_flags)) { 685bd670b35SErik Nordmark ire_refhold(ire); 686c793af95Ssangeeta rw_exit(&irb_ptr->irb_lock); 687c793af95Ssangeeta margs->ift_best_ire = ire; 
688c793af95Ssangeeta return (B_TRUE); 689c793af95Ssangeeta } 690c793af95Ssangeeta } 691c793af95Ssangeeta rw_exit(&irb_ptr->irb_lock); 692c793af95Ssangeeta return (B_FALSE); 693c793af95Ssangeeta } 694c793af95Ssangeeta 695c793af95Ssangeeta /* 696c793af95Ssangeeta * ftable irb_t structures are dynamically allocated, and we need to 697c793af95Ssangeeta * check if the irb_t (and associated ftable tree attachment) needs to 698c793af95Ssangeeta * be cleaned up when the irb_refcnt goes to 0. The conditions that need 699c793af95Ssangeeta * be verified are: 700c793af95Ssangeeta * - no other walkers of the irebucket, i.e., quiescent irb_refcnt, 701c793af95Ssangeeta * - no other threads holding references to ire's in the bucket, 702c793af95Ssangeeta * i.e., irb_nire == 0 703c793af95Ssangeeta * - no active ire's in the bucket, i.e., irb_ire_cnt == 0 704c793af95Ssangeeta * - need to hold the global tree lock and irb_lock in write mode. 705c793af95Ssangeeta */ 706c793af95Ssangeeta void 707c793af95Ssangeeta irb_refrele_ftable(irb_t *irb) 708c793af95Ssangeeta { 709c793af95Ssangeeta for (;;) { 710c793af95Ssangeeta rw_enter(&irb->irb_lock, RW_WRITER); 711c793af95Ssangeeta ASSERT(irb->irb_refcnt != 0); 712c793af95Ssangeeta if (irb->irb_refcnt != 1) { 713c793af95Ssangeeta /* 714c793af95Ssangeeta * Someone has a reference to this radix node 715c793af95Ssangeeta * or there is some bucket walker. 716c793af95Ssangeeta */ 717c793af95Ssangeeta irb->irb_refcnt--; 718c793af95Ssangeeta rw_exit(&irb->irb_lock); 719c793af95Ssangeeta return; 720c793af95Ssangeeta } else { 721c793af95Ssangeeta /* 722c793af95Ssangeeta * There is no other walker, nor is there any 723c793af95Ssangeeta * other thread that holds a direct ref to this 724c793af95Ssangeeta * radix node. Do the clean up if needed. 
Call 725c793af95Ssangeeta * to ire_unlink will clear the IRB_MARK_CONDEMNED flag 726c793af95Ssangeeta */ 727c793af95Ssangeeta if (irb->irb_marks & IRB_MARK_CONDEMNED) { 728c793af95Ssangeeta ire_t *ire_list; 729c793af95Ssangeeta 730c793af95Ssangeeta ire_list = ire_unlink(irb); 731c793af95Ssangeeta rw_exit(&irb->irb_lock); 732c793af95Ssangeeta 733c793af95Ssangeeta if (ire_list != NULL) 734c793af95Ssangeeta ire_cleanup(ire_list); 735c793af95Ssangeeta /* 736c793af95Ssangeeta * more CONDEMNED entries could have 737c793af95Ssangeeta * been added while we dropped the lock, 738c793af95Ssangeeta * so we have to re-check. 739c793af95Ssangeeta */ 740c793af95Ssangeeta continue; 741c793af95Ssangeeta } 742c793af95Ssangeeta 743c793af95Ssangeeta /* 744c793af95Ssangeeta * Now check if there are still any ires 745c793af95Ssangeeta * associated with this radix node. 746c793af95Ssangeeta */ 747c793af95Ssangeeta if (irb->irb_nire != 0) { 748c793af95Ssangeeta /* 749c793af95Ssangeeta * someone is still holding on 750c793af95Ssangeeta * to ires in this bucket 751c793af95Ssangeeta */ 752c793af95Ssangeeta irb->irb_refcnt--; 753c793af95Ssangeeta rw_exit(&irb->irb_lock); 754c793af95Ssangeeta return; 755c793af95Ssangeeta } else { 756c793af95Ssangeeta /* 757c793af95Ssangeeta * Everything is clear. Zero walkers, 758c793af95Ssangeeta * Zero threads with a ref to this 759c793af95Ssangeeta * radix node, Zero ires associated with 760c793af95Ssangeeta * this radix node. Due to lock order, 761c793af95Ssangeeta * check the above conditions again 762c793af95Ssangeeta * after grabbing all locks in the right order 763c793af95Ssangeeta */ 764c793af95Ssangeeta rw_exit(&irb->irb_lock); 765c793af95Ssangeeta if (irb_inactive(irb)) 766c793af95Ssangeeta return; 767c793af95Ssangeeta /* 768c793af95Ssangeeta * irb_inactive could not free the irb. 769c793af95Ssangeeta * See if there are any walkers, if not 770c793af95Ssangeeta * try to clean up again. 
771c793af95Ssangeeta */ 772c793af95Ssangeeta } 773c793af95Ssangeeta } 774c793af95Ssangeeta } 775c793af95Ssangeeta } 776c793af95Ssangeeta 777c793af95Ssangeeta /* 778bd670b35SErik Nordmark * IRE iterator used by ire_ftable_lookup to process multiple equal 779bd670b35SErik Nordmark * routes. Given a starting point in the hash list (hash), walk the IREs 780bd670b35SErik Nordmark * in the bucket skipping deleted entries. We treat the bucket as a circular 781bd670b35SErik Nordmark * list for the purposes of walking it. 782bd670b35SErik Nordmark * Returns the IRE (held) that corresponds to the hash value. If that IRE is 783bd670b35SErik Nordmark * not applicable (ire_match_args failed) then it returns a subsequent one. 784bd670b35SErik Nordmark * If we fail to find an IRE we return NULL. 785c793af95Ssangeeta * 786bd670b35SErik Nordmark * Assumes that the caller holds a reference on the IRE bucket and a read lock 787bd670b35SErik Nordmark * on the radix_node_head (for IPv4) or the ip6_ire_head (for IPv6). 788bd670b35SErik Nordmark * 789bd670b35SErik Nordmark * Applies to IPv4 and IPv6. 790bd670b35SErik Nordmark * 791bd670b35SErik Nordmark * For CGTP, where an IRE_BROADCAST and IRE_HOST can exist for the same 792bd670b35SErik Nordmark * address and bucket, we compare against ire_type for the orig_ire. We also 793bd670b35SErik Nordmark * have IRE_BROADCASTs with and without RTF_MULTIRT, with the former being 794*188e1664SErik Nordmark * first in the bucket. Thus we compare that RTF_MULTIRT match the orig_ire. 795bd670b35SErik Nordmark * 796bd670b35SErik Nordmark * Due to shared-IP zones we check that an IRE_OFFLINK has a gateway that is 797bd670b35SErik Nordmark * reachable from the zone i.e., that the ire_gateway_addr is in a subnet 798bd670b35SErik Nordmark * in which the zone has an IP address. We check this for the global zone 799bd670b35SErik Nordmark * even if no shared-IP zones are configured. 
800c793af95Ssangeeta */ 801c793af95Ssangeeta ire_t * 802bd670b35SErik Nordmark ire_round_robin(irb_t *irb_ptr, ire_ftable_args_t *margs, uint_t hash, 803bd670b35SErik Nordmark ire_t *orig_ire, ip_stack_t *ipst) 804c793af95Ssangeeta { 805c793af95Ssangeeta ire_t *ire, *maybe_ire = NULL; 806bd670b35SErik Nordmark uint_t maybe_badcnt; 807bd670b35SErik Nordmark uint_t maxwalk; 808bd670b35SErik Nordmark 809bd670b35SErik Nordmark /* Fold in more bits from the hint/hash */ 810bd670b35SErik Nordmark hash = hash ^ (hash >> 8) ^ (hash >> 16); 811c793af95Ssangeeta 812c793af95Ssangeeta rw_enter(&irb_ptr->irb_lock, RW_WRITER); 813bd670b35SErik Nordmark maxwalk = irb_ptr->irb_ire_cnt; /* Excludes condemned */ 814bd670b35SErik Nordmark hash %= maxwalk; 815bd670b35SErik Nordmark irb_refhold_locked(irb_ptr); 816c793af95Ssangeeta rw_exit(&irb_ptr->irb_lock); 817c793af95Ssangeeta 818c793af95Ssangeeta /* 819c793af95Ssangeeta * Round-robin the routers list looking for a route that 820c793af95Ssangeeta * matches the passed in parameters. 821bd670b35SErik Nordmark * First we skip "hash" number of non-condemned IREs. 822bd670b35SErik Nordmark * Then we match the IRE. 823bd670b35SErik Nordmark * If we find an ire which has a non-zero ire_badcnt then we remember 824bd670b35SErik Nordmark * it and keep on looking for a lower ire_badcnt. 825bd670b35SErik Nordmark * If we come to the end of the list we continue (treat the 826bd670b35SErik Nordmark * bucket list as a circular list) but we match less than "max" 827bd670b35SErik Nordmark * entries. 
828c793af95Ssangeeta */ 829bd670b35SErik Nordmark ire = irb_ptr->irb_ire; 830bd670b35SErik Nordmark while (maxwalk > 0) { 831bd670b35SErik Nordmark if (IRE_IS_CONDEMNED(ire)) 832bd670b35SErik Nordmark goto next_ire_skip; 833c793af95Ssangeeta 834bd670b35SErik Nordmark /* Skip the first "hash" entries to do ECMP */ 835bd670b35SErik Nordmark if (hash != 0) { 836bd670b35SErik Nordmark hash--; 837bd670b35SErik Nordmark goto next_ire_skip; 838bd670b35SErik Nordmark } 839bd670b35SErik Nordmark 840bd670b35SErik Nordmark /* See CGTP comment above */ 841bd670b35SErik Nordmark if (ire->ire_type != orig_ire->ire_type || 842*188e1664SErik Nordmark ((ire->ire_flags ^ orig_ire->ire_flags) & RTF_MULTIRT) != 0) 843c793af95Ssangeeta goto next_ire; 844c793af95Ssangeeta 845c793af95Ssangeeta /* 846bd670b35SErik Nordmark * Note: Since IPv6 has hash buckets instead of radix 847bd670b35SErik Nordmark * buckers we need to explicitly compare the addresses. 848bd670b35SErik Nordmark * That makes this less efficient since we will be called 849bd670b35SErik Nordmark * even if there is no alternatives just because the 850bd670b35SErik Nordmark * bucket has multiple IREs for different addresses. 851c793af95Ssangeeta */ 852bd670b35SErik Nordmark if (ire->ire_ipversion == IPV6_VERSION) { 853bd670b35SErik Nordmark if (!IN6_ARE_ADDR_EQUAL(&orig_ire->ire_addr_v6, 854bd670b35SErik Nordmark &ire->ire_addr_v6)) 855c793af95Ssangeeta goto next_ire; 856c793af95Ssangeeta } 857c793af95Ssangeeta 858c793af95Ssangeeta /* 859bd670b35SErik Nordmark * For some reason find_best_route uses ire_mask. We do 860bd670b35SErik Nordmark * the same. 861bd670b35SErik Nordmark */ 862bd670b35SErik Nordmark if (ire->ire_ipversion == IPV4_VERSION ? 
863bd670b35SErik Nordmark !ire_match_args(ire, margs->ift_addr, 864bd670b35SErik Nordmark ire->ire_mask, margs->ift_gateway, 865bd670b35SErik Nordmark margs->ift_type, margs->ift_ill, margs->ift_zoneid, 866bd670b35SErik Nordmark margs->ift_tsl, margs->ift_flags) : 867bd670b35SErik Nordmark !ire_match_args_v6(ire, &margs->ift_addr_v6, 868bd670b35SErik Nordmark &ire->ire_mask_v6, &margs->ift_gateway_v6, 869bd670b35SErik Nordmark margs->ift_type, margs->ift_ill, margs->ift_zoneid, 870bd670b35SErik Nordmark margs->ift_tsl, margs->ift_flags)) 871bd670b35SErik Nordmark goto next_ire; 872bd670b35SErik Nordmark 873bd670b35SErik Nordmark if (margs->ift_zoneid != ALL_ZONES && 874bd670b35SErik Nordmark (ire->ire_type & IRE_OFFLINK)) { 875bd670b35SErik Nordmark /* 876bd670b35SErik Nordmark * When we're in a zone, we're only 877c793af95Ssangeeta * interested in routers that are 878c793af95Ssangeeta * reachable through ipifs within our zone. 879c793af95Ssangeeta */ 880bd670b35SErik Nordmark if (ire->ire_ipversion == IPV4_VERSION) { 881bd670b35SErik Nordmark if (!ire_gateway_ok_zone_v4( 882bd670b35SErik Nordmark ire->ire_gateway_addr, margs->ift_zoneid, 883bd670b35SErik Nordmark ire->ire_ill, margs->ift_tsl, ipst, 884bd670b35SErik Nordmark B_TRUE)) 885bd670b35SErik Nordmark goto next_ire; 886bd670b35SErik Nordmark } else { 887bd670b35SErik Nordmark if (!ire_gateway_ok_zone_v6( 888bd670b35SErik Nordmark &ire->ire_gateway_addr_v6, 889bd670b35SErik Nordmark margs->ift_zoneid, ire->ire_ill, 890bd670b35SErik Nordmark margs->ift_tsl, ipst, B_TRUE)) 891bd670b35SErik Nordmark goto next_ire; 892bd670b35SErik Nordmark } 893bd670b35SErik Nordmark } 894bd670b35SErik Nordmark mutex_enter(&ire->ire_lock); 895bd670b35SErik Nordmark /* Look for stale ire_badcnt and clear */ 896bd670b35SErik Nordmark if (ire->ire_badcnt != 0 && 897d3d50737SRafael Vanoni (TICK_TO_SEC(ddi_get_lbolt64()) - ire->ire_last_badcnt > 898bd670b35SErik Nordmark ipst->ips_ip_ire_badcnt_lifetime)) 899bd670b35SErik Nordmark 
ire->ire_badcnt = 0; 900bd670b35SErik Nordmark mutex_exit(&ire->ire_lock); 901e11c3f44Smeem 902bd670b35SErik Nordmark if (ire->ire_badcnt == 0) { 903bd670b35SErik Nordmark /* We found one with a zero badcnt; done */ 904bd670b35SErik Nordmark ire_refhold(ire); 905bd670b35SErik Nordmark /* 906bd670b35SErik Nordmark * Care needed since irb_refrele grabs WLOCK to free 907bd670b35SErik Nordmark * the irb_t. 908bd670b35SErik Nordmark */ 909bd670b35SErik Nordmark if (ire->ire_ipversion == IPV4_VERSION) { 910bd670b35SErik Nordmark RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 911bd670b35SErik Nordmark irb_refrele(irb_ptr); 912bd670b35SErik Nordmark RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); 913bd670b35SErik Nordmark } else { 914bd670b35SErik Nordmark rw_exit(&ipst->ips_ip6_ire_head_lock); 915bd670b35SErik Nordmark irb_refrele(irb_ptr); 916bd670b35SErik Nordmark rw_enter(&ipst->ips_ip6_ire_head_lock, 917bd670b35SErik Nordmark RW_READER); 918bd670b35SErik Nordmark } 919c793af95Ssangeeta return (ire); 920c793af95Ssangeeta } 921bd670b35SErik Nordmark /* 922bd670b35SErik Nordmark * keep looking to see if there is a better (lower 923bd670b35SErik Nordmark * badcnt) matching IRE, but save this one as a last resort. 924bd670b35SErik Nordmark * If we find a lower badcnt pick that one as the last* resort. 
925bd670b35SErik Nordmark */ 926bd670b35SErik Nordmark if (maybe_ire == NULL) { 927bd670b35SErik Nordmark maybe_ire = ire; 928bd670b35SErik Nordmark maybe_badcnt = ire->ire_badcnt; 929bd670b35SErik Nordmark } else if (ire->ire_badcnt < maybe_badcnt) { 930bd670b35SErik Nordmark maybe_ire = ire; 931bd670b35SErik Nordmark maybe_badcnt = ire->ire_badcnt; 932bd670b35SErik Nordmark } 933bd670b35SErik Nordmark 934c793af95Ssangeeta next_ire: 935bd670b35SErik Nordmark maxwalk--; 936bd670b35SErik Nordmark next_ire_skip: 937bd670b35SErik Nordmark ire = ire->ire_next; 938bd670b35SErik Nordmark if (ire == NULL) 939bd670b35SErik Nordmark ire = irb_ptr->irb_ire; 940c793af95Ssangeeta } 941c793af95Ssangeeta if (maybe_ire != NULL) 942bd670b35SErik Nordmark ire_refhold(maybe_ire); 943bd670b35SErik Nordmark 944bd670b35SErik Nordmark /* Care needed since irb_refrele grabs WLOCK to free the irb_t. */ 945bd670b35SErik Nordmark if (ire->ire_ipversion == IPV4_VERSION) { 946bd670b35SErik Nordmark RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 947bd670b35SErik Nordmark irb_refrele(irb_ptr); 948bd670b35SErik Nordmark RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); 949bd670b35SErik Nordmark } else { 950bd670b35SErik Nordmark rw_exit(&ipst->ips_ip6_ire_head_lock); 951bd670b35SErik Nordmark irb_refrele(irb_ptr); 952bd670b35SErik Nordmark rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER); 953bd670b35SErik Nordmark } 954c793af95Ssangeeta return (maybe_ire); 955c793af95Ssangeeta } 9562679e103Ssowmini 9572679e103Ssowmini void 9582679e103Ssowmini irb_refhold_rn(struct radix_node *rn) 9592679e103Ssowmini { 9602679e103Ssowmini if ((rn->rn_flags & RNF_ROOT) == 0) 961bd670b35SErik Nordmark irb_refhold(&((rt_t *)(rn))->rt_irb); 9622679e103Ssowmini } 9632679e103Ssowmini 9642679e103Ssowmini void 9652679e103Ssowmini irb_refrele_rn(struct radix_node *rn) 9662679e103Ssowmini { 9672679e103Ssowmini if ((rn->rn_flags & RNF_ROOT) == 0) 9682679e103Ssowmini irb_refrele_ftable(&((rt_t *)(rn))->rt_irb); 9692679e103Ssowmini 
} 970bd670b35SErik Nordmark 971bd670b35SErik Nordmark /* 972bd670b35SErik Nordmark * Select a route for IPv4 and IPv6. Except for multicast, loopback and reject 973bd670b35SErik Nordmark * routes this routine sets up a ire_nce_cache as well. The caller needs to 974bd670b35SErik Nordmark * lookup an nce for the multicast case. 975bd670b35SErik Nordmark */ 976bd670b35SErik Nordmark ire_t * 977bd670b35SErik Nordmark ip_select_route(const in6_addr_t *v6dst, ip_xmit_attr_t *ixa, 978bd670b35SErik Nordmark uint_t *generationp, in6_addr_t *setsrcp, int *errorp, boolean_t *multirtp) 979bd670b35SErik Nordmark { 980bd670b35SErik Nordmark uint_t match_args; 981bd670b35SErik Nordmark uint_t ire_type; 982bd670b35SErik Nordmark ill_t *ill; 983bd670b35SErik Nordmark ire_t *ire; 984bd670b35SErik Nordmark ip_stack_t *ipst = ixa->ixa_ipst; 985bd670b35SErik Nordmark ipaddr_t v4dst; 986bd670b35SErik Nordmark in6_addr_t v6nexthop; 987bd670b35SErik Nordmark iaflags_t ixaflags = ixa->ixa_flags; 988bd670b35SErik Nordmark nce_t *nce; 989bd670b35SErik Nordmark 990bd670b35SErik Nordmark match_args = MATCH_IRE_SECATTR; 991bd670b35SErik Nordmark IN6_V4MAPPED_TO_IPADDR(v6dst, v4dst); 992bd670b35SErik Nordmark if (setsrcp != NULL) 993bd670b35SErik Nordmark ASSERT(IN6_IS_ADDR_UNSPECIFIED(setsrcp)); 994bd670b35SErik Nordmark if (errorp != NULL) 995bd670b35SErik Nordmark ASSERT(*errorp == 0); 996bd670b35SErik Nordmark 997bd670b35SErik Nordmark /* 998bd670b35SErik Nordmark * The content of the ixa will be different if IP_NEXTHOP, 999bd670b35SErik Nordmark * SO_DONTROUTE, IP_BOUND_IF, IP_PKTINFO etc are set 1000bd670b35SErik Nordmark */ 1001bd670b35SErik Nordmark 1002bd670b35SErik Nordmark if ((ixaflags & IXAF_IS_IPV4) ? 
CLASSD(v4dst) : 1003bd670b35SErik Nordmark IN6_IS_ADDR_MULTICAST(v6dst)) { 1004bd670b35SErik Nordmark /* Pick up the IRE_MULTICAST for the ill */ 1005bd670b35SErik Nordmark if (ixa->ixa_multicast_ifindex != 0) { 1006bd670b35SErik Nordmark ill = ill_lookup_on_ifindex(ixa->ixa_multicast_ifindex, 1007bd670b35SErik Nordmark !(ixaflags & IXAF_IS_IPV4), ipst); 1008bd670b35SErik Nordmark } else if (ixaflags & IXAF_SCOPEID_SET) { 1009bd670b35SErik Nordmark /* sin6_scope_id takes precedence over ixa_ifindex */ 1010bd670b35SErik Nordmark ASSERT(ixa->ixa_scopeid != 0); 1011bd670b35SErik Nordmark ill = ill_lookup_on_ifindex(ixa->ixa_scopeid, 1012bd670b35SErik Nordmark !(ixaflags & IXAF_IS_IPV4), ipst); 1013bd670b35SErik Nordmark } else if (ixa->ixa_ifindex != 0) { 1014bd670b35SErik Nordmark /* 1015bd670b35SErik Nordmark * In the ipmp case, the ixa_ifindex is set to 1016bd670b35SErik Nordmark * point at an under_ill and we would return the 1017bd670b35SErik Nordmark * ire_multicast() corresponding to that under_ill. 
1018bd670b35SErik Nordmark */ 1019bd670b35SErik Nordmark ill = ill_lookup_on_ifindex(ixa->ixa_ifindex, 1020bd670b35SErik Nordmark !(ixaflags & IXAF_IS_IPV4), ipst); 1021bd670b35SErik Nordmark } else if (ixaflags & IXAF_IS_IPV4) { 1022bd670b35SErik Nordmark ipaddr_t v4setsrc = INADDR_ANY; 1023bd670b35SErik Nordmark 1024bd670b35SErik Nordmark ill = ill_lookup_group_v4(v4dst, ixa->ixa_zoneid, ipst, 1025bd670b35SErik Nordmark multirtp, &v4setsrc); 1026bd670b35SErik Nordmark if (setsrcp != NULL) 1027bd670b35SErik Nordmark IN6_IPADDR_TO_V4MAPPED(v4setsrc, setsrcp); 1028bd670b35SErik Nordmark } else { 1029bd670b35SErik Nordmark ill = ill_lookup_group_v6(v6dst, ixa->ixa_zoneid, ipst, 1030bd670b35SErik Nordmark multirtp, setsrcp); 1031bd670b35SErik Nordmark } 1032bd670b35SErik Nordmark if (ill != NULL && IS_VNI(ill)) { 1033bd670b35SErik Nordmark ill_refrele(ill); 1034bd670b35SErik Nordmark ill = NULL; 1035bd670b35SErik Nordmark } 1036bd670b35SErik Nordmark if (ill == NULL) { 1037bd670b35SErik Nordmark if (errorp != NULL) 1038bd670b35SErik Nordmark *errorp = ENXIO; 1039bd670b35SErik Nordmark /* Get a hold on the IRE_NOROUTE */ 1040bd670b35SErik Nordmark ire = ire_reject(ipst, !(ixaflags & IXAF_IS_IPV4)); 1041bd670b35SErik Nordmark return (ire); 1042bd670b35SErik Nordmark } 1043bd670b35SErik Nordmark if (!(ill->ill_flags & ILLF_MULTICAST)) { 1044bd670b35SErik Nordmark ill_refrele(ill); 1045bd670b35SErik Nordmark if (errorp != NULL) 1046bd670b35SErik Nordmark *errorp = EHOSTUNREACH; 1047bd670b35SErik Nordmark /* Get a hold on the IRE_NOROUTE */ 1048bd670b35SErik Nordmark ire = ire_reject(ipst, !(ixaflags & IXAF_IS_IPV4)); 1049bd670b35SErik Nordmark return (ire); 1050bd670b35SErik Nordmark } 1051bd670b35SErik Nordmark /* Get a refcnt on the single IRE_MULTICAST per ill */ 1052bd670b35SErik Nordmark ire = ire_multicast(ill); 1053bd670b35SErik Nordmark ill_refrele(ill); 1054bd670b35SErik Nordmark if (generationp != NULL) 1055bd670b35SErik Nordmark *generationp = 
ire->ire_generation; 1056bd670b35SErik Nordmark if (errorp != NULL && 1057bd670b35SErik Nordmark (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) { 1058bd670b35SErik Nordmark *errorp = EHOSTUNREACH; 1059bd670b35SErik Nordmark } 1060bd670b35SErik Nordmark return (ire); 1061bd670b35SErik Nordmark } 1062bd670b35SErik Nordmark 1063bd670b35SErik Nordmark if (ixa->ixa_ifindex != 0 || (ixaflags & IXAF_SCOPEID_SET)) { 1064bd670b35SErik Nordmark if (ixaflags & IXAF_SCOPEID_SET) { 1065bd670b35SErik Nordmark /* sin6_scope_id takes precedence over ixa_ifindex */ 1066bd670b35SErik Nordmark ASSERT(ixa->ixa_scopeid != 0); 1067bd670b35SErik Nordmark ill = ill_lookup_on_ifindex(ixa->ixa_scopeid, 1068bd670b35SErik Nordmark !(ixaflags & IXAF_IS_IPV4), ipst); 1069bd670b35SErik Nordmark } else { 1070bd670b35SErik Nordmark ASSERT(ixa->ixa_ifindex != 0); 1071bd670b35SErik Nordmark ill = ill_lookup_on_ifindex(ixa->ixa_ifindex, 1072bd670b35SErik Nordmark !(ixaflags & IXAF_IS_IPV4), ipst); 1073bd670b35SErik Nordmark } 1074bd670b35SErik Nordmark if (ill != NULL && IS_VNI(ill)) { 1075bd670b35SErik Nordmark ill_refrele(ill); 1076bd670b35SErik Nordmark ill = NULL; 1077bd670b35SErik Nordmark } 1078bd670b35SErik Nordmark if (ill == NULL) { 1079bd670b35SErik Nordmark if (errorp != NULL) 1080bd670b35SErik Nordmark *errorp = ENXIO; 1081bd670b35SErik Nordmark /* Get a hold on the IRE_NOROUTE */ 1082bd670b35SErik Nordmark ire = ire_reject(ipst, !(ixaflags & IXAF_IS_IPV4)); 1083bd670b35SErik Nordmark return (ire); 1084bd670b35SErik Nordmark } 1085bd670b35SErik Nordmark /* 1086bd670b35SErik Nordmark * icmp_send_reply_v6 uses scopeid, and mpathd sets IP*_BOUND_IF 1087bd670b35SErik Nordmark * so for both of them we need to be able look for an under 1088bd670b35SErik Nordmark * interface. 
1089bd670b35SErik Nordmark */ 1090bd670b35SErik Nordmark if (IS_UNDER_IPMP(ill)) 1091bd670b35SErik Nordmark match_args |= MATCH_IRE_TESTHIDDEN; 1092bd670b35SErik Nordmark } else { 1093bd670b35SErik Nordmark ill = NULL; 1094bd670b35SErik Nordmark } 1095bd670b35SErik Nordmark 1096bd670b35SErik Nordmark if (ixaflags & IXAF_NEXTHOP_SET) { 1097bd670b35SErik Nordmark /* IP_NEXTHOP was set */ 1098bd670b35SErik Nordmark v6nexthop = ixa->ixa_nexthop_v6; 1099bd670b35SErik Nordmark } else { 1100bd670b35SErik Nordmark v6nexthop = *v6dst; 1101bd670b35SErik Nordmark } 1102bd670b35SErik Nordmark 1103bd670b35SErik Nordmark ire_type = 0; 1104bd670b35SErik Nordmark /* If ill is null then ire_route_recursive will set MATCH_IRE_ILL */ 1105bd670b35SErik Nordmark 1106bd670b35SErik Nordmark /* 1107bd670b35SErik Nordmark * If SO_DONTROUTE is set or if IP_NEXTHOP is set, then 1108bd670b35SErik Nordmark * we only look for an onlink IRE. 1109bd670b35SErik Nordmark */ 1110bd670b35SErik Nordmark if (ixaflags & (IXAF_DONTROUTE|IXAF_NEXTHOP_SET)) { 1111bd670b35SErik Nordmark match_args |= MATCH_IRE_TYPE; 1112bd670b35SErik Nordmark ire_type = IRE_ONLINK; 1113bd670b35SErik Nordmark } 1114bd670b35SErik Nordmark 1115bd670b35SErik Nordmark if (ixaflags & IXAF_IS_IPV4) { 1116bd670b35SErik Nordmark ipaddr_t v4nexthop; 1117bd670b35SErik Nordmark ipaddr_t v4setsrc = INADDR_ANY; 1118bd670b35SErik Nordmark 1119bd670b35SErik Nordmark IN6_V4MAPPED_TO_IPADDR(&v6nexthop, v4nexthop); 1120bd670b35SErik Nordmark ire = ire_route_recursive_v4(v4nexthop, ire_type, ill, 1121bd670b35SErik Nordmark ixa->ixa_zoneid, ixa->ixa_tsl, match_args, B_TRUE, 1122bd670b35SErik Nordmark ixa->ixa_xmit_hint, ipst, &v4setsrc, NULL, generationp); 1123bd670b35SErik Nordmark if (setsrcp != NULL) 1124bd670b35SErik Nordmark IN6_IPADDR_TO_V4MAPPED(v4setsrc, setsrcp); 1125bd670b35SErik Nordmark } else { 1126bd670b35SErik Nordmark ire = ire_route_recursive_v6(&v6nexthop, ire_type, ill, 1127bd670b35SErik Nordmark ixa->ixa_zoneid, 
ixa->ixa_tsl, match_args, B_TRUE, 1128bd670b35SErik Nordmark ixa->ixa_xmit_hint, ipst, setsrcp, NULL, generationp); 1129bd670b35SErik Nordmark } 1130bd670b35SErik Nordmark 1131bd670b35SErik Nordmark #ifdef DEBUG 1132bd670b35SErik Nordmark if (match_args & MATCH_IRE_TESTHIDDEN) { 1133bd670b35SErik Nordmark ip3dbg(("looking for hidden; dst %x ire %p\n", 1134bd670b35SErik Nordmark v4dst, (void *)ire)); 1135bd670b35SErik Nordmark } 1136bd670b35SErik Nordmark #endif 1137bd670b35SErik Nordmark 1138bd670b35SErik Nordmark if (ill != NULL) 1139bd670b35SErik Nordmark ill_refrele(ill); 1140bd670b35SErik Nordmark 1141bd670b35SErik Nordmark if ((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) || 1142bd670b35SErik Nordmark (ire->ire_type & IRE_MULTICAST)) { 1143bd670b35SErik Nordmark /* No ire_nce_cache */ 1144bd670b35SErik Nordmark return (ire); 1145bd670b35SErik Nordmark } 1146bd670b35SErik Nordmark 1147bd670b35SErik Nordmark /* Setup ire_nce_cache if it doesn't exist or is condemned. */ 1148bd670b35SErik Nordmark mutex_enter(&ire->ire_lock); 1149bd670b35SErik Nordmark nce = ire->ire_nce_cache; 1150bd670b35SErik Nordmark if (nce == NULL || nce->nce_is_condemned) { 1151bd670b35SErik Nordmark mutex_exit(&ire->ire_lock); 1152bd670b35SErik Nordmark (void) ire_revalidate_nce(ire); 1153bd670b35SErik Nordmark } else { 1154bd670b35SErik Nordmark mutex_exit(&ire->ire_lock); 1155bd670b35SErik Nordmark } 1156bd670b35SErik Nordmark return (ire); 1157bd670b35SErik Nordmark } 1158bd670b35SErik Nordmark 1159bd670b35SErik Nordmark /* 1160bd670b35SErik Nordmark * Find a route given some xmit attributes and a packet. 1161bd670b35SErik Nordmark * Generic for IPv4 and IPv6 1162bd670b35SErik Nordmark * 1163bd670b35SErik Nordmark * This never returns NULL. But when it returns the IRE_NOROUTE 1164bd670b35SErik Nordmark * it might set errorp. 
1165bd670b35SErik Nordmark */ 1166bd670b35SErik Nordmark ire_t * 1167bd670b35SErik Nordmark ip_select_route_pkt(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp, 1168bd670b35SErik Nordmark int *errorp, boolean_t *multirtp) 1169bd670b35SErik Nordmark { 1170bd670b35SErik Nordmark if (ixa->ixa_flags & IXAF_IS_IPV4) { 1171bd670b35SErik Nordmark ipha_t *ipha = (ipha_t *)mp->b_rptr; 1172bd670b35SErik Nordmark in6_addr_t v6dst; 1173bd670b35SErik Nordmark 1174bd670b35SErik Nordmark IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst); 1175bd670b35SErik Nordmark 1176bd670b35SErik Nordmark return (ip_select_route(&v6dst, ixa, generationp, 1177bd670b35SErik Nordmark NULL, errorp, multirtp)); 1178bd670b35SErik Nordmark } else { 1179bd670b35SErik Nordmark ip6_t *ip6h = (ip6_t *)mp->b_rptr; 1180bd670b35SErik Nordmark 1181bd670b35SErik Nordmark return (ip_select_route(&ip6h->ip6_dst, ixa, generationp, 1182bd670b35SErik Nordmark NULL, errorp, multirtp)); 1183bd670b35SErik Nordmark } 1184bd670b35SErik Nordmark } 1185bd670b35SErik Nordmark 1186bd670b35SErik Nordmark ire_t * 1187bd670b35SErik Nordmark ip_select_route_v4(ipaddr_t dst, ip_xmit_attr_t *ixa, uint_t *generationp, 1188bd670b35SErik Nordmark ipaddr_t *v4setsrcp, int *errorp, boolean_t *multirtp) 1189bd670b35SErik Nordmark { 1190bd670b35SErik Nordmark in6_addr_t v6dst; 1191bd670b35SErik Nordmark ire_t *ire; 1192bd670b35SErik Nordmark in6_addr_t setsrc; 1193bd670b35SErik Nordmark 1194bd670b35SErik Nordmark ASSERT(ixa->ixa_flags & IXAF_IS_IPV4); 1195bd670b35SErik Nordmark 1196bd670b35SErik Nordmark IN6_IPADDR_TO_V4MAPPED(dst, &v6dst); 1197bd670b35SErik Nordmark 1198bd670b35SErik Nordmark setsrc = ipv6_all_zeros; 1199bd670b35SErik Nordmark ire = ip_select_route(&v6dst, ixa, generationp, &setsrc, errorp, 1200bd670b35SErik Nordmark multirtp); 1201bd670b35SErik Nordmark if (v4setsrcp != NULL) 1202bd670b35SErik Nordmark IN6_V4MAPPED_TO_IPADDR(&setsrc, *v4setsrcp); 1203bd670b35SErik Nordmark return (ire); 1204bd670b35SErik Nordmark } 
1205bd670b35SErik Nordmark 1206bd670b35SErik Nordmark /* 1207bd670b35SErik Nordmark * Recursively look for a route to the destination. Can also match on 1208bd670b35SErik Nordmark * the zoneid, ill, and label. Used for the data paths. See also 1209bd670b35SErik Nordmark * ire_route_recursive. 1210bd670b35SErik Nordmark * 1211bd670b35SErik Nordmark * If ill is set this means we will match it by adding MATCH_IRE_ILL. 1212bd670b35SErik Nordmark * 1213bd670b35SErik Nordmark * Note that this function never returns NULL. It returns an IRE_NOROUTE 1214bd670b35SErik Nordmark * instead. 1215bd670b35SErik Nordmark * 1216bd670b35SErik Nordmark * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it 1217bd670b35SErik Nordmark * is an error. 1218bd670b35SErik Nordmark * Allow at most one RTF_INDIRECT. 1219bd670b35SErik Nordmark */ 1220bd670b35SErik Nordmark ire_t * 1221bd670b35SErik Nordmark ire_route_recursive_impl_v4(ire_t *ire, 1222bd670b35SErik Nordmark ipaddr_t nexthop, uint_t ire_type, const ill_t *ill_arg, 1223bd670b35SErik Nordmark zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args, 1224bd670b35SErik Nordmark boolean_t allocate, uint32_t xmit_hint, ip_stack_t *ipst, ipaddr_t *setsrcp, 1225bd670b35SErik Nordmark tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp) 1226bd670b35SErik Nordmark { 1227bd670b35SErik Nordmark int i, j; 1228bd670b35SErik Nordmark ire_t *ires[MAX_IRE_RECURSION]; 1229bd670b35SErik Nordmark uint_t generation; 1230bd670b35SErik Nordmark uint_t generations[MAX_IRE_RECURSION]; 1231bd670b35SErik Nordmark boolean_t need_refrele = B_FALSE; 1232bd670b35SErik Nordmark boolean_t invalidate = B_FALSE; 1233bd670b35SErik Nordmark int prefs[MAX_IRE_RECURSION]; 1234bd670b35SErik Nordmark ill_t *ill = NULL; 1235bd670b35SErik Nordmark 1236bd670b35SErik Nordmark if (setsrcp != NULL) 1237bd670b35SErik Nordmark ASSERT(*setsrcp == INADDR_ANY); 1238bd670b35SErik Nordmark if (gwattrp != NULL) 1239bd670b35SErik Nordmark ASSERT(*gwattrp == NULL); 
1240bd670b35SErik Nordmark 1241bd670b35SErik Nordmark if (ill_arg != NULL) 1242bd670b35SErik Nordmark match_args |= MATCH_IRE_ILL; 1243bd670b35SErik Nordmark 1244bd670b35SErik Nordmark /* 1245bd670b35SErik Nordmark * We iterate up to three times to resolve a route, even though 1246bd670b35SErik Nordmark * we have four slots in the array. The extra slot is for an 1247bd670b35SErik Nordmark * IRE_IF_CLONE we might need to create. 1248bd670b35SErik Nordmark */ 1249bd670b35SErik Nordmark i = 0; 1250bd670b35SErik Nordmark while (i < MAX_IRE_RECURSION - 1) { 1251bd670b35SErik Nordmark /* ire_ftable_lookup handles round-robin/ECMP */ 1252bd670b35SErik Nordmark if (ire == NULL) { 1253bd670b35SErik Nordmark ire = ire_ftable_lookup_v4(nexthop, 0, 0, ire_type, 1254bd670b35SErik Nordmark (ill_arg != NULL ? ill_arg : ill), zoneid, tsl, 1255bd670b35SErik Nordmark match_args, xmit_hint, ipst, &generation); 1256bd670b35SErik Nordmark } else { 1257bd670b35SErik Nordmark /* Caller passed it; extra hold since we will rele */ 1258bd670b35SErik Nordmark ire_refhold(ire); 1259bd670b35SErik Nordmark if (generationp != NULL) 1260bd670b35SErik Nordmark generation = *generationp; 1261bd670b35SErik Nordmark else 1262bd670b35SErik Nordmark generation = IRE_GENERATION_VERIFY; 1263bd670b35SErik Nordmark } 1264bd670b35SErik Nordmark if (ire == NULL) 1265bd670b35SErik Nordmark ire = ire_reject(ipst, B_FALSE); 1266bd670b35SErik Nordmark 1267bd670b35SErik Nordmark /* Need to return the ire with RTF_REJECT|BLACKHOLE */ 1268bd670b35SErik Nordmark if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) 1269bd670b35SErik Nordmark goto error; 1270bd670b35SErik Nordmark 1271bd670b35SErik Nordmark ASSERT(!(ire->ire_type & IRE_MULTICAST)); /* Not in ftable */ 1272bd670b35SErik Nordmark 1273bd670b35SErik Nordmark if (i != 0) { 1274*188e1664SErik Nordmark prefs[i] = ire_pref(ire); 1275bd670b35SErik Nordmark /* 1276bd670b35SErik Nordmark * Don't allow anything unusual past the first 1277bd670b35SErik Nordmark * 
iteration. 1278bd670b35SErik Nordmark */ 1279bd670b35SErik Nordmark if ((ire->ire_type & 1280bd670b35SErik Nordmark (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST)) || 1281bd670b35SErik Nordmark prefs[i] <= prefs[i-1]) { 1282bd670b35SErik Nordmark ire_refrele(ire); 1283bd670b35SErik Nordmark ire = ire_reject(ipst, B_FALSE); 1284bd670b35SErik Nordmark goto error; 1285bd670b35SErik Nordmark } 1286bd670b35SErik Nordmark } 1287bd670b35SErik Nordmark /* We have a usable IRE */ 1288bd670b35SErik Nordmark ires[i] = ire; 1289bd670b35SErik Nordmark generations[i] = generation; 1290bd670b35SErik Nordmark i++; 1291bd670b35SErik Nordmark 1292bd670b35SErik Nordmark /* The first RTF_SETSRC address is passed back if setsrcp */ 1293bd670b35SErik Nordmark if ((ire->ire_flags & RTF_SETSRC) && 1294bd670b35SErik Nordmark setsrcp != NULL && *setsrcp == INADDR_ANY) { 1295bd670b35SErik Nordmark ASSERT(ire->ire_setsrc_addr != INADDR_ANY); 1296bd670b35SErik Nordmark *setsrcp = ire->ire_setsrc_addr; 1297bd670b35SErik Nordmark } 1298bd670b35SErik Nordmark 1299bd670b35SErik Nordmark /* The first ire_gw_secattr is passed back if gwattrp */ 1300bd670b35SErik Nordmark if (ire->ire_gw_secattr != NULL && 1301bd670b35SErik Nordmark gwattrp != NULL && *gwattrp == NULL) 1302bd670b35SErik Nordmark *gwattrp = ire->ire_gw_secattr; 1303bd670b35SErik Nordmark 1304bd670b35SErik Nordmark /* 1305bd670b35SErik Nordmark * Check if we have a short-cut pointer to an IRE for this 1306bd670b35SErik Nordmark * destination, and that the cached dependency isn't stale. 1307bd670b35SErik Nordmark * In that case we've rejoined an existing tree towards a 1308bd670b35SErik Nordmark * parent, thus we don't need to continue the loop to 1309bd670b35SErik Nordmark * discover the rest of the tree. 
1310bd670b35SErik Nordmark */ 1311bd670b35SErik Nordmark mutex_enter(&ire->ire_lock); 1312bd670b35SErik Nordmark if (ire->ire_dep_parent != NULL && 1313bd670b35SErik Nordmark ire->ire_dep_parent->ire_generation == 1314bd670b35SErik Nordmark ire->ire_dep_parent_generation) { 1315bd670b35SErik Nordmark mutex_exit(&ire->ire_lock); 1316bd670b35SErik Nordmark ire = NULL; 1317bd670b35SErik Nordmark goto done; 1318bd670b35SErik Nordmark } 1319bd670b35SErik Nordmark mutex_exit(&ire->ire_lock); 1320bd670b35SErik Nordmark 1321bd670b35SErik Nordmark /* 1322bd670b35SErik Nordmark * If this type should have an ire_nce_cache (even if it 1323bd670b35SErik Nordmark * doesn't yet have one) then we are done. Includes 1324bd670b35SErik Nordmark * IRE_INTERFACE with a full 32 bit mask. 1325bd670b35SErik Nordmark */ 1326bd670b35SErik Nordmark if (ire->ire_nce_capable) { 1327bd670b35SErik Nordmark ire = NULL; 1328bd670b35SErik Nordmark goto done; 1329bd670b35SErik Nordmark } 1330bd670b35SErik Nordmark ASSERT(!(ire->ire_type & IRE_IF_CLONE)); 1331bd670b35SErik Nordmark /* 1332bd670b35SErik Nordmark * For an IRE_INTERFACE we create an IRE_IF_CLONE for this 1333bd670b35SErik Nordmark * particular destination 1334bd670b35SErik Nordmark */ 1335bd670b35SErik Nordmark if (ire->ire_type & IRE_INTERFACE) { 1336bd670b35SErik Nordmark in6_addr_t v6nexthop; 1337bd670b35SErik Nordmark ire_t *clone; 1338bd670b35SErik Nordmark 1339bd670b35SErik Nordmark ASSERT(ire->ire_masklen != IPV4_ABITS); 1340bd670b35SErik Nordmark 1341bd670b35SErik Nordmark /* 1342bd670b35SErik Nordmark * In the case of ip_input and ILLF_FORWARDING not 1343bd670b35SErik Nordmark * being set, and in the case of RTM_GET, 1344bd670b35SErik Nordmark * there is no point in allocating 1345bd670b35SErik Nordmark * an IRE_IF_CLONE. We return the IRE_INTERFACE. 1346bd670b35SErik Nordmark * Note that !allocate can result in a ire_dep_parent 1347bd670b35SErik Nordmark * which is IRE_IF_* without an IRE_IF_CLONE. 
1348bd670b35SErik Nordmark * We recover from that when we need to send packets 1349bd670b35SErik Nordmark * by ensuring that the generations become 1350bd670b35SErik Nordmark * IRE_GENERATION_VERIFY in this case. 1351bd670b35SErik Nordmark */ 1352bd670b35SErik Nordmark if (!allocate) { 1353bd670b35SErik Nordmark invalidate = B_TRUE; 1354bd670b35SErik Nordmark ire = NULL; 1355bd670b35SErik Nordmark goto done; 1356bd670b35SErik Nordmark } 1357bd670b35SErik Nordmark 1358bd670b35SErik Nordmark IN6_IPADDR_TO_V4MAPPED(nexthop, &v6nexthop); 1359bd670b35SErik Nordmark 1360bd670b35SErik Nordmark clone = ire_create_if_clone(ire, &v6nexthop, 1361bd670b35SErik Nordmark &generation); 1362bd670b35SErik Nordmark if (clone == NULL) { 1363bd670b35SErik Nordmark /* 1364bd670b35SErik Nordmark * Temporary failure - no memory. 1365bd670b35SErik Nordmark * Don't want caller to cache IRE_NOROUTE. 1366bd670b35SErik Nordmark */ 1367bd670b35SErik Nordmark invalidate = B_TRUE; 1368bd670b35SErik Nordmark ire = ire_blackhole(ipst, B_FALSE); 1369bd670b35SErik Nordmark goto error; 1370bd670b35SErik Nordmark } 1371bd670b35SErik Nordmark /* 1372bd670b35SErik Nordmark * Make clone next to last entry and the 1373bd670b35SErik Nordmark * IRE_INTERFACE the last in the dependency 1374bd670b35SErik Nordmark * chain since the clone depends on the 1375bd670b35SErik Nordmark * IRE_INTERFACE. 
1376bd670b35SErik Nordmark */ 1377bd670b35SErik Nordmark ASSERT(i >= 1); 1378bd670b35SErik Nordmark ASSERT(i < MAX_IRE_RECURSION); 1379bd670b35SErik Nordmark 1380bd670b35SErik Nordmark ires[i] = ires[i-1]; 1381bd670b35SErik Nordmark generations[i] = generations[i-1]; 1382bd670b35SErik Nordmark ires[i-1] = clone; 1383bd670b35SErik Nordmark generations[i-1] = generation; 1384bd670b35SErik Nordmark i++; 1385bd670b35SErik Nordmark 1386bd670b35SErik Nordmark ire = NULL; 1387bd670b35SErik Nordmark goto done; 1388bd670b35SErik Nordmark } 1389bd670b35SErik Nordmark 1390bd670b35SErik Nordmark /* 1391bd670b35SErik Nordmark * We only match on the type and optionally ILL when 1392bd670b35SErik Nordmark * recursing. The type match is used by some callers 1393bd670b35SErik Nordmark * to exclude certain types (such as IRE_IF_CLONE or 1394bd670b35SErik Nordmark * IRE_LOCAL|IRE_LOOPBACK). 1395bd670b35SErik Nordmark */ 1396bd670b35SErik Nordmark match_args &= MATCH_IRE_TYPE; 1397bd670b35SErik Nordmark nexthop = ire->ire_gateway_addr; 1398bd670b35SErik Nordmark if (ill == NULL && ire->ire_ill != NULL) { 1399bd670b35SErik Nordmark ill = ire->ire_ill; 1400bd670b35SErik Nordmark need_refrele = B_TRUE; 1401bd670b35SErik Nordmark ill_refhold(ill); 1402bd670b35SErik Nordmark match_args |= MATCH_IRE_ILL; 1403bd670b35SErik Nordmark } 1404*188e1664SErik Nordmark /* 1405*188e1664SErik Nordmark * We set the prefs[i] value above if i > 0. We've already 1406*188e1664SErik Nordmark * done i++ so i is one in the case of the first time around. 
1407*188e1664SErik Nordmark */ 1408*188e1664SErik Nordmark if (i == 1) 1409*188e1664SErik Nordmark prefs[0] = ire_pref(ire); 1410bd670b35SErik Nordmark ire = NULL; 1411bd670b35SErik Nordmark } 1412bd670b35SErik Nordmark ASSERT(ire == NULL); 1413bd670b35SErik Nordmark ire = ire_reject(ipst, B_FALSE); 1414bd670b35SErik Nordmark 1415bd670b35SErik Nordmark error: 1416bd670b35SErik Nordmark ASSERT(ire != NULL); 1417bd670b35SErik Nordmark if (need_refrele) 1418bd670b35SErik Nordmark ill_refrele(ill); 1419bd670b35SErik Nordmark 1420bd670b35SErik Nordmark /* 1421bd670b35SErik Nordmark * In the case of MULTIRT we want to try a different IRE the next 1422bd670b35SErik Nordmark * time. We let the next packet retry in that case. 1423bd670b35SErik Nordmark */ 1424bd670b35SErik Nordmark if (i > 0 && (ires[0]->ire_flags & RTF_MULTIRT)) 1425bd670b35SErik Nordmark (void) ire_no_good(ires[0]); 1426bd670b35SErik Nordmark 1427bd670b35SErik Nordmark cleanup: 1428bd670b35SErik Nordmark /* cleanup ires[i] */ 1429bd670b35SErik Nordmark ire_dep_unbuild(ires, i); 1430bd670b35SErik Nordmark for (j = 0; j < i; j++) 1431bd670b35SErik Nordmark ire_refrele(ires[j]); 1432bd670b35SErik Nordmark 1433bd670b35SErik Nordmark ASSERT(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)); 1434bd670b35SErik Nordmark /* 1435bd670b35SErik Nordmark * Use IRE_GENERATION_VERIFY to ensure that ip_output will redo the 1436bd670b35SErik Nordmark * ip_select_route since the reject or lack of memory might be gone. 
1437bd670b35SErik Nordmark */ 1438bd670b35SErik Nordmark if (generationp != NULL) 1439bd670b35SErik Nordmark *generationp = IRE_GENERATION_VERIFY; 1440bd670b35SErik Nordmark return (ire); 1441bd670b35SErik Nordmark 1442bd670b35SErik Nordmark done: 1443bd670b35SErik Nordmark ASSERT(ire == NULL); 1444bd670b35SErik Nordmark if (need_refrele) { 1445bd670b35SErik Nordmark ill_refrele(ill); 1446bd670b35SErik Nordmark ill = NULL; 1447bd670b35SErik Nordmark } 1448bd670b35SErik Nordmark 1449bd670b35SErik Nordmark /* Build dependencies */ 1450*188e1664SErik Nordmark if (i > 1 && !ire_dep_build(ires, generations, i)) { 1451bd670b35SErik Nordmark /* Something in chain was condemned; tear it apart */ 1452bd670b35SErik Nordmark ire = ire_reject(ipst, B_FALSE); 1453bd670b35SErik Nordmark goto cleanup; 1454bd670b35SErik Nordmark } 1455bd670b35SErik Nordmark 1456bd670b35SErik Nordmark /* 1457bd670b35SErik Nordmark * Release all refholds except the one for ires[0] that we 1458bd670b35SErik Nordmark * will return to the caller. 1459bd670b35SErik Nordmark */ 1460bd670b35SErik Nordmark for (j = 1; j < i; j++) 1461bd670b35SErik Nordmark ire_refrele(ires[j]); 1462bd670b35SErik Nordmark 1463bd670b35SErik Nordmark if (invalidate) { 1464bd670b35SErik Nordmark /* 1465bd670b35SErik Nordmark * Since we needed to allocate but couldn't we need to make 1466bd670b35SErik Nordmark * sure that the dependency chain is rebuilt the next time. 1467bd670b35SErik Nordmark */ 1468bd670b35SErik Nordmark ire_dep_invalidate_generations(ires[0]); 1469bd670b35SErik Nordmark generation = IRE_GENERATION_VERIFY; 1470bd670b35SErik Nordmark } else { 1471bd670b35SErik Nordmark /* 1472bd670b35SErik Nordmark * IREs can have been added or deleted while we did the 1473bd670b35SErik Nordmark * recursive lookup and we can't catch those until we've built 1474bd670b35SErik Nordmark * the dependencies. 
We verify the stored 1475bd670b35SErik Nordmark * ire_dep_parent_generation to catch any such changes and 1476bd670b35SErik Nordmark * return IRE_GENERATION_VERIFY (which will cause 1477bd670b35SErik Nordmark * ip_select_route to be called again so we can redo the 1478bd670b35SErik Nordmark * recursive lookup next time we send a packet. 1479bd670b35SErik Nordmark */ 1480*188e1664SErik Nordmark if (ires[0]->ire_dep_parent == NULL) 1481*188e1664SErik Nordmark generation = ires[0]->ire_generation; 1482*188e1664SErik Nordmark else 1483bd670b35SErik Nordmark generation = ire_dep_validate_generations(ires[0]); 1484bd670b35SErik Nordmark if (generations[0] != ires[0]->ire_generation) { 1485bd670b35SErik Nordmark /* Something changed at the top */ 1486bd670b35SErik Nordmark generation = IRE_GENERATION_VERIFY; 1487bd670b35SErik Nordmark } 1488bd670b35SErik Nordmark } 1489bd670b35SErik Nordmark if (generationp != NULL) 1490bd670b35SErik Nordmark *generationp = generation; 1491bd670b35SErik Nordmark 1492bd670b35SErik Nordmark return (ires[0]); 1493bd670b35SErik Nordmark } 1494bd670b35SErik Nordmark 1495bd670b35SErik Nordmark ire_t * 1496bd670b35SErik Nordmark ire_route_recursive_v4(ipaddr_t nexthop, uint_t ire_type, const ill_t *ill, 1497bd670b35SErik Nordmark zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args, 1498bd670b35SErik Nordmark boolean_t allocate, uint32_t xmit_hint, ip_stack_t *ipst, ipaddr_t *setsrcp, 1499bd670b35SErik Nordmark tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp) 1500bd670b35SErik Nordmark { 1501bd670b35SErik Nordmark return (ire_route_recursive_impl_v4(NULL, nexthop, ire_type, ill, 1502bd670b35SErik Nordmark zoneid, tsl, match_args, allocate, xmit_hint, ipst, setsrcp, 1503bd670b35SErik Nordmark gwattrp, generationp)); 1504bd670b35SErik Nordmark } 1505bd670b35SErik Nordmark 1506bd670b35SErik Nordmark /* 1507bd670b35SErik Nordmark * Recursively look for a route to the destination. 
1508bd670b35SErik Nordmark * We only handle a destination match here, yet we have the same arguments 1509bd670b35SErik Nordmark * as the full match to allow function pointers to select between the two. 1510bd670b35SErik Nordmark * 1511bd670b35SErik Nordmark * Note that this function never returns NULL. It returns an IRE_NOROUTE 1512bd670b35SErik Nordmark * instead. 1513bd670b35SErik Nordmark * 1514bd670b35SErik Nordmark * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it 1515bd670b35SErik Nordmark * is an error. 1516bd670b35SErik Nordmark * Allow at most one RTF_INDIRECT. 1517bd670b35SErik Nordmark */ 1518bd670b35SErik Nordmark ire_t * 1519bd670b35SErik Nordmark ire_route_recursive_dstonly_v4(ipaddr_t nexthop, boolean_t allocate, 1520bd670b35SErik Nordmark uint32_t xmit_hint, ip_stack_t *ipst) 1521bd670b35SErik Nordmark { 1522bd670b35SErik Nordmark ire_t *ire; 1523bd670b35SErik Nordmark ire_t *ire1; 1524bd670b35SErik Nordmark uint_t generation; 1525bd670b35SErik Nordmark 1526bd670b35SErik Nordmark /* ire_ftable_lookup handles round-robin/ECMP */ 1527bd670b35SErik Nordmark ire = ire_ftable_lookup_simple_v4(nexthop, xmit_hint, ipst, 1528bd670b35SErik Nordmark &generation); 1529bd670b35SErik Nordmark ASSERT(ire != NULL); 1530bd670b35SErik Nordmark 1531bd670b35SErik Nordmark /* 1532bd670b35SErik Nordmark * If this type should have an ire_nce_cache (even if it 1533bd670b35SErik Nordmark * doesn't yet have one) then we are done. Includes 1534bd670b35SErik Nordmark * IRE_INTERFACE with a full 32 bit mask. 1535bd670b35SErik Nordmark */ 1536bd670b35SErik Nordmark if (ire->ire_nce_capable) 1537bd670b35SErik Nordmark return (ire); 1538bd670b35SErik Nordmark 1539bd670b35SErik Nordmark /* 1540bd670b35SErik Nordmark * If the IRE has a current cached parent we know that the whole 1541bd670b35SErik Nordmark * parent chain is current, hence we don't need to discover and 1542bd670b35SErik Nordmark * build any dependencies by doing a recursive lookup. 
1543bd670b35SErik Nordmark */ 1544bd670b35SErik Nordmark mutex_enter(&ire->ire_lock); 1545bd670b35SErik Nordmark if (ire->ire_dep_parent != NULL && 1546bd670b35SErik Nordmark ire->ire_dep_parent->ire_generation == 1547bd670b35SErik Nordmark ire->ire_dep_parent_generation) { 1548bd670b35SErik Nordmark mutex_exit(&ire->ire_lock); 1549bd670b35SErik Nordmark return (ire); 1550bd670b35SErik Nordmark } 1551bd670b35SErik Nordmark mutex_exit(&ire->ire_lock); 1552bd670b35SErik Nordmark 1553bd670b35SErik Nordmark /* 1554bd670b35SErik Nordmark * Fallback to loop in the normal code starting with the ire 1555bd670b35SErik Nordmark * we found. Normally this would return the same ire. 1556bd670b35SErik Nordmark */ 1557bd670b35SErik Nordmark ire1 = ire_route_recursive_impl_v4(ire, nexthop, 0, NULL, ALL_ZONES, 1558bd670b35SErik Nordmark NULL, MATCH_IRE_DSTONLY, allocate, xmit_hint, ipst, NULL, NULL, 1559bd670b35SErik Nordmark &generation); 1560bd670b35SErik Nordmark ire_refrele(ire); 1561bd670b35SErik Nordmark return (ire1); 1562bd670b35SErik Nordmark } 1563