1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * This file contains consumer routines of the IPv4 forwarding engine 28 */ 29 30 #include <sys/types.h> 31 #include <sys/stream.h> 32 #include <sys/stropts.h> 33 #include <sys/strlog.h> 34 #include <sys/dlpi.h> 35 #include <sys/ddi.h> 36 #include <sys/cmn_err.h> 37 #include <sys/policy.h> 38 39 #include <sys/systm.h> 40 #include <sys/strsun.h> 41 #include <sys/kmem.h> 42 #include <sys/param.h> 43 #include <sys/socket.h> 44 #include <sys/strsubr.h> 45 #include <net/if.h> 46 #include <net/route.h> 47 #include <netinet/in.h> 48 #include <net/if_dl.h> 49 #include <netinet/ip6.h> 50 #include <netinet/icmp6.h> 51 52 #include <inet/ipsec_impl.h> 53 #include <inet/common.h> 54 #include <inet/mi.h> 55 #include <inet/mib2.h> 56 #include <inet/ip.h> 57 #include <inet/ip_impl.h> 58 #include <inet/ip6.h> 59 #include <inet/ip_ndp.h> 60 #include <inet/arp.h> 61 #include <inet/ip_if.h> 62 #include <inet/ip_ire.h> 63 #include <inet/ip_ftable.h> 64 #include <inet/ip_rts.h> 65 #include <inet/nd.h> 66 67 #include 
<net/pfkeyv2.h> 68 #include <inet/sadb.h> 69 #include <inet/tcp.h> 70 #include <inet/ipclassifier.h> 71 #include <sys/zone.h> 72 #include <net/radix.h> 73 #include <sys/tsol/label.h> 74 #include <sys/tsol/tnet.h> 75 76 #define IS_DEFAULT_ROUTE(ire) \ 77 (((ire)->ire_type & IRE_DEFAULT) || \ 78 (((ire)->ire_type & IRE_INTERFACE) && ((ire)->ire_addr == 0))) 79 80 static ire_t *route_to_dst(const struct sockaddr *, zoneid_t, ip_stack_t *); 81 static void ire_del_host_redir(ire_t *, char *); 82 static boolean_t ire_find_best_route(struct radix_node *, void *); 83 84 /* 85 * Lookup a route in forwarding table. A specific lookup is indicated by 86 * passing the required parameters and indicating the match required in the 87 * flag field. 88 * 89 * Supports IP_BOUND_IF by following the ipif/ill when recursing. 90 */ 91 ire_t * 92 ire_ftable_lookup_v4(ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway, 93 int type, const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl, 94 int flags, uint32_t xmit_hint, ip_stack_t *ipst, uint_t *generationp) 95 { 96 ire_t *ire; 97 struct rt_sockaddr rdst, rmask; 98 struct rt_entry *rt; 99 ire_ftable_args_t margs; 100 101 ASSERT(ill == NULL || !ill->ill_isv6); 102 103 /* 104 * ire_match_args() will dereference ill if MATCH_IRE_ILL 105 * is set. 
106 */ 107 if ((flags & MATCH_IRE_ILL) && (ill == NULL)) 108 return (NULL); 109 110 (void) memset(&rdst, 0, sizeof (rdst)); 111 rdst.rt_sin_len = sizeof (rdst); 112 rdst.rt_sin_family = AF_INET; 113 rdst.rt_sin_addr.s_addr = addr; 114 115 (void) memset(&rmask, 0, sizeof (rmask)); 116 rmask.rt_sin_len = sizeof (rmask); 117 rmask.rt_sin_family = AF_INET; 118 rmask.rt_sin_addr.s_addr = mask; 119 120 (void) memset(&margs, 0, sizeof (margs)); 121 margs.ift_addr = addr; 122 margs.ift_mask = mask; 123 margs.ift_gateway = gateway; 124 margs.ift_type = type; 125 margs.ift_ill = ill; 126 margs.ift_zoneid = zoneid; 127 margs.ift_tsl = tsl; 128 margs.ift_flags = flags; 129 130 /* 131 * The flags argument passed to ire_ftable_lookup may cause the 132 * search to return, not the longest matching prefix, but the 133 * "best matching prefix", i.e., the longest prefix that also 134 * satisfies constraints imposed via the permutation of flags 135 * passed in. To achieve this, we invoke ire_match_args() on 136 * each matching leaf in the radix tree. ire_match_args is 137 * invoked by the callback function ire_find_best_route() 138 * We hold the global tree lock in read mode when calling 139 * rn_match_args. Before dropping the global tree lock, ensure 140 * that the radix node can't be deleted by incrementing ire_refcnt. 141 */ 142 RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); 143 rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst, 144 ipst->ips_ip_ftable, ire_find_best_route, &margs); 145 ire = margs.ift_best_ire; 146 if (rt == NULL) { 147 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 148 return (NULL); 149 } 150 ASSERT(ire != NULL); 151 152 DTRACE_PROBE2(ire__found, ire_ftable_args_t *, &margs, ire_t *, ire); 153 154 /* 155 * round-robin only if we have more than one route in the bucket. 
156 * ips_ip_ecmp_behavior controls when we do ECMP 157 * 2: always 158 * 1: for IRE_DEFAULT and /0 IRE_INTERFACE 159 * 0: never 160 */ 161 if (ire->ire_bucket->irb_ire_cnt > 1 && !(flags & MATCH_IRE_GW)) { 162 if (ipst->ips_ip_ecmp_behavior == 2 || 163 (ipst->ips_ip_ecmp_behavior == 1 && 164 IS_DEFAULT_ROUTE(ire))) { 165 ire_t *next_ire; 166 167 margs.ift_best_ire = NULL; 168 next_ire = ire_round_robin(ire->ire_bucket, &margs, 169 xmit_hint, ire, ipst); 170 if (next_ire == NULL) { 171 /* keep ire if next_ire is null */ 172 goto done; 173 } 174 ire_refrele(ire); 175 ire = next_ire; 176 } 177 } 178 179 done: 180 /* Return generation before dropping lock */ 181 if (generationp != NULL) 182 *generationp = ire->ire_generation; 183 184 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 185 186 /* 187 * For shared-IP zones we need additional checks to what was 188 * done in ire_match_args to make sure IRE_LOCALs are handled. 189 * 190 * When ip_restrict_interzone_loopback is set, then 191 * we ensure that IRE_LOCAL are only used for loopback 192 * between zones when the logical "Ethernet" would 193 * have looped them back. That is, if in the absense of 194 * the IRE_LOCAL we would have sent to packet out the 195 * same ill. 196 */ 197 if ((ire->ire_type & IRE_LOCAL) && zoneid != ALL_ZONES && 198 ire->ire_zoneid != zoneid && ire->ire_zoneid != ALL_ZONES && 199 ipst->ips_ip_restrict_interzone_loopback) { 200 ire = ire_alt_local(ire, zoneid, tsl, ill, generationp); 201 ASSERT(ire != NULL); 202 } 203 return (ire); 204 } 205 206 /* 207 * This function is called by 208 * ip_input/ire_route_recursive when doing a route lookup on only the 209 * destination address. 210 * 211 * The optimizations of this function over ire_ftable_lookup are: 212 * o removing unnecessary flag matching 213 * o doing longest prefix match instead of overloading it further 214 * with the unnecessary "best_prefix_match" 215 * 216 * If no route is found we return IRE_NOROUTE. 
217 */ 218 ire_t * 219 ire_ftable_lookup_simple_v4(ipaddr_t addr, uint32_t xmit_hint, ip_stack_t *ipst, 220 uint_t *generationp) 221 { 222 ire_t *ire; 223 struct rt_sockaddr rdst; 224 struct rt_entry *rt; 225 irb_t *irb; 226 227 rdst.rt_sin_len = sizeof (rdst); 228 rdst.rt_sin_family = AF_INET; 229 rdst.rt_sin_addr.s_addr = addr; 230 231 /* 232 * This is basically inlining a simpler version of ire_match_args 233 */ 234 RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); 235 236 rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst, 237 ipst->ips_ip_ftable, NULL, NULL); 238 239 if (rt == NULL) 240 goto bad; 241 242 irb = &rt->rt_irb; 243 if (irb->irb_ire_cnt == 0) 244 goto bad; 245 246 rw_enter(&irb->irb_lock, RW_READER); 247 ire = irb->irb_ire; 248 if (ire == NULL) { 249 rw_exit(&irb->irb_lock); 250 goto bad; 251 } 252 while (IRE_IS_CONDEMNED(ire)) { 253 ire = ire->ire_next; 254 if (ire == NULL) { 255 rw_exit(&irb->irb_lock); 256 goto bad; 257 } 258 } 259 260 /* we have a ire that matches */ 261 ire_refhold(ire); 262 rw_exit(&irb->irb_lock); 263 264 /* 265 * round-robin only if we have more than one route in the bucket. 266 * ips_ip_ecmp_behavior controls when we do ECMP 267 * 2: always 268 * 1: for IRE_DEFAULT and /0 IRE_INTERFACE 269 * 0: never 270 * 271 * Note: if we found an IRE_IF_CLONE we won't look at the bucket with 272 * other ECMP IRE_INTERFACEs since the IRE_IF_CLONE is a /128 match 273 * and the IRE_INTERFACESs are likely to be shorter matches. 
274 */ 275 if (ire->ire_bucket->irb_ire_cnt > 1) { 276 if (ipst->ips_ip_ecmp_behavior == 2 || 277 (ipst->ips_ip_ecmp_behavior == 1 && 278 IS_DEFAULT_ROUTE(ire))) { 279 ire_t *next_ire; 280 ire_ftable_args_t margs; 281 282 (void) memset(&margs, 0, sizeof (margs)); 283 margs.ift_addr = addr; 284 margs.ift_zoneid = ALL_ZONES; 285 286 next_ire = ire_round_robin(ire->ire_bucket, &margs, 287 xmit_hint, ire, ipst); 288 if (next_ire == NULL) { 289 /* keep ire if next_ire is null */ 290 if (generationp != NULL) 291 *generationp = ire->ire_generation; 292 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 293 return (ire); 294 } 295 ire_refrele(ire); 296 ire = next_ire; 297 } 298 } 299 /* Return generation before dropping lock */ 300 if (generationp != NULL) 301 *generationp = ire->ire_generation; 302 303 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 304 305 /* 306 * Since we only did ALL_ZONES matches there is no special handling 307 * of IRE_LOCALs needed here. ire_ftable_lookup_v4 has to handle that. 308 */ 309 return (ire); 310 311 bad: 312 if (generationp != NULL) 313 *generationp = IRE_GENERATION_VERIFY; 314 315 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 316 return (ire_reject(ipst, B_FALSE)); 317 } 318 319 /* 320 * Find the ill matching a multicast group. 321 * Allows different routes for multicast addresses 322 * in the unicast routing table (akin to 224.0.0.0 but could be more specific) 323 * which point at different interfaces. This is used when IP_MULTICAST_IF 324 * isn't specified (when sending) and when IP_ADD_MEMBERSHIP doesn't 325 * specify the interface to join on. 326 * 327 * Supports link-local addresses by using ire_route_recursive which follows 328 * the ill when recursing. 
329 * 330 * To handle CGTP, since we don't have a separate IRE_MULTICAST for each group 331 * and the MULTIRT property can be different for different groups, we 332 * extract RTF_MULTIRT from the special unicast route added for a group 333 * with CGTP and pass that back in the multirtp argument. 334 * This is used in ip_set_destination etc to set ixa_postfragfn for multicast. 335 * We have a setsrcp argument for the same reason. 336 */ 337 ill_t * 338 ire_lookup_multi_ill_v4(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst, 339 boolean_t *multirtp, ipaddr_t *setsrcp) 340 { 341 ire_t *ire; 342 ill_t *ill; 343 344 ire = ire_route_recursive_v4(group, 0, NULL, zoneid, NULL, 345 MATCH_IRE_DSTONLY, B_FALSE, 0, ipst, setsrcp, NULL, NULL); 346 ASSERT(ire != NULL); 347 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 348 ire_refrele(ire); 349 return (NULL); 350 } 351 352 if (multirtp != NULL) 353 *multirtp = (ire->ire_flags & RTF_MULTIRT) != 0; 354 355 ill = ire_nexthop_ill(ire); 356 ire_refrele(ire); 357 return (ill); 358 } 359 360 /* 361 * Delete the passed in ire if the gateway addr matches 362 */ 363 void 364 ire_del_host_redir(ire_t *ire, char *gateway) 365 { 366 if ((ire->ire_flags & RTF_DYNAMIC) && 367 (ire->ire_gateway_addr == *(ipaddr_t *)gateway)) 368 ire_delete(ire); 369 } 370 371 /* 372 * Search for all IRE_HOST RTF_DYNAMIC (aka redirect) routes that are 373 * pointing at the specified gateway and 374 * delete them. This routine is called only 375 * when a default gateway is going away. 376 */ 377 void 378 ire_delete_host_redirects(ipaddr_t gateway, ip_stack_t *ipst) 379 { 380 struct rtfuncarg rtfarg; 381 382 (void) memset(&rtfarg, 0, sizeof (rtfarg)); 383 rtfarg.rt_func = ire_del_host_redir; 384 rtfarg.rt_arg = (void *)&gateway; 385 (void) ipst->ips_ip_ftable->rnh_walktree_mt(ipst->ips_ip_ftable, 386 rtfunc, &rtfarg, irb_refhold_rn, irb_refrele_rn); 387 } 388 389 /* 390 * Obtain the rt_entry and rt_irb for the route to be added to 391 * the ips_ip_ftable. 
392 * First attempt to add a node to the radix tree via rn_addroute. If the 393 * route already exists, return the bucket for the existing route. 394 * 395 * Locking notes: Need to hold the global radix tree lock in write mode to 396 * add a radix node. To prevent the node from being deleted, ire_get_bucket() 397 * returns with a ref'ed irb_t. The ire itself is added in ire_add_v4() 398 * while holding the irb_lock, but not the radix tree lock. 399 */ 400 irb_t * 401 ire_get_bucket(ire_t *ire) 402 { 403 struct radix_node *rn; 404 struct rt_entry *rt; 405 struct rt_sockaddr rmask, rdst; 406 irb_t *irb = NULL; 407 ip_stack_t *ipst = ire->ire_ipst; 408 409 ASSERT(ipst->ips_ip_ftable != NULL); 410 411 /* first try to see if route exists (based on rtalloc1) */ 412 (void) memset(&rdst, 0, sizeof (rdst)); 413 rdst.rt_sin_len = sizeof (rdst); 414 rdst.rt_sin_family = AF_INET; 415 rdst.rt_sin_addr.s_addr = ire->ire_addr; 416 417 (void) memset(&rmask, 0, sizeof (rmask)); 418 rmask.rt_sin_len = sizeof (rmask); 419 rmask.rt_sin_family = AF_INET; 420 rmask.rt_sin_addr.s_addr = ire->ire_mask; 421 422 /* 423 * add the route. 
based on BSD's rtrequest1(RTM_ADD) 424 */ 425 R_Malloc(rt, rt_entry_cache, sizeof (*rt)); 426 /* kmem_alloc failed */ 427 if (rt == NULL) 428 return (NULL); 429 430 (void) memset(rt, 0, sizeof (*rt)); 431 rt->rt_nodes->rn_key = (char *)&rt->rt_dst; 432 rt->rt_dst = rdst; 433 irb = &rt->rt_irb; 434 irb->irb_marks |= IRB_MARK_DYNAMIC; /* dynamically allocated/freed */ 435 irb->irb_ipst = ipst; 436 rw_init(&irb->irb_lock, NULL, RW_DEFAULT, NULL); 437 RADIX_NODE_HEAD_WLOCK(ipst->ips_ip_ftable); 438 rn = ipst->ips_ip_ftable->rnh_addaddr(&rt->rt_dst, &rmask, 439 ipst->ips_ip_ftable, (struct radix_node *)rt); 440 if (rn == NULL) { 441 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 442 Free(rt, rt_entry_cache); 443 rt = NULL; 444 irb = NULL; 445 RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); 446 rn = ipst->ips_ip_ftable->rnh_lookup(&rdst, &rmask, 447 ipst->ips_ip_ftable); 448 if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { 449 /* found a non-root match */ 450 rt = (struct rt_entry *)rn; 451 } 452 } 453 if (rt != NULL) { 454 irb = &rt->rt_irb; 455 irb_refhold(irb); 456 } 457 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 458 return (irb); 459 } 460 461 /* 462 * This function is used when the caller wants to know the outbound 463 * interface for a packet given only the address. 464 * If this is a offlink IP address and there are multiple 465 * routes to this destination, this routine will utilise the 466 * first route it finds to IP address 467 * Return values: 468 * 0 - FAILURE 469 * nonzero - ifindex 470 */ 471 uint_t 472 ifindex_lookup(const struct sockaddr *ipaddr, zoneid_t zoneid) 473 { 474 uint_t ifindex = 0; 475 ire_t *ire; 476 ill_t *ill; 477 netstack_t *ns; 478 ip_stack_t *ipst; 479 480 if (zoneid == ALL_ZONES) 481 ns = netstack_find_by_zoneid(GLOBAL_ZONEID); 482 else 483 ns = netstack_find_by_zoneid(zoneid); 484 ASSERT(ns != NULL); 485 486 /* 487 * For exclusive stacks we set the zoneid to zero 488 * since IP uses the global zoneid in the exclusive stacks. 
489 */ 490 if (ns->netstack_stackid != GLOBAL_NETSTACKID) 491 zoneid = GLOBAL_ZONEID; 492 ipst = ns->netstack_ip; 493 494 ASSERT(ipaddr->sa_family == AF_INET || ipaddr->sa_family == AF_INET6); 495 496 if ((ire = route_to_dst(ipaddr, zoneid, ipst)) != NULL) { 497 ill = ire_nexthop_ill(ire); 498 if (ill != NULL) { 499 ifindex = ill->ill_phyint->phyint_ifindex; 500 ill_refrele(ill); 501 } 502 ire_refrele(ire); 503 } 504 netstack_rele(ns); 505 return (ifindex); 506 } 507 508 /* 509 * Routine to find the route to a destination. If a ifindex is supplied 510 * it tries to match the route to the corresponding ipif for the ifindex 511 */ 512 static ire_t * 513 route_to_dst(const struct sockaddr *dst_addr, zoneid_t zoneid, ip_stack_t *ipst) 514 { 515 ire_t *ire = NULL; 516 int match_flags; 517 518 match_flags = MATCH_IRE_DSTONLY; 519 520 /* XXX pass NULL tsl for now */ 521 522 if (dst_addr->sa_family == AF_INET) { 523 ire = ire_route_recursive_v4( 524 ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr, 0, NULL, 525 zoneid, NULL, match_flags, B_TRUE, 0, ipst, NULL, NULL, 526 NULL); 527 } else { 528 ire = ire_route_recursive_v6( 529 &((struct sockaddr_in6 *)dst_addr)->sin6_addr, 0, NULL, 530 zoneid, NULL, match_flags, B_TRUE, 0, ipst, NULL, NULL, 531 NULL); 532 } 533 ASSERT(ire != NULL); 534 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 535 ire_refrele(ire); 536 return (NULL); 537 } 538 return (ire); 539 } 540 541 /* 542 * This routine is called by IP Filter to send a packet out on the wire 543 * to a specified dstination (which may be onlink or offlink). The ifindex may 544 * or may not be 0. A non-null ifindex indicates IP Filter has stipulated 545 * an outgoing interface and requires the nexthop to be on that interface. 546 * IP WILL NOT DO the following to the data packet before sending it out: 547 * a. manipulate ttl 548 * b. ipsec work 549 * c. 
fragmentation 550 * 551 * If the packet has been prepared for hardware checksum then it will be 552 * passed off to ip_send_align_cksum() to check that the flags set on the 553 * packet are in alignment with the capabilities of the new outgoing NIC. 554 * 555 * Return values: 556 * 0: IP was able to send of the data pkt 557 * ECOMM: Could not send packet 558 * ENONET No route to dst. It is up to the caller 559 * to send icmp unreachable error message, 560 * EINPROGRESS The macaddr of the onlink dst or that 561 * of the offlink dst's nexthop needs to get 562 * resolved before packet can be sent to dst. 563 * Thus transmission is not guaranteed. 564 * Note: No longer have visibility to the ARP queue 565 * hence no EINPROGRESS. 566 */ 567 int 568 ipfil_sendpkt(const struct sockaddr *dst_addr, mblk_t *mp, uint_t ifindex, 569 zoneid_t zoneid) 570 { 571 ipaddr_t nexthop; 572 netstack_t *ns; 573 ip_stack_t *ipst; 574 ip_xmit_attr_t ixas; 575 int error; 576 577 ASSERT(mp != NULL); 578 579 if (zoneid == ALL_ZONES) 580 ns = netstack_find_by_zoneid(GLOBAL_ZONEID); 581 else 582 ns = netstack_find_by_zoneid(zoneid); 583 ASSERT(ns != NULL); 584 585 /* 586 * For exclusive stacks we set the zoneid to zero 587 * since IP uses the global zoneid in the exclusive stacks. 588 */ 589 if (ns->netstack_stackid != GLOBAL_NETSTACKID) 590 zoneid = GLOBAL_ZONEID; 591 ipst = ns->netstack_ip; 592 593 ASSERT(dst_addr->sa_family == AF_INET || 594 dst_addr->sa_family == AF_INET6); 595 596 bzero(&ixas, sizeof (ixas)); 597 /* 598 * No IPsec, no fragmentation, and don't let any hooks see 599 * the packet. 
600 */ 601 ixas.ixa_flags = IXAF_NO_IPSEC | IXAF_DONTFRAG | IXAF_NO_PFHOOK; 602 ixas.ixa_cred = kcred; 603 ixas.ixa_cpid = NOPID; 604 ixas.ixa_tsl = NULL; 605 ixas.ixa_ipst = ipst; 606 ixas.ixa_ifindex = ifindex; 607 608 if (dst_addr->sa_family == AF_INET) { 609 ipha_t *ipha = (ipha_t *)mp->b_rptr; 610 611 ixas.ixa_flags |= IXAF_IS_IPV4; 612 nexthop = ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr; 613 if (nexthop != ipha->ipha_dst) { 614 ixas.ixa_flags |= IXAF_NEXTHOP_SET; 615 ixas.ixa_nexthop_v4 = nexthop; 616 } 617 ixas.ixa_multicast_ttl = ipha->ipha_ttl; 618 } else { 619 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 620 in6_addr_t *nexthop6; 621 622 nexthop6 = &((struct sockaddr_in6 *)dst_addr)->sin6_addr; 623 if (!IN6_ARE_ADDR_EQUAL(nexthop6, &ip6h->ip6_dst)) { 624 ixas.ixa_flags |= IXAF_NEXTHOP_SET; 625 ixas.ixa_nexthop_v6 = *nexthop6; 626 } 627 ixas.ixa_multicast_ttl = ip6h->ip6_hops; 628 } 629 error = ip_output_simple(mp, &ixas); 630 ixa_cleanup(&ixas); 631 632 netstack_rele(ns); 633 switch (error) { 634 case 0: 635 break; 636 637 case EHOSTUNREACH: 638 case ENETUNREACH: 639 error = ENONET; 640 break; 641 642 default: 643 error = ECOMM; 644 break; 645 } 646 return (error); 647 } 648 649 /* 650 * callback function provided by ire_ftable_lookup when calling 651 * rn_match_args(). Invoke ire_match_args on each matching leaf node in 652 * the radix tree. 
653 */ 654 boolean_t 655 ire_find_best_route(struct radix_node *rn, void *arg) 656 { 657 struct rt_entry *rt = (struct rt_entry *)rn; 658 irb_t *irb_ptr; 659 ire_t *ire; 660 ire_ftable_args_t *margs = arg; 661 ipaddr_t match_mask; 662 663 ASSERT(rt != NULL); 664 665 irb_ptr = &rt->rt_irb; 666 667 if (irb_ptr->irb_ire_cnt == 0) 668 return (B_FALSE); 669 670 rw_enter(&irb_ptr->irb_lock, RW_READER); 671 for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) { 672 if (IRE_IS_CONDEMNED(ire)) 673 continue; 674 if (margs->ift_flags & (MATCH_IRE_MASK|MATCH_IRE_SHORTERMASK)) 675 match_mask = margs->ift_mask; 676 else 677 match_mask = ire->ire_mask; 678 679 if (ire_match_args(ire, margs->ift_addr, match_mask, 680 margs->ift_gateway, margs->ift_type, margs->ift_ill, 681 margs->ift_zoneid, margs->ift_tsl, 682 margs->ift_flags)) { 683 ire_refhold(ire); 684 rw_exit(&irb_ptr->irb_lock); 685 margs->ift_best_ire = ire; 686 return (B_TRUE); 687 } 688 } 689 rw_exit(&irb_ptr->irb_lock); 690 return (B_FALSE); 691 } 692 693 /* 694 * ftable irb_t structures are dynamically allocated, and we need to 695 * check if the irb_t (and associated ftable tree attachment) needs to 696 * be cleaned up when the irb_refcnt goes to 0. The conditions that need 697 * be verified are: 698 * - no other walkers of the irebucket, i.e., quiescent irb_refcnt, 699 * - no other threads holding references to ire's in the bucket, 700 * i.e., irb_nire == 0 701 * - no active ire's in the bucket, i.e., irb_ire_cnt == 0 702 * - need to hold the global tree lock and irb_lock in write mode. 703 */ 704 void 705 irb_refrele_ftable(irb_t *irb) 706 { 707 for (;;) { 708 rw_enter(&irb->irb_lock, RW_WRITER); 709 ASSERT(irb->irb_refcnt != 0); 710 if (irb->irb_refcnt != 1) { 711 /* 712 * Someone has a reference to this radix node 713 * or there is some bucket walker. 
714 */ 715 irb->irb_refcnt--; 716 rw_exit(&irb->irb_lock); 717 return; 718 } else { 719 /* 720 * There is no other walker, nor is there any 721 * other thread that holds a direct ref to this 722 * radix node. Do the clean up if needed. Call 723 * to ire_unlink will clear the IRB_MARK_CONDEMNED flag 724 */ 725 if (irb->irb_marks & IRB_MARK_CONDEMNED) { 726 ire_t *ire_list; 727 728 ire_list = ire_unlink(irb); 729 rw_exit(&irb->irb_lock); 730 731 if (ire_list != NULL) 732 ire_cleanup(ire_list); 733 /* 734 * more CONDEMNED entries could have 735 * been added while we dropped the lock, 736 * so we have to re-check. 737 */ 738 continue; 739 } 740 741 /* 742 * Now check if there are still any ires 743 * associated with this radix node. 744 */ 745 if (irb->irb_nire != 0) { 746 /* 747 * someone is still holding on 748 * to ires in this bucket 749 */ 750 irb->irb_refcnt--; 751 rw_exit(&irb->irb_lock); 752 return; 753 } else { 754 /* 755 * Everything is clear. Zero walkers, 756 * Zero threads with a ref to this 757 * radix node, Zero ires associated with 758 * this radix node. Due to lock order, 759 * check the above conditions again 760 * after grabbing all locks in the right order 761 */ 762 rw_exit(&irb->irb_lock); 763 if (irb_inactive(irb)) 764 return; 765 /* 766 * irb_inactive could not free the irb. 767 * See if there are any walkers, if not 768 * try to clean up again. 769 */ 770 } 771 } 772 } 773 } 774 775 /* 776 * IRE iterator used by ire_ftable_lookup to process multiple equal 777 * routes. Given a starting point in the hash list (hash), walk the IREs 778 * in the bucket skipping deleted entries. We treat the bucket as a circular 779 * list for the purposes of walking it. 780 * Returns the IRE (held) that corresponds to the hash value. If that IRE is 781 * not applicable (ire_match_args failed) then it returns a subsequent one. 782 * If we fail to find an IRE we return NULL. 
783 * 784 * Assumes that the caller holds a reference on the IRE bucket and a read lock 785 * on the radix_node_head (for IPv4) or the ip6_ire_head (for IPv6). 786 * 787 * Applies to IPv4 and IPv6. 788 * 789 * For CGTP, where an IRE_BROADCAST and IRE_HOST can exist for the same 790 * address and bucket, we compare against ire_type for the orig_ire. We also 791 * have IRE_BROADCASTs with and without RTF_MULTIRT, with the former being 792 * first in the bucket. Thus we compare that ire_flags match the orig_ire. 793 * 794 * Due to shared-IP zones we check that an IRE_OFFLINK has a gateway that is 795 * reachable from the zone i.e., that the ire_gateway_addr is in a subnet 796 * in which the zone has an IP address. We check this for the global zone 797 * even if no shared-IP zones are configured. 798 */ 799 ire_t * 800 ire_round_robin(irb_t *irb_ptr, ire_ftable_args_t *margs, uint_t hash, 801 ire_t *orig_ire, ip_stack_t *ipst) 802 { 803 ire_t *ire, *maybe_ire = NULL; 804 uint_t maybe_badcnt; 805 uint_t maxwalk; 806 807 /* Fold in more bits from the hint/hash */ 808 hash = hash ^ (hash >> 8) ^ (hash >> 16); 809 810 rw_enter(&irb_ptr->irb_lock, RW_WRITER); 811 maxwalk = irb_ptr->irb_ire_cnt; /* Excludes condemned */ 812 hash %= maxwalk; 813 irb_refhold_locked(irb_ptr); 814 rw_exit(&irb_ptr->irb_lock); 815 816 /* 817 * Round-robin the routers list looking for a route that 818 * matches the passed in parameters. 819 * First we skip "hash" number of non-condemned IREs. 820 * Then we match the IRE. 821 * If we find an ire which has a non-zero ire_badcnt then we remember 822 * it and keep on looking for a lower ire_badcnt. 823 * If we come to the end of the list we continue (treat the 824 * bucket list as a circular list) but we match less than "max" 825 * entries. 
826 */ 827 ire = irb_ptr->irb_ire; 828 while (maxwalk > 0) { 829 if (IRE_IS_CONDEMNED(ire)) 830 goto next_ire_skip; 831 832 /* Skip the first "hash" entries to do ECMP */ 833 if (hash != 0) { 834 hash--; 835 goto next_ire_skip; 836 } 837 838 /* See CGTP comment above */ 839 if (ire->ire_type != orig_ire->ire_type || 840 ire->ire_flags != orig_ire->ire_flags) 841 goto next_ire; 842 843 /* 844 * Note: Since IPv6 has hash buckets instead of radix 845 * buckers we need to explicitly compare the addresses. 846 * That makes this less efficient since we will be called 847 * even if there is no alternatives just because the 848 * bucket has multiple IREs for different addresses. 849 */ 850 if (ire->ire_ipversion == IPV6_VERSION) { 851 if (!IN6_ARE_ADDR_EQUAL(&orig_ire->ire_addr_v6, 852 &ire->ire_addr_v6)) 853 goto next_ire; 854 } 855 856 /* 857 * For some reason find_best_route uses ire_mask. We do 858 * the same. 859 */ 860 if (ire->ire_ipversion == IPV4_VERSION ? 861 !ire_match_args(ire, margs->ift_addr, 862 ire->ire_mask, margs->ift_gateway, 863 margs->ift_type, margs->ift_ill, margs->ift_zoneid, 864 margs->ift_tsl, margs->ift_flags) : 865 !ire_match_args_v6(ire, &margs->ift_addr_v6, 866 &ire->ire_mask_v6, &margs->ift_gateway_v6, 867 margs->ift_type, margs->ift_ill, margs->ift_zoneid, 868 margs->ift_tsl, margs->ift_flags)) 869 goto next_ire; 870 871 if (margs->ift_zoneid != ALL_ZONES && 872 (ire->ire_type & IRE_OFFLINK)) { 873 /* 874 * When we're in a zone, we're only 875 * interested in routers that are 876 * reachable through ipifs within our zone. 
877 */ 878 if (ire->ire_ipversion == IPV4_VERSION) { 879 if (!ire_gateway_ok_zone_v4( 880 ire->ire_gateway_addr, margs->ift_zoneid, 881 ire->ire_ill, margs->ift_tsl, ipst, 882 B_TRUE)) 883 goto next_ire; 884 } else { 885 if (!ire_gateway_ok_zone_v6( 886 &ire->ire_gateway_addr_v6, 887 margs->ift_zoneid, ire->ire_ill, 888 margs->ift_tsl, ipst, B_TRUE)) 889 goto next_ire; 890 } 891 } 892 mutex_enter(&ire->ire_lock); 893 /* Look for stale ire_badcnt and clear */ 894 if (ire->ire_badcnt != 0 && 895 (TICK_TO_SEC(ddi_get_lbolt64()) - ire->ire_last_badcnt > 896 ipst->ips_ip_ire_badcnt_lifetime)) 897 ire->ire_badcnt = 0; 898 mutex_exit(&ire->ire_lock); 899 900 if (ire->ire_badcnt == 0) { 901 /* We found one with a zero badcnt; done */ 902 ire_refhold(ire); 903 /* 904 * Care needed since irb_refrele grabs WLOCK to free 905 * the irb_t. 906 */ 907 if (ire->ire_ipversion == IPV4_VERSION) { 908 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 909 irb_refrele(irb_ptr); 910 RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); 911 } else { 912 rw_exit(&ipst->ips_ip6_ire_head_lock); 913 irb_refrele(irb_ptr); 914 rw_enter(&ipst->ips_ip6_ire_head_lock, 915 RW_READER); 916 } 917 return (ire); 918 } 919 /* 920 * keep looking to see if there is a better (lower 921 * badcnt) matching IRE, but save this one as a last resort. 922 * If we find a lower badcnt pick that one as the last* resort. 923 */ 924 if (maybe_ire == NULL) { 925 maybe_ire = ire; 926 maybe_badcnt = ire->ire_badcnt; 927 } else if (ire->ire_badcnt < maybe_badcnt) { 928 maybe_ire = ire; 929 maybe_badcnt = ire->ire_badcnt; 930 } 931 932 next_ire: 933 maxwalk--; 934 next_ire_skip: 935 ire = ire->ire_next; 936 if (ire == NULL) 937 ire = irb_ptr->irb_ire; 938 } 939 if (maybe_ire != NULL) 940 ire_refhold(maybe_ire); 941 942 /* Care needed since irb_refrele grabs WLOCK to free the irb_t. 
*/ 943 if (ire->ire_ipversion == IPV4_VERSION) { 944 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 945 irb_refrele(irb_ptr); 946 RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); 947 } else { 948 rw_exit(&ipst->ips_ip6_ire_head_lock); 949 irb_refrele(irb_ptr); 950 rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER); 951 } 952 return (maybe_ire); 953 } 954 955 void 956 irb_refhold_rn(struct radix_node *rn) 957 { 958 if ((rn->rn_flags & RNF_ROOT) == 0) 959 irb_refhold(&((rt_t *)(rn))->rt_irb); 960 } 961 962 void 963 irb_refrele_rn(struct radix_node *rn) 964 { 965 if ((rn->rn_flags & RNF_ROOT) == 0) 966 irb_refrele_ftable(&((rt_t *)(rn))->rt_irb); 967 } 968 969 /* 970 * Select a route for IPv4 and IPv6. Except for multicast, loopback and reject 971 * routes this routine sets up a ire_nce_cache as well. The caller needs to 972 * lookup an nce for the multicast case. 973 */ 974 ire_t * 975 ip_select_route(const in6_addr_t *v6dst, ip_xmit_attr_t *ixa, 976 uint_t *generationp, in6_addr_t *setsrcp, int *errorp, boolean_t *multirtp) 977 { 978 uint_t match_args; 979 uint_t ire_type; 980 ill_t *ill; 981 ire_t *ire; 982 ip_stack_t *ipst = ixa->ixa_ipst; 983 ipaddr_t v4dst; 984 in6_addr_t v6nexthop; 985 iaflags_t ixaflags = ixa->ixa_flags; 986 nce_t *nce; 987 988 match_args = MATCH_IRE_SECATTR; 989 IN6_V4MAPPED_TO_IPADDR(v6dst, v4dst); 990 if (setsrcp != NULL) 991 ASSERT(IN6_IS_ADDR_UNSPECIFIED(setsrcp)); 992 if (errorp != NULL) 993 ASSERT(*errorp == 0); 994 995 /* 996 * The content of the ixa will be different if IP_NEXTHOP, 997 * SO_DONTROUTE, IP_BOUND_IF, IP_PKTINFO etc are set 998 */ 999 1000 if ((ixaflags & IXAF_IS_IPV4) ? 
CLASSD(v4dst) : 1001 IN6_IS_ADDR_MULTICAST(v6dst)) { 1002 /* Pick up the IRE_MULTICAST for the ill */ 1003 if (ixa->ixa_multicast_ifindex != 0) { 1004 ill = ill_lookup_on_ifindex(ixa->ixa_multicast_ifindex, 1005 !(ixaflags & IXAF_IS_IPV4), ipst); 1006 } else if (ixaflags & IXAF_SCOPEID_SET) { 1007 /* sin6_scope_id takes precedence over ixa_ifindex */ 1008 ASSERT(ixa->ixa_scopeid != 0); 1009 ill = ill_lookup_on_ifindex(ixa->ixa_scopeid, 1010 !(ixaflags & IXAF_IS_IPV4), ipst); 1011 } else if (ixa->ixa_ifindex != 0) { 1012 /* 1013 * In the ipmp case, the ixa_ifindex is set to 1014 * point at an under_ill and we would return the 1015 * ire_multicast() corresponding to that under_ill. 1016 */ 1017 ill = ill_lookup_on_ifindex(ixa->ixa_ifindex, 1018 !(ixaflags & IXAF_IS_IPV4), ipst); 1019 } else if (ixaflags & IXAF_IS_IPV4) { 1020 ipaddr_t v4setsrc = INADDR_ANY; 1021 1022 ill = ill_lookup_group_v4(v4dst, ixa->ixa_zoneid, ipst, 1023 multirtp, &v4setsrc); 1024 if (setsrcp != NULL) 1025 IN6_IPADDR_TO_V4MAPPED(v4setsrc, setsrcp); 1026 } else { 1027 ill = ill_lookup_group_v6(v6dst, ixa->ixa_zoneid, ipst, 1028 multirtp, setsrcp); 1029 } 1030 if (ill != NULL && IS_VNI(ill)) { 1031 ill_refrele(ill); 1032 ill = NULL; 1033 } 1034 if (ill == NULL) { 1035 if (errorp != NULL) 1036 *errorp = ENXIO; 1037 /* Get a hold on the IRE_NOROUTE */ 1038 ire = ire_reject(ipst, !(ixaflags & IXAF_IS_IPV4)); 1039 return (ire); 1040 } 1041 if (!(ill->ill_flags & ILLF_MULTICAST)) { 1042 ill_refrele(ill); 1043 if (errorp != NULL) 1044 *errorp = EHOSTUNREACH; 1045 /* Get a hold on the IRE_NOROUTE */ 1046 ire = ire_reject(ipst, !(ixaflags & IXAF_IS_IPV4)); 1047 return (ire); 1048 } 1049 /* Get a refcnt on the single IRE_MULTICAST per ill */ 1050 ire = ire_multicast(ill); 1051 ill_refrele(ill); 1052 if (generationp != NULL) 1053 *generationp = ire->ire_generation; 1054 if (errorp != NULL && 1055 (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) { 1056 *errorp = EHOSTUNREACH; 1057 } 1058 return (ire); 1059 } 
1060 1061 if (ixa->ixa_ifindex != 0 || (ixaflags & IXAF_SCOPEID_SET)) { 1062 if (ixaflags & IXAF_SCOPEID_SET) { 1063 /* sin6_scope_id takes precedence over ixa_ifindex */ 1064 ASSERT(ixa->ixa_scopeid != 0); 1065 ill = ill_lookup_on_ifindex(ixa->ixa_scopeid, 1066 !(ixaflags & IXAF_IS_IPV4), ipst); 1067 } else { 1068 ASSERT(ixa->ixa_ifindex != 0); 1069 ill = ill_lookup_on_ifindex(ixa->ixa_ifindex, 1070 !(ixaflags & IXAF_IS_IPV4), ipst); 1071 } 1072 if (ill != NULL && IS_VNI(ill)) { 1073 ill_refrele(ill); 1074 ill = NULL; 1075 } 1076 if (ill == NULL) { 1077 if (errorp != NULL) 1078 *errorp = ENXIO; 1079 /* Get a hold on the IRE_NOROUTE */ 1080 ire = ire_reject(ipst, !(ixaflags & IXAF_IS_IPV4)); 1081 return (ire); 1082 } 1083 /* 1084 * icmp_send_reply_v6 uses scopeid, and mpathd sets IP*_BOUND_IF 1085 * so for both of them we need to be able look for an under 1086 * interface. 1087 */ 1088 if (IS_UNDER_IPMP(ill)) 1089 match_args |= MATCH_IRE_TESTHIDDEN; 1090 } else { 1091 ill = NULL; 1092 } 1093 1094 if (ixaflags & IXAF_NEXTHOP_SET) { 1095 /* IP_NEXTHOP was set */ 1096 v6nexthop = ixa->ixa_nexthop_v6; 1097 } else { 1098 v6nexthop = *v6dst; 1099 } 1100 1101 ire_type = 0; 1102 /* If ill is null then ire_route_recursive will set MATCH_IRE_ILL */ 1103 1104 /* 1105 * If SO_DONTROUTE is set or if IP_NEXTHOP is set, then 1106 * we only look for an onlink IRE. 
1107 */ 1108 if (ixaflags & (IXAF_DONTROUTE|IXAF_NEXTHOP_SET)) { 1109 match_args |= MATCH_IRE_TYPE; 1110 ire_type = IRE_ONLINK; 1111 } 1112 1113 if (ixaflags & IXAF_IS_IPV4) { 1114 ipaddr_t v4nexthop; 1115 ipaddr_t v4setsrc = INADDR_ANY; 1116 1117 IN6_V4MAPPED_TO_IPADDR(&v6nexthop, v4nexthop); 1118 ire = ire_route_recursive_v4(v4nexthop, ire_type, ill, 1119 ixa->ixa_zoneid, ixa->ixa_tsl, match_args, B_TRUE, 1120 ixa->ixa_xmit_hint, ipst, &v4setsrc, NULL, generationp); 1121 if (setsrcp != NULL) 1122 IN6_IPADDR_TO_V4MAPPED(v4setsrc, setsrcp); 1123 } else { 1124 ire = ire_route_recursive_v6(&v6nexthop, ire_type, ill, 1125 ixa->ixa_zoneid, ixa->ixa_tsl, match_args, B_TRUE, 1126 ixa->ixa_xmit_hint, ipst, setsrcp, NULL, generationp); 1127 } 1128 1129 #ifdef DEBUG 1130 if (match_args & MATCH_IRE_TESTHIDDEN) { 1131 ip3dbg(("looking for hidden; dst %x ire %p\n", 1132 v4dst, (void *)ire)); 1133 } 1134 #endif 1135 1136 if (ill != NULL) 1137 ill_refrele(ill); 1138 1139 if ((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) || 1140 (ire->ire_type & IRE_MULTICAST)) { 1141 /* No ire_nce_cache */ 1142 return (ire); 1143 } 1144 1145 /* Setup ire_nce_cache if it doesn't exist or is condemned. */ 1146 mutex_enter(&ire->ire_lock); 1147 nce = ire->ire_nce_cache; 1148 if (nce == NULL || nce->nce_is_condemned) { 1149 mutex_exit(&ire->ire_lock); 1150 (void) ire_revalidate_nce(ire); 1151 } else { 1152 mutex_exit(&ire->ire_lock); 1153 } 1154 return (ire); 1155 } 1156 1157 /* 1158 * Find a route given some xmit attributes and a packet. 1159 * Generic for IPv4 and IPv6 1160 * 1161 * This never returns NULL. But when it returns the IRE_NOROUTE 1162 * it might set errorp. 
1163 */ 1164 ire_t * 1165 ip_select_route_pkt(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp, 1166 int *errorp, boolean_t *multirtp) 1167 { 1168 if (ixa->ixa_flags & IXAF_IS_IPV4) { 1169 ipha_t *ipha = (ipha_t *)mp->b_rptr; 1170 in6_addr_t v6dst; 1171 1172 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst); 1173 1174 return (ip_select_route(&v6dst, ixa, generationp, 1175 NULL, errorp, multirtp)); 1176 } else { 1177 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 1178 1179 return (ip_select_route(&ip6h->ip6_dst, ixa, generationp, 1180 NULL, errorp, multirtp)); 1181 } 1182 } 1183 1184 ire_t * 1185 ip_select_route_v4(ipaddr_t dst, ip_xmit_attr_t *ixa, uint_t *generationp, 1186 ipaddr_t *v4setsrcp, int *errorp, boolean_t *multirtp) 1187 { 1188 in6_addr_t v6dst; 1189 ire_t *ire; 1190 in6_addr_t setsrc; 1191 1192 ASSERT(ixa->ixa_flags & IXAF_IS_IPV4); 1193 1194 IN6_IPADDR_TO_V4MAPPED(dst, &v6dst); 1195 1196 setsrc = ipv6_all_zeros; 1197 ire = ip_select_route(&v6dst, ixa, generationp, &setsrc, errorp, 1198 multirtp); 1199 if (v4setsrcp != NULL) 1200 IN6_V4MAPPED_TO_IPADDR(&setsrc, *v4setsrcp); 1201 return (ire); 1202 } 1203 1204 /* 1205 * Recursively look for a route to the destination. Can also match on 1206 * the zoneid, ill, and label. Used for the data paths. See also 1207 * ire_route_recursive. 1208 * 1209 * If ill is set this means we will match it by adding MATCH_IRE_ILL. 1210 * 1211 * Note that this function never returns NULL. It returns an IRE_NOROUTE 1212 * instead. 1213 * 1214 * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it 1215 * is an error. 1216 * Allow at most one RTF_INDIRECT. 
 */
/*
 * Arguments, as used by the code below:
 *
 *   ire         - If non-NULL it is used as the first IRE of the chain
 *                 instead of doing a lookup; an extra hold is taken on it
 *                 here. If NULL the first IRE comes from
 *                 ire_ftable_lookup_v4().
 *   nexthop     - Address to look up. When recursing it is replaced by the
 *                 ire_gateway_addr of the IRE just found.
 *   ire_type,
 *   match_args  - Passed to ire_ftable_lookup_v4(). After the first
 *                 iteration match_args is reduced to its MATCH_IRE_TYPE bit
 *                 (plus MATCH_IRE_ILL when an ill is being followed).
 *   ill_arg     - If non-NULL, MATCH_IRE_ILL is added and this ill is used
 *                 for every lookup. If NULL, the ire_ill of the first IRE
 *                 that has one is held and used for subsequent lookups.
 *   zoneid, tsl,
 *   xmit_hint   - Passed through to ire_ftable_lookup_v4().
 *   allocate    - If B_FALSE no IRE_IF_CLONE is created for an
 *                 IRE_INTERFACE; the returned generation is then
 *                 IRE_GENERATION_VERIFY.
 *   ipst        - Stack instance; also used to get the reject/blackhole IRE.
 *   setsrcp     - Optional. Must point at INADDR_ANY on entry (asserted);
 *                 receives the ire_setsrc_addr of the first RTF_SETSRC IRE.
 *   gwattrp     - Optional. Must point at NULL on entry (asserted);
 *                 receives the first non-NULL ire_gw_secattr.
 *   generationp - Optional. On entry, when ire is non-NULL, it supplies the
 *                 generation recorded for that IRE. On return it is set to
 *                 the resulting generation, or to IRE_GENERATION_VERIFY on
 *                 any error path.
 *
 * On success the held ires[0] (the first IRE in the chain) is returned. On
 * failure an IRE with RTF_REJECT or RTF_BLACKHOLE set is returned.
 */
ire_t *
ire_route_recursive_impl_v4(ire_t *ire,
    ipaddr_t nexthop, uint_t ire_type, const ill_t *ill_arg,
    zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args,
    boolean_t allocate, uint32_t xmit_hint, ip_stack_t *ipst, ipaddr_t *setsrcp,
    tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp)
{
	/* i is the number of valid (held) entries in ires[]/generations[] */
	int		i, j;
	ire_t		*ires[MAX_IRE_RECURSION];
	uint_t		generation;
	uint_t		generations[MAX_IRE_RECURSION];
	/* B_TRUE once we have taken our own hold on ill */
	boolean_t	need_refrele = B_FALSE;
	/* B_TRUE when the result must not be cached by the caller */
	boolean_t	invalidate = B_FALSE;
	int		prefs[MAX_IRE_RECURSION];
	ill_t		*ill = NULL;

	if (setsrcp != NULL)
		ASSERT(*setsrcp == INADDR_ANY);
	if (gwattrp != NULL)
		ASSERT(*gwattrp == NULL);

	if (ill_arg != NULL)
		match_args |= MATCH_IRE_ILL;

	/*
	 * We iterate up to three times to resolve a route, even though
	 * we have four slots in the array. The extra slot is for an
	 * IRE_IF_CLONE we might need to create.
	 */
	i = 0;
	while (i < MAX_IRE_RECURSION - 1) {
		/* ire_ftable_lookup handles round-robin/ECMP */
		if (ire == NULL) {
			ire = ire_ftable_lookup_v4(nexthop, 0, 0, ire_type,
			    (ill_arg != NULL ? ill_arg : ill), zoneid, tsl,
			    match_args, xmit_hint, ipst, &generation);
		} else {
			/* Caller passed it; extra hold since we will rele */
			ire_refhold(ire);
			if (generationp != NULL)
				generation = *generationp;
			else
				generation = IRE_GENERATION_VERIFY;
		}
		/* A failed lookup is turned into the reject IRE */
		if (ire == NULL)
			ire = ire_reject(ipst, B_FALSE);

		/* Need to return the ire with RTF_REJECT|BLACKHOLE */
		if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))
			goto error;

		ASSERT(!(ire->ire_type & IRE_MULTICAST)); /* Not in ftable */

		prefs[i] = ire_pref(ire);
		if (i != 0) {
			/*
			 * Don't allow anything unusual past the first
			 * iteration. The ire_pref() value must also be
			 * strictly greater than that of the previous IRE.
			 */
			if ((ire->ire_type &
			    (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST)) ||
			    prefs[i] <= prefs[i-1]) {
				ire_refrele(ire);
				ire = ire_reject(ipst, B_FALSE);
				goto error;
			}
		}
		/* We have a usable IRE */
		ires[i] = ire;
		generations[i] = generation;
		i++;

		/* The first RTF_SETSRC address is passed back if setsrcp */
		if ((ire->ire_flags & RTF_SETSRC) &&
		    setsrcp != NULL && *setsrcp == INADDR_ANY) {
			ASSERT(ire->ire_setsrc_addr != INADDR_ANY);
			*setsrcp = ire->ire_setsrc_addr;
		}

		/* The first ire_gw_secattr is passed back if gwattrp */
		if (ire->ire_gw_secattr != NULL &&
		    gwattrp != NULL && *gwattrp == NULL)
			*gwattrp = ire->ire_gw_secattr;

		/*
		 * Check if we have a short-cut pointer to an IRE for this
		 * destination, and that the cached dependency isn't stale.
		 * In that case we've rejoined an existing tree towards a
		 * parent, thus we don't need to continue the loop to
		 * discover the rest of the tree.
		 */
		mutex_enter(&ire->ire_lock);
		if (ire->ire_dep_parent != NULL &&
		    ire->ire_dep_parent->ire_generation ==
		    ire->ire_dep_parent_generation) {
			mutex_exit(&ire->ire_lock);
			/* The hold is now tracked through ires[] */
			ire = NULL;
			goto done;
		}
		mutex_exit(&ire->ire_lock);

		/*
		 * If this type should have an ire_nce_cache (even if it
		 * doesn't yet have one) then we are done. Includes
		 * IRE_INTERFACE with a full 32 bit mask.
		 */
		if (ire->ire_nce_capable) {
			ire = NULL;
			goto done;
		}
		ASSERT(!(ire->ire_type & IRE_IF_CLONE));
		/*
		 * For an IRE_INTERFACE we create an IRE_IF_CLONE for this
		 * particular destination
		 */
		if (ire->ire_type & IRE_INTERFACE) {
			in6_addr_t	v6nexthop;
			ire_t		*clone;

			ASSERT(ire->ire_masklen != IPV4_ABITS);

			/*
			 * In the case of ip_input and ILLF_FORWARDING not
			 * being set, and in the case of RTM_GET,
			 * there is no point in allocating
			 * an IRE_IF_CLONE. We return the IRE_INTERFACE.
			 * Note that !allocate can result in a ire_dep_parent
			 * which is IRE_IF_* without an IRE_IF_CLONE.
			 * We recover from that when we need to send packets
			 * by ensuring that the generations become
			 * IRE_GENERATION_VERIFY in this case.
			 */
			if (!allocate) {
				invalidate = B_TRUE;
				ire = NULL;
				goto done;
			}

			/* ire_create_if_clone() takes an in6_addr_t */
			IN6_IPADDR_TO_V4MAPPED(nexthop, &v6nexthop);

			clone = ire_create_if_clone(ire, &v6nexthop,
			    &generation);
			if (clone == NULL) {
				/*
				 * Temporary failure - no memory.
				 * Don't want caller to cache IRE_NOROUTE.
				 */
				invalidate = B_TRUE;
				ire = ire_blackhole(ipst, B_FALSE);
				goto error;
			}
			/*
			 * Make clone next to last entry and the
			 * IRE_INTERFACE the last in the dependency
			 * chain since the clone depends on the
			 * IRE_INTERFACE.
			 */
			ASSERT(i >= 1);
			ASSERT(i < MAX_IRE_RECURSION);

			ires[i] = ires[i-1];
			generations[i] = generations[i-1];
			ires[i-1] = clone;
			generations[i-1] = generation;
			i++;

			ire = NULL;
			goto done;
		}

		/*
		 * We only match on the type and optionally ILL when
		 * recursing. The type match is used by some callers
		 * to exclude certain types (such as IRE_IF_CLONE or
		 * IRE_LOCAL|IRE_LOOPBACK).
		 */
		match_args &= MATCH_IRE_TYPE;
		nexthop = ire->ire_gateway_addr;
		if (ill == NULL && ire->ire_ill != NULL) {
			/* Follow this IRE's ill for the remaining lookups */
			ill = ire->ire_ill;
			need_refrele = B_TRUE;
			ill_refhold(ill);
			match_args |= MATCH_IRE_ILL;
		}
		ire = NULL;
	}
	/* Ran out of iterations without completing the chain */
	ASSERT(ire == NULL);
	ire = ire_reject(ipst, B_FALSE);

error:
	/* Here ire is the reject/blackhole IRE that will be returned */
	ASSERT(ire != NULL);
	if (need_refrele)
		ill_refrele(ill);

	/*
	 * In the case of MULTIRT we want to try a different IRE the next
	 * time. We let the next packet retry in that case.
	 */
	if (i > 0 && (ires[0]->ire_flags & RTF_MULTIRT))
		(void) ire_no_good(ires[0]);

cleanup:
	/* Undo any dependencies and drop the holds on ires[0..i-1] */
	ire_dep_unbuild(ires, i);
	for (j = 0; j < i; j++)
		ire_refrele(ires[j]);

	ASSERT(ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE));
	/*
	 * Use IRE_GENERATION_VERIFY to ensure that ip_output will redo the
	 * ip_select_route since the reject or lack of memory might be gone.
	 */
	if (generationp != NULL)
		*generationp = IRE_GENERATION_VERIFY;
	return (ire);

done:
	ASSERT(ire == NULL);
	if (need_refrele) {
		ill_refrele(ill);
		ill = NULL;
	}

	/* Build dependencies */
	if (!ire_dep_build(ires, generations, i)) {
		/* Something in chain was condemned; tear it apart */
		ire = ire_reject(ipst, B_FALSE);
		goto cleanup;
	}

	/*
	 * Release all refholds except the one for ires[0] that we
	 * will return to the caller.
	 */
	for (j = 1; j < i; j++)
		ire_refrele(ires[j]);

	if (invalidate) {
		/*
		 * On this path invalidate is only set when allocate was
		 * B_FALSE and the IRE_IF_CLONE was therefore not created
		 * (the allocation-failure case goes through the error
		 * label instead). We need to make sure that the
		 * dependency chain is rebuilt the next time.
		 */
		ire_dep_invalidate_generations(ires[0]);
		generation = IRE_GENERATION_VERIFY;
	} else {
		/*
		 * IREs can have been added or deleted while we did the
		 * recursive lookup and we can't catch those until we've built
		 * the dependencies. We verify the stored
		 * ire_dep_parent_generation to catch any such changes and
		 * return IRE_GENERATION_VERIFY (which will cause
		 * ip_select_route to be called again so we can redo the
		 * recursive lookup next time we send a packet).
		 */
		generation = ire_dep_validate_generations(ires[0]);
		if (generations[0] != ires[0]->ire_generation) {
			/* Something changed at the top */
			generation = IRE_GENERATION_VERIFY;
		}
	}
	if (generationp != NULL)
		*generationp = generation;

	return (ires[0]);
}

/*
 * Recursive route lookup where the first IRE is found by a forwarding
 * table lookup: calls ire_route_recursive_impl_v4() with a NULL starting
 * ire and passes every other argument through unchanged.
 */
ire_t *
ire_route_recursive_v4(ipaddr_t nexthop, uint_t ire_type, const ill_t *ill,
    zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args,
    boolean_t allocate, uint32_t xmit_hint, ip_stack_t *ipst, ipaddr_t *setsrcp,
    tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp)
{
	return (ire_route_recursive_impl_v4(NULL, nexthop, ire_type, ill,
	    zoneid, tsl, match_args, allocate, xmit_hint, ipst, setsrcp,
	    gwattrp, generationp));
}

/*
 * Recursively look for a route to the destination.
 * We only handle a destination match here, yet we have the same arguments
 * as the full match to allow function pointers to select between the two.
 *
 * Note that this function never returns NULL. It returns an IRE_NOROUTE
 * instead.
 *
 * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it
 * is an error.
 * Allow at most one RTF_INDIRECT.
1506 */ 1507 ire_t * 1508 ire_route_recursive_dstonly_v4(ipaddr_t nexthop, boolean_t allocate, 1509 uint32_t xmit_hint, ip_stack_t *ipst) 1510 { 1511 ire_t *ire; 1512 ire_t *ire1; 1513 uint_t generation; 1514 1515 /* ire_ftable_lookup handles round-robin/ECMP */ 1516 ire = ire_ftable_lookup_simple_v4(nexthop, xmit_hint, ipst, 1517 &generation); 1518 ASSERT(ire != NULL); 1519 1520 /* 1521 * If this type should have an ire_nce_cache (even if it 1522 * doesn't yet have one) then we are done. Includes 1523 * IRE_INTERFACE with a full 32 bit mask. 1524 */ 1525 if (ire->ire_nce_capable) 1526 return (ire); 1527 1528 /* 1529 * If the IRE has a current cached parent we know that the whole 1530 * parent chain is current, hence we don't need to discover and 1531 * build any dependencies by doing a recursive lookup. 1532 */ 1533 mutex_enter(&ire->ire_lock); 1534 if (ire->ire_dep_parent != NULL && 1535 ire->ire_dep_parent->ire_generation == 1536 ire->ire_dep_parent_generation) { 1537 mutex_exit(&ire->ire_lock); 1538 return (ire); 1539 } 1540 mutex_exit(&ire->ire_lock); 1541 1542 /* 1543 * Fallback to loop in the normal code starting with the ire 1544 * we found. Normally this would return the same ire. 1545 */ 1546 ire1 = ire_route_recursive_impl_v4(ire, nexthop, 0, NULL, ALL_ZONES, 1547 NULL, MATCH_IRE_DSTONLY, allocate, xmit_hint, ipst, NULL, NULL, 1548 &generation); 1549 ire_refrele(ire); 1550 return (ire1); 1551 } 1552