/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * This file contains consumer routines of the IPv4 forwarding engine
 */

#include <sys/types.h>
#include <sys/stream.h>
#include <sys/stropts.h>
#include <sys/strlog.h>
#include <sys/dlpi.h>
#include <sys/ddi.h>
#include <sys/cmn_err.h>
#include <sys/policy.h>

#include <sys/systm.h>
#include <sys/strsun.h>
#include <sys/kmem.h>
#include <sys/param.h>
#include <sys/socket.h>
#include <sys/strsubr.h>
#include <net/if.h>
#include <net/route.h>
#include <netinet/in.h>
#include <net/if_dl.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>

#include <inet/ipsec_impl.h>
#include <inet/common.h>
#include <inet/mi.h>
#include <inet/mib2.h>
#include <inet/ip.h>
#include <inet/ip_impl.h>
#include <inet/ip6.h>
#include <inet/ip_ndp.h>
#include <inet/arp.h>
#include <inet/ip_if.h>
#include <inet/ip_ire.h>
#include <inet/ip_ftable.h>
#include <inet/ip_rts.h>
#include <inet/nd.h>

#include <net/pfkeyv2.h>
#include <inet/sadb.h>
#include <inet/tcp.h>
#include <inet/ipclassifier.h>
#include <sys/zone.h>
#include <net/radix.h>
#include <sys/tsol/label.h>
#include <sys/tsol/tnet.h>

#define	IS_DEFAULT_ROUTE(ire)	\
	(((ire)->ire_type & IRE_DEFAULT) || \
	    (((ire)->ire_type & IRE_INTERFACE) && ((ire)->ire_addr == 0)))

static ire_t	*route_to_dst(const struct sockaddr *, zoneid_t, ip_stack_t *);
static void	ire_del_host_redir(ire_t *, char *);
static boolean_t ire_find_best_route(struct radix_node *, void *);

/*
 * Lookup a route in forwarding table. A specific lookup is indicated by
 * passing the required parameters and indicating the match required in the
 * flag field.
 *
 * Supports IP_BOUND_IF by following the ipif/ill when recursing.
 */
ire_t *
ire_ftable_lookup_v4(ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway,
    int type, const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl,
    int flags, uint32_t xmit_hint, ip_stack_t *ipst, uint_t *generationp)
{
	ire_t *ire;
	struct rt_sockaddr rdst, rmask;
	struct rt_entry *rt;
	ire_ftable_args_t margs;

	ASSERT(ill == NULL || !ill->ill_isv6);

	/*
	 * ire_match_args() will dereference ill if MATCH_IRE_ILL
	 * is set.
	 */
	if ((flags & MATCH_IRE_ILL) && (ill == NULL))
		return (NULL);

	bzero(&rdst, sizeof (rdst));
	rdst.rt_sin_len = sizeof (rdst);
	rdst.rt_sin_family = AF_INET;
	rdst.rt_sin_addr.s_addr = addr;

	bzero(&rmask, sizeof (rmask));
	rmask.rt_sin_len = sizeof (rmask);
	rmask.rt_sin_family = AF_INET;
	rmask.rt_sin_addr.s_addr = mask;

	bzero(&margs, sizeof (margs));
	margs.ift_addr = addr;
	margs.ift_mask = mask;
	margs.ift_gateway = gateway;
	margs.ift_type = type;
	margs.ift_ill = ill;
	margs.ift_zoneid = zoneid;
	margs.ift_tsl = tsl;
	margs.ift_flags = flags;

	/*
	 * The flags argument passed to ire_ftable_lookup may cause the
	 * search to return, not the longest matching prefix, but the
	 * "best matching prefix", i.e., the longest prefix that also
	 * satisfies constraints imposed via the permutation of flags
	 * passed in. To achieve this, we invoke ire_match_args() on
	 * each matching leaf in the radix tree. ire_match_args is
	 * invoked by the callback function ire_find_best_route().
	 * We hold the global tree lock in read mode when calling
	 * rn_match_args. Before dropping the global tree lock, ensure
	 * that the radix node can't be deleted by incrementing ire_refcnt.
	 */
	RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
	rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst,
	    ipst->ips_ip_ftable, ire_find_best_route, &margs);
	ire = margs.ift_best_ire;
	if (rt == NULL) {
		RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
		return (NULL);
	}
	ASSERT(ire != NULL);

	DTRACE_PROBE2(ire__found, ire_ftable_args_t *, &margs, ire_t *, ire);

	/*
	 * round-robin only if we have more than one route in the bucket.
	 * ips_ip_ecmp_behavior controls when we do ECMP
	 *	2:	always
	 *	1:	for IRE_DEFAULT and /0 IRE_INTERFACE
	 *	0:	never
	 */
	if (ire->ire_bucket->irb_ire_cnt > 1 && !(flags & MATCH_IRE_GW)) {
		if (ipst->ips_ip_ecmp_behavior == 2 ||
		    (ipst->ips_ip_ecmp_behavior == 1 &&
		    IS_DEFAULT_ROUTE(ire))) {
			ire_t	*next_ire;

			margs.ift_best_ire = NULL;
			next_ire = ire_round_robin(ire->ire_bucket, &margs,
			    xmit_hint, ire, ipst);
			if (next_ire == NULL) {
				/* keep ire if next_ire is null */
				goto done;
			}
			ire_refrele(ire);
			ire = next_ire;
		}
	}

done:
	/* Return generation before dropping lock */
	if (generationp != NULL)
		*generationp = ire->ire_generation;

	RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);

	/*
	 * For shared-IP zones we need additional checks beyond what was
	 * done in ire_match_args to make sure IRE_LOCALs are handled.
	 *
	 * When ip_restrict_interzone_loopback is set, then
	 * we ensure that IRE_LOCAL are only used for loopback
	 * between zones when the logical "Ethernet" would
	 * have looped them back. That is, if in the absence of
	 * the IRE_LOCAL we would have sent the packet out the
	 * same ill.
	 */
	if ((ire->ire_type & IRE_LOCAL) && zoneid != ALL_ZONES &&
	    ire->ire_zoneid != zoneid && ire->ire_zoneid != ALL_ZONES &&
	    ipst->ips_ip_restrict_interzone_loopback) {
		ire = ire_alt_local(ire, zoneid, tsl, ill, generationp);
		ASSERT(ire != NULL);
	}
	return (ire);
}
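
/*
 * Example (hypothetical caller, for illustration only): a longest-prefix
 * lookup restricted to a particular ill could look roughly like this;
 * the returned ire is held and must be released with ire_refrele().
 *
 *	ire = ire_ftable_lookup_v4(dst, 0, 0, 0, ill, zoneid, NULL,
 *	    MATCH_IRE_ILL, 0, ipst, NULL);
 *	if (ire != NULL) {
 *		... use the route ...
 *		ire_refrele(ire);
 *	}
 */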

/*
 * This function is called by
 * ip_input/ire_route_recursive when doing a route lookup on only the
 * destination address.
 *
 * The optimizations of this function over ire_ftable_lookup are:
 *	o removing unnecessary flag matching
 *	o doing longest prefix match instead of overloading it further
 *	  with the unnecessary "best_prefix_match"
 *
 * If no route is found we return IRE_NOROUTE.
 */
ire_t *
ire_ftable_lookup_simple_v4(ipaddr_t addr, uint32_t xmit_hint, ip_stack_t *ipst,
    uint_t *generationp)
{
	ire_t *ire;
	struct rt_sockaddr rdst;
	struct rt_entry *rt;
	irb_t *irb;

	rdst.rt_sin_len = sizeof (rdst);
	rdst.rt_sin_family = AF_INET;
	rdst.rt_sin_addr.s_addr = addr;

	/*
	 * This is basically inlining a simpler version of ire_match_args
	 */
	RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);

	rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst,
	    ipst->ips_ip_ftable, NULL, NULL);

	if (rt == NULL)
		goto bad;

	irb = &rt->rt_irb;
	if (irb->irb_ire_cnt == 0)
		goto bad;

	rw_enter(&irb->irb_lock, RW_READER);
	ire = irb->irb_ire;
	if (ire == NULL) {
		rw_exit(&irb->irb_lock);
		goto bad;
	}
	while (IRE_IS_CONDEMNED(ire)) {
		ire = ire->ire_next;
		if (ire == NULL) {
			rw_exit(&irb->irb_lock);
			goto bad;
		}
	}

	/* we have an ire that matches */
	ire_refhold(ire);
	rw_exit(&irb->irb_lock);

	/*
	 * round-robin only if we have more than one route in the bucket.
	 * ips_ip_ecmp_behavior controls when we do ECMP
	 *	2:	always
	 *	1:	for IRE_DEFAULT and /0 IRE_INTERFACE
	 *	0:	never
	 *
	 * Note: if we found an IRE_IF_CLONE we won't look at the bucket with
	 * other ECMP IRE_INTERFACEs since the IRE_IF_CLONE is a /32 match
	 * and the IRE_INTERFACEs are likely to be shorter matches.
	 */
	if (ire->ire_bucket->irb_ire_cnt > 1) {
		if (ipst->ips_ip_ecmp_behavior == 2 ||
		    (ipst->ips_ip_ecmp_behavior == 1 &&
		    IS_DEFAULT_ROUTE(ire))) {
			ire_t	*next_ire;
			ire_ftable_args_t margs;

			bzero(&margs, sizeof (margs));
			margs.ift_addr = addr;
			margs.ift_zoneid = ALL_ZONES;

			next_ire = ire_round_robin(ire->ire_bucket, &margs,
			    xmit_hint, ire, ipst);
			if (next_ire == NULL) {
				/* keep ire if next_ire is null */
				if (generationp != NULL)
					*generationp = ire->ire_generation;
				RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
				return (ire);
			}
			ire_refrele(ire);
			ire = next_ire;
		}
	}
	/* Return generation before dropping lock */
	if (generationp != NULL)
		*generationp = ire->ire_generation;

	RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);

	/*
	 * Since we only did ALL_ZONES matches there is no special handling
	 * of IRE_LOCALs needed here. ire_ftable_lookup_v4 has to handle that.
	 */
	return (ire);

bad:
	if (generationp != NULL)
		*generationp = IRE_GENERATION_VERIFY;

	RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
	return (ire_reject(ipst, B_FALSE));
}

/*
 * Find the ill matching a multicast group.
 * Allows different routes for multicast addresses
 * in the unicast routing table (akin to 224.0.0.0 but could be more specific)
 * which point at different interfaces. This is used when IP_MULTICAST_IF
 * isn't specified (when sending) and when IP_ADD_MEMBERSHIP doesn't
 * specify the interface to join on.
 *
 * Supports link-local addresses by using ire_route_recursive which follows
 * the ill when recursing.
 *
 * To handle CGTP, since we don't have a separate IRE_MULTICAST for each group
 * and the MULTIRT property can be different for different groups, we
 * extract RTF_MULTIRT from the special unicast route added for a group
 * with CGTP and pass that back in the multirtp argument.
 * This is used in ip_set_destination etc to set ixa_postfragfn for multicast.
 * We have a setsrcp argument for the same reason.
 */
ill_t *
ire_lookup_multi_ill_v4(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst,
    boolean_t *multirtp, ipaddr_t *setsrcp)
{
	ire_t	*ire;
	ill_t	*ill;

	ire = ire_route_recursive_v4(group, 0, NULL, zoneid, NULL,
	    MATCH_IRE_DSTONLY, IRR_NONE, 0, ipst, setsrcp, NULL, NULL);
	ASSERT(ire != NULL);
	if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
		ire_refrele(ire);
		return (NULL);
	}

	if (multirtp != NULL)
		*multirtp = (ire->ire_flags & RTF_MULTIRT) != 0;

	ill = ire_nexthop_ill(ire);
	ire_refrele(ire);
	return (ill);
}

/*
 * Delete the passed in ire if the gateway addr matches
 */
void
ire_del_host_redir(ire_t *ire, char *gateway)
{
	if ((ire->ire_flags & RTF_DYNAMIC) &&
	    (ire->ire_gateway_addr == *(ipaddr_t *)gateway))
		ire_delete(ire);
}

/*
 * Search for all IRE_HOST RTF_DYNAMIC (aka redirect) routes that are
 * pointing at the specified gateway and
 * delete them. This routine is called only
 * when a default gateway is going away.
 */
void
ire_delete_host_redirects(ipaddr_t gateway, ip_stack_t *ipst)
{
	struct rtfuncarg rtfarg;

	bzero(&rtfarg, sizeof (rtfarg));
	rtfarg.rt_func = ire_del_host_redir;
	rtfarg.rt_arg = (void *)&gateway;
	rtfarg.rt_zoneid = ALL_ZONES;
	rtfarg.rt_ipst = ipst;
	(void) ipst->ips_ip_ftable->rnh_walktree_mt(ipst->ips_ip_ftable,
	    rtfunc, &rtfarg, irb_refhold_rn, irb_refrele_rn);
}

/*
 * Obtain the rt_entry and rt_irb for the route to be added to
 * the ips_ip_ftable.
 * First attempt to add a node to the radix tree via rn_addroute. If the
 * route already exists, return the bucket for the existing route.
 *
 * Locking notes: Need to hold the global radix tree lock in write mode to
 * add a radix node. To prevent the node from being deleted, ire_get_bucket()
 * returns with a ref'ed irb_t. The ire itself is added in ire_add_v4()
 * while holding the irb_lock, but not the radix tree lock.
 */
irb_t *
ire_get_bucket(ire_t *ire)
{
	struct radix_node *rn;
	struct rt_entry *rt;
	struct rt_sockaddr rmask, rdst;
	irb_t *irb = NULL;
	ip_stack_t *ipst = ire->ire_ipst;

	ASSERT(ipst->ips_ip_ftable != NULL);

	/* first try to see if route exists (based on rtalloc1) */
	bzero(&rdst, sizeof (rdst));
	rdst.rt_sin_len = sizeof (rdst);
	rdst.rt_sin_family = AF_INET;
	rdst.rt_sin_addr.s_addr = ire->ire_addr;

	bzero(&rmask, sizeof (rmask));
	rmask.rt_sin_len = sizeof (rmask);
	rmask.rt_sin_family = AF_INET;
	rmask.rt_sin_addr.s_addr = ire->ire_mask;

	/*
	 * add the route.  Based on BSD's rtrequest1(RTM_ADD)
	 */
	R_Malloc(rt, rt_entry_cache, sizeof (*rt));
	/* kmem_alloc failed */
	if (rt == NULL)
		return (NULL);

	bzero(rt, sizeof (*rt));
	rt->rt_nodes->rn_key = (char *)&rt->rt_dst;
	rt->rt_dst = rdst;
	irb = &rt->rt_irb;
	irb->irb_marks |= IRB_MARK_DYNAMIC; /* dynamically allocated/freed */
	irb->irb_ipst = ipst;
	rw_init(&irb->irb_lock, NULL, RW_DEFAULT, NULL);
	RADIX_NODE_HEAD_WLOCK(ipst->ips_ip_ftable);
	rn = ipst->ips_ip_ftable->rnh_addaddr(&rt->rt_dst, &rmask,
	    ipst->ips_ip_ftable, (struct radix_node *)rt);
	if (rn == NULL) {
		RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
		Free(rt, rt_entry_cache);
		rt = NULL;
		irb = NULL;
		RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
		rn = ipst->ips_ip_ftable->rnh_lookup(&rdst, &rmask,
		    ipst->ips_ip_ftable);
		if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) {
			/* found a non-root match */
			rt = (struct rt_entry *)rn;
		}
	}
	if (rt != NULL) {
		irb = &rt->rt_irb;
		irb_refhold(irb);
	}
	RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
	return (irb);
}

/*
 * This function is used when the caller wants to know the outbound
 * interface for a packet given only the address.
 * If this is an offlink IP address and there are multiple
 * routes to this destination, this routine will use the
 * first route it finds to the IP address.
 * Return values:
 *	0	- FAILURE
 *	nonzero	- ifindex
 */
uint_t
ifindex_lookup(const struct sockaddr *ipaddr, zoneid_t zoneid)
{
	uint_t ifindex = 0;
	ire_t *ire;
	ill_t *ill;
	netstack_t *ns;
	ip_stack_t *ipst;

	if (zoneid == ALL_ZONES)
		ns = netstack_find_by_zoneid(GLOBAL_ZONEID);
	else
		ns = netstack_find_by_zoneid(zoneid);
	ASSERT(ns != NULL);

	/*
	 * For exclusive stacks we set the zoneid to zero
	 * since IP uses the global zoneid in the exclusive stacks.
	 */
	if (ns->netstack_stackid != GLOBAL_NETSTACKID)
		zoneid = GLOBAL_ZONEID;
	ipst = ns->netstack_ip;

	ASSERT(ipaddr->sa_family == AF_INET || ipaddr->sa_family == AF_INET6);

	if ((ire = route_to_dst(ipaddr, zoneid, ipst)) != NULL) {
		ill = ire_nexthop_ill(ire);
		if (ill != NULL) {
			ifindex = ill->ill_phyint->phyint_ifindex;
			ill_refrele(ill);
		}
		ire_refrele(ire);
	}
	netstack_rele(ns);
	return (ifindex);
}

/*
 * Routine to find the route to a destination.  If an ifindex is supplied
 * it tries to match the route to the corresponding ipif for the ifindex.
 */
static ire_t *
route_to_dst(const struct sockaddr *dst_addr, zoneid_t zoneid, ip_stack_t *ipst)
{
	ire_t *ire = NULL;
	int match_flags;

	match_flags = MATCH_IRE_DSTONLY;

	/* XXX pass NULL tsl for now */

	if (dst_addr->sa_family == AF_INET) {
		ire = ire_route_recursive_v4(
		    ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr, 0, NULL,
		    zoneid, NULL, match_flags, IRR_ALLOCATE, 0, ipst, NULL,
		    NULL, NULL);
	} else {
		ire = ire_route_recursive_v6(
		    &((struct sockaddr_in6 *)dst_addr)->sin6_addr, 0, NULL,
		    zoneid, NULL, match_flags, IRR_ALLOCATE, 0, ipst, NULL,
		    NULL, NULL);
	}
	ASSERT(ire != NULL);
	if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) {
		ire_refrele(ire);
		return (NULL);
	}
	return (ire);
}

/*
 * This routine is called by IP Filter to send a packet out on the wire
 * to a specified destination (which may be onlink or offlink). The ifindex may
 * or may not be 0. A nonzero ifindex indicates IP Filter has stipulated
 * an outgoing interface and requires the nexthop to be on that interface.
 * IP WILL NOT DO the following to the data packet before sending it out:
 *	a. manipulate ttl
 *	b. ipsec work
 *	c. fragmentation
 *
 * If the packet has been prepared for hardware checksum then it will be
 * passed off to ip_send_align_cksum() to check that the flags set on the
 * packet are in alignment with the capabilities of the new outgoing NIC.
 *
 * Return values:
 *	0:		IP was able to send off the data pkt
 *	ECOMM:		Could not send packet
 *	ENONET:		No route to dst. It is up to the caller
 *			to send icmp unreachable error message,
 *	EINPROGRESS:	The macaddr of the onlink dst or that
 *			of the offlink dst's nexthop needs to get
 *			resolved before packet can be sent to dst.
 *			Thus transmission is not guaranteed.
 *			Note: No longer have visibility to the ARP queue
 *			hence no EINPROGRESS.
 */
int
ipfil_sendpkt(const struct sockaddr *dst_addr, mblk_t *mp, uint_t ifindex,
    zoneid_t zoneid)
{
	ipaddr_t nexthop;
	netstack_t *ns;
	ip_stack_t *ipst;
	ip_xmit_attr_t ixas;
	int error;

	ASSERT(mp != NULL);

	if (zoneid == ALL_ZONES)
		ns = netstack_find_by_zoneid(GLOBAL_ZONEID);
	else
		ns = netstack_find_by_zoneid(zoneid);
	ASSERT(ns != NULL);

	/*
	 * For exclusive stacks we set the zoneid to zero
	 * since IP uses the global zoneid in the exclusive stacks.
	 */
	if (ns->netstack_stackid != GLOBAL_NETSTACKID)
		zoneid = GLOBAL_ZONEID;
	ipst = ns->netstack_ip;

	ASSERT(dst_addr->sa_family == AF_INET ||
	    dst_addr->sa_family == AF_INET6);

	bzero(&ixas, sizeof (ixas));
	/*
	 * No IPsec, no fragmentation, and don't let any hooks see
	 * the packet.
	 */
	ixas.ixa_flags = IXAF_NO_IPSEC | IXAF_DONTFRAG | IXAF_NO_PFHOOK;
	ixas.ixa_cred = kcred;
	ixas.ixa_cpid = NOPID;
	ixas.ixa_tsl = NULL;
	ixas.ixa_ipst = ipst;
	ixas.ixa_ifindex = ifindex;

	if (dst_addr->sa_family == AF_INET) {
		ipha_t *ipha = (ipha_t *)mp->b_rptr;

		ixas.ixa_flags |= IXAF_IS_IPV4;
		nexthop = ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr;
		if (nexthop != ipha->ipha_dst) {
			ixas.ixa_flags |= IXAF_NEXTHOP_SET;
			ixas.ixa_nexthop_v4 = nexthop;
		}
		ixas.ixa_multicast_ttl = ipha->ipha_ttl;
	} else {
		ip6_t *ip6h = (ip6_t *)mp->b_rptr;
		in6_addr_t *nexthop6;

		nexthop6 = &((struct sockaddr_in6 *)dst_addr)->sin6_addr;
		if (!IN6_ARE_ADDR_EQUAL(nexthop6, &ip6h->ip6_dst)) {
			ixas.ixa_flags |= IXAF_NEXTHOP_SET;
			ixas.ixa_nexthop_v6 = *nexthop6;
		}
		ixas.ixa_multicast_ttl = ip6h->ip6_hops;
	}
	error = ip_output_simple(mp, &ixas);
	ixa_cleanup(&ixas);

	netstack_rele(ns);
	switch (error) {
	case 0:
		break;

	case EHOSTUNREACH:
	case ENETUNREACH:
		error = ENONET;
		break;

	default:
		error = ECOMM;
		break;
	}
	return (error);
}
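
/*
 * Example (hypothetical caller, for illustration only): an IP Filter style
 * consumer that has already built the packet in "mp" might hand it to IP
 * and map the result roughly like this:
 *
 *	switch (ipfil_sendpkt(dst, mp, ifindex, zoneid)) {
 *	case 0:
 *		break;		(packet handed off to IP)
 *	case ENONET:
 *		(no route; the caller may generate an ICMP unreachable)
 *		break;
 *	default:
 *		(ECOMM: transmission failed for some other reason)
 *		break;
 *	}
 */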

/*
 * callback function provided by ire_ftable_lookup when calling
 * rn_match_args(). Invoke ire_match_args on each matching leaf node in
 * the radix tree.
 */
boolean_t
ire_find_best_route(struct radix_node *rn, void *arg)
{
	struct rt_entry *rt = (struct rt_entry *)rn;
	irb_t *irb_ptr;
	ire_t *ire;
	ire_ftable_args_t *margs = arg;
	ipaddr_t match_mask;

	ASSERT(rt != NULL);

	irb_ptr = &rt->rt_irb;

	if (irb_ptr->irb_ire_cnt == 0)
		return (B_FALSE);

	rw_enter(&irb_ptr->irb_lock, RW_READER);
	for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) {
		if (IRE_IS_CONDEMNED(ire))
			continue;
		if (margs->ift_flags & (MATCH_IRE_MASK|MATCH_IRE_SHORTERMASK))
			match_mask = margs->ift_mask;
		else
			match_mask = ire->ire_mask;

		if (ire_match_args(ire, margs->ift_addr, match_mask,
		    margs->ift_gateway, margs->ift_type, margs->ift_ill,
		    margs->ift_zoneid, margs->ift_tsl,
		    margs->ift_flags)) {
			ire_refhold(ire);
			rw_exit(&irb_ptr->irb_lock);
			margs->ift_best_ire = ire;
			return (B_TRUE);
		}
	}
	rw_exit(&irb_ptr->irb_lock);
	return (B_FALSE);
}

/*
 * ftable irb_t structures are dynamically allocated, and we need to
 * check if the irb_t (and associated ftable tree attachment) needs to
 * be cleaned up when the irb_refcnt goes to 0. The conditions that need
 * to be verified are:
 * - no other walkers of the irebucket, i.e., quiescent irb_refcnt,
 * - no other threads holding references to ire's in the bucket,
 *   i.e., irb_nire == 0
 * - no active ire's in the bucket, i.e., irb_ire_cnt == 0
 * - need to hold the global tree lock and irb_lock in write mode.
 */
void
irb_refrele_ftable(irb_t *irb)
{
	for (;;) {
		rw_enter(&irb->irb_lock, RW_WRITER);
		ASSERT(irb->irb_refcnt != 0);
		if (irb->irb_refcnt != 1) {
			/*
			 * Someone has a reference to this radix node
			 * or there is some bucket walker.
			 */
			irb->irb_refcnt--;
			rw_exit(&irb->irb_lock);
			return;
		} else {
			/*
			 * There is no other walker, nor is there any
			 * other thread that holds a direct ref to this
			 * radix node.  Do the clean up if needed.  The call
			 * to ire_unlink will clear the IRB_MARK_CONDEMNED
			 * flag.
			 */
			if (irb->irb_marks & IRB_MARK_CONDEMNED) {
				ire_t *ire_list;

				ire_list = ire_unlink(irb);
				rw_exit(&irb->irb_lock);

				if (ire_list != NULL)
					ire_cleanup(ire_list);
				/*
				 * more CONDEMNED entries could have
				 * been added while we dropped the lock,
				 * so we have to re-check.
				 */
				continue;
			}

			/*
			 * Now check if there are still any ires
			 * associated with this radix node.
			 */
			if (irb->irb_nire != 0) {
				/*
				 * someone is still holding on
				 * to ires in this bucket
				 */
				irb->irb_refcnt--;
				rw_exit(&irb->irb_lock);
				return;
			} else {
				/*
				 * Everything is clear. Zero walkers,
				 * Zero threads with a ref to this
				 * radix node, Zero ires associated with
				 * this radix node. Due to lock order,
				 * check the above conditions again
				 * after grabbing all locks in the right order.
				 */
				rw_exit(&irb->irb_lock);
				if (irb_inactive(irb))
					return;
				/*
				 * irb_inactive could not free the irb.
				 * See if there are any walkers, if not
				 * try to clean up again.
				 */
			}
		}
	}
}

/*
 * IRE iterator used by ire_ftable_lookup to process multiple equal
 * routes. Given a starting point in the hash list (hash), walk the IREs
 * in the bucket skipping deleted entries. We treat the bucket as a circular
 * list for the purposes of walking it.
 * Returns the IRE (held) that corresponds to the hash value. If that IRE is
 * not applicable (ire_match_args failed) then it returns a subsequent one.
 * If we fail to find an IRE we return NULL.
 *
 * Assumes that the caller holds a reference on the IRE bucket and a read lock
 * on the radix_node_head (for IPv4) or the ip6_ire_head (for IPv6).
 *
 * Applies to IPv4 and IPv6.
 *
 * For CGTP, where an IRE_BROADCAST and IRE_HOST can exist for the same
 * address and bucket, we compare against ire_type for the orig_ire. We also
 * have IRE_BROADCASTs with and without RTF_MULTIRT, with the former being
 * first in the bucket. Thus we check that RTF_MULTIRT matches the orig_ire.
 *
 * Due to shared-IP zones we check that an IRE_OFFLINK has a gateway that is
 * reachable from the zone, i.e., that the ire_gateway_addr is in a subnet
 * in which the zone has an IP address. We check this for the global zone
 * even if no shared-IP zones are configured.
 */
ire_t *
ire_round_robin(irb_t *irb_ptr, ire_ftable_args_t *margs, uint_t hash,
    ire_t *orig_ire, ip_stack_t *ipst)
{
	ire_t	*ire, *maybe_ire = NULL;
	uint_t	maybe_badcnt;
	uint_t	maxwalk;

	/* Fold in more bits from the hint/hash */
	hash = hash ^ (hash >> 8) ^ (hash >> 16);

	rw_enter(&irb_ptr->irb_lock, RW_WRITER);
	maxwalk = irb_ptr->irb_ire_cnt;	/* Excludes condemned */
	hash %= maxwalk;
	irb_refhold_locked(irb_ptr);
	rw_exit(&irb_ptr->irb_lock);

	/*
	 * Round-robin the routers list looking for a route that
	 * matches the passed in parameters.
	 * First we skip "hash" number of non-condemned IREs.
	 * Then we match the IRE.
	 * If we find an ire which has a non-zero ire_badcnt then we remember
	 * it and keep on looking for a lower ire_badcnt.
	 * If we come to the end of the list we continue (treat the
	 * bucket list as a circular list) but we examine at most "max"
	 * entries.
	 */
	ire = irb_ptr->irb_ire;
	while (maxwalk > 0) {
		if (IRE_IS_CONDEMNED(ire))
			goto next_ire_skip;

		/* Skip the first "hash" entries to do ECMP */
		if (hash != 0) {
			hash--;
			goto next_ire_skip;
		}

		/* See CGTP comment above */
		if (ire->ire_type != orig_ire->ire_type ||
		    ((ire->ire_flags ^ orig_ire->ire_flags) & RTF_MULTIRT) != 0)
			goto next_ire;

		/*
		 * Note: Since IPv6 has hash buckets instead of radix
		 * buckets we need to explicitly compare the addresses.
		 * That makes this less efficient since we will be called
		 * even if there are no alternatives just because the
		 * bucket has multiple IREs for different addresses.
		 */
		if (ire->ire_ipversion == IPV6_VERSION) {
			if (!IN6_ARE_ADDR_EQUAL(&orig_ire->ire_addr_v6,
			    &ire->ire_addr_v6))
				goto next_ire;
		}

		/*
		 * For some reason find_best_route uses ire_mask. We do
		 * the same.
		 */
		if (ire->ire_ipversion == IPV4_VERSION ?
		    !ire_match_args(ire, margs->ift_addr,
		    ire->ire_mask, margs->ift_gateway,
		    margs->ift_type, margs->ift_ill, margs->ift_zoneid,
		    margs->ift_tsl, margs->ift_flags) :
		    !ire_match_args_v6(ire, &margs->ift_addr_v6,
		    &ire->ire_mask_v6, &margs->ift_gateway_v6,
		    margs->ift_type, margs->ift_ill, margs->ift_zoneid,
		    margs->ift_tsl, margs->ift_flags))
			goto next_ire;

		if (margs->ift_zoneid != ALL_ZONES &&
		    (ire->ire_type & IRE_OFFLINK)) {
			/*
			 * When we're in a zone, we're only
			 * interested in routers that are
			 * reachable through ipifs within our zone.
			 */
			if (ire->ire_ipversion == IPV4_VERSION) {
				if (!ire_gateway_ok_zone_v4(
				    ire->ire_gateway_addr, margs->ift_zoneid,
				    ire->ire_ill, margs->ift_tsl, ipst,
				    B_TRUE))
					goto next_ire;
			} else {
				if (!ire_gateway_ok_zone_v6(
				    &ire->ire_gateway_addr_v6,
				    margs->ift_zoneid, ire->ire_ill,
				    margs->ift_tsl, ipst, B_TRUE))
					goto next_ire;
			}
		}
		mutex_enter(&ire->ire_lock);
		/* Look for stale ire_badcnt and clear */
		if (ire->ire_badcnt != 0 &&
		    (TICK_TO_SEC(ddi_get_lbolt64()) - ire->ire_last_badcnt >
		    ipst->ips_ip_ire_badcnt_lifetime))
			ire->ire_badcnt = 0;
		mutex_exit(&ire->ire_lock);

		if (ire->ire_badcnt == 0) {
			/* We found one with a zero badcnt; done */
			ire_refhold(ire);
			/*
			 * Care needed since irb_refrele grabs WLOCK to free
			 * the irb_t.
			 */
			if (ire->ire_ipversion == IPV4_VERSION) {
				RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
				irb_refrele(irb_ptr);
				RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
			} else {
				rw_exit(&ipst->ips_ip6_ire_head_lock);
				irb_refrele(irb_ptr);
				rw_enter(&ipst->ips_ip6_ire_head_lock,
				    RW_READER);
			}
			return (ire);
		}
		/*
		 * Keep looking to see if there is a better (lower
		 * badcnt) matching IRE, but save this one as a last resort.
		 * If we find a lower badcnt pick that one as the last resort.
		 */
		if (maybe_ire == NULL) {
			maybe_ire = ire;
			maybe_badcnt = ire->ire_badcnt;
		} else if (ire->ire_badcnt < maybe_badcnt) {
			maybe_ire = ire;
			maybe_badcnt = ire->ire_badcnt;
		}

next_ire:
		maxwalk--;
next_ire_skip:
		ire = ire->ire_next;
		if (ire == NULL)
			ire = irb_ptr->irb_ire;
	}
	if (maybe_ire != NULL)
		ire_refhold(maybe_ire);

	/* Care needed since irb_refrele grabs WLOCK to free the irb_t. */
	if (ire->ire_ipversion == IPV4_VERSION) {
		RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
		irb_refrele(irb_ptr);
		RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
	} else {
		rw_exit(&ipst->ips_ip6_ire_head_lock);
		irb_refrele(irb_ptr);
		rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER);
	}
	return (maybe_ire);
}

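/*
 * Radix-tree walker hold/release callbacks (passed, for example, to
 * rnh_walktree_mt in ire_delete_host_redirects above). They map a radix
 * node to its irb_t and take or drop a bucket reference, skipping the
 * tree's root nodes.
 */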
void
irb_refhold_rn(struct radix_node *rn)
{
	if ((rn->rn_flags & RNF_ROOT) == 0)
		irb_refhold(&((rt_t *)(rn))->rt_irb);
}

void
irb_refrele_rn(struct radix_node *rn)
{
	if ((rn->rn_flags & RNF_ROOT) == 0)
		irb_refrele_ftable(&((rt_t *)(rn))->rt_irb);
}

/*
 * Select a route for IPv4 and IPv6. Except for multicast, loopback and reject
 * routes this routine sets up an ire_nce_cache as well. The caller needs to
 * lookup an nce for the multicast case.
 */
ire_t *
ip_select_route(const in6_addr_t *v6dst, ip_xmit_attr_t *ixa,
    uint_t *generationp, in6_addr_t *setsrcp, int *errorp, boolean_t *multirtp)
{
	uint_t		match_args;
	uint_t		ire_type;
	ill_t		*ill;
	ire_t		*ire;
	ip_stack_t	*ipst = ixa->ixa_ipst;
	ipaddr_t	v4dst;
	in6_addr_t	v6nexthop;
	iaflags_t	ixaflags = ixa->ixa_flags;
	nce_t		*nce;

	match_args = MATCH_IRE_SECATTR;
	IN6_V4MAPPED_TO_IPADDR(v6dst, v4dst);
	if (setsrcp != NULL)
		ASSERT(IN6_IS_ADDR_UNSPECIFIED(setsrcp));
	if (errorp != NULL)
		ASSERT(*errorp == 0);

	/*
	 * The content of the ixa will be different if IP_NEXTHOP,
	 * SO_DONTROUTE, IP_BOUND_IF, IP_PKTINFO etc are set.
	 */

	if ((ixaflags & IXAF_IS_IPV4) ? CLASSD(v4dst) :
	    IN6_IS_ADDR_MULTICAST(v6dst)) {
		/* Pick up the IRE_MULTICAST for the ill */
		if (ixa->ixa_multicast_ifindex != 0) {
			ill = ill_lookup_on_ifindex(ixa->ixa_multicast_ifindex,
			    !(ixaflags & IXAF_IS_IPV4), ipst);
		} else if (ixaflags & IXAF_SCOPEID_SET) {
			/* sin6_scope_id takes precedence over ixa_ifindex */
			ASSERT(ixa->ixa_scopeid != 0);
			ill = ill_lookup_on_ifindex(ixa->ixa_scopeid,
			    !(ixaflags & IXAF_IS_IPV4), ipst);
		} else if (ixa->ixa_ifindex != 0) {
			/*
			 * In the ipmp case, the ixa_ifindex is set to
			 * point at an under_ill and we would return the
			 * ire_multicast() corresponding to that under_ill.
			 */
			ill = ill_lookup_on_ifindex(ixa->ixa_ifindex,
			    !(ixaflags & IXAF_IS_IPV4), ipst);
		} else if (ixaflags & IXAF_IS_IPV4) {
			ipaddr_t	v4setsrc = INADDR_ANY;

			ill = ill_lookup_group_v4(v4dst, ixa->ixa_zoneid, ipst,
			    multirtp, &v4setsrc);
			if (setsrcp != NULL)
				IN6_IPADDR_TO_V4MAPPED(v4setsrc, setsrcp);
		} else {
			ill = ill_lookup_group_v6(v6dst, ixa->ixa_zoneid, ipst,
			    multirtp, setsrcp);
		}
		if (ill != NULL && IS_VNI(ill)) {
			ill_refrele(ill);
			ill = NULL;
		}
		if (ill == NULL) {
			if (errorp != NULL)
				*errorp = ENXIO;
			/* Get a hold on the IRE_NOROUTE */
			ire = ire_reject(ipst, !(ixaflags & IXAF_IS_IPV4));
			return (ire);
		}
		if (!(ill->ill_flags & ILLF_MULTICAST)) {
			ill_refrele(ill);
			if (errorp != NULL)
				*errorp = EHOSTUNREACH;
			/* Get a hold on the IRE_NOROUTE */
			ire = ire_reject(ipst, !(ixaflags & IXAF_IS_IPV4));
			return (ire);
		}
		/* Get a refcnt on the single IRE_MULTICAST per ill */
		ire = ire_multicast(ill);
		ill_refrele(ill);
		if (generationp != NULL)
			*generationp = ire->ire_generation;
		if (errorp != NULL &&
		    (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) {
			*errorp = EHOSTUNREACH;
		}
		return (ire);
	}

	if (ixa->ixa_ifindex != 0 || (ixaflags & IXAF_SCOPEID_SET)) {
		if (ixaflags & IXAF_SCOPEID_SET) {
			/* sin6_scope_id takes precedence over ixa_ifindex */
			ASSERT(ixa->ixa_scopeid != 0);
			ill = ill_lookup_on_ifindex(ixa->ixa_scopeid,
			    !(ixaflags & IXAF_IS_IPV4), ipst);
		} else {
			ASSERT(ixa->ixa_ifindex != 0);
			ill = ill_lookup_on_ifindex(ixa->ixa_ifindex,
			    !(ixaflags & IXAF_IS_IPV4), ipst);
		}
		if (ill != NULL && IS_VNI(ill)) {
			ill_refrele(ill);
			ill = NULL;
		}
		if (ill == NULL) {
			if (errorp != NULL)
				*errorp = ENXIO;
			/* Get a hold on the IRE_NOROUTE */
			ire = ire_reject(ipst, !(ixaflags & IXAF_IS_IPV4));
			return (ire);
		}
		/*
		 * icmp_send_reply_v6 uses scopeid, and mpathd sets IP*_BOUND_IF
		 * so for both of them we need to be able to look for an under
		 * interface.
		 */
		if (IS_UNDER_IPMP(ill))
			match_args |= MATCH_IRE_TESTHIDDEN;
	} else {
		ill = NULL;
	}

	if (ixaflags & IXAF_NEXTHOP_SET) {
		/* IP_NEXTHOP was set */
		v6nexthop = ixa->ixa_nexthop_v6;
	} else {
		v6nexthop = *v6dst;
	}

	ire_type = 0;
	/* If ill is null then ire_route_recursive will set MATCH_IRE_ILL */

	/*
	 * If SO_DONTROUTE is set or if IP_NEXTHOP is set, then
	 * we only look for an onlink IRE.
	 */
	if (ixaflags & (IXAF_DONTROUTE|IXAF_NEXTHOP_SET)) {
		match_args |= MATCH_IRE_TYPE;
		ire_type = IRE_ONLINK;
	}

	if (ixaflags & IXAF_IS_IPV4) {
		ipaddr_t	v4nexthop;
		ipaddr_t	v4setsrc = INADDR_ANY;

		IN6_V4MAPPED_TO_IPADDR(&v6nexthop, v4nexthop);
		ire = ire_route_recursive_v4(v4nexthop, ire_type, ill,
		    ixa->ixa_zoneid, ixa->ixa_tsl, match_args, IRR_ALLOCATE,
		    ixa->ixa_xmit_hint, ipst, &v4setsrc, NULL, generationp);
		if (setsrcp != NULL)
			IN6_IPADDR_TO_V4MAPPED(v4setsrc, setsrcp);
	} else {
		ire = ire_route_recursive_v6(&v6nexthop, ire_type, ill,
		    ixa->ixa_zoneid, ixa->ixa_tsl, match_args, IRR_ALLOCATE,
		    ixa->ixa_xmit_hint, ipst, setsrcp, NULL, generationp);
	}

#ifdef DEBUG
	if (match_args & MATCH_IRE_TESTHIDDEN) {
		ip3dbg(("looking for hidden; dst %x ire %p\n",
		    v4dst, (void *)ire));
	}
#endif

	if (ill != NULL)
		ill_refrele(ill);

	if ((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
	    (ire->ire_type & IRE_MULTICAST)) {
		/* No ire_nce_cache */
		return (ire);
	}

	/* Setup ire_nce_cache if it doesn't exist or is condemned. */
	mutex_enter(&ire->ire_lock);
	nce = ire->ire_nce_cache;
	if (nce == NULL || nce->nce_is_condemned) {
		mutex_exit(&ire->ire_lock);
		(void) ire_revalidate_nce(ire);
	} else {
		mutex_exit(&ire->ire_lock);
	}
	return (ire);
}

/*
 * Find a route given some xmit attributes and a packet.
 * Generic for IPv4 and IPv6
 *
 * This never returns NULL. But when it returns the IRE_NOROUTE
 * it might set errorp.
 */
ire_t *
ip_select_route_pkt(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp,
    int *errorp, boolean_t *multirtp)
{
	if (ixa->ixa_flags & IXAF_IS_IPV4) {
		ipha_t		*ipha = (ipha_t *)mp->b_rptr;
		in6_addr_t	v6dst;

		IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst);

		return (ip_select_route(&v6dst, ixa, generationp,
		    NULL, errorp, multirtp));
	} else {
		ip6_t	*ip6h = (ip6_t *)mp->b_rptr;

		return (ip_select_route(&ip6h->ip6_dst, ixa, generationp,
		    NULL, errorp, multirtp));
	}
}

ire_t *
ip_select_route_v4(ipaddr_t dst, ip_xmit_attr_t *ixa, uint_t *generationp,
    ipaddr_t *v4setsrcp, int *errorp, boolean_t *multirtp)
{
	in6_addr_t	v6dst;
	ire_t		*ire;
	in6_addr_t	setsrc;

	ASSERT(ixa->ixa_flags & IXAF_IS_IPV4);

	IN6_IPADDR_TO_V4MAPPED(dst, &v6dst);

	setsrc = ipv6_all_zeros;
	ire = ip_select_route(&v6dst, ixa, generationp, &setsrc, errorp,
	    multirtp);
	if (v4setsrcp != NULL)
		IN6_V4MAPPED_TO_IPADDR(&setsrc, *v4setsrcp);
	return (ire);
}

/*
 * Recursively look for a route to the destination. Can also match on
 * the zoneid, ill, and label. Used for the data paths. See also
 * ire_route_recursive.
 *
 * If ill is set this means we will match it by adding MATCH_IRE_ILL.
 *
 * If IRR_ALLOCATE is not set then we will only inspect the existing IREs;
 * never create an IRE_IF_CLONE. This is used on the receive side when we
 * are not forwarding.
 * If IRR_INCOMPLETE is set then we return the IRE even if we can't correctly
 * resolve the gateway.
 *
 * Note that this function never returns NULL. It returns an IRE_NOROUTE
 * instead.
 *
 * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it
 * is an error.
 * Allow at most one RTF_INDIRECT.
 */
ire_t *
ire_route_recursive_impl_v4(ire_t *ire,
    ipaddr_t nexthop, uint_t ire_type, const ill_t *ill_arg,
    zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args,
    uint_t irr_flags, uint32_t xmit_hint, ip_stack_t *ipst, ipaddr_t *setsrcp,
    tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp)
{
	int		i, j;
	ire_t		*ires[MAX_IRE_RECURSION];
	uint_t		generation;
	uint_t		generations[MAX_IRE_RECURSION];
	boolean_t	need_refrele = B_FALSE;
	boolean_t	invalidate = B_FALSE;
	int		prefs[MAX_IRE_RECURSION];
	ill_t		*ill = NULL;

	if (setsrcp != NULL)
		ASSERT(*setsrcp == INADDR_ANY);
	if (gwattrp != NULL)
		ASSERT(*gwattrp == NULL);

	if (ill_arg != NULL)
		match_args |= MATCH_IRE_ILL;

	/*
	 * We iterate up to three times to resolve a route, even though
	 * we have four slots in the array. The extra slot is for an
	 * IRE_IF_CLONE we might need to create.
	 */
	i = 0;
	while (i < MAX_IRE_RECURSION - 1) {
		/* ire_ftable_lookup handles round-robin/ECMP */
		if (ire == NULL) {
			ire = ire_ftable_lookup_v4(nexthop, 0, 0, ire_type,
			    (ill_arg != NULL ? ill_arg : ill), zoneid, tsl,
			    match_args, xmit_hint, ipst, &generation);
		} else {
			/* Caller passed it; extra hold since we will rele */
			ire_refhold(ire);
			if (generationp != NULL)
				generation = *generationp;
			else
				generation = IRE_GENERATION_VERIFY;
		}
		if (ire == NULL)
			ire = ire_reject(ipst, B_FALSE);

		/* Need to return the ire with RTF_REJECT|BLACKHOLE */
		if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))
			goto error;

		ASSERT(!(ire->ire_type & IRE_MULTICAST)); /* Not in ftable */

		if (i != 0) {
			prefs[i] = ire_pref(ire);
			/*
			 * Don't allow anything unusual past the first
			 * iteration.
			 */
			if ((ire->ire_type &
			    (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST)) ||
			    prefs[i] <= prefs[i-1]) {
				ire_refrele(ire);
				if (irr_flags & IRR_INCOMPLETE) {
					ire = ires[0];
					ire_refhold(ire);
				} else {
					ire = ire_reject(ipst, B_FALSE);
				}
				goto error;
			}
		}
		/* We have a usable IRE */
		ires[i] = ire;
		generations[i] = generation;
		i++;

		/* The first RTF_SETSRC address is passed back if setsrcp */
		if ((ire->ire_flags & RTF_SETSRC) &&
		    setsrcp != NULL && *setsrcp == INADDR_ANY) {
			ASSERT(ire->ire_setsrc_addr != INADDR_ANY);
			*setsrcp = ire->ire_setsrc_addr;
		}

		/* The first ire_gw_secattr is passed back if gwattrp */
		if (ire->ire_gw_secattr != NULL &&
		    gwattrp != NULL && *gwattrp == NULL)
			*gwattrp = ire->ire_gw_secattr;

		/*
		 * Check if we have a short-cut pointer to an IRE for this
		 * destination, and that the cached dependency isn't stale.
		 * In that case we've rejoined an existing tree towards a
		 * parent, thus we don't need to continue the loop to
		 * discover the rest of the tree.
		 */
		mutex_enter(&ire->ire_lock);
		if (ire->ire_dep_parent != NULL &&
		    ire->ire_dep_parent->ire_generation ==
		    ire->ire_dep_parent_generation) {
			mutex_exit(&ire->ire_lock);
			ire = NULL;
			goto done;
		}
		mutex_exit(&ire->ire_lock);

		/*
		 * If this type should have an ire_nce_cache (even if it
		 * doesn't yet have one) then we are done. Includes
		 * IRE_INTERFACE with a full 32 bit mask.
		 */
		if (ire->ire_nce_capable) {
			ire = NULL;
			goto done;
		}
		ASSERT(!(ire->ire_type & IRE_IF_CLONE));
		/*
		 * For an IRE_INTERFACE we create an IRE_IF_CLONE for this
		 * particular destination
		 */
		if (ire->ire_type & IRE_INTERFACE) {
			in6_addr_t	v6nexthop;
			ire_t		*clone;

			ASSERT(ire->ire_masklen != IPV4_ABITS);

			/*
			 * In the case of ip_input and ILLF_FORWARDING not
			 * being set, and in the case of RTM_GET, there is
			 * no point in allocating an IRE_IF_CLONE. We return
			 * the IRE_INTERFACE. Note that !IRR_ALLOCATE can
			 * result in an ire_dep_parent which is IRE_IF_*
			 * without an IRE_IF_CLONE.
			 * We recover from that when we need to send packets
			 * by ensuring that the generations become
			 * IRE_GENERATION_VERIFY in this case.
			 */
			if (!(irr_flags & IRR_ALLOCATE)) {
				invalidate = B_TRUE;
				ire = NULL;
				goto done;
			}

			IN6_IPADDR_TO_V4MAPPED(nexthop, &v6nexthop);

			clone = ire_create_if_clone(ire, &v6nexthop,
			    &generation);
			if (clone == NULL) {
				/*
				 * Temporary failure - no memory.
				 * Don't want caller to cache IRE_NOROUTE.
				 */
				invalidate = B_TRUE;
				ire = ire_blackhole(ipst, B_FALSE);
				goto error;
			}
			/*
			 * Make clone next to last entry and the
			 * IRE_INTERFACE the last in the dependency
			 * chain since the clone depends on the
			 * IRE_INTERFACE.
			 */
			ASSERT(i >= 1);
			ASSERT(i < MAX_IRE_RECURSION);

			ires[i] = ires[i-1];
			generations[i] = generations[i-1];
			ires[i-1] = clone;
			generations[i-1] = generation;
			i++;

			ire = NULL;
			goto done;
		}

		/*
		 * We only match on the type and optionally ILL when
		 * recursing. The type match is used by some callers
		 * to exclude certain types (such as IRE_IF_CLONE or
		 * IRE_LOCAL|IRE_LOOPBACK).
		 */
		match_args &= MATCH_IRE_TYPE;
		nexthop = ire->ire_gateway_addr;
		if (ill == NULL && ire->ire_ill != NULL) {
			ill = ire->ire_ill;
			need_refrele = B_TRUE;
			ill_refhold(ill);
			match_args |= MATCH_IRE_ILL;
		}
		/*
		 * We set the prefs[i] value above if i > 0. We've already
		 * done i++ so i is one in the case of the first time around.
		 */
		if (i == 1)
			prefs[0] = ire_pref(ire);
		ire = NULL;
	}
	ASSERT(ire == NULL);
	ire = ire_reject(ipst, B_FALSE);

error:
	ASSERT(ire != NULL);
	if (need_refrele)
		ill_refrele(ill);

	/*
	 * In the case of MULTIRT we want to try a different IRE the next
	 * time. We let the next packet retry in that case.
	 */
	if (i > 0 && (ires[0]->ire_flags & RTF_MULTIRT))
		(void) ire_no_good(ires[0]);

cleanup:
	/* cleanup ires[i] */
	ire_dep_unbuild(ires, i);
	for (j = 0; j < i; j++)
		ire_refrele(ires[j]);

	ASSERT((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
	    (irr_flags & IRR_INCOMPLETE));
	/*
	 * Use IRE_GENERATION_VERIFY to ensure that ip_output will redo the
	 * ip_select_route since the reject or lack of memory might be gone.
	 */
	if (generationp != NULL)
		*generationp = IRE_GENERATION_VERIFY;
	return (ire);

done:
	ASSERT(ire == NULL);
	if (need_refrele) {
		ill_refrele(ill);
		ill = NULL;
	}

	/* Build dependencies */
	if (i > 1 && !ire_dep_build(ires, generations, i)) {
		/* Something in chain was condemned; tear it apart */
		ire = ire_reject(ipst, B_FALSE);
		goto cleanup;
	}

	/*
	 * Release all refholds except the one for ires[0] that we
	 * will return to the caller.
	 */
	for (j = 1; j < i; j++)
		ire_refrele(ires[j]);

	if (invalidate) {
		/*
		 * Since we needed to allocate but couldn't we need to make
		 * sure that the dependency chain is rebuilt the next time.
		 */
		ire_dep_invalidate_generations(ires[0]);
		generation = IRE_GENERATION_VERIFY;
	} else {
		/*
		 * IREs could have been added or deleted while we did the
		 * recursive lookup and we can't catch those until we've built
		 * the dependencies. We verify the stored
		 * ire_dep_parent_generation to catch any such changes and
		 * return IRE_GENERATION_VERIFY (which will cause
		 * ip_select_route to be called again so we can redo the
		 * recursive lookup the next time we send a packet).
		 */
		if (ires[0]->ire_dep_parent == NULL)
			generation = ires[0]->ire_generation;
		else
			generation = ire_dep_validate_generations(ires[0]);
		if (generations[0] != ires[0]->ire_generation) {
			/* Something changed at the top */
			generation = IRE_GENERATION_VERIFY;
		}
	}
	if (generationp != NULL)
		*generationp = generation;

	return (ires[0]);
}

ire_t *
ire_route_recursive_v4(ipaddr_t nexthop, uint_t ire_type, const ill_t *ill,
    zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args,
    uint_t irr_flags, uint32_t xmit_hint, ip_stack_t *ipst, ipaddr_t *setsrcp,
    tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp)
{
	return (ire_route_recursive_impl_v4(NULL, nexthop, ire_type, ill,
	    zoneid, tsl, match_args, irr_flags, xmit_hint, ipst, setsrcp,
	    gwattrp, generationp));
}

/*
 * Recursively look for a route to the destination.
 * We only handle a destination match here, yet we have the same arguments
 * as the full match to allow function pointers to select between the two.
 *
 * Note that this function never returns NULL. It returns an IRE_NOROUTE
 * instead.
 *
 * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it
 * is an error.
 * Allow at most one RTF_INDIRECT.
 */
ire_t *
ire_route_recursive_dstonly_v4(ipaddr_t nexthop, uint_t irr_flags,
    uint32_t xmit_hint, ip_stack_t *ipst)
{
	ire_t	*ire;
	ire_t	*ire1;
	uint_t	generation;

	/* ire_ftable_lookup handles round-robin/ECMP */
	ire = ire_ftable_lookup_simple_v4(nexthop, xmit_hint, ipst,
	    &generation);
	ASSERT(ire != NULL);

	/*
	 * If this type should have an ire_nce_cache (even if it
	 * doesn't yet have one) then we are done. Includes
	 * IRE_INTERFACE with a full 32 bit mask.
	 */
	if (ire->ire_nce_capable)
		return (ire);

	/*
	 * If the IRE has a current cached parent we know that the whole
	 * parent chain is current, hence we don't need to discover and
	 * build any dependencies by doing a recursive lookup.
	 */
	mutex_enter(&ire->ire_lock);
	if (ire->ire_dep_parent != NULL &&
	    ire->ire_dep_parent->ire_generation ==
	    ire->ire_dep_parent_generation) {
		mutex_exit(&ire->ire_lock);
		return (ire);
	}
	mutex_exit(&ire->ire_lock);

	/*
	 * Fall back to the loop in the normal code, starting with the ire
	 * we found. Normally this would return the same ire.
	 */
	ire1 = ire_route_recursive_impl_v4(ire, nexthop, 0, NULL, ALL_ZONES,
	    NULL, MATCH_IRE_DSTONLY, irr_flags, xmit_hint, ipst, NULL, NULL,
	    &generation);
	ire_refrele(ire);
	return (ire1);
}