/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved.
 */

/*
 * This file contains consumer routines of the IPv4 forwarding engine
 */

#include <sys/types.h>
#include <sys/stream.h>
#include <sys/stropts.h>
#include <sys/strlog.h>
#include <sys/dlpi.h>
#include <sys/ddi.h>
#include <sys/cmn_err.h>
#include <sys/policy.h>

#include <sys/systm.h>
#include <sys/strsun.h>
#include <sys/kmem.h>
#include <sys/param.h>
#include <sys/socket.h>
#include <sys/strsubr.h>
#include <net/if.h>
#include <net/route.h>
#include <netinet/in.h>
#include <net/if_dl.h>
#include <netinet/ip6.h>
#include <netinet/icmp6.h>

#include <inet/ipsec_impl.h>
#include <inet/common.h>
#include <inet/mi.h>
#include <inet/mib2.h>
#include <inet/ip.h>
#include <inet/ip_impl.h>
#include <inet/ip6.h>
#include <inet/ip_ndp.h>
#include <inet/arp.h>
#include <inet/ip_if.h>
#include <inet/ip_ire.h>
#include <inet/ip_ftable.h>
#include <inet/ip_rts.h>
#include <inet/nd.h>

#include <net/pfkeyv2.h>
#include <inet/sadb.h>
#include <inet/tcp.h>
#include <inet/ipclassifier.h>
#include <sys/zone.h>
#include <net/radix.h>
#include <sys/tsol/label.h>
#include <sys/tsol/tnet.h>

/*
 * True when ire is an IRE_DEFAULT, or an IRE_INTERFACE whose ire_addr is 0.
 * Used below to decide whether ECMP applies when ips_ip_ecmp_behavior is 1.
 */
#define	IS_DEFAULT_ROUTE(ire)	\
	(((ire)->ire_type & IRE_DEFAULT) || \
	(((ire)->ire_type & IRE_INTERFACE) && ((ire)->ire_addr == 0)))

/*
 * Picks the IPv6 or the IPv4 strict source multihoming setting out of ipst.
 */
#define	IP_SRC_MULTIHOMING(isv6, ipst)			\
	(isv6 ? ipst->ips_ipv6_strict_src_multihoming :	\
	ipst->ips_ip_strict_src_multihoming)

static ire_t *route_to_dst(const struct sockaddr *, zoneid_t, ip_stack_t *);
static void ire_del_host_redir(ire_t *, char *);
static boolean_t ire_find_best_route(struct radix_node *, void *);

/*
 * Lookup a route in forwarding table. A specific lookup is indicated by
 * passing the required parameters and indicating the match required in the
 * flag field.
 *
 * Supports IP_BOUND_IF by following the ipif/ill when recursing.
 *
 * addr, mask, gateway, type, ill, zoneid, tsl and flags are copied into an
 * ire_ftable_args_t and handed to ire_find_best_route(), which applies
 * ire_match_args() to each candidate. xmit_hint is only used as the hash for
 * ire_round_robin() when ECMP applies.
 *
 * Returns NULL when MATCH_IRE_ILL or MATCH_IRE_SRC_ILL is requested without
 * an ill, or when the radix lookup finds nothing. Otherwise the returned ire
 * carries a reference taken by ire_find_best_route() or ire_round_robin().
 * If generationp is non-NULL it receives the ire_generation of the selected
 * ire, sampled while the radix tree lock is still held; it is also passed on
 * to ire_alt_local() in the inter-zone IRE_LOCAL case.
 */
ire_t *
ire_ftable_lookup_v4(ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway,
    int type, const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl,
    int flags, uint32_t xmit_hint, ip_stack_t *ipst, uint_t *generationp)
{
	ire_t *ire;
	struct rt_sockaddr rdst, rmask;
	struct rt_entry *rt;
	ire_ftable_args_t margs;

	ASSERT(ill == NULL || !ill->ill_isv6);

	/*
	 * ire_match_args() will dereference ill if MATCH_IRE_ILL
	 * is set.
	 */
	if ((flags & (MATCH_IRE_ILL|MATCH_IRE_SRC_ILL)) && (ill == NULL))
		return (NULL);

	/* Radix key for the destination; fully zeroed before being filled. */
	bzero(&rdst, sizeof (rdst));
	rdst.rt_sin_len = sizeof (rdst);
	rdst.rt_sin_family = AF_INET;
	rdst.rt_sin_addr.s_addr = addr;

	/*
	 * rmask is filled in here but is not passed to the lookup below;
	 * the mask reaches ire_match_args() through margs.ift_mask.
	 */
	bzero(&rmask, sizeof (rmask));
	rmask.rt_sin_len = sizeof (rmask);
	rmask.rt_sin_family = AF_INET;
	rmask.rt_sin_addr.s_addr = mask;

	/* ift_best_ire starts out NULL courtesy of the bzero. */
	bzero(&margs, sizeof (margs));
	margs.ift_addr = addr;
	margs.ift_mask = mask;
	margs.ift_gateway = gateway;
	margs.ift_type = type;
	margs.ift_ill = ill;
	margs.ift_zoneid = zoneid;
	margs.ift_tsl = tsl;
	margs.ift_flags = flags;

	/*
	 * The flags argument passed to ire_ftable_lookup may cause the
	 * search to return, not the longest matching prefix, but the
	 * "best matching prefix", i.e., the longest prefix that also
	 * satisfies constraints imposed via the permutation of flags
	 * passed in. To achieve this, we invoke ire_match_args() on
	 * each matching leaf in the radix tree. ire_match_args is
	 * invoked by the callback function ire_find_best_route()
	 * We hold the global tree lock in read mode when calling
	 * rn_match_args. Before dropping the global tree lock, ensure
	 * that the radix node can't be deleted by incrementing ire_refcnt.
	 */
	RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
	rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst,
	    ipst->ips_ip_ftable, ire_find_best_route, &margs);
	ire = margs.ift_best_ire;
	if (rt == NULL) {
		RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
		return (NULL);
	}
	ASSERT(ire != NULL);

	DTRACE_PROBE2(ire__found, ire_ftable_args_t *, &margs, ire_t *, ire);

	/*
	 * round-robin only if we have more than one route in the bucket.
	 * ips_ip_ecmp_behavior controls when we do ECMP
	 *	2:	always
	 *	1:	for IRE_DEFAULT and /0 IRE_INTERFACE
	 *	0:	never
	 *
	 * No round-robin is attempted when the caller asked for a specific
	 * gateway (MATCH_IRE_GW).
	 */
	if (ire->ire_bucket->irb_ire_cnt > 1 && !(flags & MATCH_IRE_GW)) {
		if (ipst->ips_ip_ecmp_behavior == 2 ||
		    (ipst->ips_ip_ecmp_behavior == 1 &&
		    IS_DEFAULT_ROUTE(ire))) {
			ire_t	*next_ire;

			margs.ift_best_ire = NULL;
			next_ire = ire_round_robin(ire->ire_bucket, &margs,
			    xmit_hint, ire, ipst);
			if (next_ire == NULL) {
				/* keep ire if next_ire is null */
				goto done;
			}
			/* Swap our hold on ire for the one on next_ire. */
			ire_refrele(ire);
			ire = next_ire;
		}
	}

done:
	/* Return generation before dropping lock */
	if (generationp != NULL)
		*generationp = ire->ire_generation;

	RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);

	/*
	 * For shared-IP zones we need additional checks to what was
	 * done in ire_match_args to make sure IRE_LOCALs are handled.
	 *
	 * When ip_restrict_interzone_loopback is set, then
	 * we ensure that IRE_LOCAL are only used for loopback
	 * between zones when the logical "Ethernet" would
	 * have looped them back. That is, if in the absence of
	 * the IRE_LOCAL we would have sent the packet out the
	 * same ill.
	 */
	if ((ire->ire_type & IRE_LOCAL) && zoneid != ALL_ZONES &&
	    ire->ire_zoneid != zoneid && ire->ire_zoneid != ALL_ZONES &&
	    ipst->ips_ip_restrict_interzone_loopback) {
		ire = ire_alt_local(ire, zoneid, tsl, ill, generationp);
		ASSERT(ire != NULL);
	}
	return (ire);
}

/*
 * This function is called by
 * ip_input/ire_route_recursive when doing a route lookup on only the
 * destination address.
 *
 * The optimizations of this function over ire_ftable_lookup are:
 *	o removing unnecessary flag matching
 *	o doing longest prefix match instead of overloading it further
 *	  with the unnecessary "best_prefix_match"
 *
 * If no route is found we return IRE_NOROUTE.
220 */ 221 ire_t * 222 ire_ftable_lookup_simple_v4(ipaddr_t addr, uint32_t xmit_hint, ip_stack_t *ipst, 223 uint_t *generationp) 224 { 225 ire_t *ire; 226 struct rt_sockaddr rdst; 227 struct rt_entry *rt; 228 irb_t *irb; 229 230 rdst.rt_sin_len = sizeof (rdst); 231 rdst.rt_sin_family = AF_INET; 232 rdst.rt_sin_addr.s_addr = addr; 233 234 /* 235 * This is basically inlining a simpler version of ire_match_args 236 */ 237 RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); 238 239 rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst, 240 ipst->ips_ip_ftable, NULL, NULL); 241 242 if (rt == NULL) 243 goto bad; 244 245 irb = &rt->rt_irb; 246 if (irb->irb_ire_cnt == 0) 247 goto bad; 248 249 rw_enter(&irb->irb_lock, RW_READER); 250 ire = irb->irb_ire; 251 if (ire == NULL) { 252 rw_exit(&irb->irb_lock); 253 goto bad; 254 } 255 while (IRE_IS_CONDEMNED(ire)) { 256 ire = ire->ire_next; 257 if (ire == NULL) { 258 rw_exit(&irb->irb_lock); 259 goto bad; 260 } 261 } 262 263 /* we have a ire that matches */ 264 ire_refhold(ire); 265 rw_exit(&irb->irb_lock); 266 267 /* 268 * round-robin only if we have more than one route in the bucket. 269 * ips_ip_ecmp_behavior controls when we do ECMP 270 * 2: always 271 * 1: for IRE_DEFAULT and /0 IRE_INTERFACE 272 * 0: never 273 * 274 * Note: if we found an IRE_IF_CLONE we won't look at the bucket with 275 * other ECMP IRE_INTERFACEs since the IRE_IF_CLONE is a /128 match 276 * and the IRE_INTERFACESs are likely to be shorter matches. 
277 */ 278 if (ire->ire_bucket->irb_ire_cnt > 1) { 279 if (ipst->ips_ip_ecmp_behavior == 2 || 280 (ipst->ips_ip_ecmp_behavior == 1 && 281 IS_DEFAULT_ROUTE(ire))) { 282 ire_t *next_ire; 283 ire_ftable_args_t margs; 284 285 bzero(&margs, sizeof (margs)); 286 margs.ift_addr = addr; 287 margs.ift_zoneid = ALL_ZONES; 288 289 next_ire = ire_round_robin(ire->ire_bucket, &margs, 290 xmit_hint, ire, ipst); 291 if (next_ire == NULL) { 292 /* keep ire if next_ire is null */ 293 if (generationp != NULL) 294 *generationp = ire->ire_generation; 295 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 296 return (ire); 297 } 298 ire_refrele(ire); 299 ire = next_ire; 300 } 301 } 302 /* Return generation before dropping lock */ 303 if (generationp != NULL) 304 *generationp = ire->ire_generation; 305 306 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 307 308 /* 309 * Since we only did ALL_ZONES matches there is no special handling 310 * of IRE_LOCALs needed here. ire_ftable_lookup_v4 has to handle that. 311 */ 312 return (ire); 313 314 bad: 315 if (generationp != NULL) 316 *generationp = IRE_GENERATION_VERIFY; 317 318 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 319 return (ire_reject(ipst, B_FALSE)); 320 } 321 322 /* 323 * Find the ill matching a multicast group. 324 * Allows different routes for multicast addresses 325 * in the unicast routing table (akin to 224.0.0.0 but could be more specific) 326 * which point at different interfaces. This is used when IP_MULTICAST_IF 327 * isn't specified (when sending) and when IP_ADD_MEMBERSHIP doesn't 328 * specify the interface to join on. 329 * 330 * Supports link-local addresses by using ire_route_recursive which follows 331 * the ill when recursing. 332 * 333 * To handle CGTP, since we don't have a separate IRE_MULTICAST for each group 334 * and the MULTIRT property can be different for different groups, we 335 * extract RTF_MULTIRT from the special unicast route added for a group 336 * with CGTP and pass that back in the multirtp argument. 
337 * This is used in ip_set_destination etc to set ixa_postfragfn for multicast. 338 * We have a setsrcp argument for the same reason. 339 */ 340 ill_t * 341 ire_lookup_multi_ill_v4(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst, 342 boolean_t *multirtp, ipaddr_t *setsrcp) 343 { 344 ire_t *ire; 345 ill_t *ill; 346 347 ire = ire_route_recursive_v4(group, 0, NULL, zoneid, NULL, 348 MATCH_IRE_DSTONLY, IRR_NONE, 0, ipst, setsrcp, NULL, NULL); 349 ASSERT(ire != NULL); 350 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 351 ire_refrele(ire); 352 return (NULL); 353 } 354 355 if (multirtp != NULL) 356 *multirtp = (ire->ire_flags & RTF_MULTIRT) != 0; 357 358 ill = ire_nexthop_ill(ire); 359 ire_refrele(ire); 360 return (ill); 361 } 362 363 /* 364 * Delete the passed in ire if the gateway addr matches 365 */ 366 void 367 ire_del_host_redir(ire_t *ire, char *gateway) 368 { 369 if ((ire->ire_flags & RTF_DYNAMIC) && 370 (ire->ire_gateway_addr == *(ipaddr_t *)gateway)) 371 ire_delete(ire); 372 } 373 374 /* 375 * Search for all IRE_HOST RTF_DYNAMIC (aka redirect) routes that are 376 * pointing at the specified gateway and 377 * delete them. This routine is called only 378 * when a default gateway is going away. 379 */ 380 void 381 ire_delete_host_redirects(ipaddr_t gateway, ip_stack_t *ipst) 382 { 383 struct rtfuncarg rtfarg; 384 385 bzero(&rtfarg, sizeof (rtfarg)); 386 rtfarg.rt_func = ire_del_host_redir; 387 rtfarg.rt_arg = (void *)&gateway; 388 rtfarg.rt_zoneid = ALL_ZONES; 389 rtfarg.rt_ipst = ipst; 390 (void) ipst->ips_ip_ftable->rnh_walktree_mt(ipst->ips_ip_ftable, 391 rtfunc, &rtfarg, irb_refhold_rn, irb_refrele_rn); 392 } 393 394 /* 395 * Obtain the rt_entry and rt_irb for the route to be added to 396 * the ips_ip_ftable. 397 * First attempt to add a node to the radix tree via rn_addroute. If the 398 * route already exists, return the bucket for the existing route. 
399 * 400 * Locking notes: Need to hold the global radix tree lock in write mode to 401 * add a radix node. To prevent the node from being deleted, ire_get_bucket() 402 * returns with a ref'ed irb_t. The ire itself is added in ire_add_v4() 403 * while holding the irb_lock, but not the radix tree lock. 404 */ 405 irb_t * 406 ire_get_bucket(ire_t *ire) 407 { 408 struct radix_node *rn; 409 struct rt_entry *rt; 410 struct rt_sockaddr rmask, rdst; 411 irb_t *irb = NULL; 412 ip_stack_t *ipst = ire->ire_ipst; 413 414 ASSERT(ipst->ips_ip_ftable != NULL); 415 416 /* first try to see if route exists (based on rtalloc1) */ 417 bzero(&rdst, sizeof (rdst)); 418 rdst.rt_sin_len = sizeof (rdst); 419 rdst.rt_sin_family = AF_INET; 420 rdst.rt_sin_addr.s_addr = ire->ire_addr; 421 422 bzero(&rmask, sizeof (rmask)); 423 rmask.rt_sin_len = sizeof (rmask); 424 rmask.rt_sin_family = AF_INET; 425 rmask.rt_sin_addr.s_addr = ire->ire_mask; 426 427 /* 428 * add the route. based on BSD's rtrequest1(RTM_ADD) 429 */ 430 R_Malloc(rt, rt_entry_cache, sizeof (*rt)); 431 /* kmem_alloc failed */ 432 if (rt == NULL) 433 return (NULL); 434 435 bzero(rt, sizeof (*rt)); 436 rt->rt_nodes->rn_key = (char *)&rt->rt_dst; 437 rt->rt_dst = rdst; 438 irb = &rt->rt_irb; 439 irb->irb_marks |= IRB_MARK_DYNAMIC; /* dynamically allocated/freed */ 440 irb->irb_ipst = ipst; 441 rw_init(&irb->irb_lock, NULL, RW_DEFAULT, NULL); 442 RADIX_NODE_HEAD_WLOCK(ipst->ips_ip_ftable); 443 rn = ipst->ips_ip_ftable->rnh_addaddr(&rt->rt_dst, &rmask, 444 ipst->ips_ip_ftable, (struct radix_node *)rt); 445 if (rn == NULL) { 446 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 447 Free(rt, rt_entry_cache); 448 rt = NULL; 449 irb = NULL; 450 RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); 451 rn = ipst->ips_ip_ftable->rnh_lookup(&rdst, &rmask, 452 ipst->ips_ip_ftable); 453 if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { 454 /* found a non-root match */ 455 rt = (struct rt_entry *)rn; 456 } 457 } 458 if (rt != NULL) { 459 irb = 
&rt->rt_irb; 460 irb_refhold(irb); 461 } 462 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 463 return (irb); 464 } 465 466 /* 467 * This function is used when the caller wants to know the outbound 468 * interface for a packet given only the address. 469 * If this is a offlink IP address and there are multiple 470 * routes to this destination, this routine will utilise the 471 * first route it finds to IP address 472 * Return values: 473 * 0 - FAILURE 474 * nonzero - ifindex 475 */ 476 uint_t 477 ifindex_lookup(const struct sockaddr *ipaddr, zoneid_t zoneid) 478 { 479 uint_t ifindex = 0; 480 ire_t *ire; 481 ill_t *ill; 482 netstack_t *ns; 483 ip_stack_t *ipst; 484 485 if (zoneid == ALL_ZONES) 486 ns = netstack_find_by_zoneid(GLOBAL_ZONEID); 487 else 488 ns = netstack_find_by_zoneid(zoneid); 489 ASSERT(ns != NULL); 490 491 /* 492 * For exclusive stacks we set the zoneid to zero 493 * since IP uses the global zoneid in the exclusive stacks. 494 */ 495 if (ns->netstack_stackid != GLOBAL_NETSTACKID) 496 zoneid = GLOBAL_ZONEID; 497 ipst = ns->netstack_ip; 498 499 ASSERT(ipaddr->sa_family == AF_INET || ipaddr->sa_family == AF_INET6); 500 501 if ((ire = route_to_dst(ipaddr, zoneid, ipst)) != NULL) { 502 ill = ire_nexthop_ill(ire); 503 if (ill != NULL) { 504 ifindex = ill->ill_phyint->phyint_ifindex; 505 ill_refrele(ill); 506 } 507 ire_refrele(ire); 508 } 509 netstack_rele(ns); 510 return (ifindex); 511 } 512 513 /* 514 * Routine to find the route to a destination. 
If a ifindex is supplied 515 * it tries to match the route to the corresponding ipif for the ifindex 516 */ 517 static ire_t * 518 route_to_dst(const struct sockaddr *dst_addr, zoneid_t zoneid, ip_stack_t *ipst) 519 { 520 ire_t *ire = NULL; 521 int match_flags; 522 523 match_flags = MATCH_IRE_DSTONLY; 524 525 /* XXX pass NULL tsl for now */ 526 527 if (dst_addr->sa_family == AF_INET) { 528 ire = ire_route_recursive_v4( 529 ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr, 0, NULL, 530 zoneid, NULL, match_flags, IRR_ALLOCATE, 0, ipst, NULL, 531 NULL, NULL); 532 } else { 533 ire = ire_route_recursive_v6( 534 &((struct sockaddr_in6 *)dst_addr)->sin6_addr, 0, NULL, 535 zoneid, NULL, match_flags, IRR_ALLOCATE, 0, ipst, NULL, 536 NULL, NULL); 537 } 538 ASSERT(ire != NULL); 539 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 540 ire_refrele(ire); 541 return (NULL); 542 } 543 return (ire); 544 } 545 546 /* 547 * This routine is called by IP Filter to send a packet out on the wire 548 * to a specified dstination (which may be onlink or offlink). The ifindex may 549 * or may not be 0. A non-null ifindex indicates IP Filter has stipulated 550 * an outgoing interface and requires the nexthop to be on that interface. 551 * IP WILL NOT DO the following to the data packet before sending it out: 552 * a. manipulate ttl 553 * b. ipsec work 554 * c. fragmentation 555 * 556 * If the packet has been prepared for hardware checksum then it will be 557 * passed off to ip_send_align_cksum() to check that the flags set on the 558 * packet are in alignment with the capabilities of the new outgoing NIC. 559 * 560 * Return values: 561 * 0: IP was able to send of the data pkt 562 * ECOMM: Could not send packet 563 * ENONET No route to dst. It is up to the caller 564 * to send icmp unreachable error message, 565 * EINPROGRESS The macaddr of the onlink dst or that 566 * of the offlink dst's nexthop needs to get 567 * resolved before packet can be sent to dst. 
568 * Thus transmission is not guaranteed. 569 * Note: No longer have visibility to the ARP queue 570 * hence no EINPROGRESS. 571 */ 572 int 573 ipfil_sendpkt(const struct sockaddr *dst_addr, mblk_t *mp, uint_t ifindex, 574 zoneid_t zoneid) 575 { 576 ipaddr_t nexthop; 577 netstack_t *ns; 578 ip_stack_t *ipst; 579 ip_xmit_attr_t ixas; 580 int error; 581 582 ASSERT(mp != NULL); 583 584 if (zoneid == ALL_ZONES) 585 ns = netstack_find_by_zoneid(GLOBAL_ZONEID); 586 else 587 ns = netstack_find_by_zoneid(zoneid); 588 ASSERT(ns != NULL); 589 590 /* 591 * For exclusive stacks we set the zoneid to zero 592 * since IP uses the global zoneid in the exclusive stacks. 593 */ 594 if (ns->netstack_stackid != GLOBAL_NETSTACKID) 595 zoneid = GLOBAL_ZONEID; 596 ipst = ns->netstack_ip; 597 598 ASSERT(dst_addr->sa_family == AF_INET || 599 dst_addr->sa_family == AF_INET6); 600 601 bzero(&ixas, sizeof (ixas)); 602 /* 603 * No IPsec, no fragmentation, and don't let any hooks see 604 * the packet. 605 */ 606 ixas.ixa_flags = IXAF_NO_IPSEC | IXAF_DONTFRAG | IXAF_NO_PFHOOK; 607 ixas.ixa_cred = kcred; 608 ixas.ixa_cpid = NOPID; 609 ixas.ixa_tsl = NULL; 610 ixas.ixa_ipst = ipst; 611 ixas.ixa_ifindex = ifindex; 612 613 if (dst_addr->sa_family == AF_INET) { 614 ipha_t *ipha = (ipha_t *)mp->b_rptr; 615 616 ixas.ixa_flags |= IXAF_IS_IPV4; 617 nexthop = ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr; 618 if (nexthop != ipha->ipha_dst) { 619 ixas.ixa_flags |= IXAF_NEXTHOP_SET; 620 ixas.ixa_nexthop_v4 = nexthop; 621 } 622 ixas.ixa_multicast_ttl = ipha->ipha_ttl; 623 } else { 624 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 625 in6_addr_t *nexthop6; 626 627 nexthop6 = &((struct sockaddr_in6 *)dst_addr)->sin6_addr; 628 if (!IN6_ARE_ADDR_EQUAL(nexthop6, &ip6h->ip6_dst)) { 629 ixas.ixa_flags |= IXAF_NEXTHOP_SET; 630 ixas.ixa_nexthop_v6 = *nexthop6; 631 } 632 ixas.ixa_multicast_ttl = ip6h->ip6_hops; 633 } 634 error = ip_output_simple(mp, &ixas); 635 ixa_cleanup(&ixas); 636 637 netstack_rele(ns); 638 switch 
(error) { 639 case 0: 640 break; 641 642 case EHOSTUNREACH: 643 case ENETUNREACH: 644 error = ENONET; 645 break; 646 647 default: 648 error = ECOMM; 649 break; 650 } 651 return (error); 652 } 653 654 /* 655 * callback function provided by ire_ftable_lookup when calling 656 * rn_match_args(). Invoke ire_match_args on each matching leaf node in 657 * the radix tree. 658 */ 659 boolean_t 660 ire_find_best_route(struct radix_node *rn, void *arg) 661 { 662 struct rt_entry *rt = (struct rt_entry *)rn; 663 irb_t *irb_ptr; 664 ire_t *ire; 665 ire_ftable_args_t *margs = arg; 666 ipaddr_t match_mask; 667 668 ASSERT(rt != NULL); 669 670 irb_ptr = &rt->rt_irb; 671 672 if (irb_ptr->irb_ire_cnt == 0) 673 return (B_FALSE); 674 675 rw_enter(&irb_ptr->irb_lock, RW_READER); 676 for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) { 677 if (IRE_IS_CONDEMNED(ire)) 678 continue; 679 ASSERT((margs->ift_flags & MATCH_IRE_SHORTERMASK) == 0); 680 if (margs->ift_flags & MATCH_IRE_MASK) 681 match_mask = margs->ift_mask; 682 else 683 match_mask = ire->ire_mask; 684 685 if (ire_match_args(ire, margs->ift_addr, match_mask, 686 margs->ift_gateway, margs->ift_type, margs->ift_ill, 687 margs->ift_zoneid, margs->ift_tsl, 688 margs->ift_flags)) { 689 ire_refhold(ire); 690 rw_exit(&irb_ptr->irb_lock); 691 margs->ift_best_ire = ire; 692 return (B_TRUE); 693 } 694 } 695 rw_exit(&irb_ptr->irb_lock); 696 return (B_FALSE); 697 } 698 699 /* 700 * ftable irb_t structures are dynamically allocated, and we need to 701 * check if the irb_t (and associated ftable tree attachment) needs to 702 * be cleaned up when the irb_refcnt goes to 0. The conditions that need 703 * be verified are: 704 * - no other walkers of the irebucket, i.e., quiescent irb_refcnt, 705 * - no other threads holding references to ire's in the bucket, 706 * i.e., irb_nire == 0 707 * - no active ire's in the bucket, i.e., irb_ire_cnt == 0 708 * - need to hold the global tree lock and irb_lock in write mode. 
 *
 * The function loops: each pass re-takes irb_lock as writer and re-evaluates
 * the bucket from scratch, because the lock is dropped before ire_cleanup()
 * and before irb_inactive() are called.
 */
void
irb_refrele_ftable(irb_t *irb)
{
	for (;;) {
		rw_enter(&irb->irb_lock, RW_WRITER);
		ASSERT(irb->irb_refcnt != 0);
		if (irb->irb_refcnt != 1) {
			/*
			 * Someone has a reference to this radix node
			 * or there is some bucket walker.
			 */
			irb->irb_refcnt--;
			rw_exit(&irb->irb_lock);
			return;
		} else {
			/*
			 * There is no other walker, nor is there any
			 * other thread that holds a direct ref to this
			 * radix node. Do the clean up if needed. Call
			 * to ire_unlink will clear the IRB_MARK_CONDEMNED flag
			 */
			if (irb->irb_marks & IRB_MARK_CONDEMNED) {
				ire_t *ire_list;

				ire_list = ire_unlink(irb);
				rw_exit(&irb->irb_lock);

				/* Cleanup runs with irb_lock released. */
				if (ire_list != NULL)
					ire_cleanup(ire_list);
				/*
				 * more CONDEMNED entries could have
				 * been added while we dropped the lock,
				 * so we have to re-check.
				 */
				continue;
			}

			/*
			 * Now check if there are still any ires
			 * associated with this radix node.
			 */
			if (irb->irb_nire != 0) {
				/*
				 * someone is still holding on
				 * to ires in this bucket
				 */
				irb->irb_refcnt--;
				rw_exit(&irb->irb_lock);
				return;
			} else {
				/*
				 * Everything is clear. Zero walkers,
				 * Zero threads with a ref to this
				 * radix node, Zero ires associated with
				 * this radix node. Due to lock order,
				 * check the above conditions again
				 * after grabbing all locks in the right order
				 */
				rw_exit(&irb->irb_lock);
				if (irb_inactive(irb))
					return;
				/*
				 * irb_inactive could not free the irb.
				 * See if there are any walkers, if not
				 * try to clean up again.
				 */
			}
		}
	}
}

/*
 * IRE iterator used by ire_ftable_lookup to process multiple equal
 * routes. Given a starting point in the hash list (hash), walk the IREs
 * in the bucket skipping deleted entries. We treat the bucket as a circular
 * list for the purposes of walking it.
786 * Returns the IRE (held) that corresponds to the hash value. If that IRE is 787 * not applicable (ire_match_args failed) then it returns a subsequent one. 788 * If we fail to find an IRE we return NULL. 789 * 790 * Assumes that the caller holds a reference on the IRE bucket and a read lock 791 * on the radix_node_head (for IPv4) or the ip6_ire_head (for IPv6). 792 * 793 * Applies to IPv4 and IPv6. 794 * 795 * For CGTP, where an IRE_BROADCAST and IRE_HOST can exist for the same 796 * address and bucket, we compare against ire_type for the orig_ire. We also 797 * have IRE_BROADCASTs with and without RTF_MULTIRT, with the former being 798 * first in the bucket. Thus we compare that RTF_MULTIRT match the orig_ire. 799 * 800 * Due to shared-IP zones we check that an IRE_OFFLINK has a gateway that is 801 * reachable from the zone i.e., that the ire_gateway_addr is in a subnet 802 * in which the zone has an IP address. We check this for the global zone 803 * even if no shared-IP zones are configured. 804 */ 805 ire_t * 806 ire_round_robin(irb_t *irb_ptr, ire_ftable_args_t *margs, uint_t hash, 807 ire_t *orig_ire, ip_stack_t *ipst) 808 { 809 ire_t *ire, *maybe_ire = NULL; 810 uint_t maybe_badcnt; 811 uint_t maxwalk; 812 813 /* Fold in more bits from the hint/hash */ 814 hash = hash ^ (hash >> 8) ^ (hash >> 16); 815 816 rw_enter(&irb_ptr->irb_lock, RW_WRITER); 817 maxwalk = irb_ptr->irb_ire_cnt; /* Excludes condemned */ 818 hash %= maxwalk; 819 irb_refhold_locked(irb_ptr); 820 rw_exit(&irb_ptr->irb_lock); 821 822 /* 823 * Round-robin the routers list looking for a route that 824 * matches the passed in parameters. 825 * First we skip "hash" number of non-condemned IREs. 826 * Then we match the IRE. 827 * If we find an ire which has a non-zero ire_badcnt then we remember 828 * it and keep on looking for a lower ire_badcnt. 829 * If we come to the end of the list we continue (treat the 830 * bucket list as a circular list) but we match less than "max" 831 * entries. 
832 */ 833 ire = irb_ptr->irb_ire; 834 while (maxwalk > 0) { 835 if (IRE_IS_CONDEMNED(ire)) 836 goto next_ire_skip; 837 838 /* Skip the first "hash" entries to do ECMP */ 839 if (hash != 0) { 840 hash--; 841 goto next_ire_skip; 842 } 843 844 /* See CGTP comment above */ 845 if (ire->ire_type != orig_ire->ire_type || 846 ((ire->ire_flags ^ orig_ire->ire_flags) & RTF_MULTIRT) != 0) 847 goto next_ire; 848 849 /* 850 * Note: Since IPv6 has hash buckets instead of radix 851 * buckers we need to explicitly compare the addresses. 852 * That makes this less efficient since we will be called 853 * even if there is no alternatives just because the 854 * bucket has multiple IREs for different addresses. 855 */ 856 if (ire->ire_ipversion == IPV6_VERSION) { 857 if (!IN6_ARE_ADDR_EQUAL(&orig_ire->ire_addr_v6, 858 &ire->ire_addr_v6)) 859 goto next_ire; 860 } 861 862 /* 863 * For some reason find_best_route uses ire_mask. We do 864 * the same. 865 */ 866 if (ire->ire_ipversion == IPV4_VERSION ? 867 !ire_match_args(ire, margs->ift_addr, 868 ire->ire_mask, margs->ift_gateway, 869 margs->ift_type, margs->ift_ill, margs->ift_zoneid, 870 margs->ift_tsl, margs->ift_flags) : 871 !ire_match_args_v6(ire, &margs->ift_addr_v6, 872 &ire->ire_mask_v6, &margs->ift_gateway_v6, 873 margs->ift_type, margs->ift_ill, margs->ift_zoneid, 874 margs->ift_tsl, margs->ift_flags)) 875 goto next_ire; 876 877 if (margs->ift_zoneid != ALL_ZONES && 878 (ire->ire_type & IRE_OFFLINK)) { 879 /* 880 * When we're in a zone, we're only 881 * interested in routers that are 882 * reachable through ipifs within our zone. 
883 */ 884 if (ire->ire_ipversion == IPV4_VERSION) { 885 if (!ire_gateway_ok_zone_v4( 886 ire->ire_gateway_addr, margs->ift_zoneid, 887 ire->ire_ill, margs->ift_tsl, ipst, 888 B_TRUE)) 889 goto next_ire; 890 } else { 891 if (!ire_gateway_ok_zone_v6( 892 &ire->ire_gateway_addr_v6, 893 margs->ift_zoneid, ire->ire_ill, 894 margs->ift_tsl, ipst, B_TRUE)) 895 goto next_ire; 896 } 897 } 898 mutex_enter(&ire->ire_lock); 899 /* Look for stale ire_badcnt and clear */ 900 if (ire->ire_badcnt != 0 && 901 (TICK_TO_SEC(ddi_get_lbolt64()) - ire->ire_last_badcnt > 902 ipst->ips_ip_ire_badcnt_lifetime)) 903 ire->ire_badcnt = 0; 904 mutex_exit(&ire->ire_lock); 905 906 if (ire->ire_badcnt == 0) { 907 /* We found one with a zero badcnt; done */ 908 ire_refhold(ire); 909 /* 910 * Care needed since irb_refrele grabs WLOCK to free 911 * the irb_t. 912 */ 913 if (ire->ire_ipversion == IPV4_VERSION) { 914 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 915 irb_refrele(irb_ptr); 916 RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); 917 } else { 918 rw_exit(&ipst->ips_ip6_ire_head_lock); 919 irb_refrele(irb_ptr); 920 rw_enter(&ipst->ips_ip6_ire_head_lock, 921 RW_READER); 922 } 923 return (ire); 924 } 925 /* 926 * keep looking to see if there is a better (lower 927 * badcnt) matching IRE, but save this one as a last resort. 928 * If we find a lower badcnt pick that one as the last* resort. 929 */ 930 if (maybe_ire == NULL) { 931 maybe_ire = ire; 932 maybe_badcnt = ire->ire_badcnt; 933 } else if (ire->ire_badcnt < maybe_badcnt) { 934 maybe_ire = ire; 935 maybe_badcnt = ire->ire_badcnt; 936 } 937 938 next_ire: 939 maxwalk--; 940 next_ire_skip: 941 ire = ire->ire_next; 942 if (ire == NULL) 943 ire = irb_ptr->irb_ire; 944 } 945 if (maybe_ire != NULL) 946 ire_refhold(maybe_ire); 947 948 /* Care needed since irb_refrele grabs WLOCK to free the irb_t. 
*/ 949 if (ire->ire_ipversion == IPV4_VERSION) { 950 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 951 irb_refrele(irb_ptr); 952 RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); 953 } else { 954 rw_exit(&ipst->ips_ip6_ire_head_lock); 955 irb_refrele(irb_ptr); 956 rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER); 957 } 958 return (maybe_ire); 959 } 960 961 void 962 irb_refhold_rn(struct radix_node *rn) 963 { 964 if ((rn->rn_flags & RNF_ROOT) == 0) 965 irb_refhold(&((rt_t *)(rn))->rt_irb); 966 } 967 968 void 969 irb_refrele_rn(struct radix_node *rn) 970 { 971 if ((rn->rn_flags & RNF_ROOT) == 0) 972 irb_refrele_ftable(&((rt_t *)(rn))->rt_irb); 973 } 974 975 976 /* 977 * ip_select_src_ill() is used by ip_select_route() to find the src_ill 978 * to be used for source-aware routing table lookup. This function will 979 * ignore IPIF_UNNUMBERED interface addresses, and will only return a 980 * numbered interface (ipif_lookup_addr_nondup() will ignore UNNUMBERED 981 * interfaces). 982 */ 983 static ill_t * 984 ip_select_src_ill(const in6_addr_t *v6src, zoneid_t zoneid, ip_stack_t *ipst) 985 { 986 ipif_t *ipif; 987 ill_t *ill; 988 boolean_t isv6 = !IN6_IS_ADDR_V4MAPPED(v6src); 989 ipaddr_t v4src; 990 991 if (isv6) { 992 ipif = ipif_lookup_addr_nondup_v6(v6src, NULL, zoneid, ipst); 993 } else { 994 IN6_V4MAPPED_TO_IPADDR(v6src, v4src); 995 ipif = ipif_lookup_addr_nondup(v4src, NULL, zoneid, ipst); 996 } 997 if (ipif == NULL) 998 return (NULL); 999 ill = ipif->ipif_ill; 1000 ill_refhold(ill); 1001 ipif_refrele(ipif); 1002 return (ill); 1003 } 1004 1005 /* 1006 * verify that v6src is configured on ill 1007 */ 1008 static boolean_t 1009 ip_verify_src_on_ill(const in6_addr_t v6src, ill_t *ill, zoneid_t zoneid) 1010 { 1011 ipif_t *ipif; 1012 ip_stack_t *ipst; 1013 ipaddr_t v4src; 1014 1015 if (ill == NULL) 1016 return (B_FALSE); 1017 ipst = ill->ill_ipst; 1018 1019 if (ill->ill_isv6) { 1020 ipif = ipif_lookup_addr_nondup_v6(&v6src, ill, zoneid, ipst); 1021 } else { 1022 
IN6_V4MAPPED_TO_IPADDR(&v6src, v4src); 1023 ipif = ipif_lookup_addr_nondup(v4src, ill, zoneid, ipst); 1024 } 1025 1026 if (ipif != NULL) { 1027 ipif_refrele(ipif); 1028 return (B_TRUE); 1029 } else { 1030 return (B_FALSE); 1031 } 1032 } 1033 1034 /* 1035 * Select a route for IPv4 and IPv6. Except for multicast, loopback and reject 1036 * routes this routine sets up a ire_nce_cache as well. The caller needs to 1037 * lookup an nce for the multicast case. 1038 * 1039 * When src_multihoming is set to 2 (strict src multihoming) we use the source 1040 * address to select the interface and route. If IP_BOUND_IF etc are 1041 * specified, we require that they specify an interface on which the 1042 * source address is assigned. 1043 * 1044 * When src_multihoming is set to 1 (preferred src aware route 1045 * selection) the unicast lookup prefers a matching source 1046 * (i.e., that the route points out an ill on which the source is assigned), but 1047 * if no such route is found we fallback to not considering the source in the 1048 * route lookup. 1049 * 1050 * We skip the src_multihoming check when the source isn't (yet) set, and 1051 * when IXAF_VERIFY_SOURCE is not set. The latter allows RAW sockets to send 1052 * with bogus source addresses as allowed by IP_HDRINCL and IPV6_PKTINFO 1053 * when secpolicy_net_rawaccess(). 
1054 */ 1055 ire_t * 1056 ip_select_route(const in6_addr_t *v6dst, const in6_addr_t v6src, 1057 ip_xmit_attr_t *ixa, uint_t *generationp, in6_addr_t *setsrcp, 1058 int *errorp, boolean_t *multirtp) 1059 { 1060 uint_t match_args; 1061 uint_t ire_type; 1062 ill_t *ill = NULL; 1063 ire_t *ire; 1064 ip_stack_t *ipst = ixa->ixa_ipst; 1065 ipaddr_t v4dst; 1066 in6_addr_t v6nexthop; 1067 iaflags_t ixaflags = ixa->ixa_flags; 1068 nce_t *nce; 1069 boolean_t preferred_src_aware = B_FALSE; 1070 boolean_t verify_src; 1071 boolean_t isv6 = !(ixa->ixa_flags & IXAF_IS_IPV4); 1072 int src_multihoming = IP_SRC_MULTIHOMING(isv6, ipst); 1073 1074 /* 1075 * We only verify that the src has been configured on a selected 1076 * interface if the src is not :: or INADDR_ANY, and if the 1077 * IXAF_VERIFY_SOURCE flag is set. 1078 */ 1079 verify_src = (!V6_OR_V4_INADDR_ANY(v6src) && 1080 (ixa->ixa_flags & IXAF_VERIFY_SOURCE)); 1081 1082 match_args = MATCH_IRE_SECATTR; 1083 IN6_V4MAPPED_TO_IPADDR(v6dst, v4dst); 1084 if (setsrcp != NULL) 1085 ASSERT(IN6_IS_ADDR_UNSPECIFIED(setsrcp)); 1086 if (errorp != NULL) 1087 ASSERT(*errorp == 0); 1088 1089 /* 1090 * The content of the ixa will be different if IP_NEXTHOP, 1091 * SO_DONTROUTE, IP_BOUND_IF, IP_PKTINFO etc are set 1092 */ 1093 1094 if (isv6 ? IN6_IS_ADDR_MULTICAST(v6dst) : CLASSD(v4dst)) { 1095 /* Pick up the IRE_MULTICAST for the ill */ 1096 if (ixa->ixa_multicast_ifindex != 0) { 1097 ill = ill_lookup_on_ifindex(ixa->ixa_multicast_ifindex, 1098 isv6, ipst); 1099 } else if (ixaflags & IXAF_SCOPEID_SET) { 1100 /* sin6_scope_id takes precedence over ixa_ifindex */ 1101 ASSERT(ixa->ixa_scopeid != 0); 1102 ill = ill_lookup_on_ifindex(ixa->ixa_scopeid, 1103 isv6, ipst); 1104 } else if (ixa->ixa_ifindex != 0) { 1105 /* 1106 * In the ipmp case, the ixa_ifindex is set to 1107 * point at an under_ill and we would return the 1108 * ire_multicast() corresponding to that under_ill. 
1109 */ 1110 ill = ill_lookup_on_ifindex(ixa->ixa_ifindex, 1111 isv6, ipst); 1112 } else if (src_multihoming != 0 && verify_src) { 1113 /* Look up the ill based on the source address */ 1114 ill = ip_select_src_ill(&v6src, ixa->ixa_zoneid, ipst); 1115 /* 1116 * Since we looked up the ill from the source there 1117 * is no need to verify that the source is on the ill 1118 * below. 1119 */ 1120 verify_src = B_FALSE; 1121 if (ill != NULL && IS_VNI(ill)) { 1122 ill_t *usesrc = ill; 1123 1124 ill = ill_lookup_usesrc(usesrc); 1125 ill_refrele(usesrc); 1126 } 1127 } else if (!isv6) { 1128 ipaddr_t v4setsrc = INADDR_ANY; 1129 1130 ill = ill_lookup_group_v4(v4dst, ixa->ixa_zoneid, 1131 ipst, multirtp, &v4setsrc); 1132 if (setsrcp != NULL) 1133 IN6_IPADDR_TO_V4MAPPED(v4setsrc, setsrcp); 1134 } else { 1135 ill = ill_lookup_group_v6(v6dst, ixa->ixa_zoneid, 1136 ipst, multirtp, setsrcp); 1137 } 1138 if (ill != NULL && IS_VNI(ill)) { 1139 ill_refrele(ill); 1140 ill = NULL; 1141 } 1142 if (ill == NULL) { 1143 if (errorp != NULL) 1144 *errorp = ENXIO; 1145 /* Get a hold on the IRE_NOROUTE */ 1146 ire = ire_reject(ipst, isv6); 1147 return (ire); 1148 } 1149 if (!(ill->ill_flags & ILLF_MULTICAST)) { 1150 ill_refrele(ill); 1151 if (errorp != NULL) 1152 *errorp = EHOSTUNREACH; 1153 /* Get a hold on the IRE_NOROUTE */ 1154 ire = ire_reject(ipst, isv6); 1155 return (ire); 1156 } 1157 /* 1158 * If we are doing the strictest src_multihoming, then 1159 * we check that IP_MULTICAST_IF, IP_BOUND_IF, etc specify 1160 * an interface that is consistent with the source address. 
1161 */ 1162 if (verify_src && src_multihoming == 2 && 1163 !ip_verify_src_on_ill(v6src, ill, ixa->ixa_zoneid)) { 1164 if (errorp != NULL) 1165 *errorp = EADDRNOTAVAIL; 1166 ill_refrele(ill); 1167 /* Get a hold on the IRE_NOROUTE */ 1168 ire = ire_reject(ipst, isv6); 1169 return (ire); 1170 } 1171 /* Get a refcnt on the single IRE_MULTICAST per ill */ 1172 ire = ire_multicast(ill); 1173 ill_refrele(ill); 1174 if (generationp != NULL) 1175 *generationp = ire->ire_generation; 1176 if (errorp != NULL && 1177 (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) { 1178 *errorp = EHOSTUNREACH; 1179 } 1180 return (ire); 1181 } 1182 1183 /* Now for unicast */ 1184 if (ixa->ixa_ifindex != 0 || (ixaflags & IXAF_SCOPEID_SET)) { 1185 if (ixaflags & IXAF_SCOPEID_SET) { 1186 /* sin6_scope_id takes precedence over ixa_ifindex */ 1187 ASSERT(ixa->ixa_scopeid != 0); 1188 ill = ill_lookup_on_ifindex(ixa->ixa_scopeid, 1189 isv6, ipst); 1190 } else { 1191 ASSERT(ixa->ixa_ifindex != 0); 1192 ill = ill_lookup_on_ifindex(ixa->ixa_ifindex, 1193 isv6, ipst); 1194 } 1195 if (ill != NULL && IS_VNI(ill)) { 1196 ill_refrele(ill); 1197 ill = NULL; 1198 } 1199 if (ill == NULL) { 1200 if (errorp != NULL) 1201 *errorp = ENXIO; 1202 /* Get a hold on the IRE_NOROUTE */ 1203 ire = ire_reject(ipst, isv6); 1204 return (ire); 1205 } 1206 1207 match_args |= MATCH_IRE_ILL; 1208 1209 /* 1210 * icmp_send_reply_v6 uses scopeid, and mpathd sets IP*_BOUND_IF 1211 * so for both of them we need to be able look for an under 1212 * interface. 1213 */ 1214 if (IS_UNDER_IPMP(ill)) 1215 match_args |= MATCH_IRE_TESTHIDDEN; 1216 1217 /* 1218 * If we are doing the strictest src_multihoming, then 1219 * we check that IP_BOUND_IF, IP_PKTINFO, etc specify 1220 * an interface that is consistent with the source address. 
1221 */ 1222 if (src_multihoming == 2 && 1223 !ip_verify_src_on_ill(v6src, ill, ixa->ixa_zoneid)) { 1224 if (errorp != NULL) 1225 *errorp = EADDRNOTAVAIL; 1226 ill_refrele(ill); 1227 /* Get a hold on the IRE_NOROUTE */ 1228 ire = ire_reject(ipst, isv6); 1229 return (ire); 1230 } 1231 } else if (src_multihoming != 0 && verify_src) { 1232 /* Look up the ill based on the source address */ 1233 ill = ip_select_src_ill(&v6src, ixa->ixa_zoneid, ipst); 1234 if (ill == NULL) { 1235 char addrbuf[INET6_ADDRSTRLEN]; 1236 1237 ip3dbg(("%s not a valid src for unicast", 1238 inet_ntop(AF_INET6, &v6src, addrbuf, 1239 sizeof (addrbuf)))); 1240 if (errorp != NULL) 1241 *errorp = EADDRNOTAVAIL; 1242 /* Get a hold on the IRE_NOROUTE */ 1243 ire = ire_reject(ipst, isv6); 1244 return (ire); 1245 } 1246 match_args |= MATCH_IRE_SRC_ILL; 1247 preferred_src_aware = (src_multihoming == 1); 1248 } 1249 1250 if (ixaflags & IXAF_NEXTHOP_SET) { 1251 /* IP_NEXTHOP was set */ 1252 v6nexthop = ixa->ixa_nexthop_v6; 1253 } else { 1254 v6nexthop = *v6dst; 1255 } 1256 1257 ire_type = 0; 1258 1259 /* 1260 * If SO_DONTROUTE is set or if IP_NEXTHOP is set, then 1261 * we only look for an onlink IRE. 
1262 */ 1263 if (ixaflags & (IXAF_DONTROUTE|IXAF_NEXTHOP_SET)) { 1264 match_args |= MATCH_IRE_TYPE; 1265 ire_type = IRE_ONLINK; 1266 } 1267 1268 retry: 1269 if (!isv6) { 1270 ipaddr_t v4nexthop; 1271 ipaddr_t v4setsrc = INADDR_ANY; 1272 1273 IN6_V4MAPPED_TO_IPADDR(&v6nexthop, v4nexthop); 1274 ire = ire_route_recursive_v4(v4nexthop, ire_type, ill, 1275 ixa->ixa_zoneid, ixa->ixa_tsl, match_args, IRR_ALLOCATE, 1276 ixa->ixa_xmit_hint, ipst, &v4setsrc, NULL, generationp); 1277 if (setsrcp != NULL) 1278 IN6_IPADDR_TO_V4MAPPED(v4setsrc, setsrcp); 1279 } else { 1280 ire = ire_route_recursive_v6(&v6nexthop, ire_type, ill, 1281 ixa->ixa_zoneid, ixa->ixa_tsl, match_args, IRR_ALLOCATE, 1282 ixa->ixa_xmit_hint, ipst, setsrcp, NULL, generationp); 1283 } 1284 1285 #ifdef DEBUG 1286 if (match_args & MATCH_IRE_TESTHIDDEN) { 1287 ip3dbg(("looking for hidden; dst %x ire %p\n", 1288 v4dst, (void *)ire)); 1289 } 1290 #endif 1291 if (ill != NULL) { 1292 ill_refrele(ill); 1293 ill = NULL; 1294 } 1295 if ((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) || 1296 (ire->ire_type & IRE_MULTICAST)) { 1297 if (preferred_src_aware) { 1298 /* 1299 * "Preferred Source Aware" send mode. If we cannot 1300 * find an ire whose ire_ill had the desired source 1301 * address retry after relaxing the ill matching 1302 * constraint. 1303 */ 1304 ire_refrele(ire); 1305 preferred_src_aware = B_FALSE; 1306 match_args &= ~MATCH_IRE_SRC_ILL; 1307 goto retry; 1308 } 1309 /* No ire_nce_cache */ 1310 return (ire); 1311 } 1312 1313 /* Setup ire_nce_cache if it doesn't exist or is condemned. */ 1314 mutex_enter(&ire->ire_lock); 1315 nce = ire->ire_nce_cache; 1316 if (nce == NULL || nce->nce_is_condemned) { 1317 mutex_exit(&ire->ire_lock); 1318 (void) ire_revalidate_nce(ire); 1319 } else { 1320 mutex_exit(&ire->ire_lock); 1321 } 1322 return (ire); 1323 } 1324 1325 /* 1326 * Find a route given some xmit attributes and a packet. 1327 * Generic for IPv4 and IPv6 1328 * 1329 * This never returns NULL. 
But when it returns the IRE_NOROUTE 1330 * it might set errorp. 1331 */ 1332 ire_t * 1333 ip_select_route_pkt(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp, 1334 int *errorp, boolean_t *multirtp) 1335 { 1336 if (ixa->ixa_flags & IXAF_IS_IPV4) { 1337 ipha_t *ipha = (ipha_t *)mp->b_rptr; 1338 in6_addr_t v6dst, v6src; 1339 1340 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst); 1341 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6src); 1342 1343 return (ip_select_route(&v6dst, v6src, ixa, generationp, 1344 NULL, errorp, multirtp)); 1345 } else { 1346 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 1347 1348 return (ip_select_route(&ip6h->ip6_dst, ip6h->ip6_src, 1349 ixa, generationp, NULL, errorp, multirtp)); 1350 } 1351 } 1352 1353 ire_t * 1354 ip_select_route_v4(ipaddr_t dst, ipaddr_t src, ip_xmit_attr_t *ixa, 1355 uint_t *generationp, ipaddr_t *v4setsrcp, int *errorp, boolean_t *multirtp) 1356 { 1357 in6_addr_t v6dst, v6src; 1358 ire_t *ire; 1359 in6_addr_t setsrc; 1360 1361 ASSERT(ixa->ixa_flags & IXAF_IS_IPV4); 1362 1363 IN6_IPADDR_TO_V4MAPPED(dst, &v6dst); 1364 IN6_IPADDR_TO_V4MAPPED(src, &v6src); 1365 1366 setsrc = ipv6_all_zeros; 1367 ire = ip_select_route(&v6dst, v6src, ixa, generationp, &setsrc, errorp, 1368 multirtp); 1369 if (v4setsrcp != NULL) 1370 IN6_V4MAPPED_TO_IPADDR(&setsrc, *v4setsrcp); 1371 return (ire); 1372 } 1373 1374 /* 1375 * Recursively look for a route to the destination. Can also match on 1376 * the zoneid, ill, and label. Used for the data paths. See also 1377 * ire_route_recursive. 1378 * 1379 * If IRR_ALLOCATE is not set then we will only inspect the existing IREs; never 1380 * create an IRE_IF_CLONE. This is used on the receive side when we are not 1381 * forwarding. 1382 * If IRR_INCOMPLETE is set then we return the IRE even if we can't correctly 1383 * resolve the gateway. 1384 * 1385 * Note that this function never returns NULL. It returns an IRE_NOROUTE 1386 * instead. 
*
 * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it
 * is an error.
 * Allow at most one RTF_INDIRECT.
 *
 * When the caller passes a non-NULL ire it stands in for the result of the
 * first lookup; an extra hold is taken on it here and its generation is
 * seeded from *generationp (or IRE_GENERATION_VERIFY if generationp is NULL).
 *
 * setsrcp and gwattrp are optional; when supplied they must be INADDR_ANY
 * and NULL respectively on entry, and receive the first RTF_SETSRC address
 * and the first ire_gw_secattr found along the chain.
 *
 * On success ires[0] is returned with one hold and *generationp is set to
 * the validated generation. On failure a reject/blackhole IRE (or, with
 * IRR_INCOMPLETE, ires[0]) is returned and *generationp is set to
 * IRE_GENERATION_VERIFY.
 */
ire_t *
ire_route_recursive_impl_v4(ire_t *ire,
    ipaddr_t nexthop, uint_t ire_type, const ill_t *ill_arg,
    zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args,
    uint_t irr_flags, uint32_t xmit_hint, ip_stack_t *ipst, ipaddr_t *setsrcp,
    tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp)
{
	int		i, j;
	ire_t		*ires[MAX_IRE_RECURSION];
	uint_t		generation;
	uint_t		generations[MAX_IRE_RECURSION];
	boolean_t	need_refrele = B_FALSE;
	boolean_t	invalidate = B_FALSE;
	ill_t		*ill = NULL;
	uint_t		maskoff = (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST);

	if (setsrcp != NULL)
		ASSERT(*setsrcp == INADDR_ANY);
	if (gwattrp != NULL)
		ASSERT(*gwattrp == NULL);

	/*
	 * We iterate up to three times to resolve a route, even though
	 * we have four slots in the array. The extra slot is for an
	 * IRE_IF_CLONE we might need to create.
	 */
	i = 0;
	while (i < MAX_IRE_RECURSION - 1) {
		/* ire_ftable_lookup handles round-robin/ECMP */
		if (ire == NULL) {
			ire = ire_ftable_lookup_v4(nexthop, 0, 0, ire_type,
			    (ill != NULL? ill : ill_arg), zoneid, tsl,
			    match_args, xmit_hint, ipst, &generation);
		} else {
			/* Caller passed it; extra hold since we will rele */
			ire_refhold(ire);
			if (generationp != NULL)
				generation = *generationp;
			else
				generation = IRE_GENERATION_VERIFY;
		}
		if (ire == NULL) {
			/*
			 * Lookup failed. With IRR_INCOMPLETE and at least one
			 * IRE already collected, return ires[0] (with its own
			 * hold, since cleanup releases the array's holds);
			 * otherwise return the reject IRE.
			 */
			if (i > 0 && (irr_flags & IRR_INCOMPLETE)) {
				ire = ires[0];
				ire_refhold(ire);
			} else {
				ire = ire_reject(ipst, B_FALSE);
			}
			goto error;
		}

		/* Need to return the ire with RTF_REJECT|BLACKHOLE */
		if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))
			goto error;

		ASSERT(!(ire->ire_type & IRE_MULTICAST)); /* Not in ftable */
		/*
		 * Verify that the IRE_IF_CLONE has a consistent generation
		 * number. A stale clone is dropped and the same iteration is
		 * redone with a fresh lookup (i is not advanced).
		 */
		if ((ire->ire_type & IRE_IF_CLONE) && !ire_clone_verify(ire)) {
			ire_refrele(ire);
			ire = NULL;
			continue;
		}

		/*
		 * Don't allow anything unusual past the first iteration.
		 * After the first lookup, we should no longer look for
		 * (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST) or RTF_INDIRECT
		 * routes.
		 *
		 * In addition, after we have found a direct IRE_OFFLINK,
		 * we should only look for interface or clone routes.
		 */
		match_args |= MATCH_IRE_DIRECT;	/* no more RTF_INDIRECTs */

		if ((ire->ire_type & IRE_OFFLINK) &&
		    !(ire->ire_flags & RTF_INDIRECT)) {
			ire_type = IRE_IF_ALL;
		} else {
			/*
			 * no more local, loopback, broadcast routes
			 */
			if (!(match_args & MATCH_IRE_TYPE))
				ire_type = (IRE_OFFLINK|IRE_ONLINK);
			ire_type &= ~maskoff;
		}
		match_args |= MATCH_IRE_TYPE;

		/* We have a usable IRE */
		ires[i] = ire;
		generations[i] = generation;
		i++;

		/* The first RTF_SETSRC address is passed back if setsrcp */
		if ((ire->ire_flags & RTF_SETSRC) &&
		    setsrcp != NULL && *setsrcp == INADDR_ANY) {
			ASSERT(ire->ire_setsrc_addr != INADDR_ANY);
			*setsrcp = ire->ire_setsrc_addr;
		}

		/* The first ire_gw_secattr is passed back if gwattrp */
		if (ire->ire_gw_secattr != NULL &&
		    gwattrp != NULL && *gwattrp == NULL)
			*gwattrp = ire->ire_gw_secattr;

		/*
		 * Check if we have a short-cut pointer to an IRE for this
		 * destination, and that the cached dependency isn't stale.
		 * In that case we've rejoined an existing tree towards a
		 * parent, thus we don't need to continue the loop to
		 * discover the rest of the tree.
		 */
		mutex_enter(&ire->ire_lock);
		if (ire->ire_dep_parent != NULL &&
		    ire->ire_dep_parent->ire_generation ==
		    ire->ire_dep_parent_generation) {
			mutex_exit(&ire->ire_lock);
			ire = NULL;
			goto done;
		}
		mutex_exit(&ire->ire_lock);

		/*
		 * If this type should have an ire_nce_cache (even if it
		 * doesn't yet have one) then we are done. Includes
		 * IRE_INTERFACE with a full 32 bit mask.
		 */
		if (ire->ire_nce_capable) {
			ire = NULL;
			goto done;
		}
		ASSERT(!(ire->ire_type & IRE_IF_CLONE));
		/*
		 * For an IRE_INTERFACE we create an IRE_IF_CLONE for this
		 * particular destination
		 */
		if (ire->ire_type & IRE_INTERFACE) {
			in6_addr_t	v6nexthop;
			ire_t		*clone;

			ASSERT(ire->ire_masklen != IPV4_ABITS);

			/*
			 * In the case of ip_input and ILLF_FORWARDING not
			 * being set, and in the case of RTM_GET, there is
			 * no point in allocating an IRE_IF_CLONE. We return
			 * the IRE_INTERFACE. Note that !IRR_ALLOCATE can
			 * result in a ire_dep_parent which is IRE_IF_*
			 * without an IRE_IF_CLONE.
			 * We recover from that when we need to send packets
			 * by ensuring that the generations become
			 * IRE_GENERATION_VERIFY in this case.
			 */
			if (!(irr_flags & IRR_ALLOCATE)) {
				invalidate = B_TRUE;
				ire = NULL;
				goto done;
			}

			IN6_IPADDR_TO_V4MAPPED(nexthop, &v6nexthop);

			clone = ire_create_if_clone(ire, &v6nexthop,
			    &generation);
			if (clone == NULL) {
				/*
				 * Temporary failure - no memory.
				 * Don't want caller to cache IRE_NOROUTE.
				 */
				invalidate = B_TRUE;
				ire = ire_blackhole(ipst, B_FALSE);
				goto error;
			}
			/*
			 * Make clone next to last entry and the
			 * IRE_INTERFACE the last in the dependency
			 * chain since the clone depends on the
			 * IRE_INTERFACE.
			 */
			ASSERT(i >= 1);
			ASSERT(i < MAX_IRE_RECURSION);

			ires[i] = ires[i-1];
			generations[i] = generations[i-1];
			ires[i-1] = clone;
			generations[i-1] = generation;
			i++;

			ire = NULL;
			goto done;
		}

		/*
		 * We only match on the type and optionally ILL when
		 * recursing. The type match is used by some callers
		 * to exclude certain types (such as IRE_IF_CLONE or
		 * IRE_LOCAL|IRE_LOOPBACK).
		 *
		 * In the MATCH_IRE_SRC_ILL case, ill_arg may be the 'srcof'
		 * ire->ire_ill, and we want to find the IRE_INTERFACE for
		 * ire_ill, so we set ill to the ire_ill;
		 */
		match_args &= (MATCH_IRE_TYPE | MATCH_IRE_DIRECT);
		nexthop = ire->ire_gateway_addr;
		if (ill == NULL && ire->ire_ill != NULL) {
			/* Held here; released at the error/done labels */
			ill = ire->ire_ill;
			need_refrele = B_TRUE;
			ill_refhold(ill);
			match_args |= MATCH_IRE_ILL;
		}
		ire = NULL;
	}
	/* Ran out of iterations without reaching a resolvable IRE */
	ASSERT(ire == NULL);
	ire = ire_reject(ipst, B_FALSE);

error:
	/* ire is the IRE to return; ires[0..i-1] still hold references */
	ASSERT(ire != NULL);
	if (need_refrele)
		ill_refrele(ill);

	/*
	 * In the case of MULTIRT we want to try a different IRE the next
	 * time. We let the next packet retry in that case.
	 */
	if (i > 0 && (ires[0]->ire_flags & RTF_MULTIRT))
		(void) ire_no_good(ires[0]);

cleanup:
	/* cleanup ires[i] */
	ire_dep_unbuild(ires, i);
	for (j = 0; j < i; j++)
		ire_refrele(ires[j]);

	ASSERT((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
	    (irr_flags & IRR_INCOMPLETE));
	/*
	 * Use IRE_GENERATION_VERIFY to ensure that ip_output will redo the
	 * ip_select_route since the reject or lack of memory might be gone.
	 */
	if (generationp != NULL)
		*generationp = IRE_GENERATION_VERIFY;
	return (ire);

done:
	/* Success: ires[0..i-1] is the chain, ires[0] is what we return */
	ASSERT(ire == NULL);
	if (need_refrele) {
		ill_refrele(ill);
		ill = NULL;
	}

	/* Build dependencies */
	if (i > 1 && !ire_dep_build(ires, generations, i)) {
		/* Something in chain was condemned; tear it apart */
		ire = ire_reject(ipst, B_FALSE);
		goto cleanup;
	}

	/*
	 * Release all refholds except the one for ires[0] that we
	 * will return to the caller.
	 */
	for (j = 1; j < i; j++)
		ire_refrele(ires[j]);

	if (invalidate) {
		/*
		 * Since we needed to allocate but couldn't we need to make
		 * sure that the dependency chain is rebuilt the next time.
		 */
		ire_dep_invalidate_generations(ires[0]);
		generation = IRE_GENERATION_VERIFY;
	} else {
		/*
		 * IREs can have been added or deleted while we did the
		 * recursive lookup and we can't catch those until we've built
		 * the dependencies. We verify the stored
		 * ire_dep_parent_generation to catch any such changes and
		 * return IRE_GENERATION_VERIFY (which will cause
		 * ip_select_route to be called again so we can redo the
		 * recursive lookup next time we send a packet).
		 */
		if (ires[0]->ire_dep_parent == NULL)
			generation = ires[0]->ire_generation;
		else
			generation = ire_dep_validate_generations(ires[0]);
		if (generations[0] != ires[0]->ire_generation) {
			/* Something changed at the top */
			generation = IRE_GENERATION_VERIFY;
		}
	}
	if (generationp != NULL)
		*generationp = generation;

	return (ires[0]);
}

/*
 * Entry point without a caller-supplied starting IRE: forwards every
 * argument to ire_route_recursive_impl_v4() with ire == NULL.
 */
ire_t *
ire_route_recursive_v4(ipaddr_t nexthop, uint_t ire_type, const ill_t *ill,
    zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args,
    uint_t irr_flags, uint32_t xmit_hint, ip_stack_t *ipst, ipaddr_t *setsrcp,
    tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp)
{
	return (ire_route_recursive_impl_v4(NULL, nexthop, ire_type, ill,
	    zoneid, tsl, match_args, irr_flags, xmit_hint, ipst, setsrcp,
	    gwattrp, generationp));
}

/*
 * Recursively look for a route to the destination.
 * We only handle a destination match here, yet we have the same arguments
 * as the full match to allow function pointers to select between the two.
 *
 * Note that this function never returns NULL. It returns an IRE_NOROUTE
 * instead.
1707 * 1708 * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it 1709 * is an error. 1710 * Allow at most one RTF_INDIRECT. 1711 */ 1712 ire_t * 1713 ire_route_recursive_dstonly_v4(ipaddr_t nexthop, uint_t irr_flags, 1714 uint32_t xmit_hint, ip_stack_t *ipst) 1715 { 1716 ire_t *ire; 1717 ire_t *ire1; 1718 uint_t generation; 1719 1720 /* ire_ftable_lookup handles round-robin/ECMP */ 1721 ire = ire_ftable_lookup_simple_v4(nexthop, xmit_hint, ipst, 1722 &generation); 1723 ASSERT(ire != NULL); 1724 /* 1725 * If the IRE has a current cached parent we know that the whole 1726 * parent chain is current, hence we don't need to discover and 1727 * build any dependencies by doing a recursive lookup. 1728 */ 1729 mutex_enter(&ire->ire_lock); 1730 if (ire->ire_dep_parent != NULL) { 1731 if (ire->ire_dep_parent->ire_generation == 1732 ire->ire_dep_parent_generation) { 1733 mutex_exit(&ire->ire_lock); 1734 return (ire); 1735 } 1736 mutex_exit(&ire->ire_lock); 1737 } else { 1738 mutex_exit(&ire->ire_lock); 1739 /* 1740 * If this type should have an ire_nce_cache (even if it 1741 * doesn't yet have one) then we are done. Includes 1742 * IRE_INTERFACE with a full 32 bit mask. 1743 */ 1744 if (ire->ire_nce_capable) 1745 return (ire); 1746 } 1747 1748 /* 1749 * Fallback to loop in the normal code starting with the ire 1750 * we found. Normally this would return the same ire. 1751 */ 1752 ire1 = ire_route_recursive_impl_v4(ire, nexthop, 0, NULL, ALL_ZONES, 1753 NULL, MATCH_IRE_DSTONLY, irr_flags, xmit_hint, ipst, NULL, NULL, 1754 &generation); 1755 ire_refrele(ire); 1756 return (ire1); 1757 } 1758 1759 /* 1760 * Verify that the generation numbers in the chain leading to an IRE_IF_CLONE 1761 * are consistent. Return FALSE (and delete the IRE_IF_CLONE) if they 1762 * are not consistent, and TRUE otherwise. 
1763 */ 1764 boolean_t 1765 ire_clone_verify(ire_t *ire) 1766 { 1767 ASSERT((ire->ire_type & IRE_IF_CLONE) != 0); 1768 mutex_enter(&ire->ire_lock); 1769 if (ire->ire_dep_parent != NULL && 1770 ire->ire_dep_parent->ire_generation != 1771 ire->ire_dep_parent_generation) { 1772 mutex_exit(&ire->ire_lock); 1773 ire_delete(ire); 1774 return (B_FALSE); 1775 } 1776 mutex_exit(&ire->ire_lock); 1777 return (B_TRUE); 1778 } 1779