1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 26 /* 27 * This file contains consumer routines of the IPv4 forwarding engine 28 */ 29 30 #include <sys/types.h> 31 #include <sys/stream.h> 32 #include <sys/stropts.h> 33 #include <sys/strlog.h> 34 #include <sys/dlpi.h> 35 #include <sys/ddi.h> 36 #include <sys/cmn_err.h> 37 #include <sys/policy.h> 38 39 #include <sys/systm.h> 40 #include <sys/strsun.h> 41 #include <sys/kmem.h> 42 #include <sys/param.h> 43 #include <sys/socket.h> 44 #include <sys/strsubr.h> 45 #include <net/if.h> 46 #include <net/route.h> 47 #include <netinet/in.h> 48 #include <net/if_dl.h> 49 #include <netinet/ip6.h> 50 #include <netinet/icmp6.h> 51 52 #include <inet/ipsec_impl.h> 53 #include <inet/common.h> 54 #include <inet/mi.h> 55 #include <inet/mib2.h> 56 #include <inet/ip.h> 57 #include <inet/ip_impl.h> 58 #include <inet/ip6.h> 59 #include <inet/ip_ndp.h> 60 #include <inet/arp.h> 61 #include <inet/ip_if.h> 62 #include <inet/ip_ire.h> 63 #include <inet/ip_ftable.h> 64 #include <inet/ip_rts.h> 65 #include <inet/nd.h> 66 67 #include 
<net/pfkeyv2.h> 68 #include <inet/sadb.h> 69 #include <inet/tcp.h> 70 #include <inet/ipclassifier.h> 71 #include <sys/zone.h> 72 #include <net/radix.h> 73 #include <sys/tsol/label.h> 74 #include <sys/tsol/tnet.h> 75 76 #define IS_DEFAULT_ROUTE(ire) \ 77 (((ire)->ire_type & IRE_DEFAULT) || \ 78 (((ire)->ire_type & IRE_INTERFACE) && ((ire)->ire_addr == 0))) 79 80 #define IP_SRC_MULTIHOMING(isv6, ipst) \ 81 (isv6 ? ipst->ips_ipv6_strict_src_multihoming : \ 82 ipst->ips_ip_strict_src_multihoming) 83 84 static ire_t *route_to_dst(const struct sockaddr *, zoneid_t, ip_stack_t *); 85 static void ire_del_host_redir(ire_t *, char *); 86 static boolean_t ire_find_best_route(struct radix_node *, void *); 87 88 /* 89 * Lookup a route in forwarding table. A specific lookup is indicated by 90 * passing the required parameters and indicating the match required in the 91 * flag field. 92 * 93 * Supports IP_BOUND_IF by following the ipif/ill when recursing. 94 */ 95 ire_t * 96 ire_ftable_lookup_v4(ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway, 97 int type, const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl, 98 int flags, uint32_t xmit_hint, ip_stack_t *ipst, uint_t *generationp) 99 { 100 ire_t *ire; 101 struct rt_sockaddr rdst, rmask; 102 struct rt_entry *rt; 103 ire_ftable_args_t margs; 104 105 ASSERT(ill == NULL || !ill->ill_isv6); 106 107 /* 108 * ire_match_args() will dereference ill if MATCH_IRE_ILL 109 * is set. 
110 */ 111 if ((flags & (MATCH_IRE_ILL|MATCH_IRE_SRC_ILL)) && (ill == NULL)) 112 return (NULL); 113 114 bzero(&rdst, sizeof (rdst)); 115 rdst.rt_sin_len = sizeof (rdst); 116 rdst.rt_sin_family = AF_INET; 117 rdst.rt_sin_addr.s_addr = addr; 118 119 bzero(&rmask, sizeof (rmask)); 120 rmask.rt_sin_len = sizeof (rmask); 121 rmask.rt_sin_family = AF_INET; 122 rmask.rt_sin_addr.s_addr = mask; 123 124 bzero(&margs, sizeof (margs)); 125 margs.ift_addr = addr; 126 margs.ift_mask = mask; 127 margs.ift_gateway = gateway; 128 margs.ift_type = type; 129 margs.ift_ill = ill; 130 margs.ift_zoneid = zoneid; 131 margs.ift_tsl = tsl; 132 margs.ift_flags = flags; 133 134 /* 135 * The flags argument passed to ire_ftable_lookup may cause the 136 * search to return, not the longest matching prefix, but the 137 * "best matching prefix", i.e., the longest prefix that also 138 * satisfies constraints imposed via the permutation of flags 139 * passed in. To achieve this, we invoke ire_match_args() on 140 * each matching leaf in the radix tree. ire_match_args is 141 * invoked by the callback function ire_find_best_route() 142 * We hold the global tree lock in read mode when calling 143 * rn_match_args. Before dropping the global tree lock, ensure 144 * that the radix node can't be deleted by incrementing ire_refcnt. 145 */ 146 RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); 147 rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst, 148 ipst->ips_ip_ftable, ire_find_best_route, &margs); 149 ire = margs.ift_best_ire; 150 if (rt == NULL) { 151 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 152 return (NULL); 153 } 154 ASSERT(ire != NULL); 155 156 DTRACE_PROBE2(ire__found, ire_ftable_args_t *, &margs, ire_t *, ire); 157 158 /* 159 * round-robin only if we have more than one route in the bucket. 
160 * ips_ip_ecmp_behavior controls when we do ECMP 161 * 2: always 162 * 1: for IRE_DEFAULT and /0 IRE_INTERFACE 163 * 0: never 164 */ 165 if (ire->ire_bucket->irb_ire_cnt > 1 && !(flags & MATCH_IRE_GW)) { 166 if (ipst->ips_ip_ecmp_behavior == 2 || 167 (ipst->ips_ip_ecmp_behavior == 1 && 168 IS_DEFAULT_ROUTE(ire))) { 169 ire_t *next_ire; 170 171 margs.ift_best_ire = NULL; 172 next_ire = ire_round_robin(ire->ire_bucket, &margs, 173 xmit_hint, ire, ipst); 174 if (next_ire == NULL) { 175 /* keep ire if next_ire is null */ 176 goto done; 177 } 178 ire_refrele(ire); 179 ire = next_ire; 180 } 181 } 182 183 done: 184 /* Return generation before dropping lock */ 185 if (generationp != NULL) 186 *generationp = ire->ire_generation; 187 188 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 189 190 /* 191 * For shared-IP zones we need additional checks to what was 192 * done in ire_match_args to make sure IRE_LOCALs are handled. 193 * 194 * When ip_restrict_interzone_loopback is set, then 195 * we ensure that IRE_LOCAL are only used for loopback 196 * between zones when the logical "Ethernet" would 197 * have looped them back. That is, if in the absense of 198 * the IRE_LOCAL we would have sent to packet out the 199 * same ill. 200 */ 201 if ((ire->ire_type & IRE_LOCAL) && zoneid != ALL_ZONES && 202 ire->ire_zoneid != zoneid && ire->ire_zoneid != ALL_ZONES && 203 ipst->ips_ip_restrict_interzone_loopback) { 204 ire = ire_alt_local(ire, zoneid, tsl, ill, generationp); 205 ASSERT(ire != NULL); 206 } 207 return (ire); 208 } 209 210 /* 211 * This function is called by 212 * ip_input/ire_route_recursive when doing a route lookup on only the 213 * destination address. 214 * 215 * The optimizations of this function over ire_ftable_lookup are: 216 * o removing unnecessary flag matching 217 * o doing longest prefix match instead of overloading it further 218 * with the unnecessary "best_prefix_match" 219 * 220 * If no route is found we return IRE_NOROUTE. 
221 */ 222 ire_t * 223 ire_ftable_lookup_simple_v4(ipaddr_t addr, uint32_t xmit_hint, ip_stack_t *ipst, 224 uint_t *generationp) 225 { 226 ire_t *ire; 227 struct rt_sockaddr rdst; 228 struct rt_entry *rt; 229 irb_t *irb; 230 231 rdst.rt_sin_len = sizeof (rdst); 232 rdst.rt_sin_family = AF_INET; 233 rdst.rt_sin_addr.s_addr = addr; 234 235 /* 236 * This is basically inlining a simpler version of ire_match_args 237 */ 238 RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); 239 240 rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst, 241 ipst->ips_ip_ftable, NULL, NULL); 242 243 if (rt == NULL) 244 goto bad; 245 246 irb = &rt->rt_irb; 247 if (irb->irb_ire_cnt == 0) 248 goto bad; 249 250 rw_enter(&irb->irb_lock, RW_READER); 251 ire = irb->irb_ire; 252 if (ire == NULL) { 253 rw_exit(&irb->irb_lock); 254 goto bad; 255 } 256 while (IRE_IS_CONDEMNED(ire)) { 257 ire = ire->ire_next; 258 if (ire == NULL) { 259 rw_exit(&irb->irb_lock); 260 goto bad; 261 } 262 } 263 264 /* we have a ire that matches */ 265 ire_refhold(ire); 266 rw_exit(&irb->irb_lock); 267 268 /* 269 * round-robin only if we have more than one route in the bucket. 270 * ips_ip_ecmp_behavior controls when we do ECMP 271 * 2: always 272 * 1: for IRE_DEFAULT and /0 IRE_INTERFACE 273 * 0: never 274 * 275 * Note: if we found an IRE_IF_CLONE we won't look at the bucket with 276 * other ECMP IRE_INTERFACEs since the IRE_IF_CLONE is a /128 match 277 * and the IRE_INTERFACESs are likely to be shorter matches. 
278 */ 279 if (ire->ire_bucket->irb_ire_cnt > 1) { 280 if (ipst->ips_ip_ecmp_behavior == 2 || 281 (ipst->ips_ip_ecmp_behavior == 1 && 282 IS_DEFAULT_ROUTE(ire))) { 283 ire_t *next_ire; 284 ire_ftable_args_t margs; 285 286 bzero(&margs, sizeof (margs)); 287 margs.ift_addr = addr; 288 margs.ift_zoneid = ALL_ZONES; 289 290 next_ire = ire_round_robin(ire->ire_bucket, &margs, 291 xmit_hint, ire, ipst); 292 if (next_ire == NULL) { 293 /* keep ire if next_ire is null */ 294 if (generationp != NULL) 295 *generationp = ire->ire_generation; 296 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 297 return (ire); 298 } 299 ire_refrele(ire); 300 ire = next_ire; 301 } 302 } 303 /* Return generation before dropping lock */ 304 if (generationp != NULL) 305 *generationp = ire->ire_generation; 306 307 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 308 309 /* 310 * Since we only did ALL_ZONES matches there is no special handling 311 * of IRE_LOCALs needed here. ire_ftable_lookup_v4 has to handle that. 312 */ 313 return (ire); 314 315 bad: 316 if (generationp != NULL) 317 *generationp = IRE_GENERATION_VERIFY; 318 319 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 320 return (ire_reject(ipst, B_FALSE)); 321 } 322 323 /* 324 * Find the ill matching a multicast group. 325 * Allows different routes for multicast addresses 326 * in the unicast routing table (akin to 224.0.0.0 but could be more specific) 327 * which point at different interfaces. This is used when IP_MULTICAST_IF 328 * isn't specified (when sending) and when IP_ADD_MEMBERSHIP doesn't 329 * specify the interface to join on. 330 * 331 * Supports link-local addresses by using ire_route_recursive which follows 332 * the ill when recursing. 333 * 334 * To handle CGTP, since we don't have a separate IRE_MULTICAST for each group 335 * and the MULTIRT property can be different for different groups, we 336 * extract RTF_MULTIRT from the special unicast route added for a group 337 * with CGTP and pass that back in the multirtp argument. 
338 * This is used in ip_set_destination etc to set ixa_postfragfn for multicast. 339 * We have a setsrcp argument for the same reason. 340 */ 341 ill_t * 342 ire_lookup_multi_ill_v4(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst, 343 boolean_t *multirtp, ipaddr_t *setsrcp) 344 { 345 ire_t *ire; 346 ill_t *ill; 347 348 ire = ire_route_recursive_v4(group, 0, NULL, zoneid, NULL, 349 MATCH_IRE_DSTONLY, IRR_NONE, 0, ipst, setsrcp, NULL, NULL); 350 ASSERT(ire != NULL); 351 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 352 ire_refrele(ire); 353 return (NULL); 354 } 355 356 if (multirtp != NULL) 357 *multirtp = (ire->ire_flags & RTF_MULTIRT) != 0; 358 359 ill = ire_nexthop_ill(ire); 360 ire_refrele(ire); 361 return (ill); 362 } 363 364 /* 365 * Delete the passed in ire if the gateway addr matches 366 */ 367 void 368 ire_del_host_redir(ire_t *ire, char *gateway) 369 { 370 if ((ire->ire_flags & RTF_DYNAMIC) && 371 (ire->ire_gateway_addr == *(ipaddr_t *)gateway)) 372 ire_delete(ire); 373 } 374 375 /* 376 * Search for all IRE_HOST RTF_DYNAMIC (aka redirect) routes that are 377 * pointing at the specified gateway and 378 * delete them. This routine is called only 379 * when a default gateway is going away. 380 */ 381 void 382 ire_delete_host_redirects(ipaddr_t gateway, ip_stack_t *ipst) 383 { 384 struct rtfuncarg rtfarg; 385 386 bzero(&rtfarg, sizeof (rtfarg)); 387 rtfarg.rt_func = ire_del_host_redir; 388 rtfarg.rt_arg = (void *)&gateway; 389 rtfarg.rt_zoneid = ALL_ZONES; 390 rtfarg.rt_ipst = ipst; 391 (void) ipst->ips_ip_ftable->rnh_walktree_mt(ipst->ips_ip_ftable, 392 rtfunc, &rtfarg, irb_refhold_rn, irb_refrele_rn); 393 } 394 395 /* 396 * Obtain the rt_entry and rt_irb for the route to be added to 397 * the ips_ip_ftable. 398 * First attempt to add a node to the radix tree via rn_addroute. If the 399 * route already exists, return the bucket for the existing route. 
400 * 401 * Locking notes: Need to hold the global radix tree lock in write mode to 402 * add a radix node. To prevent the node from being deleted, ire_get_bucket() 403 * returns with a ref'ed irb_t. The ire itself is added in ire_add_v4() 404 * while holding the irb_lock, but not the radix tree lock. 405 */ 406 irb_t * 407 ire_get_bucket(ire_t *ire) 408 { 409 struct radix_node *rn; 410 struct rt_entry *rt; 411 struct rt_sockaddr rmask, rdst; 412 irb_t *irb = NULL; 413 ip_stack_t *ipst = ire->ire_ipst; 414 415 ASSERT(ipst->ips_ip_ftable != NULL); 416 417 /* first try to see if route exists (based on rtalloc1) */ 418 bzero(&rdst, sizeof (rdst)); 419 rdst.rt_sin_len = sizeof (rdst); 420 rdst.rt_sin_family = AF_INET; 421 rdst.rt_sin_addr.s_addr = ire->ire_addr; 422 423 bzero(&rmask, sizeof (rmask)); 424 rmask.rt_sin_len = sizeof (rmask); 425 rmask.rt_sin_family = AF_INET; 426 rmask.rt_sin_addr.s_addr = ire->ire_mask; 427 428 /* 429 * add the route. based on BSD's rtrequest1(RTM_ADD) 430 */ 431 R_Malloc(rt, rt_entry_cache, sizeof (*rt)); 432 /* kmem_alloc failed */ 433 if (rt == NULL) 434 return (NULL); 435 436 bzero(rt, sizeof (*rt)); 437 rt->rt_nodes->rn_key = (char *)&rt->rt_dst; 438 rt->rt_dst = rdst; 439 irb = &rt->rt_irb; 440 irb->irb_marks |= IRB_MARK_DYNAMIC; /* dynamically allocated/freed */ 441 irb->irb_ipst = ipst; 442 rw_init(&irb->irb_lock, NULL, RW_DEFAULT, NULL); 443 RADIX_NODE_HEAD_WLOCK(ipst->ips_ip_ftable); 444 rn = ipst->ips_ip_ftable->rnh_addaddr(&rt->rt_dst, &rmask, 445 ipst->ips_ip_ftable, (struct radix_node *)rt); 446 if (rn == NULL) { 447 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 448 Free(rt, rt_entry_cache); 449 rt = NULL; 450 irb = NULL; 451 RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); 452 rn = ipst->ips_ip_ftable->rnh_lookup(&rdst, &rmask, 453 ipst->ips_ip_ftable); 454 if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { 455 /* found a non-root match */ 456 rt = (struct rt_entry *)rn; 457 } 458 } 459 if (rt != NULL) { 460 irb = 
&rt->rt_irb; 461 irb_refhold(irb); 462 } 463 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 464 return (irb); 465 } 466 467 /* 468 * This function is used when the caller wants to know the outbound 469 * interface for a packet given only the address. 470 * If this is a offlink IP address and there are multiple 471 * routes to this destination, this routine will utilise the 472 * first route it finds to IP address 473 * Return values: 474 * 0 - FAILURE 475 * nonzero - ifindex 476 */ 477 uint_t 478 ifindex_lookup(const struct sockaddr *ipaddr, zoneid_t zoneid) 479 { 480 uint_t ifindex = 0; 481 ire_t *ire; 482 ill_t *ill; 483 netstack_t *ns; 484 ip_stack_t *ipst; 485 486 if (zoneid == ALL_ZONES) 487 ns = netstack_find_by_zoneid(GLOBAL_ZONEID); 488 else 489 ns = netstack_find_by_zoneid(zoneid); 490 ASSERT(ns != NULL); 491 492 /* 493 * For exclusive stacks we set the zoneid to zero 494 * since IP uses the global zoneid in the exclusive stacks. 495 */ 496 if (ns->netstack_stackid != GLOBAL_NETSTACKID) 497 zoneid = GLOBAL_ZONEID; 498 ipst = ns->netstack_ip; 499 500 ASSERT(ipaddr->sa_family == AF_INET || ipaddr->sa_family == AF_INET6); 501 502 if ((ire = route_to_dst(ipaddr, zoneid, ipst)) != NULL) { 503 ill = ire_nexthop_ill(ire); 504 if (ill != NULL) { 505 ifindex = ill->ill_phyint->phyint_ifindex; 506 ill_refrele(ill); 507 } 508 ire_refrele(ire); 509 } 510 netstack_rele(ns); 511 return (ifindex); 512 } 513 514 /* 515 * Routine to find the route to a destination. 
If a ifindex is supplied 516 * it tries to match the route to the corresponding ipif for the ifindex 517 */ 518 static ire_t * 519 route_to_dst(const struct sockaddr *dst_addr, zoneid_t zoneid, ip_stack_t *ipst) 520 { 521 ire_t *ire = NULL; 522 int match_flags; 523 524 match_flags = MATCH_IRE_DSTONLY; 525 526 /* XXX pass NULL tsl for now */ 527 528 if (dst_addr->sa_family == AF_INET) { 529 ire = ire_route_recursive_v4( 530 ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr, 0, NULL, 531 zoneid, NULL, match_flags, IRR_ALLOCATE, 0, ipst, NULL, 532 NULL, NULL); 533 } else { 534 ire = ire_route_recursive_v6( 535 &((struct sockaddr_in6 *)dst_addr)->sin6_addr, 0, NULL, 536 zoneid, NULL, match_flags, IRR_ALLOCATE, 0, ipst, NULL, 537 NULL, NULL); 538 } 539 ASSERT(ire != NULL); 540 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 541 ire_refrele(ire); 542 return (NULL); 543 } 544 return (ire); 545 } 546 547 /* 548 * This routine is called by IP Filter to send a packet out on the wire 549 * to a specified dstination (which may be onlink or offlink). The ifindex may 550 * or may not be 0. A non-null ifindex indicates IP Filter has stipulated 551 * an outgoing interface and requires the nexthop to be on that interface. 552 * IP WILL NOT DO the following to the data packet before sending it out: 553 * a. manipulate ttl 554 * b. ipsec work 555 * c. fragmentation 556 * 557 * If the packet has been prepared for hardware checksum then it will be 558 * passed off to ip_send_align_cksum() to check that the flags set on the 559 * packet are in alignment with the capabilities of the new outgoing NIC. 560 * 561 * Return values: 562 * 0: IP was able to send of the data pkt 563 * ECOMM: Could not send packet 564 * ENONET No route to dst. It is up to the caller 565 * to send icmp unreachable error message, 566 * EINPROGRESS The macaddr of the onlink dst or that 567 * of the offlink dst's nexthop needs to get 568 * resolved before packet can be sent to dst. 
569 * Thus transmission is not guaranteed. 570 * Note: No longer have visibility to the ARP queue 571 * hence no EINPROGRESS. 572 */ 573 int 574 ipfil_sendpkt(const struct sockaddr *dst_addr, mblk_t *mp, uint_t ifindex, 575 zoneid_t zoneid) 576 { 577 ipaddr_t nexthop; 578 netstack_t *ns; 579 ip_stack_t *ipst; 580 ip_xmit_attr_t ixas; 581 int error; 582 583 ASSERT(mp != NULL); 584 585 if (zoneid == ALL_ZONES) 586 ns = netstack_find_by_zoneid(GLOBAL_ZONEID); 587 else 588 ns = netstack_find_by_zoneid(zoneid); 589 ASSERT(ns != NULL); 590 591 /* 592 * For exclusive stacks we set the zoneid to zero 593 * since IP uses the global zoneid in the exclusive stacks. 594 */ 595 if (ns->netstack_stackid != GLOBAL_NETSTACKID) 596 zoneid = GLOBAL_ZONEID; 597 ipst = ns->netstack_ip; 598 599 ASSERT(dst_addr->sa_family == AF_INET || 600 dst_addr->sa_family == AF_INET6); 601 602 bzero(&ixas, sizeof (ixas)); 603 /* 604 * No IPsec, no fragmentation, and don't let any hooks see 605 * the packet. 606 */ 607 ixas.ixa_flags = IXAF_NO_IPSEC | IXAF_DONTFRAG | IXAF_NO_PFHOOK; 608 ixas.ixa_cred = kcred; 609 ixas.ixa_cpid = NOPID; 610 ixas.ixa_tsl = NULL; 611 ixas.ixa_ipst = ipst; 612 ixas.ixa_ifindex = ifindex; 613 614 if (dst_addr->sa_family == AF_INET) { 615 ipha_t *ipha = (ipha_t *)mp->b_rptr; 616 617 ixas.ixa_flags |= IXAF_IS_IPV4; 618 nexthop = ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr; 619 if (nexthop != ipha->ipha_dst) { 620 ixas.ixa_flags |= IXAF_NEXTHOP_SET; 621 ixas.ixa_nexthop_v4 = nexthop; 622 } 623 ixas.ixa_multicast_ttl = ipha->ipha_ttl; 624 } else { 625 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 626 in6_addr_t *nexthop6; 627 628 nexthop6 = &((struct sockaddr_in6 *)dst_addr)->sin6_addr; 629 if (!IN6_ARE_ADDR_EQUAL(nexthop6, &ip6h->ip6_dst)) { 630 ixas.ixa_flags |= IXAF_NEXTHOP_SET; 631 ixas.ixa_nexthop_v6 = *nexthop6; 632 } 633 ixas.ixa_multicast_ttl = ip6h->ip6_hops; 634 } 635 error = ip_output_simple(mp, &ixas); 636 ixa_cleanup(&ixas); 637 638 netstack_rele(ns); 639 switch 
(error) { 640 case 0: 641 break; 642 643 case EHOSTUNREACH: 644 case ENETUNREACH: 645 error = ENONET; 646 break; 647 648 default: 649 error = ECOMM; 650 break; 651 } 652 return (error); 653 } 654 655 /* 656 * callback function provided by ire_ftable_lookup when calling 657 * rn_match_args(). Invoke ire_match_args on each matching leaf node in 658 * the radix tree. 659 */ 660 boolean_t 661 ire_find_best_route(struct radix_node *rn, void *arg) 662 { 663 struct rt_entry *rt = (struct rt_entry *)rn; 664 irb_t *irb_ptr; 665 ire_t *ire; 666 ire_ftable_args_t *margs = arg; 667 ipaddr_t match_mask; 668 669 ASSERT(rt != NULL); 670 671 irb_ptr = &rt->rt_irb; 672 673 if (irb_ptr->irb_ire_cnt == 0) 674 return (B_FALSE); 675 676 rw_enter(&irb_ptr->irb_lock, RW_READER); 677 for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) { 678 if (IRE_IS_CONDEMNED(ire)) 679 continue; 680 ASSERT((margs->ift_flags & MATCH_IRE_SHORTERMASK) == 0); 681 if (margs->ift_flags & MATCH_IRE_MASK) 682 match_mask = margs->ift_mask; 683 else 684 match_mask = ire->ire_mask; 685 686 if (ire_match_args(ire, margs->ift_addr, match_mask, 687 margs->ift_gateway, margs->ift_type, margs->ift_ill, 688 margs->ift_zoneid, margs->ift_tsl, 689 margs->ift_flags)) { 690 ire_refhold(ire); 691 rw_exit(&irb_ptr->irb_lock); 692 margs->ift_best_ire = ire; 693 return (B_TRUE); 694 } 695 } 696 rw_exit(&irb_ptr->irb_lock); 697 return (B_FALSE); 698 } 699 700 /* 701 * ftable irb_t structures are dynamically allocated, and we need to 702 * check if the irb_t (and associated ftable tree attachment) needs to 703 * be cleaned up when the irb_refcnt goes to 0. The conditions that need 704 * be verified are: 705 * - no other walkers of the irebucket, i.e., quiescent irb_refcnt, 706 * - no other threads holding references to ire's in the bucket, 707 * i.e., irb_nire == 0 708 * - no active ire's in the bucket, i.e., irb_ire_cnt == 0 709 * - need to hold the global tree lock and irb_lock in write mode. 
710 */ 711 void 712 irb_refrele_ftable(irb_t *irb) 713 { 714 for (;;) { 715 rw_enter(&irb->irb_lock, RW_WRITER); 716 ASSERT(irb->irb_refcnt != 0); 717 if (irb->irb_refcnt != 1) { 718 /* 719 * Someone has a reference to this radix node 720 * or there is some bucket walker. 721 */ 722 irb->irb_refcnt--; 723 rw_exit(&irb->irb_lock); 724 return; 725 } else { 726 /* 727 * There is no other walker, nor is there any 728 * other thread that holds a direct ref to this 729 * radix node. Do the clean up if needed. Call 730 * to ire_unlink will clear the IRB_MARK_CONDEMNED flag 731 */ 732 if (irb->irb_marks & IRB_MARK_CONDEMNED) { 733 ire_t *ire_list; 734 735 ire_list = ire_unlink(irb); 736 rw_exit(&irb->irb_lock); 737 738 if (ire_list != NULL) 739 ire_cleanup(ire_list); 740 /* 741 * more CONDEMNED entries could have 742 * been added while we dropped the lock, 743 * so we have to re-check. 744 */ 745 continue; 746 } 747 748 /* 749 * Now check if there are still any ires 750 * associated with this radix node. 751 */ 752 if (irb->irb_nire != 0) { 753 /* 754 * someone is still holding on 755 * to ires in this bucket 756 */ 757 irb->irb_refcnt--; 758 rw_exit(&irb->irb_lock); 759 return; 760 } else { 761 /* 762 * Everything is clear. Zero walkers, 763 * Zero threads with a ref to this 764 * radix node, Zero ires associated with 765 * this radix node. Due to lock order, 766 * check the above conditions again 767 * after grabbing all locks in the right order 768 */ 769 rw_exit(&irb->irb_lock); 770 if (irb_inactive(irb)) 771 return; 772 /* 773 * irb_inactive could not free the irb. 774 * See if there are any walkers, if not 775 * try to clean up again. 776 */ 777 } 778 } 779 } 780 } 781 782 /* 783 * IRE iterator used by ire_ftable_lookup to process multiple equal 784 * routes. Given a starting point in the hash list (hash), walk the IREs 785 * in the bucket skipping deleted entries. We treat the bucket as a circular 786 * list for the purposes of walking it. 
787 * Returns the IRE (held) that corresponds to the hash value. If that IRE is 788 * not applicable (ire_match_args failed) then it returns a subsequent one. 789 * If we fail to find an IRE we return NULL. 790 * 791 * Assumes that the caller holds a reference on the IRE bucket and a read lock 792 * on the radix_node_head (for IPv4) or the ip6_ire_head (for IPv6). 793 * 794 * Applies to IPv4 and IPv6. 795 * 796 * For CGTP, where an IRE_BROADCAST and IRE_HOST can exist for the same 797 * address and bucket, we compare against ire_type for the orig_ire. We also 798 * have IRE_BROADCASTs with and without RTF_MULTIRT, with the former being 799 * first in the bucket. Thus we compare that RTF_MULTIRT match the orig_ire. 800 * 801 * Due to shared-IP zones we check that an IRE_OFFLINK has a gateway that is 802 * reachable from the zone i.e., that the ire_gateway_addr is in a subnet 803 * in which the zone has an IP address. We check this for the global zone 804 * even if no shared-IP zones are configured. 805 */ 806 ire_t * 807 ire_round_robin(irb_t *irb_ptr, ire_ftable_args_t *margs, uint_t hash, 808 ire_t *orig_ire, ip_stack_t *ipst) 809 { 810 ire_t *ire, *maybe_ire = NULL; 811 uint_t maybe_badcnt; 812 uint_t maxwalk; 813 814 /* Fold in more bits from the hint/hash */ 815 hash = hash ^ (hash >> 8) ^ (hash >> 16); 816 817 rw_enter(&irb_ptr->irb_lock, RW_WRITER); 818 maxwalk = irb_ptr->irb_ire_cnt; /* Excludes condemned */ 819 hash %= maxwalk; 820 irb_refhold_locked(irb_ptr); 821 rw_exit(&irb_ptr->irb_lock); 822 823 /* 824 * Round-robin the routers list looking for a route that 825 * matches the passed in parameters. 826 * First we skip "hash" number of non-condemned IREs. 827 * Then we match the IRE. 828 * If we find an ire which has a non-zero ire_badcnt then we remember 829 * it and keep on looking for a lower ire_badcnt. 830 * If we come to the end of the list we continue (treat the 831 * bucket list as a circular list) but we match less than "max" 832 * entries. 
833 */ 834 ire = irb_ptr->irb_ire; 835 while (maxwalk > 0) { 836 if (IRE_IS_CONDEMNED(ire)) 837 goto next_ire_skip; 838 839 /* Skip the first "hash" entries to do ECMP */ 840 if (hash != 0) { 841 hash--; 842 goto next_ire_skip; 843 } 844 845 /* See CGTP comment above */ 846 if (ire->ire_type != orig_ire->ire_type || 847 ((ire->ire_flags ^ orig_ire->ire_flags) & RTF_MULTIRT) != 0) 848 goto next_ire; 849 850 /* 851 * Note: Since IPv6 has hash buckets instead of radix 852 * buckers we need to explicitly compare the addresses. 853 * That makes this less efficient since we will be called 854 * even if there is no alternatives just because the 855 * bucket has multiple IREs for different addresses. 856 */ 857 if (ire->ire_ipversion == IPV6_VERSION) { 858 if (!IN6_ARE_ADDR_EQUAL(&orig_ire->ire_addr_v6, 859 &ire->ire_addr_v6)) 860 goto next_ire; 861 } 862 863 /* 864 * For some reason find_best_route uses ire_mask. We do 865 * the same. 866 */ 867 if (ire->ire_ipversion == IPV4_VERSION ? 868 !ire_match_args(ire, margs->ift_addr, 869 ire->ire_mask, margs->ift_gateway, 870 margs->ift_type, margs->ift_ill, margs->ift_zoneid, 871 margs->ift_tsl, margs->ift_flags) : 872 !ire_match_args_v6(ire, &margs->ift_addr_v6, 873 &ire->ire_mask_v6, &margs->ift_gateway_v6, 874 margs->ift_type, margs->ift_ill, margs->ift_zoneid, 875 margs->ift_tsl, margs->ift_flags)) 876 goto next_ire; 877 878 if (margs->ift_zoneid != ALL_ZONES && 879 (ire->ire_type & IRE_OFFLINK)) { 880 /* 881 * When we're in a zone, we're only 882 * interested in routers that are 883 * reachable through ipifs within our zone. 
884 */ 885 if (ire->ire_ipversion == IPV4_VERSION) { 886 if (!ire_gateway_ok_zone_v4( 887 ire->ire_gateway_addr, margs->ift_zoneid, 888 ire->ire_ill, margs->ift_tsl, ipst, 889 B_TRUE)) 890 goto next_ire; 891 } else { 892 if (!ire_gateway_ok_zone_v6( 893 &ire->ire_gateway_addr_v6, 894 margs->ift_zoneid, ire->ire_ill, 895 margs->ift_tsl, ipst, B_TRUE)) 896 goto next_ire; 897 } 898 } 899 mutex_enter(&ire->ire_lock); 900 /* Look for stale ire_badcnt and clear */ 901 if (ire->ire_badcnt != 0 && 902 (TICK_TO_SEC(ddi_get_lbolt64()) - ire->ire_last_badcnt > 903 ipst->ips_ip_ire_badcnt_lifetime)) 904 ire->ire_badcnt = 0; 905 mutex_exit(&ire->ire_lock); 906 907 if (ire->ire_badcnt == 0) { 908 /* We found one with a zero badcnt; done */ 909 ire_refhold(ire); 910 /* 911 * Care needed since irb_refrele grabs WLOCK to free 912 * the irb_t. 913 */ 914 if (ire->ire_ipversion == IPV4_VERSION) { 915 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 916 irb_refrele(irb_ptr); 917 RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); 918 } else { 919 rw_exit(&ipst->ips_ip6_ire_head_lock); 920 irb_refrele(irb_ptr); 921 rw_enter(&ipst->ips_ip6_ire_head_lock, 922 RW_READER); 923 } 924 return (ire); 925 } 926 /* 927 * keep looking to see if there is a better (lower 928 * badcnt) matching IRE, but save this one as a last resort. 929 * If we find a lower badcnt pick that one as the last* resort. 930 */ 931 if (maybe_ire == NULL) { 932 maybe_ire = ire; 933 maybe_badcnt = ire->ire_badcnt; 934 } else if (ire->ire_badcnt < maybe_badcnt) { 935 maybe_ire = ire; 936 maybe_badcnt = ire->ire_badcnt; 937 } 938 939 next_ire: 940 maxwalk--; 941 next_ire_skip: 942 ire = ire->ire_next; 943 if (ire == NULL) 944 ire = irb_ptr->irb_ire; 945 } 946 if (maybe_ire != NULL) 947 ire_refhold(maybe_ire); 948 949 /* Care needed since irb_refrele grabs WLOCK to free the irb_t. 
*/ 950 if (ire->ire_ipversion == IPV4_VERSION) { 951 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 952 irb_refrele(irb_ptr); 953 RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); 954 } else { 955 rw_exit(&ipst->ips_ip6_ire_head_lock); 956 irb_refrele(irb_ptr); 957 rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER); 958 } 959 return (maybe_ire); 960 } 961 962 void 963 irb_refhold_rn(struct radix_node *rn) 964 { 965 if ((rn->rn_flags & RNF_ROOT) == 0) 966 irb_refhold(&((rt_t *)(rn))->rt_irb); 967 } 968 969 void 970 irb_refrele_rn(struct radix_node *rn) 971 { 972 if ((rn->rn_flags & RNF_ROOT) == 0) 973 irb_refrele_ftable(&((rt_t *)(rn))->rt_irb); 974 } 975 976 977 /* 978 * ip_select_src_ill() is used by ip_select_route() to find the src_ill 979 * to be used for source-aware routing table lookup. This function will 980 * ignore IPIF_UNNUMBERED interface addresses, and will only return a 981 * numbered interface (ipif_lookup_addr_nondup() will ignore UNNUMBERED 982 * interfaces). 983 */ 984 static ill_t * 985 ip_select_src_ill(const in6_addr_t *v6src, zoneid_t zoneid, ip_stack_t *ipst) 986 { 987 ipif_t *ipif; 988 ill_t *ill; 989 boolean_t isv6 = !IN6_IS_ADDR_V4MAPPED(v6src); 990 ipaddr_t v4src; 991 992 if (isv6) { 993 ipif = ipif_lookup_addr_nondup_v6(v6src, NULL, zoneid, ipst); 994 } else { 995 IN6_V4MAPPED_TO_IPADDR(v6src, v4src); 996 ipif = ipif_lookup_addr_nondup(v4src, NULL, zoneid, ipst); 997 } 998 if (ipif == NULL) 999 return (NULL); 1000 ill = ipif->ipif_ill; 1001 ill_refhold(ill); 1002 ipif_refrele(ipif); 1003 return (ill); 1004 } 1005 1006 /* 1007 * verify that v6src is configured on ill 1008 */ 1009 static boolean_t 1010 ip_verify_src_on_ill(const in6_addr_t v6src, ill_t *ill, zoneid_t zoneid) 1011 { 1012 ipif_t *ipif; 1013 ip_stack_t *ipst; 1014 ipaddr_t v4src; 1015 1016 if (ill == NULL) 1017 return (B_FALSE); 1018 ipst = ill->ill_ipst; 1019 1020 if (ill->ill_isv6) { 1021 ipif = ipif_lookup_addr_nondup_v6(&v6src, ill, zoneid, ipst); 1022 } else { 1023 
IN6_V4MAPPED_TO_IPADDR(&v6src, v4src); 1024 ipif = ipif_lookup_addr_nondup(v4src, ill, zoneid, ipst); 1025 } 1026 1027 if (ipif != NULL) { 1028 ipif_refrele(ipif); 1029 return (B_TRUE); 1030 } else { 1031 return (B_FALSE); 1032 } 1033 } 1034 1035 /* 1036 * Select a route for IPv4 and IPv6. Except for multicast, loopback and reject 1037 * routes this routine sets up a ire_nce_cache as well. The caller needs to 1038 * lookup an nce for the multicast case. 1039 * 1040 * When src_multihoming is set to 2 (strict src multihoming) we use the source 1041 * address to select the interface and route. If IP_BOUND_IF etc are 1042 * specified, we require that they specify an interface on which the 1043 * source address is assigned. 1044 * 1045 * When src_multihoming is set to 1 (preferred src aware route 1046 * selection) the unicast lookup prefers a matching source 1047 * (i.e., that the route points out an ill on which the source is assigned), but 1048 * if no such route is found we fallback to not considering the source in the 1049 * route lookup. 1050 * 1051 * We skip the src_multihoming check when the source isn't (yet) set, and 1052 * when IXAF_VERIFY_SOURCE is not set. The latter allows RAW sockets to send 1053 * with bogus source addresses as allowed by IP_HDRINCL and IPV6_PKTINFO 1054 * when secpolicy_net_rawaccess(). 
1055 */ 1056 ire_t * 1057 ip_select_route(const in6_addr_t *v6dst, const in6_addr_t v6src, 1058 ip_xmit_attr_t *ixa, uint_t *generationp, in6_addr_t *setsrcp, 1059 int *errorp, boolean_t *multirtp) 1060 { 1061 uint_t match_args; 1062 uint_t ire_type; 1063 ill_t *ill = NULL; 1064 ire_t *ire; 1065 ip_stack_t *ipst = ixa->ixa_ipst; 1066 ipaddr_t v4dst; 1067 in6_addr_t v6nexthop; 1068 iaflags_t ixaflags = ixa->ixa_flags; 1069 nce_t *nce; 1070 boolean_t preferred_src_aware = B_FALSE; 1071 boolean_t verify_src; 1072 boolean_t isv6 = !(ixa->ixa_flags & IXAF_IS_IPV4); 1073 int src_multihoming = IP_SRC_MULTIHOMING(isv6, ipst); 1074 1075 /* 1076 * We only verify that the src has been configured on a selected 1077 * interface if the src is not :: or INADDR_ANY, and if the 1078 * IXAF_VERIFY_SOURCE flag is set. 1079 */ 1080 verify_src = (!V6_OR_V4_INADDR_ANY(v6src) && 1081 (ixa->ixa_flags & IXAF_VERIFY_SOURCE)); 1082 1083 match_args = MATCH_IRE_SECATTR; 1084 IN6_V4MAPPED_TO_IPADDR(v6dst, v4dst); 1085 if (setsrcp != NULL) 1086 ASSERT(IN6_IS_ADDR_UNSPECIFIED(setsrcp)); 1087 if (errorp != NULL) 1088 ASSERT(*errorp == 0); 1089 1090 /* 1091 * The content of the ixa will be different if IP_NEXTHOP, 1092 * SO_DONTROUTE, IP_BOUND_IF, IP_PKTINFO etc are set 1093 */ 1094 1095 if (isv6 ? IN6_IS_ADDR_MULTICAST(v6dst) : CLASSD(v4dst)) { 1096 /* Pick up the IRE_MULTICAST for the ill */ 1097 if (ixa->ixa_multicast_ifindex != 0) { 1098 ill = ill_lookup_on_ifindex(ixa->ixa_multicast_ifindex, 1099 isv6, ipst); 1100 } else if (ixaflags & IXAF_SCOPEID_SET) { 1101 /* sin6_scope_id takes precedence over ixa_ifindex */ 1102 ASSERT(ixa->ixa_scopeid != 0); 1103 ill = ill_lookup_on_ifindex(ixa->ixa_scopeid, 1104 isv6, ipst); 1105 } else if (ixa->ixa_ifindex != 0) { 1106 /* 1107 * In the ipmp case, the ixa_ifindex is set to 1108 * point at an under_ill and we would return the 1109 * ire_multicast() corresponding to that under_ill. 
1110 */ 1111 ill = ill_lookup_on_ifindex(ixa->ixa_ifindex, 1112 isv6, ipst); 1113 } else if (src_multihoming != 0 && verify_src) { 1114 /* Look up the ill based on the source address */ 1115 ill = ip_select_src_ill(&v6src, ixa->ixa_zoneid, ipst); 1116 /* 1117 * Since we looked up the ill from the source there 1118 * is no need to verify that the source is on the ill 1119 * below. 1120 */ 1121 verify_src = B_FALSE; 1122 if (ill != NULL && IS_VNI(ill)) { 1123 ill_t *usesrc = ill; 1124 1125 ill = ill_lookup_usesrc(usesrc); 1126 ill_refrele(usesrc); 1127 } 1128 } else if (!isv6) { 1129 ipaddr_t v4setsrc = INADDR_ANY; 1130 1131 ill = ill_lookup_group_v4(v4dst, ixa->ixa_zoneid, 1132 ipst, multirtp, &v4setsrc); 1133 if (setsrcp != NULL) 1134 IN6_IPADDR_TO_V4MAPPED(v4setsrc, setsrcp); 1135 } else { 1136 ill = ill_lookup_group_v6(v6dst, ixa->ixa_zoneid, 1137 ipst, multirtp, setsrcp); 1138 } 1139 if (ill != NULL && IS_VNI(ill)) { 1140 ill_refrele(ill); 1141 ill = NULL; 1142 } 1143 if (ill == NULL) { 1144 if (errorp != NULL) 1145 *errorp = ENXIO; 1146 /* Get a hold on the IRE_NOROUTE */ 1147 ire = ire_reject(ipst, isv6); 1148 return (ire); 1149 } 1150 if (!(ill->ill_flags & ILLF_MULTICAST)) { 1151 ill_refrele(ill); 1152 if (errorp != NULL) 1153 *errorp = EHOSTUNREACH; 1154 /* Get a hold on the IRE_NOROUTE */ 1155 ire = ire_reject(ipst, isv6); 1156 return (ire); 1157 } 1158 /* 1159 * If we are doing the strictest src_multihoming, then 1160 * we check that IP_MULTICAST_IF, IP_BOUND_IF, etc specify 1161 * an interface that is consistent with the source address. 
1162 */ 1163 if (verify_src && src_multihoming == 2 && 1164 !ip_verify_src_on_ill(v6src, ill, ixa->ixa_zoneid)) { 1165 if (errorp != NULL) 1166 *errorp = EADDRNOTAVAIL; 1167 ill_refrele(ill); 1168 /* Get a hold on the IRE_NOROUTE */ 1169 ire = ire_reject(ipst, isv6); 1170 return (ire); 1171 } 1172 /* Get a refcnt on the single IRE_MULTICAST per ill */ 1173 ire = ire_multicast(ill); 1174 ill_refrele(ill); 1175 if (generationp != NULL) 1176 *generationp = ire->ire_generation; 1177 if (errorp != NULL && 1178 (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) { 1179 *errorp = EHOSTUNREACH; 1180 } 1181 return (ire); 1182 } 1183 1184 /* Now for unicast */ 1185 if (ixa->ixa_ifindex != 0 || (ixaflags & IXAF_SCOPEID_SET)) { 1186 if (ixaflags & IXAF_SCOPEID_SET) { 1187 /* sin6_scope_id takes precedence over ixa_ifindex */ 1188 ASSERT(ixa->ixa_scopeid != 0); 1189 ill = ill_lookup_on_ifindex(ixa->ixa_scopeid, 1190 isv6, ipst); 1191 } else { 1192 ASSERT(ixa->ixa_ifindex != 0); 1193 ill = ill_lookup_on_ifindex(ixa->ixa_ifindex, 1194 isv6, ipst); 1195 } 1196 if (ill != NULL && IS_VNI(ill)) { 1197 ill_refrele(ill); 1198 ill = NULL; 1199 } 1200 if (ill == NULL) { 1201 if (errorp != NULL) 1202 *errorp = ENXIO; 1203 /* Get a hold on the IRE_NOROUTE */ 1204 ire = ire_reject(ipst, isv6); 1205 return (ire); 1206 } 1207 1208 match_args |= MATCH_IRE_ILL; 1209 1210 /* 1211 * icmp_send_reply_v6 uses scopeid, and mpathd sets IP*_BOUND_IF 1212 * so for both of them we need to be able look for an under 1213 * interface. 1214 */ 1215 if (IS_UNDER_IPMP(ill)) 1216 match_args |= MATCH_IRE_TESTHIDDEN; 1217 1218 /* 1219 * If we are doing the strictest src_multihoming, then 1220 * we check that IP_BOUND_IF, IP_PKTINFO, etc specify 1221 * an interface that is consistent with the source address. 
1222 */ 1223 if (src_multihoming == 2 && 1224 !ip_verify_src_on_ill(v6src, ill, ixa->ixa_zoneid)) { 1225 if (errorp != NULL) 1226 *errorp = EADDRNOTAVAIL; 1227 ill_refrele(ill); 1228 /* Get a hold on the IRE_NOROUTE */ 1229 ire = ire_reject(ipst, isv6); 1230 return (ire); 1231 } 1232 } else if (src_multihoming != 0 && verify_src) { 1233 /* Look up the ill based on the source address */ 1234 ill = ip_select_src_ill(&v6src, ixa->ixa_zoneid, ipst); 1235 if (ill == NULL) { 1236 char addrbuf[INET6_ADDRSTRLEN]; 1237 1238 ip3dbg(("%s not a valid src for unicast", 1239 inet_ntop(AF_INET6, &v6src, addrbuf, 1240 sizeof (addrbuf)))); 1241 if (errorp != NULL) 1242 *errorp = EADDRNOTAVAIL; 1243 /* Get a hold on the IRE_NOROUTE */ 1244 ire = ire_reject(ipst, isv6); 1245 return (ire); 1246 } 1247 match_args |= MATCH_IRE_SRC_ILL; 1248 preferred_src_aware = (src_multihoming == 1); 1249 } 1250 1251 if (ixaflags & IXAF_NEXTHOP_SET) { 1252 /* IP_NEXTHOP was set */ 1253 v6nexthop = ixa->ixa_nexthop_v6; 1254 } else { 1255 v6nexthop = *v6dst; 1256 } 1257 1258 ire_type = 0; 1259 1260 /* 1261 * If SO_DONTROUTE is set or if IP_NEXTHOP is set, then 1262 * we only look for an onlink IRE. 
1263 */ 1264 if (ixaflags & (IXAF_DONTROUTE|IXAF_NEXTHOP_SET)) { 1265 match_args |= MATCH_IRE_TYPE; 1266 ire_type = IRE_ONLINK; 1267 } 1268 1269 retry: 1270 if (!isv6) { 1271 ipaddr_t v4nexthop; 1272 ipaddr_t v4setsrc = INADDR_ANY; 1273 1274 IN6_V4MAPPED_TO_IPADDR(&v6nexthop, v4nexthop); 1275 ire = ire_route_recursive_v4(v4nexthop, ire_type, ill, 1276 ixa->ixa_zoneid, ixa->ixa_tsl, match_args, IRR_ALLOCATE, 1277 ixa->ixa_xmit_hint, ipst, &v4setsrc, NULL, generationp); 1278 if (setsrcp != NULL) 1279 IN6_IPADDR_TO_V4MAPPED(v4setsrc, setsrcp); 1280 } else { 1281 ire = ire_route_recursive_v6(&v6nexthop, ire_type, ill, 1282 ixa->ixa_zoneid, ixa->ixa_tsl, match_args, IRR_ALLOCATE, 1283 ixa->ixa_xmit_hint, ipst, setsrcp, NULL, generationp); 1284 } 1285 1286 #ifdef DEBUG 1287 if (match_args & MATCH_IRE_TESTHIDDEN) { 1288 ip3dbg(("looking for hidden; dst %x ire %p\n", 1289 v4dst, (void *)ire)); 1290 } 1291 #endif 1292 if (ill != NULL) { 1293 ill_refrele(ill); 1294 ill = NULL; 1295 } 1296 if ((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) || 1297 (ire->ire_type & IRE_MULTICAST)) { 1298 if (preferred_src_aware) { 1299 /* 1300 * "Preferred Source Aware" send mode. If we cannot 1301 * find an ire whose ire_ill had the desired source 1302 * address retry after relaxing the ill matching 1303 * constraint. 1304 */ 1305 ire_refrele(ire); 1306 preferred_src_aware = B_FALSE; 1307 match_args &= ~MATCH_IRE_SRC_ILL; 1308 goto retry; 1309 } 1310 /* No ire_nce_cache */ 1311 return (ire); 1312 } 1313 1314 /* Setup ire_nce_cache if it doesn't exist or is condemned. */ 1315 mutex_enter(&ire->ire_lock); 1316 nce = ire->ire_nce_cache; 1317 if (nce == NULL || nce->nce_is_condemned) { 1318 mutex_exit(&ire->ire_lock); 1319 (void) ire_revalidate_nce(ire); 1320 } else { 1321 mutex_exit(&ire->ire_lock); 1322 } 1323 return (ire); 1324 } 1325 1326 /* 1327 * Find a route given some xmit attributes and a packet. 1328 * Generic for IPv4 and IPv6 1329 * 1330 * This never returns NULL. 
But when it returns the IRE_NOROUTE 1331 * it might set errorp. 1332 */ 1333 ire_t * 1334 ip_select_route_pkt(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp, 1335 int *errorp, boolean_t *multirtp) 1336 { 1337 if (ixa->ixa_flags & IXAF_IS_IPV4) { 1338 ipha_t *ipha = (ipha_t *)mp->b_rptr; 1339 in6_addr_t v6dst, v6src; 1340 1341 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst); 1342 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6src); 1343 1344 return (ip_select_route(&v6dst, v6src, ixa, generationp, 1345 NULL, errorp, multirtp)); 1346 } else { 1347 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 1348 1349 return (ip_select_route(&ip6h->ip6_dst, ip6h->ip6_src, 1350 ixa, generationp, NULL, errorp, multirtp)); 1351 } 1352 } 1353 1354 ire_t * 1355 ip_select_route_v4(ipaddr_t dst, ipaddr_t src, ip_xmit_attr_t *ixa, 1356 uint_t *generationp, ipaddr_t *v4setsrcp, int *errorp, boolean_t *multirtp) 1357 { 1358 in6_addr_t v6dst, v6src; 1359 ire_t *ire; 1360 in6_addr_t setsrc; 1361 1362 ASSERT(ixa->ixa_flags & IXAF_IS_IPV4); 1363 1364 IN6_IPADDR_TO_V4MAPPED(dst, &v6dst); 1365 IN6_IPADDR_TO_V4MAPPED(src, &v6src); 1366 1367 setsrc = ipv6_all_zeros; 1368 ire = ip_select_route(&v6dst, v6src, ixa, generationp, &setsrc, errorp, 1369 multirtp); 1370 if (v4setsrcp != NULL) 1371 IN6_V4MAPPED_TO_IPADDR(&setsrc, *v4setsrcp); 1372 return (ire); 1373 } 1374 1375 /* 1376 * Recursively look for a route to the destination. Can also match on 1377 * the zoneid, ill, and label. Used for the data paths. See also 1378 * ire_route_recursive. 1379 * 1380 * If IRR_ALLOCATE is not set then we will only inspect the existing IREs; never 1381 * create an IRE_IF_CLONE. This is used on the receive side when we are not 1382 * forwarding. 1383 * If IRR_INCOMPLETE is set then we return the IRE even if we can't correctly 1384 * resolve the gateway. 1385 * 1386 * Note that this function never returns NULL. It returns an IRE_NOROUTE 1387 * instead. 
*
 * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it
 * is an error.
 * Allow at most one RTF_INDIRECT.
 *
 * If the caller passes a non-NULL ire it is used in place of the first
 * table lookup; an extra hold is taken on it here, and its generation is
 * taken from *generationp (or IRE_GENERATION_VERIFY if generationp is NULL).
 *
 * setsrcp and gwattrp are optional; when non-NULL they must point at
 * INADDR_ANY and NULL respectively on entry, and receive the first
 * RTF_SETSRC address and the first ire_gw_secattr found in the chain.
 *
 * On the error/cleanup exits *generationp (if non-NULL) is set to
 * IRE_GENERATION_VERIFY. On success it receives the generation computed
 * after the dependency chain has been built, and the returned ire
 * (ires[0]) keeps its hold for the caller.
 */
ire_t *
ire_route_recursive_impl_v4(ire_t *ire,
    ipaddr_t nexthop, uint_t ire_type, const ill_t *ill_arg,
    zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args,
    uint_t irr_flags, uint32_t xmit_hint, ip_stack_t *ipst, ipaddr_t *setsrcp,
    tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp)
{
	int		i, j;
	ire_t		*ires[MAX_IRE_RECURSION];
	uint_t		generation;
	uint_t		generations[MAX_IRE_RECURSION];
	boolean_t	need_refrele = B_FALSE;
	boolean_t	invalidate = B_FALSE;
	int		prefs[MAX_IRE_RECURSION];
	ill_t		*ill = NULL;

	if (setsrcp != NULL)
		ASSERT(*setsrcp == INADDR_ANY);
	if (gwattrp != NULL)
		ASSERT(*gwattrp == NULL);

	/*
	 * We iterate up to three times to resolve a route, even though
	 * we have four slots in the array. The extra slot is for an
	 * IRE_IF_CLONE we might need to create.
	 *
	 * Throughout the loop ires[0..i-1] are held IREs found so far and
	 * 'ire' is either NULL or a held IRE not yet stored in ires[].
	 */
	i = 0;
	while (i < MAX_IRE_RECURSION - 1) {
		/* ire_ftable_lookup handles round-robin/ECMP */
		if (ire == NULL) {
			ire = ire_ftable_lookup_v4(nexthop, 0, 0, ire_type,
			    (ill != NULL? ill : ill_arg), zoneid, tsl,
			    match_args, xmit_hint, ipst, &generation);
		} else {
			/* Caller passed it; extra hold since we will rele */
			ire_refhold(ire);
			if (generationp != NULL)
				generation = *generationp;
			else
				generation = IRE_GENERATION_VERIFY;
		}
		/* A failed lookup is turned into the (held) reject IRE */
		if (ire == NULL)
			ire = ire_reject(ipst, B_FALSE);

		/* Need to return the ire with RTF_REJECT|BLACKHOLE */
		if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))
			goto error;

		ASSERT(!(ire->ire_type & IRE_MULTICAST)); /* Not in ftable */

		if (i != 0) {
			prefs[i] = ire_pref(ire);
			/*
			 * Don't allow anything unusual past the first
			 * iteration.
			 * The ire_pref() value must also strictly increase
			 * from one level to the next.
			 */
			if ((ire->ire_type &
			    (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST)) ||
			    prefs[i] <= prefs[i-1]) {
				ire_refrele(ire);
				if (irr_flags & IRR_INCOMPLETE) {
					/*
					 * Hand back the first IRE; take a
					 * separate hold since cleanup
					 * releases every ires[] entry.
					 */
					ire = ires[0];
					ire_refhold(ire);
				} else {
					ire = ire_reject(ipst, B_FALSE);
				}
				goto error;
			}
		}
		/* We have a usable IRE */
		ires[i] = ire;
		generations[i] = generation;
		i++;

		/* The first RTF_SETSRC address is passed back if setsrcp */
		if ((ire->ire_flags & RTF_SETSRC) &&
		    setsrcp != NULL && *setsrcp == INADDR_ANY) {
			ASSERT(ire->ire_setsrc_addr != INADDR_ANY);
			*setsrcp = ire->ire_setsrc_addr;
		}

		/* The first ire_gw_secattr is passed back if gwattrp */
		if (ire->ire_gw_secattr != NULL &&
		    gwattrp != NULL && *gwattrp == NULL)
			*gwattrp = ire->ire_gw_secattr;

		/*
		 * Check if we have a short-cut pointer to an IRE for this
		 * destination, and that the cached dependency isn't stale.
		 * In that case we've rejoined an existing tree towards a
		 * parent, thus we don't need to continue the loop to
		 * discover the rest of the tree.
		 */
		mutex_enter(&ire->ire_lock);
		if (ire->ire_dep_parent != NULL &&
		    ire->ire_dep_parent->ire_generation ==
		    ire->ire_dep_parent_generation) {
			mutex_exit(&ire->ire_lock);
			/* The hold now lives in ires[]; 'done' expects NULL */
			ire = NULL;
			goto done;
		}
		mutex_exit(&ire->ire_lock);

		/*
		 * If this type should have an ire_nce_cache (even if it
		 * doesn't yet have one) then we are done. Includes
		 * IRE_INTERFACE with a full 32 bit mask.
		 */
		if (ire->ire_nce_capable) {
			ire = NULL;
			goto done;
		}
		ASSERT(!(ire->ire_type & IRE_IF_CLONE));
		/*
		 * For an IRE_INTERFACE we create an IRE_IF_CLONE for this
		 * particular destination
		 */
		if (ire->ire_type & IRE_INTERFACE) {
			in6_addr_t	v6nexthop;
			ire_t		*clone;

			ASSERT(ire->ire_masklen != IPV4_ABITS);

			/*
			 * In the case of ip_input and ILLF_FORWARDING not
			 * being set, and in the case of RTM_GET, there is
			 * no point in allocating an IRE_IF_CLONE. We return
			 * the IRE_INTERFACE. Note that !IRR_ALLOCATE can
			 * result in a ire_dep_parent which is IRE_IF_*
			 * without an IRE_IF_CLONE.
			 * We recover from that when we need to send packets
			 * by ensuring that the generations become
			 * IRE_GENERATION_VERIFY in this case.
			 */
			if (!(irr_flags & IRR_ALLOCATE)) {
				invalidate = B_TRUE;
				ire = NULL;
				goto done;
			}

			IN6_IPADDR_TO_V4MAPPED(nexthop, &v6nexthop);

			clone = ire_create_if_clone(ire, &v6nexthop,
			    &generation);
			if (clone == NULL) {
				/*
				 * Temporary failure - no memory.
				 * Don't want caller to cache IRE_NOROUTE.
				 */
				invalidate = B_TRUE;
				ire = ire_blackhole(ipst, B_FALSE);
				goto error;
			}
			/*
			 * Make clone next to last entry and the
			 * IRE_INTERFACE the last in the dependency
			 * chain since the clone depends on the
			 * IRE_INTERFACE.
			 */
			ASSERT(i >= 1);
			ASSERT(i < MAX_IRE_RECURSION);

			ires[i] = ires[i-1];
			generations[i] = generations[i-1];
			ires[i-1] = clone;
			generations[i-1] = generation;
			i++;

			ire = NULL;
			goto done;
		}

		/*
		 * We only match on the type and optionally ILL when
		 * recursing. The type match is used by some callers
		 * to exclude certain types (such as IRE_IF_CLONE or
		 * IRE_LOCAL|IRE_LOOPBACK).
		 *
		 * In the MATCH_IRE_SRC_ILL case, ill_arg may be the 'srcof'
		 * ire->ire_ill, and we want to find the IRE_INTERFACE for
		 * ire_ill, so we set ill to the ire_ill;
		 */
		match_args &= MATCH_IRE_TYPE;
		/* The next iteration resolves this IRE's gateway */
		nexthop = ire->ire_gateway_addr;
		if (ill == NULL && ire->ire_ill != NULL) {
			ill = ire->ire_ill;
			need_refrele = B_TRUE;
			ill_refhold(ill);
			match_args |= MATCH_IRE_ILL;
		}
		/*
		 * We set the prefs[i] value above if i > 0. We've already
		 * done i++ so i is one in the case of the first time around.
		 */
		if (i == 1)
			prefs[0] = ire_pref(ire);
		ire = NULL;
	}
	/* Ran out of iterations without resolving the route */
	ASSERT(ire == NULL);
	ire = ire_reject(ipst, B_FALSE);

error:
	/* 'ire' is the held IRE to return; ires[0..i-1] are still held */
	ASSERT(ire != NULL);
	if (need_refrele)
		ill_refrele(ill);

	/*
	 * In the case of MULTIRT we want to try a different IRE the next
	 * time. We let the next packet retry in that case.
	 */
	if (i > 0 && (ires[0]->ire_flags & RTF_MULTIRT))
		(void) ire_no_good(ires[0]);

cleanup:
	/* cleanup ires[i] */
	ire_dep_unbuild(ires, i);
	for (j = 0; j < i; j++)
		ire_refrele(ires[j]);

	ASSERT((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
	    (irr_flags & IRR_INCOMPLETE));
	/*
	 * Use IRE_GENERATION_VERIFY to ensure that ip_output will redo the
	 * ip_select_route since the reject or lack of memory might be gone.
	 */
	if (generationp != NULL)
		*generationp = IRE_GENERATION_VERIFY;
	return (ire);

done:
	/* Success exit: every hold we took is recorded in ires[0..i-1] */
	ASSERT(ire == NULL);
	if (need_refrele) {
		ill_refrele(ill);
		/* 'cleanup' does not release ill, so this is the only rele */
		ill = NULL;
	}

	/* Build dependencies */
	if (i > 1 && !ire_dep_build(ires, generations, i)) {
		/* Something in chain was condemned; tear it apart */
		ire = ire_reject(ipst, B_FALSE);
		goto cleanup;
	}

	/*
	 * Release all refholds except the one for ires[0] that we
	 * will return to the caller.
	 */
	for (j = 1; j < i; j++)
		ire_refrele(ires[j]);

	if (invalidate) {
		/*
		 * Since we needed to allocate but couldn't we need to make
		 * sure that the dependency chain is rebuilt the next time.
		 */
		ire_dep_invalidate_generations(ires[0]);
		generation = IRE_GENERATION_VERIFY;
	} else {
		/*
		 * IREs can have been added or deleted while we did the
		 * recursive lookup and we can't catch those until we've built
		 * the dependencies. We verify the stored
		 * ire_dep_parent_generation to catch any such changes and
		 * return IRE_GENERATION_VERIFY (which will cause
		 * ip_select_route to be called again so we can redo the
		 * recursive lookup next time we send a packet).
		 */
		if (ires[0]->ire_dep_parent == NULL)
			generation = ires[0]->ire_generation;
		else
			generation = ire_dep_validate_generations(ires[0]);
		if (generations[0] != ires[0]->ire_generation) {
			/* Something changed at the top */
			generation = IRE_GENERATION_VERIFY;
		}
	}
	if (generationp != NULL)
		*generationp = generation;

	return (ires[0]);
}

/*
 * Recursive route lookup starting from the forwarding table: calls
 * ire_route_recursive_impl_v4() with a NULL starting ire and passes every
 * other argument through unchanged.
 */
ire_t *
ire_route_recursive_v4(ipaddr_t nexthop, uint_t ire_type, const ill_t *ill,
    zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args,
    uint_t irr_flags, uint32_t xmit_hint, ip_stack_t *ipst, ipaddr_t *setsrcp,
    tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp)
{
	return (ire_route_recursive_impl_v4(NULL, nexthop, ire_type, ill,
	    zoneid, tsl, match_args, irr_flags, xmit_hint, ipst, setsrcp,
	    gwattrp, generationp));
}

/*
 * Recursively look for a route to the destination.
 * We only handle a destination match here, yet we have the same arguments
 * as the full match to allow function pointers to select between the two.
 *
 * Note that this function never returns NULL. It returns an IRE_NOROUTE
 * instead.
1693 * 1694 * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it 1695 * is an error. 1696 * Allow at most one RTF_INDIRECT. 1697 */ 1698 ire_t * 1699 ire_route_recursive_dstonly_v4(ipaddr_t nexthop, uint_t irr_flags, 1700 uint32_t xmit_hint, ip_stack_t *ipst) 1701 { 1702 ire_t *ire; 1703 ire_t *ire1; 1704 uint_t generation; 1705 1706 /* ire_ftable_lookup handles round-robin/ECMP */ 1707 ire = ire_ftable_lookup_simple_v4(nexthop, xmit_hint, ipst, 1708 &generation); 1709 ASSERT(ire != NULL); 1710 1711 /* 1712 * If this type should have an ire_nce_cache (even if it 1713 * doesn't yet have one) then we are done. Includes 1714 * IRE_INTERFACE with a full 32 bit mask. 1715 */ 1716 if (ire->ire_nce_capable) 1717 return (ire); 1718 1719 /* 1720 * If the IRE has a current cached parent we know that the whole 1721 * parent chain is current, hence we don't need to discover and 1722 * build any dependencies by doing a recursive lookup. 1723 */ 1724 mutex_enter(&ire->ire_lock); 1725 if (ire->ire_dep_parent != NULL && 1726 ire->ire_dep_parent->ire_generation == 1727 ire->ire_dep_parent_generation) { 1728 mutex_exit(&ire->ire_lock); 1729 return (ire); 1730 } 1731 mutex_exit(&ire->ire_lock); 1732 1733 /* 1734 * Fallback to loop in the normal code starting with the ire 1735 * we found. Normally this would return the same ire. 1736 */ 1737 ire1 = ire_route_recursive_impl_v4(ire, nexthop, 0, NULL, ALL_ZONES, 1738 NULL, MATCH_IRE_DSTONLY, irr_flags, xmit_hint, ipst, NULL, NULL, 1739 &generation); 1740 ire_refrele(ire); 1741 return (ire1); 1742 } 1743