1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. 23 */ 24 25 /* 26 * This file contains consumer routines of the IPv4 forwarding engine 27 */ 28 29 #include <sys/types.h> 30 #include <sys/stream.h> 31 #include <sys/stropts.h> 32 #include <sys/strlog.h> 33 #include <sys/dlpi.h> 34 #include <sys/ddi.h> 35 #include <sys/cmn_err.h> 36 #include <sys/policy.h> 37 38 #include <sys/systm.h> 39 #include <sys/strsun.h> 40 #include <sys/kmem.h> 41 #include <sys/param.h> 42 #include <sys/socket.h> 43 #include <sys/strsubr.h> 44 #include <net/if.h> 45 #include <net/route.h> 46 #include <netinet/in.h> 47 #include <net/if_dl.h> 48 #include <netinet/ip6.h> 49 #include <netinet/icmp6.h> 50 51 #include <inet/ipsec_impl.h> 52 #include <inet/common.h> 53 #include <inet/mi.h> 54 #include <inet/mib2.h> 55 #include <inet/ip.h> 56 #include <inet/ip_impl.h> 57 #include <inet/ip6.h> 58 #include <inet/ip_ndp.h> 59 #include <inet/arp.h> 60 #include <inet/ip_if.h> 61 #include <inet/ip_ire.h> 62 #include <inet/ip_ftable.h> 63 #include <inet/ip_rts.h> 64 #include <inet/nd.h> 65 66 #include <net/pfkeyv2.h> 67 #include 
<inet/sadb.h> 68 #include <inet/tcp.h> 69 #include <inet/ipclassifier.h> 70 #include <sys/zone.h> 71 #include <net/radix.h> 72 #include <sys/tsol/label.h> 73 #include <sys/tsol/tnet.h> 74 75 #define IS_DEFAULT_ROUTE(ire) \ 76 (((ire)->ire_type & IRE_DEFAULT) || \ 77 (((ire)->ire_type & IRE_INTERFACE) && ((ire)->ire_addr == 0))) 78 79 #define IP_SRC_MULTIHOMING(isv6, ipst) \ 80 (isv6 ? ipst->ips_ipv6_strict_src_multihoming : \ 81 ipst->ips_ip_strict_src_multihoming) 82 83 static ire_t *route_to_dst(const struct sockaddr *, zoneid_t, ip_stack_t *); 84 static void ire_del_host_redir(ire_t *, char *); 85 static boolean_t ire_find_best_route(struct radix_node *, void *); 86 87 /* 88 * Lookup a route in forwarding table. A specific lookup is indicated by 89 * passing the required parameters and indicating the match required in the 90 * flag field. 91 * 92 * Supports IP_BOUND_IF by following the ipif/ill when recursing. 93 */ 94 ire_t * 95 ire_ftable_lookup_v4(ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway, 96 int type, const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl, 97 int flags, uint32_t xmit_hint, ip_stack_t *ipst, uint_t *generationp) 98 { 99 ire_t *ire; 100 struct rt_sockaddr rdst, rmask; 101 struct rt_entry *rt; 102 ire_ftable_args_t margs; 103 104 ASSERT(ill == NULL || !ill->ill_isv6); 105 106 /* 107 * ire_match_args() will dereference ill if MATCH_IRE_ILL 108 * is set. 
109 */ 110 if ((flags & (MATCH_IRE_ILL|MATCH_IRE_SRC_ILL)) && (ill == NULL)) 111 return (NULL); 112 113 bzero(&rdst, sizeof (rdst)); 114 rdst.rt_sin_len = sizeof (rdst); 115 rdst.rt_sin_family = AF_INET; 116 rdst.rt_sin_addr.s_addr = addr; 117 118 bzero(&rmask, sizeof (rmask)); 119 rmask.rt_sin_len = sizeof (rmask); 120 rmask.rt_sin_family = AF_INET; 121 rmask.rt_sin_addr.s_addr = mask; 122 123 bzero(&margs, sizeof (margs)); 124 margs.ift_addr = addr; 125 margs.ift_mask = mask; 126 margs.ift_gateway = gateway; 127 margs.ift_type = type; 128 margs.ift_ill = ill; 129 margs.ift_zoneid = zoneid; 130 margs.ift_tsl = tsl; 131 margs.ift_flags = flags; 132 133 /* 134 * The flags argument passed to ire_ftable_lookup may cause the 135 * search to return, not the longest matching prefix, but the 136 * "best matching prefix", i.e., the longest prefix that also 137 * satisfies constraints imposed via the permutation of flags 138 * passed in. To achieve this, we invoke ire_match_args() on 139 * each matching leaf in the radix tree. ire_match_args is 140 * invoked by the callback function ire_find_best_route() 141 * We hold the global tree lock in read mode when calling 142 * rn_match_args. Before dropping the global tree lock, ensure 143 * that the radix node can't be deleted by incrementing ire_refcnt. 144 */ 145 RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); 146 rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst, 147 ipst->ips_ip_ftable, ire_find_best_route, &margs); 148 ire = margs.ift_best_ire; 149 if (rt == NULL) { 150 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 151 return (NULL); 152 } 153 ASSERT(ire != NULL); 154 155 DTRACE_PROBE2(ire__found, ire_ftable_args_t *, &margs, ire_t *, ire); 156 157 /* 158 * round-robin only if we have more than one route in the bucket. 
159 * ips_ip_ecmp_behavior controls when we do ECMP 160 * 2: always 161 * 1: for IRE_DEFAULT and /0 IRE_INTERFACE 162 * 0: never 163 */ 164 if (ire->ire_bucket->irb_ire_cnt > 1 && !(flags & MATCH_IRE_GW)) { 165 if (ipst->ips_ip_ecmp_behavior == 2 || 166 (ipst->ips_ip_ecmp_behavior == 1 && 167 IS_DEFAULT_ROUTE(ire))) { 168 ire_t *next_ire; 169 170 margs.ift_best_ire = NULL; 171 next_ire = ire_round_robin(ire->ire_bucket, &margs, 172 xmit_hint, ire, ipst); 173 if (next_ire == NULL) { 174 /* keep ire if next_ire is null */ 175 goto done; 176 } 177 ire_refrele(ire); 178 ire = next_ire; 179 } 180 } 181 182 done: 183 /* Return generation before dropping lock */ 184 if (generationp != NULL) 185 *generationp = ire->ire_generation; 186 187 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 188 189 /* 190 * For shared-IP zones we need additional checks to what was 191 * done in ire_match_args to make sure IRE_LOCALs are handled. 192 * 193 * When ip_restrict_interzone_loopback is set, then 194 * we ensure that IRE_LOCAL are only used for loopback 195 * between zones when the logical "Ethernet" would 196 * have looped them back. That is, if in the absense of 197 * the IRE_LOCAL we would have sent to packet out the 198 * same ill. 199 */ 200 if ((ire->ire_type & IRE_LOCAL) && zoneid != ALL_ZONES && 201 ire->ire_zoneid != zoneid && ire->ire_zoneid != ALL_ZONES && 202 ipst->ips_ip_restrict_interzone_loopback) { 203 ire = ire_alt_local(ire, zoneid, tsl, ill, generationp); 204 ASSERT(ire != NULL); 205 } 206 return (ire); 207 } 208 209 /* 210 * This function is called by 211 * ip_input/ire_route_recursive when doing a route lookup on only the 212 * destination address. 213 * 214 * The optimizations of this function over ire_ftable_lookup are: 215 * o removing unnecessary flag matching 216 * o doing longest prefix match instead of overloading it further 217 * with the unnecessary "best_prefix_match" 218 * 219 * If no route is found we return IRE_NOROUTE. 
220 */ 221 ire_t * 222 ire_ftable_lookup_simple_v4(ipaddr_t addr, uint32_t xmit_hint, ip_stack_t *ipst, 223 uint_t *generationp) 224 { 225 ire_t *ire; 226 struct rt_sockaddr rdst; 227 struct rt_entry *rt; 228 irb_t *irb; 229 230 rdst.rt_sin_len = sizeof (rdst); 231 rdst.rt_sin_family = AF_INET; 232 rdst.rt_sin_addr.s_addr = addr; 233 234 /* 235 * This is basically inlining a simpler version of ire_match_args 236 */ 237 RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); 238 239 rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst, 240 ipst->ips_ip_ftable, NULL, NULL); 241 242 if (rt == NULL) 243 goto bad; 244 245 irb = &rt->rt_irb; 246 if (irb->irb_ire_cnt == 0) 247 goto bad; 248 249 rw_enter(&irb->irb_lock, RW_READER); 250 ire = irb->irb_ire; 251 if (ire == NULL) { 252 rw_exit(&irb->irb_lock); 253 goto bad; 254 } 255 while (IRE_IS_CONDEMNED(ire)) { 256 ire = ire->ire_next; 257 if (ire == NULL) { 258 rw_exit(&irb->irb_lock); 259 goto bad; 260 } 261 } 262 263 /* we have a ire that matches */ 264 ire_refhold(ire); 265 rw_exit(&irb->irb_lock); 266 267 /* 268 * round-robin only if we have more than one route in the bucket. 269 * ips_ip_ecmp_behavior controls when we do ECMP 270 * 2: always 271 * 1: for IRE_DEFAULT and /0 IRE_INTERFACE 272 * 0: never 273 * 274 * Note: if we found an IRE_IF_CLONE we won't look at the bucket with 275 * other ECMP IRE_INTERFACEs since the IRE_IF_CLONE is a /128 match 276 * and the IRE_INTERFACESs are likely to be shorter matches. 
277 */ 278 if (ire->ire_bucket->irb_ire_cnt > 1) { 279 if (ipst->ips_ip_ecmp_behavior == 2 || 280 (ipst->ips_ip_ecmp_behavior == 1 && 281 IS_DEFAULT_ROUTE(ire))) { 282 ire_t *next_ire; 283 ire_ftable_args_t margs; 284 285 bzero(&margs, sizeof (margs)); 286 margs.ift_addr = addr; 287 margs.ift_zoneid = ALL_ZONES; 288 289 next_ire = ire_round_robin(ire->ire_bucket, &margs, 290 xmit_hint, ire, ipst); 291 if (next_ire == NULL) { 292 /* keep ire if next_ire is null */ 293 if (generationp != NULL) 294 *generationp = ire->ire_generation; 295 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 296 return (ire); 297 } 298 ire_refrele(ire); 299 ire = next_ire; 300 } 301 } 302 /* Return generation before dropping lock */ 303 if (generationp != NULL) 304 *generationp = ire->ire_generation; 305 306 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 307 308 /* 309 * Since we only did ALL_ZONES matches there is no special handling 310 * of IRE_LOCALs needed here. ire_ftable_lookup_v4 has to handle that. 311 */ 312 return (ire); 313 314 bad: 315 if (generationp != NULL) 316 *generationp = IRE_GENERATION_VERIFY; 317 318 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 319 return (ire_reject(ipst, B_FALSE)); 320 } 321 322 /* 323 * Find the ill matching a multicast group. 324 * Allows different routes for multicast addresses 325 * in the unicast routing table (akin to 224.0.0.0 but could be more specific) 326 * which point at different interfaces. This is used when IP_MULTICAST_IF 327 * isn't specified (when sending) and when IP_ADD_MEMBERSHIP doesn't 328 * specify the interface to join on. 329 * 330 * Supports link-local addresses by using ire_route_recursive which follows 331 * the ill when recursing. 332 * 333 * To handle CGTP, since we don't have a separate IRE_MULTICAST for each group 334 * and the MULTIRT property can be different for different groups, we 335 * extract RTF_MULTIRT from the special unicast route added for a group 336 * with CGTP and pass that back in the multirtp argument. 
337 * This is used in ip_set_destination etc to set ixa_postfragfn for multicast. 338 * We have a setsrcp argument for the same reason. 339 */ 340 ill_t * 341 ire_lookup_multi_ill_v4(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst, 342 boolean_t *multirtp, ipaddr_t *setsrcp) 343 { 344 ire_t *ire; 345 ill_t *ill; 346 347 ire = ire_route_recursive_v4(group, 0, NULL, zoneid, NULL, 348 MATCH_IRE_DSTONLY, IRR_NONE, 0, ipst, setsrcp, NULL, NULL); 349 ASSERT(ire != NULL); 350 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 351 ire_refrele(ire); 352 return (NULL); 353 } 354 355 if (multirtp != NULL) 356 *multirtp = (ire->ire_flags & RTF_MULTIRT) != 0; 357 358 ill = ire_nexthop_ill(ire); 359 ire_refrele(ire); 360 return (ill); 361 } 362 363 /* 364 * Delete the passed in ire if the gateway addr matches 365 */ 366 void 367 ire_del_host_redir(ire_t *ire, char *gateway) 368 { 369 if ((ire->ire_flags & RTF_DYNAMIC) && 370 (ire->ire_gateway_addr == *(ipaddr_t *)gateway)) 371 ire_delete(ire); 372 } 373 374 /* 375 * Search for all IRE_HOST RTF_DYNAMIC (aka redirect) routes that are 376 * pointing at the specified gateway and 377 * delete them. This routine is called only 378 * when a default gateway is going away. 379 */ 380 void 381 ire_delete_host_redirects(ipaddr_t gateway, ip_stack_t *ipst) 382 { 383 struct rtfuncarg rtfarg; 384 385 bzero(&rtfarg, sizeof (rtfarg)); 386 rtfarg.rt_func = ire_del_host_redir; 387 rtfarg.rt_arg = (void *)&gateway; 388 rtfarg.rt_zoneid = ALL_ZONES; 389 rtfarg.rt_ipst = ipst; 390 (void) ipst->ips_ip_ftable->rnh_walktree_mt(ipst->ips_ip_ftable, 391 rtfunc, &rtfarg, irb_refhold_rn, irb_refrele_rn); 392 } 393 394 /* 395 * Obtain the rt_entry and rt_irb for the route to be added to 396 * the ips_ip_ftable. 397 * First attempt to add a node to the radix tree via rn_addroute. If the 398 * route already exists, return the bucket for the existing route. 
399 * 400 * Locking notes: Need to hold the global radix tree lock in write mode to 401 * add a radix node. To prevent the node from being deleted, ire_get_bucket() 402 * returns with a ref'ed irb_t. The ire itself is added in ire_add_v4() 403 * while holding the irb_lock, but not the radix tree lock. 404 */ 405 irb_t * 406 ire_get_bucket(ire_t *ire) 407 { 408 struct radix_node *rn; 409 struct rt_entry *rt; 410 struct rt_sockaddr rmask, rdst; 411 irb_t *irb = NULL; 412 ip_stack_t *ipst = ire->ire_ipst; 413 414 ASSERT(ipst->ips_ip_ftable != NULL); 415 416 /* first try to see if route exists (based on rtalloc1) */ 417 bzero(&rdst, sizeof (rdst)); 418 rdst.rt_sin_len = sizeof (rdst); 419 rdst.rt_sin_family = AF_INET; 420 rdst.rt_sin_addr.s_addr = ire->ire_addr; 421 422 bzero(&rmask, sizeof (rmask)); 423 rmask.rt_sin_len = sizeof (rmask); 424 rmask.rt_sin_family = AF_INET; 425 rmask.rt_sin_addr.s_addr = ire->ire_mask; 426 427 /* 428 * add the route. based on BSD's rtrequest1(RTM_ADD) 429 */ 430 R_Malloc(rt, rt_entry_cache, sizeof (*rt)); 431 /* kmem_alloc failed */ 432 if (rt == NULL) 433 return (NULL); 434 435 bzero(rt, sizeof (*rt)); 436 rt->rt_nodes->rn_key = (char *)&rt->rt_dst; 437 rt->rt_dst = rdst; 438 irb = &rt->rt_irb; 439 irb->irb_marks |= IRB_MARK_DYNAMIC; /* dynamically allocated/freed */ 440 irb->irb_ipst = ipst; 441 rw_init(&irb->irb_lock, NULL, RW_DEFAULT, NULL); 442 RADIX_NODE_HEAD_WLOCK(ipst->ips_ip_ftable); 443 rn = ipst->ips_ip_ftable->rnh_addaddr(&rt->rt_dst, &rmask, 444 ipst->ips_ip_ftable, (struct radix_node *)rt); 445 if (rn == NULL) { 446 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 447 Free(rt, rt_entry_cache); 448 rt = NULL; 449 irb = NULL; 450 RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); 451 rn = ipst->ips_ip_ftable->rnh_lookup(&rdst, &rmask, 452 ipst->ips_ip_ftable); 453 if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { 454 /* found a non-root match */ 455 rt = (struct rt_entry *)rn; 456 } 457 } 458 if (rt != NULL) { 459 irb = 
&rt->rt_irb; 460 irb_refhold(irb); 461 } 462 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 463 return (irb); 464 } 465 466 /* 467 * This function is used when the caller wants to know the outbound 468 * interface for a packet given only the address. 469 * If this is a offlink IP address and there are multiple 470 * routes to this destination, this routine will utilise the 471 * first route it finds to IP address 472 * Return values: 473 * 0 - FAILURE 474 * nonzero - ifindex 475 */ 476 uint_t 477 ifindex_lookup(const struct sockaddr *ipaddr, zoneid_t zoneid) 478 { 479 uint_t ifindex = 0; 480 ire_t *ire; 481 ill_t *ill; 482 netstack_t *ns; 483 ip_stack_t *ipst; 484 485 if (zoneid == ALL_ZONES) 486 ns = netstack_find_by_zoneid(GLOBAL_ZONEID); 487 else 488 ns = netstack_find_by_zoneid(zoneid); 489 ASSERT(ns != NULL); 490 491 /* 492 * For exclusive stacks we set the zoneid to zero 493 * since IP uses the global zoneid in the exclusive stacks. 494 */ 495 if (ns->netstack_stackid != GLOBAL_NETSTACKID) 496 zoneid = GLOBAL_ZONEID; 497 ipst = ns->netstack_ip; 498 499 ASSERT(ipaddr->sa_family == AF_INET || ipaddr->sa_family == AF_INET6); 500 501 if ((ire = route_to_dst(ipaddr, zoneid, ipst)) != NULL) { 502 ill = ire_nexthop_ill(ire); 503 if (ill != NULL) { 504 ifindex = ill->ill_phyint->phyint_ifindex; 505 ill_refrele(ill); 506 } 507 ire_refrele(ire); 508 } 509 netstack_rele(ns); 510 return (ifindex); 511 } 512 513 /* 514 * Routine to find the route to a destination. 
If a ifindex is supplied 515 * it tries to match the route to the corresponding ipif for the ifindex 516 */ 517 static ire_t * 518 route_to_dst(const struct sockaddr *dst_addr, zoneid_t zoneid, ip_stack_t *ipst) 519 { 520 ire_t *ire = NULL; 521 int match_flags; 522 523 match_flags = MATCH_IRE_DSTONLY; 524 525 /* XXX pass NULL tsl for now */ 526 527 if (dst_addr->sa_family == AF_INET) { 528 ire = ire_route_recursive_v4( 529 ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr, 0, NULL, 530 zoneid, NULL, match_flags, IRR_ALLOCATE, 0, ipst, NULL, 531 NULL, NULL); 532 } else { 533 ire = ire_route_recursive_v6( 534 &((struct sockaddr_in6 *)dst_addr)->sin6_addr, 0, NULL, 535 zoneid, NULL, match_flags, IRR_ALLOCATE, 0, ipst, NULL, 536 NULL, NULL); 537 } 538 ASSERT(ire != NULL); 539 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 540 ire_refrele(ire); 541 return (NULL); 542 } 543 return (ire); 544 } 545 546 /* 547 * This routine is called by IP Filter to send a packet out on the wire 548 * to a specified dstination (which may be onlink or offlink). The ifindex may 549 * or may not be 0. A non-null ifindex indicates IP Filter has stipulated 550 * an outgoing interface and requires the nexthop to be on that interface. 551 * IP WILL NOT DO the following to the data packet before sending it out: 552 * a. manipulate ttl 553 * b. ipsec work 554 * c. fragmentation 555 * 556 * If the packet has been prepared for hardware checksum then it will be 557 * passed off to ip_send_align_cksum() to check that the flags set on the 558 * packet are in alignment with the capabilities of the new outgoing NIC. 559 * 560 * Return values: 561 * 0: IP was able to send of the data pkt 562 * ECOMM: Could not send packet 563 * ENONET No route to dst. It is up to the caller 564 * to send icmp unreachable error message, 565 * EINPROGRESS The macaddr of the onlink dst or that 566 * of the offlink dst's nexthop needs to get 567 * resolved before packet can be sent to dst. 
568 * Thus transmission is not guaranteed. 569 * Note: No longer have visibility to the ARP queue 570 * hence no EINPROGRESS. 571 */ 572 int 573 ipfil_sendpkt(const struct sockaddr *dst_addr, mblk_t *mp, uint_t ifindex, 574 zoneid_t zoneid) 575 { 576 ipaddr_t nexthop; 577 netstack_t *ns; 578 ip_stack_t *ipst; 579 ip_xmit_attr_t ixas; 580 int error; 581 582 ASSERT(mp != NULL); 583 584 if (zoneid == ALL_ZONES) 585 ns = netstack_find_by_zoneid(GLOBAL_ZONEID); 586 else 587 ns = netstack_find_by_zoneid(zoneid); 588 ASSERT(ns != NULL); 589 590 /* 591 * For exclusive stacks we set the zoneid to zero 592 * since IP uses the global zoneid in the exclusive stacks. 593 */ 594 if (ns->netstack_stackid != GLOBAL_NETSTACKID) 595 zoneid = GLOBAL_ZONEID; 596 ipst = ns->netstack_ip; 597 598 ASSERT(dst_addr->sa_family == AF_INET || 599 dst_addr->sa_family == AF_INET6); 600 601 bzero(&ixas, sizeof (ixas)); 602 /* 603 * No IPsec, no fragmentation, and don't let any hooks see 604 * the packet. 605 */ 606 ixas.ixa_flags = IXAF_NO_IPSEC | IXAF_DONTFRAG | IXAF_NO_PFHOOK; 607 ixas.ixa_cred = kcred; 608 ixas.ixa_cpid = NOPID; 609 ixas.ixa_tsl = NULL; 610 ixas.ixa_ipst = ipst; 611 ixas.ixa_ifindex = ifindex; 612 613 if (dst_addr->sa_family == AF_INET) { 614 ipha_t *ipha = (ipha_t *)mp->b_rptr; 615 616 ixas.ixa_flags |= IXAF_IS_IPV4; 617 nexthop = ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr; 618 if (nexthop != ipha->ipha_dst) { 619 ixas.ixa_flags |= IXAF_NEXTHOP_SET; 620 ixas.ixa_nexthop_v4 = nexthop; 621 } 622 ixas.ixa_multicast_ttl = ipha->ipha_ttl; 623 } else { 624 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 625 in6_addr_t *nexthop6; 626 627 nexthop6 = &((struct sockaddr_in6 *)dst_addr)->sin6_addr; 628 if (!IN6_ARE_ADDR_EQUAL(nexthop6, &ip6h->ip6_dst)) { 629 ixas.ixa_flags |= IXAF_NEXTHOP_SET; 630 ixas.ixa_nexthop_v6 = *nexthop6; 631 } 632 ixas.ixa_multicast_ttl = ip6h->ip6_hops; 633 } 634 error = ip_output_simple(mp, &ixas); 635 ixa_cleanup(&ixas); 636 637 netstack_rele(ns); 638 switch 
(error) { 639 case 0: 640 break; 641 642 case EHOSTUNREACH: 643 case ENETUNREACH: 644 error = ENONET; 645 break; 646 647 default: 648 error = ECOMM; 649 break; 650 } 651 return (error); 652 } 653 654 /* 655 * callback function provided by ire_ftable_lookup when calling 656 * rn_match_args(). Invoke ire_match_args on each matching leaf node in 657 * the radix tree. 658 */ 659 boolean_t 660 ire_find_best_route(struct radix_node *rn, void *arg) 661 { 662 struct rt_entry *rt = (struct rt_entry *)rn; 663 irb_t *irb_ptr; 664 ire_t *ire; 665 ire_ftable_args_t *margs = arg; 666 ipaddr_t match_mask; 667 668 ASSERT(rt != NULL); 669 670 irb_ptr = &rt->rt_irb; 671 672 if (irb_ptr->irb_ire_cnt == 0) 673 return (B_FALSE); 674 675 rw_enter(&irb_ptr->irb_lock, RW_READER); 676 for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) { 677 if (IRE_IS_CONDEMNED(ire)) 678 continue; 679 ASSERT((margs->ift_flags & MATCH_IRE_SHORTERMASK) == 0); 680 if (margs->ift_flags & MATCH_IRE_MASK) 681 match_mask = margs->ift_mask; 682 else 683 match_mask = ire->ire_mask; 684 685 if (ire_match_args(ire, margs->ift_addr, match_mask, 686 margs->ift_gateway, margs->ift_type, margs->ift_ill, 687 margs->ift_zoneid, margs->ift_tsl, 688 margs->ift_flags)) { 689 ire_refhold(ire); 690 rw_exit(&irb_ptr->irb_lock); 691 margs->ift_best_ire = ire; 692 return (B_TRUE); 693 } 694 } 695 rw_exit(&irb_ptr->irb_lock); 696 return (B_FALSE); 697 } 698 699 /* 700 * ftable irb_t structures are dynamically allocated, and we need to 701 * check if the irb_t (and associated ftable tree attachment) needs to 702 * be cleaned up when the irb_refcnt goes to 0. The conditions that need 703 * be verified are: 704 * - no other walkers of the irebucket, i.e., quiescent irb_refcnt, 705 * - no other threads holding references to ire's in the bucket, 706 * i.e., irb_nire == 0 707 * - no active ire's in the bucket, i.e., irb_ire_cnt == 0 708 * - need to hold the global tree lock and irb_lock in write mode. 
709 */ 710 void 711 irb_refrele_ftable(irb_t *irb) 712 { 713 for (;;) { 714 rw_enter(&irb->irb_lock, RW_WRITER); 715 ASSERT(irb->irb_refcnt != 0); 716 if (irb->irb_refcnt != 1) { 717 /* 718 * Someone has a reference to this radix node 719 * or there is some bucket walker. 720 */ 721 irb->irb_refcnt--; 722 rw_exit(&irb->irb_lock); 723 return; 724 } else { 725 /* 726 * There is no other walker, nor is there any 727 * other thread that holds a direct ref to this 728 * radix node. Do the clean up if needed. Call 729 * to ire_unlink will clear the IRB_MARK_CONDEMNED flag 730 */ 731 if (irb->irb_marks & IRB_MARK_CONDEMNED) { 732 ire_t *ire_list; 733 734 ire_list = ire_unlink(irb); 735 rw_exit(&irb->irb_lock); 736 737 if (ire_list != NULL) 738 ire_cleanup(ire_list); 739 /* 740 * more CONDEMNED entries could have 741 * been added while we dropped the lock, 742 * so we have to re-check. 743 */ 744 continue; 745 } 746 747 /* 748 * Now check if there are still any ires 749 * associated with this radix node. 750 */ 751 if (irb->irb_nire != 0) { 752 /* 753 * someone is still holding on 754 * to ires in this bucket 755 */ 756 irb->irb_refcnt--; 757 rw_exit(&irb->irb_lock); 758 return; 759 } else { 760 /* 761 * Everything is clear. Zero walkers, 762 * Zero threads with a ref to this 763 * radix node, Zero ires associated with 764 * this radix node. Due to lock order, 765 * check the above conditions again 766 * after grabbing all locks in the right order 767 */ 768 rw_exit(&irb->irb_lock); 769 if (irb_inactive(irb)) 770 return; 771 /* 772 * irb_inactive could not free the irb. 773 * See if there are any walkers, if not 774 * try to clean up again. 775 */ 776 } 777 } 778 } 779 } 780 781 /* 782 * IRE iterator used by ire_ftable_lookup to process multiple equal 783 * routes. Given a starting point in the hash list (hash), walk the IREs 784 * in the bucket skipping deleted entries. We treat the bucket as a circular 785 * list for the purposes of walking it. 
786 * Returns the IRE (held) that corresponds to the hash value. If that IRE is 787 * not applicable (ire_match_args failed) then it returns a subsequent one. 788 * If we fail to find an IRE we return NULL. 789 * 790 * Assumes that the caller holds a reference on the IRE bucket and a read lock 791 * on the radix_node_head (for IPv4) or the ip6_ire_head (for IPv6). 792 * 793 * Applies to IPv4 and IPv6. 794 * 795 * For CGTP, where an IRE_BROADCAST and IRE_HOST can exist for the same 796 * address and bucket, we compare against ire_type for the orig_ire. We also 797 * have IRE_BROADCASTs with and without RTF_MULTIRT, with the former being 798 * first in the bucket. Thus we compare that RTF_MULTIRT match the orig_ire. 799 * 800 * Due to shared-IP zones we check that an IRE_OFFLINK has a gateway that is 801 * reachable from the zone i.e., that the ire_gateway_addr is in a subnet 802 * in which the zone has an IP address. We check this for the global zone 803 * even if no shared-IP zones are configured. 804 */ 805 ire_t * 806 ire_round_robin(irb_t *irb_ptr, ire_ftable_args_t *margs, uint_t hash, 807 ire_t *orig_ire, ip_stack_t *ipst) 808 { 809 ire_t *ire, *maybe_ire = NULL; 810 uint_t maybe_badcnt = 0; 811 uint_t maxwalk; 812 813 /* Fold in more bits from the hint/hash */ 814 hash = hash ^ (hash >> 8) ^ (hash >> 16); 815 816 rw_enter(&irb_ptr->irb_lock, RW_WRITER); 817 maxwalk = irb_ptr->irb_ire_cnt; /* Excludes condemned */ 818 if (maxwalk == 0) { 819 rw_exit(&irb_ptr->irb_lock); 820 return (NULL); 821 } 822 823 hash %= maxwalk; 824 irb_refhold_locked(irb_ptr); 825 rw_exit(&irb_ptr->irb_lock); 826 827 /* 828 * Round-robin the routers list looking for a route that 829 * matches the passed in parameters. 830 * First we skip "hash" number of non-condemned IREs. 831 * Then we match the IRE. 832 * If we find an ire which has a non-zero ire_badcnt then we remember 833 * it and keep on looking for a lower ire_badcnt. 
834 * If we come to the end of the list we continue (treat the 835 * bucket list as a circular list) but we match less than "max" 836 * entries. 837 */ 838 ire = irb_ptr->irb_ire; 839 while (maxwalk > 0) { 840 if (IRE_IS_CONDEMNED(ire)) 841 goto next_ire_skip; 842 843 /* Skip the first "hash" entries to do ECMP */ 844 if (hash != 0) { 845 hash--; 846 goto next_ire_skip; 847 } 848 849 /* See CGTP comment above */ 850 if (ire->ire_type != orig_ire->ire_type || 851 ((ire->ire_flags ^ orig_ire->ire_flags) & RTF_MULTIRT) != 0) 852 goto next_ire; 853 854 /* 855 * Note: Since IPv6 has hash buckets instead of radix 856 * buckers we need to explicitly compare the addresses. 857 * That makes this less efficient since we will be called 858 * even if there is no alternatives just because the 859 * bucket has multiple IREs for different addresses. 860 */ 861 if (ire->ire_ipversion == IPV6_VERSION) { 862 if (!IN6_ARE_ADDR_EQUAL(&orig_ire->ire_addr_v6, 863 &ire->ire_addr_v6)) 864 goto next_ire; 865 } 866 867 /* 868 * For some reason find_best_route uses ire_mask. We do 869 * the same. 870 */ 871 if (ire->ire_ipversion == IPV4_VERSION ? 872 !ire_match_args(ire, margs->ift_addr, 873 ire->ire_mask, margs->ift_gateway, 874 margs->ift_type, margs->ift_ill, margs->ift_zoneid, 875 margs->ift_tsl, margs->ift_flags) : 876 !ire_match_args_v6(ire, &margs->ift_addr_v6, 877 &ire->ire_mask_v6, &margs->ift_gateway_v6, 878 margs->ift_type, margs->ift_ill, margs->ift_zoneid, 879 margs->ift_tsl, margs->ift_flags)) 880 goto next_ire; 881 882 if (margs->ift_zoneid != ALL_ZONES && 883 (ire->ire_type & IRE_OFFLINK)) { 884 /* 885 * When we're in a zone, we're only 886 * interested in routers that are 887 * reachable through ipifs within our zone. 
888 */ 889 if (ire->ire_ipversion == IPV4_VERSION) { 890 if (!ire_gateway_ok_zone_v4( 891 ire->ire_gateway_addr, margs->ift_zoneid, 892 ire->ire_ill, margs->ift_tsl, ipst, 893 B_TRUE)) 894 goto next_ire; 895 } else { 896 if (!ire_gateway_ok_zone_v6( 897 &ire->ire_gateway_addr_v6, 898 margs->ift_zoneid, ire->ire_ill, 899 margs->ift_tsl, ipst, B_TRUE)) 900 goto next_ire; 901 } 902 } 903 mutex_enter(&ire->ire_lock); 904 /* Look for stale ire_badcnt and clear */ 905 if (ire->ire_badcnt != 0 && 906 (TICK_TO_SEC(ddi_get_lbolt64()) - ire->ire_last_badcnt > 907 ipst->ips_ip_ire_badcnt_lifetime)) 908 ire->ire_badcnt = 0; 909 mutex_exit(&ire->ire_lock); 910 911 if (ire->ire_badcnt == 0) { 912 /* We found one with a zero badcnt; done */ 913 ire_refhold(ire); 914 /* 915 * Care needed since irb_refrele grabs WLOCK to free 916 * the irb_t. 917 */ 918 if (ire->ire_ipversion == IPV4_VERSION) { 919 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 920 irb_refrele(irb_ptr); 921 RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); 922 } else { 923 rw_exit(&ipst->ips_ip6_ire_head_lock); 924 irb_refrele(irb_ptr); 925 rw_enter(&ipst->ips_ip6_ire_head_lock, 926 RW_READER); 927 } 928 return (ire); 929 } 930 /* 931 * keep looking to see if there is a better (lower 932 * badcnt) matching IRE, but save this one as a last resort. 933 * If we find a lower badcnt pick that one as the last* resort. 934 */ 935 if (maybe_ire == NULL) { 936 maybe_ire = ire; 937 maybe_badcnt = ire->ire_badcnt; 938 } else if (ire->ire_badcnt < maybe_badcnt) { 939 maybe_ire = ire; 940 maybe_badcnt = ire->ire_badcnt; 941 } 942 943 next_ire: 944 maxwalk--; 945 next_ire_skip: 946 ire = ire->ire_next; 947 if (ire == NULL) 948 ire = irb_ptr->irb_ire; 949 } 950 if (maybe_ire != NULL) 951 ire_refhold(maybe_ire); 952 953 /* Care needed since irb_refrele grabs WLOCK to free the irb_t. 
*/ 954 if (ire->ire_ipversion == IPV4_VERSION) { 955 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 956 irb_refrele(irb_ptr); 957 RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); 958 } else { 959 rw_exit(&ipst->ips_ip6_ire_head_lock); 960 irb_refrele(irb_ptr); 961 rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER); 962 } 963 return (maybe_ire); 964 } 965 966 void 967 irb_refhold_rn(struct radix_node *rn) 968 { 969 if ((rn->rn_flags & RNF_ROOT) == 0) 970 irb_refhold(&((rt_t *)(rn))->rt_irb); 971 } 972 973 void 974 irb_refrele_rn(struct radix_node *rn) 975 { 976 if ((rn->rn_flags & RNF_ROOT) == 0) 977 irb_refrele_ftable(&((rt_t *)(rn))->rt_irb); 978 } 979 980 981 /* 982 * ip_select_src_ill() is used by ip_select_route() to find the src_ill 983 * to be used for source-aware routing table lookup. This function will 984 * ignore IPIF_UNNUMBERED interface addresses, and will only return a 985 * numbered interface (ipif_lookup_addr_nondup() will ignore UNNUMBERED 986 * interfaces). 987 */ 988 static ill_t * 989 ip_select_src_ill(const in6_addr_t *v6src, zoneid_t zoneid, ip_stack_t *ipst) 990 { 991 ipif_t *ipif; 992 ill_t *ill; 993 boolean_t isv6 = !IN6_IS_ADDR_V4MAPPED(v6src); 994 ipaddr_t v4src; 995 996 if (isv6) { 997 ipif = ipif_lookup_addr_nondup_v6(v6src, NULL, zoneid, ipst); 998 } else { 999 IN6_V4MAPPED_TO_IPADDR(v6src, v4src); 1000 ipif = ipif_lookup_addr_nondup(v4src, NULL, zoneid, ipst); 1001 } 1002 if (ipif == NULL) 1003 return (NULL); 1004 ill = ipif->ipif_ill; 1005 ill_refhold(ill); 1006 ipif_refrele(ipif); 1007 return (ill); 1008 } 1009 1010 /* 1011 * verify that v6src is configured on ill 1012 */ 1013 static boolean_t 1014 ip_verify_src_on_ill(const in6_addr_t v6src, ill_t *ill, zoneid_t zoneid) 1015 { 1016 ipif_t *ipif; 1017 ip_stack_t *ipst; 1018 ipaddr_t v4src; 1019 1020 if (ill == NULL) 1021 return (B_FALSE); 1022 ipst = ill->ill_ipst; 1023 1024 if (ill->ill_isv6) { 1025 ipif = ipif_lookup_addr_nondup_v6(&v6src, ill, zoneid, ipst); 1026 } else { 1027 
IN6_V4MAPPED_TO_IPADDR(&v6src, v4src); 1028 ipif = ipif_lookup_addr_nondup(v4src, ill, zoneid, ipst); 1029 } 1030 1031 if (ipif != NULL) { 1032 ipif_refrele(ipif); 1033 return (B_TRUE); 1034 } else { 1035 return (B_FALSE); 1036 } 1037 } 1038 1039 /* 1040 * Select a route for IPv4 and IPv6. Except for multicast, loopback and reject 1041 * routes this routine sets up a ire_nce_cache as well. The caller needs to 1042 * lookup an nce for the multicast case. 1043 * 1044 * When src_multihoming is set to 2 (strict src multihoming) we use the source 1045 * address to select the interface and route. If IP_BOUND_IF etc are 1046 * specified, we require that they specify an interface on which the 1047 * source address is assigned. 1048 * 1049 * When src_multihoming is set to 1 (preferred src aware route 1050 * selection) the unicast lookup prefers a matching source 1051 * (i.e., that the route points out an ill on which the source is assigned), but 1052 * if no such route is found we fallback to not considering the source in the 1053 * route lookup. 1054 * 1055 * We skip the src_multihoming check when the source isn't (yet) set, and 1056 * when IXAF_VERIFY_SOURCE is not set. The latter allows RAW sockets to send 1057 * with bogus source addresses as allowed by IP_HDRINCL and IPV6_PKTINFO 1058 * when secpolicy_net_rawaccess(). 
1059 */ 1060 ire_t * 1061 ip_select_route(const in6_addr_t *v6dst, const in6_addr_t v6src, 1062 ip_xmit_attr_t *ixa, uint_t *generationp, in6_addr_t *setsrcp, 1063 int *errorp, boolean_t *multirtp) 1064 { 1065 uint_t match_args; 1066 uint_t ire_type; 1067 ill_t *ill = NULL; 1068 ire_t *ire; 1069 ip_stack_t *ipst = ixa->ixa_ipst; 1070 ipaddr_t v4dst; 1071 in6_addr_t v6nexthop; 1072 iaflags_t ixaflags = ixa->ixa_flags; 1073 nce_t *nce; 1074 boolean_t preferred_src_aware = B_FALSE; 1075 boolean_t verify_src; 1076 boolean_t isv6 = !(ixa->ixa_flags & IXAF_IS_IPV4); 1077 int src_multihoming = IP_SRC_MULTIHOMING(isv6, ipst); 1078 1079 /* 1080 * We only verify that the src has been configured on a selected 1081 * interface if the src is not :: or INADDR_ANY, and if the 1082 * IXAF_VERIFY_SOURCE flag is set. 1083 */ 1084 verify_src = (!V6_OR_V4_INADDR_ANY(v6src) && 1085 (ixa->ixa_flags & IXAF_VERIFY_SOURCE)); 1086 1087 match_args = MATCH_IRE_SECATTR; 1088 IN6_V4MAPPED_TO_IPADDR(v6dst, v4dst); 1089 if (setsrcp != NULL) 1090 ASSERT(IN6_IS_ADDR_UNSPECIFIED(setsrcp)); 1091 if (errorp != NULL) 1092 ASSERT(*errorp == 0); 1093 1094 /* 1095 * The content of the ixa will be different if IP_NEXTHOP, 1096 * SO_DONTROUTE, IP_BOUND_IF, IP_PKTINFO etc are set 1097 */ 1098 1099 if (isv6 ? IN6_IS_ADDR_MULTICAST(v6dst) : CLASSD(v4dst)) { 1100 /* Pick up the IRE_MULTICAST for the ill */ 1101 if (ixa->ixa_multicast_ifindex != 0) { 1102 ill = ill_lookup_on_ifindex(ixa->ixa_multicast_ifindex, 1103 isv6, ipst); 1104 } else if (ixaflags & IXAF_SCOPEID_SET) { 1105 /* sin6_scope_id takes precedence over ixa_ifindex */ 1106 ASSERT(ixa->ixa_scopeid != 0); 1107 ill = ill_lookup_on_ifindex(ixa->ixa_scopeid, 1108 isv6, ipst); 1109 } else if (ixa->ixa_ifindex != 0) { 1110 /* 1111 * In the ipmp case, the ixa_ifindex is set to 1112 * point at an under_ill and we would return the 1113 * ire_multicast() corresponding to that under_ill. 
1114 */ 1115 ill = ill_lookup_on_ifindex(ixa->ixa_ifindex, 1116 isv6, ipst); 1117 } else if (src_multihoming != 0 && verify_src) { 1118 /* Look up the ill based on the source address */ 1119 ill = ip_select_src_ill(&v6src, ixa->ixa_zoneid, ipst); 1120 /* 1121 * Since we looked up the ill from the source there 1122 * is no need to verify that the source is on the ill 1123 * below. 1124 */ 1125 verify_src = B_FALSE; 1126 if (ill != NULL && IS_VNI(ill)) { 1127 ill_t *usesrc = ill; 1128 1129 ill = ill_lookup_usesrc(usesrc); 1130 ill_refrele(usesrc); 1131 } 1132 } else if (!isv6) { 1133 ipaddr_t v4setsrc = INADDR_ANY; 1134 1135 ill = ill_lookup_group_v4(v4dst, ixa->ixa_zoneid, 1136 ipst, multirtp, &v4setsrc); 1137 if (setsrcp != NULL) 1138 IN6_IPADDR_TO_V4MAPPED(v4setsrc, setsrcp); 1139 } else { 1140 ill = ill_lookup_group_v6(v6dst, ixa->ixa_zoneid, 1141 ipst, multirtp, setsrcp); 1142 } 1143 if (ill != NULL && IS_VNI(ill)) { 1144 ill_refrele(ill); 1145 ill = NULL; 1146 } 1147 if (ill == NULL) { 1148 if (errorp != NULL) 1149 *errorp = ENXIO; 1150 /* Get a hold on the IRE_NOROUTE */ 1151 ire = ire_reject(ipst, isv6); 1152 return (ire); 1153 } 1154 if (!(ill->ill_flags & ILLF_MULTICAST)) { 1155 ill_refrele(ill); 1156 if (errorp != NULL) 1157 *errorp = EHOSTUNREACH; 1158 /* Get a hold on the IRE_NOROUTE */ 1159 ire = ire_reject(ipst, isv6); 1160 return (ire); 1161 } 1162 /* 1163 * If we are doing the strictest src_multihoming, then 1164 * we check that IP_MULTICAST_IF, IP_BOUND_IF, etc specify 1165 * an interface that is consistent with the source address. 
1166 */ 1167 if (verify_src && src_multihoming == 2 && 1168 !ip_verify_src_on_ill(v6src, ill, ixa->ixa_zoneid)) { 1169 if (errorp != NULL) 1170 *errorp = EADDRNOTAVAIL; 1171 ill_refrele(ill); 1172 /* Get a hold on the IRE_NOROUTE */ 1173 ire = ire_reject(ipst, isv6); 1174 return (ire); 1175 } 1176 /* Get a refcnt on the single IRE_MULTICAST per ill */ 1177 ire = ire_multicast(ill); 1178 ill_refrele(ill); 1179 if (generationp != NULL) 1180 *generationp = ire->ire_generation; 1181 if (errorp != NULL && 1182 (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) { 1183 *errorp = EHOSTUNREACH; 1184 } 1185 return (ire); 1186 } 1187 1188 /* Now for unicast */ 1189 if (ixa->ixa_ifindex != 0 || (ixaflags & IXAF_SCOPEID_SET)) { 1190 if (ixaflags & IXAF_SCOPEID_SET) { 1191 /* sin6_scope_id takes precedence over ixa_ifindex */ 1192 ASSERT(ixa->ixa_scopeid != 0); 1193 ill = ill_lookup_on_ifindex(ixa->ixa_scopeid, 1194 isv6, ipst); 1195 } else { 1196 ASSERT(ixa->ixa_ifindex != 0); 1197 ill = ill_lookup_on_ifindex(ixa->ixa_ifindex, 1198 isv6, ipst); 1199 } 1200 if (ill != NULL && IS_VNI(ill)) { 1201 ill_refrele(ill); 1202 ill = NULL; 1203 } 1204 if (ill == NULL) { 1205 if (errorp != NULL) 1206 *errorp = ENXIO; 1207 /* Get a hold on the IRE_NOROUTE */ 1208 ire = ire_reject(ipst, isv6); 1209 return (ire); 1210 } 1211 1212 match_args |= MATCH_IRE_ILL; 1213 1214 /* 1215 * icmp_send_reply_v6 uses scopeid, and mpathd sets IP*_BOUND_IF 1216 * so for both of them we need to be able look for an under 1217 * interface. 1218 */ 1219 if (IS_UNDER_IPMP(ill)) 1220 match_args |= MATCH_IRE_TESTHIDDEN; 1221 1222 /* 1223 * If we are doing the strictest src_multihoming, then 1224 * we check that IP_BOUND_IF, IP_PKTINFO, etc specify 1225 * an interface that is consistent with the source address. 
1226 */ 1227 if (src_multihoming == 2 && 1228 !ip_verify_src_on_ill(v6src, ill, ixa->ixa_zoneid)) { 1229 if (errorp != NULL) 1230 *errorp = EADDRNOTAVAIL; 1231 ill_refrele(ill); 1232 /* Get a hold on the IRE_NOROUTE */ 1233 ire = ire_reject(ipst, isv6); 1234 return (ire); 1235 } 1236 } else if (src_multihoming != 0 && verify_src) { 1237 /* Look up the ill based on the source address */ 1238 ill = ip_select_src_ill(&v6src, ixa->ixa_zoneid, ipst); 1239 if (ill == NULL) { 1240 char addrbuf[INET6_ADDRSTRLEN]; 1241 1242 ip3dbg(("%s not a valid src for unicast", 1243 inet_ntop(AF_INET6, &v6src, addrbuf, 1244 sizeof (addrbuf)))); 1245 if (errorp != NULL) 1246 *errorp = EADDRNOTAVAIL; 1247 /* Get a hold on the IRE_NOROUTE */ 1248 ire = ire_reject(ipst, isv6); 1249 return (ire); 1250 } 1251 match_args |= MATCH_IRE_SRC_ILL; 1252 preferred_src_aware = (src_multihoming == 1); 1253 } 1254 1255 if (ixaflags & IXAF_NEXTHOP_SET) { 1256 /* IP_NEXTHOP was set */ 1257 v6nexthop = ixa->ixa_nexthop_v6; 1258 } else { 1259 v6nexthop = *v6dst; 1260 } 1261 1262 ire_type = 0; 1263 1264 /* 1265 * If SO_DONTROUTE is set or if IP_NEXTHOP is set, then 1266 * we only look for an onlink IRE. 
1267 */ 1268 if (ixaflags & (IXAF_DONTROUTE|IXAF_NEXTHOP_SET)) { 1269 match_args |= MATCH_IRE_TYPE; 1270 ire_type = IRE_ONLINK; 1271 } 1272 1273 retry: 1274 if (!isv6) { 1275 ipaddr_t v4nexthop; 1276 ipaddr_t v4setsrc = INADDR_ANY; 1277 1278 IN6_V4MAPPED_TO_IPADDR(&v6nexthop, v4nexthop); 1279 ire = ire_route_recursive_v4(v4nexthop, ire_type, ill, 1280 ixa->ixa_zoneid, ixa->ixa_tsl, match_args, IRR_ALLOCATE, 1281 ixa->ixa_xmit_hint, ipst, &v4setsrc, NULL, generationp); 1282 if (setsrcp != NULL) 1283 IN6_IPADDR_TO_V4MAPPED(v4setsrc, setsrcp); 1284 } else { 1285 ire = ire_route_recursive_v6(&v6nexthop, ire_type, ill, 1286 ixa->ixa_zoneid, ixa->ixa_tsl, match_args, IRR_ALLOCATE, 1287 ixa->ixa_xmit_hint, ipst, setsrcp, NULL, generationp); 1288 } 1289 1290 #ifdef DEBUG 1291 if (match_args & MATCH_IRE_TESTHIDDEN) { 1292 ip3dbg(("looking for hidden; dst %x ire %p\n", 1293 v4dst, (void *)ire)); 1294 } 1295 #endif 1296 if (ill != NULL) { 1297 ill_refrele(ill); 1298 ill = NULL; 1299 } 1300 if ((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) || 1301 (ire->ire_type & IRE_MULTICAST)) { 1302 if (preferred_src_aware) { 1303 /* 1304 * "Preferred Source Aware" send mode. If we cannot 1305 * find an ire whose ire_ill had the desired source 1306 * address retry after relaxing the ill matching 1307 * constraint. 1308 */ 1309 ire_refrele(ire); 1310 preferred_src_aware = B_FALSE; 1311 match_args &= ~MATCH_IRE_SRC_ILL; 1312 goto retry; 1313 } 1314 /* No ire_nce_cache */ 1315 return (ire); 1316 } 1317 1318 /* Setup ire_nce_cache if it doesn't exist or is condemned. */ 1319 mutex_enter(&ire->ire_lock); 1320 nce = ire->ire_nce_cache; 1321 if (nce == NULL || nce->nce_is_condemned) { 1322 mutex_exit(&ire->ire_lock); 1323 (void) ire_revalidate_nce(ire); 1324 } else { 1325 mutex_exit(&ire->ire_lock); 1326 } 1327 return (ire); 1328 } 1329 1330 /* 1331 * Find a route given some xmit attributes and a packet. 1332 * Generic for IPv4 and IPv6 1333 * 1334 * This never returns NULL. 
But when it returns the IRE_NOROUTE 1335 * it might set errorp. 1336 */ 1337 ire_t * 1338 ip_select_route_pkt(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp, 1339 int *errorp, boolean_t *multirtp) 1340 { 1341 if (ixa->ixa_flags & IXAF_IS_IPV4) { 1342 ipha_t *ipha = (ipha_t *)mp->b_rptr; 1343 in6_addr_t v6dst, v6src; 1344 1345 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst); 1346 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6src); 1347 1348 return (ip_select_route(&v6dst, v6src, ixa, generationp, 1349 NULL, errorp, multirtp)); 1350 } else { 1351 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 1352 1353 return (ip_select_route(&ip6h->ip6_dst, ip6h->ip6_src, 1354 ixa, generationp, NULL, errorp, multirtp)); 1355 } 1356 } 1357 1358 ire_t * 1359 ip_select_route_v4(ipaddr_t dst, ipaddr_t src, ip_xmit_attr_t *ixa, 1360 uint_t *generationp, ipaddr_t *v4setsrcp, int *errorp, boolean_t *multirtp) 1361 { 1362 in6_addr_t v6dst, v6src; 1363 ire_t *ire; 1364 in6_addr_t setsrc; 1365 1366 ASSERT(ixa->ixa_flags & IXAF_IS_IPV4); 1367 1368 IN6_IPADDR_TO_V4MAPPED(dst, &v6dst); 1369 IN6_IPADDR_TO_V4MAPPED(src, &v6src); 1370 1371 setsrc = ipv6_all_zeros; 1372 ire = ip_select_route(&v6dst, v6src, ixa, generationp, &setsrc, errorp, 1373 multirtp); 1374 if (v4setsrcp != NULL) 1375 IN6_V4MAPPED_TO_IPADDR(&setsrc, *v4setsrcp); 1376 return (ire); 1377 } 1378 1379 /* 1380 * Recursively look for a route to the destination. Can also match on 1381 * the zoneid, ill, and label. Used for the data paths. See also 1382 * ire_route_recursive. 1383 * 1384 * If IRR_ALLOCATE is not set then we will only inspect the existing IREs; never 1385 * create an IRE_IF_CLONE. This is used on the receive side when we are not 1386 * forwarding. 1387 * If IRR_INCOMPLETE is set then we return the IRE even if we can't correctly 1388 * resolve the gateway. 1389 * 1390 * Note that this function never returns NULL. It returns an IRE_NOROUTE 1391 * instead. 
*
 * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it
 * is an error.
 * Allow at most one RTF_INDIRECT.
 *
 * Arguments:
 *   ire         - optional starting IRE. If NULL the first IRE is found with
 *                 ire_ftable_lookup_v4(); otherwise an extra hold is taken on
 *                 it and its generation is taken from *generationp (or
 *                 IRE_GENERATION_VERIFY if generationp is NULL).
 *   setsrcp     - if non-NULL, must point at INADDR_ANY on entry; receives
 *                 the first RTF_SETSRC address found.
 *   gwattrp     - if non-NULL, must point at NULL on entry; receives the
 *                 first ire_gw_secattr found.
 *   generationp - if non-NULL, receives the generation for the returned IRE;
 *                 IRE_GENERATION_VERIFY on the error paths.
 *
 * On success the first IRE found (ires[0]) is returned held. On failure a
 * held reject or blackhole IRE is returned or, with IRR_INCOMPLETE, the
 * first IRE found.
 */
ire_t *
ire_route_recursive_impl_v4(ire_t *ire,
    ipaddr_t nexthop, uint_t ire_type, const ill_t *ill_arg,
    zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args,
    uint_t irr_flags, uint32_t xmit_hint, ip_stack_t *ipst, ipaddr_t *setsrcp,
    tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp)
{
	int		i, j;
	ire_t		*ires[MAX_IRE_RECURSION];
	uint_t		generation;
	uint_t		generations[MAX_IRE_RECURSION];
	boolean_t	need_refrele = B_FALSE;
	boolean_t	invalidate = B_FALSE;
	ill_t		*ill = NULL;
	uint_t		maskoff = (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST);

	if (setsrcp != NULL)
		ASSERT(*setsrcp == INADDR_ANY);
	if (gwattrp != NULL)
		ASSERT(*gwattrp == NULL);

	/*
	 * We iterate up to three times to resolve a route, even though
	 * we have four slots in the array. The extra slot is for an
	 * IRE_IF_CLONE we might need to create.
	 */
	i = 0;
	while (i < MAX_IRE_RECURSION - 1) {
		/* ire_ftable_lookup handles round-robin/ECMP */
		if (ire == NULL) {
			ire = ire_ftable_lookup_v4(nexthop, 0, 0, ire_type,
			    (ill != NULL? ill : ill_arg), zoneid, tsl,
			    match_args, xmit_hint, ipst, &generation);
		} else {
			/* Caller passed it; extra hold since we will rele */
			ire_refhold(ire);
			if (generationp != NULL)
				generation = *generationp;
			else
				generation = IRE_GENERATION_VERIFY;
		}
		if (ire == NULL) {
			/*
			 * Lookup failed. With IRR_INCOMPLETE, and at least
			 * one IRE already collected, return the first one
			 * (with its own hold); otherwise return a reject IRE.
			 */
			if (i > 0 && (irr_flags & IRR_INCOMPLETE)) {
				ire = ires[0];
				ire_refhold(ire);
			} else {
				ire = ire_reject(ipst, B_FALSE);
			}
			goto error;
		}

		/* Need to return the ire with RTF_REJECT|BLACKHOLE */
		if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))
			goto error;

		ASSERT(!(ire->ire_type & IRE_MULTICAST)); /* Not in ftable */
		/*
		 * Verify that the IRE_IF_CLONE has a consistent generation
		 * number. On failure ire is reset to NULL and i is left
		 * unchanged, so the next pass repeats the lookup.
		 */
		if ((ire->ire_type & IRE_IF_CLONE) && !ire_clone_verify(ire)) {
			ire_refrele(ire);
			ire = NULL;
			continue;
		}

		/*
		 * Don't allow anything unusual past the first iteration.
		 * After the first lookup, we should no longer look for
		 * (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST) or RTF_INDIRECT
		 * routes.
		 *
		 * In addition, after we have found a direct IRE_OFFLINK,
		 * we should only look for interface or clone routes.
		 */
		match_args |= MATCH_IRE_DIRECT;	/* no more RTF_INDIRECTs */

		if ((ire->ire_type & IRE_OFFLINK) &&
		    !(ire->ire_flags & RTF_INDIRECT)) {
			ire_type = IRE_IF_ALL;
		} else {
			/*
			 * no more local, loopback, broadcast routes
			 */
			if (!(match_args & MATCH_IRE_TYPE))
				ire_type = (IRE_OFFLINK|IRE_ONLINK);
			ire_type &= ~maskoff;
		}
		match_args |= MATCH_IRE_TYPE;

		/* We have a usable IRE */
		ires[i] = ire;
		generations[i] = generation;
		i++;

		/* The first RTF_SETSRC address is passed back if setsrcp */
		if ((ire->ire_flags & RTF_SETSRC) &&
		    setsrcp != NULL && *setsrcp == INADDR_ANY) {
			ASSERT(ire->ire_setsrc_addr != INADDR_ANY);
			*setsrcp = ire->ire_setsrc_addr;
		}

		/* The first ire_gw_secattr is passed back if gwattrp */
		if (ire->ire_gw_secattr != NULL &&
		    gwattrp != NULL && *gwattrp == NULL)
			*gwattrp = ire->ire_gw_secattr;

		/*
		 * Check if we have a short-cut pointer to an IRE for this
		 * destination, and that the cached dependency isn't stale.
		 * In that case we've rejoined an existing tree towards a
		 * parent, thus we don't need to continue the loop to
		 * discover the rest of the tree.
		 */
		mutex_enter(&ire->ire_lock);
		if (ire->ire_dep_parent != NULL &&
		    ire->ire_dep_parent->ire_generation ==
		    ire->ire_dep_parent_generation) {
			mutex_exit(&ire->ire_lock);
			ire = NULL;
			goto done;
		}
		mutex_exit(&ire->ire_lock);

		/*
		 * If this type should have an ire_nce_cache (even if it
		 * doesn't yet have one) then we are done. Includes
		 * IRE_INTERFACE with a full 32 bit mask.
		 */
		if (ire->ire_nce_capable) {
			ire = NULL;
			goto done;
		}
		ASSERT(!(ire->ire_type & IRE_IF_CLONE));
		/*
		 * For an IRE_INTERFACE we create an IRE_IF_CLONE for this
		 * particular destination
		 */
		if (ire->ire_type & IRE_INTERFACE) {
			in6_addr_t	v6nexthop;
			ire_t		*clone;

			ASSERT(ire->ire_masklen != IPV4_ABITS);

			/*
			 * In the case of ip_input and ILLF_FORWARDING not
			 * being set, and in the case of RTM_GET, there is
			 * no point in allocating an IRE_IF_CLONE. We return
			 * the IRE_INTERFACE. Note that !IRR_ALLOCATE can
			 * result in a ire_dep_parent which is IRE_IF_*
			 * without an IRE_IF_CLONE.
			 * We recover from that when we need to send packets
			 * by ensuring that the generations become
			 * IRE_GENERATION_VERIFY in this case.
			 */
			if (!(irr_flags & IRR_ALLOCATE)) {
				invalidate = B_TRUE;
				ire = NULL;
				goto done;
			}

			IN6_IPADDR_TO_V4MAPPED(nexthop, &v6nexthop);

			clone = ire_create_if_clone(ire, &v6nexthop,
			    &generation);
			if (clone == NULL) {
				/*
				 * Temporary failure - no memory.
				 * Don't want caller to cache IRE_NOROUTE.
				 */
				invalidate = B_TRUE;
				ire = ire_blackhole(ipst, B_FALSE);
				goto error;
			}
			/*
			 * Make clone next to last entry and the
			 * IRE_INTERFACE the last in the dependency
			 * chain since the clone depends on the
			 * IRE_INTERFACE.
			 */
			ASSERT(i >= 1);
			ASSERT(i < MAX_IRE_RECURSION);

			ires[i] = ires[i-1];
			generations[i] = generations[i-1];
			ires[i-1] = clone;
			generations[i-1] = generation;
			i++;

			ire = NULL;
			goto done;
		}

		/*
		 * We only match on the type and optionally ILL when
		 * recursing. The type match is used by some callers
		 * to exclude certain types (such as IRE_IF_CLONE or
		 * IRE_LOCAL|IRE_LOOPBACK).
		 *
		 * In the MATCH_IRE_SRC_ILL case, ill_arg may be the 'srcof'
		 * ire->ire_ill, and we want to find the IRE_INTERFACE for
		 * ire_ill, so we set ill to the ire_ill;
		 */
		match_args &= (MATCH_IRE_TYPE | MATCH_IRE_DIRECT);
		nexthop = ire->ire_gateway_addr;
		if (ill == NULL && ire->ire_ill != NULL) {
			ill = ire->ire_ill;
			need_refrele = B_TRUE;
			ill_refhold(ill);
			match_args |= MATCH_IRE_ILL;
		}
		ire = NULL;
	}
	/* Ran out of iterations without resolving the route */
	ASSERT(ire == NULL);
	ire = ire_reject(ipst, B_FALSE);

error:
	/* ire is the held IRE that will be returned to the caller */
	ASSERT(ire != NULL);
	if (need_refrele)
		ill_refrele(ill);

	/*
	 * In the case of MULTIRT we want to try a different IRE the next
	 * time. We let the next packet retry in that case.
	 */
	if (i > 0 && (ires[0]->ire_flags & RTF_MULTIRT))
		(void) ire_no_good(ires[0]);

cleanup:
	/* cleanup ires[i] */
	ire_dep_unbuild(ires, i);
	for (j = 0; j < i; j++)
		ire_refrele(ires[j]);

	ASSERT((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
	    (irr_flags & IRR_INCOMPLETE));
	/*
	 * Use IRE_GENERATION_VERIFY to ensure that ip_output will redo the
	 * ip_select_route since the reject or lack of memory might be gone.
	 */
	if (generationp != NULL)
		*generationp = IRE_GENERATION_VERIFY;
	return (ire);

done:
	/* ires[0..i-1] hold the chain; ires[0] is what gets returned */
	ASSERT(ire == NULL);
	if (need_refrele) {
		ill_refrele(ill);
		ill = NULL;
	}

	/* Build dependencies */
	if (i > 1 && !ire_dep_build(ires, generations, i)) {
		/* Something in chain was condemned; tear it apart */
		ire = ire_reject(ipst, B_FALSE);
		goto cleanup;
	}

	/*
	 * Release all refholds except the one for ires[0] that we
	 * will return to the caller.
	 */
	for (j = 1; j < i; j++)
		ire_refrele(ires[j]);

	if (invalidate) {
		/*
		 * Since we needed to allocate but couldn't we need to make
		 * sure that the dependency chain is rebuilt the next time.
		 */
		ire_dep_invalidate_generations(ires[0]);
		generation = IRE_GENERATION_VERIFY;
	} else {
		/*
		 * IREs can have been added or deleted while we did the
		 * recursive lookup and we can't catch those until we've built
		 * the dependencies. We verify the stored
		 * ire_dep_parent_generation to catch any such changes and
		 * return IRE_GENERATION_VERIFY (which will cause
		 * ip_select_route to be called again so we can redo the
		 * recursive lookup next time we send a packet).
		 */
		if (ires[0]->ire_dep_parent == NULL)
			generation = ires[0]->ire_generation;
		else
			generation = ire_dep_validate_generations(ires[0]);
		if (generations[0] != ires[0]->ire_generation) {
			/* Something changed at the top */
			generation = IRE_GENERATION_VERIFY;
		}
	}
	if (generationp != NULL)
		*generationp = generation;

	return (ires[0]);
}

/*
 * Same as ire_route_recursive_impl_v4() with no starting IRE, so the first
 * IRE is found by a forwarding table lookup on nexthop.
 */
ire_t *
ire_route_recursive_v4(ipaddr_t nexthop, uint_t ire_type, const ill_t *ill,
    zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args,
    uint_t irr_flags, uint32_t xmit_hint, ip_stack_t *ipst, ipaddr_t *setsrcp,
    tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp)
{
	return (ire_route_recursive_impl_v4(NULL, nexthop, ire_type, ill,
	    zoneid, tsl, match_args, irr_flags, xmit_hint, ipst, setsrcp,
	    gwattrp, generationp));
}

/*
 * Recursively look for a route to the destination.
 * We only handle a destination match here, yet we have the same arguments
 * as the full match to allow function pointers to select between the two.
 *
 * Note that this function never returns NULL. It returns an IRE_NOROUTE
 * instead.
1712 * 1713 * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it 1714 * is an error. 1715 * Allow at most one RTF_INDIRECT. 1716 */ 1717 ire_t * 1718 ire_route_recursive_dstonly_v4(ipaddr_t nexthop, uint_t irr_flags, 1719 uint32_t xmit_hint, ip_stack_t *ipst) 1720 { 1721 ire_t *ire; 1722 ire_t *ire1; 1723 uint_t generation; 1724 1725 /* ire_ftable_lookup handles round-robin/ECMP */ 1726 ire = ire_ftable_lookup_simple_v4(nexthop, xmit_hint, ipst, 1727 &generation); 1728 ASSERT(ire != NULL); 1729 /* 1730 * If the IRE has a current cached parent we know that the whole 1731 * parent chain is current, hence we don't need to discover and 1732 * build any dependencies by doing a recursive lookup. 1733 */ 1734 mutex_enter(&ire->ire_lock); 1735 if (ire->ire_dep_parent != NULL) { 1736 if (ire->ire_dep_parent->ire_generation == 1737 ire->ire_dep_parent_generation) { 1738 mutex_exit(&ire->ire_lock); 1739 return (ire); 1740 } 1741 mutex_exit(&ire->ire_lock); 1742 } else { 1743 mutex_exit(&ire->ire_lock); 1744 /* 1745 * If this type should have an ire_nce_cache (even if it 1746 * doesn't yet have one) then we are done. Includes 1747 * IRE_INTERFACE with a full 32 bit mask. 1748 */ 1749 if (ire->ire_nce_capable) 1750 return (ire); 1751 } 1752 1753 /* 1754 * Fallback to loop in the normal code starting with the ire 1755 * we found. Normally this would return the same ire. 1756 */ 1757 ire1 = ire_route_recursive_impl_v4(ire, nexthop, 0, NULL, ALL_ZONES, 1758 NULL, MATCH_IRE_DSTONLY, irr_flags, xmit_hint, ipst, NULL, NULL, 1759 &generation); 1760 ire_refrele(ire); 1761 return (ire1); 1762 } 1763 1764 /* 1765 * Verify that the generation numbers in the chain leading to an IRE_IF_CLONE 1766 * are consistent. Return FALSE (and delete the IRE_IF_CLONE) if they 1767 * are not consistent, and TRUE otherwise. 
1768 */ 1769 boolean_t 1770 ire_clone_verify(ire_t *ire) 1771 { 1772 ASSERT((ire->ire_type & IRE_IF_CLONE) != 0); 1773 mutex_enter(&ire->ire_lock); 1774 if (ire->ire_dep_parent != NULL && 1775 ire->ire_dep_parent->ire_generation != 1776 ire->ire_dep_parent_generation) { 1777 mutex_exit(&ire->ire_lock); 1778 ire_delete(ire); 1779 return (B_FALSE); 1780 } 1781 mutex_exit(&ire->ire_lock); 1782 return (B_TRUE); 1783 } 1784