1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright (c) 2006, 2010, Oracle and/or its affiliates. All rights reserved. 23 * Copyright 2021 Racktop Systems, Inc. 24 */ 25 26 /* 27 * This file contains consumer routines of the IPv4 forwarding engine 28 */ 29 30 #include <sys/types.h> 31 #include <sys/stream.h> 32 #include <sys/stropts.h> 33 #include <sys/strlog.h> 34 #include <sys/dlpi.h> 35 #include <sys/ddi.h> 36 #include <sys/cmn_err.h> 37 #include <sys/policy.h> 38 39 #include <sys/systm.h> 40 #include <sys/strsun.h> 41 #include <sys/kmem.h> 42 #include <sys/param.h> 43 #include <sys/socket.h> 44 #include <sys/strsubr.h> 45 #include <net/if.h> 46 #include <net/route.h> 47 #include <netinet/in.h> 48 #include <net/if_dl.h> 49 #include <netinet/ip6.h> 50 #include <netinet/icmp6.h> 51 52 #include <inet/ipsec_impl.h> 53 #include <inet/common.h> 54 #include <inet/mi.h> 55 #include <inet/mib2.h> 56 #include <inet/ip.h> 57 #include <inet/ip_impl.h> 58 #include <inet/ip6.h> 59 #include <inet/ip_ndp.h> 60 #include <inet/arp.h> 61 #include <inet/ip_if.h> 62 #include <inet/ip_ire.h> 63 #include <inet/ip_ftable.h> 64 #include <inet/ip_rts.h> 65 #include <inet/nd.h> 66 67 
#include <net/pfkeyv2.h>
#include <inet/sadb.h>
#include <inet/tcp.h>
#include <inet/ipclassifier.h>
#include <sys/zone.h>
#include <net/radix.h>
#include <sys/tsol/label.h>
#include <sys/tsol/tnet.h>

/*
 * True for an IRE_DEFAULT, or for an IRE_INTERFACE whose ire_addr is zero.
 * The argument is evaluated more than once, so it must be free of side
 * effects.
 */
#define	IS_DEFAULT_ROUTE(ire)	\
	(((ire)->ire_type & IRE_DEFAULT) || \
	(((ire)->ire_type & IRE_INTERFACE) && ((ire)->ire_addr == 0)))

/*
 * Select the strict source multihoming tunable that applies to the address
 * family. Both arguments are parenthesized so that arbitrary expressions
 * can be passed safely.
 */
#define	IP_SRC_MULTIHOMING(isv6, ipst)	\
	((isv6) ? (ipst)->ips_ipv6_strict_src_multihoming : \
	(ipst)->ips_ip_strict_src_multihoming)

static ire_t	*route_to_dst(const struct sockaddr *, zoneid_t, ip_stack_t *);
static void	ire_del_host_redir(ire_t *, char *);
static boolean_t ire_find_best_route(struct radix_node *, void *);

/*
 * Lookup a route in forwarding table. A specific lookup is indicated by
 * passing the required parameters and indicating the match required in the
 * flag field.
 *
 * Supports IP_BOUND_IF by following the ipif/ill when recursing.
 *
 * Returns a held ire, or NULL when nothing in the table satisfies the
 * request (including the case where an ill-based match is requested but
 * no ill was supplied). If generationp is non-NULL it receives the
 * ire_generation of the selected ire, sampled while the radix tree lock is
 * still held.
 */
ire_t *
ire_ftable_lookup_v4(ipaddr_t addr, ipaddr_t mask, ipaddr_t gateway,
    int type, const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl,
    int flags, uint32_t xmit_hint, ip_stack_t *ipst, uint_t *generationp)
{
	ire_t *ire;
	struct rt_sockaddr rdst;
	struct rt_entry *rt;
	ire_ftable_args_t margs;

	ASSERT(ill == NULL || !ill->ill_isv6);

	/*
	 * ire_match_args() will dereference ill if MATCH_IRE_ILL
	 * is set. A request for an ill-based match (MATCH_IRE_ILL or
	 * MATCH_IRE_SRC_ILL) without an ill cannot be satisfied.
	 */
	if ((flags & (MATCH_IRE_ILL|MATCH_IRE_SRC_ILL)) && (ill == NULL))
		return (NULL);

	/* The radix key is the destination; the mask travels in margs. */
	bzero(&rdst, sizeof (rdst));
	rdst.rt_sin_len = sizeof (rdst);
	rdst.rt_sin_family = AF_INET;
	rdst.rt_sin_addr.s_addr = addr;

	bzero(&margs, sizeof (margs));
	margs.ift_addr = addr;
	margs.ift_mask = mask;
	margs.ift_gateway = gateway;
	margs.ift_type = type;
	margs.ift_ill = ill;
	margs.ift_zoneid = zoneid;
	margs.ift_tsl = tsl;
	margs.ift_flags = flags;

	/*
	 * The flags argument passed to ire_ftable_lookup may cause the
	 * search to return, not the longest matching prefix, but the
	 * "best matching prefix", i.e., the longest prefix that also
	 * satisfies constraints imposed via the permutation of flags
	 * passed in. To achieve this, we invoke ire_match_args() on
	 * each matching leaf in the radix tree. ire_match_args is
	 * invoked by the callback function ire_find_best_route()
	 * We hold the global tree lock in read mode when calling
	 * rn_match_args. Before dropping the global tree lock, ensure
	 * that the radix node can't be deleted by incrementing ire_refcnt.
	 */
	RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable);
	rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst,
	    ipst->ips_ip_ftable, ire_find_best_route, &margs);
	ire = margs.ift_best_ire;
	if (rt == NULL) {
		RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);
		return (NULL);
	}
	ASSERT(ire != NULL);

	DTRACE_PROBE2(ire__found, ire_ftable_args_t *, &margs, ire_t *, ire);

	/*
	 * round-robin only if we have more than one route in the bucket.
	 * ips_ip_ecmp_behavior controls when we do ECMP
	 *	2:	always
	 *	1:	for IRE_DEFAULT and /0 IRE_INTERFACE
	 *	0:	never
	 */
	if (ire->ire_bucket->irb_ire_cnt > 1 && !(flags & MATCH_IRE_GW)) {
		if (ipst->ips_ip_ecmp_behavior == 2 ||
		    (ipst->ips_ip_ecmp_behavior == 1 &&
		    IS_DEFAULT_ROUTE(ire))) {
			ire_t	*next_ire;

			margs.ift_best_ire = NULL;
			next_ire = ire_round_robin(ire->ire_bucket, &margs,
			    xmit_hint, ire, ipst);
			/* keep ire if next_ire is null */
			if (next_ire != NULL) {
				ire_refrele(ire);
				ire = next_ire;
			}
		}
	}

	/* Return generation before dropping lock */
	if (generationp != NULL)
		*generationp = ire->ire_generation;

	RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable);

	/*
	 * For shared-IP zones we need additional checks to what was
	 * done in ire_match_args to make sure IRE_LOCALs are handled.
	 *
	 * When ip_restrict_interzone_loopback is set, then
	 * we ensure that IRE_LOCAL are only used for loopback
	 * between zones when the logical "Ethernet" would
	 * have looped them back. That is, if in the absence of
	 * the IRE_LOCAL we would have sent the packet out the
	 * same ill.
	 */
	if ((ire->ire_type & IRE_LOCAL) && zoneid != ALL_ZONES &&
	    ire->ire_zoneid != zoneid && ire->ire_zoneid != ALL_ZONES &&
	    ipst->ips_ip_restrict_interzone_loopback) {
		ire = ire_alt_local(ire, zoneid, tsl, ill, generationp);
		ASSERT(ire != NULL);
	}
	return (ire);
}

/*
 * This function is called by
 * ip_input/ire_route_recursive when doing a route lookup on only the
 * destination address.
 *
 * The optimizations of this function over ire_ftable_lookup are:
 *	o removing unnecessary flag matching
 *	o doing longest prefix match instead of overloading it further
 *	  with the unnecessary "best_prefix_match"
 *
 * If no route is found we return IRE_NOROUTE.
221 */ 222 ire_t * 223 ire_ftable_lookup_simple_v4(ipaddr_t addr, uint32_t xmit_hint, ip_stack_t *ipst, 224 uint_t *generationp) 225 { 226 ire_t *ire; 227 struct rt_sockaddr rdst; 228 struct rt_entry *rt; 229 irb_t *irb; 230 231 rdst.rt_sin_len = sizeof (rdst); 232 rdst.rt_sin_family = AF_INET; 233 rdst.rt_sin_addr.s_addr = addr; 234 235 /* 236 * This is basically inlining a simpler version of ire_match_args 237 */ 238 RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); 239 240 rt = (struct rt_entry *)ipst->ips_ip_ftable->rnh_matchaddr_args(&rdst, 241 ipst->ips_ip_ftable, NULL, NULL); 242 243 if (rt == NULL) 244 goto bad; 245 246 irb = &rt->rt_irb; 247 if (irb->irb_ire_cnt == 0) 248 goto bad; 249 250 rw_enter(&irb->irb_lock, RW_READER); 251 ire = irb->irb_ire; 252 if (ire == NULL) { 253 rw_exit(&irb->irb_lock); 254 goto bad; 255 } 256 while (IRE_IS_CONDEMNED(ire)) { 257 ire = ire->ire_next; 258 if (ire == NULL) { 259 rw_exit(&irb->irb_lock); 260 goto bad; 261 } 262 } 263 264 /* we have a ire that matches */ 265 ire_refhold(ire); 266 rw_exit(&irb->irb_lock); 267 268 /* 269 * round-robin only if we have more than one route in the bucket. 270 * ips_ip_ecmp_behavior controls when we do ECMP 271 * 2: always 272 * 1: for IRE_DEFAULT and /0 IRE_INTERFACE 273 * 0: never 274 * 275 * Note: if we found an IRE_IF_CLONE we won't look at the bucket with 276 * other ECMP IRE_INTERFACEs since the IRE_IF_CLONE is a /128 match 277 * and the IRE_INTERFACESs are likely to be shorter matches. 
278 */ 279 if (ire->ire_bucket->irb_ire_cnt > 1) { 280 if (ipst->ips_ip_ecmp_behavior == 2 || 281 (ipst->ips_ip_ecmp_behavior == 1 && 282 IS_DEFAULT_ROUTE(ire))) { 283 ire_t *next_ire; 284 ire_ftable_args_t margs; 285 286 bzero(&margs, sizeof (margs)); 287 margs.ift_addr = addr; 288 margs.ift_zoneid = ALL_ZONES; 289 290 next_ire = ire_round_robin(ire->ire_bucket, &margs, 291 xmit_hint, ire, ipst); 292 if (next_ire == NULL) { 293 /* keep ire if next_ire is null */ 294 if (generationp != NULL) 295 *generationp = ire->ire_generation; 296 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 297 return (ire); 298 } 299 ire_refrele(ire); 300 ire = next_ire; 301 } 302 } 303 /* Return generation before dropping lock */ 304 if (generationp != NULL) 305 *generationp = ire->ire_generation; 306 307 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 308 309 /* 310 * Since we only did ALL_ZONES matches there is no special handling 311 * of IRE_LOCALs needed here. ire_ftable_lookup_v4 has to handle that. 312 */ 313 return (ire); 314 315 bad: 316 if (generationp != NULL) 317 *generationp = IRE_GENERATION_VERIFY; 318 319 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 320 return (ire_reject(ipst, B_FALSE)); 321 } 322 323 /* 324 * Find the ill matching a multicast group. 325 * Allows different routes for multicast addresses 326 * in the unicast routing table (akin to 224.0.0.0 but could be more specific) 327 * which point at different interfaces. This is used when IP_MULTICAST_IF 328 * isn't specified (when sending) and when IP_ADD_MEMBERSHIP doesn't 329 * specify the interface to join on. 330 * 331 * Supports link-local addresses by using ire_route_recursive which follows 332 * the ill when recursing. 333 * 334 * To handle CGTP, since we don't have a separate IRE_MULTICAST for each group 335 * and the MULTIRT property can be different for different groups, we 336 * extract RTF_MULTIRT from the special unicast route added for a group 337 * with CGTP and pass that back in the multirtp argument. 
338 * This is used in ip_set_destination etc to set ixa_postfragfn for multicast. 339 * We have a setsrcp argument for the same reason. 340 */ 341 ill_t * 342 ire_lookup_multi_ill_v4(ipaddr_t group, zoneid_t zoneid, ip_stack_t *ipst, 343 boolean_t *multirtp, ipaddr_t *setsrcp) 344 { 345 ire_t *ire; 346 ill_t *ill; 347 348 ire = ire_route_recursive_v4(group, 0, NULL, zoneid, NULL, 349 MATCH_IRE_DSTONLY, IRR_NONE, 0, ipst, setsrcp, NULL, NULL); 350 ASSERT(ire != NULL); 351 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 352 ire_refrele(ire); 353 return (NULL); 354 } 355 356 if (multirtp != NULL) 357 *multirtp = (ire->ire_flags & RTF_MULTIRT) != 0; 358 359 ill = ire_nexthop_ill(ire); 360 ire_refrele(ire); 361 return (ill); 362 } 363 364 /* 365 * Delete the passed in ire if the gateway addr matches 366 */ 367 void 368 ire_del_host_redir(ire_t *ire, char *gateway) 369 { 370 if ((ire->ire_flags & RTF_DYNAMIC) && 371 (ire->ire_gateway_addr == *(ipaddr_t *)gateway)) 372 ire_delete(ire); 373 } 374 375 /* 376 * Search for all IRE_HOST RTF_DYNAMIC (aka redirect) routes that are 377 * pointing at the specified gateway and 378 * delete them. This routine is called only 379 * when a default gateway is going away. 380 */ 381 void 382 ire_delete_host_redirects(ipaddr_t gateway, ip_stack_t *ipst) 383 { 384 struct rtfuncarg rtfarg; 385 386 bzero(&rtfarg, sizeof (rtfarg)); 387 rtfarg.rt_func = ire_del_host_redir; 388 rtfarg.rt_arg = (void *)&gateway; 389 rtfarg.rt_zoneid = ALL_ZONES; 390 rtfarg.rt_ipst = ipst; 391 (void) ipst->ips_ip_ftable->rnh_walktree_mt(ipst->ips_ip_ftable, 392 rtfunc, &rtfarg, irb_refhold_rn, irb_refrele_rn); 393 } 394 395 /* 396 * Obtain the rt_entry and rt_irb for the route to be added to 397 * the ips_ip_ftable. 398 * First attempt to add a node to the radix tree via rn_addroute. If the 399 * route already exists, return the bucket for the existing route. 
400 * 401 * Locking notes: Need to hold the global radix tree lock in write mode to 402 * add a radix node. To prevent the node from being deleted, ire_get_bucket() 403 * returns with a ref'ed irb_t. The ire itself is added in ire_add_v4() 404 * while holding the irb_lock, but not the radix tree lock. 405 */ 406 irb_t * 407 ire_get_bucket(ire_t *ire) 408 { 409 struct radix_node *rn; 410 struct rt_entry *rt; 411 struct rt_sockaddr rmask, rdst; 412 irb_t *irb = NULL; 413 ip_stack_t *ipst = ire->ire_ipst; 414 415 ASSERT(ipst->ips_ip_ftable != NULL); 416 417 /* first try to see if route exists (based on rtalloc1) */ 418 bzero(&rdst, sizeof (rdst)); 419 rdst.rt_sin_len = sizeof (rdst); 420 rdst.rt_sin_family = AF_INET; 421 rdst.rt_sin_addr.s_addr = ire->ire_addr; 422 423 bzero(&rmask, sizeof (rmask)); 424 rmask.rt_sin_len = sizeof (rmask); 425 rmask.rt_sin_family = AF_INET; 426 rmask.rt_sin_addr.s_addr = ire->ire_mask; 427 428 /* 429 * add the route. based on BSD's rtrequest1(RTM_ADD) 430 */ 431 R_Malloc(rt, rt_entry_cache, sizeof (*rt)); 432 /* kmem_alloc failed */ 433 if (rt == NULL) 434 return (NULL); 435 436 bzero(rt, sizeof (*rt)); 437 rt->rt_nodes->rn_key = (char *)&rt->rt_dst; 438 rt->rt_dst = rdst; 439 irb = &rt->rt_irb; 440 irb->irb_marks |= IRB_MARK_DYNAMIC; /* dynamically allocated/freed */ 441 irb->irb_ipst = ipst; 442 rw_init(&irb->irb_lock, NULL, RW_DEFAULT, NULL); 443 RADIX_NODE_HEAD_WLOCK(ipst->ips_ip_ftable); 444 rn = ipst->ips_ip_ftable->rnh_addaddr(&rt->rt_dst, &rmask, 445 ipst->ips_ip_ftable, (struct radix_node *)rt); 446 if (rn == NULL) { 447 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 448 Free(rt, rt_entry_cache); 449 rt = NULL; 450 irb = NULL; 451 RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); 452 rn = ipst->ips_ip_ftable->rnh_lookup(&rdst, &rmask, 453 ipst->ips_ip_ftable); 454 if (rn != NULL && ((rn->rn_flags & RNF_ROOT) == 0)) { 455 /* found a non-root match */ 456 rt = (struct rt_entry *)rn; 457 } 458 } 459 if (rt != NULL) { 460 irb = 
&rt->rt_irb; 461 irb_refhold(irb); 462 } 463 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 464 return (irb); 465 } 466 467 /* 468 * This function is used when the caller wants to know the outbound 469 * interface for a packet given only the address. 470 * If this is a offlink IP address and there are multiple 471 * routes to this destination, this routine will utilise the 472 * first route it finds to IP address 473 * Return values: 474 * 0 - FAILURE 475 * nonzero - ifindex 476 */ 477 uint_t 478 ifindex_lookup(const struct sockaddr *ipaddr, zoneid_t zoneid) 479 { 480 uint_t ifindex = 0; 481 ire_t *ire; 482 ill_t *ill; 483 netstack_t *ns; 484 ip_stack_t *ipst; 485 486 if (zoneid == ALL_ZONES) 487 ns = netstack_find_by_zoneid(GLOBAL_ZONEID); 488 else 489 ns = netstack_find_by_zoneid(zoneid); 490 ASSERT(ns != NULL); 491 492 /* 493 * For exclusive stacks we set the zoneid to zero 494 * since IP uses the global zoneid in the exclusive stacks. 495 */ 496 if (ns->netstack_stackid != GLOBAL_NETSTACKID) 497 zoneid = GLOBAL_ZONEID; 498 ipst = ns->netstack_ip; 499 500 ASSERT(ipaddr->sa_family == AF_INET || ipaddr->sa_family == AF_INET6); 501 502 if ((ire = route_to_dst(ipaddr, zoneid, ipst)) != NULL) { 503 ill = ire_nexthop_ill(ire); 504 if (ill != NULL) { 505 ifindex = ill->ill_phyint->phyint_ifindex; 506 ill_refrele(ill); 507 } 508 ire_refrele(ire); 509 } 510 netstack_rele(ns); 511 return (ifindex); 512 } 513 514 /* 515 * Routine to find the route to a destination. 
If a ifindex is supplied 516 * it tries to match the route to the corresponding ipif for the ifindex 517 */ 518 static ire_t * 519 route_to_dst(const struct sockaddr *dst_addr, zoneid_t zoneid, ip_stack_t *ipst) 520 { 521 ire_t *ire = NULL; 522 int match_flags; 523 524 match_flags = MATCH_IRE_DSTONLY; 525 526 /* XXX pass NULL tsl for now */ 527 528 if (dst_addr->sa_family == AF_INET) { 529 ire = ire_route_recursive_v4( 530 ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr, 0, NULL, 531 zoneid, NULL, match_flags, IRR_ALLOCATE, 0, ipst, NULL, 532 NULL, NULL); 533 } else { 534 ire = ire_route_recursive_v6( 535 &((struct sockaddr_in6 *)dst_addr)->sin6_addr, 0, NULL, 536 zoneid, NULL, match_flags, IRR_ALLOCATE, 0, ipst, NULL, 537 NULL, NULL); 538 } 539 ASSERT(ire != NULL); 540 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 541 ire_refrele(ire); 542 return (NULL); 543 } 544 return (ire); 545 } 546 547 /* 548 * This routine is called by IP Filter to send a packet out on the wire 549 * to a specified dstination (which may be onlink or offlink). The ifindex may 550 * or may not be 0. A non-null ifindex indicates IP Filter has stipulated 551 * an outgoing interface and requires the nexthop to be on that interface. 552 * IP WILL NOT DO the following to the data packet before sending it out: 553 * a. manipulate ttl 554 * b. ipsec work 555 * c. fragmentation 556 * 557 * If the packet has been prepared for hardware checksum then it will be 558 * passed off to ip_send_align_cksum() to check that the flags set on the 559 * packet are in alignment with the capabilities of the new outgoing NIC. 560 * 561 * Return values: 562 * 0: IP was able to send of the data pkt 563 * ECOMM: Could not send packet 564 * ENONET No route to dst. It is up to the caller 565 * to send icmp unreachable error message, 566 * EINPROGRESS The macaddr of the onlink dst or that 567 * of the offlink dst's nexthop needs to get 568 * resolved before packet can be sent to dst. 
569 * Thus transmission is not guaranteed. 570 * Note: No longer have visibility to the ARP queue 571 * hence no EINPROGRESS. 572 */ 573 int 574 ipfil_sendpkt(const struct sockaddr *dst_addr, mblk_t *mp, uint_t ifindex, 575 zoneid_t zoneid) 576 { 577 ipaddr_t nexthop; 578 netstack_t *ns; 579 ip_stack_t *ipst; 580 ip_xmit_attr_t ixas; 581 int error; 582 583 ASSERT(mp != NULL); 584 585 if (zoneid == ALL_ZONES) 586 ns = netstack_find_by_zoneid(GLOBAL_ZONEID); 587 else 588 ns = netstack_find_by_zoneid(zoneid); 589 ASSERT(ns != NULL); 590 591 /* 592 * For exclusive stacks we set the zoneid to zero 593 * since IP uses the global zoneid in the exclusive stacks. 594 */ 595 if (ns->netstack_stackid != GLOBAL_NETSTACKID) 596 zoneid = GLOBAL_ZONEID; 597 ipst = ns->netstack_ip; 598 599 ASSERT(dst_addr->sa_family == AF_INET || 600 dst_addr->sa_family == AF_INET6); 601 602 bzero(&ixas, sizeof (ixas)); 603 /* 604 * No IPsec, no fragmentation, and don't let any hooks see 605 * the packet. 606 */ 607 ixas.ixa_flags = IXAF_NO_IPSEC | IXAF_DONTFRAG | IXAF_NO_PFHOOK; 608 ixas.ixa_cred = kcred; 609 ixas.ixa_cpid = NOPID; 610 ixas.ixa_tsl = NULL; 611 ixas.ixa_ipst = ipst; 612 ixas.ixa_ifindex = ifindex; 613 614 if (dst_addr->sa_family == AF_INET) { 615 ipha_t *ipha = (ipha_t *)mp->b_rptr; 616 617 ixas.ixa_flags |= IXAF_IS_IPV4; 618 nexthop = ((struct sockaddr_in *)dst_addr)->sin_addr.s_addr; 619 if (nexthop != ipha->ipha_dst) { 620 ixas.ixa_flags |= IXAF_NEXTHOP_SET; 621 ixas.ixa_nexthop_v4 = nexthop; 622 } 623 ixas.ixa_multicast_ttl = ipha->ipha_ttl; 624 } else { 625 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 626 in6_addr_t *nexthop6; 627 628 nexthop6 = &((struct sockaddr_in6 *)dst_addr)->sin6_addr; 629 if (!IN6_ARE_ADDR_EQUAL(nexthop6, &ip6h->ip6_dst)) { 630 ixas.ixa_flags |= IXAF_NEXTHOP_SET; 631 ixas.ixa_nexthop_v6 = *nexthop6; 632 } 633 ixas.ixa_multicast_ttl = ip6h->ip6_hops; 634 } 635 error = ip_output_simple(mp, &ixas); 636 ixa_cleanup(&ixas); 637 638 netstack_rele(ns); 639 switch 
(error) { 640 case 0: 641 break; 642 643 case EHOSTUNREACH: 644 case ENETUNREACH: 645 error = ENONET; 646 break; 647 648 default: 649 error = ECOMM; 650 break; 651 } 652 return (error); 653 } 654 655 /* 656 * callback function provided by ire_ftable_lookup when calling 657 * rn_match_args(). Invoke ire_match_args on each matching leaf node in 658 * the radix tree. 659 */ 660 boolean_t 661 ire_find_best_route(struct radix_node *rn, void *arg) 662 { 663 struct rt_entry *rt = (struct rt_entry *)rn; 664 irb_t *irb_ptr; 665 ire_t *ire; 666 ire_ftable_args_t *margs = arg; 667 ipaddr_t match_mask; 668 669 ASSERT(rt != NULL); 670 671 irb_ptr = &rt->rt_irb; 672 673 if (irb_ptr->irb_ire_cnt == 0) 674 return (B_FALSE); 675 676 rw_enter(&irb_ptr->irb_lock, RW_READER); 677 for (ire = irb_ptr->irb_ire; ire != NULL; ire = ire->ire_next) { 678 if (IRE_IS_CONDEMNED(ire)) 679 continue; 680 ASSERT((margs->ift_flags & MATCH_IRE_SHORTERMASK) == 0); 681 if (margs->ift_flags & MATCH_IRE_MASK) 682 match_mask = margs->ift_mask; 683 else 684 match_mask = ire->ire_mask; 685 686 if (ire_match_args(ire, margs->ift_addr, match_mask, 687 margs->ift_gateway, margs->ift_type, margs->ift_ill, 688 margs->ift_zoneid, margs->ift_tsl, 689 margs->ift_flags)) { 690 ire_refhold(ire); 691 rw_exit(&irb_ptr->irb_lock); 692 margs->ift_best_ire = ire; 693 return (B_TRUE); 694 } 695 } 696 rw_exit(&irb_ptr->irb_lock); 697 return (B_FALSE); 698 } 699 700 /* 701 * ftable irb_t structures are dynamically allocated, and we need to 702 * check if the irb_t (and associated ftable tree attachment) needs to 703 * be cleaned up when the irb_refcnt goes to 0. The conditions that need 704 * be verified are: 705 * - no other walkers of the irebucket, i.e., quiescent irb_refcnt, 706 * - no other threads holding references to ire's in the bucket, 707 * i.e., irb_nire == 0 708 * - no active ire's in the bucket, i.e., irb_ire_cnt == 0 709 * - need to hold the global tree lock and irb_lock in write mode. 
710 */ 711 void 712 irb_refrele_ftable(irb_t *irb) 713 { 714 for (;;) { 715 rw_enter(&irb->irb_lock, RW_WRITER); 716 ASSERT(irb->irb_refcnt != 0); 717 if (irb->irb_refcnt != 1) { 718 /* 719 * Someone has a reference to this radix node 720 * or there is some bucket walker. 721 */ 722 irb->irb_refcnt--; 723 rw_exit(&irb->irb_lock); 724 return; 725 } else { 726 /* 727 * There is no other walker, nor is there any 728 * other thread that holds a direct ref to this 729 * radix node. Do the clean up if needed. Call 730 * to ire_unlink will clear the IRB_MARK_CONDEMNED flag 731 */ 732 if (irb->irb_marks & IRB_MARK_CONDEMNED) { 733 ire_t *ire_list; 734 735 ire_list = ire_unlink(irb); 736 rw_exit(&irb->irb_lock); 737 738 if (ire_list != NULL) 739 ire_cleanup(ire_list); 740 /* 741 * more CONDEMNED entries could have 742 * been added while we dropped the lock, 743 * so we have to re-check. 744 */ 745 continue; 746 } 747 748 /* 749 * Now check if there are still any ires 750 * associated with this radix node. 751 */ 752 if (irb->irb_nire != 0) { 753 /* 754 * someone is still holding on 755 * to ires in this bucket 756 */ 757 irb->irb_refcnt--; 758 rw_exit(&irb->irb_lock); 759 return; 760 } else { 761 /* 762 * Everything is clear. Zero walkers, 763 * Zero threads with a ref to this 764 * radix node, Zero ires associated with 765 * this radix node. Due to lock order, 766 * check the above conditions again 767 * after grabbing all locks in the right order 768 */ 769 rw_exit(&irb->irb_lock); 770 if (irb_inactive(irb)) 771 return; 772 /* 773 * irb_inactive could not free the irb. 774 * See if there are any walkers, if not 775 * try to clean up again. 776 */ 777 } 778 } 779 } 780 } 781 782 /* 783 * IRE iterator used by ire_ftable_lookup to process multiple equal 784 * routes. Given a starting point in the hash list (hash), walk the IREs 785 * in the bucket skipping deleted entries. We treat the bucket as a circular 786 * list for the purposes of walking it. 
787 * Returns the IRE (held) that corresponds to the hash value. If that IRE is 788 * not applicable (ire_match_args failed) then it returns a subsequent one. 789 * If we fail to find an IRE we return NULL. 790 * 791 * Assumes that the caller holds a reference on the IRE bucket and a read lock 792 * on the radix_node_head (for IPv4) or the ip6_ire_head (for IPv6). 793 * 794 * Applies to IPv4 and IPv6. 795 * 796 * For CGTP, where an IRE_BROADCAST and IRE_HOST can exist for the same 797 * address and bucket, we compare against ire_type for the orig_ire. We also 798 * have IRE_BROADCASTs with and without RTF_MULTIRT, with the former being 799 * first in the bucket. Thus we compare that RTF_MULTIRT match the orig_ire. 800 * 801 * Due to shared-IP zones we check that an IRE_OFFLINK has a gateway that is 802 * reachable from the zone i.e., that the ire_gateway_addr is in a subnet 803 * in which the zone has an IP address. We check this for the global zone 804 * even if no shared-IP zones are configured. 805 */ 806 ire_t * 807 ire_round_robin(irb_t *irb_ptr, ire_ftable_args_t *margs, uint_t hash, 808 ire_t *orig_ire, ip_stack_t *ipst) 809 { 810 ire_t *ire, *maybe_ire = NULL; 811 uint_t maybe_badcnt = 0; 812 uint_t maxwalk; 813 814 /* Fold in more bits from the hint/hash */ 815 hash = hash ^ (hash >> 8) ^ (hash >> 16); 816 817 rw_enter(&irb_ptr->irb_lock, RW_WRITER); 818 maxwalk = irb_ptr->irb_ire_cnt; /* Excludes condemned */ 819 if (maxwalk == 0) { 820 rw_exit(&irb_ptr->irb_lock); 821 return (NULL); 822 } 823 824 hash %= maxwalk; 825 irb_refhold_locked(irb_ptr); 826 rw_exit(&irb_ptr->irb_lock); 827 828 /* 829 * Round-robin the routers list looking for a route that 830 * matches the passed in parameters. 831 * First we skip "hash" number of non-condemned IREs. 832 * Then we match the IRE. 833 * If we find an ire which has a non-zero ire_badcnt then we remember 834 * it and keep on looking for a lower ire_badcnt. 
835 * If we come to the end of the list we continue (treat the 836 * bucket list as a circular list) but we match less than "max" 837 * entries. 838 */ 839 ire = irb_ptr->irb_ire; 840 while (maxwalk > 0) { 841 if (IRE_IS_CONDEMNED(ire)) 842 goto next_ire_skip; 843 844 /* Skip the first "hash" entries to do ECMP */ 845 if (hash != 0) { 846 hash--; 847 goto next_ire_skip; 848 } 849 850 /* See CGTP comment above */ 851 if (ire->ire_type != orig_ire->ire_type || 852 ((ire->ire_flags ^ orig_ire->ire_flags) & RTF_MULTIRT) != 0) 853 goto next_ire; 854 855 /* 856 * Note: Since IPv6 has hash buckets instead of radix 857 * buckers we need to explicitly compare the addresses. 858 * That makes this less efficient since we will be called 859 * even if there is no alternatives just because the 860 * bucket has multiple IREs for different addresses. 861 */ 862 if (ire->ire_ipversion == IPV6_VERSION) { 863 if (!IN6_ARE_ADDR_EQUAL(&orig_ire->ire_addr_v6, 864 &ire->ire_addr_v6)) 865 goto next_ire; 866 } 867 868 /* 869 * For some reason find_best_route uses ire_mask. We do 870 * the same. 871 */ 872 if (ire->ire_ipversion == IPV4_VERSION ? 873 !ire_match_args(ire, margs->ift_addr, 874 ire->ire_mask, margs->ift_gateway, 875 margs->ift_type, margs->ift_ill, margs->ift_zoneid, 876 margs->ift_tsl, margs->ift_flags) : 877 !ire_match_args_v6(ire, &margs->ift_addr_v6, 878 &ire->ire_mask_v6, &margs->ift_gateway_v6, 879 margs->ift_type, margs->ift_ill, margs->ift_zoneid, 880 margs->ift_tsl, margs->ift_flags)) 881 goto next_ire; 882 883 if (margs->ift_zoneid != ALL_ZONES && 884 (ire->ire_type & IRE_OFFLINK)) { 885 /* 886 * When we're in a zone, we're only 887 * interested in routers that are 888 * reachable through ipifs within our zone. 
889 */ 890 if (ire->ire_ipversion == IPV4_VERSION) { 891 if (!ire_gateway_ok_zone_v4( 892 ire->ire_gateway_addr, margs->ift_zoneid, 893 ire->ire_ill, margs->ift_tsl, ipst, 894 B_TRUE)) 895 goto next_ire; 896 } else { 897 if (!ire_gateway_ok_zone_v6( 898 &ire->ire_gateway_addr_v6, 899 margs->ift_zoneid, ire->ire_ill, 900 margs->ift_tsl, ipst, B_TRUE)) 901 goto next_ire; 902 } 903 } 904 mutex_enter(&ire->ire_lock); 905 /* Look for stale ire_badcnt and clear */ 906 if (ire->ire_badcnt != 0 && 907 (TICK_TO_SEC(ddi_get_lbolt64()) - ire->ire_last_badcnt > 908 ipst->ips_ip_ire_badcnt_lifetime)) 909 ire->ire_badcnt = 0; 910 mutex_exit(&ire->ire_lock); 911 912 if (ire->ire_badcnt == 0) { 913 /* We found one with a zero badcnt; done */ 914 ire_refhold(ire); 915 /* 916 * Care needed since irb_refrele grabs WLOCK to free 917 * the irb_t. 918 */ 919 if (ire->ire_ipversion == IPV4_VERSION) { 920 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 921 irb_refrele(irb_ptr); 922 RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); 923 } else { 924 rw_exit(&ipst->ips_ip6_ire_head_lock); 925 irb_refrele(irb_ptr); 926 rw_enter(&ipst->ips_ip6_ire_head_lock, 927 RW_READER); 928 } 929 return (ire); 930 } 931 /* 932 * keep looking to see if there is a better (lower 933 * badcnt) matching IRE, but save this one as a last resort. 934 * If we find a lower badcnt pick that one as the last* resort. 935 */ 936 if (maybe_ire == NULL) { 937 maybe_ire = ire; 938 maybe_badcnt = ire->ire_badcnt; 939 } else if (ire->ire_badcnt < maybe_badcnt) { 940 maybe_ire = ire; 941 maybe_badcnt = ire->ire_badcnt; 942 } 943 944 next_ire: 945 maxwalk--; 946 next_ire_skip: 947 ire = ire->ire_next; 948 if (ire == NULL) 949 ire = irb_ptr->irb_ire; 950 } 951 if (maybe_ire != NULL) 952 ire_refhold(maybe_ire); 953 954 /* Care needed since irb_refrele grabs WLOCK to free the irb_t. 
*/ 955 if (ire->ire_ipversion == IPV4_VERSION) { 956 RADIX_NODE_HEAD_UNLOCK(ipst->ips_ip_ftable); 957 irb_refrele(irb_ptr); 958 RADIX_NODE_HEAD_RLOCK(ipst->ips_ip_ftable); 959 } else { 960 rw_exit(&ipst->ips_ip6_ire_head_lock); 961 irb_refrele(irb_ptr); 962 rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER); 963 } 964 return (maybe_ire); 965 } 966 967 void 968 irb_refhold_rn(struct radix_node *rn) 969 { 970 if ((rn->rn_flags & RNF_ROOT) == 0) 971 irb_refhold(&((rt_t *)(rn))->rt_irb); 972 } 973 974 void 975 irb_refrele_rn(struct radix_node *rn) 976 { 977 if ((rn->rn_flags & RNF_ROOT) == 0) 978 irb_refrele_ftable(&((rt_t *)(rn))->rt_irb); 979 } 980 981 982 /* 983 * ip_select_src_ill() is used by ip_select_route() to find the src_ill 984 * to be used for source-aware routing table lookup. This function will 985 * ignore IPIF_UNNUMBERED interface addresses, and will only return a 986 * numbered interface (ipif_lookup_addr_nondup() will ignore UNNUMBERED 987 * interfaces). 988 */ 989 static ill_t * 990 ip_select_src_ill(const in6_addr_t *v6src, zoneid_t zoneid, ip_stack_t *ipst) 991 { 992 ipif_t *ipif; 993 ill_t *ill; 994 boolean_t isv6 = !IN6_IS_ADDR_V4MAPPED(v6src); 995 ipaddr_t v4src; 996 997 if (isv6) { 998 ipif = ipif_lookup_addr_nondup_v6(v6src, NULL, zoneid, ipst); 999 } else { 1000 IN6_V4MAPPED_TO_IPADDR(v6src, v4src); 1001 ipif = ipif_lookup_addr_nondup(v4src, NULL, zoneid, ipst); 1002 } 1003 if (ipif == NULL) 1004 return (NULL); 1005 ill = ipif->ipif_ill; 1006 ill_refhold(ill); 1007 ipif_refrele(ipif); 1008 return (ill); 1009 } 1010 1011 /* 1012 * verify that v6src is configured on ill 1013 */ 1014 static boolean_t 1015 ip_verify_src_on_ill(const in6_addr_t v6src, ill_t *ill, zoneid_t zoneid) 1016 { 1017 ipif_t *ipif; 1018 ip_stack_t *ipst; 1019 ipaddr_t v4src; 1020 1021 if (ill == NULL) 1022 return (B_FALSE); 1023 ipst = ill->ill_ipst; 1024 1025 if (ill->ill_isv6) { 1026 ipif = ipif_lookup_addr_nondup_v6(&v6src, ill, zoneid, ipst); 1027 } else { 1028 
IN6_V4MAPPED_TO_IPADDR(&v6src, v4src); 1029 ipif = ipif_lookup_addr_nondup(v4src, ill, zoneid, ipst); 1030 } 1031 1032 if (ipif != NULL) { 1033 ipif_refrele(ipif); 1034 return (B_TRUE); 1035 } else { 1036 return (B_FALSE); 1037 } 1038 } 1039 1040 /* 1041 * Select a route for IPv4 and IPv6. Except for multicast, loopback and reject 1042 * routes this routine sets up a ire_nce_cache as well. The caller needs to 1043 * lookup an nce for the multicast case. 1044 * 1045 * When src_multihoming is set to 2 (strict src multihoming) we use the source 1046 * address to select the interface and route. If IP_BOUND_IF etc are 1047 * specified, we require that they specify an interface on which the 1048 * source address is assigned. 1049 * 1050 * When src_multihoming is set to 1 (preferred src aware route 1051 * selection) the unicast lookup prefers a matching source 1052 * (i.e., that the route points out an ill on which the source is assigned), but 1053 * if no such route is found we fallback to not considering the source in the 1054 * route lookup. 1055 * 1056 * We skip the src_multihoming check when the source isn't (yet) set, and 1057 * when IXAF_VERIFY_SOURCE is not set. The latter allows RAW sockets to send 1058 * with bogus source addresses as allowed by IP_HDRINCL and IPV6_PKTINFO 1059 * when secpolicy_net_rawaccess(). 
*/
/*
 * Arguments, as used by the code below:
 *   v6dst	- destination; read as a v4-mapped address when IXAF_IS_IPV4
 *		  is set in ixa->ixa_flags.
 *   v6src	- source address, consulted only for the src_multihoming
 *		  checks (see verify_src).
 *   ixa	- transmit attributes; supplies the flags, the ifindex /
 *		  scopeid / multicast ifindex, zoneid, label, xmit hint,
 *		  IP_NEXTHOP address and the ip_stack_t.
 *   generationp - if non-NULL, receives ire_generation in the multicast
 *		  case; handed to ire_route_recursive_v4/_v6 otherwise.
 *   setsrcp	- if non-NULL it must be unspecified on entry (ASSERT); it is
 *		  handed to the lookups (as a v4-mapped address for IPv4).
 *   errorp	- if non-NULL it must be 0 on entry (ASSERT). Set to ENXIO
 *		  when no usable ill is found, EHOSTUNREACH for a multicast
 *		  ill without ILLF_MULTICAST or a reject/blackhole
 *		  IRE_MULTICAST, and EADDRNOTAVAIL when the source does not
 *		  match the selected ill or no ill has the source.
 *		  It is not set here when the unicast recursive lookup
 *		  returns a reject/blackhole ire.
 *   multirtp	- passed to ill_lookup_group_v4/_v6 in the multicast case
 *		  only.
 *
 * Every failure detected directly in this function returns the ire from
 * ire_reject() rather than NULL.
 */
ire_t *
ip_select_route(const in6_addr_t *v6dst, const in6_addr_t v6src,
    ip_xmit_attr_t *ixa, uint_t *generationp, in6_addr_t *setsrcp,
    int *errorp, boolean_t *multirtp)
{
	uint_t		match_args;
	uint_t		ire_type;
	ill_t		*ill = NULL;
	ire_t		*ire;
	ip_stack_t	*ipst = ixa->ixa_ipst;
	ipaddr_t	v4dst;
	in6_addr_t	v6nexthop;
	iaflags_t	ixaflags = ixa->ixa_flags;
	nce_t		*nce;
	boolean_t	preferred_src_aware = B_FALSE;
	boolean_t	verify_src;
	boolean_t	isv6 = !(ixa->ixa_flags & IXAF_IS_IPV4);
	int		src_multihoming = IP_SRC_MULTIHOMING(isv6, ipst);

	/*
	 * We only verify that the src has been configured on a selected
	 * interface if the src is not :: or INADDR_ANY, and if the
	 * IXAF_VERIFY_SOURCE flag is set.
	 */
	verify_src = (!V6_OR_V4_INADDR_ANY(v6src) &&
	    (ixa->ixa_flags & IXAF_VERIFY_SOURCE));

	match_args = MATCH_IRE_SECATTR;
	IN6_V4MAPPED_TO_IPADDR(v6dst, v4dst);
	if (setsrcp != NULL)
		ASSERT(IN6_IS_ADDR_UNSPECIFIED(setsrcp));
	if (errorp != NULL)
		ASSERT(*errorp == 0);

	/*
	 * The content of the ixa will be different if IP_NEXTHOP,
	 * SO_DONTROUTE, IP_BOUND_IF, IP_PKTINFO etc are set
	 */

	if (isv6 ? IN6_IS_ADDR_MULTICAST(v6dst) : CLASSD(v4dst)) {
		/*
		 * Pick up the IRE_MULTICAST for the ill. The ill is chosen
		 * from, in order: ixa_multicast_ifindex, the scope id,
		 * ixa_ifindex, the source address (when src_multihoming is
		 * enabled and the source is to be verified), and finally
		 * ill_lookup_group_v4/_v6.
		 */
		if (ixa->ixa_multicast_ifindex != 0) {
			ill = ill_lookup_on_ifindex(ixa->ixa_multicast_ifindex,
			    isv6, ipst);
		} else if (ixaflags & IXAF_SCOPEID_SET) {
			/* sin6_scope_id takes precedence over ixa_ifindex */
			ASSERT(ixa->ixa_scopeid != 0);
			ill = ill_lookup_on_ifindex(ixa->ixa_scopeid,
			    isv6, ipst);
		} else if (ixa->ixa_ifindex != 0) {
			/*
			 * In the ipmp case, the ixa_ifindex is set to
			 * point at an under_ill and we would return the
			 * ire_multicast() corresponding to that under_ill.
			 */
			ill = ill_lookup_on_ifindex(ixa->ixa_ifindex,
			    isv6, ipst);
		} else if (src_multihoming != 0 && verify_src) {
			/* Look up the ill based on the source address */
			ill = ip_select_src_ill(&v6src, ixa->ixa_zoneid, ipst);
			/*
			 * Since we looked up the ill from the source there
			 * is no need to verify that the source is on the ill
			 * below.
			 */
			verify_src = B_FALSE;
			if (ill != NULL && IS_VNI(ill)) {
				/*
				 * Swap the VNI ill for the result of
				 * ill_lookup_usesrc() and drop the hold on
				 * the VNI ill.
				 */
				ill_t *usesrc = ill;

				ill = ill_lookup_usesrc(usesrc);
				ill_refrele(usesrc);
			}
		} else if (!isv6) {
			ipaddr_t	v4setsrc = INADDR_ANY;

			ill = ill_lookup_group_v4(v4dst, ixa->ixa_zoneid,
			    ipst, multirtp, &v4setsrc);
			if (setsrcp != NULL)
				IN6_IPADDR_TO_V4MAPPED(v4setsrc, setsrcp);
		} else {
			ill = ill_lookup_group_v6(v6dst, ixa->ixa_zoneid,
			    ipst, multirtp, setsrcp);
		}
		/* A VNI ill is treated the same as finding no ill at all */
		if (ill != NULL && IS_VNI(ill)) {
			ill_refrele(ill);
			ill = NULL;
		}
		if (ill == NULL) {
			if (errorp != NULL)
				*errorp = ENXIO;
			/* Get a hold on the IRE_NOROUTE */
			ire = ire_reject(ipst, isv6);
			return (ire);
		}
		if (!(ill->ill_flags & ILLF_MULTICAST)) {
			ill_refrele(ill);
			if (errorp != NULL)
				*errorp = EHOSTUNREACH;
			/* Get a hold on the IRE_NOROUTE */
			ire = ire_reject(ipst, isv6);
			return (ire);
		}
		/*
		 * If we are doing the strictest src_multihoming, then
		 * we check that IP_MULTICAST_IF, IP_BOUND_IF, etc specify
		 * an interface that is consistent with the source address.
		 */
		if (verify_src && src_multihoming == 2 &&
		    !ip_verify_src_on_ill(v6src, ill, ixa->ixa_zoneid)) {
			if (errorp != NULL)
				*errorp = EADDRNOTAVAIL;
			ill_refrele(ill);
			/* Get a hold on the IRE_NOROUTE */
			ire = ire_reject(ipst, isv6);
			return (ire);
		}
		/* Get a refcnt on the single IRE_MULTICAST per ill */
		ire = ire_multicast(ill);
		ill_refrele(ill);
		if (generationp != NULL)
			*generationp = ire->ire_generation;
		if (errorp != NULL &&
		    (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))) {
			*errorp = EHOSTUNREACH;
		}
		return (ire);
	}

	/* Now for unicast and broadcast */
	if (ixa->ixa_ifindex != 0 || (ixaflags & IXAF_SCOPEID_SET)) {
		if (ixaflags & IXAF_SCOPEID_SET) {
			/* sin6_scope_id takes precedence over ixa_ifindex */
			ASSERT(ixa->ixa_scopeid != 0);
			ill = ill_lookup_on_ifindex(ixa->ixa_scopeid,
			    isv6, ipst);
		} else {
			ASSERT(ixa->ixa_ifindex != 0);
			ill = ill_lookup_on_ifindex(ixa->ixa_ifindex,
			    isv6, ipst);
		}
		/* A VNI ill is treated the same as finding no ill at all */
		if (ill != NULL && IS_VNI(ill)) {
			ill_refrele(ill);
			ill = NULL;
		}
		if (ill == NULL) {
			if (errorp != NULL)
				*errorp = ENXIO;
			/* Get a hold on the IRE_NOROUTE */
			ire = ire_reject(ipst, isv6);
			return (ire);
		}

		match_args |= MATCH_IRE_ILL;

		/*
		 * icmp_send_reply_v6 uses scopeid, and mpathd sets IP*_BOUND_IF
		 * so for both of them we need to be able look for an under
		 * interface.
		 */
		if (IS_UNDER_IPMP(ill))
			match_args |= MATCH_IRE_TESTHIDDEN;

		/*
		 * If we are doing the strictest src_multihoming, then
		 * we check that IP_BOUND_IF, IP_PKTINFO, etc specify
		 * an interface that is consistent with the source address.
		 */
		if (verify_src && src_multihoming == 2 &&
		    !ip_verify_src_on_ill(v6src, ill, ixa->ixa_zoneid)) {
			if (errorp != NULL)
				*errorp = EADDRNOTAVAIL;
			ill_refrele(ill);
			/* Get a hold on the IRE_NOROUTE */
			ire = ire_reject(ipst, isv6);
			return (ire);
		}
	} else if (src_multihoming != 0 && verify_src) {
		/* Look up the ill based on the source address */
		ill = ip_select_src_ill(&v6src, ixa->ixa_zoneid, ipst);
		if (ill == NULL) {
			char addrbuf[INET6_ADDRSTRLEN];

			ip3dbg(("%s not a valid src for unicast",
			    inet_ntop(AF_INET6, &v6src, addrbuf,
			    sizeof (addrbuf))));
			if (errorp != NULL)
				*errorp = EADDRNOTAVAIL;
			/* Get a hold on the IRE_NOROUTE */
			ire = ire_reject(ipst, isv6);
			return (ire);
		}
		match_args |= MATCH_IRE_SRC_ILL;
		/*
		 * With src_multihoming == 1 a failed source-aware lookup is
		 * retried below without MATCH_IRE_SRC_ILL.
		 */
		preferred_src_aware = (src_multihoming == 1);
	}

	if (ixaflags & IXAF_NEXTHOP_SET) {
		/* IP_NEXTHOP was set */
		v6nexthop = ixa->ixa_nexthop_v6;
	} else {
		v6nexthop = *v6dst;
	}

	ire_type = 0;

	/*
	 * If SO_DONTROUTE is set or if IP_NEXTHOP is set, then
	 * we only look for an onlink IRE.
	 */
	if (ixaflags & (IXAF_DONTROUTE|IXAF_NEXTHOP_SET)) {
		match_args |= MATCH_IRE_TYPE;
		ire_type = IRE_ONLINK;
	}

	/*
	 * Re-entered at most once, from the preferred_src_aware fallback
	 * below. On that second pass ill is NULL because the hold is
	 * released (and the pointer cleared) right after the first lookup.
	 */
retry:
	if (!isv6) {
		ipaddr_t	v4nexthop;
		ipaddr_t	v4setsrc = INADDR_ANY;

		IN6_V4MAPPED_TO_IPADDR(&v6nexthop, v4nexthop);
		ire = ire_route_recursive_v4(v4nexthop, ire_type, ill,
		    ixa->ixa_zoneid, ixa->ixa_tsl, match_args, IRR_ALLOCATE,
		    ixa->ixa_xmit_hint, ipst, &v4setsrc, NULL, generationp);
		if (setsrcp != NULL)
			IN6_IPADDR_TO_V4MAPPED(v4setsrc, setsrcp);
	} else {
		ire = ire_route_recursive_v6(&v6nexthop, ire_type, ill,
		    ixa->ixa_zoneid, ixa->ixa_tsl, match_args, IRR_ALLOCATE,
		    ixa->ixa_xmit_hint, ipst, setsrcp, NULL, generationp);
	}

#ifdef DEBUG
	/*
	 * NOTE(review): v4dst is extracted from v6dst unconditionally above,
	 * so for an IPv6 destination the value printed here is presumably
	 * not a meaningful address - confirm before relying on this output.
	 */
	if (match_args & MATCH_IRE_TESTHIDDEN) {
		ip3dbg(("looking for hidden; dst %x ire %p\n",
		    v4dst, (void *)ire));
	}
#endif
	/* The ill was only needed as a lookup argument; drop our hold */
	if (ill != NULL) {
		ill_refrele(ill);
		ill = NULL;
	}
	if ((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
	    (ire->ire_type & IRE_MULTICAST)) {
		if (preferred_src_aware) {
			/*
			 * "Preferred Source Aware" send mode. If we cannot
			 * find an ire whose ire_ill had the desired source
			 * address retry after relaxing the ill matching
			 * constraint.
			 */
			ire_refrele(ire);
			preferred_src_aware = B_FALSE;
			match_args &= ~MATCH_IRE_SRC_ILL;
			goto retry;
		}
		/* No ire_nce_cache */
		return (ire);
	}

	/* Setup ire_nce_cache if it doesn't exist or is condemned. */
	mutex_enter(&ire->ire_lock);
	nce = ire->ire_nce_cache;
	if (nce == NULL || nce->nce_is_condemned) {
		mutex_exit(&ire->ire_lock);
		/* The result is deliberately ignored; the ire is returned */
		(void) ire_revalidate_nce(ire);
	} else {
		mutex_exit(&ire->ire_lock);
	}
	return (ire);
}

/*
 * Find a route given some xmit attributes and a packet.
 * Generic for IPv4 and IPv6
 *
 * This never returns NULL.
But when it returns the IRE_NOROUTE 1336 * it might set errorp. 1337 */ 1338 ire_t * 1339 ip_select_route_pkt(mblk_t *mp, ip_xmit_attr_t *ixa, uint_t *generationp, 1340 int *errorp, boolean_t *multirtp) 1341 { 1342 if (ixa->ixa_flags & IXAF_IS_IPV4) { 1343 ipha_t *ipha = (ipha_t *)mp->b_rptr; 1344 in6_addr_t v6dst, v6src; 1345 1346 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_dst, &v6dst); 1347 IN6_IPADDR_TO_V4MAPPED(ipha->ipha_src, &v6src); 1348 1349 return (ip_select_route(&v6dst, v6src, ixa, generationp, 1350 NULL, errorp, multirtp)); 1351 } else { 1352 ip6_t *ip6h = (ip6_t *)mp->b_rptr; 1353 1354 return (ip_select_route(&ip6h->ip6_dst, ip6h->ip6_src, 1355 ixa, generationp, NULL, errorp, multirtp)); 1356 } 1357 } 1358 1359 ire_t * 1360 ip_select_route_v4(ipaddr_t dst, ipaddr_t src, ip_xmit_attr_t *ixa, 1361 uint_t *generationp, ipaddr_t *v4setsrcp, int *errorp, boolean_t *multirtp) 1362 { 1363 in6_addr_t v6dst, v6src; 1364 ire_t *ire; 1365 in6_addr_t setsrc; 1366 1367 ASSERT(ixa->ixa_flags & IXAF_IS_IPV4); 1368 1369 IN6_IPADDR_TO_V4MAPPED(dst, &v6dst); 1370 IN6_IPADDR_TO_V4MAPPED(src, &v6src); 1371 1372 setsrc = ipv6_all_zeros; 1373 ire = ip_select_route(&v6dst, v6src, ixa, generationp, &setsrc, errorp, 1374 multirtp); 1375 if (v4setsrcp != NULL) 1376 IN6_V4MAPPED_TO_IPADDR(&setsrc, *v4setsrcp); 1377 return (ire); 1378 } 1379 1380 /* 1381 * Recursively look for a route to the destination. Can also match on 1382 * the zoneid, ill, and label. Used for the data paths. See also 1383 * ire_route_recursive. 1384 * 1385 * If IRR_ALLOCATE is not set then we will only inspect the existing IREs; never 1386 * create an IRE_IF_CLONE. This is used on the receive side when we are not 1387 * forwarding. 1388 * If IRR_INCOMPLETE is set then we return the IRE even if we can't correctly 1389 * resolve the gateway. 1390 * 1391 * Note that this function never returns NULL. It returns an IRE_NOROUTE 1392 * instead. 
*
 * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it
 * is an error.
 * Allow at most one RTF_INDIRECT.
 *
 * If the ire argument is non-NULL it is used in place of the first
 * ire_ftable_lookup_v4() result (an extra hold is taken on it here); its
 * generation is then read from *generationp when generationp is non-NULL
 * and taken to be IRE_GENERATION_VERIFY otherwise.
 * setsrcp and gwattrp, when non-NULL, must point at INADDR_ANY and NULL
 * respectively on entry (ASSERT) and receive the first RTF_SETSRC address
 * and the first ire_gw_secattr seen along the chain.
 * generationp, when non-NULL, is written on return; it is set to
 * IRE_GENERATION_VERIFY on every path that leaves through "cleanup".
 */
ire_t *
ire_route_recursive_impl_v4(ire_t *ire,
    ipaddr_t nexthop, uint_t ire_type, const ill_t *ill_arg,
    zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args,
    uint_t irr_flags, uint32_t xmit_hint, ip_stack_t *ipst, ipaddr_t *setsrcp,
    tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp)
{
	int		i, j;
	ire_t		*ires[MAX_IRE_RECURSION];
	uint_t		generation;
	uint_t		generations[MAX_IRE_RECURSION];
	boolean_t	need_refrele = B_FALSE;
	boolean_t	invalidate = B_FALSE;
	ill_t		*ill = NULL;
	uint_t		maskoff = (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST);

	if (setsrcp != NULL)
		ASSERT(*setsrcp == INADDR_ANY);
	if (gwattrp != NULL)
		ASSERT(*gwattrp == NULL);

	/*
	 * We iterate up to three times to resolve a route, even though
	 * we have four slots in the array. The extra slot is for an
	 * IRE_IF_CLONE we might need to create.
	 */
	i = 0;
	while (i < MAX_IRE_RECURSION - 1) {
		/* ire_ftable_lookup handles round-robin/ECMP */
		if (ire == NULL) {
			ire = ire_ftable_lookup_v4(nexthop, 0, 0, ire_type,
			    (ill != NULL? ill : ill_arg), zoneid, tsl,
			    match_args, xmit_hint, ipst, &generation);
		} else {
			/* Caller passed it; extra hold since we will rele */
			ire_refhold(ire);
			if (generationp != NULL)
				generation = *generationp;
			else
				generation = IRE_GENERATION_VERIFY;
		}
		if (ire == NULL) {
			/*
			 * With IRR_INCOMPLETE and at least one IRE already
			 * collected, return the first one (with its own
			 * hold, since cleanup releases ires[]).
			 */
			if (i > 0 && (irr_flags & IRR_INCOMPLETE)) {
				ire = ires[0];
				ire_refhold(ire);
			} else {
				ire = ire_reject(ipst, B_FALSE);
			}
			goto error;
		}

		/* Need to return the ire with RTF_REJECT|BLACKHOLE */
		if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE))
			goto error;

		ASSERT(!(ire->ire_type & IRE_MULTICAST)); /* Not in ftable */
		/*
		 * Verify that the IRE_IF_CLONE has a consistent generation
		 * number. On failure the lookup is redone: ire is NULL and
		 * i has not been advanced when the loop continues.
		 */
		if ((ire->ire_type & IRE_IF_CLONE) && !ire_clone_verify(ire)) {
			ire_refrele(ire);
			ire = NULL;
			continue;
		}

		/*
		 * Don't allow anything unusual past the first iteration.
		 * After the first lookup, we should no longer look for
		 * (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST) or RTF_INDIRECT
		 * routes.
		 *
		 * In addition, after we have found a direct IRE_OFFLINK,
		 * we should only look for interface or clone routes.
		 */
		match_args |= MATCH_IRE_DIRECT;	/* no more RTF_INDIRECTs */

		if ((ire->ire_type & IRE_OFFLINK) &&
		    !(ire->ire_flags & RTF_INDIRECT)) {
			ire_type = IRE_IF_ALL;
		} else {
			/*
			 * no more local, loopback, broadcast routes
			 */
			if (!(match_args & MATCH_IRE_TYPE))
				ire_type = (IRE_OFFLINK|IRE_ONLINK);
			ire_type &= ~maskoff;
		}
		match_args |= MATCH_IRE_TYPE;

		/* We have a usable IRE */
		ires[i] = ire;
		generations[i] = generation;
		i++;

		/* The first RTF_SETSRC address is passed back if setsrcp */
		if ((ire->ire_flags & RTF_SETSRC) &&
		    setsrcp != NULL && *setsrcp == INADDR_ANY) {
			ASSERT(ire->ire_setsrc_addr != INADDR_ANY);
			*setsrcp = ire->ire_setsrc_addr;
		}

		/* The first ire_gw_secattr is passed back if gwattrp */
		if (ire->ire_gw_secattr != NULL &&
		    gwattrp != NULL && *gwattrp == NULL)
			*gwattrp = ire->ire_gw_secattr;

		/*
		 * Check if we have a short-cut pointer to an IRE for this
		 * destination, and that the cached dependency isn't stale.
		 * In that case we've rejoined an existing tree towards a
		 * parent, thus we don't need to continue the loop to
		 * discover the rest of the tree.
		 */
		mutex_enter(&ire->ire_lock);
		if (ire->ire_dep_parent != NULL &&
		    ire->ire_dep_parent->ire_generation ==
		    ire->ire_dep_parent_generation) {
			mutex_exit(&ire->ire_lock);
			ire = NULL;
			goto done;
		}
		mutex_exit(&ire->ire_lock);

		/*
		 * If this type should have an ire_nce_cache (even if it
		 * doesn't yet have one) then we are done. Includes
		 * IRE_INTERFACE with a full 32 bit mask.
		 */
		if (ire->ire_nce_capable) {
			ire = NULL;
			goto done;
		}
		ASSERT(!(ire->ire_type & IRE_IF_CLONE));
		/*
		 * For an IRE_INTERFACE we create an IRE_IF_CLONE for this
		 * particular destination
		 */
		if (ire->ire_type & IRE_INTERFACE) {
			in6_addr_t	v6nexthop;
			ire_t		*clone;

			ASSERT(ire->ire_masklen != IPV4_ABITS);

			/*
			 * In the case of ip_input and ILLF_FORWARDING not
			 * being set, and in the case of RTM_GET, there is
			 * no point in allocating an IRE_IF_CLONE. We return
			 * the IRE_INTERFACE. Note that !IRR_ALLOCATE can
			 * result in a ire_dep_parent which is IRE_IF_*
			 * without an IRE_IF_CLONE.
			 * We recover from that when we need to send packets
			 * by ensuring that the generations become
			 * IRE_GENERATION_VERIFY in this case.
			 */
			if (!(irr_flags & IRR_ALLOCATE)) {
				invalidate = B_TRUE;
				ire = NULL;
				goto done;
			}

			IN6_IPADDR_TO_V4MAPPED(nexthop, &v6nexthop);

			clone = ire_create_if_clone(ire, &v6nexthop,
			    &generation);
			if (clone == NULL) {
				/*
				 * Temporary failure - no memory.
				 * Don't want caller to cache IRE_NOROUTE.
				 */
				invalidate = B_TRUE;
				ire = ire_blackhole(ipst, B_FALSE);
				goto error;
			}
			/*
			 * Make clone next to last entry and the
			 * IRE_INTERFACE the last in the dependency
			 * chain since the clone depends on the
			 * IRE_INTERFACE.
			 */
			ASSERT(i >= 1);
			ASSERT(i < MAX_IRE_RECURSION);

			ires[i] = ires[i-1];
			generations[i] = generations[i-1];
			ires[i-1] = clone;
			generations[i-1] = generation;
			i++;

			ire = NULL;
			goto done;
		}

		/*
		 * We only match on the type and optionally ILL when
		 * recursing. The type match is used by some callers
		 * to exclude certain types (such as IRE_IF_CLONE or
		 * IRE_LOCAL|IRE_LOOPBACK).
		 *
		 * In the MATCH_IRE_SRC_ILL case, ill_arg may be the 'srcof'
		 * ire->ire_ill, and we want to find the IRE_INTERFACE for
		 * ire_ill, so we set ill to the ire_ill;
		 */
		match_args &= (MATCH_IRE_TYPE | MATCH_IRE_DIRECT);
		nexthop = ire->ire_gateway_addr;
		if (ill == NULL && ire->ire_ill != NULL) {
			ill = ire->ire_ill;
			need_refrele = B_TRUE;
			ill_refhold(ill);
			match_args |= MATCH_IRE_ILL;
		}
		/* ire stays referenced through ires[]; force a new lookup */
		ire = NULL;
	}
	/* Ran out of iterations without completing the chain */
	ASSERT(ire == NULL);
	ire = ire_reject(ipst, B_FALSE);

	/*
	 * Failure exit: ire is what gets returned; everything collected in
	 * ires[0..i-1] is torn down and released in cleanup.
	 */
error:
	ASSERT(ire != NULL);
	if (need_refrele)
		ill_refrele(ill);

	/*
	 * In the case of MULTIRT we want to try a different IRE the next
	 * time. We let the next packet retry in that case.
	 */
	if (i > 0 && (ires[0]->ire_flags & RTF_MULTIRT))
		(void) ire_no_good(ires[0]);

cleanup:
	/* cleanup ires[i] */
	ire_dep_unbuild(ires, i);
	for (j = 0; j < i; j++)
		ire_refrele(ires[j]);

	ASSERT((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) ||
	    (irr_flags & IRR_INCOMPLETE));
	/*
	 * Use IRE_GENERATION_VERIFY to ensure that ip_output will redo the
	 * ip_select_route since the reject or lack of memory might be gone.
	 */
	if (generationp != NULL)
		*generationp = IRE_GENERATION_VERIFY;
	return (ire);

	/*
	 * Success exit: ire is NULL here and ires[0] is what gets returned.
	 */
done:
	ASSERT(ire == NULL);
	if (need_refrele) {
		ill_refrele(ill);
		ill = NULL;
	}

	/* Build dependencies */
	if (i > 1 && !ire_dep_build(ires, generations, i)) {
		/* Something in chain was condemned; tear it apart */
		ire = ire_reject(ipst, B_FALSE);
		goto cleanup;
	}

	/*
	 * Release all refholds except the one for ires[0] that we
	 * will return to the caller.
	 */
	for (j = 1; j < i; j++)
		ire_refrele(ires[j]);

	if (invalidate) {
		/*
		 * Since we needed to allocate but couldn't we need to make
		 * sure that the dependency chain is rebuilt the next time.
		 */
		ire_dep_invalidate_generations(ires[0]);
		generation = IRE_GENERATION_VERIFY;
	} else {
		/*
		 * IREs can have been added or deleted while we did the
		 * recursive lookup and we can't catch those until we've built
		 * the dependencies. We verify the stored
		 * ire_dep_parent_generation to catch any such changes and
		 * return IRE_GENERATION_VERIFY (which will cause
		 * ip_select_route to be called again so we can redo the
		 * recursive lookup next time we send a packet).
		 */
		if (ires[0]->ire_dep_parent == NULL)
			generation = ires[0]->ire_generation;
		else
			generation = ire_dep_validate_generations(ires[0]);
		if (generations[0] != ires[0]->ire_generation) {
			/* Something changed at the top */
			generation = IRE_GENERATION_VERIFY;
		}
	}
	if (generationp != NULL)
		*generationp = generation;

	return (ires[0]);
}

/*
 * Same as ire_route_recursive_impl_v4() with no caller-supplied starting
 * ire: NULL is passed for the ire argument and all other arguments are
 * handed through unchanged.
 */
ire_t *
ire_route_recursive_v4(ipaddr_t nexthop, uint_t ire_type, const ill_t *ill,
    zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args,
    uint_t irr_flags, uint32_t xmit_hint, ip_stack_t *ipst, ipaddr_t *setsrcp,
    tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp)
{
	return (ire_route_recursive_impl_v4(NULL, nexthop, ire_type, ill,
	    zoneid, tsl, match_args, irr_flags, xmit_hint, ipst, setsrcp,
	    gwattrp, generationp));
}

/*
 * Recursively look for a route to the destination.
 * We only handle a destination match here, yet we have the same arguments
 * as the full match to allow function pointers to select between the two.
 *
 * Note that this function never returns NULL. It returns an IRE_NOROUTE
 * instead.
1713 * 1714 * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it 1715 * is an error. 1716 * Allow at most one RTF_INDIRECT. 1717 */ 1718 ire_t * 1719 ire_route_recursive_dstonly_v4(ipaddr_t nexthop, uint_t irr_flags, 1720 uint32_t xmit_hint, ip_stack_t *ipst) 1721 { 1722 ire_t *ire; 1723 ire_t *ire1; 1724 uint_t generation; 1725 1726 /* ire_ftable_lookup handles round-robin/ECMP */ 1727 ire = ire_ftable_lookup_simple_v4(nexthop, xmit_hint, ipst, 1728 &generation); 1729 ASSERT(ire != NULL); 1730 /* 1731 * If the IRE has a current cached parent we know that the whole 1732 * parent chain is current, hence we don't need to discover and 1733 * build any dependencies by doing a recursive lookup. 1734 */ 1735 mutex_enter(&ire->ire_lock); 1736 if (ire->ire_dep_parent != NULL) { 1737 if (ire->ire_dep_parent->ire_generation == 1738 ire->ire_dep_parent_generation) { 1739 mutex_exit(&ire->ire_lock); 1740 return (ire); 1741 } 1742 mutex_exit(&ire->ire_lock); 1743 } else { 1744 mutex_exit(&ire->ire_lock); 1745 /* 1746 * If this type should have an ire_nce_cache (even if it 1747 * doesn't yet have one) then we are done. Includes 1748 * IRE_INTERFACE with a full 32 bit mask. 1749 */ 1750 if (ire->ire_nce_capable) 1751 return (ire); 1752 } 1753 1754 /* 1755 * Fallback to loop in the normal code starting with the ire 1756 * we found. Normally this would return the same ire. 1757 */ 1758 ire1 = ire_route_recursive_impl_v4(ire, nexthop, 0, NULL, ALL_ZONES, 1759 NULL, MATCH_IRE_DSTONLY, irr_flags, xmit_hint, ipst, NULL, NULL, 1760 &generation); 1761 ire_refrele(ire); 1762 return (ire1); 1763 } 1764 1765 /* 1766 * Verify that the generation numbers in the chain leading to an IRE_IF_CLONE 1767 * are consistent. Return FALSE (and delete the IRE_IF_CLONE) if they 1768 * are not consistent, and TRUE otherwise. 
1769 */ 1770 boolean_t 1771 ire_clone_verify(ire_t *ire) 1772 { 1773 ASSERT((ire->ire_type & IRE_IF_CLONE) != 0); 1774 mutex_enter(&ire->ire_lock); 1775 if (ire->ire_dep_parent != NULL && 1776 ire->ire_dep_parent->ire_generation != 1777 ire->ire_dep_parent_generation) { 1778 mutex_exit(&ire->ire_lock); 1779 ire_delete(ire); 1780 return (B_FALSE); 1781 } 1782 mutex_exit(&ire->ire_lock); 1783 return (B_TRUE); 1784 } 1785