1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 /* 26 * Copyright (c) 1990 Mentat Inc. 27 */ 28 29 /* 30 * This file contains routines that manipulate Internet Routing Entries (IREs). 31 */ 32 #include <sys/types.h> 33 #include <sys/stream.h> 34 #include <sys/stropts.h> 35 #include <sys/ddi.h> 36 #include <sys/cmn_err.h> 37 38 #include <sys/systm.h> 39 #include <sys/param.h> 40 #include <sys/socket.h> 41 #include <net/if.h> 42 #include <net/route.h> 43 #include <netinet/in.h> 44 #include <net/if_dl.h> 45 #include <netinet/ip6.h> 46 #include <netinet/icmp6.h> 47 48 #include <inet/common.h> 49 #include <inet/mi.h> 50 #include <inet/ip.h> 51 #include <inet/ip6.h> 52 #include <inet/ip_ndp.h> 53 #include <inet/ip_if.h> 54 #include <inet/ip_ire.h> 55 #include <inet/ipclassifier.h> 56 #include <inet/nd.h> 57 #include <sys/kmem.h> 58 #include <sys/zone.h> 59 60 #include <sys/tsol/label.h> 61 #include <sys/tsol/tnet.h> 62 63 #define IS_DEFAULT_ROUTE_V6(ire) \ 64 (((ire)->ire_type & IRE_DEFAULT) || \ 65 (((ire)->ire_type & IRE_INTERFACE) && \ 66 (IN6_IS_ADDR_UNSPECIFIED(&(ire)->ire_addr_v6)))) 67 68 static ire_t ire_null; 69 70 static ire_t * 71 ire_ftable_lookup_impl_v6(const in6_addr_t *addr, const in6_addr_t *mask, 72 const in6_addr_t *gateway, int type, const ill_t *ill, 73 zoneid_t zoneid, const ts_label_t *tsl, int flags, 74 ip_stack_t *ipst); 75 76 /* 77 * Initialize the ire that is specific to IPv6 part and call 78 * ire_init_common to finish it. 79 * Returns zero or errno. 80 */ 81 int 82 ire_init_v6(ire_t *ire, const in6_addr_t *v6addr, const in6_addr_t *v6mask, 83 const in6_addr_t *v6gateway, ushort_t type, ill_t *ill, 84 zoneid_t zoneid, uint_t flags, tsol_gc_t *gc, ip_stack_t *ipst) 85 { 86 int error; 87 88 /* 89 * Reject IRE security attmakeribute creation/initialization 90 * if system is not running in Trusted mode. 91 */ 92 if (gc != NULL && !is_system_labeled()) 93 return (EINVAL); 94 95 BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_alloced); 96 if (v6addr != NULL) 97 ire->ire_addr_v6 = *v6addr; 98 if (v6gateway != NULL) 99 ire->ire_gateway_addr_v6 = *v6gateway; 100 101 /* Make sure we don't have stray values in some fields */ 102 switch (type) { 103 case IRE_LOOPBACK: 104 case IRE_HOST: 105 case IRE_LOCAL: 106 case IRE_IF_CLONE: 107 ire->ire_mask_v6 = ipv6_all_ones; 108 ire->ire_masklen = IPV6_ABITS; 109 break; 110 case IRE_PREFIX: 111 case IRE_DEFAULT: 112 case IRE_IF_RESOLVER: 113 case IRE_IF_NORESOLVER: 114 if (v6mask != NULL) { 115 ire->ire_mask_v6 = *v6mask; 116 ire->ire_masklen = 117 ip_mask_to_plen_v6(&ire->ire_mask_v6); 118 } 119 break; 120 case IRE_MULTICAST: 121 case IRE_NOROUTE: 122 ASSERT(v6mask == NULL); 123 break; 124 default: 125 ASSERT(0); 126 return (EINVAL); 127 } 128 129 error = ire_init_common(ire, type, ill, zoneid, flags, IPV6_VERSION, 130 gc, ipst); 131 if (error != NULL) 132 return (error); 133 134 /* Determine which function pointers to use */ 135 ire->ire_postfragfn = ip_xmit; /* Common case */ 136 137 switch (ire->ire_type) { 138 case IRE_LOCAL: 139 ire->ire_sendfn = ire_send_local_v6; 140 ire->ire_recvfn = ire_recv_local_v6; 141 ASSERT(ire->ire_ill != NULL); 142 if (ire->ire_ill->ill_flags & ILLF_NOACCEPT) 143 ire->ire_recvfn = ire_recv_noaccept_v6; 144 break; 145 case IRE_LOOPBACK: 146 ire->ire_sendfn = ire_send_local_v6; 147 ire->ire_recvfn = ire_recv_loopback_v6; 148 break; 149 case IRE_MULTICAST: 150 ire->ire_postfragfn = ip_postfrag_loopcheck; 151 ire->ire_sendfn = ire_send_multicast_v6; 152 ire->ire_recvfn = ire_recv_multicast_v6; 153 break; 154 default: 155 /* 156 * For IRE_IF_ALL and IRE_OFFLINK we forward received 157 * packets by default. 158 */ 159 ire->ire_sendfn = ire_send_wire_v6; 160 ire->ire_recvfn = ire_recv_forward_v6; 161 break; 162 } 163 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 164 ire->ire_sendfn = ire_send_noroute_v6; 165 ire->ire_recvfn = ire_recv_noroute_v6; 166 } else if (ire->ire_flags & RTF_MULTIRT) { 167 ire->ire_postfragfn = ip_postfrag_multirt_v6; 168 ire->ire_sendfn = ire_send_multirt_v6; 169 ire->ire_recvfn = ire_recv_multirt_v6; 170 } 171 ire->ire_nce_capable = ire_determine_nce_capable(ire); 172 return (0); 173 } 174 175 /* 176 * ire_create_v6 is called to allocate and initialize a new IRE. 177 * 178 * NOTE : This is called as writer sometimes though not required 179 * by this function. 180 */ 181 /* ARGSUSED */ 182 ire_t * 183 ire_create_v6(const in6_addr_t *v6addr, const in6_addr_t *v6mask, 184 const in6_addr_t *v6gateway, ushort_t type, ill_t *ill, zoneid_t zoneid, 185 uint_t flags, tsol_gc_t *gc, ip_stack_t *ipst) 186 { 187 ire_t *ire; 188 int error; 189 190 ASSERT(!IN6_IS_ADDR_V4MAPPED(v6addr)); 191 192 ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP); 193 if (ire == NULL) { 194 DTRACE_PROBE(kmem__cache__alloc); 195 return (NULL); 196 } 197 *ire = ire_null; 198 199 error = ire_init_v6(ire, v6addr, v6mask, v6gateway, 200 type, ill, zoneid, flags, gc, ipst); 201 202 if (error != 0) { 203 DTRACE_PROBE2(ire__init__v6, ire_t *, ire, int, error); 204 kmem_cache_free(ire_cache, ire); 205 return (NULL); 206 } 207 return (ire); 208 } 209 210 /* 211 * Find the ill matching a multicast group. 212 * Allows different routes for multicast addresses 213 * in the unicast routing table (akin to FF::0/8 but could be more specific) 214 * which point at different interfaces. This is used when IPV6_MULTICAST_IF 215 * isn't specified (when sending) and when IPV6_JOIN_GROUP doesn't 216 * specify the interface to join on. 217 * 218 * Supports link-local addresses by using ire_route_recursive which follows 219 * the ill when recursing. 220 * 221 * To handle CGTP, since we don't have a separate IRE_MULTICAST for each group 222 * and the MULTIRT property can be different for different groups, we 223 * extract RTF_MULTIRT from the special unicast route added for a group 224 * with CGTP and pass that back in the multirtp argument. 225 * This is used in ip_set_destination etc to set ixa_postfragfn for multicast. 226 * We have a setsrcp argument for the same reason. 227 */ 228 ill_t * 229 ire_lookup_multi_ill_v6(const in6_addr_t *group, zoneid_t zoneid, 230 ip_stack_t *ipst, boolean_t *multirtp, in6_addr_t *setsrcp) 231 { 232 ire_t *ire; 233 ill_t *ill; 234 235 ire = ire_route_recursive_v6(group, 0, NULL, zoneid, NULL, 236 MATCH_IRE_DSTONLY, IRR_NONE, 0, ipst, setsrcp, NULL, NULL); 237 ASSERT(ire != NULL); 238 239 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 240 ire_refrele(ire); 241 return (NULL); 242 } 243 244 if (multirtp != NULL) 245 *multirtp = (ire->ire_flags & RTF_MULTIRT) != 0; 246 247 ill = ire_nexthop_ill(ire); 248 ire_refrele(ire); 249 return (ill); 250 } 251 252 /* 253 * This function takes a mask and returns number of bits set in the 254 * mask (the represented prefix length). Assumes a contiguous mask. 255 */ 256 int 257 ip_mask_to_plen_v6(const in6_addr_t *v6mask) 258 { 259 int bits; 260 int plen = IPV6_ABITS; 261 int i; 262 263 for (i = 3; i >= 0; i--) { 264 if (v6mask->s6_addr32[i] == 0) { 265 plen -= 32; 266 continue; 267 } 268 bits = ffs(ntohl(v6mask->s6_addr32[i])) - 1; 269 if (bits == 0) 270 break; 271 plen -= bits; 272 } 273 274 return (plen); 275 } 276 277 /* 278 * Convert a prefix length to the mask for that prefix. 279 * Returns the argument bitmask. 280 */ 281 in6_addr_t * 282 ip_plen_to_mask_v6(uint_t plen, in6_addr_t *bitmask) 283 { 284 uint32_t *ptr; 285 286 if (plen < 0 || plen > IPV6_ABITS) 287 return (NULL); 288 *bitmask = ipv6_all_zeros; 289 if (plen == 0) 290 return (bitmask); 291 292 ptr = (uint32_t *)bitmask; 293 while (plen > 32) { 294 *ptr++ = 0xffffffffU; 295 plen -= 32; 296 } 297 *ptr = htonl(0xffffffffU << (32 - plen)); 298 return (bitmask); 299 } 300 301 /* 302 * Add a fully initialized IPv6 IRE to the forwarding table. 303 * This returns NULL on failure, or a held IRE on success. 304 * Normally the returned IRE is the same as the argument. But a different 305 * IRE will be returned if the added IRE is deemed identical to an existing 306 * one. In that case ire_identical_ref will be increased. 307 * The caller always needs to do an ire_refrele() on the returned IRE. 308 */ 309 ire_t * 310 ire_add_v6(ire_t *ire) 311 { 312 ire_t *ire1; 313 int mask_table_index; 314 irb_t *irb_ptr; 315 ire_t **irep; 316 int match_flags; 317 int error; 318 ip_stack_t *ipst = ire->ire_ipst; 319 320 ASSERT(ire->ire_ipversion == IPV6_VERSION); 321 322 /* Make sure the address is properly masked. */ 323 V6_MASK_COPY(ire->ire_addr_v6, ire->ire_mask_v6, ire->ire_addr_v6); 324 325 mask_table_index = ip_mask_to_plen_v6(&ire->ire_mask_v6); 326 if ((ipst->ips_ip_forwarding_table_v6[mask_table_index]) == NULL) { 327 irb_t *ptr; 328 int i; 329 330 ptr = (irb_t *)mi_zalloc((ipst->ips_ip6_ftable_hash_size * 331 sizeof (irb_t))); 332 if (ptr == NULL) { 333 ire_delete(ire); 334 return (NULL); 335 } 336 for (i = 0; i < ipst->ips_ip6_ftable_hash_size; i++) { 337 rw_init(&ptr[i].irb_lock, NULL, RW_DEFAULT, NULL); 338 ptr[i].irb_ipst = ipst; 339 } 340 mutex_enter(&ipst->ips_ire_ft_init_lock); 341 if (ipst->ips_ip_forwarding_table_v6[mask_table_index] == 342 NULL) { 343 ipst->ips_ip_forwarding_table_v6[mask_table_index] = 344 ptr; 345 mutex_exit(&ipst->ips_ire_ft_init_lock); 346 } else { 347 /* 348 * Some other thread won the race in 349 * initializing the forwarding table at the 350 * same index. 351 */ 352 mutex_exit(&ipst->ips_ire_ft_init_lock); 353 for (i = 0; i < ipst->ips_ip6_ftable_hash_size; i++) { 354 rw_destroy(&ptr[i].irb_lock); 355 } 356 mi_free(ptr); 357 } 358 } 359 irb_ptr = &(ipst->ips_ip_forwarding_table_v6[mask_table_index][ 360 IRE_ADDR_MASK_HASH_V6(ire->ire_addr_v6, ire->ire_mask_v6, 361 ipst->ips_ip6_ftable_hash_size)]); 362 363 match_flags = (MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_GW); 364 if (ire->ire_ill != NULL) 365 match_flags |= MATCH_IRE_ILL; 366 /* 367 * Start the atomic add of the ire. Grab the bucket lock and the 368 * ill lock. Check for condemned. 369 */ 370 error = ire_atomic_start(irb_ptr, ire); 371 if (error != 0) { 372 ire_delete(ire); 373 return (NULL); 374 } 375 376 /* 377 * If we are creating a hidden IRE, make sure we search for 378 * hidden IREs when searching for duplicates below. 379 * Otherwise, we might find an IRE on some other interface 380 * that's not marked hidden. 381 */ 382 if (ire->ire_testhidden) 383 match_flags |= MATCH_IRE_TESTHIDDEN; 384 385 /* 386 * Atomically check for duplicate and insert in the table. 387 */ 388 for (ire1 = irb_ptr->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) { 389 if (IRE_IS_CONDEMNED(ire1)) 390 continue; 391 /* 392 * Here we need an exact match on zoneid, i.e., 393 * ire_match_args doesn't fit. 394 */ 395 if (ire1->ire_zoneid != ire->ire_zoneid) 396 continue; 397 398 if (ire1->ire_type != ire->ire_type) 399 continue; 400 401 /* 402 * Note: We do not allow multiple routes that differ only 403 * in the gateway security attributes; such routes are 404 * considered duplicates. 405 * To change that we explicitly have to treat them as 406 * different here. 407 */ 408 if (ire_match_args_v6(ire1, &ire->ire_addr_v6, 409 &ire->ire_mask_v6, &ire->ire_gateway_addr_v6, 410 ire->ire_type, ire->ire_ill, ire->ire_zoneid, NULL, 411 match_flags)) { 412 /* 413 * Return the old ire after doing a REFHOLD. 414 * As most of the callers continue to use the IRE 415 * after adding, we return a held ire. This will 416 * avoid a lookup in the caller again. If the callers 417 * don't want to use it, they need to do a REFRELE. 418 */ 419 ip1dbg(("found dup ire existing %p new %p", 420 (void *)ire1, (void *)ire)); 421 ire_refhold(ire1); 422 atomic_add_32(&ire1->ire_identical_ref, 1); 423 ire_atomic_end(irb_ptr, ire); 424 ire_delete(ire); 425 return (ire1); 426 } 427 } 428 429 /* 430 * Normally we do head insertion since most things do not care about 431 * the order of the IREs in the bucket. 432 * However, due to shared-IP zones (and restrict_interzone_loopback) 433 * we can have an IRE_LOCAL as well as IRE_IF_CLONE for the same 434 * address. For that reason we do tail insertion for IRE_IF_CLONE. 435 */ 436 irep = (ire_t **)irb_ptr; 437 if (ire->ire_type & IRE_IF_CLONE) { 438 while ((ire1 = *irep) != NULL) 439 irep = &ire1->ire_next; 440 } 441 /* Insert at *irep */ 442 ire1 = *irep; 443 if (ire1 != NULL) 444 ire1->ire_ptpn = &ire->ire_next; 445 ire->ire_next = ire1; 446 /* Link the new one in. */ 447 ire->ire_ptpn = irep; 448 /* 449 * ire_walk routines de-reference ire_next without holding 450 * a lock. Before we point to the new ire, we want to make 451 * sure the store that sets the ire_next of the new ire 452 * reaches global visibility, so that ire_walk routines 453 * don't see a truncated list of ires i.e if the ire_next 454 * of the new ire gets set after we do "*irep = ire" due 455 * to re-ordering, the ire_walk thread will see a NULL 456 * once it accesses the ire_next of the new ire. 457 * membar_producer() makes sure that the following store 458 * happens *after* all of the above stores. 459 */ 460 membar_producer(); 461 *irep = ire; 462 ire->ire_bucket = irb_ptr; 463 /* 464 * We return a bumped up IRE above. Keep it symmetrical 465 * so that the callers will always have to release. This 466 * helps the callers of this function because they continue 467 * to use the IRE after adding and hence they don't have to 468 * lookup again after we return the IRE. 469 * 470 * NOTE : We don't have to use atomics as this is appearing 471 * in the list for the first time and no one else can bump 472 * up the reference count on this yet. 473 */ 474 ire_refhold_locked(ire); 475 BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_inserted); 476 irb_ptr->irb_ire_cnt++; 477 478 if (ire->ire_ill != NULL) { 479 DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ire->ire_ill, 480 (char *), "ire", (void *), ire); 481 ire->ire_ill->ill_ire_cnt++; 482 ASSERT(ire->ire_ill->ill_ire_cnt != 0); /* Wraparound */ 483 } 484 ire_atomic_end(irb_ptr, ire); 485 486 /* Make any caching of the IREs be notified or updated */ 487 ire_flush_cache_v6(ire, IRE_FLUSH_ADD); 488 489 return (ire); 490 } 491 492 /* 493 * Search for all HOST REDIRECT routes that are 494 * pointing at the specified gateway and 495 * delete them. This routine is called only 496 * when a default gateway is going away. 497 */ 498 static void 499 ire_delete_host_redirects_v6(const in6_addr_t *gateway, ip_stack_t *ipst) 500 { 501 irb_t *irb_ptr; 502 irb_t *irb; 503 ire_t *ire; 504 in6_addr_t gw_addr_v6; 505 int i; 506 507 /* get the hash table for HOST routes */ 508 irb_ptr = ipst->ips_ip_forwarding_table_v6[(IP6_MASK_TABLE_SIZE - 1)]; 509 if (irb_ptr == NULL) 510 return; 511 for (i = 0; (i < ipst->ips_ip6_ftable_hash_size); i++) { 512 irb = &irb_ptr[i]; 513 irb_refhold(irb); 514 for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) { 515 if (!(ire->ire_flags & RTF_DYNAMIC)) 516 continue; 517 mutex_enter(&ire->ire_lock); 518 gw_addr_v6 = ire->ire_gateway_addr_v6; 519 mutex_exit(&ire->ire_lock); 520 if (IN6_ARE_ADDR_EQUAL(&gw_addr_v6, gateway)) 521 ire_delete(ire); 522 } 523 irb_refrele(irb); 524 } 525 } 526 527 /* 528 * Delete the specified IRE. 529 * All calls should use ire_delete(). 530 * Sometimes called as writer though not required by this function. 531 * 532 * NOTE : This function is called only if the ire was added 533 * in the list. 534 */ 535 void 536 ire_delete_v6(ire_t *ire) 537 { 538 in6_addr_t gw_addr_v6; 539 ip_stack_t *ipst = ire->ire_ipst; 540 541 /* 542 * Make sure ire_generation increases from ire_flush_cache happen 543 * after any lookup/reader has read ire_generation. 544 * Since the rw_enter makes us wait until any lookup/reader has 545 * completed we can exit the lock immediately. 546 */ 547 rw_enter(&ipst->ips_ip6_ire_head_lock, RW_WRITER); 548 rw_exit(&ipst->ips_ip6_ire_head_lock); 549 550 ASSERT(ire->ire_refcnt >= 1); 551 ASSERT(ire->ire_ipversion == IPV6_VERSION); 552 553 ire_flush_cache_v6(ire, IRE_FLUSH_DELETE); 554 555 if (ire->ire_type == IRE_DEFAULT) { 556 /* 557 * when a default gateway is going away 558 * delete all the host redirects pointing at that 559 * gateway. 560 */ 561 mutex_enter(&ire->ire_lock); 562 gw_addr_v6 = ire->ire_gateway_addr_v6; 563 mutex_exit(&ire->ire_lock); 564 ire_delete_host_redirects_v6(&gw_addr_v6, ipst); 565 } 566 567 /* 568 * If we are deleting an IRE_INTERFACE then we make sure we also 569 * delete any IRE_IF_CLONE that has been created from it. 570 * Those are always in ire_dep_children. 571 */ 572 if ((ire->ire_type & IRE_INTERFACE) && ire->ire_dep_children != 0) 573 ire_dep_delete_if_clone(ire); 574 575 /* Remove from parent dependencies and child */ 576 rw_enter(&ipst->ips_ire_dep_lock, RW_WRITER); 577 if (ire->ire_dep_parent != NULL) { 578 ire_dep_remove(ire); 579 } 580 while (ire->ire_dep_children != NULL) 581 ire_dep_remove(ire->ire_dep_children); 582 rw_exit(&ipst->ips_ire_dep_lock); 583 } 584 585 /* 586 * When an IRE is added or deleted this routine is called to make sure 587 * any caching of IRE information is notified or updated. 588 * 589 * The flag argument indicates if the flush request is due to addition 590 * of new route (IRE_FLUSH_ADD), deletion of old route (IRE_FLUSH_DELETE), 591 * or a change to ire_gateway_addr (IRE_FLUSH_GWCHANGE). 592 */ 593 void 594 ire_flush_cache_v6(ire_t *ire, int flag) 595 { 596 ip_stack_t *ipst = ire->ire_ipst; 597 598 /* 599 * IRE_IF_CLONE ire's don't provide any new information 600 * than the parent from which they are cloned, so don't 601 * perturb the generation numbers. 602 */ 603 if (ire->ire_type & IRE_IF_CLONE) 604 return; 605 606 /* 607 * Ensure that an ire_add during a lookup serializes the updates of 608 * the generation numbers under ire_head_lock so that the lookup gets 609 * either the old ire and old generation number, or a new ire and new 610 * generation number. 611 */ 612 rw_enter(&ipst->ips_ip6_ire_head_lock, RW_WRITER); 613 614 /* 615 * If a route was just added, we need to notify everybody that 616 * has cached an IRE_NOROUTE since there might now be a better 617 * route for them. 618 */ 619 if (flag == IRE_FLUSH_ADD) { 620 ire_increment_generation(ipst->ips_ire_reject_v6); 621 ire_increment_generation(ipst->ips_ire_blackhole_v6); 622 } 623 624 /* Adding a default can't otherwise provide a better route */ 625 if (ire->ire_type == IRE_DEFAULT && flag == IRE_FLUSH_ADD) { 626 rw_exit(&ipst->ips_ip6_ire_head_lock); 627 return; 628 } 629 630 switch (flag) { 631 case IRE_FLUSH_DELETE: 632 case IRE_FLUSH_GWCHANGE: 633 /* 634 * Update ire_generation for all ire_dep_children chains 635 * starting with this IRE 636 */ 637 ire_dep_incr_generation(ire); 638 break; 639 case IRE_FLUSH_ADD: { 640 in6_addr_t addr; 641 in6_addr_t mask; 642 ip_stack_t *ipst = ire->ire_ipst; 643 uint_t masklen; 644 645 /* 646 * Find an IRE which is a shorter match than the ire to be added 647 * For any such IRE (which we repeat) we update the 648 * ire_generation the same way as in the delete case. 649 */ 650 addr = ire->ire_addr_v6; 651 mask = ire->ire_mask_v6; 652 masklen = ip_mask_to_plen_v6(&mask); 653 654 ire = ire_ftable_lookup_impl_v6(&addr, &mask, NULL, 0, NULL, 655 ALL_ZONES, NULL, MATCH_IRE_SHORTERMASK, ipst); 656 while (ire != NULL) { 657 /* We need to handle all in the same bucket */ 658 irb_increment_generation(ire->ire_bucket); 659 660 mask = ire->ire_mask_v6; 661 ASSERT(masklen > ip_mask_to_plen_v6(&mask)); 662 masklen = ip_mask_to_plen_v6(&mask); 663 ire_refrele(ire); 664 ire = ire_ftable_lookup_impl_v6(&addr, &mask, NULL, 0, 665 NULL, ALL_ZONES, NULL, MATCH_IRE_SHORTERMASK, ipst); 666 } 667 } 668 break; 669 } 670 rw_exit(&ipst->ips_ip6_ire_head_lock); 671 } 672 673 /* 674 * Matches the arguments passed with the values in the ire. 675 * 676 * Note: for match types that match using "ill" passed in, ill 677 * must be checked for non-NULL before calling this routine. 678 */ 679 boolean_t 680 ire_match_args_v6(ire_t *ire, const in6_addr_t *addr, const in6_addr_t *mask, 681 const in6_addr_t *gateway, int type, const ill_t *ill, zoneid_t zoneid, 682 const ts_label_t *tsl, int match_flags) 683 { 684 in6_addr_t masked_addr; 685 in6_addr_t gw_addr_v6; 686 ill_t *ire_ill = NULL, *dst_ill; 687 ip_stack_t *ipst = ire->ire_ipst; 688 689 ASSERT(ire->ire_ipversion == IPV6_VERSION); 690 ASSERT(addr != NULL); 691 ASSERT(mask != NULL); 692 ASSERT((!(match_flags & MATCH_IRE_GW)) || gateway != NULL); 693 ASSERT((!(match_flags & MATCH_IRE_ILL)) || 694 (ill != NULL && ill->ill_isv6)); 695 696 /* 697 * If MATCH_IRE_TESTHIDDEN is set, then only return the IRE if it 698 * is in fact hidden, to ensure the caller gets the right one. 699 */ 700 if (ire->ire_testhidden) { 701 if (!(match_flags & MATCH_IRE_TESTHIDDEN)) 702 return (B_FALSE); 703 } 704 705 if (zoneid != ALL_ZONES && zoneid != ire->ire_zoneid && 706 ire->ire_zoneid != ALL_ZONES) { 707 /* 708 * If MATCH_IRE_ZONEONLY has been set and the supplied zoneid 709 * does not match that of ire_zoneid, a failure to 710 * match is reported at this point. Otherwise, since some IREs 711 * that are available in the global zone can be used in local 712 * zones, additional checks need to be performed: 713 * 714 * IRE_LOOPBACK 715 * entries should never be matched in this situation. 716 * Each zone has its own IRE_LOOPBACK. 717 * 718 * IRE_LOCAL 719 * We allow them for any zoneid. ire_route_recursive 720 * does additional checks when 721 * ip_restrict_interzone_loopback is set. 722 * 723 * If ill_usesrc_ifindex is set 724 * Then we check if the zone has a valid source address 725 * on the usesrc ill. 726 * 727 * If ire_ill is set, then check that the zone has an ipif 728 * on that ill. 729 * 730 * Outside of this function (in ire_round_robin) we check 731 * that any IRE_OFFLINK has a gateway that reachable from the 732 * zone when we have multiple choices (ECMP). 733 */ 734 if (match_flags & MATCH_IRE_ZONEONLY) 735 return (B_FALSE); 736 if (ire->ire_type & IRE_LOOPBACK) 737 return (B_FALSE); 738 739 if (ire->ire_type & IRE_LOCAL) 740 goto matchit; 741 742 /* 743 * The normal case of IRE_ONLINK has a matching zoneid. 744 * Here we handle the case when shared-IP zones have been 745 * configured with IP addresses on vniN. In that case it 746 * is ok for traffic from a zone to use IRE_ONLINK routes 747 * if the ill has a usesrc pointing at vniN 748 * Applies to IRE_INTERFACE. 749 */ 750 dst_ill = ire->ire_ill; 751 if (ire->ire_type & IRE_ONLINK) { 752 uint_t ifindex; 753 754 /* 755 * Note there is no IRE_INTERFACE on vniN thus 756 * can't do an IRE lookup for a matching route. 757 */ 758 ifindex = dst_ill->ill_usesrc_ifindex; 759 if (ifindex == 0) 760 return (B_FALSE); 761 762 /* 763 * If there is a usable source address in the 764 * zone, then it's ok to return this IRE_INTERFACE 765 */ 766 if (!ipif_zone_avail(ifindex, dst_ill->ill_isv6, 767 zoneid, ipst)) { 768 ip3dbg(("ire_match_args: no usrsrc for zone" 769 " dst_ill %p\n", (void *)dst_ill)); 770 return (B_FALSE); 771 } 772 } 773 /* 774 * For exampe, with 775 * route add 11.0.0.0 gw1 -ifp bge0 776 * route add 11.0.0.0 gw2 -ifp bge1 777 * this code would differentiate based on 778 * where the sending zone has addresses. 779 * Only if the zone has an address on bge0 can it use the first 780 * route. It isn't clear if this behavior is documented 781 * anywhere. 782 */ 783 if (dst_ill != NULL && (ire->ire_type & IRE_OFFLINK)) { 784 ipif_t *tipif; 785 786 mutex_enter(&dst_ill->ill_lock); 787 for (tipif = dst_ill->ill_ipif; 788 tipif != NULL; tipif = tipif->ipif_next) { 789 if (!IPIF_IS_CONDEMNED(tipif) && 790 (tipif->ipif_flags & IPIF_UP) && 791 (tipif->ipif_zoneid == zoneid || 792 tipif->ipif_zoneid == ALL_ZONES)) 793 break; 794 } 795 mutex_exit(&dst_ill->ill_lock); 796 if (tipif == NULL) 797 return (B_FALSE); 798 } 799 } 800 801 matchit: 802 if (match_flags & MATCH_IRE_GW) { 803 mutex_enter(&ire->ire_lock); 804 gw_addr_v6 = ire->ire_gateway_addr_v6; 805 mutex_exit(&ire->ire_lock); 806 } 807 if (match_flags & MATCH_IRE_ILL) { 808 ire_ill = ire->ire_ill; 809 810 /* 811 * If asked to match an ill, we *must* match 812 * on the ire_ill for ipmp test addresses, or 813 * any of the ill in the group for data addresses. 814 * If we don't, we may as well fail. 815 * However, we need an exception for IRE_LOCALs to ensure 816 * we loopback packets even sent to test addresses on different 817 * interfaces in the group. 818 */ 819 if ((match_flags & MATCH_IRE_TESTHIDDEN) && 820 !(ire->ire_type & IRE_LOCAL)) { 821 if (ire->ire_ill != ill) 822 return (B_FALSE); 823 } else { 824 match_flags &= ~MATCH_IRE_TESTHIDDEN; 825 /* 826 * We know that ill is not NULL, but ire_ill could be 827 * NULL 828 */ 829 if (ire_ill == NULL || !IS_ON_SAME_LAN(ill, ire_ill)) 830 return (B_FALSE); 831 } 832 } 833 /* No ire_addr_v6 bits set past the mask */ 834 ASSERT(V6_MASK_EQ(ire->ire_addr_v6, ire->ire_mask_v6, 835 ire->ire_addr_v6)); 836 V6_MASK_COPY(*addr, *mask, masked_addr); 837 if (V6_MASK_EQ(*addr, *mask, ire->ire_addr_v6) && 838 ((!(match_flags & MATCH_IRE_GW)) || 839 IN6_ARE_ADDR_EQUAL(&gw_addr_v6, gateway)) && 840 ((!(match_flags & MATCH_IRE_TYPE)) || (ire->ire_type & type)) && 841 ((!(match_flags & MATCH_IRE_TESTHIDDEN)) || ire->ire_testhidden) && 842 ((!(match_flags & MATCH_IRE_MASK)) || 843 (IN6_ARE_ADDR_EQUAL(&ire->ire_mask_v6, mask))) && 844 ((!(match_flags & MATCH_IRE_SECATTR)) || 845 (!is_system_labeled()) || 846 (tsol_ire_match_gwattr(ire, tsl) == 0))) { 847 /* We found the matched IRE */ 848 return (B_TRUE); 849 } 850 return (B_FALSE); 851 } 852 853 /* 854 * Check if the zoneid (not ALL_ZONES) has an IRE_INTERFACE for the specified 855 * gateway address. If ill is non-NULL we also match on it. 856 * The caller must hold a read lock on RADIX_NODE_HEAD if lock_held is set. 857 */ 858 boolean_t 859 ire_gateway_ok_zone_v6(const in6_addr_t *gateway, zoneid_t zoneid, ill_t *ill, 860 const ts_label_t *tsl, ip_stack_t *ipst, boolean_t lock_held) 861 { 862 ire_t *ire; 863 uint_t match_flags; 864 865 if (lock_held) 866 ASSERT(RW_READ_HELD(&ipst->ips_ip6_ire_head_lock)); 867 else 868 rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER); 869 870 match_flags = MATCH_IRE_TYPE | MATCH_IRE_SECATTR; 871 if (ill != NULL) 872 match_flags |= MATCH_IRE_ILL; 873 874 ire = ire_ftable_lookup_impl_v6(gateway, &ipv6_all_zeros, 875 &ipv6_all_zeros, IRE_INTERFACE, ill, zoneid, tsl, match_flags, 876 ipst); 877 878 if (!lock_held) 879 rw_exit(&ipst->ips_ip6_ire_head_lock); 880 if (ire != NULL) { 881 ire_refrele(ire); 882 return (B_TRUE); 883 } else { 884 return (B_FALSE); 885 } 886 } 887 888 /* 889 * Lookup a route in forwarding table. 890 * specific lookup is indicated by passing the 891 * required parameters and indicating the 892 * match required in flag field. 893 * 894 * Supports link-local addresses by following the ipif/ill when recursing. 895 */ 896 ire_t * 897 ire_ftable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask, 898 const in6_addr_t *gateway, int type, const ill_t *ill, 899 zoneid_t zoneid, const ts_label_t *tsl, int flags, 900 uint32_t xmit_hint, ip_stack_t *ipst, uint_t *generationp) 901 { 902 ire_t *ire = NULL; 903 904 ASSERT(addr != NULL); 905 ASSERT((!(flags & MATCH_IRE_MASK)) || mask != NULL); 906 ASSERT((!(flags & MATCH_IRE_GW)) || gateway != NULL); 907 ASSERT(ill == NULL || ill->ill_isv6); 908 909 ASSERT(!IN6_IS_ADDR_V4MAPPED(addr)); 910 911 /* 912 * ire_match_args_v6() will dereference ill if MATCH_IRE_ILL 913 * is set. 914 */ 915 if ((flags & (MATCH_IRE_ILL)) && (ill == NULL)) 916 return (NULL); 917 918 rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER); 919 ire = ire_ftable_lookup_impl_v6(addr, mask, gateway, type, ill, zoneid, 920 tsl, flags, ipst); 921 if (ire == NULL) { 922 rw_exit(&ipst->ips_ip6_ire_head_lock); 923 return (NULL); 924 } 925 926 /* 927 * round-robin only if we have more than one route in the bucket. 928 * ips_ip_ecmp_behavior controls when we do ECMP 929 * 2: always 930 * 1: for IRE_DEFAULT and /0 IRE_INTERFACE 931 * 0: never 932 * 933 * Note: if we found an IRE_IF_CLONE we won't look at the bucket with 934 * other ECMP IRE_INTERFACEs since the IRE_IF_CLONE is a /128 match 935 * and the IRE_INTERFACESs are likely to be shorter matches. 936 */ 937 if (ire->ire_bucket->irb_ire_cnt > 1 && !(flags & MATCH_IRE_GW)) { 938 if (ipst->ips_ip_ecmp_behavior == 2 || 939 (ipst->ips_ip_ecmp_behavior == 1 && 940 IS_DEFAULT_ROUTE_V6(ire))) { 941 ire_t *next_ire; 942 ire_ftable_args_t margs; 943 944 bzero(&margs, sizeof (margs)); 945 margs.ift_addr_v6 = *addr; 946 if (mask != NULL) 947 margs.ift_mask_v6 = *mask; 948 if (gateway != NULL) 949 margs.ift_gateway_v6 = *gateway; 950 margs.ift_type = type; 951 margs.ift_ill = ill; 952 margs.ift_zoneid = zoneid; 953 margs.ift_tsl = tsl; 954 margs.ift_flags = flags; 955 956 next_ire = ire_round_robin(ire->ire_bucket, &margs, 957 xmit_hint, ire, ipst); 958 if (next_ire == NULL) { 959 /* keep ire if next_ire is null */ 960 goto done; 961 } 962 ire_refrele(ire); 963 ire = next_ire; 964 } 965 } 966 967 done: 968 /* Return generation before dropping lock */ 969 if (generationp != NULL) 970 *generationp = ire->ire_generation; 971 972 rw_exit(&ipst->ips_ip6_ire_head_lock); 973 974 /* 975 * For shared-IP zones we need additional checks to what was 976 * done in ire_match_args to make sure IRE_LOCALs are handled. 977 * 978 * When ip_restrict_interzone_loopback is set, then 979 * we ensure that IRE_LOCAL are only used for loopback 980 * between zones when the logical "Ethernet" would 981 * have looped them back. That is, if in the absense of 982 * the IRE_LOCAL we would have sent to packet out the 983 * same ill. 984 */ 985 if ((ire->ire_type & IRE_LOCAL) && zoneid != ALL_ZONES && 986 ire->ire_zoneid != zoneid && ire->ire_zoneid != ALL_ZONES && 987 ipst->ips_ip_restrict_interzone_loopback) { 988 ire = ire_alt_local(ire, zoneid, tsl, ill, generationp); 989 ASSERT(ire != NULL); 990 } 991 992 return (ire); 993 } 994 995 /* 996 * Look up a single ire. The caller holds either the read or write lock. 997 */ 998 ire_t * 999 ire_ftable_lookup_impl_v6(const in6_addr_t *addr, const in6_addr_t *mask, 1000 const in6_addr_t *gateway, int type, const ill_t *ill, 1001 zoneid_t zoneid, const ts_label_t *tsl, int flags, 1002 ip_stack_t *ipst) 1003 { 1004 irb_t *irb_ptr; 1005 ire_t *ire = NULL; 1006 int i; 1007 1008 ASSERT(RW_LOCK_HELD(&ipst->ips_ip6_ire_head_lock)); 1009 1010 /* 1011 * If the mask is known, the lookup 1012 * is simple, if the mask is not known 1013 * we need to search. 1014 */ 1015 if (flags & MATCH_IRE_MASK) { 1016 uint_t masklen; 1017 1018 masklen = ip_mask_to_plen_v6(mask); 1019 if (ipst->ips_ip_forwarding_table_v6[masklen] == NULL) { 1020 return (NULL); 1021 } 1022 irb_ptr = &(ipst->ips_ip_forwarding_table_v6[masklen][ 1023 IRE_ADDR_MASK_HASH_V6(*addr, *mask, 1024 ipst->ips_ip6_ftable_hash_size)]); 1025 rw_enter(&irb_ptr->irb_lock, RW_READER); 1026 for (ire = irb_ptr->irb_ire; ire != NULL; 1027 ire = ire->ire_next) { 1028 if (IRE_IS_CONDEMNED(ire)) 1029 continue; 1030 if (ire_match_args_v6(ire, addr, mask, gateway, type, 1031 ill, zoneid, tsl, flags)) 1032 goto found_ire; 1033 } 1034 rw_exit(&irb_ptr->irb_lock); 1035 } else { 1036 uint_t masklen; 1037 1038 /* 1039 * In this case we don't know the mask, we need to 1040 * search the table assuming different mask sizes. 1041 */ 1042 if (flags & MATCH_IRE_SHORTERMASK) { 1043 masklen = ip_mask_to_plen_v6(mask); 1044 if (masklen == 0) { 1045 /* Nothing shorter than zero */ 1046 return (NULL); 1047 } 1048 masklen--; 1049 } else { 1050 masklen = IP6_MASK_TABLE_SIZE - 1; 1051 } 1052 1053 for (i = masklen; i >= 0; i--) { 1054 in6_addr_t tmpmask; 1055 1056 if ((ipst->ips_ip_forwarding_table_v6[i]) == NULL) 1057 continue; 1058 (void) ip_plen_to_mask_v6(i, &tmpmask); 1059 irb_ptr = &ipst->ips_ip_forwarding_table_v6[i][ 1060 IRE_ADDR_MASK_HASH_V6(*addr, tmpmask, 1061 ipst->ips_ip6_ftable_hash_size)]; 1062 rw_enter(&irb_ptr->irb_lock, RW_READER); 1063 for (ire = irb_ptr->irb_ire; ire != NULL; 1064 ire = ire->ire_next) { 1065 if (IRE_IS_CONDEMNED(ire)) 1066 continue; 1067 if (ire_match_args_v6(ire, addr, 1068 &ire->ire_mask_v6, gateway, type, ill, 1069 zoneid, tsl, flags)) 1070 goto found_ire; 1071 } 1072 rw_exit(&irb_ptr->irb_lock); 1073 } 1074 } 1075 ASSERT(ire == NULL); 1076 ip1dbg(("ire_ftable_lookup_v6: returning NULL ire")); 1077 return (NULL); 1078 1079 found_ire: 1080 ire_refhold(ire); 1081 rw_exit(&irb_ptr->irb_lock); 1082 return (ire); 1083 } 1084 1085 1086 /* 1087 * This function is called by 1088 * ip_input/ire_route_recursive when doing a route lookup on only the 1089 * destination address. 1090 * 1091 * The optimizations of this function over ire_ftable_lookup are: 1092 * o removing unnecessary flag matching 1093 * o doing longest prefix match instead of overloading it further 1094 * with the unnecessary "best_prefix_match" 1095 * 1096 * If no route is found we return IRE_NOROUTE. 1097 */ 1098 ire_t * 1099 ire_ftable_lookup_simple_v6(const in6_addr_t *addr, uint32_t xmit_hint, 1100 ip_stack_t *ipst, uint_t *generationp) 1101 { 1102 ire_t *ire; 1103 1104 ire = ire_ftable_lookup_v6(addr, NULL, NULL, 0, NULL, ALL_ZONES, NULL, 1105 MATCH_IRE_DSTONLY, xmit_hint, ipst, generationp); 1106 if (ire == NULL) { 1107 ire = ire_reject(ipst, B_TRUE); 1108 if (generationp != NULL) 1109 *generationp = IRE_GENERATION_VERIFY; 1110 } 1111 /* ftable_lookup did round robin */ 1112 return (ire); 1113 } 1114 1115 ire_t * 1116 ip_select_route_v6(const in6_addr_t *dst, ip_xmit_attr_t *ixa, 1117 uint_t *generationp, in6_addr_t *setsrcp, int *errorp, boolean_t *multirtp) 1118 { 1119 ASSERT(!(ixa->ixa_flags & IXAF_IS_IPV4)); 1120 1121 return (ip_select_route(dst, ixa, generationp, setsrcp, errorp, 1122 multirtp)); 1123 } 1124 1125 /* 1126 * Recursively look for a route to the destination. Can also match on 1127 * the zoneid, ill, and label. Used for the data paths. See also 1128 * ire_route_recursive_dstonly. 1129 * 1130 * If ill is set this means we will match it by adding MATCH_IRE_ILL. 1131 * 1132 * If IRR_ALLOCATE is not set then we will only inspect the existing IREs; never 1133 * create an IRE_IF_CLONE. This is used on the receive side when we are not 1134 * forwarding. 1135 * If IRR_INCOMPLETE is set then we return the IRE even if we can't correctly 1136 * resolve the gateway. 1137 * 1138 * Note that this function never returns NULL. It returns an IRE_NOROUTE 1139 * instead. 1140 * 1141 * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it 1142 * is an error. 1143 * Allow at most one RTF_INDIRECT. 1144 */ 1145 ire_t * 1146 ire_route_recursive_impl_v6(ire_t *ire, 1147 const in6_addr_t *nexthop, uint_t ire_type, const ill_t *ill_arg, 1148 zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args, 1149 uint_t irr_flags, uint32_t xmit_hint, ip_stack_t *ipst, 1150 in6_addr_t *setsrcp, tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp) 1151 { 1152 int i, j; 1153 in6_addr_t v6nexthop = *nexthop; 1154 ire_t *ires[MAX_IRE_RECURSION]; 1155 uint_t generation; 1156 uint_t generations[MAX_IRE_RECURSION]; 1157 boolean_t need_refrele = B_FALSE; 1158 boolean_t invalidate = B_FALSE; 1159 int prefs[MAX_IRE_RECURSION]; 1160 ill_t *ill = NULL; 1161 1162 if (setsrcp != NULL) 1163 ASSERT(IN6_IS_ADDR_UNSPECIFIED(setsrcp)); 1164 if (gwattrp != NULL) 1165 ASSERT(*gwattrp == NULL); 1166 1167 if (ill_arg != NULL) 1168 match_args |= MATCH_IRE_ILL; 1169 1170 /* 1171 * We iterate up to three times to resolve a route, even though 1172 * we have four slots in the array. The extra slot is for an 1173 * IRE_IF_CLONE we might need to create. 1174 */ 1175 i = 0; 1176 while (i < MAX_IRE_RECURSION - 1) { 1177 /* ire_ftable_lookup handles round-robin/ECMP */ 1178 if (ire == NULL) { 1179 ire = ire_ftable_lookup_v6(&v6nexthop, 0, 0, ire_type, 1180 (ill_arg != NULL ? ill_arg : ill), zoneid, tsl, 1181 match_args, xmit_hint, ipst, &generation); 1182 } else { 1183 /* Caller passed it; extra hold since we will rele */ 1184 ire_refhold(ire); 1185 if (generationp != NULL) 1186 generation = *generationp; 1187 else 1188 generation = IRE_GENERATION_VERIFY; 1189 } 1190 1191 if (ire == NULL) 1192 ire = ire_reject(ipst, B_TRUE); 1193 1194 /* Need to return the ire with RTF_REJECT|BLACKHOLE */ 1195 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) 1196 goto error; 1197 1198 ASSERT(!(ire->ire_type & IRE_MULTICAST)); /* Not in ftable */ 1199 1200 if (i != 0) { 1201 prefs[i] = ire_pref(ire); 1202 /* 1203 * Don't allow anything unusual past the first 1204 * iteration. 1205 */ 1206 if ((ire->ire_type & 1207 (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST)) || 1208 prefs[i] <= prefs[i-1]) { 1209 ire_refrele(ire); 1210 if (irr_flags & IRR_INCOMPLETE) { 1211 ire = ires[0]; 1212 ire_refhold(ire); 1213 } else { 1214 ire = ire_reject(ipst, B_TRUE); 1215 } 1216 goto error; 1217 } 1218 } 1219 /* We have a usable IRE */ 1220 ires[i] = ire; 1221 generations[i] = generation; 1222 i++; 1223 1224 /* The first RTF_SETSRC address is passed back if setsrcp */ 1225 if ((ire->ire_flags & RTF_SETSRC) && 1226 setsrcp != NULL && IN6_IS_ADDR_UNSPECIFIED(setsrcp)) { 1227 ASSERT(!IN6_IS_ADDR_UNSPECIFIED( 1228 &ire->ire_setsrc_addr_v6)); 1229 *setsrcp = ire->ire_setsrc_addr_v6; 1230 } 1231 1232 /* The first ire_gw_secattr is passed back if gwattrp */ 1233 if (ire->ire_gw_secattr != NULL && 1234 gwattrp != NULL && *gwattrp == NULL) 1235 *gwattrp = ire->ire_gw_secattr; 1236 1237 /* 1238 * Check if we have a short-cut pointer to an IRE for this 1239 * destination, and that the cached dependency isn't stale. 1240 * In that case we've rejoined an existing tree towards a 1241 * parent, thus we don't need to continue the loop to 1242 * discover the rest of the tree. 1243 */ 1244 mutex_enter(&ire->ire_lock); 1245 if (ire->ire_dep_parent != NULL && 1246 ire->ire_dep_parent->ire_generation == 1247 ire->ire_dep_parent_generation) { 1248 mutex_exit(&ire->ire_lock); 1249 ire = NULL; 1250 goto done; 1251 } 1252 mutex_exit(&ire->ire_lock); 1253 1254 /* 1255 * If this type should have an ire_nce_cache (even if it 1256 * doesn't yet have one) then we are done. Includes 1257 * IRE_INTERFACE with a full 128 bit mask. 1258 */ 1259 if (ire->ire_nce_capable) { 1260 ire = NULL; 1261 goto done; 1262 } 1263 ASSERT(!(ire->ire_type & IRE_IF_CLONE)); 1264 /* 1265 * For an IRE_INTERFACE we create an IRE_IF_CLONE for this 1266 * particular destination 1267 */ 1268 if (ire->ire_type & IRE_INTERFACE) { 1269 ire_t *clone; 1270 1271 ASSERT(ire->ire_masklen != IPV6_ABITS); 1272 1273 /* 1274 * In the case of ip_input and ILLF_FORWARDING not 1275 * being set, and in the case of RTM_GET, there is 1276 * no point in allocating an IRE_IF_CLONE. We return 1277 * the IRE_INTERFACE. Note that !IRR_ALLOCATE can 1278 * result in a ire_dep_parent which is IRE_IF_* 1279 * without an IRE_IF_CLONE. 1280 * We recover from that when we need to send packets 1281 * by ensuring that the generations become 1282 * IRE_GENERATION_VERIFY in this case. 1283 */ 1284 if (!(irr_flags & IRR_ALLOCATE)) { 1285 invalidate = B_TRUE; 1286 ire = NULL; 1287 goto done; 1288 } 1289 1290 clone = ire_create_if_clone(ire, &v6nexthop, 1291 &generation); 1292 if (clone == NULL) { 1293 /* 1294 * Temporary failure - no memory. 1295 * Don't want caller to cache IRE_NOROUTE. 1296 */ 1297 invalidate = B_TRUE; 1298 ire = ire_blackhole(ipst, B_TRUE); 1299 goto error; 1300 } 1301 /* 1302 * Make clone next to last entry and the 1303 * IRE_INTERFACE the last in the dependency 1304 * chain since the clone depends on the 1305 * IRE_INTERFACE. 1306 */ 1307 ASSERT(i >= 1); 1308 ASSERT(i < MAX_IRE_RECURSION); 1309 1310 ires[i] = ires[i-1]; 1311 generations[i] = generations[i-1]; 1312 ires[i-1] = clone; 1313 generations[i-1] = generation; 1314 i++; 1315 1316 ire = NULL; 1317 goto done; 1318 } 1319 1320 /* 1321 * We only match on the type and optionally ILL when 1322 * recursing. The type match is used by some callers 1323 * to exclude certain types (such as IRE_IF_CLONE or 1324 * IRE_LOCAL|IRE_LOOPBACK). 1325 */ 1326 match_args &= MATCH_IRE_TYPE; 1327 v6nexthop = ire->ire_gateway_addr_v6; 1328 if (ill == NULL && ire->ire_ill != NULL) { 1329 ill = ire->ire_ill; 1330 need_refrele = B_TRUE; 1331 ill_refhold(ill); 1332 match_args |= MATCH_IRE_ILL; 1333 } 1334 /* 1335 * We set the prefs[i] value above if i > 0. We've already 1336 * done i++ so i is one in the case of the first time around. 1337 */ 1338 if (i == 1) 1339 prefs[0] = ire_pref(ire); 1340 ire = NULL; 1341 } 1342 ASSERT(ire == NULL); 1343 ire = ire_reject(ipst, B_TRUE); 1344 1345 error: 1346 ASSERT(ire != NULL); 1347 if (need_refrele) 1348 ill_refrele(ill); 1349 1350 /* 1351 * In the case of MULTIRT we want to try a different IRE the next 1352 * time. We let the next packet retry in that case. 1353 */ 1354 if (i > 0 && (ires[0]->ire_flags & RTF_MULTIRT)) 1355 (void) ire_no_good(ires[0]); 1356 1357 cleanup: 1358 /* cleanup ires[i] */ 1359 ire_dep_unbuild(ires, i); 1360 for (j = 0; j < i; j++) 1361 ire_refrele(ires[j]); 1362 1363 ASSERT((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) || 1364 (irr_flags & IRR_INCOMPLETE)); 1365 /* 1366 * Use IRE_GENERATION_VERIFY to ensure that ip_output will redo the 1367 * ip_select_route since the reject or lack of memory might be gone. 1368 */ 1369 if (generationp != NULL) 1370 *generationp = IRE_GENERATION_VERIFY; 1371 return (ire); 1372 1373 done: 1374 ASSERT(ire == NULL); 1375 if (need_refrele) 1376 ill_refrele(ill); 1377 1378 /* Build dependencies */ 1379 if (i > 1 && !ire_dep_build(ires, generations, i)) { 1380 /* Something in chain was condemned; tear it apart */ 1381 ire = ire_blackhole(ipst, B_TRUE); 1382 goto cleanup; 1383 } 1384 1385 /* 1386 * Release all refholds except the one for ires[0] that we 1387 * will return to the caller. 1388 */ 1389 for (j = 1; j < i; j++) 1390 ire_refrele(ires[j]); 1391 1392 if (invalidate) { 1393 /* 1394 * Since we needed to allocate but couldn't we need to make 1395 * sure that the dependency chain is rebuilt the next time. 1396 */ 1397 ire_dep_invalidate_generations(ires[0]); 1398 generation = IRE_GENERATION_VERIFY; 1399 } else { 1400 /* 1401 * IREs can have been added or deleted while we did the 1402 * recursive lookup and we can't catch those until we've built 1403 * the dependencies. We verify the stored 1404 * ire_dep_parent_generation to catch any such changes and 1405 * return IRE_GENERATION_VERIFY (which will cause 1406 * ip_select_route to be called again so we can redo the 1407 * recursive lookup next time we send a packet. 1408 */ 1409 if (ires[0]->ire_dep_parent == NULL) 1410 generation = ires[0]->ire_generation; 1411 else 1412 generation = ire_dep_validate_generations(ires[0]); 1413 if (generations[0] != ires[0]->ire_generation) { 1414 /* Something changed at the top */ 1415 generation = IRE_GENERATION_VERIFY; 1416 } 1417 } 1418 if (generationp != NULL) 1419 *generationp = generation; 1420 1421 return (ires[0]); 1422 } 1423 1424 ire_t * 1425 ire_route_recursive_v6(const in6_addr_t *nexthop, uint_t ire_type, 1426 const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args, 1427 uint_t irr_flags, uint32_t xmit_hint, ip_stack_t *ipst, 1428 in6_addr_t *setsrcp, tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp) 1429 { 1430 return (ire_route_recursive_impl_v6(NULL, nexthop, ire_type, ill, 1431 zoneid, tsl, match_args, irr_flags, xmit_hint, ipst, setsrcp, 1432 gwattrp, generationp)); 1433 } 1434 1435 /* 1436 * Recursively look for a route to the destination. 1437 * We only handle a destination match here, yet we have the same arguments 1438 * as the full match to allow function pointers to select between the two. 1439 * 1440 * Note that this function never returns NULL. It returns an IRE_NOROUTE 1441 * instead. 1442 * 1443 * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it 1444 * is an error. 1445 * Allow at most one RTF_INDIRECT. 1446 */ 1447 ire_t * 1448 ire_route_recursive_dstonly_v6(const in6_addr_t *nexthop, uint_t irr_flags, 1449 uint32_t xmit_hint, ip_stack_t *ipst) 1450 { 1451 ire_t *ire; 1452 ire_t *ire1; 1453 uint_t generation; 1454 1455 /* ire_ftable_lookup handles round-robin/ECMP */ 1456 ire = ire_ftable_lookup_simple_v6(nexthop, xmit_hint, ipst, 1457 &generation); 1458 ASSERT(ire != NULL); 1459 1460 /* 1461 * If this type should have an ire_nce_cache (even if it 1462 * doesn't yet have one) then we are done. Includes 1463 * IRE_INTERFACE with a full 128 bit mask. 1464 */ 1465 if (ire->ire_nce_capable) 1466 return (ire); 1467 1468 /* 1469 * If the IRE has a current cached parent we know that the whole 1470 * parent chain is current, hence we don't need to discover and 1471 * build any dependencies by doing a recursive lookup. 1472 */ 1473 mutex_enter(&ire->ire_lock); 1474 if (ire->ire_dep_parent != NULL && 1475 ire->ire_dep_parent->ire_generation == 1476 ire->ire_dep_parent_generation) { 1477 mutex_exit(&ire->ire_lock); 1478 return (ire); 1479 } 1480 mutex_exit(&ire->ire_lock); 1481 1482 /* 1483 * Fallback to loop in the normal code starting with the ire 1484 * we found. Normally this would return the same ire. 1485 */ 1486 ire1 = ire_route_recursive_impl_v6(ire, nexthop, 0, NULL, ALL_ZONES, 1487 NULL, MATCH_IRE_DSTONLY, irr_flags, xmit_hint, ipst, NULL, NULL, 1488 &generation); 1489 ire_refrele(ire); 1490 return (ire1); 1491 } 1492