1 /* 2 * CDDL HEADER START 3 * 4 * The contents of this file are subject to the terms of the 5 * Common Development and Distribution License (the "License"). 6 * You may not use this file except in compliance with the License. 7 * 8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 9 * or http://www.opensolaris.org/os/licensing. 10 * See the License for the specific language governing permissions 11 * and limitations under the License. 12 * 13 * When distributing Covered Code, include this CDDL HEADER in each 14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 15 * If applicable, add the following below this CDDL HEADER, with the 16 * fields enclosed by brackets "[]" replaced with your own identifying 17 * information: Portions Copyright [yyyy] [name of copyright owner] 18 * 19 * CDDL HEADER END 20 */ 21 /* 22 * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 23 * Use is subject to license terms. 24 */ 25 /* 26 * Copyright (c) 1990 Mentat Inc. 27 */ 28 29 /* 30 * This file contains routines that manipulate Internet Routing Entries (IREs). 31 */ 32 #include <sys/types.h> 33 #include <sys/stream.h> 34 #include <sys/stropts.h> 35 #include <sys/ddi.h> 36 #include <sys/cmn_err.h> 37 38 #include <sys/systm.h> 39 #include <sys/param.h> 40 #include <sys/socket.h> 41 #include <net/if.h> 42 #include <net/route.h> 43 #include <netinet/in.h> 44 #include <net/if_dl.h> 45 #include <netinet/ip6.h> 46 #include <netinet/icmp6.h> 47 48 #include <inet/common.h> 49 #include <inet/mi.h> 50 #include <inet/ip.h> 51 #include <inet/ip6.h> 52 #include <inet/ip_ndp.h> 53 #include <inet/ip_if.h> 54 #include <inet/ip_ire.h> 55 #include <inet/ipclassifier.h> 56 #include <inet/nd.h> 57 #include <sys/kmem.h> 58 #include <sys/zone.h> 59 60 #include <sys/tsol/label.h> 61 #include <sys/tsol/tnet.h> 62 63 #define IS_DEFAULT_ROUTE_V6(ire) \ 64 (((ire)->ire_type & IRE_DEFAULT) || \ 65 (((ire)->ire_type & IRE_INTERFACE) && \ 66 (IN6_IS_ADDR_UNSPECIFIED(&(ire)->ire_addr_v6)))) 67 68 static ire_t ire_null; 69 70 static ire_t * 71 ire_ftable_lookup_impl_v6(const in6_addr_t *addr, const in6_addr_t *mask, 72 const in6_addr_t *gateway, int type, const ill_t *ill, 73 zoneid_t zoneid, const ts_label_t *tsl, int flags, 74 ip_stack_t *ipst); 75 76 /* 77 * Initialize the ire that is specific to IPv6 part and call 78 * ire_init_common to finish it. 79 * Returns zero or errno. 80 */ 81 int 82 ire_init_v6(ire_t *ire, const in6_addr_t *v6addr, const in6_addr_t *v6mask, 83 const in6_addr_t *v6gateway, ushort_t type, ill_t *ill, 84 zoneid_t zoneid, uint_t flags, tsol_gc_t *gc, ip_stack_t *ipst) 85 { 86 int error; 87 88 /* 89 * Reject IRE security attmakeribute creation/initialization 90 * if system is not running in Trusted mode. 91 */ 92 if (gc != NULL && !is_system_labeled()) 93 return (EINVAL); 94 95 BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_alloced); 96 if (v6addr != NULL) 97 ire->ire_addr_v6 = *v6addr; 98 if (v6gateway != NULL) 99 ire->ire_gateway_addr_v6 = *v6gateway; 100 101 /* Make sure we don't have stray values in some fields */ 102 switch (type) { 103 case IRE_LOOPBACK: 104 ire->ire_gateway_addr_v6 = ire->ire_addr_v6; 105 /* FALLTHRU */ 106 case IRE_HOST: 107 case IRE_LOCAL: 108 case IRE_IF_CLONE: 109 ire->ire_mask_v6 = ipv6_all_ones; 110 ire->ire_masklen = IPV6_ABITS; 111 break; 112 case IRE_PREFIX: 113 case IRE_DEFAULT: 114 case IRE_IF_RESOLVER: 115 case IRE_IF_NORESOLVER: 116 if (v6mask != NULL) { 117 ire->ire_mask_v6 = *v6mask; 118 ire->ire_masklen = 119 ip_mask_to_plen_v6(&ire->ire_mask_v6); 120 } 121 break; 122 case IRE_MULTICAST: 123 case IRE_NOROUTE: 124 ASSERT(v6mask == NULL); 125 break; 126 default: 127 ASSERT(0); 128 return (EINVAL); 129 } 130 131 error = ire_init_common(ire, type, ill, zoneid, flags, IPV6_VERSION, 132 gc, ipst); 133 if (error != NULL) 134 return (error); 135 136 /* Determine which function pointers to use */ 137 ire->ire_postfragfn = ip_xmit; /* Common case */ 138 139 switch (ire->ire_type) { 140 case IRE_LOCAL: 141 ire->ire_sendfn = ire_send_local_v6; 142 ire->ire_recvfn = ire_recv_local_v6; 143 ASSERT(ire->ire_ill != NULL); 144 if (ire->ire_ill->ill_flags & ILLF_NOACCEPT) 145 ire->ire_recvfn = ire_recv_noaccept_v6; 146 break; 147 case IRE_LOOPBACK: 148 ire->ire_sendfn = ire_send_local_v6; 149 ire->ire_recvfn = ire_recv_loopback_v6; 150 break; 151 case IRE_MULTICAST: 152 ire->ire_postfragfn = ip_postfrag_loopcheck; 153 ire->ire_sendfn = ire_send_multicast_v6; 154 ire->ire_recvfn = ire_recv_multicast_v6; 155 break; 156 default: 157 /* 158 * For IRE_IF_ALL and IRE_OFFLINK we forward received 159 * packets by default. 160 */ 161 ire->ire_sendfn = ire_send_wire_v6; 162 ire->ire_recvfn = ire_recv_forward_v6; 163 break; 164 } 165 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 166 ire->ire_sendfn = ire_send_noroute_v6; 167 ire->ire_recvfn = ire_recv_noroute_v6; 168 } else if (ire->ire_flags & RTF_MULTIRT) { 169 ire->ire_postfragfn = ip_postfrag_multirt_v6; 170 ire->ire_sendfn = ire_send_multirt_v6; 171 ire->ire_recvfn = ire_recv_multirt_v6; 172 } 173 ire->ire_nce_capable = ire_determine_nce_capable(ire); 174 return (0); 175 } 176 177 /* 178 * ire_create_v6 is called to allocate and initialize a new IRE. 179 * 180 * NOTE : This is called as writer sometimes though not required 181 * by this function. 182 */ 183 /* ARGSUSED */ 184 ire_t * 185 ire_create_v6(const in6_addr_t *v6addr, const in6_addr_t *v6mask, 186 const in6_addr_t *v6gateway, ushort_t type, ill_t *ill, zoneid_t zoneid, 187 uint_t flags, tsol_gc_t *gc, ip_stack_t *ipst) 188 { 189 ire_t *ire; 190 int error; 191 192 ASSERT(!IN6_IS_ADDR_V4MAPPED(v6addr)); 193 194 ire = kmem_cache_alloc(ire_cache, KM_NOSLEEP); 195 if (ire == NULL) { 196 DTRACE_PROBE(kmem__cache__alloc); 197 return (NULL); 198 } 199 *ire = ire_null; 200 201 error = ire_init_v6(ire, v6addr, v6mask, v6gateway, 202 type, ill, zoneid, flags, gc, ipst); 203 204 if (error != 0) { 205 DTRACE_PROBE2(ire__init__v6, ire_t *, ire, int, error); 206 kmem_cache_free(ire_cache, ire); 207 return (NULL); 208 } 209 return (ire); 210 } 211 212 /* 213 * Find the ill matching a multicast group. 214 * Allows different routes for multicast addresses 215 * in the unicast routing table (akin to FF::0/8 but could be more specific) 216 * which point at different interfaces. This is used when IPV6_MULTICAST_IF 217 * isn't specified (when sending) and when IPV6_JOIN_GROUP doesn't 218 * specify the interface to join on. 219 * 220 * Supports link-local addresses by using ire_route_recursive which follows 221 * the ill when recursing. 222 * 223 * To handle CGTP, since we don't have a separate IRE_MULTICAST for each group 224 * and the MULTIRT property can be different for different groups, we 225 * extract RTF_MULTIRT from the special unicast route added for a group 226 * with CGTP and pass that back in the multirtp argument. 227 * This is used in ip_set_destination etc to set ixa_postfragfn for multicast. 228 * We have a setsrcp argument for the same reason. 229 */ 230 ill_t * 231 ire_lookup_multi_ill_v6(const in6_addr_t *group, zoneid_t zoneid, 232 ip_stack_t *ipst, boolean_t *multirtp, in6_addr_t *setsrcp) 233 { 234 ire_t *ire; 235 ill_t *ill; 236 237 ire = ire_route_recursive_v6(group, 0, NULL, zoneid, NULL, 238 MATCH_IRE_DSTONLY, IRR_NONE, 0, ipst, setsrcp, NULL, NULL); 239 ASSERT(ire != NULL); 240 241 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) { 242 ire_refrele(ire); 243 return (NULL); 244 } 245 246 if (multirtp != NULL) 247 *multirtp = (ire->ire_flags & RTF_MULTIRT) != 0; 248 249 ill = ire_nexthop_ill(ire); 250 ire_refrele(ire); 251 return (ill); 252 } 253 254 /* 255 * This function takes a mask and returns number of bits set in the 256 * mask (the represented prefix length). Assumes a contiguous mask. 257 */ 258 int 259 ip_mask_to_plen_v6(const in6_addr_t *v6mask) 260 { 261 int bits; 262 int plen = IPV6_ABITS; 263 int i; 264 265 for (i = 3; i >= 0; i--) { 266 if (v6mask->s6_addr32[i] == 0) { 267 plen -= 32; 268 continue; 269 } 270 bits = ffs(ntohl(v6mask->s6_addr32[i])) - 1; 271 if (bits == 0) 272 break; 273 plen -= bits; 274 } 275 276 return (plen); 277 } 278 279 /* 280 * Convert a prefix length to the mask for that prefix. 281 * Returns the argument bitmask. 282 */ 283 in6_addr_t * 284 ip_plen_to_mask_v6(uint_t plen, in6_addr_t *bitmask) 285 { 286 uint32_t *ptr; 287 288 if (plen < 0 || plen > IPV6_ABITS) 289 return (NULL); 290 *bitmask = ipv6_all_zeros; 291 if (plen == 0) 292 return (bitmask); 293 294 ptr = (uint32_t *)bitmask; 295 while (plen > 32) { 296 *ptr++ = 0xffffffffU; 297 plen -= 32; 298 } 299 *ptr = htonl(0xffffffffU << (32 - plen)); 300 return (bitmask); 301 } 302 303 /* 304 * Add a fully initialized IPv6 IRE to the forwarding table. 305 * This returns NULL on failure, or a held IRE on success. 306 * Normally the returned IRE is the same as the argument. But a different 307 * IRE will be returned if the added IRE is deemed identical to an existing 308 * one. In that case ire_identical_ref will be increased. 309 * The caller always needs to do an ire_refrele() on the returned IRE. 310 */ 311 ire_t * 312 ire_add_v6(ire_t *ire) 313 { 314 ire_t *ire1; 315 int mask_table_index; 316 irb_t *irb_ptr; 317 ire_t **irep; 318 int match_flags; 319 int error; 320 ip_stack_t *ipst = ire->ire_ipst; 321 322 ASSERT(ire->ire_ipversion == IPV6_VERSION); 323 324 /* Make sure the address is properly masked. */ 325 V6_MASK_COPY(ire->ire_addr_v6, ire->ire_mask_v6, ire->ire_addr_v6); 326 327 mask_table_index = ip_mask_to_plen_v6(&ire->ire_mask_v6); 328 if ((ipst->ips_ip_forwarding_table_v6[mask_table_index]) == NULL) { 329 irb_t *ptr; 330 int i; 331 332 ptr = (irb_t *)mi_zalloc((ipst->ips_ip6_ftable_hash_size * 333 sizeof (irb_t))); 334 if (ptr == NULL) { 335 ire_delete(ire); 336 return (NULL); 337 } 338 for (i = 0; i < ipst->ips_ip6_ftable_hash_size; i++) { 339 rw_init(&ptr[i].irb_lock, NULL, RW_DEFAULT, NULL); 340 ptr[i].irb_ipst = ipst; 341 } 342 mutex_enter(&ipst->ips_ire_ft_init_lock); 343 if (ipst->ips_ip_forwarding_table_v6[mask_table_index] == 344 NULL) { 345 ipst->ips_ip_forwarding_table_v6[mask_table_index] = 346 ptr; 347 mutex_exit(&ipst->ips_ire_ft_init_lock); 348 } else { 349 /* 350 * Some other thread won the race in 351 * initializing the forwarding table at the 352 * same index. 353 */ 354 mutex_exit(&ipst->ips_ire_ft_init_lock); 355 for (i = 0; i < ipst->ips_ip6_ftable_hash_size; i++) { 356 rw_destroy(&ptr[i].irb_lock); 357 } 358 mi_free(ptr); 359 } 360 } 361 irb_ptr = &(ipst->ips_ip_forwarding_table_v6[mask_table_index][ 362 IRE_ADDR_MASK_HASH_V6(ire->ire_addr_v6, ire->ire_mask_v6, 363 ipst->ips_ip6_ftable_hash_size)]); 364 365 match_flags = (MATCH_IRE_MASK | MATCH_IRE_TYPE | MATCH_IRE_GW); 366 if (ire->ire_ill != NULL) 367 match_flags |= MATCH_IRE_ILL; 368 /* 369 * Start the atomic add of the ire. Grab the bucket lock and the 370 * ill lock. Check for condemned. 371 */ 372 error = ire_atomic_start(irb_ptr, ire); 373 if (error != 0) { 374 ire_delete(ire); 375 return (NULL); 376 } 377 378 /* 379 * If we are creating a hidden IRE, make sure we search for 380 * hidden IREs when searching for duplicates below. 381 * Otherwise, we might find an IRE on some other interface 382 * that's not marked hidden. 383 */ 384 if (ire->ire_testhidden) 385 match_flags |= MATCH_IRE_TESTHIDDEN; 386 387 /* 388 * Atomically check for duplicate and insert in the table. 389 */ 390 for (ire1 = irb_ptr->irb_ire; ire1 != NULL; ire1 = ire1->ire_next) { 391 if (IRE_IS_CONDEMNED(ire1)) 392 continue; 393 /* 394 * Here we need an exact match on zoneid, i.e., 395 * ire_match_args doesn't fit. 396 */ 397 if (ire1->ire_zoneid != ire->ire_zoneid) 398 continue; 399 400 if (ire1->ire_type != ire->ire_type) 401 continue; 402 403 /* 404 * Note: We do not allow multiple routes that differ only 405 * in the gateway security attributes; such routes are 406 * considered duplicates. 407 * To change that we explicitly have to treat them as 408 * different here. 409 */ 410 if (ire_match_args_v6(ire1, &ire->ire_addr_v6, 411 &ire->ire_mask_v6, &ire->ire_gateway_addr_v6, 412 ire->ire_type, ire->ire_ill, ire->ire_zoneid, NULL, 413 match_flags)) { 414 /* 415 * Return the old ire after doing a REFHOLD. 416 * As most of the callers continue to use the IRE 417 * after adding, we return a held ire. This will 418 * avoid a lookup in the caller again. If the callers 419 * don't want to use it, they need to do a REFRELE. 420 */ 421 ip1dbg(("found dup ire existing %p new %p", 422 (void *)ire1, (void *)ire)); 423 ire_refhold(ire1); 424 atomic_add_32(&ire1->ire_identical_ref, 1); 425 ire_atomic_end(irb_ptr, ire); 426 ire_delete(ire); 427 return (ire1); 428 } 429 } 430 431 /* 432 * Normally we do head insertion since most things do not care about 433 * the order of the IREs in the bucket. 434 * However, due to shared-IP zones (and restrict_interzone_loopback) 435 * we can have an IRE_LOCAL as well as IRE_IF_CLONE for the same 436 * address. For that reason we do tail insertion for IRE_IF_CLONE. 437 */ 438 irep = (ire_t **)irb_ptr; 439 if (ire->ire_type & IRE_IF_CLONE) { 440 while ((ire1 = *irep) != NULL) 441 irep = &ire1->ire_next; 442 } 443 /* Insert at *irep */ 444 ire1 = *irep; 445 if (ire1 != NULL) 446 ire1->ire_ptpn = &ire->ire_next; 447 ire->ire_next = ire1; 448 /* Link the new one in. */ 449 ire->ire_ptpn = irep; 450 /* 451 * ire_walk routines de-reference ire_next without holding 452 * a lock. Before we point to the new ire, we want to make 453 * sure the store that sets the ire_next of the new ire 454 * reaches global visibility, so that ire_walk routines 455 * don't see a truncated list of ires i.e if the ire_next 456 * of the new ire gets set after we do "*irep = ire" due 457 * to re-ordering, the ire_walk thread will see a NULL 458 * once it accesses the ire_next of the new ire. 459 * membar_producer() makes sure that the following store 460 * happens *after* all of the above stores. 461 */ 462 membar_producer(); 463 *irep = ire; 464 ire->ire_bucket = irb_ptr; 465 /* 466 * We return a bumped up IRE above. Keep it symmetrical 467 * so that the callers will always have to release. This 468 * helps the callers of this function because they continue 469 * to use the IRE after adding and hence they don't have to 470 * lookup again after we return the IRE. 471 * 472 * NOTE : We don't have to use atomics as this is appearing 473 * in the list for the first time and no one else can bump 474 * up the reference count on this yet. 475 */ 476 ire_refhold_locked(ire); 477 BUMP_IRE_STATS(ipst->ips_ire_stats_v6, ire_stats_inserted); 478 irb_ptr->irb_ire_cnt++; 479 480 if (ire->ire_ill != NULL) { 481 DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ire->ire_ill, 482 (char *), "ire", (void *), ire); 483 ire->ire_ill->ill_ire_cnt++; 484 ASSERT(ire->ire_ill->ill_ire_cnt != 0); /* Wraparound */ 485 } 486 ire_atomic_end(irb_ptr, ire); 487 488 /* Make any caching of the IREs be notified or updated */ 489 ire_flush_cache_v6(ire, IRE_FLUSH_ADD); 490 491 return (ire); 492 } 493 494 /* 495 * Search for all HOST REDIRECT routes that are 496 * pointing at the specified gateway and 497 * delete them. This routine is called only 498 * when a default gateway is going away. 499 */ 500 static void 501 ire_delete_host_redirects_v6(const in6_addr_t *gateway, ip_stack_t *ipst) 502 { 503 irb_t *irb_ptr; 504 irb_t *irb; 505 ire_t *ire; 506 in6_addr_t gw_addr_v6; 507 int i; 508 509 /* get the hash table for HOST routes */ 510 irb_ptr = ipst->ips_ip_forwarding_table_v6[(IP6_MASK_TABLE_SIZE - 1)]; 511 if (irb_ptr == NULL) 512 return; 513 for (i = 0; (i < ipst->ips_ip6_ftable_hash_size); i++) { 514 irb = &irb_ptr[i]; 515 irb_refhold(irb); 516 for (ire = irb->irb_ire; ire != NULL; ire = ire->ire_next) { 517 if (!(ire->ire_flags & RTF_DYNAMIC)) 518 continue; 519 mutex_enter(&ire->ire_lock); 520 gw_addr_v6 = ire->ire_gateway_addr_v6; 521 mutex_exit(&ire->ire_lock); 522 if (IN6_ARE_ADDR_EQUAL(&gw_addr_v6, gateway)) 523 ire_delete(ire); 524 } 525 irb_refrele(irb); 526 } 527 } 528 529 /* 530 * Delete the specified IRE. 531 * All calls should use ire_delete(). 532 * Sometimes called as writer though not required by this function. 533 * 534 * NOTE : This function is called only if the ire was added 535 * in the list. 536 */ 537 void 538 ire_delete_v6(ire_t *ire) 539 { 540 in6_addr_t gw_addr_v6; 541 ip_stack_t *ipst = ire->ire_ipst; 542 543 /* 544 * Make sure ire_generation increases from ire_flush_cache happen 545 * after any lookup/reader has read ire_generation. 546 * Since the rw_enter makes us wait until any lookup/reader has 547 * completed we can exit the lock immediately. 548 */ 549 rw_enter(&ipst->ips_ip6_ire_head_lock, RW_WRITER); 550 rw_exit(&ipst->ips_ip6_ire_head_lock); 551 552 ASSERT(ire->ire_refcnt >= 1); 553 ASSERT(ire->ire_ipversion == IPV6_VERSION); 554 555 ire_flush_cache_v6(ire, IRE_FLUSH_DELETE); 556 557 if (ire->ire_type == IRE_DEFAULT) { 558 /* 559 * when a default gateway is going away 560 * delete all the host redirects pointing at that 561 * gateway. 562 */ 563 mutex_enter(&ire->ire_lock); 564 gw_addr_v6 = ire->ire_gateway_addr_v6; 565 mutex_exit(&ire->ire_lock); 566 ire_delete_host_redirects_v6(&gw_addr_v6, ipst); 567 } 568 569 /* 570 * If we are deleting an IRE_INTERFACE then we make sure we also 571 * delete any IRE_IF_CLONE that has been created from it. 572 * Those are always in ire_dep_children. 573 */ 574 if ((ire->ire_type & IRE_INTERFACE) && ire->ire_dep_children != 0) 575 ire_dep_delete_if_clone(ire); 576 577 /* Remove from parent dependencies and child */ 578 rw_enter(&ipst->ips_ire_dep_lock, RW_WRITER); 579 if (ire->ire_dep_parent != NULL) { 580 ire_dep_remove(ire); 581 } 582 while (ire->ire_dep_children != NULL) 583 ire_dep_remove(ire->ire_dep_children); 584 rw_exit(&ipst->ips_ire_dep_lock); 585 } 586 587 /* 588 * When an IRE is added or deleted this routine is called to make sure 589 * any caching of IRE information is notified or updated. 590 * 591 * The flag argument indicates if the flush request is due to addition 592 * of new route (IRE_FLUSH_ADD), deletion of old route (IRE_FLUSH_DELETE), 593 * or a change to ire_gateway_addr (IRE_FLUSH_GWCHANGE). 594 */ 595 void 596 ire_flush_cache_v6(ire_t *ire, int flag) 597 { 598 ip_stack_t *ipst = ire->ire_ipst; 599 600 /* 601 * IRE_IF_CLONE ire's don't provide any new information 602 * than the parent from which they are cloned, so don't 603 * perturb the generation numbers. 604 */ 605 if (ire->ire_type & IRE_IF_CLONE) 606 return; 607 608 /* 609 * Ensure that an ire_add during a lookup serializes the updates of 610 * the generation numbers under ire_head_lock so that the lookup gets 611 * either the old ire and old generation number, or a new ire and new 612 * generation number. 613 */ 614 rw_enter(&ipst->ips_ip6_ire_head_lock, RW_WRITER); 615 616 /* 617 * If a route was just added, we need to notify everybody that 618 * has cached an IRE_NOROUTE since there might now be a better 619 * route for them. 620 */ 621 if (flag == IRE_FLUSH_ADD) { 622 ire_increment_generation(ipst->ips_ire_reject_v6); 623 ire_increment_generation(ipst->ips_ire_blackhole_v6); 624 } 625 626 /* Adding a default can't otherwise provide a better route */ 627 if (ire->ire_type == IRE_DEFAULT && flag == IRE_FLUSH_ADD) { 628 rw_exit(&ipst->ips_ip6_ire_head_lock); 629 return; 630 } 631 632 switch (flag) { 633 case IRE_FLUSH_DELETE: 634 case IRE_FLUSH_GWCHANGE: 635 /* 636 * Update ire_generation for all ire_dep_children chains 637 * starting with this IRE 638 */ 639 ire_dep_incr_generation(ire); 640 break; 641 case IRE_FLUSH_ADD: { 642 in6_addr_t addr; 643 in6_addr_t mask; 644 ip_stack_t *ipst = ire->ire_ipst; 645 uint_t masklen; 646 647 /* 648 * Find an IRE which is a shorter match than the ire to be added 649 * For any such IRE (which we repeat) we update the 650 * ire_generation the same way as in the delete case. 651 */ 652 addr = ire->ire_addr_v6; 653 mask = ire->ire_mask_v6; 654 masklen = ip_mask_to_plen_v6(&mask); 655 656 ire = ire_ftable_lookup_impl_v6(&addr, &mask, NULL, 0, NULL, 657 ALL_ZONES, NULL, MATCH_IRE_SHORTERMASK, ipst); 658 while (ire != NULL) { 659 /* We need to handle all in the same bucket */ 660 irb_increment_generation(ire->ire_bucket); 661 662 mask = ire->ire_mask_v6; 663 ASSERT(masklen > ip_mask_to_plen_v6(&mask)); 664 masklen = ip_mask_to_plen_v6(&mask); 665 ire_refrele(ire); 666 ire = ire_ftable_lookup_impl_v6(&addr, &mask, NULL, 0, 667 NULL, ALL_ZONES, NULL, MATCH_IRE_SHORTERMASK, ipst); 668 } 669 } 670 break; 671 } 672 rw_exit(&ipst->ips_ip6_ire_head_lock); 673 } 674 675 /* 676 * Matches the arguments passed with the values in the ire. 677 * 678 * Note: for match types that match using "ill" passed in, ill 679 * must be checked for non-NULL before calling this routine. 680 */ 681 boolean_t 682 ire_match_args_v6(ire_t *ire, const in6_addr_t *addr, const in6_addr_t *mask, 683 const in6_addr_t *gateway, int type, const ill_t *ill, zoneid_t zoneid, 684 const ts_label_t *tsl, int match_flags) 685 { 686 in6_addr_t masked_addr; 687 in6_addr_t gw_addr_v6; 688 ill_t *ire_ill = NULL, *dst_ill; 689 ip_stack_t *ipst = ire->ire_ipst; 690 691 ASSERT(ire->ire_ipversion == IPV6_VERSION); 692 ASSERT(addr != NULL); 693 ASSERT(mask != NULL); 694 ASSERT((!(match_flags & MATCH_IRE_GW)) || gateway != NULL); 695 ASSERT((!(match_flags & MATCH_IRE_ILL)) || 696 (ill != NULL && ill->ill_isv6)); 697 698 /* 699 * If MATCH_IRE_TESTHIDDEN is set, then only return the IRE if it 700 * is in fact hidden, to ensure the caller gets the right one. 701 */ 702 if (ire->ire_testhidden) { 703 if (!(match_flags & MATCH_IRE_TESTHIDDEN)) 704 return (B_FALSE); 705 } 706 707 if (zoneid != ALL_ZONES && zoneid != ire->ire_zoneid && 708 ire->ire_zoneid != ALL_ZONES) { 709 /* 710 * If MATCH_IRE_ZONEONLY has been set and the supplied zoneid 711 * does not match that of ire_zoneid, a failure to 712 * match is reported at this point. Otherwise, since some IREs 713 * that are available in the global zone can be used in local 714 * zones, additional checks need to be performed: 715 * 716 * IRE_LOOPBACK 717 * entries should never be matched in this situation. 718 * Each zone has its own IRE_LOOPBACK. 719 * 720 * IRE_LOCAL 721 * We allow them for any zoneid. ire_route_recursive 722 * does additional checks when 723 * ip_restrict_interzone_loopback is set. 724 * 725 * If ill_usesrc_ifindex is set 726 * Then we check if the zone has a valid source address 727 * on the usesrc ill. 728 * 729 * If ire_ill is set, then check that the zone has an ipif 730 * on that ill. 731 * 732 * Outside of this function (in ire_round_robin) we check 733 * that any IRE_OFFLINK has a gateway that reachable from the 734 * zone when we have multiple choices (ECMP). 735 */ 736 if (match_flags & MATCH_IRE_ZONEONLY) 737 return (B_FALSE); 738 if (ire->ire_type & IRE_LOOPBACK) 739 return (B_FALSE); 740 741 if (ire->ire_type & IRE_LOCAL) 742 goto matchit; 743 744 /* 745 * The normal case of IRE_ONLINK has a matching zoneid. 746 * Here we handle the case when shared-IP zones have been 747 * configured with IP addresses on vniN. In that case it 748 * is ok for traffic from a zone to use IRE_ONLINK routes 749 * if the ill has a usesrc pointing at vniN 750 * Applies to IRE_INTERFACE. 751 */ 752 dst_ill = ire->ire_ill; 753 if (ire->ire_type & IRE_ONLINK) { 754 uint_t ifindex; 755 756 /* 757 * Note there is no IRE_INTERFACE on vniN thus 758 * can't do an IRE lookup for a matching route. 759 */ 760 ifindex = dst_ill->ill_usesrc_ifindex; 761 if (ifindex == 0) 762 return (B_FALSE); 763 764 /* 765 * If there is a usable source address in the 766 * zone, then it's ok to return this IRE_INTERFACE 767 */ 768 if (!ipif_zone_avail(ifindex, dst_ill->ill_isv6, 769 zoneid, ipst)) { 770 ip3dbg(("ire_match_args: no usrsrc for zone" 771 " dst_ill %p\n", (void *)dst_ill)); 772 return (B_FALSE); 773 } 774 } 775 /* 776 * For exampe, with 777 * route add 11.0.0.0 gw1 -ifp bge0 778 * route add 11.0.0.0 gw2 -ifp bge1 779 * this code would differentiate based on 780 * where the sending zone has addresses. 781 * Only if the zone has an address on bge0 can it use the first 782 * route. It isn't clear if this behavior is documented 783 * anywhere. 784 */ 785 if (dst_ill != NULL && (ire->ire_type & IRE_OFFLINK)) { 786 ipif_t *tipif; 787 788 mutex_enter(&dst_ill->ill_lock); 789 for (tipif = dst_ill->ill_ipif; 790 tipif != NULL; tipif = tipif->ipif_next) { 791 if (!IPIF_IS_CONDEMNED(tipif) && 792 (tipif->ipif_flags & IPIF_UP) && 793 (tipif->ipif_zoneid == zoneid || 794 tipif->ipif_zoneid == ALL_ZONES)) 795 break; 796 } 797 mutex_exit(&dst_ill->ill_lock); 798 if (tipif == NULL) 799 return (B_FALSE); 800 } 801 } 802 803 matchit: 804 if (match_flags & MATCH_IRE_GW) { 805 mutex_enter(&ire->ire_lock); 806 gw_addr_v6 = ire->ire_gateway_addr_v6; 807 mutex_exit(&ire->ire_lock); 808 } 809 if (match_flags & MATCH_IRE_ILL) { 810 ire_ill = ire->ire_ill; 811 812 /* 813 * If asked to match an ill, we *must* match 814 * on the ire_ill for ipmp test addresses, or 815 * any of the ill in the group for data addresses. 816 * If we don't, we may as well fail. 817 * However, we need an exception for IRE_LOCALs to ensure 818 * we loopback packets even sent to test addresses on different 819 * interfaces in the group. 820 */ 821 if ((match_flags & MATCH_IRE_TESTHIDDEN) && 822 !(ire->ire_type & IRE_LOCAL)) { 823 if (ire->ire_ill != ill) 824 return (B_FALSE); 825 } else { 826 match_flags &= ~MATCH_IRE_TESTHIDDEN; 827 /* 828 * We know that ill is not NULL, but ire_ill could be 829 * NULL 830 */ 831 if (ire_ill == NULL || !IS_ON_SAME_LAN(ill, ire_ill)) 832 return (B_FALSE); 833 } 834 } 835 /* No ire_addr_v6 bits set past the mask */ 836 ASSERT(V6_MASK_EQ(ire->ire_addr_v6, ire->ire_mask_v6, 837 ire->ire_addr_v6)); 838 V6_MASK_COPY(*addr, *mask, masked_addr); 839 if (V6_MASK_EQ(*addr, *mask, ire->ire_addr_v6) && 840 ((!(match_flags & MATCH_IRE_GW)) || 841 IN6_ARE_ADDR_EQUAL(&gw_addr_v6, gateway)) && 842 ((!(match_flags & MATCH_IRE_TYPE)) || (ire->ire_type & type)) && 843 ((!(match_flags & MATCH_IRE_TESTHIDDEN)) || ire->ire_testhidden) && 844 ((!(match_flags & MATCH_IRE_MASK)) || 845 (IN6_ARE_ADDR_EQUAL(&ire->ire_mask_v6, mask))) && 846 ((!(match_flags & MATCH_IRE_SECATTR)) || 847 (!is_system_labeled()) || 848 (tsol_ire_match_gwattr(ire, tsl) == 0))) { 849 /* We found the matched IRE */ 850 return (B_TRUE); 851 } 852 return (B_FALSE); 853 } 854 855 /* 856 * Check if the zoneid (not ALL_ZONES) has an IRE_INTERFACE for the specified 857 * gateway address. If ill is non-NULL we also match on it. 858 * The caller must hold a read lock on RADIX_NODE_HEAD if lock_held is set. 859 */ 860 boolean_t 861 ire_gateway_ok_zone_v6(const in6_addr_t *gateway, zoneid_t zoneid, ill_t *ill, 862 const ts_label_t *tsl, ip_stack_t *ipst, boolean_t lock_held) 863 { 864 ire_t *ire; 865 uint_t match_flags; 866 867 if (lock_held) 868 ASSERT(RW_READ_HELD(&ipst->ips_ip6_ire_head_lock)); 869 else 870 rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER); 871 872 match_flags = MATCH_IRE_TYPE | MATCH_IRE_SECATTR; 873 if (ill != NULL) 874 match_flags |= MATCH_IRE_ILL; 875 876 ire = ire_ftable_lookup_impl_v6(gateway, &ipv6_all_zeros, 877 &ipv6_all_zeros, IRE_INTERFACE, ill, zoneid, tsl, match_flags, 878 ipst); 879 880 if (!lock_held) 881 rw_exit(&ipst->ips_ip6_ire_head_lock); 882 if (ire != NULL) { 883 ire_refrele(ire); 884 return (B_TRUE); 885 } else { 886 return (B_FALSE); 887 } 888 } 889 890 /* 891 * Lookup a route in forwarding table. 892 * specific lookup is indicated by passing the 893 * required parameters and indicating the 894 * match required in flag field. 895 * 896 * Supports link-local addresses by following the ipif/ill when recursing. 897 */ 898 ire_t * 899 ire_ftable_lookup_v6(const in6_addr_t *addr, const in6_addr_t *mask, 900 const in6_addr_t *gateway, int type, const ill_t *ill, 901 zoneid_t zoneid, const ts_label_t *tsl, int flags, 902 uint32_t xmit_hint, ip_stack_t *ipst, uint_t *generationp) 903 { 904 ire_t *ire = NULL; 905 906 ASSERT(addr != NULL); 907 ASSERT((!(flags & MATCH_IRE_MASK)) || mask != NULL); 908 ASSERT((!(flags & MATCH_IRE_GW)) || gateway != NULL); 909 ASSERT(ill == NULL || ill->ill_isv6); 910 911 ASSERT(!IN6_IS_ADDR_V4MAPPED(addr)); 912 913 /* 914 * ire_match_args_v6() will dereference ill if MATCH_IRE_ILL 915 * is set. 916 */ 917 if ((flags & (MATCH_IRE_ILL)) && (ill == NULL)) 918 return (NULL); 919 920 rw_enter(&ipst->ips_ip6_ire_head_lock, RW_READER); 921 ire = ire_ftable_lookup_impl_v6(addr, mask, gateway, type, ill, zoneid, 922 tsl, flags, ipst); 923 if (ire == NULL) { 924 rw_exit(&ipst->ips_ip6_ire_head_lock); 925 return (NULL); 926 } 927 928 /* 929 * round-robin only if we have more than one route in the bucket. 930 * ips_ip_ecmp_behavior controls when we do ECMP 931 * 2: always 932 * 1: for IRE_DEFAULT and /0 IRE_INTERFACE 933 * 0: never 934 * 935 * Note: if we found an IRE_IF_CLONE we won't look at the bucket with 936 * other ECMP IRE_INTERFACEs since the IRE_IF_CLONE is a /128 match 937 * and the IRE_INTERFACESs are likely to be shorter matches. 938 */ 939 if (ire->ire_bucket->irb_ire_cnt > 1 && !(flags & MATCH_IRE_GW)) { 940 if (ipst->ips_ip_ecmp_behavior == 2 || 941 (ipst->ips_ip_ecmp_behavior == 1 && 942 IS_DEFAULT_ROUTE_V6(ire))) { 943 ire_t *next_ire; 944 ire_ftable_args_t margs; 945 946 bzero(&margs, sizeof (margs)); 947 margs.ift_addr_v6 = *addr; 948 if (mask != NULL) 949 margs.ift_mask_v6 = *mask; 950 if (gateway != NULL) 951 margs.ift_gateway_v6 = *gateway; 952 margs.ift_type = type; 953 margs.ift_ill = ill; 954 margs.ift_zoneid = zoneid; 955 margs.ift_tsl = tsl; 956 margs.ift_flags = flags; 957 958 next_ire = ire_round_robin(ire->ire_bucket, &margs, 959 xmit_hint, ire, ipst); 960 if (next_ire == NULL) { 961 /* keep ire if next_ire is null */ 962 goto done; 963 } 964 ire_refrele(ire); 965 ire = next_ire; 966 } 967 } 968 969 done: 970 /* Return generation before dropping lock */ 971 if (generationp != NULL) 972 *generationp = ire->ire_generation; 973 974 rw_exit(&ipst->ips_ip6_ire_head_lock); 975 976 /* 977 * For shared-IP zones we need additional checks to what was 978 * done in ire_match_args to make sure IRE_LOCALs are handled. 979 * 980 * When ip_restrict_interzone_loopback is set, then 981 * we ensure that IRE_LOCAL are only used for loopback 982 * between zones when the logical "Ethernet" would 983 * have looped them back. That is, if in the absense of 984 * the IRE_LOCAL we would have sent to packet out the 985 * same ill. 986 */ 987 if ((ire->ire_type & IRE_LOCAL) && zoneid != ALL_ZONES && 988 ire->ire_zoneid != zoneid && ire->ire_zoneid != ALL_ZONES && 989 ipst->ips_ip_restrict_interzone_loopback) { 990 ire = ire_alt_local(ire, zoneid, tsl, ill, generationp); 991 ASSERT(ire != NULL); 992 } 993 994 return (ire); 995 } 996 997 /* 998 * Look up a single ire. The caller holds either the read or write lock. 999 */ 1000 ire_t * 1001 ire_ftable_lookup_impl_v6(const in6_addr_t *addr, const in6_addr_t *mask, 1002 const in6_addr_t *gateway, int type, const ill_t *ill, 1003 zoneid_t zoneid, const ts_label_t *tsl, int flags, 1004 ip_stack_t *ipst) 1005 { 1006 irb_t *irb_ptr; 1007 ire_t *ire = NULL; 1008 int i; 1009 1010 ASSERT(RW_LOCK_HELD(&ipst->ips_ip6_ire_head_lock)); 1011 1012 /* 1013 * If the mask is known, the lookup 1014 * is simple, if the mask is not known 1015 * we need to search. 1016 */ 1017 if (flags & MATCH_IRE_MASK) { 1018 uint_t masklen; 1019 1020 masklen = ip_mask_to_plen_v6(mask); 1021 if (ipst->ips_ip_forwarding_table_v6[masklen] == NULL) { 1022 return (NULL); 1023 } 1024 irb_ptr = &(ipst->ips_ip_forwarding_table_v6[masklen][ 1025 IRE_ADDR_MASK_HASH_V6(*addr, *mask, 1026 ipst->ips_ip6_ftable_hash_size)]); 1027 rw_enter(&irb_ptr->irb_lock, RW_READER); 1028 for (ire = irb_ptr->irb_ire; ire != NULL; 1029 ire = ire->ire_next) { 1030 if (IRE_IS_CONDEMNED(ire)) 1031 continue; 1032 if (ire_match_args_v6(ire, addr, mask, gateway, type, 1033 ill, zoneid, tsl, flags)) 1034 goto found_ire; 1035 } 1036 rw_exit(&irb_ptr->irb_lock); 1037 } else { 1038 uint_t masklen; 1039 1040 /* 1041 * In this case we don't know the mask, we need to 1042 * search the table assuming different mask sizes. 1043 */ 1044 if (flags & MATCH_IRE_SHORTERMASK) { 1045 masklen = ip_mask_to_plen_v6(mask); 1046 if (masklen == 0) { 1047 /* Nothing shorter than zero */ 1048 return (NULL); 1049 } 1050 masklen--; 1051 } else { 1052 masklen = IP6_MASK_TABLE_SIZE - 1; 1053 } 1054 1055 for (i = masklen; i >= 0; i--) { 1056 in6_addr_t tmpmask; 1057 1058 if ((ipst->ips_ip_forwarding_table_v6[i]) == NULL) 1059 continue; 1060 (void) ip_plen_to_mask_v6(i, &tmpmask); 1061 irb_ptr = &ipst->ips_ip_forwarding_table_v6[i][ 1062 IRE_ADDR_MASK_HASH_V6(*addr, tmpmask, 1063 ipst->ips_ip6_ftable_hash_size)]; 1064 rw_enter(&irb_ptr->irb_lock, RW_READER); 1065 for (ire = irb_ptr->irb_ire; ire != NULL; 1066 ire = ire->ire_next) { 1067 if (IRE_IS_CONDEMNED(ire)) 1068 continue; 1069 if (ire_match_args_v6(ire, addr, 1070 &ire->ire_mask_v6, gateway, type, ill, 1071 zoneid, tsl, flags)) 1072 goto found_ire; 1073 } 1074 rw_exit(&irb_ptr->irb_lock); 1075 } 1076 } 1077 ASSERT(ire == NULL); 1078 ip1dbg(("ire_ftable_lookup_v6: returning NULL ire")); 1079 return (NULL); 1080 1081 found_ire: 1082 ire_refhold(ire); 1083 rw_exit(&irb_ptr->irb_lock); 1084 return (ire); 1085 } 1086 1087 1088 /* 1089 * This function is called by 1090 * ip_input/ire_route_recursive when doing a route lookup on only the 1091 * destination address. 1092 * 1093 * The optimizations of this function over ire_ftable_lookup are: 1094 * o removing unnecessary flag matching 1095 * o doing longest prefix match instead of overloading it further 1096 * with the unnecessary "best_prefix_match" 1097 * 1098 * If no route is found we return IRE_NOROUTE. 1099 */ 1100 ire_t * 1101 ire_ftable_lookup_simple_v6(const in6_addr_t *addr, uint32_t xmit_hint, 1102 ip_stack_t *ipst, uint_t *generationp) 1103 { 1104 ire_t *ire; 1105 1106 ire = ire_ftable_lookup_v6(addr, NULL, NULL, 0, NULL, ALL_ZONES, NULL, 1107 MATCH_IRE_DSTONLY, xmit_hint, ipst, generationp); 1108 if (ire == NULL) { 1109 ire = ire_reject(ipst, B_TRUE); 1110 if (generationp != NULL) 1111 *generationp = IRE_GENERATION_VERIFY; 1112 } 1113 /* ftable_lookup did round robin */ 1114 return (ire); 1115 } 1116 1117 ire_t * 1118 ip_select_route_v6(const in6_addr_t *dst, ip_xmit_attr_t *ixa, 1119 uint_t *generationp, in6_addr_t *setsrcp, int *errorp, boolean_t *multirtp) 1120 { 1121 ASSERT(!(ixa->ixa_flags & IXAF_IS_IPV4)); 1122 1123 return (ip_select_route(dst, ixa, generationp, setsrcp, errorp, 1124 multirtp)); 1125 } 1126 1127 /* 1128 * Recursively look for a route to the destination. Can also match on 1129 * the zoneid, ill, and label. Used for the data paths. See also 1130 * ire_route_recursive_dstonly. 1131 * 1132 * If ill is set this means we will match it by adding MATCH_IRE_ILL. 1133 * 1134 * If IRR_ALLOCATE is not set then we will only inspect the existing IREs; never 1135 * create an IRE_IF_CLONE. This is used on the receive side when we are not 1136 * forwarding. 1137 * If IRR_INCOMPLETE is set then we return the IRE even if we can't correctly 1138 * resolve the gateway. 1139 * 1140 * Note that this function never returns NULL. It returns an IRE_NOROUTE 1141 * instead. 1142 * 1143 * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it 1144 * is an error. 1145 * Allow at most one RTF_INDIRECT. 1146 */ 1147 ire_t * 1148 ire_route_recursive_impl_v6(ire_t *ire, 1149 const in6_addr_t *nexthop, uint_t ire_type, const ill_t *ill_arg, 1150 zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args, 1151 uint_t irr_flags, uint32_t xmit_hint, ip_stack_t *ipst, 1152 in6_addr_t *setsrcp, tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp) 1153 { 1154 int i, j; 1155 in6_addr_t v6nexthop = *nexthop; 1156 ire_t *ires[MAX_IRE_RECURSION]; 1157 uint_t generation; 1158 uint_t generations[MAX_IRE_RECURSION]; 1159 boolean_t need_refrele = B_FALSE; 1160 boolean_t invalidate = B_FALSE; 1161 int prefs[MAX_IRE_RECURSION]; 1162 ill_t *ill = NULL; 1163 1164 if (setsrcp != NULL) 1165 ASSERT(IN6_IS_ADDR_UNSPECIFIED(setsrcp)); 1166 if (gwattrp != NULL) 1167 ASSERT(*gwattrp == NULL); 1168 1169 if (ill_arg != NULL) 1170 match_args |= MATCH_IRE_ILL; 1171 1172 /* 1173 * We iterate up to three times to resolve a route, even though 1174 * we have four slots in the array. The extra slot is for an 1175 * IRE_IF_CLONE we might need to create. 1176 */ 1177 i = 0; 1178 while (i < MAX_IRE_RECURSION - 1) { 1179 /* ire_ftable_lookup handles round-robin/ECMP */ 1180 if (ire == NULL) { 1181 ire = ire_ftable_lookup_v6(&v6nexthop, 0, 0, ire_type, 1182 (ill_arg != NULL ? ill_arg : ill), zoneid, tsl, 1183 match_args, xmit_hint, ipst, &generation); 1184 } else { 1185 /* Caller passed it; extra hold since we will rele */ 1186 ire_refhold(ire); 1187 if (generationp != NULL) 1188 generation = *generationp; 1189 else 1190 generation = IRE_GENERATION_VERIFY; 1191 } 1192 1193 if (ire == NULL) 1194 ire = ire_reject(ipst, B_TRUE); 1195 1196 /* Need to return the ire with RTF_REJECT|BLACKHOLE */ 1197 if (ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) 1198 goto error; 1199 1200 ASSERT(!(ire->ire_type & IRE_MULTICAST)); /* Not in ftable */ 1201 1202 if (i != 0) { 1203 prefs[i] = ire_pref(ire); 1204 /* 1205 * Don't allow anything unusual past the first 1206 * iteration. 1207 */ 1208 if ((ire->ire_type & 1209 (IRE_LOCAL|IRE_LOOPBACK|IRE_BROADCAST)) || 1210 prefs[i] <= prefs[i-1]) { 1211 ire_refrele(ire); 1212 if (irr_flags & IRR_INCOMPLETE) { 1213 ire = ires[0]; 1214 ire_refhold(ire); 1215 } else { 1216 ire = ire_reject(ipst, B_TRUE); 1217 } 1218 goto error; 1219 } 1220 } 1221 /* We have a usable IRE */ 1222 ires[i] = ire; 1223 generations[i] = generation; 1224 i++; 1225 1226 /* The first RTF_SETSRC address is passed back if setsrcp */ 1227 if ((ire->ire_flags & RTF_SETSRC) && 1228 setsrcp != NULL && IN6_IS_ADDR_UNSPECIFIED(setsrcp)) { 1229 ASSERT(!IN6_IS_ADDR_UNSPECIFIED( 1230 &ire->ire_setsrc_addr_v6)); 1231 *setsrcp = ire->ire_setsrc_addr_v6; 1232 } 1233 1234 /* The first ire_gw_secattr is passed back if gwattrp */ 1235 if (ire->ire_gw_secattr != NULL && 1236 gwattrp != NULL && *gwattrp == NULL) 1237 *gwattrp = ire->ire_gw_secattr; 1238 1239 /* 1240 * Check if we have a short-cut pointer to an IRE for this 1241 * destination, and that the cached dependency isn't stale. 1242 * In that case we've rejoined an existing tree towards a 1243 * parent, thus we don't need to continue the loop to 1244 * discover the rest of the tree. 1245 */ 1246 mutex_enter(&ire->ire_lock); 1247 if (ire->ire_dep_parent != NULL && 1248 ire->ire_dep_parent->ire_generation == 1249 ire->ire_dep_parent_generation) { 1250 mutex_exit(&ire->ire_lock); 1251 ire = NULL; 1252 goto done; 1253 } 1254 mutex_exit(&ire->ire_lock); 1255 1256 /* 1257 * If this type should have an ire_nce_cache (even if it 1258 * doesn't yet have one) then we are done. Includes 1259 * IRE_INTERFACE with a full 128 bit mask. 1260 */ 1261 if (ire->ire_nce_capable) { 1262 ire = NULL; 1263 goto done; 1264 } 1265 ASSERT(!(ire->ire_type & IRE_IF_CLONE)); 1266 /* 1267 * For an IRE_INTERFACE we create an IRE_IF_CLONE for this 1268 * particular destination 1269 */ 1270 if (ire->ire_type & IRE_INTERFACE) { 1271 ire_t *clone; 1272 1273 ASSERT(ire->ire_masklen != IPV6_ABITS); 1274 1275 /* 1276 * In the case of ip_input and ILLF_FORWARDING not 1277 * being set, and in the case of RTM_GET, there is 1278 * no point in allocating an IRE_IF_CLONE. We return 1279 * the IRE_INTERFACE. Note that !IRR_ALLOCATE can 1280 * result in a ire_dep_parent which is IRE_IF_* 1281 * without an IRE_IF_CLONE. 1282 * We recover from that when we need to send packets 1283 * by ensuring that the generations become 1284 * IRE_GENERATION_VERIFY in this case. 1285 */ 1286 if (!(irr_flags & IRR_ALLOCATE)) { 1287 invalidate = B_TRUE; 1288 ire = NULL; 1289 goto done; 1290 } 1291 1292 clone = ire_create_if_clone(ire, &v6nexthop, 1293 &generation); 1294 if (clone == NULL) { 1295 /* 1296 * Temporary failure - no memory. 1297 * Don't want caller to cache IRE_NOROUTE. 1298 */ 1299 invalidate = B_TRUE; 1300 ire = ire_blackhole(ipst, B_TRUE); 1301 goto error; 1302 } 1303 /* 1304 * Make clone next to last entry and the 1305 * IRE_INTERFACE the last in the dependency 1306 * chain since the clone depends on the 1307 * IRE_INTERFACE. 1308 */ 1309 ASSERT(i >= 1); 1310 ASSERT(i < MAX_IRE_RECURSION); 1311 1312 ires[i] = ires[i-1]; 1313 generations[i] = generations[i-1]; 1314 ires[i-1] = clone; 1315 generations[i-1] = generation; 1316 i++; 1317 1318 ire = NULL; 1319 goto done; 1320 } 1321 1322 /* 1323 * We only match on the type and optionally ILL when 1324 * recursing. The type match is used by some callers 1325 * to exclude certain types (such as IRE_IF_CLONE or 1326 * IRE_LOCAL|IRE_LOOPBACK). 1327 */ 1328 match_args &= MATCH_IRE_TYPE; 1329 v6nexthop = ire->ire_gateway_addr_v6; 1330 if (ill == NULL && ire->ire_ill != NULL) { 1331 ill = ire->ire_ill; 1332 need_refrele = B_TRUE; 1333 ill_refhold(ill); 1334 match_args |= MATCH_IRE_ILL; 1335 } 1336 /* 1337 * We set the prefs[i] value above if i > 0. We've already 1338 * done i++ so i is one in the case of the first time around. 1339 */ 1340 if (i == 1) 1341 prefs[0] = ire_pref(ire); 1342 ire = NULL; 1343 } 1344 ASSERT(ire == NULL); 1345 ire = ire_reject(ipst, B_TRUE); 1346 1347 error: 1348 ASSERT(ire != NULL); 1349 if (need_refrele) 1350 ill_refrele(ill); 1351 1352 /* 1353 * In the case of MULTIRT we want to try a different IRE the next 1354 * time. We let the next packet retry in that case. 1355 */ 1356 if (i > 0 && (ires[0]->ire_flags & RTF_MULTIRT)) 1357 (void) ire_no_good(ires[0]); 1358 1359 cleanup: 1360 /* cleanup ires[i] */ 1361 ire_dep_unbuild(ires, i); 1362 for (j = 0; j < i; j++) 1363 ire_refrele(ires[j]); 1364 1365 ASSERT((ire->ire_flags & (RTF_REJECT|RTF_BLACKHOLE)) || 1366 (irr_flags & IRR_INCOMPLETE)); 1367 /* 1368 * Use IRE_GENERATION_VERIFY to ensure that ip_output will redo the 1369 * ip_select_route since the reject or lack of memory might be gone. 1370 */ 1371 if (generationp != NULL) 1372 *generationp = IRE_GENERATION_VERIFY; 1373 return (ire); 1374 1375 done: 1376 ASSERT(ire == NULL); 1377 if (need_refrele) 1378 ill_refrele(ill); 1379 1380 /* Build dependencies */ 1381 if (i > 1 && !ire_dep_build(ires, generations, i)) { 1382 /* Something in chain was condemned; tear it apart */ 1383 ire = ire_blackhole(ipst, B_TRUE); 1384 goto cleanup; 1385 } 1386 1387 /* 1388 * Release all refholds except the one for ires[0] that we 1389 * will return to the caller. 1390 */ 1391 for (j = 1; j < i; j++) 1392 ire_refrele(ires[j]); 1393 1394 if (invalidate) { 1395 /* 1396 * Since we needed to allocate but couldn't we need to make 1397 * sure that the dependency chain is rebuilt the next time. 1398 */ 1399 ire_dep_invalidate_generations(ires[0]); 1400 generation = IRE_GENERATION_VERIFY; 1401 } else { 1402 /* 1403 * IREs can have been added or deleted while we did the 1404 * recursive lookup and we can't catch those until we've built 1405 * the dependencies. We verify the stored 1406 * ire_dep_parent_generation to catch any such changes and 1407 * return IRE_GENERATION_VERIFY (which will cause 1408 * ip_select_route to be called again so we can redo the 1409 * recursive lookup next time we send a packet. 1410 */ 1411 if (ires[0]->ire_dep_parent == NULL) 1412 generation = ires[0]->ire_generation; 1413 else 1414 generation = ire_dep_validate_generations(ires[0]); 1415 if (generations[0] != ires[0]->ire_generation) { 1416 /* Something changed at the top */ 1417 generation = IRE_GENERATION_VERIFY; 1418 } 1419 } 1420 if (generationp != NULL) 1421 *generationp = generation; 1422 1423 return (ires[0]); 1424 } 1425 1426 ire_t * 1427 ire_route_recursive_v6(const in6_addr_t *nexthop, uint_t ire_type, 1428 const ill_t *ill, zoneid_t zoneid, const ts_label_t *tsl, uint_t match_args, 1429 uint_t irr_flags, uint32_t xmit_hint, ip_stack_t *ipst, 1430 in6_addr_t *setsrcp, tsol_ire_gw_secattr_t **gwattrp, uint_t *generationp) 1431 { 1432 return (ire_route_recursive_impl_v6(NULL, nexthop, ire_type, ill, 1433 zoneid, tsl, match_args, irr_flags, xmit_hint, ipst, setsrcp, 1434 gwattrp, generationp)); 1435 } 1436 1437 /* 1438 * Recursively look for a route to the destination. 1439 * We only handle a destination match here, yet we have the same arguments 1440 * as the full match to allow function pointers to select between the two. 1441 * 1442 * Note that this function never returns NULL. It returns an IRE_NOROUTE 1443 * instead. 1444 * 1445 * If we find any IRE_LOCAL|BROADCAST etc past the first iteration it 1446 * is an error. 1447 * Allow at most one RTF_INDIRECT. 1448 */ 1449 ire_t * 1450 ire_route_recursive_dstonly_v6(const in6_addr_t *nexthop, uint_t irr_flags, 1451 uint32_t xmit_hint, ip_stack_t *ipst) 1452 { 1453 ire_t *ire; 1454 ire_t *ire1; 1455 uint_t generation; 1456 1457 /* ire_ftable_lookup handles round-robin/ECMP */ 1458 ire = ire_ftable_lookup_simple_v6(nexthop, xmit_hint, ipst, 1459 &generation); 1460 ASSERT(ire != NULL); 1461 1462 /* 1463 * If this type should have an ire_nce_cache (even if it 1464 * doesn't yet have one) then we are done. Includes 1465 * IRE_INTERFACE with a full 128 bit mask. 1466 */ 1467 if (ire->ire_nce_capable) 1468 return (ire); 1469 1470 /* 1471 * If the IRE has a current cached parent we know that the whole 1472 * parent chain is current, hence we don't need to discover and 1473 * build any dependencies by doing a recursive lookup. 1474 */ 1475 mutex_enter(&ire->ire_lock); 1476 if (ire->ire_dep_parent != NULL && 1477 ire->ire_dep_parent->ire_generation == 1478 ire->ire_dep_parent_generation) { 1479 mutex_exit(&ire->ire_lock); 1480 return (ire); 1481 } 1482 mutex_exit(&ire->ire_lock); 1483 1484 /* 1485 * Fallback to loop in the normal code starting with the ire 1486 * we found. Normally this would return the same ire. 1487 */ 1488 ire1 = ire_route_recursive_impl_v6(ire, nexthop, 0, NULL, ALL_ZONES, 1489 NULL, MATCH_IRE_DSTONLY, irr_flags, xmit_hint, ipst, NULL, NULL, 1490 &generation); 1491 ire_refrele(ire); 1492 return (ire1); 1493 } 1494